/[pearpc]/src/cpu/cpu_jitc_x86/ppc_vec.cc
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /src/cpu/cpu_jitc_x86/ppc_vec.cc

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1 - (hide annotations)
Wed Sep 5 17:11:21 2007 UTC (16 years, 7 months ago) by dpavlin
File size: 177354 byte(s)
import upstream CVS
1 dpavlin 1 /*
2     * PearPC
3     * ppc_vec.cc
4     *
5     * Copyright (C) 2004 Daniel Foesch (dfoesch@cs.nmsu.edu)
6     *
7     * This program is free software; you can redistribute it and/or modify
8     * it under the terms of the GNU General Public License version 2 as
9     * published by the Free Software Foundation.
10     *
11     * This program is distributed in the hope that it will be useful,
12     * but WITHOUT ANY WARRANTY; without even the implied warranty of
13     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14     * GNU General Public License for more details.
15     *
16     * You should have received a copy of the GNU General Public License
17     * along with this program; if not, write to the Free Software
18     * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19     */
20    
21     /* Pages marked: v.???
22     * From: IBM PowerPC MicroProcessor Family: Altivec(tm) Technology...
23     * Programming Environments Manual
24     */
25    
26     #include <string.h>
27     #include <math.h>
28    
/*
 * Fallbacks for C libraries that provide neither log2() nor exp2().
 * FIXME: put somewhere appropriate
 *
 * Each expansion is wrapped in parentheses so that the macro behaves as a
 * single operand inside larger expressions such as "1 / log2(x)".
 */
#ifndef HAS_LOG2
#define log2(x) (log(x) / log(2.0))
#endif /* HAS_LOG2 */

#ifndef HAS_EXP2
#define exp2(x) (pow(2.0, (x)))
#endif /* HAS_EXP2 */
39    
40     #define ASSERT_FLUSHED(vrA)
41     //#define ASSERT_FLUSHED(vrA) jitcAssertFlushedVectorRegister(vrA)
42    
43     #include "debug/tracers.h"
44     #include "ppc_cpu.h"
45     #include "ppc_dec.h"
46     #include "ppc_fpu.h"
47     #include "ppc_vec.h"
48    
49     #define SIGN32 0x80000000
50    
/* Adds 4 to an 8-bit register id; the argument and the expansion are
 * parenthesized so the macro is safe with compound arguments.
 * NOTE(review): presumably maps a low-byte register (AL..BL) to the matching
 * high-byte register (AH..BH) -- confirm against the NativeReg8 enum.
 */
#define RL2RH(RL) ((NativeReg8)((RL) + 4))
52    
53     #define vrT JITC_VECTOR_TEMP
54     #define vrNEG1 JITC_VECTOR_NEG1
55    
56     static inline void commutative_operation(X86ALUPSopc opc, int vrD, int vrA, int vrB);
57     static inline void noncommutative_operation(X86ALUPSopc opc, int vrD, int vrA, int vrB);
58    
59     static inline void commutative_operation(X86PALUopc opc, int vrD, int vrA, int vrB);
60     static inline void noncommutative_operation(X86PALUopc opc, int vrD, int vrA, int vrB);
61    
62     /* PACK_PIXEL Packs a uint32 pixel to uint16 pixel
63     * v.219
64     */
65     static inline uint16 PACK_PIXEL(uint32 clr)
66     {
67     return (((clr & 0x000000f8) >> 3) | \
68     ((clr & 0x0000f800) >> 6) | \
69     ((clr & 0x01f80000) >> 9));
70     }
71    
72     /* UNPACK_PIXEL Unpacks a uint16 pixel to uint32 pixel
73     * v.276 & v.279
74     */
75     static inline uint32 UNPACK_PIXEL(uint16 clr)
76     {
77     return (((uint32)(clr & 0x001f)) | \
78     ((uint32)(clr & 0x03E0) << 3) | \
79     ((uint32)(clr & 0x7c00) << 6) | \
80     (((clr) & 0x8000) ? 0xff000000 : 0));
81     }
82    
83     static inline uint8 SATURATE_UB(uint16 val)
84     {
85     if (val & 0xff00) {
86     gCPU.vscr |= VSCR_SAT;
87     return 0xff;
88     }
89     return val;
90     }
91     static inline uint8 SATURATE_0B(uint16 val)
92     {
93     if (val & 0xff00) {
94     gCPU.vscr |= VSCR_SAT;
95     return 0;
96     }
97     return val;
98     }
99    
100     static inline uint16 SATURATE_UH(uint32 val)
101     {
102     if (val & 0xffff0000) {
103     gCPU.vscr |= VSCR_SAT;
104     return 0xffff;
105     }
106     return val;
107     }
108    
109     static inline uint16 SATURATE_0H(uint32 val)
110     {
111     if (val & 0xffff0000) {
112     gCPU.vscr |= VSCR_SAT;
113     return 0;
114     }
115     return val;
116     }
117    
118     static inline sint8 SATURATE_SB(sint16 val)
119     {
120     if (val > 127) { // 0x7F
121     gCPU.vscr |= VSCR_SAT;
122     return 127;
123     } else if (val < -128) { // 0x80
124     gCPU.vscr |= VSCR_SAT;
125     return -128;
126     }
127     return val;
128     }
129    
130     static inline uint8 SATURATE_USB(sint16 val)
131     {
132     if (val > 0xff) {
133     gCPU.vscr |= VSCR_SAT;
134     return 0xff;
135     } else if (val < 0) {
136     gCPU.vscr |= VSCR_SAT;
137     return 0;
138     }
139     return (uint8)val;
140     }
141    
142     static inline sint16 SATURATE_SH(sint32 val)
143     {
144     if (val > 32767) { // 0x7fff
145     gCPU.vscr |= VSCR_SAT;
146     return 32767;
147     } else if (val < -32768) { // 0x8000
148     gCPU.vscr |= VSCR_SAT;
149     return -32768;
150     }
151     return val;
152     }
153    
154     static inline uint16 SATURATE_USH(sint32 val)
155     {
156     if (val > 0xffff) {
157     gCPU.vscr |= VSCR_SAT;
158     return 0xffff;
159     } else if (val < 0) {
160     gCPU.vscr |= VSCR_SAT;
161     return 0;
162     }
163     return (uint16)val;
164     }
165    
166     static inline sint32 SATURATE_UW(sint64 val)
167     {
168     if (val > 0xffffffffLL) {
169     gCPU.vscr |= VSCR_SAT;
170     return 0xffffffffLL;
171     }
172     return val;
173     }
174    
175     static inline sint32 SATURATE_SW(sint64 val)
176     {
177     if (val > 2147483647LL) { // 0x7fffffff
178     gCPU.vscr |= VSCR_SAT;
179     return 2147483647LL;
180     } else if (val < -2147483648LL) { // 0x80000000
181     gCPU.vscr |= VSCR_SAT;
182     return -2147483648LL;
183     }
184     return val;
185     }
186    
187     static inline void vec_raise_saturate(void)
188     {
189     asmOR(&gCPU.vscr, VSCR_SAT);
190     }
191    
192     static inline void vec_saturateSB(NativeReg reg, NativeReg reg2)
193     {
194     jitcClobberCarryAndFlags();
195    
196     asmMOV_NoFlags(reg2, 0x7f);
197     asmALU(X86_CMP, reg, reg2);
198     NativeAddress skip1 = asmJxxFixup(X86_G);
199    
200     asmMOV_NoFlags(reg2, 0xffffff80);
201     asmALU(X86_CMP, reg, reg2);
202     NativeAddress skip2 = asmJxxFixup(X86_L);
203    
204     NativeAddress skip3 = asmJMPFixup();
205    
206     asmResolveFixup(skip1);
207     asmResolveFixup(skip2);
208    
209     asmALU(X86_MOV, reg, reg2);
210     vec_raise_saturate();
211    
212     asmResolveFixup(skip3);
213     }
214    
215     static inline void vec_saturateSUB(NativeReg reg, NativeReg reg2)
216     {
217     jitcClobberCarryAndFlags();
218    
219     asmMOV_NoFlags(reg2, 0xff);
220     asmALU(X86_CMP, reg, reg2);
221     NativeAddress skip1 = asmJxxFixup(X86_G);
222    
223     asmALU(X86_XOR, reg2, reg2);
224     asmALU(X86_CMP, reg, reg2);
225     NativeAddress skip2 = asmJxxFixup(X86_L);
226    
227     NativeAddress skip3 = asmJMPFixup();
228    
229     asmResolveFixup(skip1);
230     asmResolveFixup(skip2);
231    
232     asmALU(X86_MOV, reg, reg2);
233     vec_raise_saturate();
234    
235     asmResolveFixup(skip3);
236     }
237    
238     static inline void vec_saturateUB(NativeReg8 reg)
239     {
240     jitcClobberCarryAndFlags();
241    
242     asmALU(X86_CMP, (NativeReg)reg, 0xff);
243     NativeAddress skip1 = asmJxxFixup(X86_BE);
244    
245     asmSET(X86_S, reg);
246     asmALU(X86_SUB, reg, 1);
247    
248     vec_raise_saturate();
249    
250     asmResolveFixup(skip1);
251     }
252    
253    
254     static inline void vec_saturateSH(NativeReg reg, NativeReg reg2)
255     {
256     jitcClobberCarryAndFlags();
257    
258     asmMOV_NoFlags(reg2, 0x7fff);
259     asmALU(X86_CMP, reg, reg2);
260     NativeAddress skip1 = asmJxxFixup(X86_G);
261    
262     asmMOV_NoFlags(reg2, 0xffff8000);
263     asmALU(X86_CMP, reg, reg2);
264     NativeAddress skip2 = asmJxxFixup(X86_L);
265    
266     NativeAddress skip3 = asmJMPFixup();
267    
268     asmResolveFixup(skip1);
269     asmResolveFixup(skip2);
270    
271     asmALU(X86_MOV, reg, reg2);
272     vec_raise_saturate();
273    
274     asmResolveFixup(skip3);
275     }
276    
277     static inline void vec_saturateSUH(NativeReg reg, NativeReg reg2)
278     {
279     jitcClobberCarryAndFlags();
280    
281     asmMOV_NoFlags(reg2, 0xffff);
282     asmALU(X86_CMP, reg, reg2);
283     NativeAddress skip1 = asmJxxFixup(X86_G);
284    
285     asmALU(X86_XOR, reg2, reg2);
286     asmALU(X86_CMP, reg, reg2);
287     NativeAddress skip2 = asmJxxFixup(X86_L);
288    
289     NativeAddress skip3 = asmJMPFixup();
290    
291     asmResolveFixup(skip1);
292     asmResolveFixup(skip2);
293    
294     asmALU(X86_MOV, reg, reg2);
295     vec_raise_saturate();
296    
297     asmResolveFixup(skip3);
298     }
299    
300     static inline void vec_saturateUH(NativeReg8 reg)
301     {
302     jitcClobberCarryAndFlags();
303    
304     asmALU(X86_CMP, (NativeReg)reg, 0xffff);
305     NativeAddress skip1 = asmJxxFixup(X86_BE);
306    
307     asmSET(X86_S, reg);
308     asmMOVxx(X86_MOVZX, (NativeReg)reg, reg);
309     asmALU(X86_SUB, (NativeReg)reg, 1);
310    
311     vec_raise_saturate();
312    
313     asmResolveFixup(skip1);
314     }
315    
316     static inline void vec_getByte(NativeReg8 dest, int src, NativeReg r, int i)
317     {
318     modrm_o modrm;
319    
320     ASSERT_FLUSHED(src);
321    
322     asmALU(X86_MOV, dest, x86_mem2(modrm, r, &(gCPU.vr[src].b[i])));
323     }
324    
325     static inline void vec_setByte(int dest, NativeReg r, int i, NativeReg8 src)
326     {
327     modrm_o modrm;
328    
329     ASSERT_FLUSHED(dest);
330    
331     asmALU(X86_MOV, x86_mem2(modrm, r, &(gCPU.vr[dest].b[i])), src);
332     }
333    
334     static inline void vec_getByteZ(NativeReg dest, int src, int i)
335     {
336     modrm_o modrm;
337    
338     ASSERT_FLUSHED(src);
339    
340     asmMOVxx_B(X86_MOVZX, dest, x86_mem2(modrm, &(gCPU.vr[src].b[i])));
341     }
342    
343     static inline void vec_getByteS(NativeReg dest, int src, int i)
344     {
345     modrm_o modrm;
346    
347     ASSERT_FLUSHED(src);
348    
349     asmMOVxx_B(X86_MOVSX, dest, x86_mem2(modrm, &(gCPU.vr[src].b[i])));
350     }
351    
352     static inline void vec_getHalf(NativeReg16 dest, int src, int i)
353     {
354     ASSERT_FLUSHED(src);
355    
356     asmMOV(dest, &(gCPU.vr[src].b[i]));
357     }
358    
359     static inline void vec_setHalf(int dest, int i, NativeReg16 src)
360     {
361     ASSERT_FLUSHED(dest);
362    
363     asmMOV(&(gCPU.vr[dest].b[i]), src);
364     }
365    
366     static inline void vec_getHalfZ(NativeReg dest, int src, int i)
367     {
368     modrm_o modrm;
369    
370     ASSERT_FLUSHED(src);
371    
372     asmMOVxx_W(X86_MOVZX, dest, x86_mem2(modrm, &(gCPU.vr[src].b[i])));
373     }
374    
375     static inline void vec_getHalfS(NativeReg dest, int src, int i)
376     {
377     modrm_o modrm;
378    
379     ASSERT_FLUSHED(src);
380    
381     asmMOVxx_W(X86_MOVSX, dest, x86_mem2(modrm, &(gCPU.vr[src].b[i])));
382     }
383    
384     static inline void vec_getWord(NativeReg dest, int src, int i)
385     {
386     ASSERT_FLUSHED(src);
387    
388     asmMOV(dest, &(gCPU.vr[src].b[i]));
389     }
390    
391     static inline void vec_setWord(int dest, int i, NativeReg src)
392     {
393     ASSERT_FLUSHED(dest);
394    
395     asmMOV(&(gCPU.vr[dest].b[i]), src);
396     }
397    
398     #define SSE_AVAIL gJITC.hostCPUCaps.sse
399     //#define SSE_AVAIL 0
400    
401     #define SSE2_AVAIL gJITC.hostCPUCaps.sse2
402     //#define SSE2_AVAIL 0
403    
404     #define SSE_NO 0
405     #define SSE2_NO 0
406    
407     const static sint32 sint32_max = 2147483647;
408     const static sint32 sint32_min = -2147483648;
409     const static double uint32_max = 4294967295;
410     const static uint32 uint32_min = 0;
411    
412     static inline void vec_Copy(int dest, int src)
413     {
414     if (dest == src);
415    
416     else if (SSE_AVAIL) {
417     NativeVectorReg reg = jitcMapClientVectorRegisterDirty(dest);
418     NativeVectorReg reg2 = jitcGetClientVectorRegisterMapping(src);
419    
420     if (reg2 == VECTREG_NO)
421     asmMOVAPS(reg, &gCPU.vr[src]);
422     else
423     asmALUPS(X86_MOVAPS, reg, reg2);
424     } else {
425     jitcFlushClientVectorRegister(src);
426     jitcDropClientVectorRegister(dest);
427    
428     NativeReg reg = jitcAllocRegister();
429    
430     for (int i=0; i<16; i+=4) {
431     vec_getWord(reg, src, i);
432     vec_setWord(dest, i, reg);
433     }
434     }
435     }
436    
437     static inline void vec_Zero(int dest)
438     {
439     if (SSE_AVAIL) {
440     NativeVectorReg reg = jitcMapClientVectorRegisterDirty(dest);
441    
442     asmALUPS(X86_XORPS, reg, reg);
443     } else {
444     jitcDropClientVectorRegister(dest);
445    
446     NativeReg reg = jitcAllocRegister();
447    
448     asmMOV_NoFlags(reg, 0); // avoid flag clobber
449    
450     vec_setWord(dest, 0, reg);
451     vec_setWord(dest, 4, reg);
452     vec_setWord(dest, 8, reg);
453     vec_setWord(dest,12, reg);
454     }
455     }
456    
457     static inline void vec_Neg1(int dest)
458     {
459     if (SSE_AVAIL) {
460     NativeVectorReg reg = jitcMapClientVectorRegisterDirty(dest);
461     NativeVectorReg n1 = jitcGetClientVectorRegister(vrNEG1);
462    
463     asmALUPS(X86_MOVAPS, reg, n1);
464     } else {
465     jitcDropClientVectorRegister(dest);
466    
467     NativeReg reg = jitcAllocRegister();
468    
469     asmMOV_NoFlags(reg, 0xffffffff); // avoid flag clobber
470    
471     vec_setWord(dest, 0, reg);
472     vec_setWord(dest, 4, reg);
473     vec_setWord(dest, 8, reg);
474     vec_setWord(dest,12, reg);
475     }
476     }
477    
478     static inline void vec_Not(int dest, int src)
479     {
480     if (SSE_AVAIL) {
481     NativeVectorReg reg1;
482     NativeVectorReg reg2;
483    
484     if (src == dest) {
485     reg1 = jitcGetClientVectorRegisterDirty(dest);
486     } else {
487     reg1 = jitcMapClientVectorRegisterDirty(dest);
488     reg2 = jitcGetClientVectorRegisterMapping(src);
489    
490     if (reg2 == VECTREG_NO)
491     asmMOVAPS(reg1, &gCPU.vr[src]);
492     else
493     asmALUPS(X86_MOVAPS, reg1, reg2);
494     }
495    
496     reg2 = jitcGetClientVectorRegister(vrNEG1);
497    
498     asmALUPS(X86_XORPS, reg1, reg2);
499     } else {
500     jitcFlushClientVectorRegister(src);
501     jitcDropClientVectorRegister(dest);
502    
503     // according to documentation NOT does not effect flags!
504     NativeReg reg = jitcAllocRegister();
505    
506     for (int i=0; i<16; i+=4) {
507     vec_getWord(reg, src, i);
508     asmALU(X86_NOT, reg);
509     vec_setWord(dest, i, reg);
510     }
511     }
512     }
513    
514     static inline void vec_SelectiveLoadByte(X86FlagTest flags, NativeReg8 reg1, NativeReg reg2, int vrA, int vrB)
515     {
516     modrm_o modrm;
517    
518     NativeAddress skip = asmJxxFixup(flags);
519     asmALU(X86_MOV, (NativeReg)reg1,
520     x86_mem2(modrm, reg2, &(gCPU.vr[vrA].b[16])));
521    
522     NativeAddress end = asmJMPFixup();
523    
524     asmResolveFixup(skip);
525     asmALU(X86_MOV, (NativeReg)reg1,
526     x86_mem2(modrm, reg2, &(gCPU.vr[vrB].b[16])));
527    
528     asmResolveFixup(end);
529     }
530    
/* Generates the byte selection for four consecutive result bytes
 * (vrD bytes i .. i+3).
 *
 * reg1  supplies four index bytes; each one is sign-extended (MOVSX) into
 *       regA or regB and used as index relative to &vr[..].b[16].
 * reg2  supplies the select bits: bit 0x10 of each of its four bytes picks
 *       vrB (bit set) or vrA (bit clear) as the source.
 * regT  receives the loaded value (as a full register) and its low byte is
 *       stored to vrD.
 * regA, regB  scratch index registers.
 *
 * reg1 is shifted right by 16 half way through, so it is modified.
 * With CMOV available the selection is done with a CMOVZ/CMOVNZ pair,
 * otherwise via vec_SelectiveLoadByte().
 * NOTE(review): RL2RH(reg1) is presumably the high-byte (xH) companion of
 * reg1 -- confirm against the NativeReg8 numbering.
 */
531     static inline void vec_PermuteQuad(NativeReg8 reg1, NativeReg reg2, NativeReg8 regT, NativeReg regA, NativeReg regB, int vrA, int vrB, int vrD, int i)
532     {
533     modrm_o modrm;
534    
// index bytes 0 and 1 of reg1
535     asmMOVxx(X86_MOVSX, regA, reg1);
536     asmMOVxx(X86_MOVSX, regB, RL2RH(reg1));
537    
538     if (gJITC.hostCPUCaps.cmov) {
// byte 0: select bit 0x10 of reg2
539     asmALU(X86_TEST, reg2, 0x10);
540     asmCMOV(X86_Z, (NativeReg)regT,
541     x86_mem2(modrm, regA, &(gCPU.vr[vrA].b[16])));
542     asmCMOV(X86_NZ, (NativeReg)regT,
543     x86_mem2(modrm, regA, &(gCPU.vr[vrB].b[16])));
544    
545     vec_setByte(vrD, REG_NO, i, regT);
546    
// byte 1: select bit 0x10 of the second byte of reg2
547     asmALU(X86_TEST, reg2, 0x1000);
548     asmCMOV(X86_Z, (NativeReg)regT,
549     x86_mem2(modrm, regB, &(gCPU.vr[vrA].b[16])));
550     asmCMOV(X86_NZ, (NativeReg)regT,
551     x86_mem2(modrm, regB, &(gCPU.vr[vrB].b[16])));
552    
553     vec_setByte(vrD, REG_NO, i+1, regT);
554    
// bring index bytes 2 and 3 down into the low half of reg1
555     asmShift(X86_SHR, (NativeReg)reg1, 16);
556    
557     asmMOVxx(X86_MOVSX, regA, reg1);
558     asmMOVxx(X86_MOVSX, regB, RL2RH(reg1));
559    
// byte 2
560     asmALU(X86_TEST, reg2, 0x100000);
561     asmCMOV(X86_Z, (NativeReg)regT,
562     x86_mem2(modrm, regA, &(gCPU.vr[vrA].b[16])));
563     asmCMOV(X86_NZ, (NativeReg)regT,
564     x86_mem2(modrm, regA, &(gCPU.vr[vrB].b[16])));
565    
566     vec_setByte(vrD, REG_NO, i+2, regT);
567    
// byte 3
568     asmALU(X86_TEST, reg2, 0x10000000);
569     asmCMOV(X86_Z, (NativeReg)regT,
570     x86_mem2(modrm, regB, &(gCPU.vr[vrA].b[16])));
571     asmCMOV(X86_NZ, (NativeReg)regT,
572     x86_mem2(modrm, regB, &(gCPU.vr[vrB].b[16])));
573    
574     vec_setByte(vrD, REG_NO, i+3, regT);
575     } else {
// same four steps, using a conditional branch instead of CMOV
576     asmALU(X86_TEST, reg2, 0x10);
577     vec_SelectiveLoadByte(X86_NZ, regT, regA, vrA, vrB);
578     vec_setByte(vrD, REG_NO, i, regT);
579    
580     asmALU(X86_TEST, reg2, 0x1000);
581     vec_SelectiveLoadByte(X86_NZ, regT, regB, vrA, vrB);
582     vec_setByte(vrD, REG_NO, i+1, regT);
583    
584     asmShift(X86_SHR, (NativeReg)reg1, 16);
585    
586     asmMOVxx(X86_MOVSX, regA, reg1);
587     asmMOVxx(X86_MOVSX, regB, RL2RH(reg1));
588    
589     asmALU(X86_TEST, reg2, 0x100000);
590     vec_SelectiveLoadByte(X86_NZ, regT, regA, vrA, vrB);
591     vec_setByte(vrD, REG_NO, i+2, regT);
592    
593     asmALU(X86_TEST, reg2, 0x10000000);
594     vec_SelectiveLoadByte(X86_NZ, regT, regB, vrA, vrB);
595     vec_setByte(vrD, REG_NO, i+3, regT);
596     }
597     }
598    
599     /* vperm Vector Permutation
600     * v.218
601     */
602     void ppc_opc_vperm()
603     {
604     VECTOR_DEBUG_COMMON;
605     int vrD, vrA, vrB, vrC;
606     int sel;
607     Vector_t r;
608     PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
609     for (int i=0; i<16; i++) {
610     sel = gCPU.vr[vrC].b[i];
611     if (sel & 0x10)
612     r.b[i] = VECT_B(gCPU.vr[vrB], sel & 0xf);
613     else
614     r.b[i] = VECT_B(gCPU.vr[vrA], sel & 0xf);
615     }
616    
617     gCPU.vr[vrD] = r;
618     }
/* JIT generator for vperm.
 * All three sources are flushed and then accessed through memory. vrC is
 * processed one 32-bit word (four selector bytes) at a time via
 * vec_PermuteQuad(). When vrD aliases vrA or vrB the result is built in the
 * temporary register vrT and copied to vrD afterwards.
 * Returns flowContinue.
 */
619     JITCFlow ppc_opc_gen_vperm()
620     {
621     #if 0
622     ppc_opc_gen_interpret(ppc_opc_vperm);
623     return flowEndBlock;
624     #else
625     int vrD, vrA, vrB, vrC, vrTMP;
626     PPC_OPC_TEMPL_A(gJITC.current_opc, vrD, vrA, vrB, vrC);
627    
628     jitcFlushClientVectorRegister(vrA);
629     jitcFlushClientVectorRegister(vrB);
630     jitcFlushClientVectorRegister(vrC);
631    
632     jitcClobberCarryAndFlags();
633     NativeReg8 reg1 = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
634     NativeReg8 regT = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
635     NativeReg reg2 = jitcAllocRegister();
636     NativeReg regA = jitcAllocRegister();
637     NativeReg regB = jitcAllocRegister();
638    
// write into vrT when the destination is also a data source
639     vrTMP = ((vrD == vrA) || (vrD == vrB)) ? vrT : vrD;
640     jitcDropClientVectorRegister(vrTMP);
641    
642     for (int i=0; i<16; i+=4) {
643     vec_getWord((NativeReg)reg1, vrC, i);
644    
// keep the unmodified selector word in reg2
645     asmALU(X86_MOV, reg2, (NativeReg)reg1);
646    
// reg1 = ~(selectors & 0x0f0f0f0f), i.e. -(index) - 1 in every byte
647     asmALU(X86_AND, (NativeReg)reg1, 0x0f0f0f0f);
648     asmALU(X86_NOT, (NativeReg)reg1);
649    
650     // reg1 = negative offsets - 1
651     // reg2 = original map (selector mask)
652     // regT = spare use register (must be 8-bit)
653     // regA = spare use register
654     // regB = spare use register
655     vec_PermuteQuad(reg1, reg2, regT, regA, regB, vrA, vrB, vrTMP, i);
656     }
657    
658     if (vrTMP == vrT) {
659     if (SSE_AVAIL) {
660     vec_Copy(vrD, vrT);
661     } else {
662     /* Since we already have a number of registers
663     * allocated, this is much faster than the non-SSE
664     * vec_Copy()
665     */
666     jitcDropClientVectorRegister(vrD);
667    
668     vec_getWord((NativeReg)reg1, vrT, 0);
669     vec_getWord(reg2, vrT, 4);
670     vec_getWord(regA, vrT, 8);
671     vec_getWord(regB, vrT,12);
672    
673     vec_setWord(vrD, 0, (NativeReg)reg1);
674     vec_setWord(vrD, 4, reg2);
675     vec_setWord(vrD, 8, regA);
676     vec_setWord(vrD,12, regB);
677     }
678     }
679    
680     return flowContinue;
681     #endif
682     }
683    
684     /* vsel Vector Select
685     * v.238
686     */
687     void ppc_opc_vsel()
688     {
689     VECTOR_DEBUG;
690     int vrD, vrA, vrB, vrC;
691     uint64 mask, val;
692     PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
693    
694     mask = gCPU.vr[vrC].d[0];
695     val = gCPU.vr[vrB].d[0] & mask;
696     val |= gCPU.vr[vrA].d[0] & ~mask;
697     gCPU.vr[vrD].d[0] = val;
698    
699     mask = gCPU.vr[vrC].d[1];
700     val = gCPU.vr[vrB].d[1] & mask;
701     val |= gCPU.vr[vrA].d[1] & ~mask;
702     gCPU.vr[vrD].d[1] = val;
703     }
/* JIT generator for vsel: vrD = (vrB & vrC) | (vrA & ~vrC).
 * When vrA == vrB the result is just a copy of that register. With SSE the
 * result is built in a freshly allocated native vector register that is
 * renamed to vrD at the end; otherwise the operation is done word by word
 * through two integer scratch registers.
 * Returns flowContinue.
 */
704     JITCFlow ppc_opc_gen_vsel()
705     {
706     #if 0
707     ppc_opc_gen_interpret(ppc_opc_vsel);
708     return flowEndBlock;
709     #else
710     int vrD, vrA, vrB, vrC;
711     PPC_OPC_TEMPL_A(gJITC.current_opc, vrD, vrA, vrB, vrC);
712    
// both data sources identical: the mask does not matter
713     if (vrA == vrB) {
714     vec_Copy(vrD, vrB);
715     return flowContinue;
716     }
717    
718     if (SSE_AVAIL) {
719     modrm_o modrm;
720    
721     /* We cannot use jitcMapClientVectorRegisterDirty(vrD) here,
722     * because vrD may be the same register as vrC, whose value is
// NOTE(review): the original comment was truncated after "if vrD == vrC";
// the completed wording is an assumption -- confirm.
723     * still needed below. */
724     NativeVectorReg d = jitcAllocVectorRegister();
725     NativeVectorReg c;
726    
727     if (vrA == vrC) {
// (vrA & ~vrC) is zero here, so only d = vrC (& vrB) is needed
728     c = jitcGetClientVectorRegisterMapping(vrC);
729    
730     if (c == VECTREG_NO) {
731     asmMOVAPS(d, &gCPU.vr[vrC]);
732     } else {
733     asmALUPS(X86_MOVAPS, d, c);
734     }
735     } else {
// c is overwritten by the ANDNPS below, hence the trash call
736     c = jitcGetClientVectorRegister(vrC);
737     jitcTrashVectorRegister(NATIVE_REG | c);
738    
739     asmALUPS(X86_MOVAPS, d, c);
740     }
741    
// d = vrC & vrB (already true when vrB == vrC)
742     if (vrB != vrC) {
743     NativeVectorReg b = jitcGetClientVectorRegisterMapping(vrB);
744    
745     if (b == VECTREG_NO) {
746     asmALUPS(X86_ANDPS, d,
747     x86_mem2(modrm, &gCPU.vr[vrB]));
748     } else
749     asmALUPS(X86_ANDPS, d, b);
750     }
751    
// c = ~vrC & vrA, then merged into d
752     if (vrA != vrC) {
753     NativeVectorReg a = jitcGetClientVectorRegisterMapping(vrA);
754    
755     if (a == VECTREG_NO) {
756     asmALUPS(X86_ANDNPS, c,
757     x86_mem2(modrm, &gCPU.vr[vrA]));
758     } else
759     asmALUPS(X86_ANDNPS, c, a);
760    
761     asmALUPS(X86_ORPS, d, c);
762     }
763    
764     jitcRenameVectorRegisterDirty(d, vrD);
765    
766     return flowContinue;
767     }
768    
// non-SSE path: everything goes through memory
769     jitcFlushClientVectorRegister(vrA);
770     jitcFlushClientVectorRegister(vrB);
771     jitcFlushClientVectorRegister(vrC);
772     jitcDropClientVectorRegister(vrD);
773    
774     jitcClobberCarryAndFlags();
775     NativeReg reg1 = jitcAllocRegister();
776     NativeReg reg2 = jitcAllocRegister();
777     modrm_o modrm;
778    
779     for (int i=0; i<16; i+=4) {
// reg1 = mask word, reg2 = inverted mask word
780     vec_getWord(reg1, vrC, i);
781     asmALU(X86_MOV, reg2, reg1);
782     asmALU(X86_NOT, reg2);
783    
784     asmALU(X86_AND, reg1, x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
785    
786     asmALU(X86_AND, reg2, x86_mem2(modrm, &(gCPU.vr[vrA].b[i])));
787    
788     asmALU(X86_OR, reg1, reg2);
789     vec_setWord(vrD, i, reg1);
790     }
791    
792     return flowContinue;
793     #endif
794     }
795    
796     /* vsrb Vector Shift Right Byte
797     * v.256
798     */
799     void ppc_opc_vsrb()
800     {
801     VECTOR_DEBUG;
802     int vrD, vrA, vrB;
803     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
804     for (int i=0; i<16; i++) {
805     gCPU.vr[vrD].b[i] = gCPU.vr[vrA].b[i] >> (gCPU.vr[vrB].b[i] & 0x7);
806     }
807     }
808     JITCFlow ppc_opc_gen_vsrb()
809     {
810     #if 0
811     ppc_opc_gen_interpret(ppc_opc_vsrb);
812     return flowEndBlock;
813     #else
814     int vrD, vrA, vrB;
815     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
816    
817     jitcFlushClientVectorRegister(vrA);
818     jitcFlushClientVectorRegister(vrB);
819     jitcDropClientVectorRegister(vrD);
820    
821     jitcClobberCarryAndFlags();
822     jitcAllocRegister(NATIVE_REG | ECX);
823     NativeReg8 reg = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
824    
825     for (int i=0; i<16; i++) {
826     vec_getByte(reg, vrA, REG_NO, i);
827     vec_getByte(CL, vrB, REG_NO, i);
828    
829     asmALU(X86_AND, CL, 0x7);
830     asmShift_CL(X86_SHR, reg);
831    
832     vec_setByte(vrD, REG_NO, i, reg);
833     }
834    
835     return flowContinue;
836     #endif
837     }
838    
839     /* vsrh Vector Shift Right Half Word
840     * v.257
841     */
842     void ppc_opc_vsrh()
843     {
844     VECTOR_DEBUG;
845     int vrD, vrA, vrB;
846     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
847     for (int i=0; i<8; i++) {
848     gCPU.vr[vrD].h[i] = gCPU.vr[vrA].h[i] >> (gCPU.vr[vrB].h[i] & 0xf);
849     }
850     }
851     JITCFlow ppc_opc_gen_vsrh()
852     {
853     #if 0
854     ppc_opc_gen_interpret(ppc_opc_vsrh);
855     return flowEndBlock;
856     #else
857     int vrD, vrA, vrB;
858     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
859    
860     jitcFlushClientVectorRegister(vrA);
861     jitcFlushClientVectorRegister(vrB);
862     jitcDropClientVectorRegister(vrD);
863    
864     jitcClobberCarryAndFlags();
865     jitcAllocRegister(NATIVE_REG | ECX);
866     NativeReg16 reg = (NativeReg16)jitcAllocRegister();
867    
868     for (int i=0; i<16; i+=2) {
869     vec_getHalf(reg, vrA, i);
870     vec_getHalf(CX, vrB, i);
871    
872     asmALU(X86_AND, CL, 0x0f);
873     asmShift_CL(X86_SHR, reg);
874    
875     vec_setHalf(vrD, i, reg);
876     }
877    
878     return flowContinue;
879     #endif
880     }
881    
882     /* vsrw Vector Shift Right Word
883     * v.259
884     */
885     void ppc_opc_vsrw()
886     {
887     VECTOR_DEBUG;
888     int vrD, vrA, vrB;
889     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
890     for (int i=0; i<4; i++) {
891     gCPU.vr[vrD].w[i] = gCPU.vr[vrA].w[i] >> (gCPU.vr[vrB].w[i] & 0x1f);
892     }
893     }
894     JITCFlow ppc_opc_gen_vsrw()
895     {
896     #if 0
897     ppc_opc_gen_interpret(ppc_opc_vsrw);
898     return flowEndBlock;
899     #else
900     int vrD, vrA, vrB;
901     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
902    
903     jitcFlushClientVectorRegister(vrA);
904     jitcFlushClientVectorRegister(vrB);
905     jitcDropClientVectorRegister(vrD);
906    
907     jitcClobberCarryAndFlags();
908     jitcAllocRegister(NATIVE_REG | ECX);
909     NativeReg reg = jitcAllocRegister();
910    
911     for (int i=0; i<16; i+=4) {
912     vec_getWord(reg, vrA, i);
913     vec_getWord(ECX, vrB, i);
914    
915     // this instruction auto-masks to 0x1f
916     asmShift_CL(X86_SHR, reg);
917    
918     vec_setWord(vrD, i, reg);
919     }
920    
921     return flowContinue;
922     #endif
923     }
924    
925     /* vsrab Vector Shift Right Arithmetic Byte
926     * v.253
927     */
928     void ppc_opc_vsrab()
929     {
930     VECTOR_DEBUG;
931     int vrD, vrA, vrB;
932     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
933     for (int i=0; i<16; i++) {
934     gCPU.vr[vrD].sb[i] = gCPU.vr[vrA].sb[i] >> (gCPU.vr[vrB].b[i] & 0x7);
935     }
936     }
937     JITCFlow ppc_opc_gen_vsrab()
938     {
939     #if 0
940     ppc_opc_gen_interpret(ppc_opc_vsrab);
941     return flowEndBlock;
942     #else
943     int vrD, vrA, vrB;
944     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
945    
946     jitcFlushClientVectorRegister(vrA);
947     jitcFlushClientVectorRegister(vrB);
948     jitcDropClientVectorRegister(vrD);
949    
950     jitcClobberCarryAndFlags();
951     jitcAllocRegister(NATIVE_REG | ECX);
952     NativeReg8 reg = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
953    
954     for (int i=0; i<16; i++) {
955     vec_getByte(reg, vrA, REG_NO, i);
956     vec_getByte(CL, vrB, REG_NO, i);
957    
958     asmALU(X86_AND, CL, 0x7);
959     asmShift_CL(X86_SAR, reg);
960    
961     vec_setByte(vrD, REG_NO, i, reg);
962     }
963    
964     return flowContinue;
965     #endif
966     }
967    
968     /* vsrah Vector Shift Right Arithmetic Half Word
969     * v.254
970     */
971     void ppc_opc_vsrah()
972     {
973     VECTOR_DEBUG;
974     int vrD, vrA, vrB;
975     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
976     for (int i=0; i<8; i++) {
977     gCPU.vr[vrD].sh[i] = gCPU.vr[vrA].sh[i] >> (gCPU.vr[vrB].h[i] & 0xf);
978     }
979     }
980     JITCFlow ppc_opc_gen_vsrah()
981     {
982     #if 0
983     ppc_opc_gen_interpret(ppc_opc_vsrah);
984     return flowEndBlock;
985     #else
986     int vrD, vrA, vrB;
987     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
988    
989     jitcFlushClientVectorRegister(vrA);
990     jitcFlushClientVectorRegister(vrB);
991     jitcDropClientVectorRegister(vrD);
992    
993     jitcClobberCarryAndFlags();
994     jitcAllocRegister(NATIVE_REG | ECX);
995     NativeReg16 reg = (NativeReg16)jitcAllocRegister();
996    
997     for (int i=0; i<16; i+=2) {
998     vec_getHalf(reg, vrA, i);
999     vec_getHalf(CX, vrB, i);
1000    
1001     asmALU(X86_AND, CL, 0x0f);
1002     asmShift_CL(X86_SAR, reg);
1003    
1004     vec_setHalf(vrD, i, reg);
1005     }
1006    
1007     return flowContinue;
1008     #endif
1009     }
1010    
1011     /* vsraw Vector Shift Right Arithmetic Word
1012     * v.255
1013     */
1014     void ppc_opc_vsraw()
1015     {
1016     VECTOR_DEBUG;
1017     int vrD, vrA, vrB;
1018     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1019     for (int i=0; i<4; i++) {
1020     gCPU.vr[vrD].sw[i] = gCPU.vr[vrA].sw[i] >> (gCPU.vr[vrB].w[i] & 0x1f);
1021     }
1022     }
1023     JITCFlow ppc_opc_gen_vsraw()
1024     {
1025     #if 0
1026     ppc_opc_gen_interpret(ppc_opc_vsraw);
1027     return flowEndBlock;
1028     #else
1029     int vrD, vrA, vrB;
1030     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1031    
1032     jitcFlushClientVectorRegister(vrB);
1033     jitcDropClientVectorRegister(vrD);
1034    
1035     jitcClobberCarryAndFlags();
1036     jitcAllocRegister(NATIVE_REG | ECX);
1037     NativeReg reg = jitcAllocRegister();
1038    
1039     for (int i=0; i<16; i+=4) {
1040     vec_getWord(reg, vrA, i);
1041     vec_getWord(ECX, vrB, i);
1042    
1043     // this instruction auto-masks to 0x1f
1044     asmShift_CL(X86_SAR, reg);
1045    
1046     vec_setWord(vrD, i, reg);
1047     }
1048    
1049     return flowContinue;
1050     #endif
1051     }
1052    
1053     /* vslb Vector Shift Left Byte
1054     * v.240
1055     */
1056     void ppc_opc_vslb()
1057     {
1058     VECTOR_DEBUG;
1059     int vrD, vrA, vrB;
1060     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1061     for (int i=0; i<16; i++) {
1062     gCPU.vr[vrD].b[i] = gCPU.vr[vrA].b[i] << (gCPU.vr[vrB].b[i] & 0x7);
1063     }
1064     }
1065     JITCFlow ppc_opc_gen_vslb()
1066     {
1067     #if 0
1068     ppc_opc_gen_interpret(ppc_opc_vslb);
1069     return flowEndBlock;
1070     #else
1071     int vrD, vrA, vrB;
1072     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1073    
1074     jitcFlushClientVectorRegister(vrA);
1075     jitcFlushClientVectorRegister(vrB);
1076     jitcDropClientVectorRegister(vrD);
1077    
1078     jitcClobberCarryAndFlags();
1079     jitcAllocRegister(NATIVE_REG | ECX);
1080     NativeReg8 reg = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
1081    
1082     for (int i=0; i<16; i++) {
1083     vec_getByte(reg, vrA, REG_NO, i);
1084     vec_getByte(CL, vrB, REG_NO, i);
1085    
1086     asmALU(X86_AND, CL, 0x7);
1087     asmShift_CL(X86_SHL, reg);
1088    
1089     vec_setByte(vrD, REG_NO, i, reg);
1090     }
1091    
1092     return flowContinue;
1093     #endif
1094     }
1095    
1096     /* vslh Vector Shift Left Half Word
1097     * v.242
1098     */
1099     void ppc_opc_vslh()
1100     {
1101     VECTOR_DEBUG;
1102     int vrD, vrA, vrB;
1103     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1104     for (int i=0; i<8; i++) {
1105     gCPU.vr[vrD].h[i] = gCPU.vr[vrA].h[i] << (gCPU.vr[vrB].h[i] & 0xf);
1106     }
1107     }
1108     JITCFlow ppc_opc_gen_vslh()
1109     {
1110     #if 0
1111     ppc_opc_gen_interpret(ppc_opc_vslh);
1112     return flowEndBlock;
1113     #else
1114     int vrD, vrA, vrB;
1115     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1116    
1117     jitcFlushClientVectorRegister(vrA);
1118     jitcFlushClientVectorRegister(vrB);
1119     jitcDropClientVectorRegister(vrD);
1120    
1121     jitcClobberCarryAndFlags();
1122     jitcAllocRegister(NATIVE_REG | ECX);
1123     NativeReg16 reg = (NativeReg16)jitcAllocRegister();
1124    
1125     for (int i=0; i<16; i+=2) {
1126     vec_getHalf(reg, vrA, i);
1127     vec_getHalf(CX, vrB, i);
1128    
1129     asmALU(X86_AND, CL, 0x0f);
1130     asmShift_CL(X86_SHL, reg);
1131    
1132     vec_setHalf(vrD, i, reg);
1133     }
1134    
1135     return flowContinue;
1136     #endif
1137     }
1138    
1139     /* vslw Vector Shift Left Word
1140     * v.244
1141     */
1142     void ppc_opc_vslw()
1143     {
1144     VECTOR_DEBUG;
1145     int vrD, vrA, vrB;
1146     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1147     for (int i=0; i<4; i++) {
1148     gCPU.vr[vrD].w[i] = gCPU.vr[vrA].w[i] << (gCPU.vr[vrB].w[i] & 0x1f);
1149     }
1150     }
1151     JITCFlow ppc_opc_gen_vslw()
1152     {
1153     #if 0
1154     ppc_opc_gen_interpret(ppc_opc_vslw);
1155     return flowEndBlock;
1156     #else
1157     int vrD, vrA, vrB;
1158     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1159    
1160     jitcFlushClientVectorRegister(vrA);
1161     jitcFlushClientVectorRegister(vrB);
1162     jitcDropClientVectorRegister(vrD);
1163    
1164     jitcClobberCarryAndFlags();
1165     jitcAllocRegister(NATIVE_REG | ECX);
1166     NativeReg reg = jitcAllocRegister();
1167    
1168     for (int i=0; i<16; i+=4) {
1169     vec_getWord(reg, vrA, i);
1170     vec_getWord(ECX, vrB, i);
1171    
1172     // this instruction auto-masks to 0x1f
1173     asmShift_CL(X86_SHL, reg);
1174    
1175     vec_setWord(vrD, i, reg);
1176     }
1177    
1178     return flowContinue;
1179     #endif
1180     }
1181    
1182     /* vsr Vector Shift Right
1183     * v.251
1184     */
1185     void ppc_opc_vsr()
1186     {
1187     VECTOR_DEBUG;
1188     int vrD, vrA, vrB;
1189     Vector_t r;
1190     int shift;
1191     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1192    
1193     /* Specs say that the low-order 3 bits of all byte elements in vB
1194     * must be the same, or the result is undefined. So we can just
1195     * use the same low-order 3 bits for all of our shifts.
1196     */
1197     shift = gCPU.vr[vrB].w[0] & 0x7;
1198    
1199     r.d[0] = gCPU.vr[vrA].d[0] >> shift;
1200     r.d[1] = gCPU.vr[vrA].d[1] >> shift;
1201    
1202     VECT_D(r, 1) |= VECT_D(gCPU.vr[vrA], 0) << (64 - shift);
1203    
1204     gCPU.vr[vrD] = r;
1205     }
1206     JITCFlow ppc_opc_gen_vsr()
1207     {
1208     ppc_opc_gen_interpret(ppc_opc_vsr);
1209     return flowEndBlock;
1210     }
1211    
1212     /* vsro Vector Shift Right Octet
1213     * v.258
1214     */
1215     void ppc_opc_vsro()
1216     {
1217     VECTOR_DEBUG;
1218     int vrD, vrA, vrB;
1219     Vector_t r;
1220     int shift, i;
1221     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1222    
1223     shift = (gCPU.vr[vrB].w[0] >> 3) & 0xf;
1224     #if HOST_ENDIANESS == HOST_ENDIANESS_LE
1225     for (i=0; i<(16-shift); i++) {
1226     r.b[i] = gCPU.vr[vrA].b[i+shift];
1227     }
1228    
1229     for (; i<16; i++) {
1230     r.b[i] = 0;
1231     }
1232     #elif HOST_ENDIANESS == HOST_ENDIANESS_BE
1233     for (i=0; i<shift; i++) {
1234     r.b[i] = 0;
1235     }
1236    
1237     for (; i<16; i++) {
1238     r.b[i] = gCPU.vr[vrA].b[i-shift];
1239     }
1240     #else
1241     #error Endianess not supported!
1242     #endif
1243    
1244     gCPU.vr[vrD] = r;
1245     }
1246     JITCFlow ppc_opc_gen_vsro()
1247     {
1248     ppc_opc_gen_interpret(ppc_opc_vsro);
1249     return flowEndBlock;
1250     }
1251    
1252     /* vsl Vector Shift Left
1253     * v.239
1254     */
1255     void ppc_opc_vsl()
1256     {
1257     VECTOR_DEBUG;
1258     int vrD, vrA, vrB;
1259     Vector_t r;
1260     int shift;
1261     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1262    
1263     /* Specs say that the low-order 3 bits of all byte elements in vB
1264     * must be the same, or the result is undefined. So we can just
1265     * use the same low-order 3 bits for all of our shifts.
1266     */
1267     shift = gCPU.vr[vrB].w[0] & 0x7;
1268    
1269     r.d[0] = gCPU.vr[vrA].d[0] << shift;
1270     r.d[1] = gCPU.vr[vrA].d[1] << shift;
1271    
1272     VECT_D(r, 0) |= VECT_D(gCPU.vr[vrA], 1) >> (64 - shift);
1273    
1274     gCPU.vr[vrD] = r;
1275     }
1276     JITCFlow ppc_opc_gen_vsl()
1277     {
1278     ppc_opc_gen_interpret(ppc_opc_vsl);
1279     return flowEndBlock;
1280     }
1281    
1282     /* vslo Vector Shift Left Octet
1283     * v.243
1284     */
1285     void ppc_opc_vslo()
1286     {
1287     VECTOR_DEBUG;
1288     int vrD, vrA, vrB;
1289     Vector_t r;
1290     int shift, i;
1291     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1292    
1293     shift = (gCPU.vr[vrB].w[0] >> 3) & 0xf;
1294     #if HOST_ENDIANESS == HOST_ENDIANESS_LE
1295     for (i=0; i<shift; i++) {
1296     r.b[i] = 0;
1297     }
1298    
1299     for (; i<16; i++) {
1300     r.b[i] = gCPU.vr[vrA].b[i-shift];
1301     }
1302     #elif HOST_ENDIANESS == HOST_ENDIANESS_BE
1303     for (i=0; i<(16-shift); i++) {
1304     r.b[i] = gCPU.vr[vrA].b[i+shift];
1305     }
1306    
1307     for (; i<16; i++) {
1308     r.b[i] = 0;
1309     }
1310     #else
1311     #error Endianess not supported!
1312     #endif
1313    
1314     gCPU.vr[vrD] = r;
1315     }
1316     JITCFlow ppc_opc_gen_vslo()
1317     {
1318     ppc_opc_gen_interpret(ppc_opc_vslo);
1319     return flowEndBlock;
1320     }
1321    
1322     static inline void ppc_opc_gen_helper_vsldoi_left(int vrA, int vrD, int shift, int pshift, NativeReg reg)
1323     {
1324     for (int i=12; i>=shift; i-=4) {
1325     vec_getWord(reg, vrA, i-shift);
1326    
1327     vec_setWord(vrD, i, reg);
1328     }
1329    
1330     if (pshift) {
1331     for (int i=shift+4-pshift; i>=shift; i--) {
1332     vec_getByte((NativeReg8)reg, vrA, REG_NO, i-shift);
1333    
1334     vec_setByte(vrD, REG_NO, i, (NativeReg8)reg);
1335     }
1336     }
1337     }
1338    
1339     static inline void ppc_opc_gen_helper_vsldoi_right(int vrB, int vrD, int shift, int ashift, int bshift, NativeReg reg)
1340     {
1341     for (int i=0; i<bshift; i+=4) {
1342     vec_getWord(reg, vrB, i+ashift);
1343    
1344     vec_setWord(vrD, i, reg);
1345     }
1346    
1347     for (int i=bshift; i<shift; i++) {
1348     vec_getByte((NativeReg8)reg, vrB, REG_NO, i+ashift);
1349    
1350     vec_setByte(vrD, REG_NO, i, (NativeReg8)reg);
1351     }
1352     }
1353    
1354     /* vsldoi Vector Shift Left Double by Octet Immediate
1355     * v.241
1356     */
1357     void ppc_opc_vsldoi()
1358     {
1359     VECTOR_DEBUG_COMMON;
1360     int vrD, vrA, vrB, shift, ashift;
1361     int i;
1362     Vector_t r;
1363     PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, shift);
1364    
1365     shift &= 0xf;
1366     ashift = 16 - shift;
1367    
1368     #if HOST_ENDIANESS == HOST_ENDIANESS_LE
1369     for (i=0; i<shift; i++) {
1370     r.b[i] = gCPU.vr[vrB].b[i+ashift];
1371     }
1372    
1373     for (; i<16; i++) {
1374     r.b[i] = gCPU.vr[vrA].b[i-shift];
1375     }
1376     #elif HOST_ENDIANESS == HOST_ENDIANESS_BE
1377     for (i=0; i<ashift; i++) {
1378     r.b[i] = gCPU.vr[vrA].b[i+shift];
1379     }
1380    
1381     for (; i<16; i++) {
1382     r.b[i] = gCPU.vr[vrB].b[i-ashift];
1383     }
1384     #else
1385     #error Endianess not supported!
1386     #endif
1387    
1388     gCPU.vr[vrD] = r;
1389     }
1390     JITCFlow ppc_opc_gen_vsldoi()
1391     {
1392     #if 0
1393     ppc_opc_gen_interpret(ppc_opc_vsldoi);
1394     return flowEndBlock;
1395     #else
1396     int vrD, vrA, vrB;
1397     int shift, ashift, pshift, bshift;
1398     PPC_OPC_TEMPL_A(gJITC.current_opc, vrD, vrA, vrB, shift);
1399    
1400     jitcFlushClientVectorRegister(vrA);
1401     jitcFlushClientVectorRegister(vrB);
1402     jitcDropClientVectorRegister(vrD);
1403    
1404     shift &= 0xf;
1405     ashift = 16 - shift;
1406     bshift = (shift & ~0x03);
1407     pshift = shift & 0x03;
1408    
1409     NativeReg reg = jitcAllocRegister(NATIVE_REG_8);
1410    
1411     if ((vrD == vrA) && (vrD != vrB)) {
1412     ppc_opc_gen_helper_vsldoi_left(vrA, vrD, shift, pshift, reg);
1413    
1414     ppc_opc_gen_helper_vsldoi_right(vrB, vrD, shift, ashift, bshift, reg);
1415    
1416     return flowContinue;
1417     }
1418    
1419     if (vrD == vrA) {
1420     jitcDropClientVectorRegister(vrT);
1421     ppc_opc_gen_helper_vsldoi_right(vrB, vrT, shift, ashift, bshift, reg);
1422     } else {
1423     ppc_opc_gen_helper_vsldoi_right(vrB, vrD, shift, ashift, bshift, reg);
1424     }
1425    
1426     ppc_opc_gen_helper_vsldoi_left(vrA, vrD, shift, pshift, reg);
1427    
1428     if (vrD == vrA) {
1429     for (int i=0; i<bshift; i+=4) {
1430     vec_getWord(reg, vrT, i);
1431     vec_setWord(vrD, i, reg);
1432     }
1433    
1434     for (int i=bshift; i<shift; i++) {
1435     vec_getByte((NativeReg8)reg, vrT, REG_NO, i);
1436     vec_setByte(vrD, REG_NO, i, (NativeReg8)reg);
1437     }
1438     }
1439    
1440     return flowContinue;
1441     #endif
1442     }
1443    
1444     /* vrlb Vector Rotate Left Byte
1445     * v.234
1446     */
1447     void ppc_opc_vrlb()
1448     {
1449     VECTOR_DEBUG;
1450     int vrD, vrA, vrB, shift;
1451     Vector_t r;
1452     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1453    
1454     for (int i=0; i<16; i++) {
1455     shift = (gCPU.vr[vrB].b[i] & 0x7);
1456    
1457     r.b[i] = gCPU.vr[vrA].b[i] << shift;
1458     r.b[i] |= gCPU.vr[vrA].b[i] >> (8 - shift);
1459     }
1460    
1461     gCPU.vr[vrD] = r;
1462     }
1463     JITCFlow ppc_opc_gen_vrlb()
1464     {
1465     #if 0
1466     ppc_opc_gen_interpret(ppc_opc_vrlb);
1467     return flowEndBlock;
1468     #else
1469    
1470     int vrD, vrA, vrB;
1471     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1472    
1473     jitcFlushClientVectorRegister(vrA);
1474     jitcFlushClientVectorRegister(vrB);
1475     jitcDropClientVectorRegister(vrD);
1476    
1477     jitcClobberCarryAndFlags();
1478     jitcAllocRegister(NATIVE_REG | ECX);
1479     NativeReg8 reg = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
1480    
1481     for (int i=0; i<16; i++) {
1482     vec_getByte(reg, vrA, REG_NO, i);
1483     vec_getByte(CL, vrB, REG_NO, i);
1484    
1485     asmShift_CL(X86_ROL, reg);
1486    
1487     vec_setByte(vrD, REG_NO, i, reg);
1488     }
1489    
1490     return flowContinue;
1491     #endif
1492     }
1493    
1494     /* vrlh Vector Rotate Left Half Word
1495     * v.235
1496     */
1497     void ppc_opc_vrlh()
1498     {
1499     VECTOR_DEBUG;
1500     int vrD, vrA, vrB, shift;
1501     Vector_t r;
1502     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1503    
1504     for (int i=0; i<8; i++) {
1505     shift = (gCPU.vr[vrB].h[i] & 0xf);
1506    
1507     r.h[i] = gCPU.vr[vrA].h[i] << shift;
1508     r.h[i] |= gCPU.vr[vrA].h[i] >> (16 - shift);
1509     }
1510    
1511     gCPU.vr[vrD] = r;
1512     }
1513     JITCFlow ppc_opc_gen_vrlh()
1514     {
1515     ppc_opc_gen_interpret(ppc_opc_vrlh);
1516     return flowEndBlock;
1517     }
1518    
1519     /* vrlw Vector Rotate Left Word
1520     * v.236
1521     */
1522     void ppc_opc_vrlw()
1523     {
1524     VECTOR_DEBUG;
1525     int vrD, vrA, vrB, shift;
1526     Vector_t r;
1527     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1528    
1529     for (int i=0; i<4; i++) {
1530     shift = (gCPU.vr[vrB].w[i] & 0x1F);
1531    
1532     r.w[i] = gCPU.vr[vrA].w[i] << shift;
1533     r.w[i] |= gCPU.vr[vrA].w[i] >> (32 - shift);
1534     }
1535    
1536     gCPU.vr[vrD] = r;
1537     }
1538     JITCFlow ppc_opc_gen_vrlw()
1539     {
1540     ppc_opc_gen_interpret(ppc_opc_vrlw);
1541     return flowEndBlock;
1542     }
1543    
1544     /* With the merges, I just don't see any point in risking that a compiler
1545     * might generate actual alu code to calculate anything when it's
1546     * compile-time known. Plus, it's easier to validate it like this.
1547     */
1548    
1549     /* vmrghb Vector Merge High Byte
1550     * v.195
1551     */
1552     void ppc_opc_vmrghb()
1553     {
1554     VECTOR_DEBUG;
1555     int vrD, vrA, vrB;
1556     Vector_t r;
1557     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1558    
1559     VECT_B(r, 0) = VECT_B(gCPU.vr[vrA], 0);
1560     VECT_B(r, 1) = VECT_B(gCPU.vr[vrB], 0);
1561     VECT_B(r, 2) = VECT_B(gCPU.vr[vrA], 1);
1562     VECT_B(r, 3) = VECT_B(gCPU.vr[vrB], 1);
1563     VECT_B(r, 4) = VECT_B(gCPU.vr[vrA], 2);
1564     VECT_B(r, 5) = VECT_B(gCPU.vr[vrB], 2);
1565     VECT_B(r, 6) = VECT_B(gCPU.vr[vrA], 3);
1566     VECT_B(r, 7) = VECT_B(gCPU.vr[vrB], 3);
1567     VECT_B(r, 8) = VECT_B(gCPU.vr[vrA], 4);
1568     VECT_B(r, 9) = VECT_B(gCPU.vr[vrB], 4);
1569     VECT_B(r,10) = VECT_B(gCPU.vr[vrA], 5);
1570     VECT_B(r,11) = VECT_B(gCPU.vr[vrB], 5);
1571     VECT_B(r,12) = VECT_B(gCPU.vr[vrA], 6);
1572     VECT_B(r,13) = VECT_B(gCPU.vr[vrB], 6);
1573     VECT_B(r,14) = VECT_B(gCPU.vr[vrA], 7);
1574     VECT_B(r,15) = VECT_B(gCPU.vr[vrB], 7);
1575    
1576     gCPU.vr[vrD] = r;
1577     }
1578     JITCFlow ppc_opc_gen_vmrghb()
1579     {
1580     #if 0
1581     ppc_opc_gen_interpret(ppc_opc_vmrghb);
1582     return flowEndBlock;
1583     #else
1584     int vrD, vrA, vrB;
1585     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1586    
1587     if (SSE2_AVAIL) {
1588     noncommutative_operation(PALUB(X86_PUNPCKH), vrD, vrB, vrA);
1589     return flowContinue;
1590     }
1591    
1592     jitcFlushClientVectorRegister(vrA);
1593     jitcFlushClientVectorRegister(vrB);
1594     jitcDropClientVectorRegister(vrD);
1595    
1596     NativeReg8 reg1 = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
1597     NativeReg8 reg2 = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
1598    
1599     vec_getHalf((NativeReg16)reg1, vrB, 8);
1600     vec_getHalf((NativeReg16)reg2, vrA, 8);
1601     asmALU(X86_XCHG, RL2RH(reg1), reg2);
1602     vec_setHalf(vrD, 0, (NativeReg16)reg1);
1603     vec_setHalf(vrD, 2, (NativeReg16)reg2);
1604    
1605     vec_getHalf((NativeReg16)reg1, vrB, 10);
1606     vec_getHalf((NativeReg16)reg2, vrA, 10);
1607     asmALU(X86_XCHG, RL2RH(reg1), reg2);
1608     vec_setHalf(vrD, 4, (NativeReg16)reg1);
1609     vec_setHalf(vrD, 6, (NativeReg16)reg2);
1610    
1611     vec_getHalf((NativeReg16)reg1, vrB, 12);
1612     vec_getHalf((NativeReg16)reg2, vrA, 12);
1613     asmALU(X86_XCHG, RL2RH(reg1), reg2);
1614     vec_setHalf(vrD, 8, (NativeReg16)reg1);
1615     vec_setHalf(vrD, 10, (NativeReg16)reg2);
1616    
1617     vec_getHalf((NativeReg16)reg1, vrB, 14);
1618     vec_getHalf((NativeReg16)reg2, vrA, 14);
1619     asmALU(X86_XCHG, RL2RH(reg1), reg2);
1620     vec_setHalf(vrD, 12, (NativeReg16)reg1);
1621     vec_setHalf(vrD, 14, (NativeReg16)reg2);
1622    
1623     return flowContinue;
1624     #endif
1625     }
1626    
1627     /* vmrghh Vector Merge High Half Word
1628     * v.196
1629     */
1630     void ppc_opc_vmrghh()
1631     {
1632     VECTOR_DEBUG;
1633     int vrD, vrA, vrB;
1634     Vector_t r;
1635     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1636    
1637     VECT_H(r, 0) = VECT_H(gCPU.vr[vrA], 0);
1638     VECT_H(r, 1) = VECT_H(gCPU.vr[vrB], 0);
1639     VECT_H(r, 2) = VECT_H(gCPU.vr[vrA], 1);
1640     VECT_H(r, 3) = VECT_H(gCPU.vr[vrB], 1);
1641     VECT_H(r, 4) = VECT_H(gCPU.vr[vrA], 2);
1642     VECT_H(r, 5) = VECT_H(gCPU.vr[vrB], 2);
1643     VECT_H(r, 6) = VECT_H(gCPU.vr[vrA], 3);
1644     VECT_H(r, 7) = VECT_H(gCPU.vr[vrB], 3);
1645    
1646     gCPU.vr[vrD] = r;
1647     }
1648     JITCFlow ppc_opc_gen_vmrghh()
1649     {
1650     #if 0
1651     ppc_opc_gen_interpret(ppc_opc_vmrghh);
1652     return flowEndBlock;
1653     #else
1654     int vrD, vrA, vrB;
1655     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1656    
1657     if (SSE2_AVAIL) {
1658     noncommutative_operation(PALUW(X86_PUNPCKH), vrD, vrB, vrA);
1659     return flowContinue;
1660     }
1661    
1662     jitcFlushClientVectorRegister(vrA);
1663     jitcFlushClientVectorRegister(vrB);
1664     jitcDropClientVectorRegister(vrD);
1665    
1666     NativeReg16 reg1 = (NativeReg16)jitcAllocRegister();
1667    
1668     vec_getHalf(reg1, vrB, 8);
1669     vec_setHalf(vrD, 0, reg1);
1670     vec_getHalf(reg1, vrB, 10);
1671     vec_setHalf(vrD, 4, reg1);
1672    
1673     vec_getHalf(reg1, vrA, 8);
1674     vec_setHalf(vrD, 2, reg1);
1675     vec_getHalf(reg1, vrA, 10);
1676     vec_setHalf(vrD, 6, reg1);
1677    
1678     vec_getHalf(reg1, vrB, 12);
1679     vec_setHalf(vrD, 8, reg1);
1680    
1681     vec_getHalf(reg1, vrA, 12);
1682     vec_setHalf(vrD, 10, reg1);
1683    
1684     vec_getHalf(reg1, vrB, 14);
1685     vec_setHalf(vrD, 12, reg1);
1686    
1687     if (vrD != vrA) {
1688     vec_getHalf(reg1, vrA, 14);
1689     vec_setHalf(vrD, 14, reg1);
1690     }
1691    
1692     return flowContinue;
1693     #endif
1694     }
1695    
1696     /* vmrghw Vector Merge High Word
1697     * v.197
1698     */
1699     void ppc_opc_vmrghw()
1700     {
1701     VECTOR_DEBUG;
1702     int vrD, vrA, vrB;
1703     Vector_t r;
1704     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1705    
1706     VECT_W(r, 0) = VECT_W(gCPU.vr[vrA], 0);
1707     VECT_W(r, 1) = VECT_W(gCPU.vr[vrB], 0);
1708     VECT_W(r, 2) = VECT_W(gCPU.vr[vrA], 1);
1709     VECT_W(r, 3) = VECT_W(gCPU.vr[vrB], 1);
1710    
1711     gCPU.vr[vrD] = r;
1712     }
1713     JITCFlow ppc_opc_gen_vmrghw()
1714     {
1715     #if 0
1716     ppc_opc_gen_interpret(ppc_opc_vmrghw);
1717     return flowEndBlock;
1718     #else
1719     int vrD, vrA, vrB;
1720     modrm_o modrm;
1721     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1722    
1723     if (SSE2_AVAIL) {
1724     noncommutative_operation(PALUD(X86_PUNPCKH), vrD, vrB, vrA);
1725     return flowContinue;
1726     }
1727    
1728     if (SSE_AVAIL) {
1729     NativeVectorReg dest, src;
1730     int shuf_map = 0x72;
1731     int vrS = vrB;
1732    
1733     if (vrA == vrD) {
1734     dest = jitcGetClientVectorRegisterDirty(vrA);
1735     src = jitcGetClientVectorRegisterMapping(vrB);
1736     } else if (vrB == vrD) {
1737     dest = jitcGetClientVectorRegisterDirty(vrB);
1738     src = jitcGetClientVectorRegisterMapping(vrA);
1739     shuf_map = 0xd8;
1740     vrS = vrA;
1741     } else {
1742     dest = jitcMapClientVectorRegisterDirty(vrD);
1743     src = jitcGetClientVectorRegisterMapping(vrA);
1744    
1745     if (src == VECTREG_NO)
1746     asmMOVAPS(dest, &gCPU.vr[vrA]);
1747     else
1748     asmALUPS(X86_MOVAPS, dest, src);
1749    
1750     src = jitcGetClientVectorRegisterMapping(vrB);
1751     }
1752    
1753     if (src == VECTREG_NO) {
1754     asmSHUFPS(dest, x86_mem2(modrm, &gCPU.vr[vrS]), 0xee);
1755     } else {
1756     asmSHUFPS(dest, src, 0xee);
1757     }
1758    
1759     asmSHUFPS(dest, dest, shuf_map);
1760    
1761     return flowContinue;
1762     }
1763    
1764     jitcFlushClientVectorRegister(vrA);
1765     jitcFlushClientVectorRegister(vrB);
1766     jitcDropClientVectorRegister(vrD);
1767    
1768     NativeReg reg = jitcAllocRegister();
1769     JitcVectorReg vrTMP = vrD;
1770    
1771     if (vrA == vrD || vrB == vrD) {
1772     vrTMP = vrT;
1773     }
1774    
1775     vec_getWord(reg, vrA, 12);
1776     vec_setWord(vrTMP, 12, reg);
1777    
1778     vec_getWord(reg, vrB, 12);
1779     vec_setWord(vrTMP, 8, reg);
1780    
1781     vec_getWord(reg, vrA, 8);
1782     vec_setWord(vrTMP, 4, reg);
1783    
1784     vec_getWord(reg, vrB, 8);
1785     vec_setWord(vrTMP, 0, reg);
1786    
1787     vec_Copy(vrD, vrTMP);
1788    
1789     return flowContinue;
1790     #endif
1791     }
1792    
1793     /* vmrglb Vector Merge Low Byte
1794     * v.198
1795     */
1796     void ppc_opc_vmrglb()
1797     {
1798     VECTOR_DEBUG;
1799     int vrD, vrA, vrB;
1800     Vector_t r;
1801     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1802    
1803     VECT_B(r, 0) = VECT_B(gCPU.vr[vrA], 8);
1804     VECT_B(r, 1) = VECT_B(gCPU.vr[vrB], 8);
1805     VECT_B(r, 2) = VECT_B(gCPU.vr[vrA], 9);
1806     VECT_B(r, 3) = VECT_B(gCPU.vr[vrB], 9);
1807     VECT_B(r, 4) = VECT_B(gCPU.vr[vrA],10);
1808     VECT_B(r, 5) = VECT_B(gCPU.vr[vrB],10);
1809     VECT_B(r, 6) = VECT_B(gCPU.vr[vrA],11);
1810     VECT_B(r, 7) = VECT_B(gCPU.vr[vrB],11);
1811     VECT_B(r, 8) = VECT_B(gCPU.vr[vrA],12);
1812     VECT_B(r, 9) = VECT_B(gCPU.vr[vrB],12);
1813     VECT_B(r,10) = VECT_B(gCPU.vr[vrA],13);
1814     VECT_B(r,11) = VECT_B(gCPU.vr[vrB],13);
1815     VECT_B(r,12) = VECT_B(gCPU.vr[vrA],14);
1816     VECT_B(r,13) = VECT_B(gCPU.vr[vrB],14);
1817     VECT_B(r,14) = VECT_B(gCPU.vr[vrA],15);
1818     VECT_B(r,15) = VECT_B(gCPU.vr[vrB],15);
1819    
1820     gCPU.vr[vrD] = r;
1821     }
1822     JITCFlow ppc_opc_gen_vmrglb()
1823     {
1824     #if 0
1825     ppc_opc_gen_interpret(ppc_opc_vmrglb);
1826     return flowEndBlock;
1827     #else
1828     int vrD, vrA, vrB;
1829     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1830    
1831     if (SSE2_AVAIL) {
1832     noncommutative_operation(PALUB(X86_PUNPCKL), vrD, vrB, vrA);
1833     return flowContinue;
1834     }
1835    
1836     jitcFlushClientVectorRegister(vrA);
1837     jitcFlushClientVectorRegister(vrB);
1838     jitcDropClientVectorRegister(vrD);
1839    
1840     NativeReg8 reg1 = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
1841     NativeReg8 reg2 = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
1842    
1843     vec_getHalf((NativeReg16)reg1, vrB, 6);
1844     vec_getHalf((NativeReg16)reg2, vrA, 6);
1845     asmALU(X86_XCHG, RL2RH(reg1), reg2);
1846     vec_setHalf(vrD, 12, (NativeReg16)reg1);
1847     vec_setHalf(vrD, 14, (NativeReg16)reg2);
1848    
1849     vec_getHalf((NativeReg16)reg1, vrB, 4);
1850     vec_getHalf((NativeReg16)reg2, vrA, 4);
1851     asmALU(X86_XCHG, RL2RH(reg1), reg2);
1852     vec_setHalf(vrD, 8, (NativeReg16)reg1);
1853     vec_setHalf(vrD, 10, (NativeReg16)reg2);
1854    
1855     vec_getHalf((NativeReg16)reg1, vrB, 2);
1856     vec_getHalf((NativeReg16)reg2, vrA, 2);
1857     asmALU(X86_XCHG, RL2RH(reg1), reg2);
1858     vec_setHalf(vrD, 4, (NativeReg16)reg1);
1859     vec_setHalf(vrD, 6, (NativeReg16)reg2);
1860    
1861     vec_getHalf((NativeReg16)reg1, vrB, 0);
1862     vec_getHalf((NativeReg16)reg2, vrA, 0);
1863     asmALU(X86_XCHG, RL2RH(reg1), reg2);
1864     vec_setHalf(vrD, 0, (NativeReg16)reg1);
1865     vec_setHalf(vrD, 2, (NativeReg16)reg2);
1866    
1867     return flowContinue;
1868     #endif
1869     }
1870    
1871     /* vmrglh Vector Merge Low Half Word
1872     * v.199
1873     */
1874     void ppc_opc_vmrglh()
1875     {
1876     VECTOR_DEBUG;
1877     int vrD, vrA, vrB;
1878     Vector_t r;
1879     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1880    
1881     VECT_H(r, 0) = VECT_H(gCPU.vr[vrA], 4);
1882     VECT_H(r, 1) = VECT_H(gCPU.vr[vrB], 4);
1883     VECT_H(r, 2) = VECT_H(gCPU.vr[vrA], 5);
1884     VECT_H(r, 3) = VECT_H(gCPU.vr[vrB], 5);
1885     VECT_H(r, 4) = VECT_H(gCPU.vr[vrA], 6);
1886     VECT_H(r, 5) = VECT_H(gCPU.vr[vrB], 6);
1887     VECT_H(r, 6) = VECT_H(gCPU.vr[vrA], 7);
1888     VECT_H(r, 7) = VECT_H(gCPU.vr[vrB], 7);
1889    
1890     gCPU.vr[vrD] = r;
1891     }
1892     JITCFlow ppc_opc_gen_vmrglh()
1893     {
1894     #if 0
1895     ppc_opc_gen_interpret(ppc_opc_vmrglh);
1896     return flowEndBlock;
1897     #else
1898     int vrD, vrA, vrB;
1899     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1900    
1901     if (SSE2_AVAIL) {
1902     noncommutative_operation(PALUW(X86_PUNPCKL), vrD, vrB, vrA);
1903     return flowContinue;
1904     }
1905    
1906     jitcFlushClientVectorRegister(vrA);
1907     jitcFlushClientVectorRegister(vrB);
1908     jitcDropClientVectorRegister(vrD);
1909    
1910     NativeReg16 reg1 = (NativeReg16)jitcAllocRegister();
1911    
1912     vec_getHalf(reg1, vrA, 6);
1913     vec_setHalf(vrD, 14, reg1);
1914     vec_getHalf(reg1, vrA, 4);
1915     vec_setHalf(vrD, 10, reg1);
1916    
1917     vec_getHalf(reg1, vrB, 6);
1918     vec_setHalf(vrD, 12, reg1);
1919     vec_getHalf(reg1, vrB, 4);
1920     vec_setHalf(vrD, 8, reg1);
1921    
1922     vec_getHalf(reg1, vrA, 2);
1923     vec_setHalf(vrD, 6, reg1);
1924    
1925     vec_getHalf(reg1, vrB, 2);
1926     vec_setHalf(vrD, 4, reg1);
1927    
1928     vec_getHalf(reg1, vrA, 0);
1929     vec_setHalf(vrD, 2, reg1);
1930    
1931     if (vrD != vrB) {
1932     vec_getHalf(reg1, vrB, 0);
1933     vec_setHalf(vrD, 0, reg1);
1934     }
1935    
1936     return flowContinue;
1937     #endif
1938     }
1939    
1940     /* vmrglw Vector Merge Low Word
1941     * v.200
1942     */
1943     void ppc_opc_vmrglw()
1944     {
1945     VECTOR_DEBUG;
1946     int vrD, vrA, vrB;
1947     Vector_t r;
1948     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1949    
1950     VECT_W(r, 0) = VECT_W(gCPU.vr[vrA], 2);
1951     VECT_W(r, 1) = VECT_W(gCPU.vr[vrB], 2);
1952     VECT_W(r, 2) = VECT_W(gCPU.vr[vrA], 3);
1953     VECT_W(r, 3) = VECT_W(gCPU.vr[vrB], 3);
1954    
1955     gCPU.vr[vrD] = r;
1956     }
1957     JITCFlow ppc_opc_gen_vmrglw()
1958     {
1959     #if 0
1960     ppc_opc_gen_interpret(ppc_opc_vmrglw);
1961     return flowEndBlock;
1962     #else
1963     int vrD, vrA, vrB;
1964     modrm_o modrm;
1965     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1966    
1967     if (SSE2_AVAIL) {
1968     noncommutative_operation(PALUD(X86_PUNPCKL), vrD, vrB, vrA);
1969     return flowContinue;
1970     }
1971    
1972     if (SSE_AVAIL) {
1973     NativeVectorReg dest, src;
1974     int shuf_map = 0x72;
1975     int vrS = vrB;
1976    
1977     if (vrA == vrD) {
1978     dest = jitcGetClientVectorRegisterDirty(vrA);
1979     src = jitcGetClientVectorRegisterMapping(vrB);
1980     } else if (vrB == vrD) {
1981     dest = jitcGetClientVectorRegisterDirty(vrB);
1982     src = jitcGetClientVectorRegisterMapping(vrA);
1983     shuf_map = 0xd8;
1984     vrS = vrA;
1985     } else {
1986     dest = jitcGetClientVectorRegister(vrA);
1987     jitcRenameVectorRegisterDirty(dest, vrD);
1988    
1989     src = jitcGetClientVectorRegisterMapping(vrB);
1990     }
1991    
1992     if (src == VECTREG_NO) {
1993     asmSHUFPS(dest, x86_mem2(modrm, &gCPU.vr[vrS]), 0x44);
1994     } else {
1995     asmSHUFPS(dest, src, 0x44);
1996     }
1997    
1998     asmSHUFPS(dest, dest, shuf_map);
1999    
2000     return flowContinue;
2001     }
2002    
2003     NativeReg reg = jitcAllocRegister();
2004    
2005     jitcFlushClientVectorRegister(vrA);
2006     jitcFlushClientVectorRegister(vrB);
2007     jitcDropClientVectorRegister(vrD);
2008    
2009     vec_getWord(reg, vrA, 4);
2010     vec_setWord(vrD, 12, reg);
2011    
2012     vec_getWord(reg, vrB, 4);
2013     vec_setWord(vrD, 8, reg);
2014    
2015     vec_getWord(reg, vrA, 0);
2016     vec_setWord(vrD, 4, reg);
2017    
2018     if (vrB != vrD) {
2019     vec_getWord(reg, vrB, 0);
2020     vec_setWord(vrD, 0, reg);
2021     }
2022    
2023     return flowContinue;
2024     #endif
2025     }
2026    
2027     /* vspltb Vector Splat Byte
2028     * v.245
2029     */
2030     void ppc_opc_vspltb()
2031     {
2032     VECTOR_DEBUG;
2033     int vrD, vrB;
2034     uint32 uimm;
2035     uint64 val;
2036     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, uimm, vrB);
2037    
2038     /* The documentation doesn't stipulate what a value higher than 0xf
2039     * will do. Thus, this is by default an undefined value. We
2040     * are thus doing this the fastest way that won't crash us.
2041     */
2042     val = VECT_B(gCPU.vr[vrB], uimm & 0xf);
2043     val |= (val << 8);
2044     val |= (val << 16);
2045     val |= (val << 32);
2046    
2047     gCPU.vr[vrD].d[0] = val;
2048     gCPU.vr[vrD].d[1] = val;
2049     }
2050     JITCFlow ppc_opc_gen_vspltb()
2051     {
2052     #if 0
2053     ppc_opc_gen_interpret(ppc_opc_vspltb);
2054     return flowEndBlock;
2055     #else
2056     int vrD, vrB;
2057     uint32 uimm;
2058     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, uimm, vrB);
2059    
2060     jitcFlushClientVectorRegister(vrB);
2061     jitcDropClientVectorRegister(vrD);
2062    
2063     jitcClobberCarryAndFlags();
2064     NativeReg reg1 = jitcAllocRegister(NATIVE_REG_8);
2065     NativeReg reg2 = jitcAllocRegister();
2066    
2067     /* The documentation doesn't stipulate what a value higher than 0xf
2068     * will do. Thus, this is by default an undefined value. We
2069     * are thus doing this the fastest way that won't crash us.
2070     */
2071     vec_getByteZ(reg1, vrB, (15 - (uimm & 0xf)));
2072     asmALU(X86_MOV, RL2RH(reg1), (NativeReg8)reg1);
2073    
2074     asmALU(X86_MOV, reg2, reg1);
2075     asmShift(X86_SHL, reg2, 16);
2076     asmALU(X86_OR, reg1, reg2);
2077    
2078     vec_setWord(vrD, 0, reg1);
2079     vec_setWord(vrD, 4, reg1);
2080     vec_setWord(vrD, 8, reg1);
2081     vec_setWord(vrD, 12, reg1);
2082    
2083     return flowContinue;
2084     #endif
2085     }
2086    
2087     /* vsplth Vector Splat Half Word
2088     * v.246
2089     */
2090     void ppc_opc_vsplth()
2091     {
2092     VECTOR_DEBUG;
2093     int vrD, vrB;
2094     uint32 uimm;
2095     uint64 val;
2096     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, uimm, vrB);
2097    
2098     /* The documentation doesn't stipulate what a value higher than 0x7
2099     * will do. Thus, this is by default an undefined value. We
2100     * are thus doing this the fastest way that won't crash us.
2101     */
2102     val = VECT_H(gCPU.vr[vrB], uimm & 0x7);
2103     val |= (val << 16);
2104     val |= (val << 32);
2105    
2106     gCPU.vr[vrD].d[0] = val;
2107     gCPU.vr[vrD].d[1] = val;
2108     }
2109     JITCFlow ppc_opc_gen_vsplth()
2110     {
2111     #if 0
2112     ppc_opc_gen_interpret(ppc_opc_vsplth);
2113     return flowEndBlock;
2114     #else
2115     int vrD, vrB;
2116     uint32 uimm;
2117     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, uimm, vrB);
2118    
2119     jitcFlushClientVectorRegister(vrB);
2120     jitcDropClientVectorRegister(vrD);
2121    
2122     jitcClobberCarryAndFlags();
2123     NativeReg reg1 = jitcAllocRegister();
2124     NativeReg reg2 = jitcAllocRegister();
2125    
2126     /* The documentation doesn't stipulate what a value higher than 0x7
2127     * will do. Thus, this is by default an undefined value. We
2128     * are thus doing this the fastest way that won't crash us.
2129     */
2130     vec_getHalfZ(reg1, vrB, (7 - (uimm & 0x7)) << 1);
2131    
2132     asmALU(X86_MOV, reg2, reg1);
2133     asmShift(X86_SHL, reg2, 16);
2134     asmALU(X86_OR, reg1, reg2);
2135    
2136     vec_setWord(vrD, 0, reg1);
2137     vec_setWord(vrD, 4, reg1);
2138     vec_setWord(vrD, 8, reg1);
2139     vec_setWord(vrD, 12, reg1);
2140    
2141     return flowContinue;
2142     #endif
2143     }
2144    
2145     /* vspltw Vector Splat Word
2146     * v.250
2147     */
2148     void ppc_opc_vspltw()
2149     {
2150     VECTOR_DEBUG;
2151     int vrD, vrB;
2152     uint32 uimm;
2153     uint64 val;
2154     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, uimm, vrB);
2155    
2156     /* The documentation doesn't stipulate what a value higher than 0x3
2157     * will do. Thus, this is by default an undefined value. We
2158     * are thus doing this the fastest way that won't crash us.
2159     */
2160     val = VECT_W(gCPU.vr[vrB], uimm & 0x3);
2161     val |= (val << 32);
2162    
2163     gCPU.vr[vrD].d[0] = val;
2164     gCPU.vr[vrD].d[1] = val;
2165     }
2166     JITCFlow ppc_opc_gen_vspltw()
2167     {
2168     #if 0
2169     ppc_opc_gen_interpret(ppc_opc_vspltw);
2170     return flowEndBlock;
2171     #else
2172     int vrD, vrB;
2173     uint32 uimm;
2174     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, uimm, vrB);
2175    
2176     if (SSE_AVAIL) {
2177     NativeVectorReg reg;
2178    
2179     if (vrD == vrB) {
2180     reg = jitcGetClientVectorRegisterDirty(vrD);
2181     } else {
2182     reg = jitcMapClientVectorRegisterDirty(vrD);
2183     NativeVectorReg reg2 = jitcGetClientVectorRegisterMapping(vrB);
2184    
2185     if (reg2 == VECTREG_NO)
2186     asmMOVAPS(reg, &gCPU.vr[vrB]);
2187     else
2188     asmALUPS(X86_MOVAPS, reg, reg2);
2189     }
2190    
2191     asmSHUFPS(reg, reg, 0xff - ((uimm & 0x3) * 0x55));
2192    
2193     return flowContinue;
2194     }
2195    
2196     jitcFlushClientVectorRegister(vrB);
2197     jitcDropClientVectorRegister(vrD);
2198    
2199     jitcClobberCarryAndFlags();
2200     NativeReg reg1 = jitcAllocRegister();
2201    
2202     /* The documentation doesn't stipulate what a value higher than 0xf
2203     * will do. Thus, this is by default an undefined value. We
2204     * are thus doing this the fastest way that won't crash us.
2205     */
2206     vec_getWord(reg1, vrB, (3 - (uimm & 0x3)) << 2);
2207    
2208     vec_setWord(vrD, 0, reg1);
2209     vec_setWord(vrD, 4, reg1);
2210     vec_setWord(vrD, 8, reg1);
2211     vec_setWord(vrD, 12, reg1);
2212    
2213     return flowContinue;
2214     #endif
2215     }
2216    
2217     /* vspltisb Vector Splat Immediate Signed Byte
2218     * v.247
2219     */
2220     void ppc_opc_vspltisb()
2221     {
2222     VECTOR_DEBUG_COMMON;
2223     int vrD, vrB;
2224     uint32 simm;
2225     uint64 val;
2226     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, simm, vrB);
2227     PPC_OPC_ASSERT(vrB==0);
2228    
2229     val = (simm & 0x10) ? (simm | 0xE0) : simm;
2230     val |= (val << 8);
2231     val |= (val << 16);
2232     val |= (val << 32);
2233    
2234     gCPU.vr[vrD].d[0] = val;
2235     gCPU.vr[vrD].d[1] = val;
2236     }
2237     JITCFlow ppc_opc_gen_vspltisb()
2238     {
2239     #if 0
2240     ppc_opc_gen_interpret(ppc_opc_vspltisb);
2241     return flowEndBlock;
2242     #else
2243     int vrD, vrB;
2244     uint32 simm, val;
2245     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, simm, vrB);
2246    
2247     if (simm == 0) {
2248     vec_Zero(vrD);
2249     return flowContinue;
2250     } else if (simm == 0x1f) {
2251     vec_Neg1(vrD);
2252     return flowContinue;
2253     }
2254    
2255     val = (simm & 0x10) ? (simm | 0xE0) : simm;
2256     val |= (val << 8);
2257     val |= (val << 16);
2258    
2259     if (SSE_AVAIL) {
2260     modrm_o modrm;
2261     NativeVectorReg reg1 = jitcMapClientVectorRegisterDirty(vrD);
2262    
2263     asmALU_D(X86_MOV, x86_mem2(modrm, &gCPU.vtemp), val);
2264     asmMOVSS(reg1, &gCPU.vtemp);
2265     asmSHUFPS(reg1, reg1, 0x00);
2266    
2267     return flowContinue;
2268     }
2269    
2270     jitcDropClientVectorRegister(vrD);
2271    
2272     NativeReg r1 = jitcAllocRegister();
2273     asmALU(X86_MOV, r1, val);
2274    
2275     vec_setWord(vrD, 0, r1);
2276     vec_setWord(vrD, 4, r1);
2277     vec_setWord(vrD, 8, r1);
2278     vec_setWord(vrD,12, r1);
2279    
2280     return flowContinue;
2281     #endif
2282     }
2283    
2284     /* vspltish Vector Splat Immediate Signed Half Word
2285     * v.248
2286     */
2287     void ppc_opc_vspltish()
2288     {
2289     VECTOR_DEBUG_COMMON;
2290     int vrD, vrB;
2291     uint32 simm;
2292     uint64 val;
2293     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, simm, vrB);
2294     PPC_OPC_ASSERT(vrB==0);
2295    
2296     val = (simm & 0x10) ? (simm | 0xFFE0) : simm;
2297     val |= (val << 16);
2298     val |= (val << 32);
2299    
2300     gCPU.vr[vrD].d[0] = val;
2301     gCPU.vr[vrD].d[1] = val;
2302     }
2303     JITCFlow ppc_opc_gen_vspltish()
2304     {
2305     #if 0
2306     ppc_opc_gen_interpret(ppc_opc_vspltish);
2307     return flowEndBlock;
2308     #else
2309     int vrD, vrB;
2310     uint32 simm, val;
2311     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, simm, vrB);
2312    
2313     if (simm == 0) {
2314     vec_Zero(vrD);
2315     return flowContinue;
2316     } else if (simm == 0x1f) {
2317     vec_Neg1(vrD);
2318     return flowContinue;
2319     }
2320    
2321     val = (simm & 0x10) ? (simm | 0xFFE0) : simm;
2322     val |= (val << 16);
2323    
2324     if (SSE_AVAIL) {
2325     modrm_o modrm;
2326     NativeVectorReg reg1 = jitcMapClientVectorRegisterDirty(vrD);
2327    
2328     asmALU_D(X86_MOV, x86_mem2(modrm, &gCPU.vtemp), val);
2329     asmMOVSS(reg1, &gCPU.vtemp);
2330     asmSHUFPS(reg1, reg1, 0x00);
2331    
2332     return flowContinue;
2333     }
2334    
2335     jitcDropClientVectorRegister(vrD);
2336    
2337     NativeReg r1 = jitcAllocRegister();
2338     asmALU(X86_MOV, r1, val);
2339    
2340     vec_setWord(vrD, 0, r1);
2341     vec_setWord(vrD, 4, r1);
2342     vec_setWord(vrD, 8, r1);
2343     vec_setWord(vrD,12, r1);
2344    
2345     return flowContinue;
2346     #endif
2347     }
2348    
2349     /* vspltisw Vector Splat Immediate Signed Word
2350     * v.249
2351     */
2352     void ppc_opc_vspltisw()
2353     {
2354     VECTOR_DEBUG_COMMON;
2355     int vrD, vrB;
2356     uint32 simm;
2357     uint64 val;
2358     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, simm, vrB);
2359     PPC_OPC_ASSERT(vrB==0);
2360    
2361     val = (simm & 0x10) ? (simm | 0xFFFFFFE0) : simm;
2362     val |= (val << 32);
2363    
2364     gCPU.vr[vrD].d[0] = val;
2365     gCPU.vr[vrD].d[1] = val;
2366     }
2367     JITCFlow ppc_opc_gen_vspltisw()
2368     {
2369     #if 0
2370     ppc_opc_gen_interpret(ppc_opc_vspltisw);
2371     return flowEndBlock;
2372     #else
2373     int vrD, vrB;
2374     uint32 simm, val;
2375     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, simm, vrB);
2376    
2377     if (simm == 0) {
2378     vec_Zero(vrD);
2379     return flowContinue;
2380     } else if (simm == 0x1f) {
2381     vec_Neg1(vrD);
2382     return flowContinue;
2383     }
2384    
2385     val = (simm & 0x10) ? (simm | 0xFFFFFFE0) : simm;
2386    
2387     if (SSE_AVAIL) {
2388     modrm_o modrm;
2389     NativeVectorReg reg1 = jitcMapClientVectorRegisterDirty(vrD);
2390    
2391     asmALU_D(X86_MOV, x86_mem2(modrm, &gCPU.vtemp), val);
2392     asmMOVSS(reg1, &gCPU.vtemp);
2393     asmSHUFPS(reg1, reg1, 0x00);
2394    
2395     return flowContinue;
2396     }
2397    
2398     jitcDropClientVectorRegister(vrD);
2399    
2400     NativeReg r1 = jitcAllocRegister();
2401     asmALU(X86_MOV, r1, val);
2402    
2403     vec_setWord(vrD, 0, r1);
2404     vec_setWord(vrD, 4, r1);
2405     vec_setWord(vrD, 8, r1);
2406     vec_setWord(vrD,12, r1);
2407    
2408     return flowContinue;
2409     #endif
2410     }
2411    
2412     /* mfvscr Move from Vector Status and Control Register
2413     * v.129
2414     */
2415     void ppc_opc_mfvscr()
2416     {
2417     VECTOR_DEBUG_COMMON;
2418     int vrD, vrA, vrB;
2419     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2420     PPC_OPC_ASSERT(vrA==0);
2421     PPC_OPC_ASSERT(vrB==0);
2422    
2423     VECT_W(gCPU.vr[vrD], 3) = gCPU.vscr;
2424     VECT_W(gCPU.vr[vrD], 2) = 0;
2425     VECT_D(gCPU.vr[vrD], 0) = 0;
2426     }
2427     JITCFlow ppc_opc_gen_mfvscr()
2428     {
2429     #if 0
2430     ppc_opc_gen_interpret(ppc_opc_mfvscr);
2431     return flowEndBlock;
2432     #else
2433     int vrD, vrA, vrB;
2434     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
2435    
2436     if (SSE_AVAIL) {
2437     NativeVectorReg reg1 = jitcMapClientVectorRegisterDirty(vrD);
2438     NativeReg reg2 = jitcGetClientRegisterMapping(PPC_VSCR);
2439    
2440     if (reg1 != VECTREG_NO)
2441     jitcFlushRegister(NATIVE_REG | reg2);
2442    
2443     asmMOVSS(reg1, &gCPU.vscr);
2444     } else {
2445     jitcDropClientVectorRegister(vrD);
2446    
2447     NativeReg r1 = jitcGetClientRegister(PPC_VSCR);
2448     vec_setWord(vrD, 0, r1);
2449    
2450     jitcClobberRegister(NATIVE_REG | r1);
2451    
2452     asmMOV_NoFlags(r1, 0); // avoid flag clobber
2453     vec_setWord(vrD, 4, r1);
2454     vec_setWord(vrD, 8, r1);
2455     vec_setWord(vrD,12, r1);
2456     }
2457    
2458     return flowContinue;
2459     #endif
2460     }
2461    
2462     /* mtvscr Move to Vector Status and Control Register
2463     * v.130
2464     */
2465     void ppc_opc_mtvscr()
2466     {
2467     VECTOR_DEBUG_COMMON;
2468     int vrD, vrA, vrB;
2469     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2470     PPC_OPC_ASSERT(vrA==0);
2471     PPC_OPC_ASSERT(vrD==0);
2472    
2473     gCPU.vscr = VECT_W(gCPU.vr[vrB], 3);
2474     }
2475     JITCFlow ppc_opc_gen_mtvscr()
2476     {
2477     #if 0
2478     ppc_opc_gen_interpret(ppc_opc_mtvscr);
2479     return flowEndBlock;
2480     #else
2481     int vrD, vrA, vrB;
2482     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
2483    
2484     if (SSE_AVAIL) {
2485     NativeVectorReg reg1 = jitcGetClientVectorRegisterMapping(vrB);
2486    
2487     if (reg1 != VECTREG_NO) {
2488     NativeReg reg2 = jitcGetClientRegisterMapping(PPC_VSCR);
2489    
2490     if (reg2 != REG_NO)
2491     jitcClobberRegister(NATIVE_REG | reg2);
2492    
2493     asmMOVSS(&gCPU.vscr, reg1);
2494     return flowContinue;
2495     }
2496     }
2497    
2498     jitcFlushClientVectorRegister(vrB);
2499    
2500     NativeReg r1 = jitcMapClientRegisterDirty(PPC_VSCR);
2501    
2502     vec_getWord(r1, vrB, 0);
2503     return flowContinue;
2504     #endif
2505     }
2506    
2507     /* vpkuhum Vector Pack Unsigned Half Word Unsigned Modulo
2508     * v.224
2509     */
2510     void ppc_opc_vpkuhum()
2511     {
2512     VECTOR_DEBUG;
2513     int vrD, vrA, vrB;
2514     Vector_t r;
2515     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2516    
2517     VECT_B(r, 0) = VECT_B(gCPU.vr[vrA], 1);
2518     VECT_B(r, 1) = VECT_B(gCPU.vr[vrA], 3);
2519     VECT_B(r, 2) = VECT_B(gCPU.vr[vrA], 5);
2520     VECT_B(r, 3) = VECT_B(gCPU.vr[vrA], 7);
2521     VECT_B(r, 4) = VECT_B(gCPU.vr[vrA], 9);
2522     VECT_B(r, 5) = VECT_B(gCPU.vr[vrA],11);
2523     VECT_B(r, 6) = VECT_B(gCPU.vr[vrA],13);
2524     VECT_B(r, 7) = VECT_B(gCPU.vr[vrA],15);
2525    
2526     VECT_B(r, 8) = VECT_B(gCPU.vr[vrB], 1);
2527     VECT_B(r, 9) = VECT_B(gCPU.vr[vrB], 3);
2528     VECT_B(r,10) = VECT_B(gCPU.vr[vrB], 5);
2529     VECT_B(r,11) = VECT_B(gCPU.vr[vrB], 7);
2530     VECT_B(r,12) = VECT_B(gCPU.vr[vrB], 9);
2531     VECT_B(r,13) = VECT_B(gCPU.vr[vrB],11);
2532     VECT_B(r,14) = VECT_B(gCPU.vr[vrB],13);
2533     VECT_B(r,15) = VECT_B(gCPU.vr[vrB],15);
2534    
2535     gCPU.vr[vrD] = r;
2536     }
2537     JITCFlow ppc_opc_gen_vpkuhum()
2538     {
2539     #if 0
2540     ppc_opc_gen_interpret(ppc_opc_vpkuhum);
2541     return flowEndBlock;
2542     #else
2543     int vrD, vrA, vrB;
2544     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
2545    
2546     jitcFlushClientVectorRegister(vrA);
2547     jitcFlushClientVectorRegister(vrB);
2548     jitcDropClientVectorRegister(vrD);
2549    
2550     jitcClobberCarryAndFlags();
2551     NativeReg8 reg1 = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
2552    
2553     if (vrB == vrD && vrA == vrD) {
2554     for (int i=0; i<16; i += 2) {
2555     vec_getHalf((NativeReg16)reg1, vrA, i);
2556    
2557     vec_setByte(vrT, REG_NO, (i >> 1), reg1);
2558     vec_setByte(vrT, REG_NO, 8+(i >> 1), reg1);
2559     }
2560    
2561     vec_Copy(vrD, vrT);
2562     } else if (vrA == vrD) {
2563     for (int i=0; i<16; i += 2) {
2564     vec_getHalf((NativeReg16)reg1, vrA, 14 - i);
2565     vec_setByte(vrD, REG_NO, 15 - (i >> 1), reg1);
2566     }
2567    
2568     for (int i=0; i<16; i += 2) {
2569     vec_getHalf((NativeReg16)reg1, vrB, i);
2570     vec_setByte(vrD, REG_NO, (i >> 1), reg1);
2571     }
2572     } else {
2573     for (int i=0; i<16; i += 2) {
2574     vec_getHalf((NativeReg16)reg1, vrB, i);
2575     vec_setByte(vrD, REG_NO, (i >> 1), reg1);
2576     }
2577    
2578     for (int i=0; i<16; i += 2) {
2579     vec_getHalf((NativeReg16)reg1, vrA, i);
2580     vec_setByte(vrD, REG_NO, 8 + (i >> 1), reg1);
2581     }
2582     }
2583    
2584     return flowContinue;
2585     #endif
2586     }
2587    
2588     /* vpkuwum Vector Pack Unsigned Word Unsigned Modulo
2589     * v.226
2590     */
2591     void ppc_opc_vpkuwum()
2592     {
2593     VECTOR_DEBUG;
2594     int vrD, vrA, vrB;
2595     Vector_t r;
2596     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2597    
2598     VECT_H(r, 0) = VECT_H(gCPU.vr[vrA], 1);
2599     VECT_H(r, 1) = VECT_H(gCPU.vr[vrA], 3);
2600     VECT_H(r, 2) = VECT_H(gCPU.vr[vrA], 5);
2601     VECT_H(r, 3) = VECT_H(gCPU.vr[vrA], 7);
2602    
2603     VECT_H(r, 4) = VECT_H(gCPU.vr[vrB], 1);
2604     VECT_H(r, 5) = VECT_H(gCPU.vr[vrB], 3);
2605     VECT_H(r, 6) = VECT_H(gCPU.vr[vrB], 5);
2606     VECT_H(r, 7) = VECT_H(gCPU.vr[vrB], 7);
2607    
2608     gCPU.vr[vrD] = r;
2609     }
2610     JITCFlow ppc_opc_gen_vpkuwum()
2611     {
2612     #if 0
2613     ppc_opc_gen_interpret(ppc_opc_vpkuwum);
2614     return flowEndBlock;
2615     #else
2616     int vrD, vrA, vrB;
2617     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
2618    
2619     jitcFlushClientVectorRegister(vrA);
2620     jitcFlushClientVectorRegister(vrB);
2621     jitcDropClientVectorRegister(vrD);
2622    
2623     jitcClobberCarryAndFlags();
2624     NativeReg16 reg1 = (NativeReg16)jitcAllocRegister();
2625    
2626     if (vrB == vrD && vrA == vrD) {
2627     for (int i=0; i<16; i += 4) {
2628     vec_getWord((NativeReg)reg1, vrA, i);
2629    
2630     vec_setHalf(vrT, (i >> 1), reg1);
2631     vec_setHalf(vrT, 8+(i >> 1), reg1);
2632     }
2633    
2634     vec_Copy(vrD, vrT);
2635     } else if (vrA == vrD) {
2636     for (int i=0; i<16; i += 4) {
2637     vec_getWord((NativeReg)reg1, vrA, 12 - i);
2638     vec_setHalf(vrD, 14 - (i >> 1), reg1);
2639     }
2640    
2641     for (int i=0; i<16; i += 4) {
2642     vec_getWord((NativeReg)reg1, vrB, i);
2643     vec_setHalf(vrD, (i >> 1), reg1);
2644     }
2645     } else {
2646     for (int i=0; i<16; i += 4) {
2647     vec_getWord((NativeReg)reg1, vrB, i);
2648     vec_setHalf(vrD, (i >> 1), reg1);
2649     }
2650    
2651     for (int i=0; i<16; i += 4) {
2652     vec_getWord((NativeReg)reg1, vrA, i);
2653     vec_setHalf(vrD, 8 + (i >> 1), reg1);
2654     }
2655     }
2656    
2657     return flowContinue;
2658     #endif
2659     }
2660    
2661     static inline void ppc_opc_gen_helper_vpkpx(NativeReg reg1, NativeReg reg2, NativeReg reg3)
2662     {
2663     asmALU(X86_MOV, reg2, reg1);
2664     asmALU(X86_MOV, reg3, reg1);
2665    
2666     asmALU(X86_AND, reg1, 0x000000f8);
2667     asmALU(X86_AND, reg2, 0x0000f800);
2668     asmALU(X86_AND, reg3, 0x01f80000);
2669    
2670     asmShift(X86_SHR, reg1, 3);
2671     asmShift(X86_SHR, reg2, 6);
2672     asmShift(X86_SHR, reg3, 9);
2673    
2674     asmALU(X86_OR, reg1, reg2);
2675     asmALU(X86_OR, reg1, reg3);
2676     }
2677    
2678     /* vpkpx Vector Pack Pixel32
2679     * v.219
2680     */
2681     void ppc_opc_vpkpx()
2682     {
2683     VECTOR_DEBUG;
2684     int vrD, vrA, vrB;
2685     Vector_t r;
2686     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2687    
2688     VECT_H(r, 0) = PACK_PIXEL(VECT_W(gCPU.vr[vrA], 0));
2689     VECT_H(r, 1) = PACK_PIXEL(VECT_W(gCPU.vr[vrA], 1));
2690     VECT_H(r, 2) = PACK_PIXEL(VECT_W(gCPU.vr[vrA], 2));
2691     VECT_H(r, 3) = PACK_PIXEL(VECT_W(gCPU.vr[vrA], 3));
2692    
2693     VECT_H(r, 4) = PACK_PIXEL(VECT_W(gCPU.vr[vrB], 0));
2694     VECT_H(r, 5) = PACK_PIXEL(VECT_W(gCPU.vr[vrB], 1));
2695     VECT_H(r, 6) = PACK_PIXEL(VECT_W(gCPU.vr[vrB], 2));
2696     VECT_H(r, 7) = PACK_PIXEL(VECT_W(gCPU.vr[vrB], 3));
2697    
2698     gCPU.vr[vrD] = r;
2699     }
2700     JITCFlow ppc_opc_gen_vpkpx()
2701     {
2702     #if 0
2703     ppc_opc_gen_interpret(ppc_opc_vpkpx);
2704     return flowEndBlock;
2705     #else
2706     int vrD, vrA, vrB;
2707     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
2708    
2709     jitcFlushClientVectorRegister(vrA);
2710     jitcFlushClientVectorRegister(vrB);
2711     jitcDropClientVectorRegister(vrD);
2712    
2713     jitcClobberCarryAndFlags();
2714     NativeReg reg1 = jitcAllocRegister();
2715     NativeReg reg2 = jitcAllocRegister();
2716     NativeReg reg3 = jitcAllocRegister();
2717    
2718     if((vrA == vrB)) {
2719     for (int i=0; i<8; i += 2) {
2720     vec_getWord(reg1, vrB, i<<1);
2721     ppc_opc_gen_helper_vpkpx(reg1, reg2, reg3);
2722     vec_setHalf(vrD, i, (NativeReg16)reg1);
2723    
2724     if (vrD != vrA)
2725     vec_setHalf(vrD, i+8, (NativeReg16)reg1);
2726     }
2727    
2728     if (vrD == vrB) {
2729     vec_getWord(reg1, vrD, 0);
2730     vec_getWord(reg2, vrD, 4);
2731    
2732     vec_setWord(vrD, 8, reg1);
2733     vec_setWord(vrD,12, reg2);
2734     }
2735     } else if (vrB == vrD) {
2736     for (int i=0; i<8; i += 2) {
2737     vec_getWord(reg1, vrB, i<<1);
2738     ppc_opc_gen_helper_vpkpx(reg1, reg2, reg3);
2739     vec_setHalf(vrD, i, (NativeReg16)reg1);
2740     }
2741    
2742     for (int i=0; i<8; i += 2) {
2743     vec_getWord(reg1, vrA, i<<1);
2744     ppc_opc_gen_helper_vpkpx(reg1, reg2, reg3);
2745     vec_setHalf(vrD, i+8, (NativeReg16)reg1);
2746     }
2747     } else {
2748     for (int i=0; i<8; i += 2) {
2749     vec_getWord(reg1, vrA, 12-(i<<1));
2750     ppc_opc_gen_helper_vpkpx(reg1, reg2, reg3);
2751     vec_setHalf(vrD, 14-i, (NativeReg16)reg1);
2752     }
2753    
2754     for (int i=0; i<8; i += 2) {
2755     vec_getWord(reg1, vrB, i<<1);
2756     ppc_opc_gen_helper_vpkpx(reg1, reg2, reg3);
2757     vec_setHalf(vrD, i, (NativeReg16)reg1);
2758     }
2759     }
2760    
2761     return flowContinue;
2762     #endif
2763     }
2764    
2765    
2766     /* vpkuhus Vector Pack Unsigned Half Word Unsigned Saturate
2767     * v.225
2768     */
2769     void ppc_opc_vpkuhus()
2770     {
2771     VECTOR_DEBUG;
2772     int vrD, vrA, vrB;
2773     Vector_t r;
2774     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2775    
2776     VECT_B(r, 0) = SATURATE_UB(VECT_H(gCPU.vr[vrA], 0));
2777     VECT_B(r, 1) = SATURATE_UB(VECT_H(gCPU.vr[vrA], 1));
2778     VECT_B(r, 2) = SATURATE_UB(VECT_H(gCPU.vr[vrA], 2));
2779     VECT_B(r, 3) = SATURATE_UB(VECT_H(gCPU.vr[vrA], 3));
2780     VECT_B(r, 4) = SATURATE_UB(VECT_H(gCPU.vr[vrA], 4));
2781     VECT_B(r, 5) = SATURATE_UB(VECT_H(gCPU.vr[vrA], 5));
2782     VECT_B(r, 6) = SATURATE_UB(VECT_H(gCPU.vr[vrA], 6));
2783     VECT_B(r, 7) = SATURATE_UB(VECT_H(gCPU.vr[vrA], 7));
2784    
2785     VECT_B(r, 8) = SATURATE_UB(VECT_H(gCPU.vr[vrB], 0));
2786     VECT_B(r, 9) = SATURATE_UB(VECT_H(gCPU.vr[vrB], 1));
2787     VECT_B(r,10) = SATURATE_UB(VECT_H(gCPU.vr[vrB], 2));
2788     VECT_B(r,11) = SATURATE_UB(VECT_H(gCPU.vr[vrB], 3));
2789     VECT_B(r,12) = SATURATE_UB(VECT_H(gCPU.vr[vrB], 4));
2790     VECT_B(r,13) = SATURATE_UB(VECT_H(gCPU.vr[vrB], 5));
2791     VECT_B(r,14) = SATURATE_UB(VECT_H(gCPU.vr[vrB], 6));
2792     VECT_B(r,15) = SATURATE_UB(VECT_H(gCPU.vr[vrB], 7));
2793    
2794     gCPU.vr[vrD] = r;
2795     }
2796     JITCFlow ppc_opc_gen_vpkuhus()
2797     {
2798     #if 0
2799     ppc_opc_gen_interpret(ppc_opc_vpkuhus);
2800     return flowEndBlock;
2801     #else
2802     int vrD, vrA, vrB;
2803     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
2804    
2805     jitcFlushClientVectorRegister(vrA);
2806     jitcFlushClientVectorRegister(vrB);
2807     jitcDropClientVectorRegister(vrD);
2808    
2809     jitcClobberCarryAndFlags();
2810     NativeReg8 reg1 = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
2811    
2812     if (vrB == vrD && vrA == vrD) {
2813     for (int i=0; i<16; i += 2) {
2814     vec_getHalfZ((NativeReg)reg1, vrA, i);
2815    
2816     vec_saturateUB(reg1);
2817    
2818     vec_setByte(vrT, REG_NO, (i >> 1), reg1);
2819     vec_setByte(vrT, REG_NO, 8+(i >> 1), reg1);
2820     }
2821    
2822     vec_Copy(vrD, vrT);
2823     } else if (vrA == vrD) {
2824     for (int i=0; i<16; i += 2) {
2825     vec_getHalfZ((NativeReg)reg1, vrA, 14 - i);
2826    
2827     vec_saturateUB(reg1);
2828    
2829     vec_setByte(vrD, REG_NO, 15 - (i >> 1), reg1);
2830     }
2831    
2832     for (int i=0; i<16; i += 2) {
2833     vec_getHalfZ((NativeReg)reg1, vrB, i);
2834    
2835     vec_saturateUB(reg1);
2836    
2837     vec_setByte(vrD, REG_NO, (i >> 1), reg1);
2838     }
2839     } else {
2840     for (int i=0; i<16; i += 2) {
2841     vec_getHalfZ((NativeReg)reg1, vrB, i);
2842    
2843     vec_saturateUB(reg1);
2844    
2845     vec_setByte(vrD, REG_NO, (i >> 1), reg1);
2846     }
2847    
2848     for (int i=0; i<16; i += 2) {
2849     vec_getHalfZ((NativeReg)reg1, vrA, i);
2850    
2851     vec_saturateUB(reg1);
2852    
2853     vec_setByte(vrD, REG_NO, 8 + (i >> 1), reg1);
2854     }
2855     }
2856    
2857     return flowContinue;
2858     #endif
2859     }
2860    
2861     /* vpkshss Vector Pack Signed Half Word Signed Saturate
2862     * v.220
2863     */
2864     void ppc_opc_vpkshss()
2865     {
2866     VECTOR_DEBUG;
2867     int vrD, vrA, vrB;
2868     Vector_t r;
2869     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2870    
2871     VECT_B(r, 0) = SATURATE_SB(VECT_H(gCPU.vr[vrA], 0));
2872     VECT_B(r, 1) = SATURATE_SB(VECT_H(gCPU.vr[vrA], 1));
2873     VECT_B(r, 2) = SATURATE_SB(VECT_H(gCPU.vr[vrA], 2));
2874     VECT_B(r, 3) = SATURATE_SB(VECT_H(gCPU.vr[vrA], 3));
2875     VECT_B(r, 4) = SATURATE_SB(VECT_H(gCPU.vr[vrA], 4));
2876     VECT_B(r, 5) = SATURATE_SB(VECT_H(gCPU.vr[vrA], 5));
2877     VECT_B(r, 6) = SATURATE_SB(VECT_H(gCPU.vr[vrA], 6));
2878     VECT_B(r, 7) = SATURATE_SB(VECT_H(gCPU.vr[vrA], 7));
2879    
2880     VECT_B(r, 8) = SATURATE_SB(VECT_H(gCPU.vr[vrB], 0));
2881     VECT_B(r, 9) = SATURATE_SB(VECT_H(gCPU.vr[vrB], 1));
2882     VECT_B(r,10) = SATURATE_SB(VECT_H(gCPU.vr[vrB], 2));
2883     VECT_B(r,11) = SATURATE_SB(VECT_H(gCPU.vr[vrB], 3));
2884     VECT_B(r,12) = SATURATE_SB(VECT_H(gCPU.vr[vrB], 4));
2885     VECT_B(r,13) = SATURATE_SB(VECT_H(gCPU.vr[vrB], 5));
2886     VECT_B(r,14) = SATURATE_SB(VECT_H(gCPU.vr[vrB], 6));
2887     VECT_B(r,15) = SATURATE_SB(VECT_H(gCPU.vr[vrB], 7));
2888    
2889     gCPU.vr[vrD] = r;
2890     }
/* vpkshss has no native code generator here: the interpreter routine
 * ppc_opc_vpkshss is handed to ppc_opc_gen_interpret() and flowEndBlock
 * is returned.
 */
JITCFlow ppc_opc_gen_vpkshss()
{
	ppc_opc_gen_interpret(ppc_opc_vpkshss);
	return flowEndBlock;
}
2896    
2897     /* vpkuwus Vector Pack Unsigned Word Unsigned Saturate
2898     * v.227
2899     */
2900     void ppc_opc_vpkuwus()
2901     {
2902     VECTOR_DEBUG;
2903     int vrD, vrA, vrB;
2904     Vector_t r;
2905     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2906    
2907     VECT_H(r, 0) = SATURATE_UH(VECT_W(gCPU.vr[vrA], 0));
2908     VECT_H(r, 1) = SATURATE_UH(VECT_W(gCPU.vr[vrA], 1));
2909     VECT_H(r, 2) = SATURATE_UH(VECT_W(gCPU.vr[vrA], 2));
2910     VECT_H(r, 3) = SATURATE_UH(VECT_W(gCPU.vr[vrA], 3));
2911    
2912     VECT_H(r, 4) = SATURATE_UH(VECT_W(gCPU.vr[vrB], 0));
2913     VECT_H(r, 5) = SATURATE_UH(VECT_W(gCPU.vr[vrB], 1));
2914     VECT_H(r, 6) = SATURATE_UH(VECT_W(gCPU.vr[vrB], 2));
2915     VECT_H(r, 7) = SATURATE_UH(VECT_W(gCPU.vr[vrB], 3));
2916    
2917     gCPU.vr[vrD] = r;
2918     }
/* vpkuwus has no native code generator here: the interpreter routine
 * ppc_opc_vpkuwus is handed to ppc_opc_gen_interpret() and flowEndBlock
 * is returned.
 */
JITCFlow ppc_opc_gen_vpkuwus()
{
	ppc_opc_gen_interpret(ppc_opc_vpkuwus);
	return flowEndBlock;
}
2924    
2925     /* vpkswss Vector Pack Signed Word Signed Saturate
2926     * v.222
2927     */
2928     void ppc_opc_vpkswss()
2929     {
2930     VECTOR_DEBUG;
2931     int vrD, vrA, vrB;
2932     Vector_t r;
2933     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2934    
2935     VECT_H(r, 0) = SATURATE_SH(VECT_W(gCPU.vr[vrA], 0));
2936     VECT_H(r, 1) = SATURATE_SH(VECT_W(gCPU.vr[vrA], 1));
2937     VECT_H(r, 2) = SATURATE_SH(VECT_W(gCPU.vr[vrA], 2));
2938     VECT_H(r, 3) = SATURATE_SH(VECT_W(gCPU.vr[vrA], 3));
2939    
2940     VECT_H(r, 4) = SATURATE_SH(VECT_W(gCPU.vr[vrB], 0));
2941     VECT_H(r, 5) = SATURATE_SH(VECT_W(gCPU.vr[vrB], 1));
2942     VECT_H(r, 6) = SATURATE_SH(VECT_W(gCPU.vr[vrB], 2));
2943     VECT_H(r, 7) = SATURATE_SH(VECT_W(gCPU.vr[vrB], 3));
2944    
2945     gCPU.vr[vrD] = r;
2946     }
2947     JITCFlow ppc_opc_gen_vpkswss()
2948     {
2949     #if 0
2950     ppc_opc_gen_interpret(ppc_opc_vpkswss);
2951     return flowEndBlock;
2952     #else
2953     int vrD, vrA, vrB;
2954     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
2955    
2956     if (SSE2_AVAIL) {
2957     noncommutative_operation(X86_PACKSSDW, vrD, vrB, vrA);
2958     return flowContinue;
2959     }
2960    
2961     jitcFlushClientVectorRegister(vrA);
2962     jitcFlushClientVectorRegister(vrB);
2963     jitcDropClientVectorRegister(vrD);
2964    
2965     jitcClobberCarryAndFlags();
2966     NativeReg reg1 = jitcAllocRegister();
2967     NativeReg reg2 = jitcAllocRegister();
2968    
2969     if (vrB == vrD && vrA == vrD) {
2970     for (int i=0; i<16; i += 4) {
2971     vec_getWord(reg1, vrA, i);
2972    
2973     vec_saturateSH(reg1, reg2);
2974    
2975     vec_setHalf(vrT, (i >> 1), (NativeReg16)reg1);
2976     vec_setHalf(vrT, 8+(i >> 1), (NativeReg16)reg1);
2977     }
2978    
2979     vec_Copy(vrD, vrT);
2980     } else if (vrA == vrD) {
2981     for (int i=0; i<16; i += 4) {
2982     vec_getWord(reg1, vrA, 12 - i);
2983    
2984     vec_saturateSH(reg1, reg2);
2985    
2986     vec_setHalf(vrD, 14 - (i >> 1), (NativeReg16)reg1);
2987     }
2988    
2989     for (int i=0; i<16; i += 4) {
2990     vec_getWord(reg1, vrB, i);
2991    
2992     vec_saturateSH(reg1, reg2);
2993    
2994     vec_setHalf(vrD, (i >> 1), (NativeReg16)reg1);
2995     }
2996     } else {
2997     for (int i=0; i<16; i += 4) {
2998     vec_getWord(reg1, vrB, i);
2999    
3000     vec_saturateSH(reg1, reg2);
3001    
3002     vec_setHalf(vrD, (i >> 1), (NativeReg16)reg1);
3003     }
3004    
3005     for (int i=0; i<16; i += 4) {
3006     vec_getWord(reg1, vrA, i);
3007    
3008     vec_saturateSH(reg1, reg2);
3009    
3010     vec_setHalf(vrD, 8 + (i >> 1), (NativeReg16)reg1);
3011     }
3012     }
3013    
3014     return flowContinue;
3015     #endif
3016     }
3017    
3018     /* vpkshus Vector Pack Signed Half Word Unsigned Saturate
3019     * v.221
3020     */
3021     void ppc_opc_vpkshus()
3022     {
3023     VECTOR_DEBUG;
3024     int vrD, vrA, vrB;
3025     Vector_t r;
3026     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3027    
3028     VECT_B(r, 0) = SATURATE_USB(VECT_H(gCPU.vr[vrA], 0));
3029     VECT_B(r, 1) = SATURATE_USB(VECT_H(gCPU.vr[vrA], 1));
3030     VECT_B(r, 2) = SATURATE_USB(VECT_H(gCPU.vr[vrA], 2));
3031     VECT_B(r, 3) = SATURATE_USB(VECT_H(gCPU.vr[vrA], 3));
3032     VECT_B(r, 4) = SATURATE_USB(VECT_H(gCPU.vr[vrA], 4));
3033     VECT_B(r, 5) = SATURATE_USB(VECT_H(gCPU.vr[vrA], 5));
3034     VECT_B(r, 6) = SATURATE_USB(VECT_H(gCPU.vr[vrA], 6));
3035     VECT_B(r, 7) = SATURATE_USB(VECT_H(gCPU.vr[vrA], 7));
3036    
3037     VECT_B(r, 8) = SATURATE_USB(VECT_H(gCPU.vr[vrB], 0));
3038     VECT_B(r, 9) = SATURATE_USB(VECT_H(gCPU.vr[vrB], 1));
3039     VECT_B(r,10) = SATURATE_USB(VECT_H(gCPU.vr[vrB], 2));
3040     VECT_B(r,11) = SATURATE_USB(VECT_H(gCPU.vr[vrB], 3));
3041     VECT_B(r,12) = SATURATE_USB(VECT_H(gCPU.vr[vrB], 4));
3042     VECT_B(r,13) = SATURATE_USB(VECT_H(gCPU.vr[vrB], 5));
3043     VECT_B(r,14) = SATURATE_USB(VECT_H(gCPU.vr[vrB], 6));
3044     VECT_B(r,15) = SATURATE_USB(VECT_H(gCPU.vr[vrB], 7));
3045    
3046     gCPU.vr[vrD] = r;
3047     }
3048     JITCFlow ppc_opc_gen_vpkshus()
3049     {
3050     #if 0
3051     ppc_opc_gen_interpret(ppc_opc_vpkshus);
3052     return flowEndBlock;
3053     #else
3054     int vrD, vrA, vrB;
3055     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
3056    
3057     if (SSE2_AVAIL) {
3058     noncommutative_operation(X86_PACKUSWB, vrD, vrB, vrA);
3059     return flowContinue;
3060     }
3061    
3062     NativeReg reg1 = jitcAllocRegister(NATIVE_REG_8);
3063     NativeReg reg2 = jitcAllocRegister();
3064    
3065     if (vrB == vrD && vrA == vrD) {
3066     for (int i=0; i<16; i += 2) {
3067     vec_getHalfS(reg1, vrA, i);
3068    
3069     vec_saturateSUB(reg1, reg2);
3070    
3071     vec_setByte(vrT, REG_NO, (i >> 1), (NativeReg8)reg1);
3072     vec_setByte(vrT, REG_NO, 8+(i >> 1), (NativeReg8)reg1);
3073     }
3074    
3075     vec_Copy(vrD, vrT);
3076     } else if (vrA == vrD) {
3077     for (int i=0; i<16; i += 2) {
3078     vec_getHalfS(reg1, vrA, 14 - i);
3079    
3080     vec_saturateSUB(reg1, reg2);
3081    
3082     vec_setByte(vrD, REG_NO, 15 - (i >> 1), (NativeReg8)reg1);
3083     }
3084    
3085     for (int i=0; i<16; i += 2) {
3086     vec_getHalfS(reg1, vrB, i);
3087    
3088     vec_saturateSUB(reg1, reg2);
3089    
3090     vec_setByte(vrD, REG_NO, (i >> 1), (NativeReg8)reg1);
3091     }
3092     } else {
3093     for (int i=0; i<16; i += 2) {
3094     vec_getHalfS(reg1, vrB, i);
3095    
3096     vec_saturateSUB(reg1, reg2);
3097    
3098     vec_setByte(vrD, REG_NO, (i >> 1), (NativeReg8)reg1);
3099     }
3100    
3101     for (int i=0; i<16; i += 2) {
3102     vec_getHalfS(reg1, vrA, i);
3103    
3104     vec_saturateSUB(reg1, reg2);
3105    
3106     vec_setByte(vrD, REG_NO, 8 + (i >> 1), (NativeReg8)reg1);
3107     }
3108     }
3109    
3110     return flowContinue;
3111     #endif
3112     }
3113    
3114     /* vpkswus Vector Pack Signed Word Unsigned Saturate
3115     * v.223
3116     */
3117     void ppc_opc_vpkswus()
3118     {
3119     VECTOR_DEBUG;
3120     int vrD, vrA, vrB;
3121     Vector_t r;
3122     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3123    
3124     VECT_H(r, 0) = SATURATE_USH(VECT_W(gCPU.vr[vrA], 0));
3125     VECT_H(r, 1) = SATURATE_USH(VECT_W(gCPU.vr[vrA], 1));
3126     VECT_H(r, 2) = SATURATE_USH(VECT_W(gCPU.vr[vrA], 2));
3127     VECT_H(r, 3) = SATURATE_USH(VECT_W(gCPU.vr[vrA], 3));
3128    
3129     VECT_H(r, 4) = SATURATE_USH(VECT_W(gCPU.vr[vrB], 0));
3130     VECT_H(r, 5) = SATURATE_USH(VECT_W(gCPU.vr[vrB], 1));
3131     VECT_H(r, 6) = SATURATE_USH(VECT_W(gCPU.vr[vrB], 2));
3132     VECT_H(r, 7) = SATURATE_USH(VECT_W(gCPU.vr[vrB], 3));
3133    
3134     gCPU.vr[vrD] = r;
3135     }
/* vpkswus has no native code generator here: the interpreter routine
 * ppc_opc_vpkswus is handed to ppc_opc_gen_interpret() and flowEndBlock
 * is returned.
 */
JITCFlow ppc_opc_gen_vpkswus()
{
	ppc_opc_gen_interpret(ppc_opc_vpkswus);
	return flowEndBlock;
}
3141    
3142     /* vupkhsb Vector Unpack High Signed Byte
3143     * v.277
3144     */
3145     void ppc_opc_vupkhsb()
3146     {
3147     VECTOR_DEBUG;
3148     int vrD, vrA, vrB;
3149     Vector_t r;
3150     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3151     PPC_OPC_ASSERT(vrA==0);
3152    
3153     VECT_SH(r, 0) = VECT_SB(gCPU.vr[vrB], 0);
3154     VECT_SH(r, 1) = VECT_SB(gCPU.vr[vrB], 1);
3155     VECT_SH(r, 2) = VECT_SB(gCPU.vr[vrB], 2);
3156     VECT_SH(r, 3) = VECT_SB(gCPU.vr[vrB], 3);
3157     VECT_SH(r, 4) = VECT_SB(gCPU.vr[vrB], 4);
3158     VECT_SH(r, 5) = VECT_SB(gCPU.vr[vrB], 5);
3159     VECT_SH(r, 6) = VECT_SB(gCPU.vr[vrB], 6);
3160     VECT_SH(r, 7) = VECT_SB(gCPU.vr[vrB], 7);
3161    
3162     gCPU.vr[vrD] = r;
3163     }
3164     JITCFlow ppc_opc_gen_vupkhsb()
3165     {
3166     #if 0
3167     ppc_opc_gen_interpret(ppc_opc_vupkhsb);
3168     return flowEndBlock;
3169     #else
3170     int vrD, vrA, vrB;
3171     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
3172    
3173     jitcFlushClientVectorRegister(vrB);
3174     jitcDropClientVectorRegister(vrD);
3175    
3176     NativeReg reg = jitcAllocRegister();
3177    
3178     for (int i=0; i<16; i += 2) {
3179     vec_getByteS(reg, vrB, 8 + (i >> 1));
3180    
3181     vec_setHalf(vrD, i, (NativeReg16)reg);
3182     }
3183    
3184     return flowContinue;
3185     #endif
3186     }
3187    
3188     /* vupkhpx Vector Unpack High Pixel32
3189     * v.279
3190     */
3191     void ppc_opc_vupkhpx()
3192     {
3193     VECTOR_DEBUG;
3194     int vrD, vrA, vrB;
3195     Vector_t r;
3196     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3197     PPC_OPC_ASSERT(vrA==0);
3198    
3199     VECT_W(r, 0) = UNPACK_PIXEL(VECT_H(gCPU.vr[vrB], 0));
3200     VECT_W(r, 1) = UNPACK_PIXEL(VECT_H(gCPU.vr[vrB], 1));
3201     VECT_W(r, 2) = UNPACK_PIXEL(VECT_H(gCPU.vr[vrB], 2));
3202     VECT_W(r, 3) = UNPACK_PIXEL(VECT_H(gCPU.vr[vrB], 3));
3203    
3204     gCPU.vr[vrD] = r;
3205     }
3206     JITCFlow ppc_opc_gen_vupkhpx()
3207     {
3208     ppc_opc_gen_interpret(ppc_opc_vupkhpx);
3209     return flowEndBlock;
3210     }
3211    
3212     /* vupkhsh Vector Unpack High Signed Half Word
3213     * v.278
3214     */
3215     void ppc_opc_vupkhsh()
3216     {
3217     VECTOR_DEBUG;
3218     int vrD, vrA, vrB;
3219     Vector_t r;
3220     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3221     PPC_OPC_ASSERT(vrA==0);
3222    
3223     VECT_SW(r, 0) = VECT_SH(gCPU.vr[vrB], 0);
3224     VECT_SW(r, 1) = VECT_SH(gCPU.vr[vrB], 1);
3225     VECT_SW(r, 2) = VECT_SH(gCPU.vr[vrB], 2);
3226     VECT_SW(r, 3) = VECT_SH(gCPU.vr[vrB], 3);
3227    
3228     gCPU.vr[vrD] = r;
3229     }
3230     JITCFlow ppc_opc_gen_vupkhsh()
3231     {
3232     ppc_opc_gen_interpret(ppc_opc_vupkhsh);
3233     return flowEndBlock;
3234     }
3235    
3236     /* vupklsb Vector Unpack Low Signed Byte
3237     * v.280
3238     */
3239     void ppc_opc_vupklsb()
3240     {
3241     VECTOR_DEBUG;
3242     int vrD, vrA, vrB;
3243     Vector_t r;
3244     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3245     PPC_OPC_ASSERT(vrA==0);
3246    
3247     VECT_SH(r, 0) = VECT_SB(gCPU.vr[vrB], 8);
3248     VECT_SH(r, 1) = VECT_SB(gCPU.vr[vrB], 9);
3249     VECT_SH(r, 2) = VECT_SB(gCPU.vr[vrB],10);
3250     VECT_SH(r, 3) = VECT_SB(gCPU.vr[vrB],11);
3251     VECT_SH(r, 4) = VECT_SB(gCPU.vr[vrB],12);
3252     VECT_SH(r, 5) = VECT_SB(gCPU.vr[vrB],13);
3253     VECT_SH(r, 6) = VECT_SB(gCPU.vr[vrB],14);
3254     VECT_SH(r, 7) = VECT_SB(gCPU.vr[vrB],15);
3255    
3256     gCPU.vr[vrD] = r;
3257     }
3258     JITCFlow ppc_opc_gen_vupklsb()
3259     {
3260     #if 0
3261     ppc_opc_gen_interpret(ppc_opc_vupklsb);
3262     return flowEndBlock;
3263     #else
3264     int vrD, vrA, vrB;
3265     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
3266    
3267     jitcFlushClientVectorRegister(vrB);
3268     jitcDropClientVectorRegister(vrD);
3269    
3270     NativeReg reg = jitcAllocRegister();
3271    
3272     for (int i=0; i<16; i += 2) {
3273     vec_getByteS(reg, vrB, 7 - (i >> 1));
3274    
3275     vec_setHalf(vrD, 14 - i, (NativeReg16)reg);
3276     }
3277    
3278     return flowContinue;
3279     #endif
3280     }
3281    
3282     /* vupklpx Vector Unpack Low Pixel32
3283     * v.279
3284     */
3285     void ppc_opc_vupklpx()
3286     {
3287     VECTOR_DEBUG;
3288     int vrD, vrA, vrB;
3289     Vector_t r;
3290     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3291     PPC_OPC_ASSERT(vrA==0);
3292    
3293     VECT_W(r, 0) = UNPACK_PIXEL(VECT_H(gCPU.vr[vrB], 4));
3294     VECT_W(r, 1) = UNPACK_PIXEL(VECT_H(gCPU.vr[vrB], 5));
3295     VECT_W(r, 2) = UNPACK_PIXEL(VECT_H(gCPU.vr[vrB], 6));
3296     VECT_W(r, 3) = UNPACK_PIXEL(VECT_H(gCPU.vr[vrB], 7));
3297    
3298     gCPU.vr[vrD] = r;
3299     }
3300     JITCFlow ppc_opc_gen_vupklpx()
3301     {
3302     ppc_opc_gen_interpret(ppc_opc_vupklpx);
3303     return flowEndBlock;
3304     }
3305    
3306     /* vupklsh Vector Unpack Low Signed Half Word
3307     * v.281
3308     */
3309     void ppc_opc_vupklsh()
3310     {
3311     VECTOR_DEBUG;
3312     int vrD, vrA, vrB;
3313     Vector_t r;
3314     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3315     PPC_OPC_ASSERT(vrA==0);
3316    
3317     VECT_SW(r, 0) = VECT_SH(gCPU.vr[vrB], 4);
3318     VECT_SW(r, 1) = VECT_SH(gCPU.vr[vrB], 5);
3319     VECT_SW(r, 2) = VECT_SH(gCPU.vr[vrB], 6);
3320     VECT_SW(r, 3) = VECT_SH(gCPU.vr[vrB], 7);
3321    
3322     gCPU.vr[vrD] = r;
3323     }
3324     JITCFlow ppc_opc_gen_vupklsh()
3325     {
3326     ppc_opc_gen_interpret(ppc_opc_vupklsh);
3327     return flowEndBlock;
3328     }
3329    
3330     static inline void helper_gen_vaddubm(NativeReg reg, NativeReg reg2, modrm_p modrma, modrm_p modrmb)
3331     {
3332     asmALU(X86_MOV, reg, modrma);
3333     asmALU(X86_MOV, reg2, reg);
3334    
3335     asmALU(X86_AND, reg, modrmb);
3336     asmALU(X86_XOR, reg2, modrmb);
3337    
3338     asmShift(X86_SHL, reg, 1);
3339     asmALU(X86_AND, reg2, 0xfefefefe);
3340    
3341     asmALU(X86_ADD, reg2, reg);
3342     asmALU(X86_AND, reg2, 0x01010100);
3343     }
3344    
3345     /* vaddubm Vector Add Unsigned Byte Modulo
3346     * v.141
3347     */
3348     void ppc_opc_vaddubm()
3349     {
3350     VECTOR_DEBUG;
3351     int vrD, vrA, vrB;
3352     uint8 res;
3353     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3354    
3355     for (int i=0; i<16; i++) {
3356     res = gCPU.vr[vrA].b[i] + gCPU.vr[vrB].b[i];
3357     gCPU.vr[vrD].b[i] = res;
3358     }
3359     }
3360     JITCFlow ppc_opc_gen_vaddubm()
3361     {
3362     #if 0
3363     ppc_opc_gen_interpret(ppc_opc_vaddubm);
3364     return flowEndBlock;
3365     #else
3366     int vrD, vrA, vrB;
3367     modrm_o modrma, modrmb;
3368     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
3369    
3370     if (SSE2_AVAIL) {
3371     commutative_operation(PALUB(X86_PADD), vrD, vrA, vrB);
3372     return flowContinue;
3373     }
3374    
3375     jitcFlushClientVectorRegister(vrA);
3376     jitcFlushClientVectorRegister(vrB);
3377     jitcDropClientVectorRegister(vrD);
3378    
3379     jitcClobberCarryAndFlags();
3380     NativeReg reg = jitcAllocRegister();
3381     NativeReg reg2 = jitcAllocRegister();
3382    
3383     if ((vrA == vrD) || (vrB == vrD)) {
3384     int vrS = vrB;
3385    
3386     if (vrD == vrB) {
3387     vrS = vrA;
3388     }
3389    
3390     for (int i=0; i<16; i += 4) {
3391     x86_mem2(modrma, &(gCPU.vr[vrD].b[i]));
3392     x86_mem2(modrmb, &(gCPU.vr[vrS].b[i]));
3393    
3394     helper_gen_vaddubm(reg, reg2, modrma, modrmb);
3395    
3396     asmALU(X86_SUB, reg2, modrmb);
3397     asmALU(X86_SUB, modrma, reg2);
3398     }
3399     } else {
3400     for (int i=0; i<16; i += 4) {
3401     x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
3402     x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
3403    
3404     helper_gen_vaddubm(reg, reg2, modrma, modrmb);
3405    
3406     asmALU(X86_MOV, reg, modrmb);
3407     asmALU(X86_SUB, reg, reg2);
3408     asmALU(X86_ADD, reg, modrma);
3409    
3410     vec_setWord(vrD, i, reg);
3411     }
3412     }
3413    
3414     return flowContinue;
3415     #endif
3416     }
3417    
3418     static inline void helper_gen_vadduhm(NativeReg reg, NativeReg reg2, modrm_p modrma, modrm_p modrmb)
3419     {
3420     asmALU(X86_MOV, reg, modrma);
3421     asmALU(X86_MOV, reg2, reg);
3422    
3423     asmALU(X86_AND, reg, modrmb);
3424     asmALU(X86_XOR, reg2, modrmb);
3425    
3426     asmShift(X86_SHL, reg, 1);
3427     asmALU(X86_AND, reg2, 0xfffefffe);
3428    
3429     asmALU(X86_ADD, reg2, reg);
3430     asmALU(X86_AND, reg2, 0x00010000);
3431     }
3432    
3433     /* vadduhm Vector Add Unsigned Half Word Modulo
3434     * v.143
3435     */
3436     void ppc_opc_vadduhm()
3437     {
3438     VECTOR_DEBUG;
3439     int vrD, vrA, vrB;
3440     uint16 res;
3441     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3442    
3443     for (int i=0; i<8; i++) {
3444     res = gCPU.vr[vrA].h[i] + gCPU.vr[vrB].h[i];
3445     gCPU.vr[vrD].h[i] = res;
3446     }
3447     }
3448     JITCFlow ppc_opc_gen_vadduhm()
3449     {
3450     #if 0
3451     ppc_opc_gen_interpret(ppc_opc_vadduhm);
3452     return flowEndBlock;
3453     #else
3454     int vrD, vrA, vrB;
3455     modrm_o modrma, modrmb;
3456     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
3457    
3458     if (SSE2_AVAIL) {
3459     commutative_operation(PALUW(X86_PADD), vrD, vrA, vrB);
3460     return flowContinue;
3461     }
3462    
3463     jitcFlushClientVectorRegister(vrA);
3464     jitcFlushClientVectorRegister(vrB);
3465     jitcDropClientVectorRegister(vrD);
3466    
3467     jitcClobberCarryAndFlags();
3468     NativeReg reg = jitcAllocRegister();
3469     NativeReg reg2 = jitcAllocRegister();
3470    
3471     if ((vrA == vrD) || (vrB == vrD)) {
3472     int vrS = vrB;
3473    
3474     if (vrD == vrB) {
3475     vrS = vrA;
3476     }
3477    
3478     for (int i=0; i<16; i += 4) {
3479     x86_mem2(modrma, &(gCPU.vr[vrD].b[i]));
3480     x86_mem2(modrmb, &(gCPU.vr[vrS].b[i]));
3481    
3482     helper_gen_vadduhm(reg, reg2, modrma, modrmb);
3483    
3484     asmALU(X86_SUB, reg2, modrmb);
3485     asmALU(X86_SUB, modrma, reg2);
3486     }
3487     } else {
3488     for (int i=0; i<16; i += 4) {
3489     x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
3490     x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
3491    
3492     helper_gen_vadduhm(reg, reg2, modrma, modrmb);
3493    
3494     asmALU(X86_MOV, reg, modrmb);
3495     asmALU(X86_SUB, reg, reg2);
3496     asmALU(X86_ADD, reg, modrma);
3497    
3498     vec_setWord(vrD, i, reg);
3499     }
3500     }
3501    
3502     return flowContinue;
3503     #endif
3504     }
3505    
3506     /* vadduwm Vector Add Unsigned Word Modulo
3507     * v.145
3508     */
3509     void ppc_opc_vadduwm()
3510     {
3511     VECTOR_DEBUG;
3512     int vrD, vrA, vrB;
3513     uint32 res;
3514     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3515    
3516     for (int i=0; i<4; i++) {
3517     res = gCPU.vr[vrA].w[i] + gCPU.vr[vrB].w[i];
3518     gCPU.vr[vrD].w[i] = res;
3519     }
3520     }
3521     JITCFlow ppc_opc_gen_vadduwm()
3522     {
3523     #if 0
3524     ppc_opc_gen_interpret(ppc_opc_vadduwm);
3525     return flowEndBlock;
3526     #else
3527     int vrD, vrA, vrB;
3528     modrm_o modrm;
3529     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
3530    
3531     jitcFlushClientVectorRegister(vrA);
3532     jitcFlushClientVectorRegister(vrB);
3533     jitcDropClientVectorRegister(vrD);
3534     NativeReg reg = jitcAllocRegister();
3535    
3536     if (SSE2_AVAIL) {
3537     commutative_operation(PALUD(X86_PADD), vrD, vrA, vrB);
3538     return flowContinue;
3539     }
3540    
3541     if ((vrA == vrD) || (vrB == vrD)) {
3542     int vrS = vrB;
3543    
3544     if (vrD == vrB) {
3545     vrS = vrA;
3546     }
3547    
3548     for (int i=0; i<16; i += 4) {
3549     vec_getWord(reg, vrS, i);
3550    
3551     asmALU(X86_ADD,
3552     x86_mem2(modrm, &(gCPU.vr[vrD].b[i])), reg);
3553     }
3554     } else {
3555     for (int i=0; i<16; i += 4) {
3556     vec_getWord(reg, vrA, i);
3557    
3558     asmALU(X86_ADD, reg,
3559     x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
3560    
3561     vec_setWord(vrD, i, reg);
3562     }
3563     }
3564    
3565     return flowContinue;
3566     #endif
3567     }
3568    
3569     /* vaddfp Vector Add Float Point
3570     * v.137
3571     */
3572     void ppc_opc_vaddfp()
3573     {
3574     VECTOR_DEBUG;
3575     int vrD, vrA, vrB;
3576     float res;
3577     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3578    
3579     for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
3580     res = gCPU.vr[vrA].f[i] + gCPU.vr[vrB].f[i];
3581     gCPU.vr[vrD].f[i] = res;
3582     }
3583     }
3584     JITCFlow ppc_opc_gen_vaddfp()
3585     {
3586     #if 0
3587     ppc_opc_gen_interpret(ppc_opc_vaddfp);
3588     return flowEndBlock;
3589     #else
3590     int vrD, vrA, vrB;
3591     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
3592    
3593     if (SSE_AVAIL) {
3594     commutative_operation(X86_ADDPS, vrD, vrA, vrB);
3595     return flowContinue;
3596     }
3597    
3598     ppc_opc_gen_interpret(ppc_opc_vaddfp);
3599     return flowEndBlock;
3600     #endif
3601     }
3602    
3603     /* vaddcuw Vector Add Carryout Unsigned Word
3604     * v.136
3605     */
3606     void ppc_opc_vaddcuw()
3607     {
3608     VECTOR_DEBUG;
3609     int vrD, vrA, vrB;
3610     uint32 res;
3611     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3612    
3613     for (int i=0; i<4; i++) {
3614     res = gCPU.vr[vrA].w[i] + gCPU.vr[vrB].w[i];
3615     gCPU.vr[vrD].w[i] = (res < gCPU.vr[vrA].w[i]) ? 1 : 0;
3616     }
3617     }
3618     JITCFlow ppc_opc_gen_vaddcuw()
3619     {
3620     ppc_opc_gen_interpret(ppc_opc_vaddcuw);
3621     return flowEndBlock;
3622     }
3623    
3624     static inline void helper_gen_vaddubs(NativeReg reg, NativeReg reg2, modrm_p modrma, modrm_p modrmb)
3625     {
3626     asmALU(X86_MOV, reg, modrma);
3627     asmALU(X86_MOV, reg2, reg);
3628    
3629     asmALU(X86_XOR, reg, modrmb);
3630     asmALU(X86_AND, reg2, modrmb);
3631    
3632     asmShift(X86_SHR, reg, 1);
3633     asmALU(X86_AND, reg, 0x7f7f7f7f);
3634    
3635     asmALU(X86_ADD, reg2, reg);
3636     asmALU(X86_AND, reg2, 0x80808080);
3637    
3638     asmShift(X86_SHR, reg2, 7);
3639     asmALU(X86_ADD, reg2, 0x7f7f7f7f);
3640     asmALU(X86_XOR, reg2, 0x7f7f7f7f);
3641     NativeAddress skip = asmJxxFixup(X86_Z);
3642    
3643     vec_raise_saturate();
3644    
3645     asmResolveFixup(skip);
3646     }
3647    
3648     /* vaddubs Vector Add Unsigned Byte Saturate
3649     * v.142
3650     */
3651     void ppc_opc_vaddubs()
3652     {
3653     VECTOR_DEBUG;
3654     int vrD, vrA, vrB;
3655     uint16 res;
3656     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3657    
3658     for (int i=0; i<16; i++) {
3659     res = (uint16)gCPU.vr[vrA].b[i] + (uint16)gCPU.vr[vrB].b[i];
3660     gCPU.vr[vrD].b[i] = SATURATE_UB(res);
3661     }
3662     }
3663     JITCFlow ppc_opc_gen_vaddubs()
3664     {
3665     #if 0
3666     ppc_opc_gen_interpret(ppc_opc_vaddubs);
3667     return flowEndBlock;
3668     #else
3669     int vrD, vrA, vrB;
3670     modrm_o modrma, modrmb;
3671     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
3672    
3673     if (SSE2_AVAIL) {
3674     commutative_operation(PALUB(X86_PADDUS), vrD, vrA, vrB);
3675     return flowContinue;
3676     }
3677    
3678     jitcFlushClientVectorRegister(vrA);
3679     jitcFlushClientVectorRegister(vrB);
3680     jitcDropClientVectorRegister(vrD);
3681    
3682     jitcClobberCarryAndFlags();
3683     NativeReg reg = jitcAllocRegister();
3684     NativeReg reg2 = jitcAllocRegister();
3685    
3686     if ((vrA == vrD) || (vrB == vrD)) {
3687     int vrS = vrB;
3688    
3689     if (vrD == vrB) {
3690     vrS = vrA;
3691     }
3692    
3693     for (int i=0; i<16; i += 4) {
3694     x86_mem2(modrma, &(gCPU.vr[vrD].b[i]));
3695     x86_mem2(modrmb, &(gCPU.vr[vrS].b[i]));
3696    
3697     helper_gen_vaddubs(reg, reg2, modrma, modrmb);
3698    
3699     asmALU(X86_MOV, reg, modrmb);
3700     asmALU(X86_SUB, reg, reg2);
3701     asmALU(X86_ADD, modrma, reg);
3702     asmALU(X86_OR, modrma, reg2);
3703     }
3704     } else {
3705     for (int i=0; i<16; i += 4) {
3706     x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
3707     x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
3708    
3709     helper_gen_vaddubs(reg, reg2, modrma, modrmb);
3710    
3711     asmALU(X86_MOV, reg, modrmb);
3712     asmALU(X86_SUB, reg, reg2);
3713     asmALU(X86_ADD, reg, modrma);
3714     asmALU(X86_OR, reg, reg2);
3715    
3716     vec_setWord(vrD, i, reg);
3717     }
3718     }
3719    
3720     return flowContinue;
3721     #endif
3722     }
3723    
3724     /* vaddsbs Vector Add Signed Byte Saturate
3725     * v.138
3726     */
3727     void ppc_opc_vaddsbs()
3728     {
3729     VECTOR_DEBUG;
3730     int vrD, vrA, vrB;
3731     sint16 res;
3732     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3733    
3734     for (int i=0; i<16; i++) {
3735     res = (sint16)gCPU.vr[vrA].sb[i] + (sint16)gCPU.vr[vrB].sb[i];
3736     gCPU.vr[vrD].b[i] = SATURATE_SB(res);
3737     }
3738     }
3739     JITCFlow ppc_opc_gen_vaddsbs()
3740     {
3741     ppc_opc_gen_interpret(ppc_opc_vaddsbs);
3742     return flowEndBlock;
3743     }
3744    
3745     static inline void helper_gen_vadduhs(NativeReg reg, NativeReg reg2, modrm_p modrma, modrm_p modrmb)
3746     {
3747     asmALU(X86_MOV, reg, modrma);
3748     asmALU(X86_MOV, reg2, reg);
3749    
3750     asmALU(X86_XOR, reg, modrmb);
3751     asmALU(X86_AND, reg2, modrmb);
3752    
3753     asmShift(X86_SHR, reg, 1);
3754     asmALU(X86_AND, reg, 0x7fff7fff);
3755    
3756     asmALU(X86_ADD, reg2, reg);
3757     asmALU(X86_AND, reg2, 0x80008000);
3758    
3759     asmShift(X86_SHR, reg2, 15);
3760     asmALU(X86_ADD, reg2, 0x7fff7fff);
3761     asmALU(X86_XOR, reg2, 0x7fff7fff);
3762     NativeAddress skip = asmJxxFixup(X86_Z);
3763    
3764     vec_raise_saturate();
3765    
3766     asmResolveFixup(skip);
3767     }
3768    
3769     /* vadduhs Vector Add Unsigned Half Word Saturate
3770     * v.144
3771     */
3772     void ppc_opc_vadduhs()
3773     {
3774     VECTOR_DEBUG;
3775     int vrD, vrA, vrB;
3776     uint32 res;
3777     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3778    
3779     for (int i=0; i<8; i++) {
3780     res = (uint32)gCPU.vr[vrA].h[i] + (uint32)gCPU.vr[vrB].h[i];
3781     gCPU.vr[vrD].h[i] = SATURATE_UH(res);
3782     }
3783     }
3784     JITCFlow ppc_opc_gen_vadduhs()
3785     {
3786     #if 0
3787     ppc_opc_gen_interpret(ppc_opc_vadduhs);
3788     return flowEndBlock;
3789     #else
3790     int vrD, vrA, vrB;
3791     modrm_o modrma, modrmb;
3792     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
3793    
3794     if (SSE2_AVAIL) {
3795     commutative_operation(PALUW(X86_PADDUS), vrD, vrA, vrB);
3796     return flowContinue;
3797     }
3798    
3799     jitcFlushClientVectorRegister(vrA);
3800     jitcFlushClientVectorRegister(vrB);
3801     jitcDropClientVectorRegister(vrD);
3802    
3803     jitcClobberCarryAndFlags();
3804     NativeReg reg = jitcAllocRegister();
3805     NativeReg reg2 = jitcAllocRegister();
3806    
3807     if ((vrA == vrD) || (vrB == vrD)) {
3808     int vrS = vrB;
3809    
3810     if (vrD == vrB) {
3811     vrS = vrA;
3812     }
3813    
3814     for (int i=0; i<16; i += 4) {
3815     x86_mem2(modrma, &(gCPU.vr[vrD].b[i]));
3816     x86_mem2(modrmb, &(gCPU.vr[vrS].b[i]));
3817    
3818     helper_gen_vadduhs(reg, reg2, modrma, modrmb);
3819    
3820     asmALU(X86_MOV, reg, modrmb);
3821     asmALU(X86_SUB, reg, reg2);
3822     asmALU(X86_ADD, modrma, reg);
3823     asmALU(X86_OR, modrma, reg2);
3824     }
3825     } else {
3826     for (int i=0; i<16; i += 4) {
3827     x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
3828     x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
3829    
3830     helper_gen_vadduhs(reg, reg2, modrma, modrmb);
3831    
3832     asmALU(X86_MOV, reg, modrmb);
3833     asmALU(X86_SUB, reg, reg2);
3834     asmALU(X86_ADD, reg, modrma);
3835     asmALU(X86_OR, reg, reg2);
3836    
3837     vec_setWord(vrD, i, reg);
3838     }
3839     }
3840    
3841     return flowContinue;
3842    
3843     #endif
3844     }
3845    
3846     /* vaddshs Vector Add Signed Half Word Saturate
3847     * v.139
3848     */
3849     void ppc_opc_vaddshs()
3850     {
3851     VECTOR_DEBUG;
3852     int vrD, vrA, vrB;
3853     sint32 res;
3854     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3855    
3856     for (int i=0; i<8; i++) {
3857     res = (sint32)gCPU.vr[vrA].sh[i] + (sint32)gCPU.vr[vrB].sh[i];
3858     gCPU.vr[vrD].h[i] = SATURATE_SH(res);
3859     }
3860     }
3861     JITCFlow ppc_opc_gen_vaddshs()
3862     {
3863     #if 0
3864     ppc_opc_gen_interpret(ppc_opc_vaddshs);
3865     return flowEndBlock;
3866     #else
3867     int vrD, vrA, vrB;
3868     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
3869    
3870     if (SSE2_AVAIL) {
3871     commutative_operation(PALUW(X86_PADDS), vrD, vrA, vrB);
3872     return flowContinue;
3873     }
3874    
3875     jitcFlushClientVectorRegister(vrA);
3876     jitcFlushClientVectorRegister(vrB);
3877     jitcDropClientVectorRegister(vrD);
3878    
3879     jitcClobberCarryAndFlags();
3880     NativeReg reg = jitcAllocRegister();
3881     NativeReg reg2 = jitcAllocRegister();
3882    
3883     for (int i=0; i<16; i += 2) {
3884     vec_getHalfS(reg, vrA, i);
3885     vec_getHalfS(reg2, vrB, i);
3886    
3887     asmALU(X86_ADD, reg, reg2);
3888     vec_saturateSH(reg, reg2);
3889    
3890     vec_setHalf(vrD, i, (NativeReg16)reg);
3891     }
3892    
3893     return flowContinue;
3894     #endif
3895     }
3896    
3897     /* vadduws Vector Add Unsigned Word Saturate
3898     * v.146
3899     */
3900     void ppc_opc_vadduws()
3901     {
3902     VECTOR_DEBUG;
3903     int vrD, vrA, vrB;
3904     uint32 res;
3905     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3906    
3907     for (int i=0; i<4; i++) {
3908     res = gCPU.vr[vrA].w[i] + gCPU.vr[vrB].w[i];
3909    
3910     // We do this to prevent us from having to do 64-bit math
3911     if (res < gCPU.vr[vrA].w[i]) {
3912     res = 0xFFFFFFFF;
3913     gCPU.vscr |= VSCR_SAT;
3914     }
3915    
3916     /* 64-bit math | 32-bit hack
3917     * ------------------------+-------------------------------------
3918     * add, addc (a+b) | add (a+b)
3919     * sub, subb (r>ub) | sub (r<a)
3920     */
3921    
3922     gCPU.vr[vrD].w[i] = res;
3923     }
3924     }
3925     JITCFlow ppc_opc_gen_vadduws()
3926     {
3927     ppc_opc_gen_interpret(ppc_opc_vadduws);
3928     return flowEndBlock;
3929     }
3930    
3931     /* vaddsws Vector Add Signed Word Saturate
3932     * v.140
3933     */
3934     void ppc_opc_vaddsws()
3935     {
3936     VECTOR_DEBUG;
3937     int vrD, vrA, vrB;
3938     uint32 res;
3939     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3940    
3941     for (int i=0; i<4; i++) {
3942     res = gCPU.vr[vrA].w[i] + gCPU.vr[vrB].w[i];
3943    
3944     // We do this to prevent us from having to do 64-bit math
3945     if (((gCPU.vr[vrA].w[i] ^ gCPU.vr[vrB].w[i]) & SIGN32) == 0) {
3946     // the signs of both operands are the same
3947    
3948     if (((res ^ gCPU.vr[vrA].w[i]) & SIGN32) != 0) {
3949     // sign of result != sign of operands
3950    
3951     // if res is negative, should have been positive
3952     res = (res & SIGN32) ? (SIGN32 - 1) : SIGN32;
3953     gCPU.vscr |= VSCR_SAT;
3954     }
3955     }
3956    
3957     /* 64-bit math | 32-bit hack
3958     * ------------------------+-------------------------------------
3959     * add, addc (a+b) | add (a+b)
3960     * sub, subb (r>ub) | xor, and (sign == sign)
3961     * sub, subb (r<lb) | xor, and (sign != sign)
3962     * | and (which)
3963     */
3964    
3965     gCPU.vr[vrD].w[i] = res;
3966     }
3967     }
3968     JITCFlow ppc_opc_gen_vaddsws()
3969     {
3970     ppc_opc_gen_interpret(ppc_opc_vaddsws);
3971     return flowEndBlock;
3972     }
3973    
3974     static inline void helper_gen_vsububm(NativeReg reg, NativeReg reg2, modrm_p modrma, modrm_p modrmb)
3975     {
3976     asmALU(X86_MOV, reg, modrma);
3977     asmALU(X86_NOT, reg);
3978     asmALU(X86_MOV, reg2, reg);
3979    
3980     asmALU(X86_AND, reg, modrmb);
3981     asmALU(X86_XOR, reg2, modrmb);
3982    
3983     asmShift(X86_SHL, reg, 1);
3984     asmALU(X86_AND, reg2, 0xfefefefe);
3985    
3986     asmALU(X86_ADD, reg2, reg);
3987     asmALU(X86_AND, reg2, 0x01010100);
3988     }
3989    
3990     /* vsububm Vector Subtract Unsigned Byte Modulo
3991     * v.265
3992     */
3993     void ppc_opc_vsububm()
3994     {
3995     VECTOR_DEBUG;
3996     int vrD, vrA, vrB;
3997     uint8 res;
3998     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3999    
4000     for (int i=0; i<16; i++) {
4001     res = gCPU.vr[vrA].b[i] - gCPU.vr[vrB].b[i];
4002     gCPU.vr[vrD].b[i] = res;
4003     }
4004     }
4005     JITCFlow ppc_opc_gen_vsububm()
4006     {
4007     #if 0
4008     ppc_opc_gen_interpret(ppc_opc_vsububm);
4009     return flowEndBlock;
4010     #else
4011     int vrD, vrA, vrB;
4012     modrm_o modrma, modrmb;
4013     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4014    
4015     if (SSE2_AVAIL) {
4016     noncommutative_operation(PALUB(X86_PSUB), vrD, vrA, vrB);
4017     return flowContinue;
4018     }
4019    
4020     jitcFlushClientVectorRegister(vrA);
4021     jitcFlushClientVectorRegister(vrB);
4022     jitcDropClientVectorRegister(vrD);
4023    
4024     jitcClobberCarryAndFlags();
4025     NativeReg reg = jitcAllocRegister();
4026     NativeReg reg2 = jitcAllocRegister();
4027    
4028     if (vrA == vrD) {
4029     for (int i=0; i<16; i += 4) {
4030     x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
4031     x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
4032    
4033     helper_gen_vsububm(reg, reg2, modrma, modrmb);
4034    
4035     asmALU(X86_SUB, reg2, modrmb);
4036     asmALU(X86_ADD, modrma, reg2);
4037     }
4038     } else {
4039     for (int i=0; i<16; i += 4) {
4040     x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
4041     x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
4042    
4043     helper_gen_vsububm(reg, reg2, modrma, modrmb);
4044    
4045     asmALU(X86_SUB, reg2, modrmb);
4046     asmALU(X86_ADD, reg2, modrma);
4047    
4048     vec_setWord(vrD, i, reg2);
4049     }
4050     }
4051    
4052     return flowContinue;
4053     #endif
4054     }
4055    
4056     static inline void helper_gen_vsubuhm(NativeReg reg, NativeReg reg2, modrm_p modrma, modrm_p modrmb)
4057     {
4058     asmALU(X86_MOV, reg, modrma);
4059     asmALU(X86_NOT, reg);
4060     asmALU(X86_MOV, reg2, reg);
4061    
4062     asmALU(X86_AND, reg, modrmb);
4063     asmALU(X86_XOR, reg2, modrmb);
4064    
4065     asmShift(X86_SHL, reg, 1);
4066     asmALU(X86_AND, reg2, 0xfffefffe);
4067    
4068     asmALU(X86_ADD, reg2, reg);
4069     asmALU(X86_AND, reg2, 0x00010000);
4070     }
4071    
4072    
4073     /* vsubuhm Vector Subtract Unsigned Half Word Modulo
4074     * v.267
4075     */
4076     void ppc_opc_vsubuhm()
4077     {
4078     VECTOR_DEBUG;
4079     int vrD, vrA, vrB;
4080     uint16 res;
4081     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4082    
4083     for (int i=0; i<8; i++) {
4084     res = gCPU.vr[vrA].h[i] - gCPU.vr[vrB].h[i];
4085     gCPU.vr[vrD].h[i] = res;
4086     }
4087     }
4088     JITCFlow ppc_opc_gen_vsubuhm()
4089     {
4090     #if 0
4091     ppc_opc_gen_interpret(ppc_opc_vsubuhm);
4092     return flowEndBlock;
4093     #else
4094     int vrD, vrA, vrB;
4095     modrm_o modrma, modrmb;
4096     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4097    
4098     if (SSE2_AVAIL) {
4099     noncommutative_operation(PALUW(X86_PSUB), vrD, vrA, vrB);
4100     return flowContinue;
4101     }
4102    
4103     jitcFlushClientVectorRegister(vrA);
4104     jitcFlushClientVectorRegister(vrB);
4105     jitcDropClientVectorRegister(vrD);
4106    
4107     NativeReg reg = jitcAllocRegister();
4108     NativeReg reg2 = jitcAllocRegister();
4109    
4110     if (vrA == vrD) {
4111     for (int i=0; i<16; i += 4) {
4112     x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
4113     x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
4114    
4115     helper_gen_vsubuhm(reg, reg2, modrma, modrmb);
4116    
4117     asmALU(X86_SUB, reg2, modrmb);
4118     asmALU(X86_ADD, modrma, reg2);
4119     }
4120     } else {
4121     for (int i=0; i<16; i += 4) {
4122     x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
4123     x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
4124    
4125     helper_gen_vsubuhm(reg, reg2, modrma, modrmb);
4126    
4127     asmALU(X86_SUB, reg2, modrmb);
4128     asmALU(X86_ADD, reg2, modrma);
4129    
4130     vec_setWord(vrD, i, reg2);
4131     }
4132     }
4133    
4134     return flowContinue;
4135     #endif
4136     }
4137    
4138     /* vsubuwm Vector Subtract Unsigned Word Modulo
4139     * v.269
4140     */
4141     void ppc_opc_vsubuwm()
4142     {
4143     VECTOR_DEBUG;
4144     int vrD, vrA, vrB;
4145     uint32 res;
4146     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4147    
4148     for (int i=0; i<4; i++) {
4149     res = gCPU.vr[vrA].w[i] - gCPU.vr[vrB].w[i];
4150     gCPU.vr[vrD].w[i] = res;
4151     }
4152     }
4153     JITCFlow ppc_opc_gen_vsubuwm()
4154     {
4155     #if 0
4156     ppc_opc_gen_interpret(ppc_opc_vsubuwm);
4157     return flowEndBlock;
4158     #else
4159     int vrD, vrA, vrB;
4160     modrm_o modrm;
4161     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4162    
4163     if (SSE2_AVAIL) {
4164     noncommutative_operation(PALUD(X86_PSUB), vrD, vrA, vrB);
4165     return flowContinue;
4166     }
4167    
4168     jitcFlushClientVectorRegister(vrA);
4169     jitcFlushClientVectorRegister(vrB);
4170     jitcDropClientVectorRegister(vrD);
4171     NativeReg reg = jitcAllocRegister();
4172    
4173     if (vrA == vrD) {
4174     for (int i=0; i<16; i += 4) {
4175     vec_getWord(reg, vrB, i);
4176    
4177     asmALU(X86_SUB,
4178     x86_mem2(modrm, &(gCPU.vr[vrD].b[i])), reg);
4179     }
4180     } else {
4181     for (int i=0; i<16; i += 4) {
4182     vec_getWord(reg, vrA, i);
4183    
4184     asmALU(X86_SUB, reg,
4185     x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
4186    
4187     vec_setWord(vrD, i, reg);
4188     }
4189     }
4190    
4191     return flowContinue;
4192     #endif
4193     }
4194    
4195     /* vsubfp Vector Subtract Float Point
4196     * v.261
4197     */
4198     void ppc_opc_vsubfp()
4199     {
4200     VECTOR_DEBUG;
4201     int vrD, vrA, vrB;
4202     float res;
4203     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4204    
4205     for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
4206     res = gCPU.vr[vrA].f[i] - gCPU.vr[vrB].f[i];
4207     gCPU.vr[vrD].f[i] = res;
4208     }
4209     }
4210     JITCFlow ppc_opc_gen_vsubfp()
4211     {
4212     #if 0
4213     ppc_opc_gen_interpret(ppc_opc_vsubfp);
4214     return flowEndBlock;
4215     #else
4216     int vrD, vrA, vrB;
4217     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4218    
4219     if (SSE_AVAIL) {
4220     noncommutative_operation(X86_SUBPS, vrD, vrA, vrB);
4221     return flowContinue;
4222     }
4223    
4224     ppc_opc_gen_interpret(ppc_opc_vsubfp);
4225     return flowEndBlock;
4226     #endif
4227     }
4228    
4229     /* vsubcuw Vector Subtract Carryout Unsigned Word
4230     * v.260
4231     */
4232     void ppc_opc_vsubcuw()
4233     {
4234     VECTOR_DEBUG;
4235     int vrD, vrA, vrB;
4236     uint32 res;
4237     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4238    
4239     for (int i=0; i<4; i++) {
4240     res = gCPU.vr[vrA].w[i] - gCPU.vr[vrB].w[i];
4241     gCPU.vr[vrD].w[i] = (res <= gCPU.vr[vrA].w[i]) ? 1 : 0;
4242     }
4243     }
4244     JITCFlow ppc_opc_gen_vsubcuw()
4245     {
4246     #if 0
4247     ppc_opc_gen_interpret(ppc_opc_vsubcuw);
4248     return flowEndBlock;
4249     #else
4250     int vrD, vrA, vrB;
4251     modrm_o modrm;
4252     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4253    
4254     if (vrA == vrB) {
4255     // no need to flush vrA, vrB, their values are irrelevant here
4256     jitcDropClientVectorRegister(vrD);
4257    
4258     NativeReg reg = jitcAllocRegister();
4259     asmMOV_NoFlags(reg, 1); // avoid flag clobber
4260    
4261     vec_setWord(vrD, 0, reg);
4262     vec_setWord(vrD, 4, reg);
4263     vec_setWord(vrD, 8, reg);
4264     vec_setWord(vrD,12, reg);
4265     } else {
4266     jitcFlushClientVectorRegister(vrA);
4267     jitcFlushClientVectorRegister(vrB);
4268     jitcFlushClientVectorRegister(vrD);
4269    
4270     NativeReg reg = jitcAllocRegister(NATIVE_REG_8);
4271    
4272     for (int i=0; i<16; i += 4) {
4273     vec_getWord(reg, vrA, i);
4274    
4275     asmALU(X86_SUB, reg,
4276     x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
4277    
4278     asmSET(X86_NC, (NativeReg8)reg);
4279    
4280     asmMOVxx(X86_MOVZX, reg, (NativeReg8)reg);
4281    
4282     vec_setWord(vrD, i, reg);
4283     }
4284     }
4285    
4286     return flowContinue;
4287     #endif
4288     }
4289    
4290     static inline void helper_gen_vsububs(NativeReg reg, NativeReg reg2, modrm_p modrma, modrm_p modrmb)
4291     {
4292     asmALU(X86_MOV, reg, modrma);
4293     asmALU(X86_NOT, reg);
4294     asmALU(X86_MOV, reg2, reg);
4295    
4296     asmALU(X86_XOR, reg, modrmb);
4297     asmALU(X86_AND, reg2, modrmb);
4298    
4299     asmShift(X86_SHR, reg, 1);
4300     asmALU(X86_AND, reg, 0x7f7f7f7f);
4301    
4302     asmALU(X86_ADD, reg2, reg);
4303     asmALU(X86_AND, reg2, 0x80808080);
4304    
4305     asmShift(X86_SHR, reg2, 7);
4306     asmALU(X86_ADD, reg2, 0x7f7f7f7f);
4307     asmALU(X86_XOR, reg2, 0x7f7f7f7f);
4308     NativeAddress skip = asmJxxFixup(X86_Z);
4309    
4310     vec_raise_saturate();
4311    
4312     asmResolveFixup(skip);
4313     }
4314    
4315     /* vsububs Vector Subtract Unsigned Byte Saturate
4316     * v.266
4317     */
4318     void ppc_opc_vsububs()
4319     {
4320     VECTOR_DEBUG;
4321     int vrD, vrA, vrB;
4322     uint16 res;
4323     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4324    
4325     for (int i=0; i<16; i++) {
4326     res = (uint16)gCPU.vr[vrA].b[i] - (uint16)gCPU.vr[vrB].b[i];
4327    
4328     gCPU.vr[vrD].b[i] = SATURATE_0B(res);
4329     }
4330     }
4331     JITCFlow ppc_opc_gen_vsububs()
4332     {
4333     #if 0
4334     ppc_opc_gen_interpret(ppc_opc_vsububs);
4335     return flowEndBlock;
4336     #else
4337     int vrD, vrA, vrB;
4338     modrm_o modrma, modrmb;
4339     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4340    
4341     if (SSE2_AVAIL) {
4342     noncommutative_operation(PALUB(X86_PSUBUS), vrD, vrA, vrB);
4343     return flowContinue;
4344     }
4345    
4346     jitcFlushClientVectorRegister(vrA);
4347     jitcFlushClientVectorRegister(vrB);
4348     jitcDropClientVectorRegister(vrD);
4349    
4350     jitcClobberCarryAndFlags();
4351     NativeReg reg = jitcAllocRegister();
4352     NativeReg reg2 = jitcAllocRegister();
4353    
4354     if (vrA == vrD) {
4355     for (int i=0; i<16; i += 4) {
4356     x86_mem2(modrma, &(gCPU.vr[vrD].b[i]));
4357     x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
4358    
4359     helper_gen_vsububs(reg, reg2, modrma, modrmb);
4360    
4361     asmALU(X86_MOV, reg, modrmb);
4362     asmALU(X86_OR, reg, reg2);
4363    
4364     asmALU(X86_OR, modrma, reg2);
4365     asmALU(X86_SUB, modrma, reg);
4366     }
4367     } else {
4368     for (int i=0; i<16; i += 4) {
4369     x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
4370     x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
4371    
4372     helper_gen_vsububs(reg, reg2, modrma, modrmb);
4373    
4374     asmALU(X86_MOV, reg, modrmb);
4375     asmALU(X86_OR, reg, reg2);
4376    
4377     asmALU(X86_OR, reg2, modrma);
4378     asmALU(X86_SUB, reg2, reg);
4379    
4380     vec_setWord(vrD, i, reg2);
4381     }
4382     }
4383    
4384     return flowContinue;
4385     #endif
4386     }
4387    
4388     /* vsubsbs Vector Subtract Signed Byte Saturate
4389     * v.262
4390     */
4391     void ppc_opc_vsubsbs()
4392     {
4393     VECTOR_DEBUG;
4394     int vrD, vrA, vrB;
4395     sint16 res;
4396     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4397    
4398     for (int i=0; i<16; i++) {
4399     res = (sint16)gCPU.vr[vrA].sb[i] - (sint16)gCPU.vr[vrB].sb[i];
4400    
4401     gCPU.vr[vrD].sb[i] = SATURATE_SB(res);
4402     }
4403     }
4404     JITCFlow ppc_opc_gen_vsubsbs()
4405     {
4406     #if 0
4407     ppc_opc_gen_interpret(ppc_opc_vsubsbs);
4408     return flowEndBlock;
4409     #else
4410     int vrD, vrA, vrB;
4411     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4412    
4413     if (SSE2_AVAIL) {
4414     noncommutative_operation(PALUB(X86_PSUBS), vrD, vrA, vrB);
4415     return flowContinue;
4416     }
4417    
4418     jitcFlushClientVectorRegister(vrA);
4419     jitcFlushClientVectorRegister(vrB);
4420     jitcDropClientVectorRegister(vrD);
4421    
4422     jitcClobberCarryAndFlags();
4423     NativeReg reg = jitcAllocRegister(NATIVE_REG_8);
4424     NativeReg reg2 = jitcAllocRegister();
4425    
4426     for (int i=0; i<16; i++) {
4427     vec_getByteS(reg, vrA, i);
4428     vec_getByteS(reg2, vrB, i);
4429    
4430     asmALU(X86_SUB, reg, reg2);
4431     vec_saturateSB(reg, reg2);
4432    
4433     vec_setByte(vrD, REG_NO, i, (NativeReg8)reg);
4434     }
4435    
4436     return flowContinue;
4437     #endif
4438     }
4439    
4440     static inline void helper_gen_vsubuhs(NativeReg reg, NativeReg reg2, modrm_p modrma, modrm_p modrmb)
4441     {
4442     asmALU(X86_MOV, reg, modrma);
4443     asmALU(X86_NOT, reg);
4444     asmALU(X86_MOV, reg2, reg);
4445    
4446     asmALU(X86_XOR, reg, modrmb);
4447     asmALU(X86_AND, reg2, modrmb);
4448    
4449     asmShift(X86_SHR, reg, 1);
4450     asmALU(X86_AND, reg, 0x7fff7fff);
4451    
4452     asmALU(X86_ADD, reg2, reg);
4453     asmALU(X86_AND, reg2, 0x80008000);
4454    
4455     asmShift(X86_SHR, reg2, 15);
4456     asmALU(X86_ADD, reg2, 0x7fff7fff);
4457     asmALU(X86_XOR, reg2, 0x7fff7fff);
4458     NativeAddress skip = asmJxxFixup(X86_Z);
4459    
4460     vec_raise_saturate();
4461    
4462     asmResolveFixup(skip);
4463     }
4464    
4465     /* vsubuhs Vector Subtract Unsigned Half Word Saturate
4466     * v.268
4467     */
4468     void ppc_opc_vsubuhs()
4469     {
4470     VECTOR_DEBUG;
4471     int vrD, vrA, vrB;
4472     uint32 res;
4473     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4474    
4475     for (int i=0; i<8; i++) {
4476     res = (uint32)gCPU.vr[vrA].h[i] - (uint32)gCPU.vr[vrB].h[i];
4477    
4478     gCPU.vr[vrD].h[i] = SATURATE_0H(res);
4479     }
4480     }
4481     JITCFlow ppc_opc_gen_vsubuhs()
4482     {
4483     #if 0
4484     ppc_opc_gen_interpret(ppc_opc_vsubuhs);
4485     return flowEndBlock;
4486     #else
4487     int vrD, vrA, vrB;
4488     modrm_o modrma, modrmb;
4489     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4490    
4491     if (SSE2_AVAIL) {
4492     noncommutative_operation(PALUW(X86_PSUBUS), vrD, vrA, vrB);
4493     return flowContinue;
4494     }
4495    
4496     jitcFlushClientVectorRegister(vrA);
4497     jitcFlushClientVectorRegister(vrB);
4498     jitcDropClientVectorRegister(vrD);
4499    
4500     jitcClobberCarryAndFlags();
4501     NativeReg reg = jitcAllocRegister();
4502     NativeReg reg2 = jitcAllocRegister();
4503    
4504     if (vrA == vrD) {
4505     for (int i=0; i<16; i += 4) {
4506     x86_mem2(modrma, &(gCPU.vr[vrD].b[i]));
4507     x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
4508    
4509     helper_gen_vsubuhs(reg, reg2, modrma, modrmb);
4510    
4511     asmALU(X86_MOV, reg, modrmb);
4512     asmALU(X86_OR, reg, reg2);
4513    
4514     asmALU(X86_OR, modrma, reg2);
4515     asmALU(X86_SUB, modrma, reg);
4516     }
4517     } else {
4518     for (int i=0; i<16; i += 4) {
4519     x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
4520     x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
4521    
4522     helper_gen_vsubuhs(reg, reg2, modrma, modrmb);
4523    
4524     asmALU(X86_MOV, reg, modrmb);
4525     asmALU(X86_OR, reg, reg2);
4526    
4527     asmALU(X86_OR, reg2, modrma);
4528     asmALU(X86_SUB, reg2, reg);
4529    
4530     vec_setWord(vrD, i, reg2);
4531     }
4532     }
4533    
4534     return flowContinue;
4535     #endif
4536     }
4537    
4538     /* vsubshs Vector Subtract Signed Half Word Saturate
4539     * v.263
4540     */
4541     void ppc_opc_vsubshs()
4542     {
4543     VECTOR_DEBUG;
4544     int vrD, vrA, vrB;
4545     sint32 res;
4546     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4547    
4548     for (int i=0; i<8; i++) {
4549     res = (sint32)gCPU.vr[vrA].sh[i] - (sint32)gCPU.vr[vrB].sh[i];
4550    
4551     gCPU.vr[vrD].sh[i] = SATURATE_SH(res);
4552     }
4553     }
4554     JITCFlow ppc_opc_gen_vsubshs()
4555     {
4556     #if 0
4557     ppc_opc_gen_interpret(ppc_opc_vsubshs);
4558     return flowEndBlock;
4559     #else
4560     int vrD, vrA, vrB;
4561     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4562    
4563     if (SSE2_AVAIL) {
4564     noncommutative_operation(PALUW(X86_PSUBS), vrD, vrA, vrB);
4565     return flowContinue;
4566     }
4567    
4568     jitcFlushClientVectorRegister(vrA);
4569     jitcFlushClientVectorRegister(vrB);
4570     jitcDropClientVectorRegister(vrD);
4571    
4572     jitcClobberCarryAndFlags();
4573     NativeReg reg = jitcAllocRegister();
4574     NativeReg reg2 = jitcAllocRegister();
4575    
4576     for (int i=0; i<16; i += 2) {
4577     vec_getHalfS(reg, vrA, i);
4578     vec_getHalfS(reg2, vrB, i);
4579    
4580     asmALU(X86_SUB, reg, reg2);
4581     vec_saturateSH(reg, reg2);
4582    
4583     vec_setHalf(vrD, i, (NativeReg16)reg);
4584     }
4585    
4586     return flowContinue;
4587     #endif
4588     }
4589    
4590     /* vsubuws Vector Subtract Unsigned Word Saturate
4591     * v.270
4592     */
4593     void ppc_opc_vsubuws()
4594     {
4595     VECTOR_DEBUG;
4596     int vrD, vrA, vrB;
4597     uint32 res;
4598     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4599    
4600     for (int i=0; i<4; i++) {
4601     res = gCPU.vr[vrA].w[i] - gCPU.vr[vrB].w[i];
4602    
4603     // We do this to prevent us from having to do 64-bit math
4604     if (res > gCPU.vr[vrA].w[i]) {
4605     res = 0;
4606     gCPU.vscr |= VSCR_SAT;
4607     }
4608    
4609     /* 64-bit math | 32-bit hack
4610     * ------------------------+-------------------------------------
4611     * sub, subb (a+b) | sub (a+b)
4612     * sub, subb (r>ub) | sub (r<a)
4613     */
4614    
4615     gCPU.vr[vrD].w[i] = res;
4616     }
4617     }
4618     JITCFlow ppc_opc_gen_vsubuws()
4619     {
4620     #if 0
4621     ppc_opc_gen_interpret(ppc_opc_vsubuws);
4622     return flowEndBlock;
4623     #else
4624     int vrD, vrA, vrB;
4625     modrm_o modrm;
4626     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4627    
4628     jitcFlushClientVectorRegister(vrA);
4629     jitcFlushClientVectorRegister(vrB);
4630     jitcDropClientVectorRegister(vrD);
4631    
4632     jitcClobberCarryAndFlags();
4633     NativeReg reg = jitcAllocRegister();
4634    
4635     for (int i=0; i<16; i += 4) {
4636     vec_getWord(reg, vrA, i);
4637    
4638     asmALU(X86_SUB, reg,
4639     x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
4640    
4641     NativeAddress skip = asmJxxFixup(X86_NC);
4642    
4643     asmALU(X86_XOR, reg, reg);
4644    
4645     asmResolveFixup(skip);
4646    
4647     vec_setWord(vrD, i, reg);
4648     }
4649    
4650     return flowContinue;
4651     #endif
4652     }
4653    
4654     /* vsubsws Vector Subtract Signed Word Saturate
4655     * v.264
4656     */
4657     void ppc_opc_vsubsws()
4658     {
4659     VECTOR_DEBUG;
4660     int vrD, vrA, vrB;
4661     uint32 res, tmp;
4662     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4663    
4664     for (int i=0; i<4; i++) {
4665     tmp = -gCPU.vr[vrB].w[i];
4666     res = gCPU.vr[vrA].w[i] + tmp;
4667    
4668     // We do this to prevent us from having to do 64-bit math
4669     if (((gCPU.vr[vrA].w[i] ^ tmp) & SIGN32) == 0) {
4670     // the signs of both operands are the same
4671    
4672     if (((res ^ tmp) & SIGN32) != 0) {
4673     // sign of result != sign of operands
4674    
4675     // if res is negative, should have been positive
4676     res = (res & SIGN32) ? (SIGN32 - 1) : SIGN32;
4677     gCPU.vscr |= VSCR_SAT;
4678     }
4679     }
4680    
4681     /* 64-bit math | 32-bit hack
4682     * ------------------------+-------------------------------------
4683     * sub, subc (a+b) | neg, add (a-b)
4684     * sub, subb (r>ub) | xor, and (sign == sign)
4685     * sub, subb (r<lb) | xor, and (sign != sign)
4686     * | and (which)
4687     */
4688    
4689     gCPU.vr[vrD].w[i] = res;
4690     }
4691     }
4692     JITCFlow ppc_opc_gen_vsubsws()
4693     {
4694     #if 0
4695     ppc_opc_gen_interpret(ppc_opc_vsubsws);
4696     return flowEndBlock;
4697     #else
4698     int vrD, vrA, vrB;
4699     modrm_o modrm;
4700     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4701    
4702     jitcFlushClientVectorRegister(vrA);
4703     jitcFlushClientVectorRegister(vrB);
4704     jitcDropClientVectorRegister(vrD);
4705    
4706     jitcClobberCarryAndFlags();
4707     NativeReg reg = jitcAllocRegister();
4708    
4709     for (int i=0; i<16; i += 4) {
4710     vec_getWord(reg, vrA, i);
4711    
4712     asmALU(X86_SUB, reg,
4713     x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
4714    
4715     NativeAddress skip = asmJxxFixup(X86_NO);
4716    
4717     asmMOV_NoFlags(reg, 0x80000000);
4718    
4719     asmResolveFixup(skip);
4720    
4721     vec_setWord(vrD, i, reg);
4722     }
4723    
4724     return flowContinue;
4725     #endif
4726     }
4727    
4728     /* vmuleub Vector Multiply Even Unsigned Byte
4729     * v.209
4730     */
4731     void ppc_opc_vmuleub()
4732     {
4733     VECTOR_DEBUG;
4734     int vrD, vrA, vrB;
4735     uint16 res;
4736     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4737    
4738     for (int i=0; i<8; i++) {
4739     res = (uint16)gCPU.vr[vrA].b[VECT_EVEN(i)] *
4740     (uint16)gCPU.vr[vrB].b[VECT_EVEN(i)];
4741    
4742     gCPU.vr[vrD].h[i] = res;
4743     }
4744     }
4745     JITCFlow ppc_opc_gen_vmuleub()
4746     {
4747     #if 0
4748     ppc_opc_gen_interpret(ppc_opc_vmuleub);
4749     return flowEndBlock;
4750     #else
4751     int vrD, vrA, vrB;
4752     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4753    
4754     jitcFlushClientVectorRegister(vrA);
4755     jitcFlushClientVectorRegister(vrB);
4756     jitcDropClientVectorRegister(vrD);
4757    
4758     jitcClobberCarryAndFlags();
4759     NativeReg reg1 = jitcAllocRegister();
4760     NativeReg reg2 = jitcAllocRegister();
4761    
4762     for (int i=0; i<16; i+=2) {
4763     vec_getByteZ(reg1, vrA, i+1);
4764     vec_getByteZ(reg2, vrB, i+1);
4765    
4766     asmIMUL(reg1, reg2);
4767    
4768     vec_setHalf(vrD, i, (NativeReg16)reg1);
4769     }
4770    
4771     return flowContinue;
4772     #endif
4773     }
4774    
4775     /* vmulesb Vector Multiply Even Signed Byte
4776     * v.207
4777     */
4778     void ppc_opc_vmulesb()
4779     {
4780     VECTOR_DEBUG;
4781     int vrD, vrA, vrB;
4782     sint16 res;
4783     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4784    
4785     for (int i=0; i<8; i++) {
4786     res = (sint16)gCPU.vr[vrA].sb[VECT_EVEN(i)] *
4787     (sint16)gCPU.vr[vrB].sb[VECT_EVEN(i)];
4788    
4789     gCPU.vr[vrD].sh[i] = res;
4790     }
4791     }
4792     JITCFlow ppc_opc_gen_vmulesb()
4793     {
4794     #if 0
4795     ppc_opc_gen_interpret(ppc_opc_vmulesb);
4796     return flowEndBlock;
4797     #else
4798     int vrD, vrA, vrB;
4799     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4800    
4801     jitcFlushClientVectorRegister(vrA);
4802     jitcFlushClientVectorRegister(vrB);
4803     jitcDropClientVectorRegister(vrD);
4804    
4805     jitcClobberCarryAndFlags();
4806     NativeReg reg1 = jitcAllocRegister();
4807     NativeReg reg2 = jitcAllocRegister();
4808    
4809     for (int i=0; i<16; i+=2) {
4810     vec_getByteS(reg1, vrA, i+1);
4811     vec_getByteS(reg2, vrB, i+1);
4812    
4813     asmIMUL(reg1, reg2);
4814    
4815     vec_setHalf(vrD, i, (NativeReg16)reg1);
4816     }
4817    
4818     return flowContinue;
4819     #endif
4820     }
4821    
4822     /* vmuleuh Vector Multiply Even Unsigned Half Word
4823     * v.210
4824     */
4825     void ppc_opc_vmuleuh()
4826     {
4827     VECTOR_DEBUG;
4828     int vrD, vrA, vrB;
4829     uint32 res;
4830     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4831    
4832     for (int i=0; i<4; i++) {
4833     res = (uint32)gCPU.vr[vrA].h[VECT_EVEN(i)] *
4834     (uint32)gCPU.vr[vrB].h[VECT_EVEN(i)];
4835    
4836     gCPU.vr[vrD].w[i] = res;
4837     }
4838     }
4839     JITCFlow ppc_opc_gen_vmuleuh()
4840     {
4841     #if 0
4842     ppc_opc_gen_interpret(ppc_opc_vmuleuh);
4843     return flowEndBlock;
4844     #else
4845     int vrD, vrA, vrB;
4846     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4847    
4848     jitcFlushClientVectorRegister(vrA);
4849     jitcFlushClientVectorRegister(vrB);
4850     jitcDropClientVectorRegister(vrD);
4851    
4852     jitcClobberCarryAndFlags();
4853     NativeReg reg1 = jitcAllocRegister();
4854     NativeReg reg2 = jitcAllocRegister();
4855    
4856     for (int i=0; i<16; i+=4) {
4857     vec_getHalfZ(reg1, vrA, i+2);
4858     vec_getHalfZ(reg2, vrB, i+2);
4859    
4860     asmIMUL(reg1, reg2);
4861    
4862     vec_setWord(vrD, i, reg1);
4863     }
4864    
4865     return flowContinue;
4866     #endif
4867     }
4868    
4869     /* vmulesh Vector Multiply Even Signed Half Word
4870     * v.208
4871     */
4872     void ppc_opc_vmulesh()
4873     {
4874     VECTOR_DEBUG;
4875     int vrD, vrA, vrB;
4876     sint32 res;
4877     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4878    
4879     for (int i=0; i<4; i++) {
4880     res = (sint32)gCPU.vr[vrA].sh[VECT_EVEN(i)] *
4881     (sint32)gCPU.vr[vrB].sh[VECT_EVEN(i)];
4882    
4883     gCPU.vr[vrD].sw[i] = res;
4884     }
4885     }
4886     JITCFlow ppc_opc_gen_vmulesh()
4887     {
4888     #if 0
4889     ppc_opc_gen_interpret(ppc_opc_vmulesh);
4890     return flowEndBlock;
4891     #else
4892     int vrD, vrA, vrB;
4893     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4894    
4895     jitcFlushClientVectorRegister(vrA);
4896     jitcFlushClientVectorRegister(vrB);
4897     jitcDropClientVectorRegister(vrD);
4898    
4899     jitcClobberCarryAndFlags();
4900     NativeReg reg1 = jitcAllocRegister();
4901     NativeReg reg2 = jitcAllocRegister();
4902    
4903     for (int i=0; i<16; i+=4) {
4904     vec_getHalfS(reg1, vrA, i+2);
4905     vec_getHalfS(reg2, vrB, i+2);
4906    
4907     asmIMUL(reg1, reg2);
4908    
4909     vec_setWord(vrD, i, reg1);
4910     }
4911    
4912     return flowContinue;
4913     #endif
4914     }
4915    
4916     /* vmuloub Vector Multiply Odd Unsigned Byte
4917     * v.213
4918     */
4919     void ppc_opc_vmuloub()
4920     {
4921     VECTOR_DEBUG;
4922     int vrD, vrA, vrB;
4923     uint16 res;
4924     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4925    
4926     for (int i=0; i<8; i++) {
4927     res = (uint16)gCPU.vr[vrA].b[VECT_ODD(i)] *
4928     (uint16)gCPU.vr[vrB].b[VECT_ODD(i)];
4929    
4930     gCPU.vr[vrD].h[i] = res;
4931     }
4932     }
4933     JITCFlow ppc_opc_gen_vmuloub()
4934     {
4935     #if 0
4936     ppc_opc_gen_interpret(ppc_opc_vmuloub);
4937     return flowEndBlock;
4938     #else
4939     int vrD, vrA, vrB;
4940     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4941    
4942     jitcFlushClientVectorRegister(vrA);
4943     jitcFlushClientVectorRegister(vrB);
4944     jitcDropClientVectorRegister(vrD);
4945    
4946     jitcClobberCarryAndFlags();
4947     NativeReg reg1 = jitcAllocRegister();
4948     NativeReg reg2 = jitcAllocRegister();
4949    
4950     for (int i=0; i<16; i+=2) {
4951     vec_getByteZ(reg1, vrA, i);
4952     vec_getByteZ(reg2, vrB, i);
4953    
4954     asmIMUL(reg1, reg2);
4955    
4956     vec_setHalf(vrD, i, (NativeReg16)reg1);
4957     }
4958    
4959     return flowContinue;
4960     #endif
4961     }
4962    
4963     /* vmulosb Vector Multiply Odd Signed Byte
4964     * v.211
4965     */
4966     void ppc_opc_vmulosb()
4967     {
4968     VECTOR_DEBUG;
4969     int vrD, vrA, vrB;
4970     sint16 res;
4971     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4972    
4973     for (int i=0; i<8; i++) {
4974     res = (sint16)gCPU.vr[vrA].sb[VECT_ODD(i)] *
4975     (sint16)gCPU.vr[vrB].sb[VECT_ODD(i)];
4976    
4977     gCPU.vr[vrD].sh[i] = res;
4978     }
4979     }
4980     JITCFlow ppc_opc_gen_vmulosb()
4981     {
4982     #if 0
4983     ppc_opc_gen_interpret(ppc_opc_vmulosb);
4984     return flowEndBlock;
4985     #else
4986     int vrD, vrA, vrB;
4987     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4988    
4989     jitcFlushClientVectorRegister(vrA);
4990     jitcFlushClientVectorRegister(vrB);
4991     jitcDropClientVectorRegister(vrD);
4992    
4993     jitcClobberCarryAndFlags();
4994     NativeReg reg1 = jitcAllocRegister();
4995     NativeReg reg2 = jitcAllocRegister();
4996    
4997     for (int i=0; i<16; i+=2) {
4998     vec_getByteS(reg1, vrA, i);
4999     vec_getByteS(reg2, vrB, i);
5000    
5001     asmIMUL(reg1, reg2);
5002    
5003     vec_setHalf(vrD, i, (NativeReg16)reg1);
5004     }
5005    
5006     return flowContinue;
5007     #endif
5008     }
5009    
5010     /* vmulouh Vector Multiply Odd Unsigned Half Word
5011     * v.214
5012     */
5013     void ppc_opc_vmulouh()
5014     {
5015     VECTOR_DEBUG;
5016     int vrD, vrA, vrB;
5017     uint32 res;
5018     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5019    
5020     for (int i=0; i<4; i++) {
5021     res = (uint32)gCPU.vr[vrA].h[VECT_ODD(i)] *
5022     (uint32)gCPU.vr[vrB].h[VECT_ODD(i)];
5023    
5024     gCPU.vr[vrD].w[i] = res;
5025     }
5026     }
5027     JITCFlow ppc_opc_gen_vmulouh()
5028     {
5029     #if 0
5030     ppc_opc_gen_interpret(ppc_opc_vmulouh);
5031     return flowEndBlock;
5032     #else
5033     int vrD, vrA, vrB;
5034     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
5035    
5036     jitcFlushClientVectorRegister(vrA);
5037     jitcFlushClientVectorRegister(vrB);
5038     jitcDropClientVectorRegister(vrD);
5039    
5040     jitcClobberCarryAndFlags();
5041     NativeReg reg1 = jitcAllocRegister();
5042     NativeReg reg2 = jitcAllocRegister();
5043    
5044     for (int i=0; i<16; i+=4) {
5045     vec_getHalfZ(reg1, vrA, i);
5046     vec_getHalfZ(reg2, vrB, i);
5047    
5048     asmIMUL(reg1, reg2);
5049    
5050     vec_setWord(vrD, i, reg1);
5051     }
5052    
5053     return flowContinue;
5054     #endif
5055     }
5056    
5057     /* vmulosh Vector Multiply Odd Signed Half Word
5058     * v.212
5059     */
5060     void ppc_opc_vmulosh()
5061     {
5062     VECTOR_DEBUG;
5063     int vrD, vrA, vrB;
5064     sint32 res;
5065     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5066    
5067     for (int i=0; i<4; i++) {
5068     res = (sint32)gCPU.vr[vrA].sh[VECT_ODD(i)] *
5069     (sint32)gCPU.vr[vrB].sh[VECT_ODD(i)];
5070    
5071     gCPU.vr[vrD].sw[i] = res;
5072     }
5073     }
5074     JITCFlow ppc_opc_gen_vmulosh()
5075     {
5076     #if 0
5077     ppc_opc_gen_interpret(ppc_opc_vmulosh);
5078     return flowEndBlock;
5079     #else
5080     int vrD, vrA, vrB;
5081     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
5082    
5083     jitcFlushClientVectorRegister(vrA);
5084     jitcFlushClientVectorRegister(vrB);
5085     jitcDropClientVectorRegister(vrD);
5086    
5087     jitcClobberCarryAndFlags();
5088     NativeReg reg1 = jitcAllocRegister();
5089     NativeReg reg2 = jitcAllocRegister();
5090    
5091     for (int i=0; i<16; i+=4) {
5092     vec_getHalfS(reg1, vrA, i);
5093     vec_getHalfS(reg2, vrB, i);
5094    
5095     asmIMUL(reg1, reg2);
5096    
5097     vec_setWord(vrD, i, reg1);
5098     }
5099    
5100     return flowContinue;
5101     #endif
5102     }
5103    
5104     /* vmaddfp Vector Multiply Add Floating Point
5105     * v.177
5106     */
5107     void ppc_opc_vmaddfp()
5108     {
5109     VECTOR_DEBUG;
5110     int vrD, vrA, vrB, vrC;
5111     double res;
5112     PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
5113    
5114     for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
5115     res = (double)gCPU.vr[vrA].f[i] * (double)gCPU.vr[vrC].f[i];
5116    
5117     res = (double)gCPU.vr[vrB].f[i] + res;
5118    
5119     gCPU.vr[vrD].f[i] = (float)res;
5120     }
5121     }
5122     JITCFlow ppc_opc_gen_vmaddfp()
5123     {
5124     #if 0
5125     ppc_opc_gen_interpret(ppc_opc_vmaddfp);
5126     return flowEndBlock;
5127     #else
5128     int vrD, vrA, vrB, vrC;
5129     PPC_OPC_TEMPL_A(gJITC.current_opc, vrD, vrA, vrB, vrC);
5130    
5131     if (SSE_AVAIL) {
5132     commutative_operation(X86_MULPS, vrT, vrA, vrC);
5133     commutative_operation(X86_ADDPS, vrD, vrB, vrT);
5134     return flowContinue;
5135     }
5136    
5137     ppc_opc_gen_interpret(ppc_opc_vmaddfp);
5138     return flowEndBlock;
5139     #endif
5140     }
5141    
5142     /* vmhaddshs Vector Multiply High and Add Signed Half Word Saturate
5143     * v.185
5144     */
5145     void ppc_opc_vmhaddshs()
5146     {
5147     VECTOR_DEBUG;
5148     int vrD, vrA, vrB, vrC;
5149     sint32 prod;
5150     PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
5151    
5152     for (int i=0; i<8; i++) {
5153     prod = (sint32)gCPU.vr[vrA].sh[i] * (sint32)gCPU.vr[vrB].sh[i];
5154    
5155     prod = (prod >> 15) + (sint32)gCPU.vr[vrC].sh[i];
5156    
5157     gCPU.vr[vrD].sh[i] = SATURATE_SH(prod);
5158     }
5159     }
5160     JITCFlow ppc_opc_gen_vmhaddshs()
5161     {
5162     #if 0
5163     ppc_opc_gen_interpret(ppc_opc_vmhaddshs);
5164     return flowEndBlock;
5165     #else
5166     int vrD, vrA, vrB, vrC;
5167     PPC_OPC_TEMPL_A(gJITC.current_opc, vrD, vrA, vrB, vrC);
5168    
5169     jitcFlushClientVectorRegister(vrA);
5170     jitcFlushClientVectorRegister(vrB);
5171     jitcFlushClientVectorRegister(vrC);
5172     jitcDropClientVectorRegister(vrD);
5173    
5174     jitcClobberCarryAndFlags();
5175     NativeReg reg1 = jitcAllocRegister();
5176     NativeReg reg2 = jitcAllocRegister();
5177    
5178     for (int i=0; i<16; i+=2) {
5179     vec_getHalfS(reg1, vrA, i);
5180     vec_getHalfS(reg2, vrB, i);
5181    
5182     asmIMUL(reg1, reg2);
5183     asmShift(X86_SAR, reg1, 15);
5184    
5185     vec_getHalfS(reg2, vrC, i);
5186     asmALU(X86_ADD, reg1, reg2);
5187    
5188     vec_saturateSH(reg1, reg2);
5189    
5190     vec_setHalf(vrD, i, (NativeReg16)reg1);
5191     }
5192    
5193     return flowContinue;
5194     #endif
5195     }
5196    
5197     /* vmladduhm Vector Multiply Low and Add Unsigned Half Word Modulo
5198     * v.194
5199     */
5200     void ppc_opc_vmladduhm()
5201     {
5202     VECTOR_DEBUG;
5203     int vrD, vrA, vrB, vrC;
5204     uint32 prod;
5205     PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
5206    
5207     for (int i=0; i<8; i++) {
5208     prod = (uint32)gCPU.vr[vrA].h[i] * (uint32)gCPU.vr[vrB].h[i];
5209    
5210     prod = prod + (uint32)gCPU.vr[vrC].h[i];
5211    
5212     gCPU.vr[vrD].h[i] = prod;
5213     }
5214     }
5215     JITCFlow ppc_opc_gen_vmladduhm()
5216     {
5217     #if 0
5218     ppc_opc_gen_interpret(ppc_opc_vmladduhm);
5219     return flowEndBlock;
5220     #else
5221     int vrD, vrA, vrB, vrC;
5222     modrm_o modrm;
5223     PPC_OPC_TEMPL_A(gJITC.current_opc, vrD, vrA, vrB, vrC);
5224    
5225     if (SSE2_AVAIL) {
5226     commutative_operation(X86_PMULLW, vrT, vrA, vrB);
5227     commutative_operation(PALUW(X86_PADD), vrD, vrT, vrC);
5228     return flowContinue;
5229     }
5230    
5231     jitcFlushClientVectorRegister(vrA);
5232     jitcFlushClientVectorRegister(vrB);
5233     jitcFlushClientVectorRegister(vrC);
5234     jitcDropClientVectorRegister(vrD);
5235    
5236     jitcClobberCarryAndFlags();
5237     NativeReg16 reg1 = (NativeReg16)jitcAllocRegister();
5238     NativeReg reg2 = jitcAllocRegister();
5239    
5240     for (int i=0; i<16; i+=2) {
5241     vec_getHalfZ((NativeReg)reg1, vrA, i);
5242     vec_getHalfZ(reg2, vrB, i);
5243    
5244     asmIMUL((NativeReg)reg1, reg2);
5245    
5246     if (vrC == vrD) {
5247     asmALU(X86_ADD,
5248     x86_mem2(modrm, &(gCPU.vr[vrD].b[i])), reg1);
5249     } else {
5250     asmALU(X86_ADD, reg1,
5251     x86_mem2(modrm, &(gCPU.vr[vrC].b[i])));
5252    
5253     vec_setHalf(vrD, i, reg1);
5254     }
5255     }
5256    
5257     return flowContinue;
5258     #endif
5259     }
5260    
5261     /* vmhraddshs Vector Multiply High Round and Add Signed Half Word Saturate
5262     * v.186
5263     */
5264     void ppc_opc_vmhraddshs()
5265     {
5266     VECTOR_DEBUG;
5267     int vrD, vrA, vrB, vrC;
5268     sint32 prod;
5269     PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
5270    
5271     for (int i=0; i<8; i++) {
5272     prod = (sint32)gCPU.vr[vrA].sh[i] * (sint32)gCPU.vr[vrB].sh[i];
5273    
5274     prod += 0x4000;
5275     prod = (prod >> 15) + (sint32)gCPU.vr[vrC].sh[i];
5276    
5277     gCPU.vr[vrD].sh[i] = SATURATE_SH(prod);
5278     }
5279     }
5280     JITCFlow ppc_opc_gen_vmhraddshs()
5281     {
5282     #if 0
5283     ppc_opc_gen_interpret(ppc_opc_vmhraddshs);
5284     return flowEndBlock;
5285     #else
5286     int vrD, vrA, vrB, vrC;
5287     PPC_OPC_TEMPL_A(gJITC.current_opc, vrD, vrA, vrB, vrC);
5288    
5289     jitcFlushClientVectorRegister(vrA);
5290     jitcFlushClientVectorRegister(vrB);
5291     jitcFlushClientVectorRegister(vrC);
5292     jitcDropClientVectorRegister(vrD);
5293    
5294     jitcClobberCarryAndFlags();
5295     NativeReg reg1 = jitcAllocRegister();
5296     NativeReg reg2 = jitcAllocRegister();
5297    
5298     for (int i=0; i<16; i+=2) {
5299     vec_getHalfS(reg1, vrA, i);
5300     vec_getHalfS(reg2, vrB, i);
5301    
5302     asmIMUL(reg1, reg2);
5303     asmALU(X86_ADD, reg1, 0x4000);
5304     asmShift(X86_SAR, reg1, 15);
5305    
5306     vec_getHalfS(reg2, vrC, i);
5307     asmALU(X86_ADD, reg1, reg2);
5308    
5309     vec_saturateSH(reg1, reg2);
5310    
5311     vec_setHalf(vrD, i, (NativeReg16)reg1);
5312     }
5313    
5314     return flowContinue;
5315     #endif
5316     }
5317    
5318     /* vmsumubm Vector Multiply Sum Unsigned Byte Modulo
5319     * v.204
5320     */
5321     void ppc_opc_vmsumubm()
5322     {
5323     VECTOR_DEBUG;
5324     int vrD, vrA, vrB, vrC;
5325     uint32 temp;
5326     PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
5327    
5328     for (int i=0; i<4; i++) {
5329     temp = gCPU.vr[vrC].w[i];
5330    
5331     temp += (uint16)gCPU.vr[vrA].b[i<<2] *
5332     (uint16)gCPU.vr[vrB].b[i<<2];
5333    
5334     temp += (uint16)gCPU.vr[vrA].b[(i<<2)+1] *
5335     (uint16)gCPU.vr[vrB].b[(i<<2)+1];
5336    
5337     temp += (uint16)gCPU.vr[vrA].b[(i<<2)+2] *
5338     (uint16)gCPU.vr[vrB].b[(i<<2)+2];
5339    
5340     temp += (uint16)gCPU.vr[vrA].b[(i<<2)+3] *
5341     (uint16)gCPU.vr[vrB].b[(i<<2)+3];
5342    
5343     gCPU.vr[vrD].w[i] = temp;
5344     }
5345     }
5346     JITCFlow ppc_opc_gen_vmsumubm()
5347     {
5348     ppc_opc_gen_interpret(ppc_opc_vmsumubm);
5349     return flowEndBlock;
5350     }
5351    
5352     /* vmsumuhm Vector Multiply Sum Unsigned Half Word Modulo
5353     * v.205
5354     */
5355     void ppc_opc_vmsumuhm()
5356     {
5357     VECTOR_DEBUG;
5358     int vrD, vrA, vrB, vrC;
5359     uint32 temp;
5360     PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
5361    
5362     for (int i=0; i<4; i++) {
5363     temp = gCPU.vr[vrC].w[i];
5364    
5365     temp += (uint32)gCPU.vr[vrA].h[i<<1] *
5366     (uint32)gCPU.vr[vrB].h[i<<1];
5367     temp += (uint32)gCPU.vr[vrA].h[(i<<1)+1] *
5368     (uint32)gCPU.vr[vrB].h[(i<<1)+1];
5369    
5370     gCPU.vr[vrD].w[i] = temp;
5371     }
5372     }
5373     JITCFlow ppc_opc_gen_vmsumuhm()
5374     {
5375     ppc_opc_gen_interpret(ppc_opc_vmsumuhm);
5376     return flowEndBlock;
5377     }
5378    
5379     /* vmsummbm Vector Multiply Sum Mixed-Sign Byte Modulo
5380     * v.201
5381     */
5382     void ppc_opc_vmsummbm()
5383     {
5384     VECTOR_DEBUG;
5385     int vrD, vrA, vrB, vrC;
5386     sint32 temp;
5387     PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
5388    
5389     for (int i=0; i<4; i++) {
5390     temp = gCPU.vr[vrC].sw[i];
5391    
5392     temp += (sint16)gCPU.vr[vrA].sb[i<<2] *
5393     (uint16)gCPU.vr[vrB].b[i<<2];
5394     temp += (sint16)gCPU.vr[vrA].sb[(i<<2)+1] *
5395     (uint16)gCPU.vr[vrB].b[(i<<2)+1];
5396     temp += (sint16)gCPU.vr[vrA].sb[(i<<2)+2] *
5397     (uint16)gCPU.vr[vrB].b[(i<<2)+2];
5398     temp += (sint16)gCPU.vr[vrA].sb[(i<<2)+3] *
5399     (uint16)gCPU.vr[vrB].b[(i<<2)+3];
5400    
5401     gCPU.vr[vrD].sw[i] = temp;
5402     }
5403     }
5404     JITCFlow ppc_opc_gen_vmsummbm()
5405     {
5406     #if 0
5407     ppc_opc_gen_interpret(ppc_opc_vmsummbm);
5408     return flowEndBlock;
5409     #else
5410     int vrD, vrA, vrB, vrC;
5411     PPC_OPC_TEMPL_A(gJITC.current_opc, vrD, vrA, vrB, vrC);
5412    
5413     jitcFlushClientVectorRegister(vrA);
5414     jitcFlushClientVectorRegister(vrB);
5415     jitcFlushClientVectorRegister(vrC);
5416     jitcDropClientVectorRegister(vrD);
5417    
5418     jitcClobberCarryAndFlags();
5419     NativeReg reg1 = jitcAllocRegister();
5420     NativeReg reg2 = jitcAllocRegister();
5421     NativeReg reg3 = jitcAllocRegister();
5422    
5423     for (int i=0; i<16; i+=4) {
5424     vec_getWord(reg1, vrC, i);
5425    
5426     for (int j=0; j<4; j++) {
5427     vec_getByteS(reg2, vrA, i+j);
5428     vec_getByteZ(reg3, vrB, i+j);
5429    
5430     asmIMUL(reg2, reg3);
5431     asmALU(X86_ADD, reg1, reg2);
5432     }
5433    
5434     vec_setWord(vrD, i, reg1);
5435     }
5436    
5437     return flowContinue;
5438     #endif
5439     }
5440    
5441     /* vmsumshm Vector Multiply Sum Signed Half Word Modulo
5442     * v.202
5443     */
5444     void ppc_opc_vmsumshm()
5445     {
5446     VECTOR_DEBUG;
5447     int vrD, vrA, vrB, vrC;
5448     sint32 temp;
5449     PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
5450    
5451     for (int i=0; i<4; i++) {
5452     temp = gCPU.vr[vrC].sw[i];
5453    
5454     temp += (sint32)gCPU.vr[vrA].sh[i<<1] *
5455     (sint32)gCPU.vr[vrB].sh[i<<1];
5456     temp += (sint32)gCPU.vr[vrA].sh[(i<<1)+1] *
5457     (sint32)gCPU.vr[vrB].sh[(i<<1)+1];
5458    
5459     gCPU.vr[vrD].sw[i] = temp;
5460     }
5461     }
5462     JITCFlow ppc_opc_gen_vmsumshm()
5463     {
5464     ppc_opc_gen_interpret(ppc_opc_vmsumshm);
5465     return flowEndBlock;
5466     }
5467    
5468     /* vmsumuhs Vector Multiply Sum Unsigned Half Word Saturate
5469     * v.206
5470     */
5471     void ppc_opc_vmsumuhs()
5472     {
5473     VECTOR_DEBUG;
5474     int vrD, vrA, vrB, vrC;
5475     uint64 temp;
5476     PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
5477    
5478     /* For this, there's no way to get around 64-bit math. If we use
5479     * the hacks used before, then we have to do it so often, that
5480     * we'll outpace the 64-bit math in execution time.
5481     */
5482     for (int i=0; i<4; i++) {
5483     temp = gCPU.vr[vrC].w[i];
5484    
5485     temp += (uint32)gCPU.vr[vrA].h[i<<1] *
5486     (uint32)gCPU.vr[vrB].h[i<<1];
5487    
5488     temp += (uint32)gCPU.vr[vrA].h[(i<<1)+1] *
5489     (uint32)gCPU.vr[vrB].h[(i<<1)+1];
5490    
5491     gCPU.vr[vrD].w[i] = SATURATE_UW(temp);
5492     }
5493     }
5494     JITCFlow ppc_opc_gen_vmsumuhs()
5495     {
5496     ppc_opc_gen_interpret(ppc_opc_vmsumuhs);
5497     return flowEndBlock;
5498     }
5499    
5500     /* vmsumshs Vector Multiply Sum Signed Half Word Saturate
5501     * v.203
5502     */
5503     void ppc_opc_vmsumshs()
5504     {
5505     VECTOR_DEBUG;
5506     int vrD, vrA, vrB, vrC;
5507     sint64 temp;
5508     PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
5509    
5510     /* For this, there's no way to get around 64-bit math. If we use
5511     * the hacks used before, then we have to do it so often, that
5512     * we'll outpace the 64-bit math in execution time.
5513     */
5514    
5515     for (int i=0; i<4; i++) {
5516     temp = gCPU.vr[vrC].sw[i];
5517    
5518     temp += (sint32)gCPU.vr[vrA].sh[i<<1] *
5519     (sint32)gCPU.vr[vrB].sh[i<<1];
5520     temp += (sint32)gCPU.vr[vrA].sh[(i<<1)+1] *
5521     (sint32)gCPU.vr[vrB].sh[(i<<1)+1];
5522    
5523     gCPU.vr[vrD].sw[i] = SATURATE_SW(temp);
5524     }
5525     }
5526     JITCFlow ppc_opc_gen_vmsumshs()
5527     {
5528     ppc_opc_gen_interpret(ppc_opc_vmsumshs);
5529     return flowEndBlock;
5530     }
5531    
5532     /* vsum4ubs Vector Sum Across Partial (1/4) Unsigned Byte Saturate
5533     * v.275
5534     */
5535     void ppc_opc_vsum4ubs()
5536     {
5537     VECTOR_DEBUG;
5538     int vrD, vrA, vrB;
5539     uint64 res;
5540     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5541    
5542     /* For this, there's no way to get around 64-bit math. If we use
5543     * the hacks used before, then we have to do it so often, that
5544     * we'll outpace the 64-bit math in execution time.
5545     */
5546    
5547     for (int i=0; i<4; i++) {
5548     res = (uint64)gCPU.vr[vrB].w[i];
5549    
5550     res += (uint64)gCPU.vr[vrA].b[(i<<2)];
5551     res += (uint64)gCPU.vr[vrA].b[(i<<2)+1];
5552     res += (uint64)gCPU.vr[vrA].b[(i<<2)+2];
5553     res += (uint64)gCPU.vr[vrA].b[(i<<2)+3];
5554    
5555     gCPU.vr[vrD].w[i] = SATURATE_UW(res);
5556     }
5557     }
5558     JITCFlow ppc_opc_gen_vsum4ubs()
5559     {
5560     ppc_opc_gen_interpret(ppc_opc_vsum4ubs);
5561     return flowEndBlock;
5562     }
5563    
5564     /* vsum4sbs Vector Sum Across Partial (1/4) Signed Byte Saturate
5565     * v.273
5566     */
5567     void ppc_opc_vsum4sbs()
5568     {
5569     VECTOR_DEBUG;
5570     int vrD, vrA, vrB;
5571     sint64 res;
5572     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5573    
5574     for (int i=0; i<4; i++) {
5575     res = (sint64)gCPU.vr[vrB].sw[i];
5576    
5577     res += (sint64)gCPU.vr[vrA].sb[(i<<2)];
5578     res += (sint64)gCPU.vr[vrA].sb[(i<<2)+1];
5579     res += (sint64)gCPU.vr[vrA].sb[(i<<2)+2];
5580     res += (sint64)gCPU.vr[vrA].sb[(i<<2)+3];
5581    
5582     gCPU.vr[vrD].sw[i] = SATURATE_SW(res);
5583     }
5584     }
5585     JITCFlow ppc_opc_gen_vsum4sbs()
5586     {
5587     ppc_opc_gen_interpret(ppc_opc_vsum4sbs);
5588     return flowEndBlock;
5589     }
5590    
5591     /* vsum4shs Vector Sum Across Partial (1/4) Signed Half Word Saturate
5592     * v.274
5593     */
5594     void ppc_opc_vsum4shs()
5595     {
5596     VECTOR_DEBUG;
5597     int vrD, vrA, vrB;
5598     sint64 res;
5599     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5600    
5601     for (int i=0; i<4; i++) {
5602     res = (sint64)gCPU.vr[vrB].sw[i];
5603    
5604     res += (sint64)gCPU.vr[vrA].sh[(i<<1)];
5605     res += (sint64)gCPU.vr[vrA].sh[(i<<1)+1];
5606    
5607     gCPU.vr[vrD].sw[i] = SATURATE_SW(res);
5608     }
5609     }
5610     JITCFlow ppc_opc_gen_vsum4shs()
5611     {
5612     ppc_opc_gen_interpret(ppc_opc_vsum4shs);
5613     return flowEndBlock;
5614     }
5615    
5616     /* vsum2sws Vector Sum Across Partial (1/2) Signed Word Saturate
5617     * v.272
5618     */
5619     void ppc_opc_vsum2sws()
5620     {
5621     VECTOR_DEBUG;
5622     int vrD, vrA, vrB;
5623     sint64 res;
5624     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5625    
5626     res = (sint64)gCPU.vr[vrA].sw[0] + (sint64)gCPU.vr[vrA].sw[1];
5627     res += (sint64)gCPU.vr[vrB].sw[VECT_ODD(0)];
5628    
5629     gCPU.vr[vrD].w[VECT_ODD(0)] = SATURATE_SW(res);
5630     gCPU.vr[vrD].w[VECT_EVEN(0)] = 0;
5631    
5632     res = (sint64)gCPU.vr[vrA].sw[2] + (sint64)gCPU.vr[vrA].sw[3];
5633     res += (sint64)gCPU.vr[vrB].sw[VECT_ODD(1)];
5634    
5635     gCPU.vr[vrD].w[VECT_ODD(1)] = SATURATE_SW(res);
5636     gCPU.vr[vrD].w[VECT_EVEN(1)] = 0;
5637     }
5638     JITCFlow ppc_opc_gen_vsum2sws()
5639     {
5640     ppc_opc_gen_interpret(ppc_opc_vsum2sws);
5641     return flowEndBlock;
5642     }
5643    
5644     /* vsumsws Vector Sum Across Signed Word Saturate
5645     * v.271
5646     */
5647     void ppc_opc_vsumsws()
5648     {
5649     VECTOR_DEBUG;
5650     int vrD, vrA, vrB;
5651     sint64 res;
5652     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5653    
5654     res = (sint64)gCPU.vr[vrA].sw[0] + (sint64)gCPU.vr[vrA].sw[1];
5655     res += (sint64)gCPU.vr[vrA].sw[2] + (sint64)gCPU.vr[vrA].sw[3];
5656    
5657     res += (sint64)VECT_W(gCPU.vr[vrB], 3);
5658    
5659     VECT_W(gCPU.vr[vrD], 3) = SATURATE_SW(res);
5660     VECT_W(gCPU.vr[vrD], 2) = 0;
5661     VECT_W(gCPU.vr[vrD], 1) = 0;
5662     VECT_W(gCPU.vr[vrD], 0) = 0;
5663     }
5664     JITCFlow ppc_opc_gen_vsumsws()
5665     {
5666     ppc_opc_gen_interpret(ppc_opc_vsumsws);
5667     return flowEndBlock;
5668     }
5669    
5670     /* vnmsubfp Vector Negative Multiply-Subtract Floating Point
5671     * v.215
5672     */
5673     void ppc_opc_vnmsubfp()
5674     {
5675     VECTOR_DEBUG;
5676     int vrD, vrA, vrB, vrC;
5677     double res;
5678     PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
5679    
5680     for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
5681     res = (double)gCPU.vr[vrA].f[i] * (double)gCPU.vr[vrC].f[i];
5682    
5683     res = (double)gCPU.vr[vrB].f[i] - res;
5684    
5685     gCPU.vr[vrD].f[i] = (float)res;
5686     }
5687     }
5688     JITCFlow ppc_opc_gen_vnmsubfp()
5689     {
5690     #if 0
5691     ppc_opc_gen_interpret(ppc_opc_vnmsubfp);
5692     return flowEndBlock;
5693     #else
5694     int vrD, vrA, vrB, vrC;
5695     PPC_OPC_TEMPL_A(gJITC.current_opc, vrD, vrA, vrB, vrC);
5696    
5697     if (SSE_AVAIL) {
5698     commutative_operation(X86_MULPS, vrT, vrA, vrC);
5699     noncommutative_operation(X86_SUBPS, vrD, vrB, vrT);
5700     return flowContinue;
5701     }
5702    
5703     ppc_opc_gen_interpret(ppc_opc_vnmsubfp);
5704     return flowEndBlock;
5705     #endif
5706     }
5707    
5708     /* vavgub Vector Average Unsigned Byte
5709     * v.152
5710     */
5711     void ppc_opc_vavgub()
5712     {
5713     VECTOR_DEBUG;
5714     int vrD, vrA, vrB;
5715     uint16 res;
5716     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5717    
5718     for (int i=0; i<16; i++) {
5719     res = (uint16)gCPU.vr[vrA].b[i] +
5720     (uint16)gCPU.vr[vrB].b[i] + 1;
5721    
5722     gCPU.vr[vrD].b[i] = (res >> 1);
5723     }
5724     }
5725     JITCFlow ppc_opc_gen_vavgub()
5726     {
5727     #if 0
5728     ppc_opc_gen_interpret(ppc_opc_vavgub);
5729     return flowEndBlock;
5730     #else
5731     int vrD, vrA, vrB;
5732     modrm_o modrma, modrmb;
5733     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
5734    
5735     if (SSE2_AVAIL) {
5736     commutative_operation(X86_PAVGB, vrD, vrA, vrB);
5737     return flowContinue;
5738     }
5739    
5740     jitcFlushClientVectorRegister(vrA);
5741     jitcFlushClientVectorRegister(vrB);
5742     jitcDropClientVectorRegister(vrD);
5743    
5744     jitcClobberCarryAndFlags();
5745     NativeReg reg = jitcAllocRegister();
5746     NativeReg reg2 = jitcAllocRegister();
5747     NativeReg reg3 = jitcAllocRegister();
5748    
5749     for (int i=0; i<16; i += 4) {
5750     x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
5751     x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
5752    
5753     asmALU(X86_MOV, reg, modrma);
5754     asmALU(X86_MOV, reg2, modrmb);
5755     asmALU(X86_MOV, reg3, reg);
5756    
5757     asmALU(X86_OR, reg3, reg2);
5758     asmALU(X86_AND, reg3, 0x01010101);
5759    
5760     asmShift(X86_SHR, reg, 1);
5761     asmShift(X86_SHR, reg2, 1);
5762    
5763     asmALU(X86_AND, reg, 0x7f7f7f7f);
5764     asmALU(X86_AND, reg2, 0x7f7f7f7f);
5765    
5766     asmALU(X86_ADD, reg, reg2);
5767     asmALU(X86_ADD, reg, reg3);
5768    
5769     vec_setWord(vrD, i, reg);
5770     }
5771    
5772     return flowContinue;
5773     #endif
5774     }
5775    
5776     /* vavguh Vector Average Unsigned Half Word
5777     * v.153
5778     */
5779     void ppc_opc_vavguh()
5780     {
5781     VECTOR_DEBUG;
5782     int vrD, vrA, vrB;
5783     uint32 res;
5784     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5785    
5786     for (int i=0; i<8; i++) {
5787     res = (uint32)gCPU.vr[vrA].h[i] +
5788     (uint32)gCPU.vr[vrB].h[i] + 1;
5789    
5790     gCPU.vr[vrD].h[i] = (res >> 1);
5791     }
5792     }
5793     JITCFlow ppc_opc_gen_vavguh()
5794     {
5795     #if 0
5796     ppc_opc_gen_interpret(ppc_opc_vavguh);
5797     return flowEndBlock;
5798     #else
5799     int vrD, vrA, vrB;
5800     modrm_o modrma, modrmb;
5801     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
5802    
5803     if (SSE2_AVAIL) {
5804     commutative_operation(X86_PAVGW, vrD, vrA, vrB);
5805     return flowContinue;
5806     }
5807    
5808     jitcFlushClientVectorRegister(vrA);
5809     jitcFlushClientVectorRegister(vrB);
5810     jitcDropClientVectorRegister(vrD);
5811    
5812     jitcClobberCarryAndFlags();
5813     NativeReg reg = jitcAllocRegister();
5814     NativeReg reg2 = jitcAllocRegister();
5815     NativeReg reg3 = jitcAllocRegister();
5816    
5817     for (int i=0; i<16; i += 4) {
5818     x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
5819     x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
5820    
5821     asmALU(X86_MOV, reg, modrma);
5822     asmALU(X86_MOV, reg2, modrmb);
5823     asmALU(X86_MOV, reg3, reg);
5824    
5825     asmALU(X86_OR, reg3, reg2);
5826     asmALU(X86_AND, reg3, 0x00010001);
5827    
5828     asmShift(X86_SHR, reg, 1);
5829     asmShift(X86_SHR, reg2, 1);
5830    
5831     asmALU(X86_AND, reg, 0x7fff7fff);
5832     asmALU(X86_AND, reg2, 0x7fff7fff);
5833    
5834     asmALU(X86_ADD, reg, reg2);
5835     asmALU(X86_ADD, reg, reg3);
5836    
5837     vec_setWord(vrD, i, reg);
5838     }
5839    
5840     return flowContinue;
5841     #endif
5842     }
5843    
5844     /* vavguw Vector Average Unsigned Word
5845     * v.154
5846     */
5847     void ppc_opc_vavguw()
5848     {
5849     VECTOR_DEBUG;
5850     int vrD, vrA, vrB;
5851     uint64 res;
5852     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5853    
5854     for (int i=0; i<4; i++) {
5855     res = (uint64)gCPU.vr[vrA].w[i] +
5856     (uint64)gCPU.vr[vrB].w[i] + 1;
5857    
5858     gCPU.vr[vrD].w[i] = (res >> 1);
5859     }
5860     }
5861     JITCFlow ppc_opc_gen_vavguw()
5862     {
5863     #if 0
5864     ppc_opc_gen_interpret(ppc_opc_vavguw);
5865     return flowEndBlock;
5866     #else
5867     int vrD, vrA, vrB;
5868     modrm_o modrma, modrmb;
5869     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
5870    
5871     jitcFlushClientVectorRegister(vrA);
5872     jitcFlushClientVectorRegister(vrB);
5873     jitcDropClientVectorRegister(vrD);
5874    
5875     jitcClobberCarryAndFlags();
5876     NativeReg reg = jitcAllocRegister();
5877     NativeReg reg2 = jitcAllocRegister();
5878     NativeReg reg3 = jitcAllocRegister();
5879    
5880     for (int i=0; i<16; i += 4) {
5881     x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
5882     x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
5883    
5884     asmALU(X86_MOV, reg, modrma);
5885     asmALU(X86_MOV, reg2, modrmb);
5886     asmALU(X86_MOV, reg3, reg);
5887    
5888     asmALU(X86_OR, reg3, reg2);
5889     asmALU(X86_AND, reg3, 0x00000001);
5890    
5891     asmShift(X86_SHR, reg, 1);
5892     asmShift(X86_SHR, reg2, 1);
5893    
5894     asmALU(X86_ADD, reg, reg2);
5895     asmALU(X86_ADD, reg, reg3);
5896    
5897     vec_setWord(vrD, i, reg);
5898     }
5899    
5900     return flowContinue;
5901     #endif
5902     }
5903    
5904     /* vavgsb Vector Average Signed Byte
5905     * v.149
5906     */
5907     void ppc_opc_vavgsb()
5908     {
5909     VECTOR_DEBUG;
5910     int vrD, vrA, vrB;
5911     sint16 res;
5912     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5913    
5914     for (int i=0; i<16; i++) {
5915     res = (sint16)gCPU.vr[vrA].sb[i] +
5916     (sint16)gCPU.vr[vrB].sb[i] + 1;
5917    
5918     gCPU.vr[vrD].sb[i] = (res >> 1);
5919     }
5920     }
5921     JITCFlow ppc_opc_gen_vavgsb()
5922     {
5923     ppc_opc_gen_interpret(ppc_opc_vavgsb);
5924     return flowEndBlock;
5925     }
5926    
5927     /* vavgsh Vector Average Signed Half Word
5928     * v.150
5929     */
5930     void ppc_opc_vavgsh()
5931     {
5932     VECTOR_DEBUG;
5933     int vrD, vrA, vrB;
5934     sint32 res;
5935     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5936    
5937     for (int i=0; i<8; i++) {
5938     res = (sint32)gCPU.vr[vrA].sh[i] +
5939     (sint32)gCPU.vr[vrB].sh[i] + 1;
5940    
5941     gCPU.vr[vrD].sh[i] = (res >> 1);
5942     }
5943     }
5944     JITCFlow ppc_opc_gen_vavgsh()
5945     {
5946     ppc_opc_gen_interpret(ppc_opc_vavgsh);
5947     return flowEndBlock;
5948     }
5949    
5950     /* vavgsw Vector Average Signed Word
5951     * v.151
5952     */
5953     void ppc_opc_vavgsw()
5954     {
5955     VECTOR_DEBUG;
5956     int vrD, vrA, vrB;
5957     sint64 res;
5958     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5959    
5960     for (int i=0; i<4; i++) {
5961     res = (sint64)gCPU.vr[vrA].sw[i] +
5962     (sint64)gCPU.vr[vrB].sw[i] + 1;
5963    
5964     gCPU.vr[vrD].sw[i] = (res >> 1);
5965     }
5966     }
5967     JITCFlow ppc_opc_gen_vavgsw()
5968     {
5969     ppc_opc_gen_interpret(ppc_opc_vavgsw);
5970     return flowEndBlock;
5971     }
5972    
5973     /* vmaxub Vector Maximum Unsigned Byte
5974     * v.182
5975     */
5976     void ppc_opc_vmaxub()
5977     {
5978     VECTOR_DEBUG;
5979     int vrD, vrA, vrB;
5980     uint8 res;
5981     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5982    
5983     for (int i=0; i<16; i++) {
5984     res = gCPU.vr[vrA].b[i];
5985    
5986     if (res < gCPU.vr[vrB].b[i])
5987     res = gCPU.vr[vrB].b[i];
5988    
5989     gCPU.vr[vrD].b[i] = res;
5990     }
5991     }
5992     JITCFlow ppc_opc_gen_vmaxub()
5993     {
5994     #if 0
5995     ppc_opc_gen_interpret(ppc_opc_vmaxub);
5996     return flowEndBlock;
5997     #else
5998     int vrD, vrA, vrB;
5999     modrm_o modrm;
6000     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
6001    
6002     if (vrA == vrD && vrB == vrD) {
6003     return flowContinue;
6004     } else if (vrA == vrB) {
6005     vec_Copy(vrD, vrA);
6006    
6007     return flowContinue;
6008     }
6009    
6010     if (SSE2_AVAIL) {
6011     commutative_operation(X86_PMAXUB, vrD, vrA, vrB);
6012     return flowContinue;
6013     }
6014    
6015     jitcFlushClientVectorRegister(vrA);
6016     jitcFlushClientVectorRegister(vrB);
6017     jitcDropClientVectorRegister(vrD);
6018    
6019     jitcClobberCarryAndFlags();
6020     NativeReg8 reg = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
6021    
6022     for (int i=0; i<16; i++) {
6023     vec_getByte(reg, vrA, REG_NO, i);
6024    
6025     x86_mem2(modrm, &(gCPU.vr[vrB].b[i]));
6026    
6027     asmALU(X86_CMP, reg, modrm);
6028     asmCMOV(X86_B, (NativeReg)reg, modrm);
6029    
6030     vec_setByte(vrD, REG_NO, i, reg);
6031     }
6032    
6033     return flowContinue;
6034     #endif
6035     }
6036    
6037     /* vmaxuh Vector Maximum Unsigned Half Word
6038     * v.183
6039     */
6040     void ppc_opc_vmaxuh()
6041     {
6042     VECTOR_DEBUG;
6043     int vrD, vrA, vrB;
6044     uint16 res;
6045     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6046    
6047     for (int i=0; i<8; i++) {
6048     res = gCPU.vr[vrA].h[i];
6049    
6050     if (res < gCPU.vr[vrB].h[i])
6051     res = gCPU.vr[vrB].h[i];
6052    
6053     gCPU.vr[vrD].h[i] = res;
6054     }
6055     }
6056     JITCFlow ppc_opc_gen_vmaxuh()
6057     {
6058     ppc_opc_gen_interpret(ppc_opc_vmaxuh);
6059     return flowEndBlock;
6060     }
6061    
6062     /* vmaxuw Vector Maximum Unsigned Word
6063     * v.184
6064     */
6065     void ppc_opc_vmaxuw()
6066     {
6067     VECTOR_DEBUG;
6068     int vrD, vrA, vrB;
6069     uint32 res;
6070     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6071    
6072     for (int i=0; i<4; i++) {
6073     res = gCPU.vr[vrA].w[i];
6074    
6075     if (res < gCPU.vr[vrB].w[i])
6076     res = gCPU.vr[vrB].w[i];
6077    
6078     gCPU.vr[vrD].w[i] = res;
6079     }
6080     }
6081     JITCFlow ppc_opc_gen_vmaxuw()
6082     {
6083     ppc_opc_gen_interpret(ppc_opc_vmaxuw);
6084     return flowEndBlock;
6085     }
6086    
6087     /* vmaxsb Vector Maximum Signed Byte
6088     * v.179
6089     */
6090     void ppc_opc_vmaxsb()
6091     {
6092     VECTOR_DEBUG;
6093     int vrD, vrA, vrB;
6094     sint8 res;
6095     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6096    
6097     for (int i=0; i<16; i++) {
6098     res = gCPU.vr[vrA].sb[i];
6099    
6100     if (res < gCPU.vr[vrB].sb[i])
6101     res = gCPU.vr[vrB].sb[i];
6102    
6103     gCPU.vr[vrD].sb[i] = res;
6104     }
6105     }
6106     JITCFlow ppc_opc_gen_vmaxsb()
6107     {
6108     ppc_opc_gen_interpret(ppc_opc_vmaxsb);
6109     return flowEndBlock;
6110     }
6111    
6112     /* vmaxsh Vector Maximum Signed Half Word
6113     * v.180
6114     */
6115     void ppc_opc_vmaxsh()
6116     {
6117     VECTOR_DEBUG;
6118     int vrD, vrA, vrB;
6119     sint16 res;
6120     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6121    
6122     for (int i=0; i<8; i++) {
6123     res = gCPU.vr[vrA].sh[i];
6124    
6125     if (res < gCPU.vr[vrB].sh[i])
6126     res = gCPU.vr[vrB].sh[i];
6127    
6128     gCPU.vr[vrD].sh[i] = res;
6129     }
6130     }
6131     JITCFlow ppc_opc_gen_vmaxsh()
6132     {
6133     ppc_opc_gen_interpret(ppc_opc_vmaxsh);
6134     return flowEndBlock;
6135     }
6136    
6137     /* vmaxsw Vector Maximum Signed Word
6138     * v.181
6139     */
6140     void ppc_opc_vmaxsw()
6141     {
6142     VECTOR_DEBUG;
6143     int vrD, vrA, vrB;
6144     sint32 res;
6145     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6146    
6147     for (int i=0; i<4; i++) {
6148     res = gCPU.vr[vrA].sw[i];
6149    
6150     if (res < gCPU.vr[vrB].sw[i])
6151     res = gCPU.vr[vrB].sw[i];
6152    
6153     gCPU.vr[vrD].sw[i] = res;
6154     }
6155     }
6156     JITCFlow ppc_opc_gen_vmaxsw()
6157     {
6158     ppc_opc_gen_interpret(ppc_opc_vmaxsw);
6159     return flowEndBlock;
6160     }
6161    
6162     /* vmaxfp Vector Maximum Floating Point
6163     * v.178
6164     */
6165     void ppc_opc_vmaxfp()
6166     {
6167     VECTOR_DEBUG;
6168     int vrD, vrA, vrB;
6169     float res;
6170     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6171    
6172     for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6173     res = gCPU.vr[vrA].f[i];
6174    
6175     if (res < gCPU.vr[vrB].f[i])
6176     res = gCPU.vr[vrB].f[i];
6177    
6178     gCPU.vr[vrD].f[i] = res;
6179     }
6180     }
6181     JITCFlow ppc_opc_gen_vmaxfp()
6182     {
6183     #if 0
6184     ppc_opc_gen_interpret(ppc_opc_vmaxfp);
6185     return flowEndBlock;
6186     #else
6187     int vrD, vrA, vrB;
6188     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
6189    
6190     if (vrA == vrD && vrB == vrD) {
6191     return flowContinue;
6192     } else if (vrA == vrB) {
6193     vec_Copy(vrD, vrA);
6194    
6195     return flowContinue;
6196     }
6197    
6198     if (SSE_AVAIL) {
6199     commutative_operation(X86_MAXPS, vrD, vrA, vrB);
6200     return flowContinue;
6201     }
6202    
6203     ppc_opc_gen_interpret(ppc_opc_vmaxfp);
6204     return flowEndBlock;
6205     #endif
6206     }
6207    
6208     /* vminub Vector Minimum Unsigned Byte
6209     * v.191
6210     */
6211     void ppc_opc_vminub()
6212     {
6213     VECTOR_DEBUG;
6214     int vrD, vrA, vrB;
6215     uint8 res;
6216     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6217    
6218     for (int i=0; i<16; i++) {
6219     res = gCPU.vr[vrA].b[i];
6220    
6221     if (res > gCPU.vr[vrB].b[i])
6222     res = gCPU.vr[vrB].b[i];
6223    
6224     gCPU.vr[vrD].b[i] = res;
6225     }
6226     }
6227     JITCFlow ppc_opc_gen_vminub()
6228     {
6229     #if 0
6230     ppc_opc_gen_interpret(ppc_opc_vminub);
6231     return flowEndBlock;
6232     #else
6233     int vrD, vrA, vrB;
6234     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
6235    
6236     if (vrA == vrD && vrB == vrD) {
6237     return flowContinue;
6238     } else if (vrA == vrB) {
6239     vec_Copy(vrD, vrA);
6240    
6241     return flowContinue;
6242     }
6243    
6244     if (SSE2_AVAIL) {
6245     commutative_operation(X86_PMINUB, vrD, vrA, vrB);
6246     return flowContinue;
6247     }
6248    
6249     jitcFlushClientVectorRegister(vrA);
6250     jitcFlushClientVectorRegister(vrB);
6251     jitcDropClientVectorRegister(vrD);
6252    
6253     jitcClobberCarryAndFlags();
6254     NativeReg8 reg = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
6255     NativeReg8 reg2 = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
6256    
6257     for (int i=0; i<16; i++) {
6258     vec_getByte(reg, vrA, REG_NO, i);
6259     vec_getByte(reg2, vrB, REG_NO, i);
6260    
6261     asmALU(X86_CMP, reg, reg2);
6262     asmCMOV(X86_A, (NativeReg)reg, (NativeReg)reg2);
6263    
6264     vec_setByte(vrD, REG_NO, i, reg);
6265     }
6266    
6267     return flowContinue;
6268     #endif
6269     }
6270    
6271     /* vminuh Vector Minimum Unsigned Half Word
6272     * v.192
6273     */
6274     void ppc_opc_vminuh()
6275     {
6276     VECTOR_DEBUG;
6277     int vrD, vrA, vrB;
6278     uint16 res;
6279     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6280    
6281     for (int i=0; i<8; i++) {
6282     res = gCPU.vr[vrA].h[i];
6283    
6284     if (res > gCPU.vr[vrB].h[i])
6285     res = gCPU.vr[vrB].h[i];
6286    
6287     gCPU.vr[vrD].h[i] = res;
6288     }
6289     }
6290     JITCFlow ppc_opc_gen_vminuh()
6291     {
6292     ppc_opc_gen_interpret(ppc_opc_vminuh);
6293     return flowEndBlock;
6294     }
6295    
6296     /* vminuw Vector Minimum Unsigned Word
6297     * v.193
6298     */
6299     void ppc_opc_vminuw()
6300     {
6301     VECTOR_DEBUG;
6302     int vrD, vrA, vrB;
6303     uint32 res;
6304     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6305    
6306     for (int i=0; i<4; i++) {
6307     res = gCPU.vr[vrA].w[i];
6308    
6309     if (res > gCPU.vr[vrB].w[i])
6310     res = gCPU.vr[vrB].w[i];
6311    
6312     gCPU.vr[vrD].w[i] = res;
6313     }
6314     }
6315     JITCFlow ppc_opc_gen_vminuw()
6316     {
6317     ppc_opc_gen_interpret(ppc_opc_vminuw);
6318     return flowEndBlock;
6319     }
6320    
6321     /* vminsb Vector Minimum Signed Byte
6322     * v.188
6323     */
6324     void ppc_opc_vminsb()
6325     {
6326     VECTOR_DEBUG;
6327     int vrD, vrA, vrB;
6328     sint8 res;
6329     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6330    
6331     for (int i=0; i<16; i++) {
6332     res = gCPU.vr[vrA].sb[i];
6333    
6334     if (res > gCPU.vr[vrB].sb[i])
6335     res = gCPU.vr[vrB].sb[i];
6336    
6337     gCPU.vr[vrD].sb[i] = res;
6338     }
6339     }
6340     JITCFlow ppc_opc_gen_vminsb()
6341     {
6342     ppc_opc_gen_interpret(ppc_opc_vminsb);
6343     return flowEndBlock;
6344     }
6345    
6346     /* vminsh Vector Minimum Signed Half Word
6347     * v.189
6348     */
6349     void ppc_opc_vminsh()
6350     {
6351     VECTOR_DEBUG;
6352     int vrD, vrA, vrB;
6353     sint16 res;
6354     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6355    
6356     for (int i=0; i<8; i++) {
6357     res = gCPU.vr[vrA].sh[i];
6358    
6359     if (res > gCPU.vr[vrB].sh[i])
6360     res = gCPU.vr[vrB].sh[i];
6361    
6362     gCPU.vr[vrD].sh[i] = res;
6363     }
6364     }
6365     JITCFlow ppc_opc_gen_vminsh()
6366     {
6367     ppc_opc_gen_interpret(ppc_opc_vminsh);
6368     return flowEndBlock;
6369     }
6370    
6371     /* vminsw Vector Minimum Signed Word
6372     * v.190
6373     */
6374     void ppc_opc_vminsw()
6375     {
6376     VECTOR_DEBUG;
6377     int vrD, vrA, vrB;
6378     sint32 res;
6379     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6380    
6381     for (int i=0; i<4; i++) {
6382     res = gCPU.vr[vrA].sw[i];
6383    
6384     if (res > gCPU.vr[vrB].sw[i])
6385     res = gCPU.vr[vrB].sw[i];
6386    
6387     gCPU.vr[vrD].sw[i] = res;
6388     }
6389     }
6390     JITCFlow ppc_opc_gen_vminsw()
6391     {
6392     ppc_opc_gen_interpret(ppc_opc_vminsw);
6393     return flowEndBlock;
6394     }
6395    
6396     /* vminfp Vector Minimum Floating Point
6397     * v.187
6398     */
6399     void ppc_opc_vminfp()
6400     {
6401     VECTOR_DEBUG;
6402     int vrD, vrA, vrB;
6403     float res;
6404     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6405    
6406     for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6407     res = gCPU.vr[vrA].f[i];
6408    
6409     if (res > gCPU.vr[vrB].f[i])
6410     res = gCPU.vr[vrB].f[i];
6411    
6412     gCPU.vr[vrD].f[i] = res;
6413     }
6414     }
6415     JITCFlow ppc_opc_gen_vminfp()
6416     {
6417     #if 0
6418     ppc_opc_gen_interpret(ppc_opc_vminfp);
6419     return flowEndBlock;
6420     #else
6421     int vrD, vrA, vrB;
6422     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
6423    
6424     if (vrA == vrD && vrB == vrD) {
6425     return flowContinue;
6426     } else if (vrA == vrB) {
6427     vec_Copy(vrD, vrA);
6428    
6429     return flowContinue;
6430     }
6431    
6432     if (SSE_AVAIL) {
6433     commutative_operation(X86_MINPS, vrD, vrA, vrB);
6434     return flowContinue;
6435     }
6436    
6437     ppc_opc_gen_interpret(ppc_opc_vminfp);
6438     return flowEndBlock;
6439     #endif
6440     }
6441    
6442     #define RND_NEAREST 0x0000 // round to nearest
6443     #define RND_ZERO 0x0c00 // round to zero
6444     #define RND_PINF 0x0800 // round to +INF
6445     #define RND_MINF 0x0400 // round to -INF
6446    
6447     /* This functionality was borrowed from glibc 2.3.3 */
6448     static inline void set_roundmode(int mode)
6449     {
6450     modrm_o modrm;
6451    
6452     jitcClobberCarryAndFlags();
6453     NativeReg reg = jitcAllocRegister();
6454    
6455     x86_mem2(modrm, &gCPU.vfcw_save);
6456    
6457     asmFSTCW(x86_mem2(modrm, &gCPU.vfcw_save));
6458    
6459     asmMOV_NoFlags(reg, mode); // avoid flag clobber
6460     asmALU(X86_OR, reg, modrm);
6461    
6462     if (mode != RND_ZERO) {
6463     asmALU(X86_AND, reg, (0xffff ^ (~mode & 0x0c00)));
6464     }
6465    
6466     x86_mem2(modrm, &gCPU.vfcw);
6467    
6468     asmALU(X86_MOV, modrm, reg);
6469    
6470     asmFLDCW(modrm);
6471     }
6472    
6473     static inline void restore_roundmode(void)
6474     {
6475     modrm_o modrm;
6476    
6477     asmFLDCW(x86_mem2(modrm, &gCPU.vfcw_save));
6478     }
6479    
6480     /* vrfin Vector Round to Floating-Point Integer Nearest
6481     * v.231
6482     */
6483     void ppc_opc_vrfin()
6484     {
6485     VECTOR_DEBUG;
6486     int vrD, vrA, vrB;
6487     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6488     PPC_OPC_ASSERT(vrA==0);
6489    
6490     /* Documentation doesn't dictate how this instruction should
6491     * round from a middle point. With a test on a real G4, it was
6492     * found to be round to nearest, with bias to even if equidistant.
6493     *
6494     * This is covered by the function rint()
6495     */
6496     for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6497     gCPU.vr[vrD].f[i] = rintf(gCPU.vr[vrB].f[i]);
6498     }
6499     }
6500     JITCFlow ppc_opc_gen_vrfin()
6501     {
6502     #if 0
6503     ppc_opc_gen_interpret(ppc_opc_vrfin);
6504     return flowEndBlock;
6505     #else
6506     int vrD, vrA, vrB;
6507     modrm_o modrm;
6508     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
6509    
6510     jitcFlushClientVectorRegister(vrB);
6511     jitcDropClientVectorRegister(vrD);
6512    
6513     jitcFloatRegisterClobberAll();
6514     set_roundmode(RND_NEAREST);
6515    
6516     for (int i=0; i<16; i+=4) { //FIXME: This might not comply with Java FP
6517     asmFLD_Single(x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
6518    
6519     asmFSimple(FRNDINT);
6520    
6521     asmFSTP_Single(x86_mem2(modrm, &(gCPU.vr[vrD].b[i])));
6522     }
6523    
6524     restore_roundmode();
6525    
6526     return flowContinue;
6527     #endif
6528     }
6529    
6530     /* vrfip Vector Round to Floating-Point Integer toward Plus Infinity
6531     * v.232
6532     */
6533     void ppc_opc_vrfip()
6534     {
6535     VECTOR_DEBUG;
6536     int vrD, vrA, vrB;
6537     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6538     PPC_OPC_ASSERT(vrA==0);
6539    
6540     for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6541     gCPU.vr[vrD].f[i] = ceilf(gCPU.vr[vrB].f[i]);
6542     }
6543     }
6544     JITCFlow ppc_opc_gen_vrfip()
6545     {
6546     #if 0
6547     ppc_opc_gen_interpret(ppc_opc_vrfip);
6548     return flowEndBlock;
6549     #else
6550     int vrD, vrA, vrB;
6551     modrm_o modrm;
6552     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
6553    
6554     jitcFlushClientVectorRegister(vrB);
6555     jitcDropClientVectorRegister(vrD);
6556    
6557     jitcFloatRegisterClobberAll();
6558     set_roundmode(RND_PINF);
6559    
6560     for (int i=0; i<16; i+=4) { //FIXME: This might not comply with Java FP
6561     asmFLD_Single(x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
6562    
6563     asmFSimple(FRNDINT);
6564    
6565     asmFSTP_Single(x86_mem2(modrm, &(gCPU.vr[vrD].b[i])));
6566     }
6567    
6568     restore_roundmode();
6569    
6570     return flowContinue;
6571     #endif
6572     }
6573    
6574     /* vrfim Vector Round to Floating-Point Integer toward Minus Infinity
6575     * v.230
6576     */
6577     void ppc_opc_vrfim()
6578     {
6579     VECTOR_DEBUG;
6580     int vrD, vrA, vrB;
6581     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6582     PPC_OPC_ASSERT(vrA==0);
6583    
6584     for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6585     gCPU.vr[vrD].f[i] = floorf(gCPU.vr[vrB].f[i]);
6586     }
6587     }
6588     JITCFlow ppc_opc_gen_vrfim()
6589     {
6590     #if 0
6591     ppc_opc_gen_interpret(ppc_opc_vrfim);
6592     return flowEndBlock;
6593     #else
6594     int vrD, vrA, vrB;
6595     modrm_o modrm;
6596     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
6597    
6598     jitcFlushClientVectorRegister(vrB);
6599     jitcDropClientVectorRegister(vrD);
6600    
6601     jitcFloatRegisterClobberAll();
6602     set_roundmode(RND_MINF);
6603    
6604     for (int i=0; i<16; i+=4) { //FIXME: This might not comply with Java FP
6605     asmFLD_Single(x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
6606    
6607     asmFSimple(FRNDINT);
6608    
6609     asmFSTP_Single(x86_mem2(modrm, &(gCPU.vr[vrD].b[i])));
6610     }
6611    
6612     restore_roundmode();
6613    
6614     return flowContinue;
6615     #endif
6616     }
6617    
6618     /* vrfiz Vector Round to Floating-Point Integer toward Zero
6619     * v.233
6620     */
6621     void ppc_opc_vrfiz()
6622     {
6623     VECTOR_DEBUG;
6624     int vrD, vrA, vrB;
6625     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6626     PPC_OPC_ASSERT(vrA==0);
6627    
6628     for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6629     gCPU.vr[vrD].f[i] = truncf(gCPU.vr[vrD].f[i]);
6630     }
6631     }
6632     JITCFlow ppc_opc_gen_vrfiz()
6633     {
6634     #if 0
6635     ppc_opc_gen_interpret(ppc_opc_vrfiz);
6636     return flowEndBlock;
6637     #else
6638     int vrD, vrA, vrB;
6639     modrm_o modrm;
6640     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
6641    
6642     jitcFlushClientVectorRegister(vrB);
6643     jitcDropClientVectorRegister(vrD);
6644    
6645     jitcFloatRegisterClobberAll();
6646     set_roundmode(RND_ZERO);
6647    
6648     for (int i=0; i<16; i+=4) { //FIXME: This might not comply with Java FP
6649     asmFLD_Single(x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
6650    
6651     asmFSimple(FRNDINT);
6652    
6653     asmFSTP_Single(x86_mem2(modrm, &(gCPU.vr[vrD].b[i])));
6654     }
6655    
6656     restore_roundmode();
6657    
6658     return flowContinue;
6659     #endif
6660     }
6661    
6662     /* vrefp Vector Reciprocal Estimate Floating Point
6663     * v.228
6664     */
6665     void ppc_opc_vrefp()
6666     {
6667     VECTOR_DEBUG;
6668     int vrD, vrA, vrB;
6669     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6670     PPC_OPC_ASSERT(vrA==0);
6671    
6672     /* This emulation generates an exact value, instead of an estimate.
6673     * This is technically within specs, but some test-suites expect the
6674     * exact estimate value returned by G4s. These anomolous failures
6675     * should be ignored.
6676     */
6677    
6678     for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6679     gCPU.vr[vrD].f[i] = 1 / gCPU.vr[vrB].f[i];
6680     }
6681     }
6682     JITCFlow ppc_opc_gen_vrefp()
6683     {
6684     #if 0
6685     ppc_opc_gen_interpret(ppc_opc_vrefp);
6686     return flowEndBlock;
6687     #else
6688     int vrD, vrA, vrB;
6689     modrm_o modrm;
6690     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
6691    
6692     if (!(SSE_AVAIL)) {
6693     ppc_opc_gen_interpret(ppc_opc_vrefp);
6694     return flowEndBlock;
6695     }
6696    
6697     if (vrB == vrD) {
6698     NativeVectorReg reg = jitcGetClientVectorRegisterDirty(vrB);
6699    
6700     asmALUPS(X86_RCPPS, reg, reg);
6701    
6702     return flowContinue;
6703     }
6704    
6705     NativeVectorReg reg1 = jitcAllocVectorRegister();
6706     NativeVectorReg reg2 = jitcGetClientVectorRegisterMapping(vrB);
6707    
6708     if (reg2 == VECTREG_NO) {
6709     asmALUPS(X86_RCPPS, reg1, x86_mem2(modrm, &gCPU.vr[vrB]));
6710     } else {
6711     asmALUPS(X86_RCPPS, reg1, reg2);
6712     }
6713    
6714     jitcRenameVectorRegisterDirty(reg1, vrD);
6715    
6716     return flowContinue;
6717     #endif
6718     }
6719    
6720     /* vrsqrtefp Vector Reciprocal Square Root Estimate Floating Point
6721     * v.237
6722     */
6723     void ppc_opc_vrsqrtefp()
6724     {
6725     VECTOR_DEBUG;
6726     int vrD, vrA, vrB;
6727     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6728     PPC_OPC_ASSERT(vrA==0);
6729    
6730     /* This emulation generates an exact value, instead of an estimate.
6731     * This is technically within specs, but some test-suites expect the
6732     * exact estimate value returned by G4s. These anomolous failures
6733     * should be ignored.
6734     */
6735    
6736     for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6737     gCPU.vr[vrD].f[i] = 1 / sqrt(gCPU.vr[vrB].f[i]);
6738     }
6739     }
6740     JITCFlow ppc_opc_gen_vrsqrtefp()
6741     {
6742     #if 0
6743     ppc_opc_gen_interpret(ppc_opc_vrsqrtefp);
6744     return flowEndBlock;
6745     #else
6746     int vrD, vrA, vrB;
6747     modrm_o modrm;
6748     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
6749    
6750     if (!(SSE_AVAIL)) {
6751     ppc_opc_gen_interpret(ppc_opc_vrsqrtefp);
6752     return flowEndBlock;
6753     }
6754    
6755     if (vrB == vrD) {
6756     NativeVectorReg reg = jitcGetClientVectorRegisterDirty(vrB);
6757    
6758     asmALUPS(X86_SQRTPS, reg, reg);
6759     asmALUPS(X86_RCPPS, reg, reg);
6760    
6761     return flowContinue;
6762     }
6763    
6764     NativeVectorReg reg1 = jitcAllocVectorRegister();
6765     NativeVectorReg reg2 = jitcGetClientVectorRegisterMapping(vrB);
6766    
6767     if (reg2 == VECTREG_NO) {
6768     asmALUPS(X86_SQRTPS, reg1, x86_mem2(modrm, &gCPU.vr[vrB]));
6769     } else {
6770     asmALUPS(X86_SQRTPS, reg1, reg2);
6771     }
6772     asmALUPS(X86_RCPPS, reg1, reg1);
6773    
6774     jitcRenameVectorRegisterDirty(reg1, vrD);
6775    
6776     return flowContinue;
6777     #endif
6778     }
6779    
6780     /* vlogefp Vector Log2 Estimate Floating Point
6781     * v.175
6782     */
6783     void ppc_opc_vlogefp()
6784     {
6785     VECTOR_DEBUG;
6786     int vrD, vrA, vrB;
6787     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6788     PPC_OPC_ASSERT(vrA==0);
6789    
6790     /* This emulation generates an exact value, instead of an estimate.
6791     * This is technically within specs, but some test-suites expect the
6792     * exact estimate value returned by G4s. These anomolous failures
6793     * should be ignored.
6794     */
6795    
6796     for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6797     gCPU.vr[vrD].f[i] = log2(gCPU.vr[vrB].f[i]);
6798     }
6799     }
6800     JITCFlow ppc_opc_gen_vlogefp()
6801     {
6802     #if 0
6803     ppc_opc_gen_interpret(ppc_opc_vlogefp);
6804     return flowEndBlock;
6805     #else
6806     int vrD, vrA, vrB;
6807     modrm_o modrm;
6808     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
6809    
6810     jitcFlushClientVectorRegister(vrB);
6811     jitcDropClientVectorRegister(vrD);
6812    
6813     jitcFloatRegisterClobberAll();
6814    
6815     for (int i=0; i<16; i+=4) { //FIXME: This might not comply with Java FP
6816     asmFSimple(FLD1);
6817    
6818     asmFLD_Single(x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
6819    
6820     asmFSimple(FYL2X);
6821    
6822     asmFSTP_Single(x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
6823     }
6824    
6825     return flowContinue;
6826     #endif
6827     }
6828    
6829     /* vexptefp Vector 2 Raised to the Exponent Estimate Floating Point
6830     * v.173
6831     */
6832     void ppc_opc_vexptefp()
6833     {
6834     VECTOR_DEBUG;
6835     int vrD, vrA, vrB;
6836     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6837     PPC_OPC_ASSERT(vrA==0);
6838    
6839     /* This emulation generates an exact value, instead of an estimate.
6840     * This is technically within specs, but some test-suites expect the
6841     * exact estimate value returned by G4s. These anomolous failures
6842     * should be ignored.
6843     */
6844    
6845     for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6846     gCPU.vr[vrD].f[i] = exp2(gCPU.vr[vrB].f[i]);
6847     }
6848     }
6849     JITCFlow ppc_opc_gen_vexptefp()
6850     {
6851     #if 0
6852     ppc_opc_gen_interpret(ppc_opc_vexptefp);
6853     return flowEndBlock;
6854     #else
6855     int vrD, vrA, vrB;
6856     modrm_o modrm;
6857     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
6858    
6859     jitcFlushClientVectorRegister(vrB);
6860     jitcDropClientVectorRegister(vrD);
6861    
6862     jitcFloatRegisterClobberAll();
6863    
6864     for (int i=0; i<16; i+=4) { //FIXME: This might not comply with Java FP
6865     asmFSimple(FLD1);
6866    
6867     asmFLD_Single(x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
6868    
6869     asmFSimple(F2XM1);
6870     asmFArithP_STi(X86_FADD, Float_ST1);
6871    
6872     asmFSTP_Single(x86_mem2(modrm, &(gCPU.vr[vrD].b[i])));
6873     }
6874    
6875     return flowContinue;
6876     #endif
6877     }
6878    
6879     /* vcfux Vector Convert from Unsigned Fixed-Point Word
6880     * v.156
6881     */
6882     void ppc_opc_vcfux()
6883     {
6884     VECTOR_DEBUG;
6885     int vrD, vrB;
6886     uint32 uimm;
6887     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, uimm, vrB);
6888    
6889     for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6890     gCPU.vr[vrD].f[i] = ((float)gCPU.vr[vrB].w[i]) / (1 << uimm);
6891     }
6892     }
6893     JITCFlow ppc_opc_gen_vcfux()
6894     {
6895     ppc_opc_gen_interpret(ppc_opc_vcfux);
6896     return flowEndBlock;
6897     }
6898    
6899     /* vcfsx Vector Convert from Signed Fixed-Point Word
6900     * v.155
6901     */
6902     void ppc_opc_vcfsx()
6903     {
6904     VECTOR_DEBUG;
6905     int vrD, vrB;
6906     uint32 uimm;
6907     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, uimm, vrB);
6908    
6909     for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6910     gCPU.vr[vrD].f[i] = ((float)gCPU.vr[vrB].sw[i]) / (1 << uimm);
6911     }
6912     }
6913     JITCFlow ppc_opc_gen_vcfsx()
6914     {
6915     #if 0
6916     ppc_opc_gen_interpret(ppc_opc_vcfsx);
6917     return flowEndBlock;
6918     #else
6919     int vrD, vrB;
6920     uint32 uimm;
6921     modrm_o modrm;
6922     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, uimm, vrB);
6923    
6924     jitcFlushClientVectorRegister(vrB);
6925     jitcDropClientVectorRegister(vrD);
6926    
6927     jitcClobberCarryAndFlags();
6928     jitcClobberRegister(NATIVE_REG | EAX);
6929    
6930     jitcFloatRegisterClobberAll();
6931    
6932     if (uimm == 1) {
6933     asmFSimple(FLD1);
6934     } else if (uimm > 1) {
6935     asmALU_D(X86_MOV, x86_mem2(modrm, &gCPU.vtemp), uimm);
6936     asmFILD_D(modrm);
6937     }
6938    
6939     if (uimm) asmFSimple(FCHS);
6940    
6941     for (int i=0; i<16; i+=4) { //FIXME: This might not comply with Java FP
6942     asmFILD_D(x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
6943    
6944     if (uimm) {
6945     asmFSimple(FSCALE);
6946     }
6947    
6948     asmFSTP_Single(x86_mem2(modrm, &(gCPU.vr[vrD].b[i])));
6949     }
6950    
6951     if (uimm) asmFSTP(Float_ST0);
6952    
6953     return flowContinue;
6954     #endif
6955     }
6956    
6957     /* vctsxs Vector Convert To Signed Fixed-Point Word Saturate
6958     * v.171
6959     */
6960     void ppc_opc_vctsxs()
6961     {
6962     VECTOR_DEBUG;
6963     int vrD, vrB;
6964     uint32 uimm;
6965     float ftmp;
6966     sint32 tmp;
6967     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, uimm, vrB);
6968    
6969     for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6970     ftmp = gCPU.vr[vrB].f[i] * (float)(1 << uimm);
6971     ftmp = truncf(ftmp);
6972    
6973     tmp = (sint32)ftmp;
6974    
6975     if (ftmp > 2147483647.0) {
6976     tmp = 2147483647; // 0x7fffffff
6977     gCPU.vscr |= VSCR_SAT;
6978     } else if (ftmp < -2147483648.0) {
6979     tmp = -2147483648LL; // 0x80000000
6980     gCPU.vscr |= VSCR_SAT;
6981     }
6982    
6983     gCPU.vr[vrD].sw[i] = tmp;
6984     }
6985     }
6986     JITCFlow ppc_opc_gen_vctsxs()
6987     {
6988     #if 0
6989     ppc_opc_gen_interpret(ppc_opc_vctsxs);
6990     return flowEndBlock;
6991     #else
6992     int vrD, vrB;
6993     uint32 uimm;
6994     modrm_o modrm, dest_modrm;
6995     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, uimm, vrB);
6996    
6997     jitcFlushClientVectorRegister(vrB);
6998     jitcDropClientVectorRegister(vrD);
6999    
7000     jitcClobberCarryAndFlags();
7001     jitcClobberRegister(NATIVE_REG | EAX);
7002    
7003     jitcFloatRegisterClobberAll();
7004     set_roundmode(RND_ZERO);
7005    
7006     if (uimm == 1) {
7007     asmFSimple(FLD1);
7008     } else if (uimm > 1) {
7009     asmALU_D(X86_MOV, x86_mem2(modrm, &gCPU.vtemp), uimm);
7010     asmFILD_D(modrm);
7011     }
7012    
7013     for (int i=0; i<16; i+=4) { //FIXME: This might not comply with Java FP
7014     asmFLD_Single(x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
7015    
7016     if (uimm) {
7017     asmFSimple(FSCALE);
7018     }
7019    
7020     asmFSimple(FRNDINT);
7021    
7022     asmFIComp(X86_FICOM32, x86_mem2(modrm, &sint32_max));
7023     asmFSTSW_EAX();
7024     asmALU(X86_TEST, EAX, 0x4100);
7025     NativeAddress of_high = asmJxxFixup(X86_Z);
7026    
7027     asmFIComp(X86_FICOM32, x86_mem2(modrm, &sint32_min));
7028     asmFSTSW_EAX();
7029     asmALU(X86_TEST, EAX, 0x0100);
7030     NativeAddress of_low = asmJxxFixup(X86_NZ);
7031    
7032     asmFISTP_D(x86_mem2(dest_modrm, &(gCPU.vr[vrD].b[i])));
7033     NativeAddress skip_end = asmJMPFixup();
7034    
7035     asmResolveFixup(of_high);
7036     asmALU_D(X86_MOV, dest_modrm, 2147483647);
7037     NativeAddress skip1 = asmJMPFixup();
7038    
7039     asmResolveFixup(of_low);
7040     asmALU_D(X86_MOV, dest_modrm, -2147483648);
7041    
7042     asmResolveFixup(skip1);
7043     vec_raise_saturate();
7044    
7045     asmResolveFixup(skip_end);
7046     }
7047    
7048     if (uimm) asmFSTP(Float_ST0);
7049    
7050     restore_roundmode();
7051    
7052     return flowContinue;
7053     #endif
7054     }
7055    
7056     /* vctuxs Vector Convert to Unsigned Fixed-Point Word Saturate
7057     * v.172
7058     */
7059     void ppc_opc_vctuxs()
7060     {
7061     VECTOR_DEBUG;
7062     int vrD, vrB;
7063     uint32 tmp, uimm;
7064     float ftmp;
7065     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, uimm, vrB);
7066    
7067     for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
7068     ftmp = gCPU.vr[vrB].f[i] * (float)(1 << uimm);
7069     ftmp = truncf(ftmp);
7070    
7071     tmp = (uint32)ftmp;
7072    
7073     if (ftmp > 4294967295.0) {
7074     tmp = 0xffffffff;
7075     gCPU.vscr |= VSCR_SAT;
7076     } else if (ftmp < 0) {
7077     tmp = 0;
7078     gCPU.vscr |= VSCR_SAT;
7079     }
7080    
7081     gCPU.vr[vrD].w[i] = tmp;
7082     }
7083     }
7084     JITCFlow ppc_opc_gen_vctuxs()
7085     {
7086     #if 0
7087     ppc_opc_gen_interpret(ppc_opc_vctuxs);
7088     return flowEndBlock;
7089     #else
7090     int vrD, vrB;
7091     uint32 uimm;
7092     modrm_o modrm, dest_modrm;
7093     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, uimm, vrB);
7094    
7095     jitcFlushClientVectorRegister(vrB);
7096     jitcDropClientVectorRegister(vrD);
7097    
7098     jitcClobberCarryAndFlags();
7099     jitcClobberRegister(NATIVE_REG | EAX);
7100    
7101     jitcFloatRegisterClobberAll();
7102     set_roundmode(RND_ZERO);
7103    
7104     NativeReg reg = jitcAllocRegister();
7105    
7106     if (uimm == 1) {
7107     asmFSimple(FLD1);
7108     } else if (uimm > 1) {
7109     asmALU_D(X86_MOV, x86_mem2(modrm, &gCPU.vtemp), uimm);
7110     asmFILD_D(modrm);
7111     }
7112    
7113     for (int i=0; i<16; i+=4) { //FIXME: This might not comply with Java FP
7114     asmFLD_Single(x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
7115    
7116     if (uimm) {
7117     asmFSimple(FSCALE);
7118     }
7119    
7120     asmFSimple(FRNDINT);
7121    
7122     asmFISTP_Q(x86_mem2(modrm, &gCPU.vtemp64));
7123    
7124     asmMOV(reg, (byte *)&gCPU.vtemp64+4);
7125    
7126     asmALU(X86_TEST, reg, 0x80000000);
7127     NativeAddress of_low = asmJxxFixup(X86_NZ);
7128    
7129     asmALU(X86_TEST, reg, 0xffffffff);
7130     NativeAddress of_high = asmJxxFixup(X86_NZ);
7131    
7132     asmMOV(reg, &gCPU.vtemp64);
7133    
7134     x86_mem2(dest_modrm, &(gCPU.vr[vrD].b[i]));
7135    
7136     asmALU(X86_MOV, dest_modrm, reg);
7137     NativeAddress skip_end = asmJMPFixup();
7138    
7139     asmResolveFixup(of_high);
7140     asmALU_D(X86_MOV, dest_modrm, 0xffffffff);
7141     NativeAddress skip1 = asmJMPFixup();
7142    
7143     asmResolveFixup(of_low);
7144     asmALU_D(X86_MOV, dest_modrm, 0);
7145    
7146     asmResolveFixup(skip1);
7147     vec_raise_saturate();
7148    
7149     asmResolveFixup(skip_end);
7150     }
7151    
7152     if (uimm) asmFSTP(Float_ST0);
7153    
7154     restore_roundmode();
7155    
7156     return flowContinue;
7157     #endif
7158     }
7159    
7160     static inline void commutative_operation(X86ALUPSopc opc, int vrD, int vrA, int vrB)
7161     {
7162     NativeVectorReg d, s;
7163     modrm_o modrm;
7164     int vrS;
7165    
7166     if (vrA == vrD) {
7167     d = jitcGetClientVectorRegisterDirty(vrD);
7168     s = jitcGetClientVectorRegisterMapping(vrB);
7169     vrS = vrB;
7170     } else if (vrB == vrD) {
7171     d = jitcGetClientVectorRegisterDirty(vrD);
7172     s = jitcGetClientVectorRegisterMapping(vrA);
7173     vrS = vrA;
7174     } else {
7175     d = jitcMapClientVectorRegisterDirty(vrD);
7176     NativeVectorReg a = jitcGetClientVectorRegisterMapping(vrA);
7177     s = jitcGetClientVectorRegisterMapping(vrB);
7178     vrS = vrB;
7179    
7180     if (a == VECTREG_NO)
7181     asmMOVAPS(d, &gCPU.vr[vrA]);
7182     else
7183     asmALUPS(X86_MOVAPS, d, a);
7184     }
7185    
7186     if (s == VECTREG_NO) {
7187     asmALUPS(opc, d, x86_mem2(modrm, &gCPU.vr[vrS]));
7188     } else
7189     asmALUPS(opc, d, s);
7190     }
7191    
7192     static inline void commutative_operation(X86PALUopc opc, int vrD, int vrA, int vrB)
7193     {
7194     NativeVectorReg d, s;
7195     modrm_o modrm;
7196     int vrS;
7197    
7198     if (vrA == vrD) {
7199     d = jitcGetClientVectorRegisterDirty(vrD);
7200     s = jitcGetClientVectorRegisterMapping(vrB);
7201     vrS = vrB;
7202     } else if (vrB == vrD) {
7203     d = jitcGetClientVectorRegisterDirty(vrD);
7204     s = jitcGetClientVectorRegisterMapping(vrA);
7205     vrS = vrA;
7206     } else {
7207     d = jitcMapClientVectorRegisterDirty(vrD);
7208     NativeVectorReg a = jitcGetClientVectorRegisterMapping(vrA);
7209     s = jitcGetClientVectorRegisterMapping(vrB);
7210     vrS = vrB;
7211    
7212     if (a == VECTREG_NO)
7213     asmMOVAPS(d, &gCPU.vr[vrA]);
7214     else
7215     asmALUPS(X86_MOVAPS, d, a);
7216     }
7217    
7218     if (s == VECTREG_NO) {
7219     asmPALU(opc, d, x86_mem2(modrm, &gCPU.vr[vrS]));
7220     } else
7221     asmPALU(opc, d, s);
7222     }
7223    
7224     static inline void noncommutative_operation(X86ALUPSopc opc, int vrD, int vrA, int vrB)
7225     {
7226     NativeVectorReg d, s;
7227     modrm_o modrm;
7228    
7229     if (vrA == vrD) {
7230     d = jitcGetClientVectorRegisterDirty(vrA);
7231     s = jitcGetClientVectorRegisterMapping(vrB);
7232     } else {
7233     if (vrB == vrD)
7234     d = jitcAllocVectorRegister();
7235     else
7236     d = jitcMapClientVectorRegisterDirty(vrD);
7237    
7238     NativeVectorReg a = jitcGetClientVectorRegisterMapping(vrA);
7239     s = jitcGetClientVectorRegisterMapping(vrB);
7240    
7241     if (a == VECTREG_NO)
7242     asmMOVAPS(d, &gCPU.vr[vrA]);
7243     else
7244     asmALUPS(X86_MOVAPS, d, a);
7245     }
7246    
7247     if (s == VECTREG_NO) {
7248     asmALUPS(opc, d, x86_mem2(modrm, &gCPU.vr[vrB]));
7249     } else
7250     asmALUPS(opc, d, s);
7251    
7252     if (vrB == vrD)
7253     jitcRenameVectorRegisterDirty(d, vrD);
7254     }
7255    
7256     static inline void noncommutative_operation(X86PALUopc opc, int vrD, int vrA, int vrB)
7257     {
7258     NativeVectorReg d, s;
7259     modrm_o modrm;
7260    
7261     if (vrA == vrD) {
7262     d = jitcGetClientVectorRegisterDirty(vrA);
7263     s = jitcGetClientVectorRegisterMapping(vrB);
7264     } else {
7265     if (vrB == vrD)
7266     d = jitcAllocVectorRegister();
7267     else
7268     d = jitcMapClientVectorRegisterDirty(vrD);
7269    
7270     NativeVectorReg a = jitcGetClientVectorRegisterMapping(vrA);
7271     s = jitcGetClientVectorRegisterMapping(vrB);
7272    
7273     if (a == VECTREG_NO)
7274     asmMOVAPS(d, &gCPU.vr[vrA]);
7275     else
7276     asmALUPS(X86_MOVAPS, d, a);
7277     }
7278    
7279     if (s == VECTREG_NO) {
7280     asmPALU(opc, d, x86_mem2(modrm, &gCPU.vr[vrB]));
7281     } else
7282     asmPALU(opc, d, s);
7283    
7284     if (vrB == vrD)
7285     jitcRenameVectorRegisterDirty(d, vrD);
7286     }
7287    
7288     /* vand Vector Logical AND
7289     * v.147
7290     */
7291     void ppc_opc_vand()
7292     {
7293     VECTOR_DEBUG;
7294     int vrD, vrA, vrB;
7295     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
7296    
7297     gCPU.vr[vrD].d[0] = gCPU.vr[vrA].d[0] & gCPU.vr[vrB].d[0];
7298     gCPU.vr[vrD].d[1] = gCPU.vr[vrA].d[1] & gCPU.vr[vrB].d[1];
7299     }
7300     JITCFlow ppc_opc_gen_vand()
7301     {
7302     #if 0
7303     ppc_opc_gen_interpret(ppc_opc_vand);
7304     return flowEndBlock;
7305     #else
7306     int vrD, vrA, vrB;
7307     modrm_o modrm;
7308     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
7309    
7310     if (vrA == vrB) {
7311     vec_Copy(vrD, vrA);
7312     return flowContinue;
7313     }
7314    
7315     if (SSE_AVAIL) {
7316     commutative_operation(X86_ANDPS, vrD, vrA, vrB);
7317     return flowContinue;
7318     }
7319    
7320     jitcFlushClientVectorRegister(vrA);
7321     jitcFlushClientVectorRegister(vrB);
7322     jitcDropClientVectorRegister(vrD);
7323    
7324     jitcClobberCarryAndFlags();
7325     NativeReg reg = jitcAllocRegister();
7326    
7327     vec_getWord(reg, vrA, 0);
7328     asmALU(X86_AND, reg, x86_mem2(modrm, &(gCPU.vr[vrB])));
7329     vec_setWord(vrD, 0, reg);
7330    
7331     vec_getWord(reg, vrA, 4);
7332     asmALU(X86_AND, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[4])));
7333     vec_setWord(vrD, 4, reg);
7334    
7335     vec_getWord(reg, vrA, 8);
7336     asmALU(X86_AND, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[8])));
7337     vec_setWord(vrD, 8, reg);
7338    
7339     vec_getWord(reg, vrA,12);
7340     asmALU(X86_AND, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[12])));
7341     vec_setWord(vrD,12, reg);
7342    
7343     return flowContinue;
7344     #endif
7345     }
7346    
7347     /* vandc Vector Logical AND with Complement
7348     * v.148
7349     */
7350     void ppc_opc_vandc()
7351     {
7352     VECTOR_DEBUG;
7353     int vrD, vrA, vrB;
7354     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
7355    
7356     gCPU.vr[vrD].d[0] = gCPU.vr[vrA].d[0] & ~gCPU.vr[vrB].d[0];
7357     gCPU.vr[vrD].d[1] = gCPU.vr[vrA].d[1] & ~gCPU.vr[vrB].d[1];
7358     }
7359     JITCFlow ppc_opc_gen_vandc()
7360     {
7361     #if 0
7362     ppc_opc_gen_interpret(ppc_opc_vandc);
7363     return flowEndBlock;
7364     #else
7365     int vrD, vrA, vrB;
7366     modrm_o modrm;
7367     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
7368    
7369     if (vrA == vrB) {
7370     vec_Zero(vrD);
7371     return flowContinue;
7372     }
7373    
7374     if (SSE_AVAIL) {
7375     noncommutative_operation(X86_ANDNPS, vrD, vrB, vrA);
7376     return flowContinue;
7377     }
7378    
7379     jitcFlushClientVectorRegister(vrA);
7380     jitcFlushClientVectorRegister(vrB);
7381     jitcDropClientVectorRegister(vrD);
7382    
7383     jitcClobberCarryAndFlags();
7384     NativeReg reg = jitcAllocRegister();
7385    
7386     vec_getWord(reg, vrB, 0);
7387     asmALU(X86_NOT, reg);
7388     asmALU(X86_AND, reg, x86_mem2(modrm, &gCPU.vr[vrB]));
7389     vec_setWord(vrD, 0, reg);
7390    
7391     vec_getWord(reg, vrB, 4);
7392     asmALU(X86_NOT, reg);
7393     asmALU(X86_AND, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[4])));
7394     vec_setWord(vrD, 4, reg);
7395    
7396     vec_getWord(reg, vrB, 8);
7397     asmALU(X86_NOT, reg);
7398     asmALU(X86_AND, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[8])));
7399     vec_setWord(vrD, 8, reg);
7400    
7401     vec_getWord(reg, vrB,12);
7402     asmALU(X86_NOT, reg);
7403     asmALU(X86_AND, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[12])));
7404     vec_setWord(vrD,12, reg);
7405    
7406     return flowContinue;
7407     #endif
7408     }
7409    
7410     /* vor Vector Logical OR
7411     * v.217
7412     */
7413     void ppc_opc_vor()
7414     {
7415     VECTOR_DEBUG_COMMON;
7416     int vrD, vrA, vrB;
7417     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
7418    
7419     gCPU.vr[vrD].d[0] = gCPU.vr[vrA].d[0] | gCPU.vr[vrB].d[0];
7420     gCPU.vr[vrD].d[1] = gCPU.vr[vrA].d[1] | gCPU.vr[vrB].d[1];
7421     }
7422     JITCFlow ppc_opc_gen_vor()
7423     {
7424     #if 0
7425     ppc_opc_gen_interpret(ppc_opc_vor);
7426     return flowEndBlock;
7427     #else
7428     int vrD, vrA, vrB;
7429     modrm_o modrm;
7430     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
7431    
7432     if (vrA == vrB) {
7433     vec_Copy(vrD, vrA);
7434     return flowContinue;
7435     }
7436    
7437     if (SSE_AVAIL) {
7438     commutative_operation(X86_ORPS, vrD, vrA, vrB);
7439     return flowContinue;
7440     }
7441    
7442     jitcFlushClientVectorRegister(vrA);
7443     jitcFlushClientVectorRegister(vrB);
7444     jitcDropClientVectorRegister(vrD);
7445    
7446     jitcClobberCarryAndFlags();
7447     NativeReg reg = jitcAllocRegister();
7448    
7449     vec_getWord(reg, vrA, 0);
7450     asmALU(X86_OR, reg, x86_mem2(modrm, &gCPU.vr[vrB]));
7451     vec_setWord(vrD, 0, reg);
7452    
7453     vec_getWord(reg, vrA, 4);
7454     asmALU(X86_OR, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[4])));
7455     vec_setWord(vrD, 4, reg);
7456    
7457     vec_getWord(reg, vrA, 8);
7458     asmALU(X86_OR, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[8])));
7459     vec_setWord(vrD, 8, reg);
7460    
7461     vec_getWord(reg, vrA,12);
7462     asmALU(X86_OR, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[12])));
7463     vec_setWord(vrD,12, reg);
7464    
7465     return flowContinue;
7466     #endif
7467     }
7468    
7469     /* vnor Vector Logical NOR
7470     * v.216
7471     */
7472     void ppc_opc_vnor()
7473     {
7474     VECTOR_DEBUG;
7475     int vrD, vrA, vrB;
7476     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
7477    
7478     gCPU.vr[vrD].d[0] = ~(gCPU.vr[vrA].d[0] | gCPU.vr[vrB].d[0]);
7479     gCPU.vr[vrD].d[1] = ~(gCPU.vr[vrA].d[1] | gCPU.vr[vrB].d[1]);
7480     }
7481     JITCFlow ppc_opc_gen_vnor()
7482     {
7483     #if 0
7484     ppc_opc_gen_interpret(ppc_opc_vnor);
7485     return flowEndBlock;
7486     #else
7487     int vrD, vrA, vrB;
7488     modrm_o modrm;
7489     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
7490    
7491     if (vrA == vrB) {
7492     vec_Not(vrD, vrA);
7493     return flowContinue;
7494     }
7495    
7496     if (SSE_AVAIL) {
7497     commutative_operation(X86_ORPS, vrD, vrA, vrB);
7498     vec_Not(vrD, vrD);
7499     return flowContinue;
7500     }
7501    
7502     jitcFlushClientVectorRegister(vrA);
7503     jitcFlushClientVectorRegister(vrB);
7504     jitcDropClientVectorRegister(vrD);
7505    
7506     jitcClobberCarryAndFlags();
7507     NativeReg reg = jitcAllocRegister();
7508    
7509     vec_getWord(reg, vrA, 0);
7510     asmALU(X86_OR, reg, x86_mem2(modrm, &gCPU.vr[vrB]));
7511     asmALU(X86_NOT, reg);
7512     vec_setWord(vrD, 0, reg);
7513    
7514     vec_getWord(reg, vrA, 4);
7515     asmALU(X86_OR, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[4])));
7516     asmALU(X86_NOT, reg);
7517     vec_setWord(vrD, 4, reg);
7518    
7519     vec_getWord(reg, vrA, 8);
7520     asmALU(X86_OR, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[8])));
7521     asmALU(X86_NOT, reg);
7522     vec_setWord(vrD, 8, reg);
7523    
7524     vec_getWord(reg, vrA,12);
7525     asmALU(X86_OR, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[12])));
7526     asmALU(X86_NOT, reg);
7527     vec_setWord(vrD,12, reg);
7528    
7529     return flowContinue;
7530     #endif
7531     }
7532    
7533     /* vxor Vector Logical XOR
7534     * v.282
7535     */
7536     void ppc_opc_vxor()
7537     {
7538     VECTOR_DEBUG_COMMON;
7539     int vrD, vrA, vrB;
7540     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
7541    
7542     gCPU.vr[vrD].d[0] = gCPU.vr[vrA].d[0] ^ gCPU.vr[vrB].d[0];
7543     gCPU.vr[vrD].d[1] = gCPU.vr[vrA].d[1] ^ gCPU.vr[vrB].d[1];
7544     }
7545     JITCFlow ppc_opc_gen_vxor()
7546     {
7547     #if 0
7548     ppc_opc_gen_interpret(ppc_opc_vxor);
7549     return flowEndBlock;
7550     #else
7551     int vrD, vrA, vrB;
7552     modrm_o modrm;
7553     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
7554    
7555     if (vrA == vrB) {
7556     vec_Zero(vrD);
7557     return flowContinue;
7558     }
7559    
7560     if (SSE_AVAIL) {
7561     commutative_operation(X86_XORPS, vrD, vrA, vrB);
7562     return flowContinue;
7563     }
7564    
7565     jitcFlushClientVectorRegister(vrA);
7566     jitcFlushClientVectorRegister(vrB);
7567     jitcDropClientVectorRegister(vrD);
7568    
7569     jitcClobberCarryAndFlags();
7570     NativeReg reg = jitcAllocRegister();
7571    
7572     vec_getWord(reg, vrA, 0);
7573     asmALU(X86_XOR, reg, x86_mem2(modrm, &gCPU.vr[vrB]));
7574     vec_setWord(vrD, 0, reg);
7575    
7576     vec_getWord(reg, vrA, 4);
7577     asmALU(X86_XOR, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[4])));
7578     vec_setWord(vrD, 4, reg);
7579    
7580     vec_getWord(reg, vrA, 8);
7581     asmALU(X86_XOR, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[8])));
7582     vec_setWord(vrD, 8, reg);
7583    
7584     vec_getWord(reg, vrA,12);
7585     asmALU(X86_XOR, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[12])));
7586     vec_setWord(vrD,12, reg);
7587    
7588     return flowContinue;
7589     #endif
7590     }
7591    
7592    
/* Bits of the condition register written by the vector compares below.
 * CR_CR6 is the mask of the whole field; it is cleared from gCPU.cr
 * before the new flags are OR-ed in.
 */
#define CR_CR6		(0x00f0)
#define CR_CR6_EQ	(0x0080)	/* kept only if every element compared true */
#define CR_CR6_NE_SOME	(0x0040)	/* set if at least one element compared false */
#define CR_CR6_NE	(0x0020)	/* kept only if no element compared true */
#define CR_CR6_EQ_SOME	(0x0010)	/* set if at least one element compared true */
7598    
7599     /* vcmpequbx Vector Compare Equal-to Unsigned Byte
7600     * v.160
7601     */
7602     void ppc_opc_vcmpequbx()
7603     {
7604     VECTOR_DEBUG;
7605     int vrD, vrA, vrB;
7606     int tf=CR_CR6_EQ | CR_CR6_NE;
7607     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
7608    
7609     for (int i=0; i<16; i++) {
7610     if (gCPU.vr[vrA].b[i] == gCPU.vr[vrB].b[i]) {
7611     gCPU.vr[vrD].b[i] = 0xff;
7612     tf &= ~CR_CR6_NE;
7613     tf |= CR_CR6_EQ_SOME;
7614     } else {
7615     gCPU.vr[vrD].b[i] = 0;
7616     tf &= ~CR_CR6_EQ;
7617     tf |= CR_CR6_NE_SOME;
7618     }
7619     }
7620    
7621     if (PPC_OPC_VRc & gCPU.current_opc) {
7622     gCPU.cr &= ~CR_CR6;
7623     gCPU.cr |= tf;
7624     }
7625     }
7626     JITCFlow ppc_opc_gen_vcmpequbx()
7627     {
7628     #if 0
7629     ppc_opc_gen_interpret(ppc_opc_vcmpequbx);
7630     return flowEndBlock;
7631     #else
7632     int vrD, vrA, vrB;
7633     modrm_o modrm;
7634     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
7635    
7636     if (0 && vrA == vrB) {
7637     vec_Neg1(vrD);
7638    
7639     if (PPC_OPC_VRc & gJITC.current_opc) {
7640     jitcClobberCarryAndFlags();
7641    
7642     asmAND(&gCPU.cr, ~CR_CR6);
7643     asmOR(&gCPU.cr, CR_CR6_EQ | CR_CR6_EQ_SOME);
7644     }
7645    
7646     return flowContinue;
7647     }
7648    
7649     jitcFlushClientVectorRegister(vrA);
7650     jitcFlushClientVectorRegister(vrB);
7651     jitcDropClientVectorRegister(vrD);
7652    
7653     jitcClobberCarryAndFlags();
7654     NativeReg reg = REG_NO;
7655     NativeReg8 reg2 = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
7656    
7657     if (PPC_OPC_VRc & gJITC.current_opc) {
7658     reg = jitcAllocRegister();
7659    
7660     asmMOV_NoFlags(reg, CR_CR6_EQ | CR_CR6_NE);
7661     }
7662    
7663     for (int i=0; i<16; i++) {
7664     vec_getByte(reg2, vrA, REG_NO, i);
7665    
7666     asmALU(X86_CMP, reg2, x86_mem2(modrm, &gCPU.vr[vrB]+i));
7667    
7668     if (PPC_OPC_VRc & gJITC.current_opc) {
7669     NativeAddress skip1 = asmJxxFixup(X86_NE);
7670    
7671     asmMOV_NoFlags((NativeReg)reg2, 0xff);
7672     asmALU(X86_AND, reg, ~CR_CR6_NE);
7673     asmALU(X86_OR, reg, CR_CR6_EQ_SOME);
7674     NativeAddress skip2 = asmJMPFixup();
7675    
7676     asmResolveFixup(skip1);
7677     asmALU(X86_XOR, reg2, reg2);
7678     asmALU(X86_AND, reg, ~CR_CR6_EQ);
7679     asmALU(X86_OR, reg, CR_CR6_NE_SOME);
7680    
7681     asmResolveFixup(skip2);
7682     } else {
7683     asmSET(X86_NE, reg2);
7684    
7685     asmALU(X86_SUB, reg2, 1);
7686     }
7687    
7688     vec_setByte(vrD, REG_NO, i, reg2);
7689     }
7690    
7691     if (PPC_OPC_VRc & gJITC.current_opc) {
7692     asmAND(&gCPU.cr, ~CR_CR6);
7693     asmALU(X86_OR, x86_mem2(modrm, &gCPU.cr), reg);
7694     }
7695    
7696     return flowContinue;
7697     #endif
7698     }
7699    
7700     /* vcmpequhx Vector Compare Equal-to Unsigned Half Word
7701     * v.161
7702     */
7703     void ppc_opc_vcmpequhx()
7704     {
7705     VECTOR_DEBUG;
7706     int vrD, vrA, vrB;
7707     int tf=CR_CR6_EQ | CR_CR6_NE;
7708     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
7709    
7710     for (int i=0; i<8; i++) {
7711     if (gCPU.vr[vrA].h[i] == gCPU.vr[vrB].h[i]) {
7712     gCPU.vr[vrD].h[i] = 0xffff;
7713     tf &= ~CR_CR6_NE;
7714     tf |= CR_CR6_EQ_SOME;
7715     } else {
7716     gCPU.vr[vrD].h[i] = 0;
7717     tf &= ~CR_CR6_EQ;
7718     tf |= CR_CR6_NE_SOME;
7719     }
7720     }
7721    
7722     if (PPC_OPC_VRc & gCPU.current_opc) {
7723     gCPU.cr &= ~CR_CR6;
7724     gCPU.cr |= tf;
7725     }
7726     }
7727     JITCFlow ppc_opc_gen_vcmpequhx()
7728     {
7729     #if 0
7730     ppc_opc_gen_interpret(ppc_opc_vcmpequhx);
7731     return flowEndBlock;
7732     #else
7733     int vrD, vrA, vrB;
7734     modrm_o modrm;
7735     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
7736    
7737     if (vrA == vrB) {
7738     vec_Neg1(vrD);
7739    
7740     if (PPC_OPC_VRc & gJITC.current_opc) {
7741     jitcClobberCarryAndFlags();
7742    
7743     asmAND(&gCPU.cr, ~CR_CR6);
7744     asmOR(&gCPU.cr, CR_CR6_EQ | CR_CR6_EQ_SOME);
7745     }
7746    
7747     return flowContinue;
7748     }
7749    
7750     jitcFlushClientVectorRegister(vrA);
7751     jitcFlushClientVectorRegister(vrB);
7752     jitcDropClientVectorRegister(vrD);
7753    
7754     jitcClobberCarryAndFlags();
7755     NativeReg reg = REG_NO;
7756     NativeReg16 reg2 = (NativeReg16)jitcAllocRegister(NATIVE_REG_8);
7757    
7758     if (PPC_OPC_VRc & gJITC.current_opc) {
7759     reg = jitcAllocRegister();
7760    
7761     asmMOV_NoFlags(reg, CR_CR6_EQ | CR_CR6_NE);
7762     }
7763    
7764     for (int i=0; i<16; i += 2) {
7765     vec_getHalf(reg2, vrA, i);
7766    
7767     asmALU(X86_CMP, reg2, x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
7768    
7769     if (PPC_OPC_VRc & gJITC.current_opc) {
7770     NativeAddress skip1 = asmJxxFixup(X86_NE);
7771    
7772     asmMOV_NoFlags(reg2, 0xffff);
7773     asmALU(X86_AND, reg, ~CR_CR6_NE);
7774     asmALU(X86_OR, reg, CR_CR6_EQ_SOME);
7775     NativeAddress skip2 = asmJMPFixup();
7776    
7777     asmResolveFixup(skip1);
7778     asmALU(X86_XOR, reg2, reg2);
7779     asmALU(X86_AND, reg, ~CR_CR6_EQ);
7780     asmALU(X86_OR, reg, CR_CR6_NE_SOME);
7781    
7782     asmResolveFixup(skip2, asmHERE());
7783     } else {
7784     asmSET(X86_NE, (NativeReg8)reg2);
7785     asmMOVxx(X86_MOVZX, (NativeReg)reg2, (NativeReg8)reg2);
7786    
7787     asmALU(X86_SUB, reg2, 1);
7788     }
7789    
7790     vec_setHalf(vrD, i, reg2);
7791     }
7792    
7793     if (PPC_OPC_VRc & gJITC.current_opc) {
7794     asmAND(&gCPU.cr, ~CR_CR6);
7795     asmALU(X86_OR, x86_mem2(modrm, &gCPU.cr), reg);
7796     }
7797    
7798     return flowContinue;
7799     #endif
7800     }
7801    
7802     /* vcmpequwx Vector Compare Equal-to Unsigned Word
7803     * v.162
7804     */
7805     void ppc_opc_vcmpequwx()
7806     {
7807     VECTOR_DEBUG;
7808     int vrD, vrA, vrB;
7809     int tf=CR_CR6_EQ | CR_CR6_NE;
7810     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
7811    
7812     for (int i=0; i<4; i++) {
7813     if (gCPU.vr[vrA].w[i] == gCPU.vr[vrB].w[i]) {
7814     gCPU.vr[vrD].w[i] = 0xffffffff;
7815     tf &= ~CR_CR6_NE;
7816     tf |= CR_CR6_EQ_SOME;
7817     } else {
7818     gCPU.vr[vrD].w[i] = 0;
7819     tf &= ~CR_CR6_EQ;
7820     tf |= CR_CR6_NE_SOME;
7821     }
7822     }
7823    
7824     if (PPC_OPC_VRc & gCPU.current_opc) {
7825     gCPU.cr &= ~CR_CR6;
7826     gCPU.cr |= tf;
7827     }
7828     }
7829     JITCFlow ppc_opc_gen_vcmpequwx()
7830     {
7831     ppc_opc_gen_interpret(ppc_opc_vcmpequwx);
7832     return flowEndBlock;
7833     }
7834    
7835     /* vcmpeqfpx Vector Compare Equal-to-Floating Point
7836     * v.159
7837     */
7838     void ppc_opc_vcmpeqfpx()
7839     {
7840     VECTOR_DEBUG;
7841     int vrD, vrA, vrB;
7842     int tf=CR_CR6_EQ | CR_CR6_NE;
7843     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
7844    
7845     for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
7846     if (gCPU.vr[vrA].f[i] == gCPU.vr[vrB].f[i]) {
7847     gCPU.vr[vrD].w[i] = 0xffffffff;
7848     tf &= ~CR_CR6_NE;
7849     tf |= CR_CR6_EQ_SOME;
7850     } else {
7851     gCPU.vr[vrD].w[i] = 0;
7852     tf &= ~CR_CR6_EQ;
7853     tf |= CR_CR6_NE_SOME;
7854     }
7855     }
7856    
7857     if (PPC_OPC_VRc & gCPU.current_opc) {
7858     gCPU.cr &= ~CR_CR6;
7859     gCPU.cr |= tf;
7860     }
7861     }
7862     JITCFlow ppc_opc_gen_vcmpeqfpx()
7863     {
7864     ppc_opc_gen_interpret(ppc_opc_vcmpeqfpx);
7865     return flowEndBlock;
7866     }
7867    
7868     /* vcmpgtubx Vector Compare Greater-Than Unsigned Byte
7869     * v.168
7870     */
7871     void ppc_opc_vcmpgtubx()
7872     {
7873     VECTOR_DEBUG;
7874     int vrD, vrA, vrB;
7875     int tf=CR_CR6_EQ | CR_CR6_NE;
7876     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
7877    
7878     for (int i=0; i<16; i++) {
7879     if (gCPU.vr[vrA].b[i] > gCPU.vr[vrB].b[i]) {
7880     gCPU.vr[vrD].b[i] = 0xff;
7881     tf &= ~CR_CR6_NE;
7882     tf |= CR_CR6_EQ_SOME;
7883     } else {
7884     gCPU.vr[vrD].b[i] = 0;
7885     tf &= ~CR_CR6_EQ;
7886     tf |= CR_CR6_NE_SOME;
7887     }
7888     }
7889    
7890     if (PPC_OPC_VRc & gCPU.current_opc) {
7891     gCPU.cr &= ~CR_CR6;
7892     gCPU.cr |= tf;
7893     }
7894     }
7895     JITCFlow ppc_opc_gen_vcmpgtubx()
7896     {
7897     ppc_opc_gen_interpret(ppc_opc_vcmpgtubx);
7898     return flowEndBlock;
7899     }
7900    
7901     /* vcmpgtsbx Vector Compare Greater-Than Signed Byte
7902     * v.165
7903     */
7904     void ppc_opc_vcmpgtsbx()
7905     {
7906     VECTOR_DEBUG;
7907     int vrD, vrA, vrB;
7908     int tf=CR_CR6_EQ | CR_CR6_NE;
7909     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
7910    
7911     for (int i=0; i<16; i++) {
7912     if (gCPU.vr[vrA].sb[i] > gCPU.vr[vrB].sb[i]) {
7913     gCPU.vr[vrD].b[i] = 0xff;
7914     tf &= ~CR_CR6_NE;
7915     tf |= CR_CR6_EQ_SOME;
7916     } else {
7917     gCPU.vr[vrD].b[i] = 0;
7918     tf &= ~CR_CR6_EQ;
7919     tf |= CR_CR6_NE_SOME;
7920     }
7921     }
7922    
7923     if (PPC_OPC_VRc & gCPU.current_opc) {
7924     gCPU.cr &= ~CR_CR6;
7925     gCPU.cr |= tf;
7926     }
7927     }
7928     JITCFlow ppc_opc_gen_vcmpgtsbx()
7929     {
7930     ppc_opc_gen_interpret(ppc_opc_vcmpgtsbx);
7931     return flowEndBlock;
7932     }
7933    
7934     /* vcmpgtuhx Vector Compare Greater-Than Unsigned Half Word
7935     * v.169
7936     */
7937     void ppc_opc_vcmpgtuhx()
7938     {
7939     VECTOR_DEBUG;
7940     int vrD, vrA, vrB;
7941     int tf=CR_CR6_EQ | CR_CR6_NE;
7942     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
7943    
7944     for (int i=0; i<8; i++) {
7945     if (gCPU.vr[vrA].h[i] > gCPU.vr[vrB].h[i]) {
7946     gCPU.vr[vrD].h[i] = 0xffff;
7947     tf &= ~CR_CR6_NE;
7948     tf |= CR_CR6_EQ_SOME;
7949     } else {
7950     gCPU.vr[vrD].h[i] = 0;
7951     tf &= ~CR_CR6_EQ;
7952     tf |= CR_CR6_NE_SOME;
7953     }
7954     }
7955    
7956     if (PPC_OPC_VRc & gCPU.current_opc) {
7957     gCPU.cr &= ~CR_CR6;
7958     gCPU.cr |= tf;
7959     }
7960     }
7961     JITCFlow ppc_opc_gen_vcmpgtuhx()
7962     {
7963     #if 0
7964     ppc_opc_gen_interpret(ppc_opc_vcmpgtuhx);
7965     return flowEndBlock;
7966     #else
7967     int vrD, vrA, vrB;
7968     modrm_o modrm;
7969     PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
7970    
7971     if (vrA == vrB) {
7972     vec_Zero(vrD);
7973    
7974     if (PPC_OPC_VRc & gJITC.current_opc) {
7975     jitcClobberCarryAndFlags();
7976    
7977     asmAND(&gCPU.cr, ~CR_CR6);
7978     asmOR(&gCPU.cr, CR_CR6_NE | CR_CR6_NE_SOME);
7979     }
7980    
7981     return flowContinue;
7982     }
7983    
7984     jitcFlushClientVectorRegister(vrA);
7985     jitcFlushClientVectorRegister(vrB);
7986     jitcDropClientVectorRegister(vrD);
7987    
7988     jitcClobberCarryAndFlags();
7989     NativeReg reg = REG_NO;
7990     NativeReg16 reg2 = (NativeReg16)jitcAllocRegister(NATIVE_REG_8);
7991    
7992     if (PPC_OPC_VRc & gJITC.current_opc) {
7993     reg = jitcAllocRegister();
7994    
7995     asmMOV_NoFlags(reg, CR_CR6_EQ | CR_CR6_NE);
7996     }
7997    
7998     for (int i=0; i<16; i += 2) {
7999     vec_getHalf(reg2, vrA, i);
8000    
8001     asmALU(X86_CMP, reg2, x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
8002    
8003     if (PPC_OPC_VRc & gJITC.current_opc) {
8004     NativeAddress skip1 = asmJxxFixup(X86_BE);
8005    
8006     asmMOV_NoFlags(reg2, 0xffff);
8007     asmALU(X86_AND, reg, ~CR_CR6_NE);
8008     asmALU(X86_OR, reg, CR_CR6_EQ_SOME);
8009     NativeAddress skip2 = asmJMPFixup();
8010    
8011     asmResolveFixup(skip1);
8012     asmALU(X86_XOR, reg2, reg2);
8013     asmALU(X86_AND, reg, ~CR_CR6_EQ);
8014     asmALU(X86_OR, reg, CR_CR6_NE_SOME);
8015    
8016     asmResolveFixup(skip2);
8017     } else {
8018     asmSET(X86_BE, (NativeReg8)reg2);
8019     asmMOVxx(X86_MOVZX, (NativeReg)reg2, (NativeReg8)reg2);
8020    
8021     asmALU(X86_SUB, reg2, 1);
8022     }
8023    
8024     vec_setHalf(vrD, i, reg2);
8025     }
8026    
8027     if (PPC_OPC_VRc & gJITC.current_opc) {
8028     asmAND(&gCPU.cr, ~CR_CR6);
8029     asmALU(X86_OR, x86_mem2(modrm, &gCPU.cr), reg);
8030     }
8031    
8032     return flowContinue;
8033     #endif
8034     }
8035    
8036     /* vcmpgtshx Vector Compare Greater-Than Signed Half Word
8037     * v.166
8038     */
8039     void ppc_opc_vcmpgtshx()
8040     {
8041     VECTOR_DEBUG;
8042     int vrD, vrA, vrB;
8043     int tf=CR_CR6_EQ | CR_CR6_NE;
8044     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
8045    
8046     for (int i=0; i<8; i++) {
8047     if (gCPU.vr[vrA].sh[i] > gCPU.vr[vrB].sh[i]) {
8048     gCPU.vr[vrD].h[i] = 0xffff;
8049     tf &= ~CR_CR6_NE;
8050     tf |= CR_CR6_EQ_SOME;
8051     } else {
8052     gCPU.vr[vrD].h[i] = 0;
8053     tf &= ~CR_CR6_EQ;
8054     tf |= CR_CR6_NE_SOME;
8055     }
8056     }
8057    
8058     if (PPC_OPC_VRc & gCPU.current_opc) {
8059     gCPU.cr &= ~CR_CR6;
8060     gCPU.cr |= tf;
8061     }
8062     }
8063     JITCFlow ppc_opc_gen_vcmpgtshx()
8064     {
8065     ppc_opc_gen_interpret(ppc_opc_vcmpgtshx);
8066     return flowEndBlock;
8067     }
8068    
8069     /* vcmpgtuwx Vector Compare Greater-Than Unsigned Word
8070     * v.170
8071     */
8072     void ppc_opc_vcmpgtuwx()
8073     {
8074     VECTOR_DEBUG;
8075     int vrD, vrA, vrB;
8076     int tf=CR_CR6_EQ | CR_CR6_NE;
8077     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
8078    
8079     for (int i=0; i<4; i++) {
8080     if (gCPU.vr[vrA].w[i] > gCPU.vr[vrB].w[i]) {
8081     gCPU.vr[vrD].w[i] = 0xffffffff;
8082     tf &= ~CR_CR6_NE;
8083     tf |= CR_CR6_EQ_SOME;
8084     } else {
8085     gCPU.vr[vrD].w[i] = 0;
8086     tf &= ~CR_CR6_EQ;
8087     tf |= CR_CR6_NE_SOME;
8088     }
8089     }
8090    
8091     if (PPC_OPC_VRc & gCPU.current_opc) {
8092     gCPU.cr &= ~CR_CR6;
8093     gCPU.cr |= tf;
8094     }
8095     }
8096     JITCFlow ppc_opc_gen_vcmpgtuwx()
8097     {
8098     ppc_opc_gen_interpret(ppc_opc_vcmpgtuwx);
8099     return flowEndBlock;
8100     }
8101    
8102     /* vcmpgtswx Vector Compare Greater-Than Signed Word
8103     * v.167
8104     */
8105     void ppc_opc_vcmpgtswx()
8106     {
8107     VECTOR_DEBUG;
8108     int vrD, vrA, vrB;
8109     int tf=CR_CR6_EQ | CR_CR6_NE;
8110     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
8111    
8112     for (int i=0; i<4; i++) {
8113     if (gCPU.vr[vrA].sw[i] > gCPU.vr[vrB].sw[i]) {
8114     gCPU.vr[vrD].w[i] = 0xffffffff;
8115     tf &= ~CR_CR6_NE;
8116     tf |= CR_CR6_EQ_SOME;
8117     } else {
8118     gCPU.vr[vrD].w[i] = 0;
8119     tf &= ~CR_CR6_EQ;
8120     tf |= CR_CR6_NE_SOME;
8121     }
8122     }
8123    
8124     if (PPC_OPC_VRc & gCPU.current_opc) {
8125     gCPU.cr &= ~CR_CR6;
8126     gCPU.cr |= tf;
8127     }
8128     }
8129     JITCFlow ppc_opc_gen_vcmpgtswx()
8130     {
8131     ppc_opc_gen_interpret(ppc_opc_vcmpgtswx);
8132     return flowEndBlock;
8133     }
8134    
8135     /* vcmpgtfpx Vector Compare Greater-Than Floating-Point
8136     * v.164
8137     */
8138     void ppc_opc_vcmpgtfpx()
8139     {
8140     VECTOR_DEBUG;
8141     int vrD, vrA, vrB;
8142     int tf=CR_CR6_EQ | CR_CR6_NE;
8143     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
8144    
8145     for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
8146     if (gCPU.vr[vrA].f[i] > gCPU.vr[vrB].f[i]) {
8147     gCPU.vr[vrD].w[i] = 0xffffffff;
8148     tf &= ~CR_CR6_NE;
8149     tf |= CR_CR6_EQ_SOME;
8150     } else {
8151     gCPU.vr[vrD].w[i] = 0;
8152     tf &= ~CR_CR6_EQ;
8153     tf |= CR_CR6_NE_SOME;
8154     }
8155     }
8156    
8157     if (PPC_OPC_VRc & gCPU.current_opc) {
8158     gCPU.cr &= ~CR_CR6;
8159     gCPU.cr |= tf;
8160     }
8161     }
8162     JITCFlow ppc_opc_gen_vcmpgtfpx()
8163     {
8164     ppc_opc_gen_interpret(ppc_opc_vcmpgtfpx);
8165     return flowEndBlock;
8166     }
8167    
8168     /* vcmpgefpx Vector Compare Greater-Than-or-Equal-to Floating Point
8169     * v.163
8170     */
8171     void ppc_opc_vcmpgefpx()
8172     {
8173     VECTOR_DEBUG;
8174     int vrD, vrA, vrB;
8175     int tf=CR_CR6_EQ | CR_CR6_NE;
8176     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
8177    
8178     for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
8179     if (gCPU.vr[vrA].f[i] >= gCPU.vr[vrB].f[i]) {
8180     gCPU.vr[vrD].w[i] = 0xffffffff;
8181     tf &= ~CR_CR6_NE;
8182     tf |= CR_CR6_EQ_SOME;
8183     } else {
8184     gCPU.vr[vrD].w[i] = 0;
8185     tf &= ~CR_CR6_EQ;
8186     tf |= CR_CR6_NE_SOME;
8187     }
8188     }
8189    
8190     if (PPC_OPC_VRc & gCPU.current_opc) {
8191     gCPU.cr &= ~CR_CR6;
8192     gCPU.cr |= tf;
8193     }
8194     }
8195     JITCFlow ppc_opc_gen_vcmpgefpx()
8196     {
8197     ppc_opc_gen_interpret(ppc_opc_vcmpgefpx);
8198     return flowEndBlock;
8199     }
8200    
8201     /* vcmpbfpx Vector Compare Bounds Floating Point
8202     * v.157
8203     */
8204     void ppc_opc_vcmpbfpx()
8205     {
8206     VECTOR_DEBUG;
8207     int vrD, vrA, vrB;
8208     int le, ge;
8209     int ib=CR_CR6_NE;
8210     PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
8211    
8212     for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
8213     le = (gCPU.vr[vrA].f[i] <= gCPU.vr[vrB].f[i]) ? 0 : 0x80000000;
8214     ge = (gCPU.vr[vrA].f[i] >= -gCPU.vr[vrB].f[i]) ? 0 : 0x40000000;
8215    
8216     gCPU.vr[vrD].w[i] = le | ge;
8217     if (le | ge) {
8218     ib = 0;
8219     }
8220     }
8221    
8222     if (PPC_OPC_VRc & gCPU.current_opc) {
8223     gCPU.cr &= ~CR_CR6;
8224     gCPU.cr |= ib;
8225     }
8226     }
8227     JITCFlow ppc_opc_gen_vcmpbfpx()
8228     {
8229     ppc_opc_gen_interpret(ppc_opc_vcmpbfpx);
8230     return flowEndBlock;
8231     }

  ViewVC Help
Powered by ViewVC 1.1.26