/[pearpc]/src/cpu/cpu_jitc_x86/ppc_vec.cc
This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /src/cpu/cpu_jitc_x86/ppc_vec.cc

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1 - (show annotations)
Wed Sep 5 17:11:21 2007 UTC (16 years, 6 months ago) by dpavlin
File size: 177354 byte(s)
import upstream CVS
1 /*
2 * PearPC
3 * ppc_vec.cc
4 *
5 * Copyright (C) 2004 Daniel Foesch (dfoesch@cs.nmsu.edu)
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20
21 /* Pages marked: v.???
22 * From: IBM PowerPC MicroProcessor Family: Altivec(tm) Technology...
23 * Programming Environments Manual
24 */
25
26 #include <string.h>
27 #include <math.h>
28
29 /*
30 * FIXME: put somewhere appropriate
31 */
/* Fallback for hosts without log2(): change of base through the natural
 * logarithm.  The expansion is wrapped in parentheses so that it behaves
 * like a single value inside larger expressions (e.g. 1/log2(x)).
 */
#ifndef HAS_LOG2
#define log2(x) (log(x) / log(2))
#endif /* HAS_LOG2 */

/* Fallback for hosts without exp2(): 2 raised to the power x. */
#ifndef HAS_EXP2
#define exp2(x) (pow(2, (x)))
#endif /* HAS_EXP2 */
39
40 #define ASSERT_FLUSHED(vrA)
41 //#define ASSERT_FLUSHED(vrA) jitcAssertFlushedVectorRegister(vrA)
42
43 #include "debug/tracers.h"
44 #include "ppc_cpu.h"
45 #include "ppc_dec.h"
46 #include "ppc_fpu.h"
47 #include "ppc_vec.h"
48
/* Sign bit of a 32-bit word. */
#define SIGN32 0x80000000

/* Converts a register id RL into the id four positions higher, as a
 * NativeReg8 (presumably low-byte -> high-byte register of the same GPR;
 * confirm against the NativeReg8 enum).  Both the argument and the whole
 * expansion are parenthesized so expression arguments bind as expected.
 */
#define RL2RH(RL) ((NativeReg8)((RL) + 4))

/* Short aliases for the JITC's special client vector registers. */
#define vrT JITC_VECTOR_TEMP
#define vrNEG1 JITC_VECTOR_NEG1
55
56 static inline void commutative_operation(X86ALUPSopc opc, int vrD, int vrA, int vrB);
57 static inline void noncommutative_operation(X86ALUPSopc opc, int vrD, int vrA, int vrB);
58
59 static inline void commutative_operation(X86PALUopc opc, int vrD, int vrA, int vrB);
60 static inline void noncommutative_operation(X86PALUopc opc, int vrD, int vrA, int vrB);
61
62 /* PACK_PIXEL Packs a uint32 pixel to uint16 pixel
63 * v.219
64 */
65 static inline uint16 PACK_PIXEL(uint32 clr)
66 {
67 return (((clr & 0x000000f8) >> 3) | \
68 ((clr & 0x0000f800) >> 6) | \
69 ((clr & 0x01f80000) >> 9));
70 }
71
72 /* UNPACK_PIXEL Unpacks a uint16 pixel to uint32 pixel
73 * v.276 & v.279
74 */
75 static inline uint32 UNPACK_PIXEL(uint16 clr)
76 {
77 return (((uint32)(clr & 0x001f)) | \
78 ((uint32)(clr & 0x03E0) << 3) | \
79 ((uint32)(clr & 0x7c00) << 6) | \
80 (((clr) & 0x8000) ? 0xff000000 : 0));
81 }
82
83 static inline uint8 SATURATE_UB(uint16 val)
84 {
85 if (val & 0xff00) {
86 gCPU.vscr |= VSCR_SAT;
87 return 0xff;
88 }
89 return val;
90 }
91 static inline uint8 SATURATE_0B(uint16 val)
92 {
93 if (val & 0xff00) {
94 gCPU.vscr |= VSCR_SAT;
95 return 0;
96 }
97 return val;
98 }
99
100 static inline uint16 SATURATE_UH(uint32 val)
101 {
102 if (val & 0xffff0000) {
103 gCPU.vscr |= VSCR_SAT;
104 return 0xffff;
105 }
106 return val;
107 }
108
109 static inline uint16 SATURATE_0H(uint32 val)
110 {
111 if (val & 0xffff0000) {
112 gCPU.vscr |= VSCR_SAT;
113 return 0;
114 }
115 return val;
116 }
117
118 static inline sint8 SATURATE_SB(sint16 val)
119 {
120 if (val > 127) { // 0x7F
121 gCPU.vscr |= VSCR_SAT;
122 return 127;
123 } else if (val < -128) { // 0x80
124 gCPU.vscr |= VSCR_SAT;
125 return -128;
126 }
127 return val;
128 }
129
130 static inline uint8 SATURATE_USB(sint16 val)
131 {
132 if (val > 0xff) {
133 gCPU.vscr |= VSCR_SAT;
134 return 0xff;
135 } else if (val < 0) {
136 gCPU.vscr |= VSCR_SAT;
137 return 0;
138 }
139 return (uint8)val;
140 }
141
142 static inline sint16 SATURATE_SH(sint32 val)
143 {
144 if (val > 32767) { // 0x7fff
145 gCPU.vscr |= VSCR_SAT;
146 return 32767;
147 } else if (val < -32768) { // 0x8000
148 gCPU.vscr |= VSCR_SAT;
149 return -32768;
150 }
151 return val;
152 }
153
154 static inline uint16 SATURATE_USH(sint32 val)
155 {
156 if (val > 0xffff) {
157 gCPU.vscr |= VSCR_SAT;
158 return 0xffff;
159 } else if (val < 0) {
160 gCPU.vscr |= VSCR_SAT;
161 return 0;
162 }
163 return (uint16)val;
164 }
165
166 static inline sint32 SATURATE_UW(sint64 val)
167 {
168 if (val > 0xffffffffLL) {
169 gCPU.vscr |= VSCR_SAT;
170 return 0xffffffffLL;
171 }
172 return val;
173 }
174
175 static inline sint32 SATURATE_SW(sint64 val)
176 {
177 if (val > 2147483647LL) { // 0x7fffffff
178 gCPU.vscr |= VSCR_SAT;
179 return 2147483647LL;
180 } else if (val < -2147483648LL) { // 0x80000000
181 gCPU.vscr |= VSCR_SAT;
182 return -2147483648LL;
183 }
184 return val;
185 }
186
187 static inline void vec_raise_saturate(void)
188 {
189 asmOR(&gCPU.vscr, VSCR_SAT);
190 }
191
192 static inline void vec_saturateSB(NativeReg reg, NativeReg reg2)
193 {
194 jitcClobberCarryAndFlags();
195
196 asmMOV_NoFlags(reg2, 0x7f);
197 asmALU(X86_CMP, reg, reg2);
198 NativeAddress skip1 = asmJxxFixup(X86_G);
199
200 asmMOV_NoFlags(reg2, 0xffffff80);
201 asmALU(X86_CMP, reg, reg2);
202 NativeAddress skip2 = asmJxxFixup(X86_L);
203
204 NativeAddress skip3 = asmJMPFixup();
205
206 asmResolveFixup(skip1);
207 asmResolveFixup(skip2);
208
209 asmALU(X86_MOV, reg, reg2);
210 vec_raise_saturate();
211
212 asmResolveFixup(skip3);
213 }
214
215 static inline void vec_saturateSUB(NativeReg reg, NativeReg reg2)
216 {
217 jitcClobberCarryAndFlags();
218
219 asmMOV_NoFlags(reg2, 0xff);
220 asmALU(X86_CMP, reg, reg2);
221 NativeAddress skip1 = asmJxxFixup(X86_G);
222
223 asmALU(X86_XOR, reg2, reg2);
224 asmALU(X86_CMP, reg, reg2);
225 NativeAddress skip2 = asmJxxFixup(X86_L);
226
227 NativeAddress skip3 = asmJMPFixup();
228
229 asmResolveFixup(skip1);
230 asmResolveFixup(skip2);
231
232 asmALU(X86_MOV, reg, reg2);
233 vec_raise_saturate();
234
235 asmResolveFixup(skip3);
236 }
237
238 static inline void vec_saturateUB(NativeReg8 reg)
239 {
240 jitcClobberCarryAndFlags();
241
242 asmALU(X86_CMP, (NativeReg)reg, 0xff);
243 NativeAddress skip1 = asmJxxFixup(X86_BE);
244
245 asmSET(X86_S, reg);
246 asmALU(X86_SUB, reg, 1);
247
248 vec_raise_saturate();
249
250 asmResolveFixup(skip1);
251 }
252
253
254 static inline void vec_saturateSH(NativeReg reg, NativeReg reg2)
255 {
256 jitcClobberCarryAndFlags();
257
258 asmMOV_NoFlags(reg2, 0x7fff);
259 asmALU(X86_CMP, reg, reg2);
260 NativeAddress skip1 = asmJxxFixup(X86_G);
261
262 asmMOV_NoFlags(reg2, 0xffff8000);
263 asmALU(X86_CMP, reg, reg2);
264 NativeAddress skip2 = asmJxxFixup(X86_L);
265
266 NativeAddress skip3 = asmJMPFixup();
267
268 asmResolveFixup(skip1);
269 asmResolveFixup(skip2);
270
271 asmALU(X86_MOV, reg, reg2);
272 vec_raise_saturate();
273
274 asmResolveFixup(skip3);
275 }
276
277 static inline void vec_saturateSUH(NativeReg reg, NativeReg reg2)
278 {
279 jitcClobberCarryAndFlags();
280
281 asmMOV_NoFlags(reg2, 0xffff);
282 asmALU(X86_CMP, reg, reg2);
283 NativeAddress skip1 = asmJxxFixup(X86_G);
284
285 asmALU(X86_XOR, reg2, reg2);
286 asmALU(X86_CMP, reg, reg2);
287 NativeAddress skip2 = asmJxxFixup(X86_L);
288
289 NativeAddress skip3 = asmJMPFixup();
290
291 asmResolveFixup(skip1);
292 asmResolveFixup(skip2);
293
294 asmALU(X86_MOV, reg, reg2);
295 vec_raise_saturate();
296
297 asmResolveFixup(skip3);
298 }
299
300 static inline void vec_saturateUH(NativeReg8 reg)
301 {
302 jitcClobberCarryAndFlags();
303
304 asmALU(X86_CMP, (NativeReg)reg, 0xffff);
305 NativeAddress skip1 = asmJxxFixup(X86_BE);
306
307 asmSET(X86_S, reg);
308 asmMOVxx(X86_MOVZX, (NativeReg)reg, reg);
309 asmALU(X86_SUB, (NativeReg)reg, 1);
310
311 vec_raise_saturate();
312
313 asmResolveFixup(skip1);
314 }
315
316 static inline void vec_getByte(NativeReg8 dest, int src, NativeReg r, int i)
317 {
318 modrm_o modrm;
319
320 ASSERT_FLUSHED(src);
321
322 asmALU(X86_MOV, dest, x86_mem2(modrm, r, &(gCPU.vr[src].b[i])));
323 }
324
325 static inline void vec_setByte(int dest, NativeReg r, int i, NativeReg8 src)
326 {
327 modrm_o modrm;
328
329 ASSERT_FLUSHED(dest);
330
331 asmALU(X86_MOV, x86_mem2(modrm, r, &(gCPU.vr[dest].b[i])), src);
332 }
333
334 static inline void vec_getByteZ(NativeReg dest, int src, int i)
335 {
336 modrm_o modrm;
337
338 ASSERT_FLUSHED(src);
339
340 asmMOVxx_B(X86_MOVZX, dest, x86_mem2(modrm, &(gCPU.vr[src].b[i])));
341 }
342
343 static inline void vec_getByteS(NativeReg dest, int src, int i)
344 {
345 modrm_o modrm;
346
347 ASSERT_FLUSHED(src);
348
349 asmMOVxx_B(X86_MOVSX, dest, x86_mem2(modrm, &(gCPU.vr[src].b[i])));
350 }
351
352 static inline void vec_getHalf(NativeReg16 dest, int src, int i)
353 {
354 ASSERT_FLUSHED(src);
355
356 asmMOV(dest, &(gCPU.vr[src].b[i]));
357 }
358
359 static inline void vec_setHalf(int dest, int i, NativeReg16 src)
360 {
361 ASSERT_FLUSHED(dest);
362
363 asmMOV(&(gCPU.vr[dest].b[i]), src);
364 }
365
366 static inline void vec_getHalfZ(NativeReg dest, int src, int i)
367 {
368 modrm_o modrm;
369
370 ASSERT_FLUSHED(src);
371
372 asmMOVxx_W(X86_MOVZX, dest, x86_mem2(modrm, &(gCPU.vr[src].b[i])));
373 }
374
375 static inline void vec_getHalfS(NativeReg dest, int src, int i)
376 {
377 modrm_o modrm;
378
379 ASSERT_FLUSHED(src);
380
381 asmMOVxx_W(X86_MOVSX, dest, x86_mem2(modrm, &(gCPU.vr[src].b[i])));
382 }
383
384 static inline void vec_getWord(NativeReg dest, int src, int i)
385 {
386 ASSERT_FLUSHED(src);
387
388 asmMOV(dest, &(gCPU.vr[src].b[i]));
389 }
390
391 static inline void vec_setWord(int dest, int i, NativeReg src)
392 {
393 ASSERT_FLUSHED(dest);
394
395 asmMOV(&(gCPU.vr[dest].b[i]), src);
396 }
397
398 #define SSE_AVAIL gJITC.hostCPUCaps.sse
399 //#define SSE_AVAIL 0
400
401 #define SSE2_AVAIL gJITC.hostCPUCaps.sse2
402 //#define SSE2_AVAIL 0
403
404 #define SSE_NO 0
405 #define SSE2_NO 0
406
407 const static sint32 sint32_max = 2147483647;
408 const static sint32 sint32_min = -2147483648;
409 const static double uint32_max = 4294967295;
410 const static uint32 uint32_min = 0;
411
412 static inline void vec_Copy(int dest, int src)
413 {
414 if (dest == src);
415
416 else if (SSE_AVAIL) {
417 NativeVectorReg reg = jitcMapClientVectorRegisterDirty(dest);
418 NativeVectorReg reg2 = jitcGetClientVectorRegisterMapping(src);
419
420 if (reg2 == VECTREG_NO)
421 asmMOVAPS(reg, &gCPU.vr[src]);
422 else
423 asmALUPS(X86_MOVAPS, reg, reg2);
424 } else {
425 jitcFlushClientVectorRegister(src);
426 jitcDropClientVectorRegister(dest);
427
428 NativeReg reg = jitcAllocRegister();
429
430 for (int i=0; i<16; i+=4) {
431 vec_getWord(reg, src, i);
432 vec_setWord(dest, i, reg);
433 }
434 }
435 }
436
437 static inline void vec_Zero(int dest)
438 {
439 if (SSE_AVAIL) {
440 NativeVectorReg reg = jitcMapClientVectorRegisterDirty(dest);
441
442 asmALUPS(X86_XORPS, reg, reg);
443 } else {
444 jitcDropClientVectorRegister(dest);
445
446 NativeReg reg = jitcAllocRegister();
447
448 asmMOV_NoFlags(reg, 0); // avoid flag clobber
449
450 vec_setWord(dest, 0, reg);
451 vec_setWord(dest, 4, reg);
452 vec_setWord(dest, 8, reg);
453 vec_setWord(dest,12, reg);
454 }
455 }
456
457 static inline void vec_Neg1(int dest)
458 {
459 if (SSE_AVAIL) {
460 NativeVectorReg reg = jitcMapClientVectorRegisterDirty(dest);
461 NativeVectorReg n1 = jitcGetClientVectorRegister(vrNEG1);
462
463 asmALUPS(X86_MOVAPS, reg, n1);
464 } else {
465 jitcDropClientVectorRegister(dest);
466
467 NativeReg reg = jitcAllocRegister();
468
469 asmMOV_NoFlags(reg, 0xffffffff); // avoid flag clobber
470
471 vec_setWord(dest, 0, reg);
472 vec_setWord(dest, 4, reg);
473 vec_setWord(dest, 8, reg);
474 vec_setWord(dest,12, reg);
475 }
476 }
477
478 static inline void vec_Not(int dest, int src)
479 {
480 if (SSE_AVAIL) {
481 NativeVectorReg reg1;
482 NativeVectorReg reg2;
483
484 if (src == dest) {
485 reg1 = jitcGetClientVectorRegisterDirty(dest);
486 } else {
487 reg1 = jitcMapClientVectorRegisterDirty(dest);
488 reg2 = jitcGetClientVectorRegisterMapping(src);
489
490 if (reg2 == VECTREG_NO)
491 asmMOVAPS(reg1, &gCPU.vr[src]);
492 else
493 asmALUPS(X86_MOVAPS, reg1, reg2);
494 }
495
496 reg2 = jitcGetClientVectorRegister(vrNEG1);
497
498 asmALUPS(X86_XORPS, reg1, reg2);
499 } else {
500 jitcFlushClientVectorRegister(src);
501 jitcDropClientVectorRegister(dest);
502
503 // according to documentation NOT does not effect flags!
504 NativeReg reg = jitcAllocRegister();
505
506 for (int i=0; i<16; i+=4) {
507 vec_getWord(reg, src, i);
508 asmALU(X86_NOT, reg);
509 vec_setWord(dest, i, reg);
510 }
511 }
512 }
513
514 static inline void vec_SelectiveLoadByte(X86FlagTest flags, NativeReg8 reg1, NativeReg reg2, int vrA, int vrB)
515 {
516 modrm_o modrm;
517
518 NativeAddress skip = asmJxxFixup(flags);
519 asmALU(X86_MOV, (NativeReg)reg1,
520 x86_mem2(modrm, reg2, &(gCPU.vr[vrA].b[16])));
521
522 NativeAddress end = asmJMPFixup();
523
524 asmResolveFixup(skip);
525 asmALU(X86_MOV, (NativeReg)reg1,
526 x86_mem2(modrm, reg2, &(gCPU.vr[vrB].b[16])));
527
528 asmResolveFixup(end);
529 }
530
531 static inline void vec_PermuteQuad(NativeReg8 reg1, NativeReg reg2, NativeReg8 regT, NativeReg regA, NativeReg regB, int vrA, int vrB, int vrD, int i)
532 {
533 modrm_o modrm;
534
535 asmMOVxx(X86_MOVSX, regA, reg1);
536 asmMOVxx(X86_MOVSX, regB, RL2RH(reg1));
537
538 if (gJITC.hostCPUCaps.cmov) {
539 asmALU(X86_TEST, reg2, 0x10);
540 asmCMOV(X86_Z, (NativeReg)regT,
541 x86_mem2(modrm, regA, &(gCPU.vr[vrA].b[16])));
542 asmCMOV(X86_NZ, (NativeReg)regT,
543 x86_mem2(modrm, regA, &(gCPU.vr[vrB].b[16])));
544
545 vec_setByte(vrD, REG_NO, i, regT);
546
547 asmALU(X86_TEST, reg2, 0x1000);
548 asmCMOV(X86_Z, (NativeReg)regT,
549 x86_mem2(modrm, regB, &(gCPU.vr[vrA].b[16])));
550 asmCMOV(X86_NZ, (NativeReg)regT,
551 x86_mem2(modrm, regB, &(gCPU.vr[vrB].b[16])));
552
553 vec_setByte(vrD, REG_NO, i+1, regT);
554
555 asmShift(X86_SHR, (NativeReg)reg1, 16);
556
557 asmMOVxx(X86_MOVSX, regA, reg1);
558 asmMOVxx(X86_MOVSX, regB, RL2RH(reg1));
559
560 asmALU(X86_TEST, reg2, 0x100000);
561 asmCMOV(X86_Z, (NativeReg)regT,
562 x86_mem2(modrm, regA, &(gCPU.vr[vrA].b[16])));
563 asmCMOV(X86_NZ, (NativeReg)regT,
564 x86_mem2(modrm, regA, &(gCPU.vr[vrB].b[16])));
565
566 vec_setByte(vrD, REG_NO, i+2, regT);
567
568 asmALU(X86_TEST, reg2, 0x10000000);
569 asmCMOV(X86_Z, (NativeReg)regT,
570 x86_mem2(modrm, regB, &(gCPU.vr[vrA].b[16])));
571 asmCMOV(X86_NZ, (NativeReg)regT,
572 x86_mem2(modrm, regB, &(gCPU.vr[vrB].b[16])));
573
574 vec_setByte(vrD, REG_NO, i+3, regT);
575 } else {
576 asmALU(X86_TEST, reg2, 0x10);
577 vec_SelectiveLoadByte(X86_NZ, regT, regA, vrA, vrB);
578 vec_setByte(vrD, REG_NO, i, regT);
579
580 asmALU(X86_TEST, reg2, 0x1000);
581 vec_SelectiveLoadByte(X86_NZ, regT, regB, vrA, vrB);
582 vec_setByte(vrD, REG_NO, i+1, regT);
583
584 asmShift(X86_SHR, (NativeReg)reg1, 16);
585
586 asmMOVxx(X86_MOVSX, regA, reg1);
587 asmMOVxx(X86_MOVSX, regB, RL2RH(reg1));
588
589 asmALU(X86_TEST, reg2, 0x100000);
590 vec_SelectiveLoadByte(X86_NZ, regT, regA, vrA, vrB);
591 vec_setByte(vrD, REG_NO, i+2, regT);
592
593 asmALU(X86_TEST, reg2, 0x10000000);
594 vec_SelectiveLoadByte(X86_NZ, regT, regB, vrA, vrB);
595 vec_setByte(vrD, REG_NO, i+3, regT);
596 }
597 }
598
599 /* vperm Vector Permutation
600 * v.218
601 */
602 void ppc_opc_vperm()
603 {
604 VECTOR_DEBUG_COMMON;
605 int vrD, vrA, vrB, vrC;
606 int sel;
607 Vector_t r;
608 PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
609 for (int i=0; i<16; i++) {
610 sel = gCPU.vr[vrC].b[i];
611 if (sel & 0x10)
612 r.b[i] = VECT_B(gCPU.vr[vrB], sel & 0xf);
613 else
614 r.b[i] = VECT_B(gCPU.vr[vrA], sel & 0xf);
615 }
616
617 gCPU.vr[vrD] = r;
618 }
619 JITCFlow ppc_opc_gen_vperm()
620 {
621 #if 0
622 ppc_opc_gen_interpret(ppc_opc_vperm);
623 return flowEndBlock;
624 #else
625 int vrD, vrA, vrB, vrC, vrTMP;
626 PPC_OPC_TEMPL_A(gJITC.current_opc, vrD, vrA, vrB, vrC);
627
628 jitcFlushClientVectorRegister(vrA);
629 jitcFlushClientVectorRegister(vrB);
630 jitcFlushClientVectorRegister(vrC);
631
632 jitcClobberCarryAndFlags();
633 NativeReg8 reg1 = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
634 NativeReg8 regT = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
635 NativeReg reg2 = jitcAllocRegister();
636 NativeReg regA = jitcAllocRegister();
637 NativeReg regB = jitcAllocRegister();
638
639 vrTMP = ((vrD == vrA) || (vrD == vrB)) ? vrT : vrD;
640 jitcDropClientVectorRegister(vrTMP);
641
642 for (int i=0; i<16; i+=4) {
643 vec_getWord((NativeReg)reg1, vrC, i);
644
645 asmALU(X86_MOV, reg2, (NativeReg)reg1);
646
647 asmALU(X86_AND, (NativeReg)reg1, 0x0f0f0f0f);
648 asmALU(X86_NOT, (NativeReg)reg1);
649
650 // reg1 = original map (selector mask)
651 // reg2 = negative offsets - 1
652 // regT = spare use register (must be 8-bit)
653 // regA = spare use register
654 // regB = spare use register
655 vec_PermuteQuad(reg1, reg2, regT, regA, regB, vrA, vrB, vrTMP, i);
656 }
657
658 if (vrTMP == vrT) {
659 if (SSE_AVAIL) {
660 vec_Copy(vrD, vrT);
661 } else {
662 /* Since we already have a number of registers
663 * allocated, this is much faster than the non-SSE
664 * vec_Copy()
665 */
666 jitcDropClientVectorRegister(vrD);
667
668 vec_getWord((NativeReg)reg1, vrT, 0);
669 vec_getWord(reg2, vrT, 4);
670 vec_getWord(regA, vrT, 8);
671 vec_getWord(regB, vrT,12);
672
673 vec_setWord(vrD, 0, (NativeReg)reg1);
674 vec_setWord(vrD, 4, reg2);
675 vec_setWord(vrD, 8, regA);
676 vec_setWord(vrD,12, regB);
677 }
678 }
679
680 return flowContinue;
681 #endif
682 }
683
684 /* vsel Vector Select
685 * v.238
686 */
687 void ppc_opc_vsel()
688 {
689 VECTOR_DEBUG;
690 int vrD, vrA, vrB, vrC;
691 uint64 mask, val;
692 PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
693
694 mask = gCPU.vr[vrC].d[0];
695 val = gCPU.vr[vrB].d[0] & mask;
696 val |= gCPU.vr[vrA].d[0] & ~mask;
697 gCPU.vr[vrD].d[0] = val;
698
699 mask = gCPU.vr[vrC].d[1];
700 val = gCPU.vr[vrB].d[1] & mask;
701 val |= gCPU.vr[vrA].d[1] & ~mask;
702 gCPU.vr[vrD].d[1] = val;
703 }
704 JITCFlow ppc_opc_gen_vsel()
705 {
706 #if 0
707 ppc_opc_gen_interpret(ppc_opc_vsel);
708 return flowEndBlock;
709 #else
710 int vrD, vrA, vrB, vrC;
711 PPC_OPC_TEMPL_A(gJITC.current_opc, vrD, vrA, vrB, vrC);
712
713 if (vrA == vrB) {
714 vec_Copy(vrD, vrB);
715 return flowContinue;
716 }
717
718 if (SSE_AVAIL) {
719 modrm_o modrm;
720
721 /* We cannot use jitcMapClientVectorRegisterDirty(vrD) here,
722 * because if vrD == vrC
723 */
724 NativeVectorReg d = jitcAllocVectorRegister();
725 NativeVectorReg c;
726
727 if (vrA == vrC) {
728 c = jitcGetClientVectorRegisterMapping(vrC);
729
730 if (c == VECTREG_NO) {
731 asmMOVAPS(d, &gCPU.vr[vrC]);
732 } else {
733 asmALUPS(X86_MOVAPS, d, c);
734 }
735 } else {
736 c = jitcGetClientVectorRegister(vrC);
737 jitcTrashVectorRegister(NATIVE_REG | c);
738
739 asmALUPS(X86_MOVAPS, d, c);
740 }
741
742 if (vrB != vrC) {
743 NativeVectorReg b = jitcGetClientVectorRegisterMapping(vrB);
744
745 if (b == VECTREG_NO) {
746 asmALUPS(X86_ANDPS, d,
747 x86_mem2(modrm, &gCPU.vr[vrB]));
748 } else
749 asmALUPS(X86_ANDPS, d, b);
750 }
751
752 if (vrA != vrC) {
753 NativeVectorReg a = jitcGetClientVectorRegisterMapping(vrA);
754
755 if (a == VECTREG_NO) {
756 asmALUPS(X86_ANDNPS, c,
757 x86_mem2(modrm, &gCPU.vr[vrA]));
758 } else
759 asmALUPS(X86_ANDNPS, c, a);
760
761 asmALUPS(X86_ORPS, d, c);
762 }
763
764 jitcRenameVectorRegisterDirty(d, vrD);
765
766 return flowContinue;
767 }
768
769 jitcFlushClientVectorRegister(vrA);
770 jitcFlushClientVectorRegister(vrB);
771 jitcFlushClientVectorRegister(vrC);
772 jitcDropClientVectorRegister(vrD);
773
774 jitcClobberCarryAndFlags();
775 NativeReg reg1 = jitcAllocRegister();
776 NativeReg reg2 = jitcAllocRegister();
777 modrm_o modrm;
778
779 for (int i=0; i<16; i+=4) {
780 vec_getWord(reg1, vrC, i);
781 asmALU(X86_MOV, reg2, reg1);
782 asmALU(X86_NOT, reg2);
783
784 asmALU(X86_AND, reg1, x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
785
786 asmALU(X86_AND, reg2, x86_mem2(modrm, &(gCPU.vr[vrA].b[i])));
787
788 asmALU(X86_OR, reg1, reg2);
789 vec_setWord(vrD, i, reg1);
790 }
791
792 return flowContinue;
793 #endif
794 }
795
796 /* vsrb Vector Shift Right Byte
797 * v.256
798 */
799 void ppc_opc_vsrb()
800 {
801 VECTOR_DEBUG;
802 int vrD, vrA, vrB;
803 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
804 for (int i=0; i<16; i++) {
805 gCPU.vr[vrD].b[i] = gCPU.vr[vrA].b[i] >> (gCPU.vr[vrB].b[i] & 0x7);
806 }
807 }
808 JITCFlow ppc_opc_gen_vsrb()
809 {
810 #if 0
811 ppc_opc_gen_interpret(ppc_opc_vsrb);
812 return flowEndBlock;
813 #else
814 int vrD, vrA, vrB;
815 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
816
817 jitcFlushClientVectorRegister(vrA);
818 jitcFlushClientVectorRegister(vrB);
819 jitcDropClientVectorRegister(vrD);
820
821 jitcClobberCarryAndFlags();
822 jitcAllocRegister(NATIVE_REG | ECX);
823 NativeReg8 reg = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
824
825 for (int i=0; i<16; i++) {
826 vec_getByte(reg, vrA, REG_NO, i);
827 vec_getByte(CL, vrB, REG_NO, i);
828
829 asmALU(X86_AND, CL, 0x7);
830 asmShift_CL(X86_SHR, reg);
831
832 vec_setByte(vrD, REG_NO, i, reg);
833 }
834
835 return flowContinue;
836 #endif
837 }
838
839 /* vsrh Vector Shift Right Half Word
840 * v.257
841 */
842 void ppc_opc_vsrh()
843 {
844 VECTOR_DEBUG;
845 int vrD, vrA, vrB;
846 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
847 for (int i=0; i<8; i++) {
848 gCPU.vr[vrD].h[i] = gCPU.vr[vrA].h[i] >> (gCPU.vr[vrB].h[i] & 0xf);
849 }
850 }
851 JITCFlow ppc_opc_gen_vsrh()
852 {
853 #if 0
854 ppc_opc_gen_interpret(ppc_opc_vsrh);
855 return flowEndBlock;
856 #else
857 int vrD, vrA, vrB;
858 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
859
860 jitcFlushClientVectorRegister(vrA);
861 jitcFlushClientVectorRegister(vrB);
862 jitcDropClientVectorRegister(vrD);
863
864 jitcClobberCarryAndFlags();
865 jitcAllocRegister(NATIVE_REG | ECX);
866 NativeReg16 reg = (NativeReg16)jitcAllocRegister();
867
868 for (int i=0; i<16; i+=2) {
869 vec_getHalf(reg, vrA, i);
870 vec_getHalf(CX, vrB, i);
871
872 asmALU(X86_AND, CL, 0x0f);
873 asmShift_CL(X86_SHR, reg);
874
875 vec_setHalf(vrD, i, reg);
876 }
877
878 return flowContinue;
879 #endif
880 }
881
882 /* vsrw Vector Shift Right Word
883 * v.259
884 */
885 void ppc_opc_vsrw()
886 {
887 VECTOR_DEBUG;
888 int vrD, vrA, vrB;
889 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
890 for (int i=0; i<4; i++) {
891 gCPU.vr[vrD].w[i] = gCPU.vr[vrA].w[i] >> (gCPU.vr[vrB].w[i] & 0x1f);
892 }
893 }
894 JITCFlow ppc_opc_gen_vsrw()
895 {
896 #if 0
897 ppc_opc_gen_interpret(ppc_opc_vsrw);
898 return flowEndBlock;
899 #else
900 int vrD, vrA, vrB;
901 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
902
903 jitcFlushClientVectorRegister(vrA);
904 jitcFlushClientVectorRegister(vrB);
905 jitcDropClientVectorRegister(vrD);
906
907 jitcClobberCarryAndFlags();
908 jitcAllocRegister(NATIVE_REG | ECX);
909 NativeReg reg = jitcAllocRegister();
910
911 for (int i=0; i<16; i+=4) {
912 vec_getWord(reg, vrA, i);
913 vec_getWord(ECX, vrB, i);
914
915 // this instruction auto-masks to 0x1f
916 asmShift_CL(X86_SHR, reg);
917
918 vec_setWord(vrD, i, reg);
919 }
920
921 return flowContinue;
922 #endif
923 }
924
925 /* vsrab Vector Shift Right Arithmetic Byte
926 * v.253
927 */
928 void ppc_opc_vsrab()
929 {
930 VECTOR_DEBUG;
931 int vrD, vrA, vrB;
932 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
933 for (int i=0; i<16; i++) {
934 gCPU.vr[vrD].sb[i] = gCPU.vr[vrA].sb[i] >> (gCPU.vr[vrB].b[i] & 0x7);
935 }
936 }
937 JITCFlow ppc_opc_gen_vsrab()
938 {
939 #if 0
940 ppc_opc_gen_interpret(ppc_opc_vsrab);
941 return flowEndBlock;
942 #else
943 int vrD, vrA, vrB;
944 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
945
946 jitcFlushClientVectorRegister(vrA);
947 jitcFlushClientVectorRegister(vrB);
948 jitcDropClientVectorRegister(vrD);
949
950 jitcClobberCarryAndFlags();
951 jitcAllocRegister(NATIVE_REG | ECX);
952 NativeReg8 reg = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
953
954 for (int i=0; i<16; i++) {
955 vec_getByte(reg, vrA, REG_NO, i);
956 vec_getByte(CL, vrB, REG_NO, i);
957
958 asmALU(X86_AND, CL, 0x7);
959 asmShift_CL(X86_SAR, reg);
960
961 vec_setByte(vrD, REG_NO, i, reg);
962 }
963
964 return flowContinue;
965 #endif
966 }
967
968 /* vsrah Vector Shift Right Arithmetic Half Word
969 * v.254
970 */
971 void ppc_opc_vsrah()
972 {
973 VECTOR_DEBUG;
974 int vrD, vrA, vrB;
975 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
976 for (int i=0; i<8; i++) {
977 gCPU.vr[vrD].sh[i] = gCPU.vr[vrA].sh[i] >> (gCPU.vr[vrB].h[i] & 0xf);
978 }
979 }
980 JITCFlow ppc_opc_gen_vsrah()
981 {
982 #if 0
983 ppc_opc_gen_interpret(ppc_opc_vsrah);
984 return flowEndBlock;
985 #else
986 int vrD, vrA, vrB;
987 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
988
989 jitcFlushClientVectorRegister(vrA);
990 jitcFlushClientVectorRegister(vrB);
991 jitcDropClientVectorRegister(vrD);
992
993 jitcClobberCarryAndFlags();
994 jitcAllocRegister(NATIVE_REG | ECX);
995 NativeReg16 reg = (NativeReg16)jitcAllocRegister();
996
997 for (int i=0; i<16; i+=2) {
998 vec_getHalf(reg, vrA, i);
999 vec_getHalf(CX, vrB, i);
1000
1001 asmALU(X86_AND, CL, 0x0f);
1002 asmShift_CL(X86_SAR, reg);
1003
1004 vec_setHalf(vrD, i, reg);
1005 }
1006
1007 return flowContinue;
1008 #endif
1009 }
1010
1011 /* vsraw Vector Shift Right Arithmetic Word
1012 * v.255
1013 */
1014 void ppc_opc_vsraw()
1015 {
1016 VECTOR_DEBUG;
1017 int vrD, vrA, vrB;
1018 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1019 for (int i=0; i<4; i++) {
1020 gCPU.vr[vrD].sw[i] = gCPU.vr[vrA].sw[i] >> (gCPU.vr[vrB].w[i] & 0x1f);
1021 }
1022 }
1023 JITCFlow ppc_opc_gen_vsraw()
1024 {
1025 #if 0
1026 ppc_opc_gen_interpret(ppc_opc_vsraw);
1027 return flowEndBlock;
1028 #else
1029 int vrD, vrA, vrB;
1030 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1031
1032 jitcFlushClientVectorRegister(vrB);
1033 jitcDropClientVectorRegister(vrD);
1034
1035 jitcClobberCarryAndFlags();
1036 jitcAllocRegister(NATIVE_REG | ECX);
1037 NativeReg reg = jitcAllocRegister();
1038
1039 for (int i=0; i<16; i+=4) {
1040 vec_getWord(reg, vrA, i);
1041 vec_getWord(ECX, vrB, i);
1042
1043 // this instruction auto-masks to 0x1f
1044 asmShift_CL(X86_SAR, reg);
1045
1046 vec_setWord(vrD, i, reg);
1047 }
1048
1049 return flowContinue;
1050 #endif
1051 }
1052
1053 /* vslb Vector Shift Left Byte
1054 * v.240
1055 */
1056 void ppc_opc_vslb()
1057 {
1058 VECTOR_DEBUG;
1059 int vrD, vrA, vrB;
1060 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1061 for (int i=0; i<16; i++) {
1062 gCPU.vr[vrD].b[i] = gCPU.vr[vrA].b[i] << (gCPU.vr[vrB].b[i] & 0x7);
1063 }
1064 }
1065 JITCFlow ppc_opc_gen_vslb()
1066 {
1067 #if 0
1068 ppc_opc_gen_interpret(ppc_opc_vslb);
1069 return flowEndBlock;
1070 #else
1071 int vrD, vrA, vrB;
1072 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1073
1074 jitcFlushClientVectorRegister(vrA);
1075 jitcFlushClientVectorRegister(vrB);
1076 jitcDropClientVectorRegister(vrD);
1077
1078 jitcClobberCarryAndFlags();
1079 jitcAllocRegister(NATIVE_REG | ECX);
1080 NativeReg8 reg = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
1081
1082 for (int i=0; i<16; i++) {
1083 vec_getByte(reg, vrA, REG_NO, i);
1084 vec_getByte(CL, vrB, REG_NO, i);
1085
1086 asmALU(X86_AND, CL, 0x7);
1087 asmShift_CL(X86_SHL, reg);
1088
1089 vec_setByte(vrD, REG_NO, i, reg);
1090 }
1091
1092 return flowContinue;
1093 #endif
1094 }
1095
1096 /* vslh Vector Shift Left Half Word
1097 * v.242
1098 */
1099 void ppc_opc_vslh()
1100 {
1101 VECTOR_DEBUG;
1102 int vrD, vrA, vrB;
1103 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1104 for (int i=0; i<8; i++) {
1105 gCPU.vr[vrD].h[i] = gCPU.vr[vrA].h[i] << (gCPU.vr[vrB].h[i] & 0xf);
1106 }
1107 }
1108 JITCFlow ppc_opc_gen_vslh()
1109 {
1110 #if 0
1111 ppc_opc_gen_interpret(ppc_opc_vslh);
1112 return flowEndBlock;
1113 #else
1114 int vrD, vrA, vrB;
1115 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1116
1117 jitcFlushClientVectorRegister(vrA);
1118 jitcFlushClientVectorRegister(vrB);
1119 jitcDropClientVectorRegister(vrD);
1120
1121 jitcClobberCarryAndFlags();
1122 jitcAllocRegister(NATIVE_REG | ECX);
1123 NativeReg16 reg = (NativeReg16)jitcAllocRegister();
1124
1125 for (int i=0; i<16; i+=2) {
1126 vec_getHalf(reg, vrA, i);
1127 vec_getHalf(CX, vrB, i);
1128
1129 asmALU(X86_AND, CL, 0x0f);
1130 asmShift_CL(X86_SHL, reg);
1131
1132 vec_setHalf(vrD, i, reg);
1133 }
1134
1135 return flowContinue;
1136 #endif
1137 }
1138
1139 /* vslw Vector Shift Left Word
1140 * v.244
1141 */
1142 void ppc_opc_vslw()
1143 {
1144 VECTOR_DEBUG;
1145 int vrD, vrA, vrB;
1146 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1147 for (int i=0; i<4; i++) {
1148 gCPU.vr[vrD].w[i] = gCPU.vr[vrA].w[i] << (gCPU.vr[vrB].w[i] & 0x1f);
1149 }
1150 }
1151 JITCFlow ppc_opc_gen_vslw()
1152 {
1153 #if 0
1154 ppc_opc_gen_interpret(ppc_opc_vslw);
1155 return flowEndBlock;
1156 #else
1157 int vrD, vrA, vrB;
1158 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1159
1160 jitcFlushClientVectorRegister(vrA);
1161 jitcFlushClientVectorRegister(vrB);
1162 jitcDropClientVectorRegister(vrD);
1163
1164 jitcClobberCarryAndFlags();
1165 jitcAllocRegister(NATIVE_REG | ECX);
1166 NativeReg reg = jitcAllocRegister();
1167
1168 for (int i=0; i<16; i+=4) {
1169 vec_getWord(reg, vrA, i);
1170 vec_getWord(ECX, vrB, i);
1171
1172 // this instruction auto-masks to 0x1f
1173 asmShift_CL(X86_SHL, reg);
1174
1175 vec_setWord(vrD, i, reg);
1176 }
1177
1178 return flowContinue;
1179 #endif
1180 }
1181
1182 /* vsr Vector Shift Right
1183 * v.251
1184 */
1185 void ppc_opc_vsr()
1186 {
1187 VECTOR_DEBUG;
1188 int vrD, vrA, vrB;
1189 Vector_t r;
1190 int shift;
1191 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1192
1193 /* Specs say that the low-order 3 bits of all byte elements in vB
1194 * must be the same, or the result is undefined. So we can just
1195 * use the same low-order 3 bits for all of our shifts.
1196 */
1197 shift = gCPU.vr[vrB].w[0] & 0x7;
1198
1199 r.d[0] = gCPU.vr[vrA].d[0] >> shift;
1200 r.d[1] = gCPU.vr[vrA].d[1] >> shift;
1201
1202 VECT_D(r, 1) |= VECT_D(gCPU.vr[vrA], 0) << (64 - shift);
1203
1204 gCPU.vr[vrD] = r;
1205 }
1206 JITCFlow ppc_opc_gen_vsr()
1207 {
1208 ppc_opc_gen_interpret(ppc_opc_vsr);
1209 return flowEndBlock;
1210 }
1211
1212 /* vsro Vector Shift Right Octet
1213 * v.258
1214 */
1215 void ppc_opc_vsro()
1216 {
1217 VECTOR_DEBUG;
1218 int vrD, vrA, vrB;
1219 Vector_t r;
1220 int shift, i;
1221 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1222
1223 shift = (gCPU.vr[vrB].w[0] >> 3) & 0xf;
1224 #if HOST_ENDIANESS == HOST_ENDIANESS_LE
1225 for (i=0; i<(16-shift); i++) {
1226 r.b[i] = gCPU.vr[vrA].b[i+shift];
1227 }
1228
1229 for (; i<16; i++) {
1230 r.b[i] = 0;
1231 }
1232 #elif HOST_ENDIANESS == HOST_ENDIANESS_BE
1233 for (i=0; i<shift; i++) {
1234 r.b[i] = 0;
1235 }
1236
1237 for (; i<16; i++) {
1238 r.b[i] = gCPU.vr[vrA].b[i-shift];
1239 }
1240 #else
1241 #error Endianess not supported!
1242 #endif
1243
1244 gCPU.vr[vrD] = r;
1245 }
1246 JITCFlow ppc_opc_gen_vsro()
1247 {
1248 ppc_opc_gen_interpret(ppc_opc_vsro);
1249 return flowEndBlock;
1250 }
1251
1252 /* vsl Vector Shift Left
1253 * v.239
1254 */
1255 void ppc_opc_vsl()
1256 {
1257 VECTOR_DEBUG;
1258 int vrD, vrA, vrB;
1259 Vector_t r;
1260 int shift;
1261 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1262
1263 /* Specs say that the low-order 3 bits of all byte elements in vB
1264 * must be the same, or the result is undefined. So we can just
1265 * use the same low-order 3 bits for all of our shifts.
1266 */
1267 shift = gCPU.vr[vrB].w[0] & 0x7;
1268
1269 r.d[0] = gCPU.vr[vrA].d[0] << shift;
1270 r.d[1] = gCPU.vr[vrA].d[1] << shift;
1271
1272 VECT_D(r, 0) |= VECT_D(gCPU.vr[vrA], 1) >> (64 - shift);
1273
1274 gCPU.vr[vrD] = r;
1275 }
1276 JITCFlow ppc_opc_gen_vsl()
1277 {
1278 ppc_opc_gen_interpret(ppc_opc_vsl);
1279 return flowEndBlock;
1280 }
1281
1282 /* vslo Vector Shift Left Octet
1283 * v.243
1284 */
1285 void ppc_opc_vslo()
1286 {
1287 VECTOR_DEBUG;
1288 int vrD, vrA, vrB;
1289 Vector_t r;
1290 int shift, i;
1291 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1292
1293 shift = (gCPU.vr[vrB].w[0] >> 3) & 0xf;
1294 #if HOST_ENDIANESS == HOST_ENDIANESS_LE
1295 for (i=0; i<shift; i++) {
1296 r.b[i] = 0;
1297 }
1298
1299 for (; i<16; i++) {
1300 r.b[i] = gCPU.vr[vrA].b[i-shift];
1301 }
1302 #elif HOST_ENDIANESS == HOST_ENDIANESS_BE
1303 for (i=0; i<(16-shift); i++) {
1304 r.b[i] = gCPU.vr[vrA].b[i+shift];
1305 }
1306
1307 for (; i<16; i++) {
1308 r.b[i] = 0;
1309 }
1310 #else
1311 #error Endianess not supported!
1312 #endif
1313
1314 gCPU.vr[vrD] = r;
1315 }
1316 JITCFlow ppc_opc_gen_vslo()
1317 {
1318 ppc_opc_gen_interpret(ppc_opc_vslo);
1319 return flowEndBlock;
1320 }
1321
1322 static inline void ppc_opc_gen_helper_vsldoi_left(int vrA, int vrD, int shift, int pshift, NativeReg reg)
1323 {
1324 for (int i=12; i>=shift; i-=4) {
1325 vec_getWord(reg, vrA, i-shift);
1326
1327 vec_setWord(vrD, i, reg);
1328 }
1329
1330 if (pshift) {
1331 for (int i=shift+4-pshift; i>=shift; i--) {
1332 vec_getByte((NativeReg8)reg, vrA, REG_NO, i-shift);
1333
1334 vec_setByte(vrD, REG_NO, i, (NativeReg8)reg);
1335 }
1336 }
1337 }
1338
1339 static inline void ppc_opc_gen_helper_vsldoi_right(int vrB, int vrD, int shift, int ashift, int bshift, NativeReg reg)
1340 {
1341 for (int i=0; i<bshift; i+=4) {
1342 vec_getWord(reg, vrB, i+ashift);
1343
1344 vec_setWord(vrD, i, reg);
1345 }
1346
1347 for (int i=bshift; i<shift; i++) {
1348 vec_getByte((NativeReg8)reg, vrB, REG_NO, i+ashift);
1349
1350 vec_setByte(vrD, REG_NO, i, (NativeReg8)reg);
1351 }
1352 }
1353
1354 /* vsldoi Vector Shift Left Double by Octet Immediate
1355 * v.241
1356 */
1357 void ppc_opc_vsldoi()
1358 {
1359 VECTOR_DEBUG_COMMON;
1360 int vrD, vrA, vrB, shift, ashift;
1361 int i;
1362 Vector_t r;
1363 PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, shift);
1364
1365 shift &= 0xf;
1366 ashift = 16 - shift;
1367
1368 #if HOST_ENDIANESS == HOST_ENDIANESS_LE
1369 for (i=0; i<shift; i++) {
1370 r.b[i] = gCPU.vr[vrB].b[i+ashift];
1371 }
1372
1373 for (; i<16; i++) {
1374 r.b[i] = gCPU.vr[vrA].b[i-shift];
1375 }
1376 #elif HOST_ENDIANESS == HOST_ENDIANESS_BE
1377 for (i=0; i<ashift; i++) {
1378 r.b[i] = gCPU.vr[vrA].b[i+shift];
1379 }
1380
1381 for (; i<16; i++) {
1382 r.b[i] = gCPU.vr[vrB].b[i-ashift];
1383 }
1384 #else
1385 #error Endianess not supported!
1386 #endif
1387
1388 gCPU.vr[vrD] = r;
1389 }
1390 JITCFlow ppc_opc_gen_vsldoi()
1391 {
1392 #if 0
1393 ppc_opc_gen_interpret(ppc_opc_vsldoi);
1394 return flowEndBlock;
1395 #else
1396 int vrD, vrA, vrB;
1397 int shift, ashift, pshift, bshift;
1398 PPC_OPC_TEMPL_A(gJITC.current_opc, vrD, vrA, vrB, shift);
1399
1400 jitcFlushClientVectorRegister(vrA);
1401 jitcFlushClientVectorRegister(vrB);
1402 jitcDropClientVectorRegister(vrD);
1403
1404 shift &= 0xf;
1405 ashift = 16 - shift;
1406 bshift = (shift & ~0x03);
1407 pshift = shift & 0x03;
1408
1409 NativeReg reg = jitcAllocRegister(NATIVE_REG_8);
1410
1411 if ((vrD == vrA) && (vrD != vrB)) {
1412 ppc_opc_gen_helper_vsldoi_left(vrA, vrD, shift, pshift, reg);
1413
1414 ppc_opc_gen_helper_vsldoi_right(vrB, vrD, shift, ashift, bshift, reg);
1415
1416 return flowContinue;
1417 }
1418
1419 if (vrD == vrA) {
1420 jitcDropClientVectorRegister(vrT);
1421 ppc_opc_gen_helper_vsldoi_right(vrB, vrT, shift, ashift, bshift, reg);
1422 } else {
1423 ppc_opc_gen_helper_vsldoi_right(vrB, vrD, shift, ashift, bshift, reg);
1424 }
1425
1426 ppc_opc_gen_helper_vsldoi_left(vrA, vrD, shift, pshift, reg);
1427
1428 if (vrD == vrA) {
1429 for (int i=0; i<bshift; i+=4) {
1430 vec_getWord(reg, vrT, i);
1431 vec_setWord(vrD, i, reg);
1432 }
1433
1434 for (int i=bshift; i<shift; i++) {
1435 vec_getByte((NativeReg8)reg, vrT, REG_NO, i);
1436 vec_setByte(vrD, REG_NO, i, (NativeReg8)reg);
1437 }
1438 }
1439
1440 return flowContinue;
1441 #endif
1442 }
1443
1444 /* vrlb Vector Rotate Left Byte
1445 * v.234
1446 */
1447 void ppc_opc_vrlb()
1448 {
1449 VECTOR_DEBUG;
1450 int vrD, vrA, vrB, shift;
1451 Vector_t r;
1452 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1453
1454 for (int i=0; i<16; i++) {
1455 shift = (gCPU.vr[vrB].b[i] & 0x7);
1456
1457 r.b[i] = gCPU.vr[vrA].b[i] << shift;
1458 r.b[i] |= gCPU.vr[vrA].b[i] >> (8 - shift);
1459 }
1460
1461 gCPU.vr[vrD] = r;
1462 }
1463 JITCFlow ppc_opc_gen_vrlb()
1464 {
1465 #if 0
1466 ppc_opc_gen_interpret(ppc_opc_vrlb);
1467 return flowEndBlock;
1468 #else
1469
1470 int vrD, vrA, vrB;
1471 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1472
1473 jitcFlushClientVectorRegister(vrA);
1474 jitcFlushClientVectorRegister(vrB);
1475 jitcDropClientVectorRegister(vrD);
1476
1477 jitcClobberCarryAndFlags();
1478 jitcAllocRegister(NATIVE_REG | ECX);
1479 NativeReg8 reg = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
1480
1481 for (int i=0; i<16; i++) {
1482 vec_getByte(reg, vrA, REG_NO, i);
1483 vec_getByte(CL, vrB, REG_NO, i);
1484
1485 asmShift_CL(X86_ROL, reg);
1486
1487 vec_setByte(vrD, REG_NO, i, reg);
1488 }
1489
1490 return flowContinue;
1491 #endif
1492 }
1493
1494 /* vrlh Vector Rotate Left Half Word
1495 * v.235
1496 */
1497 void ppc_opc_vrlh()
1498 {
1499 VECTOR_DEBUG;
1500 int vrD, vrA, vrB, shift;
1501 Vector_t r;
1502 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1503
1504 for (int i=0; i<8; i++) {
1505 shift = (gCPU.vr[vrB].h[i] & 0xf);
1506
1507 r.h[i] = gCPU.vr[vrA].h[i] << shift;
1508 r.h[i] |= gCPU.vr[vrA].h[i] >> (16 - shift);
1509 }
1510
1511 gCPU.vr[vrD] = r;
1512 }
1513 JITCFlow ppc_opc_gen_vrlh()
1514 {
1515 ppc_opc_gen_interpret(ppc_opc_vrlh);
1516 return flowEndBlock;
1517 }
1518
1519 /* vrlw Vector Rotate Left Word
1520 * v.236
1521 */
1522 void ppc_opc_vrlw()
1523 {
1524 VECTOR_DEBUG;
1525 int vrD, vrA, vrB, shift;
1526 Vector_t r;
1527 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1528
1529 for (int i=0; i<4; i++) {
1530 shift = (gCPU.vr[vrB].w[i] & 0x1F);
1531
1532 r.w[i] = gCPU.vr[vrA].w[i] << shift;
1533 r.w[i] |= gCPU.vr[vrA].w[i] >> (32 - shift);
1534 }
1535
1536 gCPU.vr[vrD] = r;
1537 }
1538 JITCFlow ppc_opc_gen_vrlw()
1539 {
1540 ppc_opc_gen_interpret(ppc_opc_vrlw);
1541 return flowEndBlock;
1542 }
1543
1544 /* With the merges, I just don't see any point in risking that a compiler
1545 * might generate actual alu code to calculate anything when it's
1546 * compile-time known. Plus, it's easier to validate it like this.
1547 */
1548
1549 /* vmrghb Vector Merge High Byte
1550 * v.195
1551 */
1552 void ppc_opc_vmrghb()
1553 {
1554 VECTOR_DEBUG;
1555 int vrD, vrA, vrB;
1556 Vector_t r;
1557 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1558
1559 VECT_B(r, 0) = VECT_B(gCPU.vr[vrA], 0);
1560 VECT_B(r, 1) = VECT_B(gCPU.vr[vrB], 0);
1561 VECT_B(r, 2) = VECT_B(gCPU.vr[vrA], 1);
1562 VECT_B(r, 3) = VECT_B(gCPU.vr[vrB], 1);
1563 VECT_B(r, 4) = VECT_B(gCPU.vr[vrA], 2);
1564 VECT_B(r, 5) = VECT_B(gCPU.vr[vrB], 2);
1565 VECT_B(r, 6) = VECT_B(gCPU.vr[vrA], 3);
1566 VECT_B(r, 7) = VECT_B(gCPU.vr[vrB], 3);
1567 VECT_B(r, 8) = VECT_B(gCPU.vr[vrA], 4);
1568 VECT_B(r, 9) = VECT_B(gCPU.vr[vrB], 4);
1569 VECT_B(r,10) = VECT_B(gCPU.vr[vrA], 5);
1570 VECT_B(r,11) = VECT_B(gCPU.vr[vrB], 5);
1571 VECT_B(r,12) = VECT_B(gCPU.vr[vrA], 6);
1572 VECT_B(r,13) = VECT_B(gCPU.vr[vrB], 6);
1573 VECT_B(r,14) = VECT_B(gCPU.vr[vrA], 7);
1574 VECT_B(r,15) = VECT_B(gCPU.vr[vrB], 7);
1575
1576 gCPU.vr[vrD] = r;
1577 }
1578 JITCFlow ppc_opc_gen_vmrghb()
1579 {
1580 #if 0
1581 ppc_opc_gen_interpret(ppc_opc_vmrghb);
1582 return flowEndBlock;
1583 #else
1584 int vrD, vrA, vrB;
1585 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1586
1587 if (SSE2_AVAIL) {
1588 noncommutative_operation(PALUB(X86_PUNPCKH), vrD, vrB, vrA);
1589 return flowContinue;
1590 }
1591
1592 jitcFlushClientVectorRegister(vrA);
1593 jitcFlushClientVectorRegister(vrB);
1594 jitcDropClientVectorRegister(vrD);
1595
1596 NativeReg8 reg1 = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
1597 NativeReg8 reg2 = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
1598
1599 vec_getHalf((NativeReg16)reg1, vrB, 8);
1600 vec_getHalf((NativeReg16)reg2, vrA, 8);
1601 asmALU(X86_XCHG, RL2RH(reg1), reg2);
1602 vec_setHalf(vrD, 0, (NativeReg16)reg1);
1603 vec_setHalf(vrD, 2, (NativeReg16)reg2);
1604
1605 vec_getHalf((NativeReg16)reg1, vrB, 10);
1606 vec_getHalf((NativeReg16)reg2, vrA, 10);
1607 asmALU(X86_XCHG, RL2RH(reg1), reg2);
1608 vec_setHalf(vrD, 4, (NativeReg16)reg1);
1609 vec_setHalf(vrD, 6, (NativeReg16)reg2);
1610
1611 vec_getHalf((NativeReg16)reg1, vrB, 12);
1612 vec_getHalf((NativeReg16)reg2, vrA, 12);
1613 asmALU(X86_XCHG, RL2RH(reg1), reg2);
1614 vec_setHalf(vrD, 8, (NativeReg16)reg1);
1615 vec_setHalf(vrD, 10, (NativeReg16)reg2);
1616
1617 vec_getHalf((NativeReg16)reg1, vrB, 14);
1618 vec_getHalf((NativeReg16)reg2, vrA, 14);
1619 asmALU(X86_XCHG, RL2RH(reg1), reg2);
1620 vec_setHalf(vrD, 12, (NativeReg16)reg1);
1621 vec_setHalf(vrD, 14, (NativeReg16)reg2);
1622
1623 return flowContinue;
1624 #endif
1625 }
1626
1627 /* vmrghh Vector Merge High Half Word
1628 * v.196
1629 */
1630 void ppc_opc_vmrghh()
1631 {
1632 VECTOR_DEBUG;
1633 int vrD, vrA, vrB;
1634 Vector_t r;
1635 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1636
1637 VECT_H(r, 0) = VECT_H(gCPU.vr[vrA], 0);
1638 VECT_H(r, 1) = VECT_H(gCPU.vr[vrB], 0);
1639 VECT_H(r, 2) = VECT_H(gCPU.vr[vrA], 1);
1640 VECT_H(r, 3) = VECT_H(gCPU.vr[vrB], 1);
1641 VECT_H(r, 4) = VECT_H(gCPU.vr[vrA], 2);
1642 VECT_H(r, 5) = VECT_H(gCPU.vr[vrB], 2);
1643 VECT_H(r, 6) = VECT_H(gCPU.vr[vrA], 3);
1644 VECT_H(r, 7) = VECT_H(gCPU.vr[vrB], 3);
1645
1646 gCPU.vr[vrD] = r;
1647 }
1648 JITCFlow ppc_opc_gen_vmrghh()
1649 {
1650 #if 0
1651 ppc_opc_gen_interpret(ppc_opc_vmrghh);
1652 return flowEndBlock;
1653 #else
1654 int vrD, vrA, vrB;
1655 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1656
1657 if (SSE2_AVAIL) {
1658 noncommutative_operation(PALUW(X86_PUNPCKH), vrD, vrB, vrA);
1659 return flowContinue;
1660 }
1661
1662 jitcFlushClientVectorRegister(vrA);
1663 jitcFlushClientVectorRegister(vrB);
1664 jitcDropClientVectorRegister(vrD);
1665
1666 NativeReg16 reg1 = (NativeReg16)jitcAllocRegister();
1667
1668 vec_getHalf(reg1, vrB, 8);
1669 vec_setHalf(vrD, 0, reg1);
1670 vec_getHalf(reg1, vrB, 10);
1671 vec_setHalf(vrD, 4, reg1);
1672
1673 vec_getHalf(reg1, vrA, 8);
1674 vec_setHalf(vrD, 2, reg1);
1675 vec_getHalf(reg1, vrA, 10);
1676 vec_setHalf(vrD, 6, reg1);
1677
1678 vec_getHalf(reg1, vrB, 12);
1679 vec_setHalf(vrD, 8, reg1);
1680
1681 vec_getHalf(reg1, vrA, 12);
1682 vec_setHalf(vrD, 10, reg1);
1683
1684 vec_getHalf(reg1, vrB, 14);
1685 vec_setHalf(vrD, 12, reg1);
1686
1687 if (vrD != vrA) {
1688 vec_getHalf(reg1, vrA, 14);
1689 vec_setHalf(vrD, 14, reg1);
1690 }
1691
1692 return flowContinue;
1693 #endif
1694 }
1695
1696 /* vmrghw Vector Merge High Word
1697 * v.197
1698 */
1699 void ppc_opc_vmrghw()
1700 {
1701 VECTOR_DEBUG;
1702 int vrD, vrA, vrB;
1703 Vector_t r;
1704 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1705
1706 VECT_W(r, 0) = VECT_W(gCPU.vr[vrA], 0);
1707 VECT_W(r, 1) = VECT_W(gCPU.vr[vrB], 0);
1708 VECT_W(r, 2) = VECT_W(gCPU.vr[vrA], 1);
1709 VECT_W(r, 3) = VECT_W(gCPU.vr[vrB], 1);
1710
1711 gCPU.vr[vrD] = r;
1712 }
1713 JITCFlow ppc_opc_gen_vmrghw()
1714 {
1715 #if 0
1716 ppc_opc_gen_interpret(ppc_opc_vmrghw);
1717 return flowEndBlock;
1718 #else
1719 int vrD, vrA, vrB;
1720 modrm_o modrm;
1721 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1722
1723 if (SSE2_AVAIL) {
1724 noncommutative_operation(PALUD(X86_PUNPCKH), vrD, vrB, vrA);
1725 return flowContinue;
1726 }
1727
1728 if (SSE_AVAIL) {
1729 NativeVectorReg dest, src;
1730 int shuf_map = 0x72;
1731 int vrS = vrB;
1732
1733 if (vrA == vrD) {
1734 dest = jitcGetClientVectorRegisterDirty(vrA);
1735 src = jitcGetClientVectorRegisterMapping(vrB);
1736 } else if (vrB == vrD) {
1737 dest = jitcGetClientVectorRegisterDirty(vrB);
1738 src = jitcGetClientVectorRegisterMapping(vrA);
1739 shuf_map = 0xd8;
1740 vrS = vrA;
1741 } else {
1742 dest = jitcMapClientVectorRegisterDirty(vrD);
1743 src = jitcGetClientVectorRegisterMapping(vrA);
1744
1745 if (src == VECTREG_NO)
1746 asmMOVAPS(dest, &gCPU.vr[vrA]);
1747 else
1748 asmALUPS(X86_MOVAPS, dest, src);
1749
1750 src = jitcGetClientVectorRegisterMapping(vrB);
1751 }
1752
1753 if (src == VECTREG_NO) {
1754 asmSHUFPS(dest, x86_mem2(modrm, &gCPU.vr[vrS]), 0xee);
1755 } else {
1756 asmSHUFPS(dest, src, 0xee);
1757 }
1758
1759 asmSHUFPS(dest, dest, shuf_map);
1760
1761 return flowContinue;
1762 }
1763
1764 jitcFlushClientVectorRegister(vrA);
1765 jitcFlushClientVectorRegister(vrB);
1766 jitcDropClientVectorRegister(vrD);
1767
1768 NativeReg reg = jitcAllocRegister();
1769 JitcVectorReg vrTMP = vrD;
1770
1771 if (vrA == vrD || vrB == vrD) {
1772 vrTMP = vrT;
1773 }
1774
1775 vec_getWord(reg, vrA, 12);
1776 vec_setWord(vrTMP, 12, reg);
1777
1778 vec_getWord(reg, vrB, 12);
1779 vec_setWord(vrTMP, 8, reg);
1780
1781 vec_getWord(reg, vrA, 8);
1782 vec_setWord(vrTMP, 4, reg);
1783
1784 vec_getWord(reg, vrB, 8);
1785 vec_setWord(vrTMP, 0, reg);
1786
1787 vec_Copy(vrD, vrTMP);
1788
1789 return flowContinue;
1790 #endif
1791 }
1792
1793 /* vmrglb Vector Merge Low Byte
1794 * v.198
1795 */
1796 void ppc_opc_vmrglb()
1797 {
1798 VECTOR_DEBUG;
1799 int vrD, vrA, vrB;
1800 Vector_t r;
1801 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1802
1803 VECT_B(r, 0) = VECT_B(gCPU.vr[vrA], 8);
1804 VECT_B(r, 1) = VECT_B(gCPU.vr[vrB], 8);
1805 VECT_B(r, 2) = VECT_B(gCPU.vr[vrA], 9);
1806 VECT_B(r, 3) = VECT_B(gCPU.vr[vrB], 9);
1807 VECT_B(r, 4) = VECT_B(gCPU.vr[vrA],10);
1808 VECT_B(r, 5) = VECT_B(gCPU.vr[vrB],10);
1809 VECT_B(r, 6) = VECT_B(gCPU.vr[vrA],11);
1810 VECT_B(r, 7) = VECT_B(gCPU.vr[vrB],11);
1811 VECT_B(r, 8) = VECT_B(gCPU.vr[vrA],12);
1812 VECT_B(r, 9) = VECT_B(gCPU.vr[vrB],12);
1813 VECT_B(r,10) = VECT_B(gCPU.vr[vrA],13);
1814 VECT_B(r,11) = VECT_B(gCPU.vr[vrB],13);
1815 VECT_B(r,12) = VECT_B(gCPU.vr[vrA],14);
1816 VECT_B(r,13) = VECT_B(gCPU.vr[vrB],14);
1817 VECT_B(r,14) = VECT_B(gCPU.vr[vrA],15);
1818 VECT_B(r,15) = VECT_B(gCPU.vr[vrB],15);
1819
1820 gCPU.vr[vrD] = r;
1821 }
1822 JITCFlow ppc_opc_gen_vmrglb()
1823 {
1824 #if 0
1825 ppc_opc_gen_interpret(ppc_opc_vmrglb);
1826 return flowEndBlock;
1827 #else
1828 int vrD, vrA, vrB;
1829 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1830
1831 if (SSE2_AVAIL) {
1832 noncommutative_operation(PALUB(X86_PUNPCKL), vrD, vrB, vrA);
1833 return flowContinue;
1834 }
1835
1836 jitcFlushClientVectorRegister(vrA);
1837 jitcFlushClientVectorRegister(vrB);
1838 jitcDropClientVectorRegister(vrD);
1839
1840 NativeReg8 reg1 = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
1841 NativeReg8 reg2 = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
1842
1843 vec_getHalf((NativeReg16)reg1, vrB, 6);
1844 vec_getHalf((NativeReg16)reg2, vrA, 6);
1845 asmALU(X86_XCHG, RL2RH(reg1), reg2);
1846 vec_setHalf(vrD, 12, (NativeReg16)reg1);
1847 vec_setHalf(vrD, 14, (NativeReg16)reg2);
1848
1849 vec_getHalf((NativeReg16)reg1, vrB, 4);
1850 vec_getHalf((NativeReg16)reg2, vrA, 4);
1851 asmALU(X86_XCHG, RL2RH(reg1), reg2);
1852 vec_setHalf(vrD, 8, (NativeReg16)reg1);
1853 vec_setHalf(vrD, 10, (NativeReg16)reg2);
1854
1855 vec_getHalf((NativeReg16)reg1, vrB, 2);
1856 vec_getHalf((NativeReg16)reg2, vrA, 2);
1857 asmALU(X86_XCHG, RL2RH(reg1), reg2);
1858 vec_setHalf(vrD, 4, (NativeReg16)reg1);
1859 vec_setHalf(vrD, 6, (NativeReg16)reg2);
1860
1861 vec_getHalf((NativeReg16)reg1, vrB, 0);
1862 vec_getHalf((NativeReg16)reg2, vrA, 0);
1863 asmALU(X86_XCHG, RL2RH(reg1), reg2);
1864 vec_setHalf(vrD, 0, (NativeReg16)reg1);
1865 vec_setHalf(vrD, 2, (NativeReg16)reg2);
1866
1867 return flowContinue;
1868 #endif
1869 }
1870
1871 /* vmrglh Vector Merge Low Half Word
1872 * v.199
1873 */
1874 void ppc_opc_vmrglh()
1875 {
1876 VECTOR_DEBUG;
1877 int vrD, vrA, vrB;
1878 Vector_t r;
1879 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1880
1881 VECT_H(r, 0) = VECT_H(gCPU.vr[vrA], 4);
1882 VECT_H(r, 1) = VECT_H(gCPU.vr[vrB], 4);
1883 VECT_H(r, 2) = VECT_H(gCPU.vr[vrA], 5);
1884 VECT_H(r, 3) = VECT_H(gCPU.vr[vrB], 5);
1885 VECT_H(r, 4) = VECT_H(gCPU.vr[vrA], 6);
1886 VECT_H(r, 5) = VECT_H(gCPU.vr[vrB], 6);
1887 VECT_H(r, 6) = VECT_H(gCPU.vr[vrA], 7);
1888 VECT_H(r, 7) = VECT_H(gCPU.vr[vrB], 7);
1889
1890 gCPU.vr[vrD] = r;
1891 }
1892 JITCFlow ppc_opc_gen_vmrglh()
1893 {
1894 #if 0
1895 ppc_opc_gen_interpret(ppc_opc_vmrglh);
1896 return flowEndBlock;
1897 #else
1898 int vrD, vrA, vrB;
1899 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1900
1901 if (SSE2_AVAIL) {
1902 noncommutative_operation(PALUW(X86_PUNPCKL), vrD, vrB, vrA);
1903 return flowContinue;
1904 }
1905
1906 jitcFlushClientVectorRegister(vrA);
1907 jitcFlushClientVectorRegister(vrB);
1908 jitcDropClientVectorRegister(vrD);
1909
1910 NativeReg16 reg1 = (NativeReg16)jitcAllocRegister();
1911
1912 vec_getHalf(reg1, vrA, 6);
1913 vec_setHalf(vrD, 14, reg1);
1914 vec_getHalf(reg1, vrA, 4);
1915 vec_setHalf(vrD, 10, reg1);
1916
1917 vec_getHalf(reg1, vrB, 6);
1918 vec_setHalf(vrD, 12, reg1);
1919 vec_getHalf(reg1, vrB, 4);
1920 vec_setHalf(vrD, 8, reg1);
1921
1922 vec_getHalf(reg1, vrA, 2);
1923 vec_setHalf(vrD, 6, reg1);
1924
1925 vec_getHalf(reg1, vrB, 2);
1926 vec_setHalf(vrD, 4, reg1);
1927
1928 vec_getHalf(reg1, vrA, 0);
1929 vec_setHalf(vrD, 2, reg1);
1930
1931 if (vrD != vrB) {
1932 vec_getHalf(reg1, vrB, 0);
1933 vec_setHalf(vrD, 0, reg1);
1934 }
1935
1936 return flowContinue;
1937 #endif
1938 }
1939
1940 /* vmrglw Vector Merge Low Word
1941 * v.200
1942 */
1943 void ppc_opc_vmrglw()
1944 {
1945 VECTOR_DEBUG;
1946 int vrD, vrA, vrB;
1947 Vector_t r;
1948 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1949
1950 VECT_W(r, 0) = VECT_W(gCPU.vr[vrA], 2);
1951 VECT_W(r, 1) = VECT_W(gCPU.vr[vrB], 2);
1952 VECT_W(r, 2) = VECT_W(gCPU.vr[vrA], 3);
1953 VECT_W(r, 3) = VECT_W(gCPU.vr[vrB], 3);
1954
1955 gCPU.vr[vrD] = r;
1956 }
1957 JITCFlow ppc_opc_gen_vmrglw()
1958 {
1959 #if 0
1960 ppc_opc_gen_interpret(ppc_opc_vmrglw);
1961 return flowEndBlock;
1962 #else
1963 int vrD, vrA, vrB;
1964 modrm_o modrm;
1965 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1966
1967 if (SSE2_AVAIL) {
1968 noncommutative_operation(PALUD(X86_PUNPCKL), vrD, vrB, vrA);
1969 return flowContinue;
1970 }
1971
1972 if (SSE_AVAIL) {
1973 NativeVectorReg dest, src;
1974 int shuf_map = 0x72;
1975 int vrS = vrB;
1976
1977 if (vrA == vrD) {
1978 dest = jitcGetClientVectorRegisterDirty(vrA);
1979 src = jitcGetClientVectorRegisterMapping(vrB);
1980 } else if (vrB == vrD) {
1981 dest = jitcGetClientVectorRegisterDirty(vrB);
1982 src = jitcGetClientVectorRegisterMapping(vrA);
1983 shuf_map = 0xd8;
1984 vrS = vrA;
1985 } else {
1986 dest = jitcGetClientVectorRegister(vrA);
1987 jitcRenameVectorRegisterDirty(dest, vrD);
1988
1989 src = jitcGetClientVectorRegisterMapping(vrB);
1990 }
1991
1992 if (src == VECTREG_NO) {
1993 asmSHUFPS(dest, x86_mem2(modrm, &gCPU.vr[vrS]), 0x44);
1994 } else {
1995 asmSHUFPS(dest, src, 0x44);
1996 }
1997
1998 asmSHUFPS(dest, dest, shuf_map);
1999
2000 return flowContinue;
2001 }
2002
2003 NativeReg reg = jitcAllocRegister();
2004
2005 jitcFlushClientVectorRegister(vrA);
2006 jitcFlushClientVectorRegister(vrB);
2007 jitcDropClientVectorRegister(vrD);
2008
2009 vec_getWord(reg, vrA, 4);
2010 vec_setWord(vrD, 12, reg);
2011
2012 vec_getWord(reg, vrB, 4);
2013 vec_setWord(vrD, 8, reg);
2014
2015 vec_getWord(reg, vrA, 0);
2016 vec_setWord(vrD, 4, reg);
2017
2018 if (vrB != vrD) {
2019 vec_getWord(reg, vrB, 0);
2020 vec_setWord(vrD, 0, reg);
2021 }
2022
2023 return flowContinue;
2024 #endif
2025 }
2026
2027 /* vspltb Vector Splat Byte
2028 * v.245
2029 */
2030 void ppc_opc_vspltb()
2031 {
2032 VECTOR_DEBUG;
2033 int vrD, vrB;
2034 uint32 uimm;
2035 uint64 val;
2036 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, uimm, vrB);
2037
2038 /* The documentation doesn't stipulate what a value higher than 0xf
2039 * will do. Thus, this is by default an undefined value. We
2040 * are thus doing this the fastest way that won't crash us.
2041 */
2042 val = VECT_B(gCPU.vr[vrB], uimm & 0xf);
2043 val |= (val << 8);
2044 val |= (val << 16);
2045 val |= (val << 32);
2046
2047 gCPU.vr[vrD].d[0] = val;
2048 gCPU.vr[vrD].d[1] = val;
2049 }
2050 JITCFlow ppc_opc_gen_vspltb()
2051 {
2052 #if 0
2053 ppc_opc_gen_interpret(ppc_opc_vspltb);
2054 return flowEndBlock;
2055 #else
2056 int vrD, vrB;
2057 uint32 uimm;
2058 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, uimm, vrB);
2059
2060 jitcFlushClientVectorRegister(vrB);
2061 jitcDropClientVectorRegister(vrD);
2062
2063 jitcClobberCarryAndFlags();
2064 NativeReg reg1 = jitcAllocRegister(NATIVE_REG_8);
2065 NativeReg reg2 = jitcAllocRegister();
2066
2067 /* The documentation doesn't stipulate what a value higher than 0xf
2068 * will do. Thus, this is by default an undefined value. We
2069 * are thus doing this the fastest way that won't crash us.
2070 */
2071 vec_getByteZ(reg1, vrB, (15 - (uimm & 0xf)));
2072 asmALU(X86_MOV, RL2RH(reg1), (NativeReg8)reg1);
2073
2074 asmALU(X86_MOV, reg2, reg1);
2075 asmShift(X86_SHL, reg2, 16);
2076 asmALU(X86_OR, reg1, reg2);
2077
2078 vec_setWord(vrD, 0, reg1);
2079 vec_setWord(vrD, 4, reg1);
2080 vec_setWord(vrD, 8, reg1);
2081 vec_setWord(vrD, 12, reg1);
2082
2083 return flowContinue;
2084 #endif
2085 }
2086
2087 /* vsplth Vector Splat Half Word
2088 * v.246
2089 */
2090 void ppc_opc_vsplth()
2091 {
2092 VECTOR_DEBUG;
2093 int vrD, vrB;
2094 uint32 uimm;
2095 uint64 val;
2096 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, uimm, vrB);
2097
2098 /* The documentation doesn't stipulate what a value higher than 0x7
2099 * will do. Thus, this is by default an undefined value. We
2100 * are thus doing this the fastest way that won't crash us.
2101 */
2102 val = VECT_H(gCPU.vr[vrB], uimm & 0x7);
2103 val |= (val << 16);
2104 val |= (val << 32);
2105
2106 gCPU.vr[vrD].d[0] = val;
2107 gCPU.vr[vrD].d[1] = val;
2108 }
2109 JITCFlow ppc_opc_gen_vsplth()
2110 {
2111 #if 0
2112 ppc_opc_gen_interpret(ppc_opc_vsplth);
2113 return flowEndBlock;
2114 #else
2115 int vrD, vrB;
2116 uint32 uimm;
2117 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, uimm, vrB);
2118
2119 jitcFlushClientVectorRegister(vrB);
2120 jitcDropClientVectorRegister(vrD);
2121
2122 jitcClobberCarryAndFlags();
2123 NativeReg reg1 = jitcAllocRegister();
2124 NativeReg reg2 = jitcAllocRegister();
2125
2126 /* The documentation doesn't stipulate what a value higher than 0x7
2127 * will do. Thus, this is by default an undefined value. We
2128 * are thus doing this the fastest way that won't crash us.
2129 */
2130 vec_getHalfZ(reg1, vrB, (7 - (uimm & 0x7)) << 1);
2131
2132 asmALU(X86_MOV, reg2, reg1);
2133 asmShift(X86_SHL, reg2, 16);
2134 asmALU(X86_OR, reg1, reg2);
2135
2136 vec_setWord(vrD, 0, reg1);
2137 vec_setWord(vrD, 4, reg1);
2138 vec_setWord(vrD, 8, reg1);
2139 vec_setWord(vrD, 12, reg1);
2140
2141 return flowContinue;
2142 #endif
2143 }
2144
2145 /* vspltw Vector Splat Word
2146 * v.250
2147 */
2148 void ppc_opc_vspltw()
2149 {
2150 VECTOR_DEBUG;
2151 int vrD, vrB;
2152 uint32 uimm;
2153 uint64 val;
2154 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, uimm, vrB);
2155
2156 /* The documentation doesn't stipulate what a value higher than 0x3
2157 * will do. Thus, this is by default an undefined value. We
2158 * are thus doing this the fastest way that won't crash us.
2159 */
2160 val = VECT_W(gCPU.vr[vrB], uimm & 0x3);
2161 val |= (val << 32);
2162
2163 gCPU.vr[vrD].d[0] = val;
2164 gCPU.vr[vrD].d[1] = val;
2165 }
2166 JITCFlow ppc_opc_gen_vspltw()
2167 {
2168 #if 0
2169 ppc_opc_gen_interpret(ppc_opc_vspltw);
2170 return flowEndBlock;
2171 #else
2172 int vrD, vrB;
2173 uint32 uimm;
2174 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, uimm, vrB);
2175
2176 if (SSE_AVAIL) {
2177 NativeVectorReg reg;
2178
2179 if (vrD == vrB) {
2180 reg = jitcGetClientVectorRegisterDirty(vrD);
2181 } else {
2182 reg = jitcMapClientVectorRegisterDirty(vrD);
2183 NativeVectorReg reg2 = jitcGetClientVectorRegisterMapping(vrB);
2184
2185 if (reg2 == VECTREG_NO)
2186 asmMOVAPS(reg, &gCPU.vr[vrB]);
2187 else
2188 asmALUPS(X86_MOVAPS, reg, reg2);
2189 }
2190
2191 asmSHUFPS(reg, reg, 0xff - ((uimm & 0x3) * 0x55));
2192
2193 return flowContinue;
2194 }
2195
2196 jitcFlushClientVectorRegister(vrB);
2197 jitcDropClientVectorRegister(vrD);
2198
2199 jitcClobberCarryAndFlags();
2200 NativeReg reg1 = jitcAllocRegister();
2201
2202 /* The documentation doesn't stipulate what a value higher than 0xf
2203 * will do. Thus, this is by default an undefined value. We
2204 * are thus doing this the fastest way that won't crash us.
2205 */
2206 vec_getWord(reg1, vrB, (3 - (uimm & 0x3)) << 2);
2207
2208 vec_setWord(vrD, 0, reg1);
2209 vec_setWord(vrD, 4, reg1);
2210 vec_setWord(vrD, 8, reg1);
2211 vec_setWord(vrD, 12, reg1);
2212
2213 return flowContinue;
2214 #endif
2215 }
2216
2217 /* vspltisb Vector Splat Immediate Signed Byte
2218 * v.247
2219 */
2220 void ppc_opc_vspltisb()
2221 {
2222 VECTOR_DEBUG_COMMON;
2223 int vrD, vrB;
2224 uint32 simm;
2225 uint64 val;
2226 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, simm, vrB);
2227 PPC_OPC_ASSERT(vrB==0);
2228
2229 val = (simm & 0x10) ? (simm | 0xE0) : simm;
2230 val |= (val << 8);
2231 val |= (val << 16);
2232 val |= (val << 32);
2233
2234 gCPU.vr[vrD].d[0] = val;
2235 gCPU.vr[vrD].d[1] = val;
2236 }
2237 JITCFlow ppc_opc_gen_vspltisb()
2238 {
2239 #if 0
2240 ppc_opc_gen_interpret(ppc_opc_vspltisb);
2241 return flowEndBlock;
2242 #else
2243 int vrD, vrB;
2244 uint32 simm, val;
2245 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, simm, vrB);
2246
2247 if (simm == 0) {
2248 vec_Zero(vrD);
2249 return flowContinue;
2250 } else if (simm == 0x1f) {
2251 vec_Neg1(vrD);
2252 return flowContinue;
2253 }
2254
2255 val = (simm & 0x10) ? (simm | 0xE0) : simm;
2256 val |= (val << 8);
2257 val |= (val << 16);
2258
2259 if (SSE_AVAIL) {
2260 modrm_o modrm;
2261 NativeVectorReg reg1 = jitcMapClientVectorRegisterDirty(vrD);
2262
2263 asmALU_D(X86_MOV, x86_mem2(modrm, &gCPU.vtemp), val);
2264 asmMOVSS(reg1, &gCPU.vtemp);
2265 asmSHUFPS(reg1, reg1, 0x00);
2266
2267 return flowContinue;
2268 }
2269
2270 jitcDropClientVectorRegister(vrD);
2271
2272 NativeReg r1 = jitcAllocRegister();
2273 asmALU(X86_MOV, r1, val);
2274
2275 vec_setWord(vrD, 0, r1);
2276 vec_setWord(vrD, 4, r1);
2277 vec_setWord(vrD, 8, r1);
2278 vec_setWord(vrD,12, r1);
2279
2280 return flowContinue;
2281 #endif
2282 }
2283
2284 /* vspltish Vector Splat Immediate Signed Half Word
2285 * v.248
2286 */
2287 void ppc_opc_vspltish()
2288 {
2289 VECTOR_DEBUG_COMMON;
2290 int vrD, vrB;
2291 uint32 simm;
2292 uint64 val;
2293 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, simm, vrB);
2294 PPC_OPC_ASSERT(vrB==0);
2295
2296 val = (simm & 0x10) ? (simm | 0xFFE0) : simm;
2297 val |= (val << 16);
2298 val |= (val << 32);
2299
2300 gCPU.vr[vrD].d[0] = val;
2301 gCPU.vr[vrD].d[1] = val;
2302 }
2303 JITCFlow ppc_opc_gen_vspltish()
2304 {
2305 #if 0
2306 ppc_opc_gen_interpret(ppc_opc_vspltish);
2307 return flowEndBlock;
2308 #else
2309 int vrD, vrB;
2310 uint32 simm, val;
2311 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, simm, vrB);
2312
2313 if (simm == 0) {
2314 vec_Zero(vrD);
2315 return flowContinue;
2316 } else if (simm == 0x1f) {
2317 vec_Neg1(vrD);
2318 return flowContinue;
2319 }
2320
2321 val = (simm & 0x10) ? (simm | 0xFFE0) : simm;
2322 val |= (val << 16);
2323
2324 if (SSE_AVAIL) {
2325 modrm_o modrm;
2326 NativeVectorReg reg1 = jitcMapClientVectorRegisterDirty(vrD);
2327
2328 asmALU_D(X86_MOV, x86_mem2(modrm, &gCPU.vtemp), val);
2329 asmMOVSS(reg1, &gCPU.vtemp);
2330 asmSHUFPS(reg1, reg1, 0x00);
2331
2332 return flowContinue;
2333 }
2334
2335 jitcDropClientVectorRegister(vrD);
2336
2337 NativeReg r1 = jitcAllocRegister();
2338 asmALU(X86_MOV, r1, val);
2339
2340 vec_setWord(vrD, 0, r1);
2341 vec_setWord(vrD, 4, r1);
2342 vec_setWord(vrD, 8, r1);
2343 vec_setWord(vrD,12, r1);
2344
2345 return flowContinue;
2346 #endif
2347 }
2348
2349 /* vspltisw Vector Splat Immediate Signed Word
2350 * v.249
2351 */
2352 void ppc_opc_vspltisw()
2353 {
2354 VECTOR_DEBUG_COMMON;
2355 int vrD, vrB;
2356 uint32 simm;
2357 uint64 val;
2358 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, simm, vrB);
2359 PPC_OPC_ASSERT(vrB==0);
2360
2361 val = (simm & 0x10) ? (simm | 0xFFFFFFE0) : simm;
2362 val |= (val << 32);
2363
2364 gCPU.vr[vrD].d[0] = val;
2365 gCPU.vr[vrD].d[1] = val;
2366 }
2367 JITCFlow ppc_opc_gen_vspltisw()
2368 {
2369 #if 0
2370 ppc_opc_gen_interpret(ppc_opc_vspltisw);
2371 return flowEndBlock;
2372 #else
2373 int vrD, vrB;
2374 uint32 simm, val;
2375 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, simm, vrB);
2376
2377 if (simm == 0) {
2378 vec_Zero(vrD);
2379 return flowContinue;
2380 } else if (simm == 0x1f) {
2381 vec_Neg1(vrD);
2382 return flowContinue;
2383 }
2384
2385 val = (simm & 0x10) ? (simm | 0xFFFFFFE0) : simm;
2386
2387 if (SSE_AVAIL) {
2388 modrm_o modrm;
2389 NativeVectorReg reg1 = jitcMapClientVectorRegisterDirty(vrD);
2390
2391 asmALU_D(X86_MOV, x86_mem2(modrm, &gCPU.vtemp), val);
2392 asmMOVSS(reg1, &gCPU.vtemp);
2393 asmSHUFPS(reg1, reg1, 0x00);
2394
2395 return flowContinue;
2396 }
2397
2398 jitcDropClientVectorRegister(vrD);
2399
2400 NativeReg r1 = jitcAllocRegister();
2401 asmALU(X86_MOV, r1, val);
2402
2403 vec_setWord(vrD, 0, r1);
2404 vec_setWord(vrD, 4, r1);
2405 vec_setWord(vrD, 8, r1);
2406 vec_setWord(vrD,12, r1);
2407
2408 return flowContinue;
2409 #endif
2410 }
2411
2412 /* mfvscr Move from Vector Status and Control Register
2413 * v.129
2414 */
2415 void ppc_opc_mfvscr()
2416 {
2417 VECTOR_DEBUG_COMMON;
2418 int vrD, vrA, vrB;
2419 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2420 PPC_OPC_ASSERT(vrA==0);
2421 PPC_OPC_ASSERT(vrB==0);
2422
2423 VECT_W(gCPU.vr[vrD], 3) = gCPU.vscr;
2424 VECT_W(gCPU.vr[vrD], 2) = 0;
2425 VECT_D(gCPU.vr[vrD], 0) = 0;
2426 }
2427 JITCFlow ppc_opc_gen_mfvscr()
2428 {
2429 #if 0
2430 ppc_opc_gen_interpret(ppc_opc_mfvscr);
2431 return flowEndBlock;
2432 #else
2433 int vrD, vrA, vrB;
2434 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
2435
2436 if (SSE_AVAIL) {
2437 NativeVectorReg reg1 = jitcMapClientVectorRegisterDirty(vrD);
2438 NativeReg reg2 = jitcGetClientRegisterMapping(PPC_VSCR);
2439
2440 if (reg1 != VECTREG_NO)
2441 jitcFlushRegister(NATIVE_REG | reg2);
2442
2443 asmMOVSS(reg1, &gCPU.vscr);
2444 } else {
2445 jitcDropClientVectorRegister(vrD);
2446
2447 NativeReg r1 = jitcGetClientRegister(PPC_VSCR);
2448 vec_setWord(vrD, 0, r1);
2449
2450 jitcClobberRegister(NATIVE_REG | r1);
2451
2452 asmMOV_NoFlags(r1, 0); // avoid flag clobber
2453 vec_setWord(vrD, 4, r1);
2454 vec_setWord(vrD, 8, r1);
2455 vec_setWord(vrD,12, r1);
2456 }
2457
2458 return flowContinue;
2459 #endif
2460 }
2461
2462 /* mtvscr Move to Vector Status and Control Register
2463 * v.130
2464 */
2465 void ppc_opc_mtvscr()
2466 {
2467 VECTOR_DEBUG_COMMON;
2468 int vrD, vrA, vrB;
2469 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2470 PPC_OPC_ASSERT(vrA==0);
2471 PPC_OPC_ASSERT(vrD==0);
2472
2473 gCPU.vscr = VECT_W(gCPU.vr[vrB], 3);
2474 }
2475 JITCFlow ppc_opc_gen_mtvscr()
2476 {
2477 #if 0
2478 ppc_opc_gen_interpret(ppc_opc_mtvscr);
2479 return flowEndBlock;
2480 #else
2481 int vrD, vrA, vrB;
2482 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
2483
2484 if (SSE_AVAIL) {
2485 NativeVectorReg reg1 = jitcGetClientVectorRegisterMapping(vrB);
2486
2487 if (reg1 != VECTREG_NO) {
2488 NativeReg reg2 = jitcGetClientRegisterMapping(PPC_VSCR);
2489
2490 if (reg2 != REG_NO)
2491 jitcClobberRegister(NATIVE_REG | reg2);
2492
2493 asmMOVSS(&gCPU.vscr, reg1);
2494 return flowContinue;
2495 }
2496 }
2497
2498 jitcFlushClientVectorRegister(vrB);
2499
2500 NativeReg r1 = jitcMapClientRegisterDirty(PPC_VSCR);
2501
2502 vec_getWord(r1, vrB, 0);
2503 return flowContinue;
2504 #endif
2505 }
2506
2507 /* vpkuhum Vector Pack Unsigned Half Word Unsigned Modulo
2508 * v.224
2509 */
2510 void ppc_opc_vpkuhum()
2511 {
2512 VECTOR_DEBUG;
2513 int vrD, vrA, vrB;
2514 Vector_t r;
2515 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2516
2517 VECT_B(r, 0) = VECT_B(gCPU.vr[vrA], 1);
2518 VECT_B(r, 1) = VECT_B(gCPU.vr[vrA], 3);
2519 VECT_B(r, 2) = VECT_B(gCPU.vr[vrA], 5);
2520 VECT_B(r, 3) = VECT_B(gCPU.vr[vrA], 7);
2521 VECT_B(r, 4) = VECT_B(gCPU.vr[vrA], 9);
2522 VECT_B(r, 5) = VECT_B(gCPU.vr[vrA],11);
2523 VECT_B(r, 6) = VECT_B(gCPU.vr[vrA],13);
2524 VECT_B(r, 7) = VECT_B(gCPU.vr[vrA],15);
2525
2526 VECT_B(r, 8) = VECT_B(gCPU.vr[vrB], 1);
2527 VECT_B(r, 9) = VECT_B(gCPU.vr[vrB], 3);
2528 VECT_B(r,10) = VECT_B(gCPU.vr[vrB], 5);
2529 VECT_B(r,11) = VECT_B(gCPU.vr[vrB], 7);
2530 VECT_B(r,12) = VECT_B(gCPU.vr[vrB], 9);
2531 VECT_B(r,13) = VECT_B(gCPU.vr[vrB],11);
2532 VECT_B(r,14) = VECT_B(gCPU.vr[vrB],13);
2533 VECT_B(r,15) = VECT_B(gCPU.vr[vrB],15);
2534
2535 gCPU.vr[vrD] = r;
2536 }
2537 JITCFlow ppc_opc_gen_vpkuhum()
2538 {
2539 #if 0
2540 ppc_opc_gen_interpret(ppc_opc_vpkuhum);
2541 return flowEndBlock;
2542 #else
2543 int vrD, vrA, vrB;
2544 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
2545
2546 jitcFlushClientVectorRegister(vrA);
2547 jitcFlushClientVectorRegister(vrB);
2548 jitcDropClientVectorRegister(vrD);
2549
2550 jitcClobberCarryAndFlags();
2551 NativeReg8 reg1 = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
2552
2553 if (vrB == vrD && vrA == vrD) {
2554 for (int i=0; i<16; i += 2) {
2555 vec_getHalf((NativeReg16)reg1, vrA, i);
2556
2557 vec_setByte(vrT, REG_NO, (i >> 1), reg1);
2558 vec_setByte(vrT, REG_NO, 8+(i >> 1), reg1);
2559 }
2560
2561 vec_Copy(vrD, vrT);
2562 } else if (vrA == vrD) {
2563 for (int i=0; i<16; i += 2) {
2564 vec_getHalf((NativeReg16)reg1, vrA, 14 - i);
2565 vec_setByte(vrD, REG_NO, 15 - (i >> 1), reg1);
2566 }
2567
2568 for (int i=0; i<16; i += 2) {
2569 vec_getHalf((NativeReg16)reg1, vrB, i);
2570 vec_setByte(vrD, REG_NO, (i >> 1), reg1);
2571 }
2572 } else {
2573 for (int i=0; i<16; i += 2) {
2574 vec_getHalf((NativeReg16)reg1, vrB, i);
2575 vec_setByte(vrD, REG_NO, (i >> 1), reg1);
2576 }
2577
2578 for (int i=0; i<16; i += 2) {
2579 vec_getHalf((NativeReg16)reg1, vrA, i);
2580 vec_setByte(vrD, REG_NO, 8 + (i >> 1), reg1);
2581 }
2582 }
2583
2584 return flowContinue;
2585 #endif
2586 }
2587
2588 /* vpkuwum Vector Pack Unsigned Word Unsigned Modulo
2589 * v.226
2590 */
2591 void ppc_opc_vpkuwum()
2592 {
2593 VECTOR_DEBUG;
2594 int vrD, vrA, vrB;
2595 Vector_t r;
2596 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2597
2598 VECT_H(r, 0) = VECT_H(gCPU.vr[vrA], 1);
2599 VECT_H(r, 1) = VECT_H(gCPU.vr[vrA], 3);
2600 VECT_H(r, 2) = VECT_H(gCPU.vr[vrA], 5);
2601 VECT_H(r, 3) = VECT_H(gCPU.vr[vrA], 7);
2602
2603 VECT_H(r, 4) = VECT_H(gCPU.vr[vrB], 1);
2604 VECT_H(r, 5) = VECT_H(gCPU.vr[vrB], 3);
2605 VECT_H(r, 6) = VECT_H(gCPU.vr[vrB], 5);
2606 VECT_H(r, 7) = VECT_H(gCPU.vr[vrB], 7);
2607
2608 gCPU.vr[vrD] = r;
2609 }
2610 JITCFlow ppc_opc_gen_vpkuwum()
2611 {
2612 #if 0
2613 ppc_opc_gen_interpret(ppc_opc_vpkuwum);
2614 return flowEndBlock;
2615 #else
2616 int vrD, vrA, vrB;
2617 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
2618
2619 jitcFlushClientVectorRegister(vrA);
2620 jitcFlushClientVectorRegister(vrB);
2621 jitcDropClientVectorRegister(vrD);
2622
2623 jitcClobberCarryAndFlags();
2624 NativeReg16 reg1 = (NativeReg16)jitcAllocRegister();
2625
2626 if (vrB == vrD && vrA == vrD) {
2627 for (int i=0; i<16; i += 4) {
2628 vec_getWord((NativeReg)reg1, vrA, i);
2629
2630 vec_setHalf(vrT, (i >> 1), reg1);
2631 vec_setHalf(vrT, 8+(i >> 1), reg1);
2632 }
2633
2634 vec_Copy(vrD, vrT);
2635 } else if (vrA == vrD) {
2636 for (int i=0; i<16; i += 4) {
2637 vec_getWord((NativeReg)reg1, vrA, 12 - i);
2638 vec_setHalf(vrD, 14 - (i >> 1), reg1);
2639 }
2640
2641 for (int i=0; i<16; i += 4) {
2642 vec_getWord((NativeReg)reg1, vrB, i);
2643 vec_setHalf(vrD, (i >> 1), reg1);
2644 }
2645 } else {
2646 for (int i=0; i<16; i += 4) {
2647 vec_getWord((NativeReg)reg1, vrB, i);
2648 vec_setHalf(vrD, (i >> 1), reg1);
2649 }
2650
2651 for (int i=0; i<16; i += 4) {
2652 vec_getWord((NativeReg)reg1, vrA, i);
2653 vec_setHalf(vrD, 8 + (i >> 1), reg1);
2654 }
2655 }
2656
2657 return flowContinue;
2658 #endif
2659 }
2660
2661 static inline void ppc_opc_gen_helper_vpkpx(NativeReg reg1, NativeReg reg2, NativeReg reg3)
2662 {
2663 asmALU(X86_MOV, reg2, reg1);
2664 asmALU(X86_MOV, reg3, reg1);
2665
2666 asmALU(X86_AND, reg1, 0x000000f8);
2667 asmALU(X86_AND, reg2, 0x0000f800);
2668 asmALU(X86_AND, reg3, 0x01f80000);
2669
2670 asmShift(X86_SHR, reg1, 3);
2671 asmShift(X86_SHR, reg2, 6);
2672 asmShift(X86_SHR, reg3, 9);
2673
2674 asmALU(X86_OR, reg1, reg2);
2675 asmALU(X86_OR, reg1, reg3);
2676 }
2677
2678 /* vpkpx Vector Pack Pixel32
2679 * v.219
2680 */
2681 void ppc_opc_vpkpx()
2682 {
2683 VECTOR_DEBUG;
2684 int vrD, vrA, vrB;
2685 Vector_t r;
2686 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2687
2688 VECT_H(r, 0) = PACK_PIXEL(VECT_W(gCPU.vr[vrA], 0));
2689 VECT_H(r, 1) = PACK_PIXEL(VECT_W(gCPU.vr[vrA], 1));
2690 VECT_H(r, 2) = PACK_PIXEL(VECT_W(gCPU.vr[vrA], 2));
2691 VECT_H(r, 3) = PACK_PIXEL(VECT_W(gCPU.vr[vrA], 3));
2692
2693 VECT_H(r, 4) = PACK_PIXEL(VECT_W(gCPU.vr[vrB], 0));
2694 VECT_H(r, 5) = PACK_PIXEL(VECT_W(gCPU.vr[vrB], 1));
2695 VECT_H(r, 6) = PACK_PIXEL(VECT_W(gCPU.vr[vrB], 2));
2696 VECT_H(r, 7) = PACK_PIXEL(VECT_W(gCPU.vr[vrB], 3));
2697
2698 gCPU.vr[vrD] = r;
2699 }
2700 JITCFlow ppc_opc_gen_vpkpx()
2701 {
2702 #if 0
2703 ppc_opc_gen_interpret(ppc_opc_vpkpx);
2704 return flowEndBlock;
2705 #else
2706 int vrD, vrA, vrB;
2707 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
2708
2709 jitcFlushClientVectorRegister(vrA);
2710 jitcFlushClientVectorRegister(vrB);
2711 jitcDropClientVectorRegister(vrD);
2712
2713 jitcClobberCarryAndFlags();
2714 NativeReg reg1 = jitcAllocRegister();
2715 NativeReg reg2 = jitcAllocRegister();
2716 NativeReg reg3 = jitcAllocRegister();
2717
2718 if((vrA == vrB)) {
2719 for (int i=0; i<8; i += 2) {
2720 vec_getWord(reg1, vrB, i<<1);
2721 ppc_opc_gen_helper_vpkpx(reg1, reg2, reg3);
2722 vec_setHalf(vrD, i, (NativeReg16)reg1);
2723
2724 if (vrD != vrA)
2725 vec_setHalf(vrD, i+8, (NativeReg16)reg1);
2726 }
2727
2728 if (vrD == vrB) {
2729 vec_getWord(reg1, vrD, 0);
2730 vec_getWord(reg2, vrD, 4);
2731
2732 vec_setWord(vrD, 8, reg1);
2733 vec_setWord(vrD,12, reg2);
2734 }
2735 } else if (vrB == vrD) {
2736 for (int i=0; i<8; i += 2) {
2737 vec_getWord(reg1, vrB, i<<1);
2738 ppc_opc_gen_helper_vpkpx(reg1, reg2, reg3);
2739 vec_setHalf(vrD, i, (NativeReg16)reg1);
2740 }
2741
2742 for (int i=0; i<8; i += 2) {
2743 vec_getWord(reg1, vrA, i<<1);
2744 ppc_opc_gen_helper_vpkpx(reg1, reg2, reg3);
2745 vec_setHalf(vrD, i+8, (NativeReg16)reg1);
2746 }
2747 } else {
2748 for (int i=0; i<8; i += 2) {
2749 vec_getWord(reg1, vrA, 12-(i<<1));
2750 ppc_opc_gen_helper_vpkpx(reg1, reg2, reg3);
2751 vec_setHalf(vrD, 14-i, (NativeReg16)reg1);
2752 }
2753
2754 for (int i=0; i<8; i += 2) {
2755 vec_getWord(reg1, vrB, i<<1);
2756 ppc_opc_gen_helper_vpkpx(reg1, reg2, reg3);
2757 vec_setHalf(vrD, i, (NativeReg16)reg1);
2758 }
2759 }
2760
2761 return flowContinue;
2762 #endif
2763 }
2764
2765
2766 /* vpkuhus Vector Pack Unsigned Half Word Unsigned Saturate
2767 * v.225
2768 */
2769 void ppc_opc_vpkuhus()
2770 {
2771 VECTOR_DEBUG;
2772 int vrD, vrA, vrB;
2773 Vector_t r;
2774 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2775
2776 VECT_B(r, 0) = SATURATE_UB(VECT_H(gCPU.vr[vrA], 0));
2777 VECT_B(r, 1) = SATURATE_UB(VECT_H(gCPU.vr[vrA], 1));
2778 VECT_B(r, 2) = SATURATE_UB(VECT_H(gCPU.vr[vrA], 2));
2779 VECT_B(r, 3) = SATURATE_UB(VECT_H(gCPU.vr[vrA], 3));
2780 VECT_B(r, 4) = SATURATE_UB(VECT_H(gCPU.vr[vrA], 4));
2781 VECT_B(r, 5) = SATURATE_UB(VECT_H(gCPU.vr[vrA], 5));
2782 VECT_B(r, 6) = SATURATE_UB(VECT_H(gCPU.vr[vrA], 6));
2783 VECT_B(r, 7) = SATURATE_UB(VECT_H(gCPU.vr[vrA], 7));
2784
2785 VECT_B(r, 8) = SATURATE_UB(VECT_H(gCPU.vr[vrB], 0));
2786 VECT_B(r, 9) = SATURATE_UB(VECT_H(gCPU.vr[vrB], 1));
2787 VECT_B(r,10) = SATURATE_UB(VECT_H(gCPU.vr[vrB], 2));
2788 VECT_B(r,11) = SATURATE_UB(VECT_H(gCPU.vr[vrB], 3));
2789 VECT_B(r,12) = SATURATE_UB(VECT_H(gCPU.vr[vrB], 4));
2790 VECT_B(r,13) = SATURATE_UB(VECT_H(gCPU.vr[vrB], 5));
2791 VECT_B(r,14) = SATURATE_UB(VECT_H(gCPU.vr[vrB], 6));
2792 VECT_B(r,15) = SATURATE_UB(VECT_H(gCPU.vr[vrB], 7));
2793
2794 gCPU.vr[vrD] = r;
2795 }
2796 JITCFlow ppc_opc_gen_vpkuhus()
2797 {
2798 #if 0
2799 ppc_opc_gen_interpret(ppc_opc_vpkuhus);
2800 return flowEndBlock;
2801 #else
2802 int vrD, vrA, vrB;
2803 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
2804
2805 jitcFlushClientVectorRegister(vrA);
2806 jitcFlushClientVectorRegister(vrB);
2807 jitcDropClientVectorRegister(vrD);
2808
2809 jitcClobberCarryAndFlags();
2810 NativeReg8 reg1 = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
2811
2812 if (vrB == vrD && vrA == vrD) {
2813 for (int i=0; i<16; i += 2) {
2814 vec_getHalfZ((NativeReg)reg1, vrA, i);
2815
2816 vec_saturateUB(reg1);
2817
2818 vec_setByte(vrT, REG_NO, (i >> 1), reg1);
2819 vec_setByte(vrT, REG_NO, 8+(i >> 1), reg1);
2820 }
2821
2822 vec_Copy(vrD, vrT);
2823 } else if (vrA == vrD) {
2824 for (int i=0; i<16; i += 2) {
2825 vec_getHalfZ((NativeReg)reg1, vrA, 14 - i);
2826
2827 vec_saturateUB(reg1);
2828
2829 vec_setByte(vrD, REG_NO, 15 - (i >> 1), reg1);
2830 }
2831
2832 for (int i=0; i<16; i += 2) {
2833 vec_getHalfZ((NativeReg)reg1, vrB, i);
2834
2835 vec_saturateUB(reg1);
2836
2837 vec_setByte(vrD, REG_NO, (i >> 1), reg1);
2838 }
2839 } else {
2840 for (int i=0; i<16; i += 2) {
2841 vec_getHalfZ((NativeReg)reg1, vrB, i);
2842
2843 vec_saturateUB(reg1);
2844
2845 vec_setByte(vrD, REG_NO, (i >> 1), reg1);
2846 }
2847
2848 for (int i=0; i<16; i += 2) {
2849 vec_getHalfZ((NativeReg)reg1, vrA, i);
2850
2851 vec_saturateUB(reg1);
2852
2853 vec_setByte(vrD, REG_NO, 8 + (i >> 1), reg1);
2854 }
2855 }
2856
2857 return flowContinue;
2858 #endif
2859 }
2860
2861 /* vpkshss Vector Pack Signed Half Word Signed Saturate
2862 * v.220
2863 */
2864 void ppc_opc_vpkshss()
2865 {
2866 VECTOR_DEBUG;
2867 int vrD, vrA, vrB;
2868 Vector_t r;
2869 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2870
2871 VECT_B(r, 0) = SATURATE_SB(VECT_H(gCPU.vr[vrA], 0));
2872 VECT_B(r, 1) = SATURATE_SB(VECT_H(gCPU.vr[vrA], 1));
2873 VECT_B(r, 2) = SATURATE_SB(VECT_H(gCPU.vr[vrA], 2));
2874 VECT_B(r, 3) = SATURATE_SB(VECT_H(gCPU.vr[vrA], 3));
2875 VECT_B(r, 4) = SATURATE_SB(VECT_H(gCPU.vr[vrA], 4));
2876 VECT_B(r, 5) = SATURATE_SB(VECT_H(gCPU.vr[vrA], 5));
2877 VECT_B(r, 6) = SATURATE_SB(VECT_H(gCPU.vr[vrA], 6));
2878 VECT_B(r, 7) = SATURATE_SB(VECT_H(gCPU.vr[vrA], 7));
2879
2880 VECT_B(r, 8) = SATURATE_SB(VECT_H(gCPU.vr[vrB], 0));
2881 VECT_B(r, 9) = SATURATE_SB(VECT_H(gCPU.vr[vrB], 1));
2882 VECT_B(r,10) = SATURATE_SB(VECT_H(gCPU.vr[vrB], 2));
2883 VECT_B(r,11) = SATURATE_SB(VECT_H(gCPU.vr[vrB], 3));
2884 VECT_B(r,12) = SATURATE_SB(VECT_H(gCPU.vr[vrB], 4));
2885 VECT_B(r,13) = SATURATE_SB(VECT_H(gCPU.vr[vrB], 5));
2886 VECT_B(r,14) = SATURATE_SB(VECT_H(gCPU.vr[vrB], 6));
2887 VECT_B(r,15) = SATURATE_SB(VECT_H(gCPU.vr[vrB], 7));
2888
2889 gCPU.vr[vrD] = r;
2890 }
2891 JITCFlow ppc_opc_gen_vpkshss()
2892 {
2893 ppc_opc_gen_interpret(ppc_opc_vpkshss);
2894 return flowEndBlock;
2895 }
2896
2897 /* vpkuwus Vector Pack Unsigned Word Unsigned Saturate
2898 * v.227
2899 */
2900 void ppc_opc_vpkuwus()
2901 {
2902 VECTOR_DEBUG;
2903 int vrD, vrA, vrB;
2904 Vector_t r;
2905 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2906
2907 VECT_H(r, 0) = SATURATE_UH(VECT_W(gCPU.vr[vrA], 0));
2908 VECT_H(r, 1) = SATURATE_UH(VECT_W(gCPU.vr[vrA], 1));
2909 VECT_H(r, 2) = SATURATE_UH(VECT_W(gCPU.vr[vrA], 2));
2910 VECT_H(r, 3) = SATURATE_UH(VECT_W(gCPU.vr[vrA], 3));
2911
2912 VECT_H(r, 4) = SATURATE_UH(VECT_W(gCPU.vr[vrB], 0));
2913 VECT_H(r, 5) = SATURATE_UH(VECT_W(gCPU.vr[vrB], 1));
2914 VECT_H(r, 6) = SATURATE_UH(VECT_W(gCPU.vr[vrB], 2));
2915 VECT_H(r, 7) = SATURATE_UH(VECT_W(gCPU.vr[vrB], 3));
2916
2917 gCPU.vr[vrD] = r;
2918 }
2919 JITCFlow ppc_opc_gen_vpkuwus()
2920 {
2921 ppc_opc_gen_interpret(ppc_opc_vpkuwus);
2922 return flowEndBlock;
2923 }
2924
2925 /* vpkswss Vector Pack Signed Word Signed Saturate
2926 * v.222
2927 */
2928 void ppc_opc_vpkswss()
2929 {
2930 VECTOR_DEBUG;
2931 int vrD, vrA, vrB;
2932 Vector_t r;
2933 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2934
2935 VECT_H(r, 0) = SATURATE_SH(VECT_W(gCPU.vr[vrA], 0));
2936 VECT_H(r, 1) = SATURATE_SH(VECT_W(gCPU.vr[vrA], 1));
2937 VECT_H(r, 2) = SATURATE_SH(VECT_W(gCPU.vr[vrA], 2));
2938 VECT_H(r, 3) = SATURATE_SH(VECT_W(gCPU.vr[vrA], 3));
2939
2940 VECT_H(r, 4) = SATURATE_SH(VECT_W(gCPU.vr[vrB], 0));
2941 VECT_H(r, 5) = SATURATE_SH(VECT_W(gCPU.vr[vrB], 1));
2942 VECT_H(r, 6) = SATURATE_SH(VECT_W(gCPU.vr[vrB], 2));
2943 VECT_H(r, 7) = SATURATE_SH(VECT_W(gCPU.vr[vrB], 3));
2944
2945 gCPU.vr[vrD] = r;
2946 }
2947 JITCFlow ppc_opc_gen_vpkswss()
2948 {
2949 #if 0
2950 ppc_opc_gen_interpret(ppc_opc_vpkswss);
2951 return flowEndBlock;
2952 #else
2953 int vrD, vrA, vrB;
2954 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
2955
2956 if (SSE2_AVAIL) {
2957 noncommutative_operation(X86_PACKSSDW, vrD, vrB, vrA);
2958 return flowContinue;
2959 }
2960
2961 jitcFlushClientVectorRegister(vrA);
2962 jitcFlushClientVectorRegister(vrB);
2963 jitcDropClientVectorRegister(vrD);
2964
2965 jitcClobberCarryAndFlags();
2966 NativeReg reg1 = jitcAllocRegister();
2967 NativeReg reg2 = jitcAllocRegister();
2968
2969 if (vrB == vrD && vrA == vrD) {
2970 for (int i=0; i<16; i += 4) {
2971 vec_getWord(reg1, vrA, i);
2972
2973 vec_saturateSH(reg1, reg2);
2974
2975 vec_setHalf(vrT, (i >> 1), (NativeReg16)reg1);
2976 vec_setHalf(vrT, 8+(i >> 1), (NativeReg16)reg1);
2977 }
2978
2979 vec_Copy(vrD, vrT);
2980 } else if (vrA == vrD) {
2981 for (int i=0; i<16; i += 4) {
2982 vec_getWord(reg1, vrA, 12 - i);
2983
2984 vec_saturateSH(reg1, reg2);
2985
2986 vec_setHalf(vrD, 14 - (i >> 1), (NativeReg16)reg1);
2987 }
2988
2989 for (int i=0; i<16; i += 4) {
2990 vec_getWord(reg1, vrB, i);
2991
2992 vec_saturateSH(reg1, reg2);
2993
2994 vec_setHalf(vrD, (i >> 1), (NativeReg16)reg1);
2995 }
2996 } else {
2997 for (int i=0; i<16; i += 4) {
2998 vec_getWord(reg1, vrB, i);
2999
3000 vec_saturateSH(reg1, reg2);
3001
3002 vec_setHalf(vrD, (i >> 1), (NativeReg16)reg1);
3003 }
3004
3005 for (int i=0; i<16; i += 4) {
3006 vec_getWord(reg1, vrA, i);
3007
3008 vec_saturateSH(reg1, reg2);
3009
3010 vec_setHalf(vrD, 8 + (i >> 1), (NativeReg16)reg1);
3011 }
3012 }
3013
3014 return flowContinue;
3015 #endif
3016 }
3017
3018 /* vpkshus Vector Pack Signed Half Word Unsigned Saturate
3019 * v.221
3020 */
3021 void ppc_opc_vpkshus()
3022 {
3023 VECTOR_DEBUG;
3024 int vrD, vrA, vrB;
3025 Vector_t r;
3026 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3027
3028 VECT_B(r, 0) = SATURATE_USB(VECT_H(gCPU.vr[vrA], 0));
3029 VECT_B(r, 1) = SATURATE_USB(VECT_H(gCPU.vr[vrA], 1));
3030 VECT_B(r, 2) = SATURATE_USB(VECT_H(gCPU.vr[vrA], 2));
3031 VECT_B(r, 3) = SATURATE_USB(VECT_H(gCPU.vr[vrA], 3));
3032 VECT_B(r, 4) = SATURATE_USB(VECT_H(gCPU.vr[vrA], 4));
3033 VECT_B(r, 5) = SATURATE_USB(VECT_H(gCPU.vr[vrA], 5));
3034 VECT_B(r, 6) = SATURATE_USB(VECT_H(gCPU.vr[vrA], 6));
3035 VECT_B(r, 7) = SATURATE_USB(VECT_H(gCPU.vr[vrA], 7));
3036
3037 VECT_B(r, 8) = SATURATE_USB(VECT_H(gCPU.vr[vrB], 0));
3038 VECT_B(r, 9) = SATURATE_USB(VECT_H(gCPU.vr[vrB], 1));
3039 VECT_B(r,10) = SATURATE_USB(VECT_H(gCPU.vr[vrB], 2));
3040 VECT_B(r,11) = SATURATE_USB(VECT_H(gCPU.vr[vrB], 3));
3041 VECT_B(r,12) = SATURATE_USB(VECT_H(gCPU.vr[vrB], 4));
3042 VECT_B(r,13) = SATURATE_USB(VECT_H(gCPU.vr[vrB], 5));
3043 VECT_B(r,14) = SATURATE_USB(VECT_H(gCPU.vr[vrB], 6));
3044 VECT_B(r,15) = SATURATE_USB(VECT_H(gCPU.vr[vrB], 7));
3045
3046 gCPU.vr[vrD] = r;
3047 }
3048 JITCFlow ppc_opc_gen_vpkshus()
3049 {
3050 #if 0
3051 ppc_opc_gen_interpret(ppc_opc_vpkshus);
3052 return flowEndBlock;
3053 #else
3054 int vrD, vrA, vrB;
3055 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
3056
3057 if (SSE2_AVAIL) {
3058 noncommutative_operation(X86_PACKUSWB, vrD, vrB, vrA);
3059 return flowContinue;
3060 }
3061
3062 NativeReg reg1 = jitcAllocRegister(NATIVE_REG_8);
3063 NativeReg reg2 = jitcAllocRegister();
3064
3065 if (vrB == vrD && vrA == vrD) {
3066 for (int i=0; i<16; i += 2) {
3067 vec_getHalfS(reg1, vrA, i);
3068
3069 vec_saturateSUB(reg1, reg2);
3070
3071 vec_setByte(vrT, REG_NO, (i >> 1), (NativeReg8)reg1);
3072 vec_setByte(vrT, REG_NO, 8+(i >> 1), (NativeReg8)reg1);
3073 }
3074
3075 vec_Copy(vrD, vrT);
3076 } else if (vrA == vrD) {
3077 for (int i=0; i<16; i += 2) {
3078 vec_getHalfS(reg1, vrA, 14 - i);
3079
3080 vec_saturateSUB(reg1, reg2);
3081
3082 vec_setByte(vrD, REG_NO, 15 - (i >> 1), (NativeReg8)reg1);
3083 }
3084
3085 for (int i=0; i<16; i += 2) {
3086 vec_getHalfS(reg1, vrB, i);
3087
3088 vec_saturateSUB(reg1, reg2);
3089
3090 vec_setByte(vrD, REG_NO, (i >> 1), (NativeReg8)reg1);
3091 }
3092 } else {
3093 for (int i=0; i<16; i += 2) {
3094 vec_getHalfS(reg1, vrB, i);
3095
3096 vec_saturateSUB(reg1, reg2);
3097
3098 vec_setByte(vrD, REG_NO, (i >> 1), (NativeReg8)reg1);
3099 }
3100
3101 for (int i=0; i<16; i += 2) {
3102 vec_getHalfS(reg1, vrA, i);
3103
3104 vec_saturateSUB(reg1, reg2);
3105
3106 vec_setByte(vrD, REG_NO, 8 + (i >> 1), (NativeReg8)reg1);
3107 }
3108 }
3109
3110 return flowContinue;
3111 #endif
3112 }
3113
3114 /* vpkswus Vector Pack Signed Word Unsigned Saturate
3115 * v.223
3116 */
3117 void ppc_opc_vpkswus()
3118 {
3119 VECTOR_DEBUG;
3120 int vrD, vrA, vrB;
3121 Vector_t r;
3122 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3123
3124 VECT_H(r, 0) = SATURATE_USH(VECT_W(gCPU.vr[vrA], 0));
3125 VECT_H(r, 1) = SATURATE_USH(VECT_W(gCPU.vr[vrA], 1));
3126 VECT_H(r, 2) = SATURATE_USH(VECT_W(gCPU.vr[vrA], 2));
3127 VECT_H(r, 3) = SATURATE_USH(VECT_W(gCPU.vr[vrA], 3));
3128
3129 VECT_H(r, 4) = SATURATE_USH(VECT_W(gCPU.vr[vrB], 0));
3130 VECT_H(r, 5) = SATURATE_USH(VECT_W(gCPU.vr[vrB], 1));
3131 VECT_H(r, 6) = SATURATE_USH(VECT_W(gCPU.vr[vrB], 2));
3132 VECT_H(r, 7) = SATURATE_USH(VECT_W(gCPU.vr[vrB], 3));
3133
3134 gCPU.vr[vrD] = r;
3135 }
3136 JITCFlow ppc_opc_gen_vpkswus()
3137 {
3138 ppc_opc_gen_interpret(ppc_opc_vpkswus);
3139 return flowEndBlock;
3140 }
3141
3142 /* vupkhsb Vector Unpack High Signed Byte
3143 * v.277
3144 */
3145 void ppc_opc_vupkhsb()
3146 {
3147 VECTOR_DEBUG;
3148 int vrD, vrA, vrB;
3149 Vector_t r;
3150 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3151 PPC_OPC_ASSERT(vrA==0);
3152
3153 VECT_SH(r, 0) = VECT_SB(gCPU.vr[vrB], 0);
3154 VECT_SH(r, 1) = VECT_SB(gCPU.vr[vrB], 1);
3155 VECT_SH(r, 2) = VECT_SB(gCPU.vr[vrB], 2);
3156 VECT_SH(r, 3) = VECT_SB(gCPU.vr[vrB], 3);
3157 VECT_SH(r, 4) = VECT_SB(gCPU.vr[vrB], 4);
3158 VECT_SH(r, 5) = VECT_SB(gCPU.vr[vrB], 5);
3159 VECT_SH(r, 6) = VECT_SB(gCPU.vr[vrB], 6);
3160 VECT_SH(r, 7) = VECT_SB(gCPU.vr[vrB], 7);
3161
3162 gCPU.vr[vrD] = r;
3163 }
3164 JITCFlow ppc_opc_gen_vupkhsb()
3165 {
3166 #if 0
3167 ppc_opc_gen_interpret(ppc_opc_vupkhsb);
3168 return flowEndBlock;
3169 #else
3170 int vrD, vrA, vrB;
3171 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
3172
3173 jitcFlushClientVectorRegister(vrB);
3174 jitcDropClientVectorRegister(vrD);
3175
3176 NativeReg reg = jitcAllocRegister();
3177
3178 for (int i=0; i<16; i += 2) {
3179 vec_getByteS(reg, vrB, 8 + (i >> 1));
3180
3181 vec_setHalf(vrD, i, (NativeReg16)reg);
3182 }
3183
3184 return flowContinue;
3185 #endif
3186 }
3187
3188 /* vupkhpx Vector Unpack High Pixel32
3189 * v.279
3190 */
3191 void ppc_opc_vupkhpx()
3192 {
3193 VECTOR_DEBUG;
3194 int vrD, vrA, vrB;
3195 Vector_t r;
3196 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3197 PPC_OPC_ASSERT(vrA==0);
3198
3199 VECT_W(r, 0) = UNPACK_PIXEL(VECT_H(gCPU.vr[vrB], 0));
3200 VECT_W(r, 1) = UNPACK_PIXEL(VECT_H(gCPU.vr[vrB], 1));
3201 VECT_W(r, 2) = UNPACK_PIXEL(VECT_H(gCPU.vr[vrB], 2));
3202 VECT_W(r, 3) = UNPACK_PIXEL(VECT_H(gCPU.vr[vrB], 3));
3203
3204 gCPU.vr[vrD] = r;
3205 }
3206 JITCFlow ppc_opc_gen_vupkhpx()
3207 {
3208 ppc_opc_gen_interpret(ppc_opc_vupkhpx);
3209 return flowEndBlock;
3210 }
3211
3212 /* vupkhsh Vector Unpack High Signed Half Word
3213 * v.278
3214 */
3215 void ppc_opc_vupkhsh()
3216 {
3217 VECTOR_DEBUG;
3218 int vrD, vrA, vrB;
3219 Vector_t r;
3220 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3221 PPC_OPC_ASSERT(vrA==0);
3222
3223 VECT_SW(r, 0) = VECT_SH(gCPU.vr[vrB], 0);
3224 VECT_SW(r, 1) = VECT_SH(gCPU.vr[vrB], 1);
3225 VECT_SW(r, 2) = VECT_SH(gCPU.vr[vrB], 2);
3226 VECT_SW(r, 3) = VECT_SH(gCPU.vr[vrB], 3);
3227
3228 gCPU.vr[vrD] = r;
3229 }
3230 JITCFlow ppc_opc_gen_vupkhsh()
3231 {
3232 ppc_opc_gen_interpret(ppc_opc_vupkhsh);
3233 return flowEndBlock;
3234 }
3235
3236 /* vupklsb Vector Unpack Low Signed Byte
3237 * v.280
3238 */
3239 void ppc_opc_vupklsb()
3240 {
3241 VECTOR_DEBUG;
3242 int vrD, vrA, vrB;
3243 Vector_t r;
3244 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3245 PPC_OPC_ASSERT(vrA==0);
3246
3247 VECT_SH(r, 0) = VECT_SB(gCPU.vr[vrB], 8);
3248 VECT_SH(r, 1) = VECT_SB(gCPU.vr[vrB], 9);
3249 VECT_SH(r, 2) = VECT_SB(gCPU.vr[vrB],10);
3250 VECT_SH(r, 3) = VECT_SB(gCPU.vr[vrB],11);
3251 VECT_SH(r, 4) = VECT_SB(gCPU.vr[vrB],12);
3252 VECT_SH(r, 5) = VECT_SB(gCPU.vr[vrB],13);
3253 VECT_SH(r, 6) = VECT_SB(gCPU.vr[vrB],14);
3254 VECT_SH(r, 7) = VECT_SB(gCPU.vr[vrB],15);
3255
3256 gCPU.vr[vrD] = r;
3257 }
3258 JITCFlow ppc_opc_gen_vupklsb()
3259 {
3260 #if 0
3261 ppc_opc_gen_interpret(ppc_opc_vupklsb);
3262 return flowEndBlock;
3263 #else
3264 int vrD, vrA, vrB;
3265 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
3266
3267 jitcFlushClientVectorRegister(vrB);
3268 jitcDropClientVectorRegister(vrD);
3269
3270 NativeReg reg = jitcAllocRegister();
3271
3272 for (int i=0; i<16; i += 2) {
3273 vec_getByteS(reg, vrB, 7 - (i >> 1));
3274
3275 vec_setHalf(vrD, 14 - i, (NativeReg16)reg);
3276 }
3277
3278 return flowContinue;
3279 #endif
3280 }
3281
3282 /* vupklpx Vector Unpack Low Pixel32
3283 * v.279
3284 */
3285 void ppc_opc_vupklpx()
3286 {
3287 VECTOR_DEBUG;
3288 int vrD, vrA, vrB;
3289 Vector_t r;
3290 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3291 PPC_OPC_ASSERT(vrA==0);
3292
3293 VECT_W(r, 0) = UNPACK_PIXEL(VECT_H(gCPU.vr[vrB], 4));
3294 VECT_W(r, 1) = UNPACK_PIXEL(VECT_H(gCPU.vr[vrB], 5));
3295 VECT_W(r, 2) = UNPACK_PIXEL(VECT_H(gCPU.vr[vrB], 6));
3296 VECT_W(r, 3) = UNPACK_PIXEL(VECT_H(gCPU.vr[vrB], 7));
3297
3298 gCPU.vr[vrD] = r;
3299 }
3300 JITCFlow ppc_opc_gen_vupklpx()
3301 {
3302 ppc_opc_gen_interpret(ppc_opc_vupklpx);
3303 return flowEndBlock;
3304 }
3305
3306 /* vupklsh Vector Unpack Low Signed Half Word
3307 * v.281
3308 */
3309 void ppc_opc_vupklsh()
3310 {
3311 VECTOR_DEBUG;
3312 int vrD, vrA, vrB;
3313 Vector_t r;
3314 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3315 PPC_OPC_ASSERT(vrA==0);
3316
3317 VECT_SW(r, 0) = VECT_SH(gCPU.vr[vrB], 4);
3318 VECT_SW(r, 1) = VECT_SH(gCPU.vr[vrB], 5);
3319 VECT_SW(r, 2) = VECT_SH(gCPU.vr[vrB], 6);
3320 VECT_SW(r, 3) = VECT_SH(gCPU.vr[vrB], 7);
3321
3322 gCPU.vr[vrD] = r;
3323 }
3324 JITCFlow ppc_opc_gen_vupklsh()
3325 {
3326 ppc_opc_gen_interpret(ppc_opc_vupklsh);
3327 return flowEndBlock;
3328 }
3329
3330 static inline void helper_gen_vaddubm(NativeReg reg, NativeReg reg2, modrm_p modrma, modrm_p modrmb)
3331 {
3332 asmALU(X86_MOV, reg, modrma);
3333 asmALU(X86_MOV, reg2, reg);
3334
3335 asmALU(X86_AND, reg, modrmb);
3336 asmALU(X86_XOR, reg2, modrmb);
3337
3338 asmShift(X86_SHL, reg, 1);
3339 asmALU(X86_AND, reg2, 0xfefefefe);
3340
3341 asmALU(X86_ADD, reg2, reg);
3342 asmALU(X86_AND, reg2, 0x01010100);
3343 }
3344
3345 /* vaddubm Vector Add Unsigned Byte Modulo
3346 * v.141
3347 */
3348 void ppc_opc_vaddubm()
3349 {
3350 VECTOR_DEBUG;
3351 int vrD, vrA, vrB;
3352 uint8 res;
3353 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3354
3355 for (int i=0; i<16; i++) {
3356 res = gCPU.vr[vrA].b[i] + gCPU.vr[vrB].b[i];
3357 gCPU.vr[vrD].b[i] = res;
3358 }
3359 }
3360 JITCFlow ppc_opc_gen_vaddubm()
3361 {
3362 #if 0
3363 ppc_opc_gen_interpret(ppc_opc_vaddubm);
3364 return flowEndBlock;
3365 #else
3366 int vrD, vrA, vrB;
3367 modrm_o modrma, modrmb;
3368 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
3369
3370 if (SSE2_AVAIL) {
3371 commutative_operation(PALUB(X86_PADD), vrD, vrA, vrB);
3372 return flowContinue;
3373 }
3374
3375 jitcFlushClientVectorRegister(vrA);
3376 jitcFlushClientVectorRegister(vrB);
3377 jitcDropClientVectorRegister(vrD);
3378
3379 jitcClobberCarryAndFlags();
3380 NativeReg reg = jitcAllocRegister();
3381 NativeReg reg2 = jitcAllocRegister();
3382
3383 if ((vrA == vrD) || (vrB == vrD)) {
3384 int vrS = vrB;
3385
3386 if (vrD == vrB) {
3387 vrS = vrA;
3388 }
3389
3390 for (int i=0; i<16; i += 4) {
3391 x86_mem2(modrma, &(gCPU.vr[vrD].b[i]));
3392 x86_mem2(modrmb, &(gCPU.vr[vrS].b[i]));
3393
3394 helper_gen_vaddubm(reg, reg2, modrma, modrmb);
3395
3396 asmALU(X86_SUB, reg2, modrmb);
3397 asmALU(X86_SUB, modrma, reg2);
3398 }
3399 } else {
3400 for (int i=0; i<16; i += 4) {
3401 x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
3402 x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
3403
3404 helper_gen_vaddubm(reg, reg2, modrma, modrmb);
3405
3406 asmALU(X86_MOV, reg, modrmb);
3407 asmALU(X86_SUB, reg, reg2);
3408 asmALU(X86_ADD, reg, modrma);
3409
3410 vec_setWord(vrD, i, reg);
3411 }
3412 }
3413
3414 return flowContinue;
3415 #endif
3416 }
3417
3418 static inline void helper_gen_vadduhm(NativeReg reg, NativeReg reg2, modrm_p modrma, modrm_p modrmb)
3419 {
3420 asmALU(X86_MOV, reg, modrma);
3421 asmALU(X86_MOV, reg2, reg);
3422
3423 asmALU(X86_AND, reg, modrmb);
3424 asmALU(X86_XOR, reg2, modrmb);
3425
3426 asmShift(X86_SHL, reg, 1);
3427 asmALU(X86_AND, reg2, 0xfffefffe);
3428
3429 asmALU(X86_ADD, reg2, reg);
3430 asmALU(X86_AND, reg2, 0x00010000);
3431 }
3432
3433 /* vadduhm Vector Add Unsigned Half Word Modulo
3434 * v.143
3435 */
3436 void ppc_opc_vadduhm()
3437 {
3438 VECTOR_DEBUG;
3439 int vrD, vrA, vrB;
3440 uint16 res;
3441 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3442
3443 for (int i=0; i<8; i++) {
3444 res = gCPU.vr[vrA].h[i] + gCPU.vr[vrB].h[i];
3445 gCPU.vr[vrD].h[i] = res;
3446 }
3447 }
3448 JITCFlow ppc_opc_gen_vadduhm()
3449 {
3450 #if 0
3451 ppc_opc_gen_interpret(ppc_opc_vadduhm);
3452 return flowEndBlock;
3453 #else
3454 int vrD, vrA, vrB;
3455 modrm_o modrma, modrmb;
3456 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
3457
3458 if (SSE2_AVAIL) {
3459 commutative_operation(PALUW(X86_PADD), vrD, vrA, vrB);
3460 return flowContinue;
3461 }
3462
3463 jitcFlushClientVectorRegister(vrA);
3464 jitcFlushClientVectorRegister(vrB);
3465 jitcDropClientVectorRegister(vrD);
3466
3467 jitcClobberCarryAndFlags();
3468 NativeReg reg = jitcAllocRegister();
3469 NativeReg reg2 = jitcAllocRegister();
3470
3471 if ((vrA == vrD) || (vrB == vrD)) {
3472 int vrS = vrB;
3473
3474 if (vrD == vrB) {
3475 vrS = vrA;
3476 }
3477
3478 for (int i=0; i<16; i += 4) {
3479 x86_mem2(modrma, &(gCPU.vr[vrD].b[i]));
3480 x86_mem2(modrmb, &(gCPU.vr[vrS].b[i]));
3481
3482 helper_gen_vadduhm(reg, reg2, modrma, modrmb);
3483
3484 asmALU(X86_SUB, reg2, modrmb);
3485 asmALU(X86_SUB, modrma, reg2);
3486 }
3487 } else {
3488 for (int i=0; i<16; i += 4) {
3489 x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
3490 x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
3491
3492 helper_gen_vadduhm(reg, reg2, modrma, modrmb);
3493
3494 asmALU(X86_MOV, reg, modrmb);
3495 asmALU(X86_SUB, reg, reg2);
3496 asmALU(X86_ADD, reg, modrma);
3497
3498 vec_setWord(vrD, i, reg);
3499 }
3500 }
3501
3502 return flowContinue;
3503 #endif
3504 }
3505
3506 /* vadduwm Vector Add Unsigned Word Modulo
3507 * v.145
3508 */
3509 void ppc_opc_vadduwm()
3510 {
3511 VECTOR_DEBUG;
3512 int vrD, vrA, vrB;
3513 uint32 res;
3514 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3515
3516 for (int i=0; i<4; i++) {
3517 res = gCPU.vr[vrA].w[i] + gCPU.vr[vrB].w[i];
3518 gCPU.vr[vrD].w[i] = res;
3519 }
3520 }
3521 JITCFlow ppc_opc_gen_vadduwm()
3522 {
3523 #if 0
3524 ppc_opc_gen_interpret(ppc_opc_vadduwm);
3525 return flowEndBlock;
3526 #else
3527 int vrD, vrA, vrB;
3528 modrm_o modrm;
3529 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
3530
3531 jitcFlushClientVectorRegister(vrA);
3532 jitcFlushClientVectorRegister(vrB);
3533 jitcDropClientVectorRegister(vrD);
3534 NativeReg reg = jitcAllocRegister();
3535
3536 if (SSE2_AVAIL) {
3537 commutative_operation(PALUD(X86_PADD), vrD, vrA, vrB);
3538 return flowContinue;
3539 }
3540
3541 if ((vrA == vrD) || (vrB == vrD)) {
3542 int vrS = vrB;
3543
3544 if (vrD == vrB) {
3545 vrS = vrA;
3546 }
3547
3548 for (int i=0; i<16; i += 4) {
3549 vec_getWord(reg, vrS, i);
3550
3551 asmALU(X86_ADD,
3552 x86_mem2(modrm, &(gCPU.vr[vrD].b[i])), reg);
3553 }
3554 } else {
3555 for (int i=0; i<16; i += 4) {
3556 vec_getWord(reg, vrA, i);
3557
3558 asmALU(X86_ADD, reg,
3559 x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
3560
3561 vec_setWord(vrD, i, reg);
3562 }
3563 }
3564
3565 return flowContinue;
3566 #endif
3567 }
3568
3569 /* vaddfp Vector Add Float Point
3570 * v.137
3571 */
3572 void ppc_opc_vaddfp()
3573 {
3574 VECTOR_DEBUG;
3575 int vrD, vrA, vrB;
3576 float res;
3577 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3578
3579 for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
3580 res = gCPU.vr[vrA].f[i] + gCPU.vr[vrB].f[i];
3581 gCPU.vr[vrD].f[i] = res;
3582 }
3583 }
3584 JITCFlow ppc_opc_gen_vaddfp()
3585 {
3586 #if 0
3587 ppc_opc_gen_interpret(ppc_opc_vaddfp);
3588 return flowEndBlock;
3589 #else
3590 int vrD, vrA, vrB;
3591 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
3592
3593 if (SSE_AVAIL) {
3594 commutative_operation(X86_ADDPS, vrD, vrA, vrB);
3595 return flowContinue;
3596 }
3597
3598 ppc_opc_gen_interpret(ppc_opc_vaddfp);
3599 return flowEndBlock;
3600 #endif
3601 }
3602
3603 /* vaddcuw Vector Add Carryout Unsigned Word
3604 * v.136
3605 */
3606 void ppc_opc_vaddcuw()
3607 {
3608 VECTOR_DEBUG;
3609 int vrD, vrA, vrB;
3610 uint32 res;
3611 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3612
3613 for (int i=0; i<4; i++) {
3614 res = gCPU.vr[vrA].w[i] + gCPU.vr[vrB].w[i];
3615 gCPU.vr[vrD].w[i] = (res < gCPU.vr[vrA].w[i]) ? 1 : 0;
3616 }
3617 }
3618 JITCFlow ppc_opc_gen_vaddcuw()
3619 {
3620 ppc_opc_gen_interpret(ppc_opc_vaddcuw);
3621 return flowEndBlock;
3622 }
3623
3624 static inline void helper_gen_vaddubs(NativeReg reg, NativeReg reg2, modrm_p modrma, modrm_p modrmb)
3625 {
3626 asmALU(X86_MOV, reg, modrma);
3627 asmALU(X86_MOV, reg2, reg);
3628
3629 asmALU(X86_XOR, reg, modrmb);
3630 asmALU(X86_AND, reg2, modrmb);
3631
3632 asmShift(X86_SHR, reg, 1);
3633 asmALU(X86_AND, reg, 0x7f7f7f7f);
3634
3635 asmALU(X86_ADD, reg2, reg);
3636 asmALU(X86_AND, reg2, 0x80808080);
3637
3638 asmShift(X86_SHR, reg2, 7);
3639 asmALU(X86_ADD, reg2, 0x7f7f7f7f);
3640 asmALU(X86_XOR, reg2, 0x7f7f7f7f);
3641 NativeAddress skip = asmJxxFixup(X86_Z);
3642
3643 vec_raise_saturate();
3644
3645 asmResolveFixup(skip);
3646 }
3647
3648 /* vaddubs Vector Add Unsigned Byte Saturate
3649 * v.142
3650 */
3651 void ppc_opc_vaddubs()
3652 {
3653 VECTOR_DEBUG;
3654 int vrD, vrA, vrB;
3655 uint16 res;
3656 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3657
3658 for (int i=0; i<16; i++) {
3659 res = (uint16)gCPU.vr[vrA].b[i] + (uint16)gCPU.vr[vrB].b[i];
3660 gCPU.vr[vrD].b[i] = SATURATE_UB(res);
3661 }
3662 }
3663 JITCFlow ppc_opc_gen_vaddubs()
3664 {
3665 #if 0
3666 ppc_opc_gen_interpret(ppc_opc_vaddubs);
3667 return flowEndBlock;
3668 #else
3669 int vrD, vrA, vrB;
3670 modrm_o modrma, modrmb;
3671 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
3672
3673 if (SSE2_AVAIL) {
3674 commutative_operation(PALUB(X86_PADDUS), vrD, vrA, vrB);
3675 return flowContinue;
3676 }
3677
3678 jitcFlushClientVectorRegister(vrA);
3679 jitcFlushClientVectorRegister(vrB);
3680 jitcDropClientVectorRegister(vrD);
3681
3682 jitcClobberCarryAndFlags();
3683 NativeReg reg = jitcAllocRegister();
3684 NativeReg reg2 = jitcAllocRegister();
3685
3686 if ((vrA == vrD) || (vrB == vrD)) {
3687 int vrS = vrB;
3688
3689 if (vrD == vrB) {
3690 vrS = vrA;
3691 }
3692
3693 for (int i=0; i<16; i += 4) {
3694 x86_mem2(modrma, &(gCPU.vr[vrD].b[i]));
3695 x86_mem2(modrmb, &(gCPU.vr[vrS].b[i]));
3696
3697 helper_gen_vaddubs(reg, reg2, modrma, modrmb);
3698
3699 asmALU(X86_MOV, reg, modrmb);
3700 asmALU(X86_SUB, reg, reg2);
3701 asmALU(X86_ADD, modrma, reg);
3702 asmALU(X86_OR, modrma, reg2);
3703 }
3704 } else {
3705 for (int i=0; i<16; i += 4) {
3706 x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
3707 x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
3708
3709 helper_gen_vaddubs(reg, reg2, modrma, modrmb);
3710
3711 asmALU(X86_MOV, reg, modrmb);
3712 asmALU(X86_SUB, reg, reg2);
3713 asmALU(X86_ADD, reg, modrma);
3714 asmALU(X86_OR, reg, reg2);
3715
3716 vec_setWord(vrD, i, reg);
3717 }
3718 }
3719
3720 return flowContinue;
3721 #endif
3722 }
3723
3724 /* vaddsbs Vector Add Signed Byte Saturate
3725 * v.138
3726 */
3727 void ppc_opc_vaddsbs()
3728 {
3729 VECTOR_DEBUG;
3730 int vrD, vrA, vrB;
3731 sint16 res;
3732 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3733
3734 for (int i=0; i<16; i++) {
3735 res = (sint16)gCPU.vr[vrA].sb[i] + (sint16)gCPU.vr[vrB].sb[i];
3736 gCPU.vr[vrD].b[i] = SATURATE_SB(res);
3737 }
3738 }
3739 JITCFlow ppc_opc_gen_vaddsbs()
3740 {
3741 ppc_opc_gen_interpret(ppc_opc_vaddsbs);
3742 return flowEndBlock;
3743 }
3744
3745 static inline void helper_gen_vadduhs(NativeReg reg, NativeReg reg2, modrm_p modrma, modrm_p modrmb)
3746 {
3747 asmALU(X86_MOV, reg, modrma);
3748 asmALU(X86_MOV, reg2, reg);
3749
3750 asmALU(X86_XOR, reg, modrmb);
3751 asmALU(X86_AND, reg2, modrmb);
3752
3753 asmShift(X86_SHR, reg, 1);
3754 asmALU(X86_AND, reg, 0x7fff7fff);
3755
3756 asmALU(X86_ADD, reg2, reg);
3757 asmALU(X86_AND, reg2, 0x80008000);
3758
3759 asmShift(X86_SHR, reg2, 15);
3760 asmALU(X86_ADD, reg2, 0x7fff7fff);
3761 asmALU(X86_XOR, reg2, 0x7fff7fff);
3762 NativeAddress skip = asmJxxFixup(X86_Z);
3763
3764 vec_raise_saturate();
3765
3766 asmResolveFixup(skip);
3767 }
3768
3769 /* vadduhs Vector Add Unsigned Half Word Saturate
3770 * v.144
3771 */
3772 void ppc_opc_vadduhs()
3773 {
3774 VECTOR_DEBUG;
3775 int vrD, vrA, vrB;
3776 uint32 res;
3777 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3778
3779 for (int i=0; i<8; i++) {
3780 res = (uint32)gCPU.vr[vrA].h[i] + (uint32)gCPU.vr[vrB].h[i];
3781 gCPU.vr[vrD].h[i] = SATURATE_UH(res);
3782 }
3783 }
3784 JITCFlow ppc_opc_gen_vadduhs()
3785 {
3786 #if 0
3787 ppc_opc_gen_interpret(ppc_opc_vadduhs);
3788 return flowEndBlock;
3789 #else
3790 int vrD, vrA, vrB;
3791 modrm_o modrma, modrmb;
3792 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
3793
3794 if (SSE2_AVAIL) {
3795 commutative_operation(PALUW(X86_PADDUS), vrD, vrA, vrB);
3796 return flowContinue;
3797 }
3798
3799 jitcFlushClientVectorRegister(vrA);
3800 jitcFlushClientVectorRegister(vrB);
3801 jitcDropClientVectorRegister(vrD);
3802
3803 jitcClobberCarryAndFlags();
3804 NativeReg reg = jitcAllocRegister();
3805 NativeReg reg2 = jitcAllocRegister();
3806
3807 if ((vrA == vrD) || (vrB == vrD)) {
3808 int vrS = vrB;
3809
3810 if (vrD == vrB) {
3811 vrS = vrA;
3812 }
3813
3814 for (int i=0; i<16; i += 4) {
3815 x86_mem2(modrma, &(gCPU.vr[vrD].b[i]));
3816 x86_mem2(modrmb, &(gCPU.vr[vrS].b[i]));
3817
3818 helper_gen_vadduhs(reg, reg2, modrma, modrmb);
3819
3820 asmALU(X86_MOV, reg, modrmb);
3821 asmALU(X86_SUB, reg, reg2);
3822 asmALU(X86_ADD, modrma, reg);
3823 asmALU(X86_OR, modrma, reg2);
3824 }
3825 } else {
3826 for (int i=0; i<16; i += 4) {
3827 x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
3828 x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
3829
3830 helper_gen_vadduhs(reg, reg2, modrma, modrmb);
3831
3832 asmALU(X86_MOV, reg, modrmb);
3833 asmALU(X86_SUB, reg, reg2);
3834 asmALU(X86_ADD, reg, modrma);
3835 asmALU(X86_OR, reg, reg2);
3836
3837 vec_setWord(vrD, i, reg);
3838 }
3839 }
3840
3841 return flowContinue;
3842
3843 #endif
3844 }
3845
3846 /* vaddshs Vector Add Signed Half Word Saturate
3847 * v.139
3848 */
3849 void ppc_opc_vaddshs()
3850 {
3851 VECTOR_DEBUG;
3852 int vrD, vrA, vrB;
3853 sint32 res;
3854 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3855
3856 for (int i=0; i<8; i++) {
3857 res = (sint32)gCPU.vr[vrA].sh[i] + (sint32)gCPU.vr[vrB].sh[i];
3858 gCPU.vr[vrD].h[i] = SATURATE_SH(res);
3859 }
3860 }
3861 JITCFlow ppc_opc_gen_vaddshs()
3862 {
3863 #if 0
3864 ppc_opc_gen_interpret(ppc_opc_vaddshs);
3865 return flowEndBlock;
3866 #else
3867 int vrD, vrA, vrB;
3868 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
3869
3870 if (SSE2_AVAIL) {
3871 commutative_operation(PALUW(X86_PADDS), vrD, vrA, vrB);
3872 return flowContinue;
3873 }
3874
3875 jitcFlushClientVectorRegister(vrA);
3876 jitcFlushClientVectorRegister(vrB);
3877 jitcDropClientVectorRegister(vrD);
3878
3879 jitcClobberCarryAndFlags();
3880 NativeReg reg = jitcAllocRegister();
3881 NativeReg reg2 = jitcAllocRegister();
3882
3883 for (int i=0; i<16; i += 2) {
3884 vec_getHalfS(reg, vrA, i);
3885 vec_getHalfS(reg2, vrB, i);
3886
3887 asmALU(X86_ADD, reg, reg2);
3888 vec_saturateSH(reg, reg2);
3889
3890 vec_setHalf(vrD, i, (NativeReg16)reg);
3891 }
3892
3893 return flowContinue;
3894 #endif
3895 }
3896
3897 /* vadduws Vector Add Unsigned Word Saturate
3898 * v.146
3899 */
3900 void ppc_opc_vadduws()
3901 {
3902 VECTOR_DEBUG;
3903 int vrD, vrA, vrB;
3904 uint32 res;
3905 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3906
3907 for (int i=0; i<4; i++) {
3908 res = gCPU.vr[vrA].w[i] + gCPU.vr[vrB].w[i];
3909
3910 // We do this to prevent us from having to do 64-bit math
3911 if (res < gCPU.vr[vrA].w[i]) {
3912 res = 0xFFFFFFFF;
3913 gCPU.vscr |= VSCR_SAT;
3914 }
3915
3916 /* 64-bit math | 32-bit hack
3917 * ------------------------+-------------------------------------
3918 * add, addc (a+b) | add (a+b)
3919 * sub, subb (r>ub) | sub (r<a)
3920 */
3921
3922 gCPU.vr[vrD].w[i] = res;
3923 }
3924 }
3925 JITCFlow ppc_opc_gen_vadduws()
3926 {
3927 ppc_opc_gen_interpret(ppc_opc_vadduws);
3928 return flowEndBlock;
3929 }
3930
3931 /* vaddsws Vector Add Signed Word Saturate
3932 * v.140
3933 */
3934 void ppc_opc_vaddsws()
3935 {
3936 VECTOR_DEBUG;
3937 int vrD, vrA, vrB;
3938 uint32 res;
3939 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3940
3941 for (int i=0; i<4; i++) {
3942 res = gCPU.vr[vrA].w[i] + gCPU.vr[vrB].w[i];
3943
3944 // We do this to prevent us from having to do 64-bit math
3945 if (((gCPU.vr[vrA].w[i] ^ gCPU.vr[vrB].w[i]) & SIGN32) == 0) {
3946 // the signs of both operands are the same
3947
3948 if (((res ^ gCPU.vr[vrA].w[i]) & SIGN32) != 0) {
3949 // sign of result != sign of operands
3950
3951 // if res is negative, should have been positive
3952 res = (res & SIGN32) ? (SIGN32 - 1) : SIGN32;
3953 gCPU.vscr |= VSCR_SAT;
3954 }
3955 }
3956
3957 /* 64-bit math | 32-bit hack
3958 * ------------------------+-------------------------------------
3959 * add, addc (a+b) | add (a+b)
3960 * sub, subb (r>ub) | xor, and (sign == sign)
3961 * sub, subb (r<lb) | xor, and (sign != sign)
3962 * | and (which)
3963 */
3964
3965 gCPU.vr[vrD].w[i] = res;
3966 }
3967 }
3968 JITCFlow ppc_opc_gen_vaddsws()
3969 {
3970 ppc_opc_gen_interpret(ppc_opc_vaddsws);
3971 return flowEndBlock;
3972 }
3973
3974 static inline void helper_gen_vsububm(NativeReg reg, NativeReg reg2, modrm_p modrma, modrm_p modrmb)
3975 {
3976 asmALU(X86_MOV, reg, modrma);
3977 asmALU(X86_NOT, reg);
3978 asmALU(X86_MOV, reg2, reg);
3979
3980 asmALU(X86_AND, reg, modrmb);
3981 asmALU(X86_XOR, reg2, modrmb);
3982
3983 asmShift(X86_SHL, reg, 1);
3984 asmALU(X86_AND, reg2, 0xfefefefe);
3985
3986 asmALU(X86_ADD, reg2, reg);
3987 asmALU(X86_AND, reg2, 0x01010100);
3988 }
3989
3990 /* vsububm Vector Subtract Unsigned Byte Modulo
3991 * v.265
3992 */
3993 void ppc_opc_vsububm()
3994 {
3995 VECTOR_DEBUG;
3996 int vrD, vrA, vrB;
3997 uint8 res;
3998 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3999
4000 for (int i=0; i<16; i++) {
4001 res = gCPU.vr[vrA].b[i] - gCPU.vr[vrB].b[i];
4002 gCPU.vr[vrD].b[i] = res;
4003 }
4004 }
4005 JITCFlow ppc_opc_gen_vsububm()
4006 {
4007 #if 0
4008 ppc_opc_gen_interpret(ppc_opc_vsububm);
4009 return flowEndBlock;
4010 #else
4011 int vrD, vrA, vrB;
4012 modrm_o modrma, modrmb;
4013 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4014
4015 if (SSE2_AVAIL) {
4016 noncommutative_operation(PALUB(X86_PSUB), vrD, vrA, vrB);
4017 return flowContinue;
4018 }
4019
4020 jitcFlushClientVectorRegister(vrA);
4021 jitcFlushClientVectorRegister(vrB);
4022 jitcDropClientVectorRegister(vrD);
4023
4024 jitcClobberCarryAndFlags();
4025 NativeReg reg = jitcAllocRegister();
4026 NativeReg reg2 = jitcAllocRegister();
4027
4028 if (vrA == vrD) {
4029 for (int i=0; i<16; i += 4) {
4030 x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
4031 x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
4032
4033 helper_gen_vsububm(reg, reg2, modrma, modrmb);
4034
4035 asmALU(X86_SUB, reg2, modrmb);
4036 asmALU(X86_ADD, modrma, reg2);
4037 }
4038 } else {
4039 for (int i=0; i<16; i += 4) {
4040 x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
4041 x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
4042
4043 helper_gen_vsububm(reg, reg2, modrma, modrmb);
4044
4045 asmALU(X86_SUB, reg2, modrmb);
4046 asmALU(X86_ADD, reg2, modrma);
4047
4048 vec_setWord(vrD, i, reg2);
4049 }
4050 }
4051
4052 return flowContinue;
4053 #endif
4054 }
4055
4056 static inline void helper_gen_vsubuhm(NativeReg reg, NativeReg reg2, modrm_p modrma, modrm_p modrmb)
4057 {
4058 asmALU(X86_MOV, reg, modrma);
4059 asmALU(X86_NOT, reg);
4060 asmALU(X86_MOV, reg2, reg);
4061
4062 asmALU(X86_AND, reg, modrmb);
4063 asmALU(X86_XOR, reg2, modrmb);
4064
4065 asmShift(X86_SHL, reg, 1);
4066 asmALU(X86_AND, reg2, 0xfffefffe);
4067
4068 asmALU(X86_ADD, reg2, reg);
4069 asmALU(X86_AND, reg2, 0x00010000);
4070 }
4071
4072
4073 /* vsubuhm Vector Subtract Unsigned Half Word Modulo
4074 * v.267
4075 */
4076 void ppc_opc_vsubuhm()
4077 {
4078 VECTOR_DEBUG;
4079 int vrD, vrA, vrB;
4080 uint16 res;
4081 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4082
4083 for (int i=0; i<8; i++) {
4084 res = gCPU.vr[vrA].h[i] - gCPU.vr[vrB].h[i];
4085 gCPU.vr[vrD].h[i] = res;
4086 }
4087 }
4088 JITCFlow ppc_opc_gen_vsubuhm()
4089 {
4090 #if 0
4091 ppc_opc_gen_interpret(ppc_opc_vsubuhm);
4092 return flowEndBlock;
4093 #else
4094 int vrD, vrA, vrB;
4095 modrm_o modrma, modrmb;
4096 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4097
4098 if (SSE2_AVAIL) {
4099 noncommutative_operation(PALUW(X86_PSUB), vrD, vrA, vrB);
4100 return flowContinue;
4101 }
4102
4103 jitcFlushClientVectorRegister(vrA);
4104 jitcFlushClientVectorRegister(vrB);
4105 jitcDropClientVectorRegister(vrD);
4106
4107 NativeReg reg = jitcAllocRegister();
4108 NativeReg reg2 = jitcAllocRegister();
4109
4110 if (vrA == vrD) {
4111 for (int i=0; i<16; i += 4) {
4112 x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
4113 x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
4114
4115 helper_gen_vsubuhm(reg, reg2, modrma, modrmb);
4116
4117 asmALU(X86_SUB, reg2, modrmb);
4118 asmALU(X86_ADD, modrma, reg2);
4119 }
4120 } else {
4121 for (int i=0; i<16; i += 4) {
4122 x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
4123 x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
4124
4125 helper_gen_vsubuhm(reg, reg2, modrma, modrmb);
4126
4127 asmALU(X86_SUB, reg2, modrmb);
4128 asmALU(X86_ADD, reg2, modrma);
4129
4130 vec_setWord(vrD, i, reg2);
4131 }
4132 }
4133
4134 return flowContinue;
4135 #endif
4136 }
4137
4138 /* vsubuwm Vector Subtract Unsigned Word Modulo
4139 * v.269
4140 */
4141 void ppc_opc_vsubuwm()
4142 {
4143 VECTOR_DEBUG;
4144 int vrD, vrA, vrB;
4145 uint32 res;
4146 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4147
4148 for (int i=0; i<4; i++) {
4149 res = gCPU.vr[vrA].w[i] - gCPU.vr[vrB].w[i];
4150 gCPU.vr[vrD].w[i] = res;
4151 }
4152 }
4153 JITCFlow ppc_opc_gen_vsubuwm()
4154 {
4155 #if 0
4156 ppc_opc_gen_interpret(ppc_opc_vsubuwm);
4157 return flowEndBlock;
4158 #else
4159 int vrD, vrA, vrB;
4160 modrm_o modrm;
4161 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4162
4163 if (SSE2_AVAIL) {
4164 noncommutative_operation(PALUD(X86_PSUB), vrD, vrA, vrB);
4165 return flowContinue;
4166 }
4167
4168 jitcFlushClientVectorRegister(vrA);
4169 jitcFlushClientVectorRegister(vrB);
4170 jitcDropClientVectorRegister(vrD);
4171 NativeReg reg = jitcAllocRegister();
4172
4173 if (vrA == vrD) {
4174 for (int i=0; i<16; i += 4) {
4175 vec_getWord(reg, vrB, i);
4176
4177 asmALU(X86_SUB,
4178 x86_mem2(modrm, &(gCPU.vr[vrD].b[i])), reg);
4179 }
4180 } else {
4181 for (int i=0; i<16; i += 4) {
4182 vec_getWord(reg, vrA, i);
4183
4184 asmALU(X86_SUB, reg,
4185 x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
4186
4187 vec_setWord(vrD, i, reg);
4188 }
4189 }
4190
4191 return flowContinue;
4192 #endif
4193 }
4194
4195 /* vsubfp Vector Subtract Float Point
4196 * v.261
4197 */
4198 void ppc_opc_vsubfp()
4199 {
4200 VECTOR_DEBUG;
4201 int vrD, vrA, vrB;
4202 float res;
4203 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4204
4205 for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
4206 res = gCPU.vr[vrA].f[i] - gCPU.vr[vrB].f[i];
4207 gCPU.vr[vrD].f[i] = res;
4208 }
4209 }
4210 JITCFlow ppc_opc_gen_vsubfp()
4211 {
4212 #if 0
4213 ppc_opc_gen_interpret(ppc_opc_vsubfp);
4214 return flowEndBlock;
4215 #else
4216 int vrD, vrA, vrB;
4217 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4218
4219 if (SSE_AVAIL) {
4220 noncommutative_operation(X86_SUBPS, vrD, vrA, vrB);
4221 return flowContinue;
4222 }
4223
4224 ppc_opc_gen_interpret(ppc_opc_vsubfp);
4225 return flowEndBlock;
4226 #endif
4227 }
4228
4229 /* vsubcuw Vector Subtract Carryout Unsigned Word
4230 * v.260
4231 */
4232 void ppc_opc_vsubcuw()
4233 {
4234 VECTOR_DEBUG;
4235 int vrD, vrA, vrB;
4236 uint32 res;
4237 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4238
4239 for (int i=0; i<4; i++) {
4240 res = gCPU.vr[vrA].w[i] - gCPU.vr[vrB].w[i];
4241 gCPU.vr[vrD].w[i] = (res <= gCPU.vr[vrA].w[i]) ? 1 : 0;
4242 }
4243 }
4244 JITCFlow ppc_opc_gen_vsubcuw()
4245 {
4246 #if 0
4247 ppc_opc_gen_interpret(ppc_opc_vsubcuw);
4248 return flowEndBlock;
4249 #else
4250 int vrD, vrA, vrB;
4251 modrm_o modrm;
4252 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4253
4254 if (vrA == vrB) {
4255 // no need to flush vrA, vrB, their values are irrelevant here
4256 jitcDropClientVectorRegister(vrD);
4257
4258 NativeReg reg = jitcAllocRegister();
4259 asmMOV_NoFlags(reg, 1); // avoid flag clobber
4260
4261 vec_setWord(vrD, 0, reg);
4262 vec_setWord(vrD, 4, reg);
4263 vec_setWord(vrD, 8, reg);
4264 vec_setWord(vrD,12, reg);
4265 } else {
4266 jitcFlushClientVectorRegister(vrA);
4267 jitcFlushClientVectorRegister(vrB);
4268 jitcFlushClientVectorRegister(vrD);
4269
4270 NativeReg reg = jitcAllocRegister(NATIVE_REG_8);
4271
4272 for (int i=0; i<16; i += 4) {
4273 vec_getWord(reg, vrA, i);
4274
4275 asmALU(X86_SUB, reg,
4276 x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
4277
4278 asmSET(X86_NC, (NativeReg8)reg);
4279
4280 asmMOVxx(X86_MOVZX, reg, (NativeReg8)reg);
4281
4282 vec_setWord(vrD, i, reg);
4283 }
4284 }
4285
4286 return flowContinue;
4287 #endif
4288 }
4289
4290 static inline void helper_gen_vsububs(NativeReg reg, NativeReg reg2, modrm_p modrma, modrm_p modrmb)
4291 {
4292 asmALU(X86_MOV, reg, modrma);
4293 asmALU(X86_NOT, reg);
4294 asmALU(X86_MOV, reg2, reg);
4295
4296 asmALU(X86_XOR, reg, modrmb);
4297 asmALU(X86_AND, reg2, modrmb);
4298
4299 asmShift(X86_SHR, reg, 1);
4300 asmALU(X86_AND, reg, 0x7f7f7f7f);
4301
4302 asmALU(X86_ADD, reg2, reg);
4303 asmALU(X86_AND, reg2, 0x80808080);
4304
4305 asmShift(X86_SHR, reg2, 7);
4306 asmALU(X86_ADD, reg2, 0x7f7f7f7f);
4307 asmALU(X86_XOR, reg2, 0x7f7f7f7f);
4308 NativeAddress skip = asmJxxFixup(X86_Z);
4309
4310 vec_raise_saturate();
4311
4312 asmResolveFixup(skip);
4313 }
4314
4315 /* vsububs Vector Subtract Unsigned Byte Saturate
4316 * v.266
4317 */
4318 void ppc_opc_vsububs()
4319 {
4320 VECTOR_DEBUG;
4321 int vrD, vrA, vrB;
4322 uint16 res;
4323 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4324
4325 for (int i=0; i<16; i++) {
4326 res = (uint16)gCPU.vr[vrA].b[i] - (uint16)gCPU.vr[vrB].b[i];
4327
4328 gCPU.vr[vrD].b[i] = SATURATE_0B(res);
4329 }
4330 }
4331 JITCFlow ppc_opc_gen_vsububs()
4332 {
4333 #if 0
4334 ppc_opc_gen_interpret(ppc_opc_vsububs);
4335 return flowEndBlock;
4336 #else
4337 int vrD, vrA, vrB;
4338 modrm_o modrma, modrmb;
4339 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4340
4341 if (SSE2_AVAIL) {
4342 noncommutative_operation(PALUB(X86_PSUBUS), vrD, vrA, vrB);
4343 return flowContinue;
4344 }
4345
4346 jitcFlushClientVectorRegister(vrA);
4347 jitcFlushClientVectorRegister(vrB);
4348 jitcDropClientVectorRegister(vrD);
4349
4350 jitcClobberCarryAndFlags();
4351 NativeReg reg = jitcAllocRegister();
4352 NativeReg reg2 = jitcAllocRegister();
4353
4354 if (vrA == vrD) {
4355 for (int i=0; i<16; i += 4) {
4356 x86_mem2(modrma, &(gCPU.vr[vrD].b[i]));
4357 x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
4358
4359 helper_gen_vsububs(reg, reg2, modrma, modrmb);
4360
4361 asmALU(X86_MOV, reg, modrmb);
4362 asmALU(X86_OR, reg, reg2);
4363
4364 asmALU(X86_OR, modrma, reg2);
4365 asmALU(X86_SUB, modrma, reg);
4366 }
4367 } else {
4368 for (int i=0; i<16; i += 4) {
4369 x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
4370 x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
4371
4372 helper_gen_vsububs(reg, reg2, modrma, modrmb);
4373
4374 asmALU(X86_MOV, reg, modrmb);
4375 asmALU(X86_OR, reg, reg2);
4376
4377 asmALU(X86_OR, reg2, modrma);
4378 asmALU(X86_SUB, reg2, reg);
4379
4380 vec_setWord(vrD, i, reg2);
4381 }
4382 }
4383
4384 return flowContinue;
4385 #endif
4386 }
4387
4388 /* vsubsbs Vector Subtract Signed Byte Saturate
4389 * v.262
4390 */
4391 void ppc_opc_vsubsbs()
4392 {
4393 VECTOR_DEBUG;
4394 int vrD, vrA, vrB;
4395 sint16 res;
4396 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4397
4398 for (int i=0; i<16; i++) {
4399 res = (sint16)gCPU.vr[vrA].sb[i] - (sint16)gCPU.vr[vrB].sb[i];
4400
4401 gCPU.vr[vrD].sb[i] = SATURATE_SB(res);
4402 }
4403 }
4404 JITCFlow ppc_opc_gen_vsubsbs()
4405 {
4406 #if 0
4407 ppc_opc_gen_interpret(ppc_opc_vsubsbs);
4408 return flowEndBlock;
4409 #else
4410 int vrD, vrA, vrB;
4411 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4412
4413 if (SSE2_AVAIL) {
4414 noncommutative_operation(PALUB(X86_PSUBS), vrD, vrA, vrB);
4415 return flowContinue;
4416 }
4417
4418 jitcFlushClientVectorRegister(vrA);
4419 jitcFlushClientVectorRegister(vrB);
4420 jitcDropClientVectorRegister(vrD);
4421
4422 jitcClobberCarryAndFlags();
4423 NativeReg reg = jitcAllocRegister(NATIVE_REG_8);
4424 NativeReg reg2 = jitcAllocRegister();
4425
4426 for (int i=0; i<16; i++) {
4427 vec_getByteS(reg, vrA, i);
4428 vec_getByteS(reg2, vrB, i);
4429
4430 asmALU(X86_SUB, reg, reg2);
4431 vec_saturateSB(reg, reg2);
4432
4433 vec_setByte(vrD, REG_NO, i, (NativeReg8)reg);
4434 }
4435
4436 return flowContinue;
4437 #endif
4438 }
4439
4440 static inline void helper_gen_vsubuhs(NativeReg reg, NativeReg reg2, modrm_p modrma, modrm_p modrmb)
4441 {
4442 asmALU(X86_MOV, reg, modrma);
4443 asmALU(X86_NOT, reg);
4444 asmALU(X86_MOV, reg2, reg);
4445
4446 asmALU(X86_XOR, reg, modrmb);
4447 asmALU(X86_AND, reg2, modrmb);
4448
4449 asmShift(X86_SHR, reg, 1);
4450 asmALU(X86_AND, reg, 0x7fff7fff);
4451
4452 asmALU(X86_ADD, reg2, reg);
4453 asmALU(X86_AND, reg2, 0x80008000);
4454
4455 asmShift(X86_SHR, reg2, 15);
4456 asmALU(X86_ADD, reg2, 0x7fff7fff);
4457 asmALU(X86_XOR, reg2, 0x7fff7fff);
4458 NativeAddress skip = asmJxxFixup(X86_Z);
4459
4460 vec_raise_saturate();
4461
4462 asmResolveFixup(skip);
4463 }
4464
4465 /* vsubuhs Vector Subtract Unsigned Half Word Saturate
4466 * v.268
4467 */
4468 void ppc_opc_vsubuhs()
4469 {
4470 VECTOR_DEBUG;
4471 int vrD, vrA, vrB;
4472 uint32 res;
4473 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4474
4475 for (int i=0; i<8; i++) {
4476 res = (uint32)gCPU.vr[vrA].h[i] - (uint32)gCPU.vr[vrB].h[i];
4477
4478 gCPU.vr[vrD].h[i] = SATURATE_0H(res);
4479 }
4480 }
4481 JITCFlow ppc_opc_gen_vsubuhs()
4482 {
4483 #if 0
4484 ppc_opc_gen_interpret(ppc_opc_vsubuhs);
4485 return flowEndBlock;
4486 #else
4487 int vrD, vrA, vrB;
4488 modrm_o modrma, modrmb;
4489 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4490
4491 if (SSE2_AVAIL) {
4492 noncommutative_operation(PALUW(X86_PSUBUS), vrD, vrA, vrB);
4493 return flowContinue;
4494 }
4495
4496 jitcFlushClientVectorRegister(vrA);
4497 jitcFlushClientVectorRegister(vrB);
4498 jitcDropClientVectorRegister(vrD);
4499
4500 jitcClobberCarryAndFlags();
4501 NativeReg reg = jitcAllocRegister();
4502 NativeReg reg2 = jitcAllocRegister();
4503
4504 if (vrA == vrD) {
4505 for (int i=0; i<16; i += 4) {
4506 x86_mem2(modrma, &(gCPU.vr[vrD].b[i]));
4507 x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
4508
4509 helper_gen_vsubuhs(reg, reg2, modrma, modrmb);
4510
4511 asmALU(X86_MOV, reg, modrmb);
4512 asmALU(X86_OR, reg, reg2);
4513
4514 asmALU(X86_OR, modrma, reg2);
4515 asmALU(X86_SUB, modrma, reg);
4516 }
4517 } else {
4518 for (int i=0; i<16; i += 4) {
4519 x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
4520 x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
4521
4522 helper_gen_vsubuhs(reg, reg2, modrma, modrmb);
4523
4524 asmALU(X86_MOV, reg, modrmb);
4525 asmALU(X86_OR, reg, reg2);
4526
4527 asmALU(X86_OR, reg2, modrma);
4528 asmALU(X86_SUB, reg2, reg);
4529
4530 vec_setWord(vrD, i, reg2);
4531 }
4532 }
4533
4534 return flowContinue;
4535 #endif
4536 }
4537
4538 /* vsubshs Vector Subtract Signed Half Word Saturate
4539 * v.263
4540 */
4541 void ppc_opc_vsubshs()
4542 {
4543 VECTOR_DEBUG;
4544 int vrD, vrA, vrB;
4545 sint32 res;
4546 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4547
4548 for (int i=0; i<8; i++) {
4549 res = (sint32)gCPU.vr[vrA].sh[i] - (sint32)gCPU.vr[vrB].sh[i];
4550
4551 gCPU.vr[vrD].sh[i] = SATURATE_SH(res);
4552 }
4553 }
4554 JITCFlow ppc_opc_gen_vsubshs()
4555 {
4556 #if 0
4557 ppc_opc_gen_interpret(ppc_opc_vsubshs);
4558 return flowEndBlock;
4559 #else
4560 int vrD, vrA, vrB;
4561 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4562
4563 if (SSE2_AVAIL) {
4564 noncommutative_operation(PALUW(X86_PSUBS), vrD, vrA, vrB);
4565 return flowContinue;
4566 }
4567
4568 jitcFlushClientVectorRegister(vrA);
4569 jitcFlushClientVectorRegister(vrB);
4570 jitcDropClientVectorRegister(vrD);
4571
4572 jitcClobberCarryAndFlags();
4573 NativeReg reg = jitcAllocRegister();
4574 NativeReg reg2 = jitcAllocRegister();
4575
4576 for (int i=0; i<16; i += 2) {
4577 vec_getHalfS(reg, vrA, i);
4578 vec_getHalfS(reg2, vrB, i);
4579
4580 asmALU(X86_SUB, reg, reg2);
4581 vec_saturateSH(reg, reg2);
4582
4583 vec_setHalf(vrD, i, (NativeReg16)reg);
4584 }
4585
4586 return flowContinue;
4587 #endif
4588 }
4589
4590 /* vsubuws Vector Subtract Unsigned Word Saturate
4591 * v.270
4592 */
4593 void ppc_opc_vsubuws()
4594 {
4595 VECTOR_DEBUG;
4596 int vrD, vrA, vrB;
4597 uint32 res;
4598 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4599
4600 for (int i=0; i<4; i++) {
4601 res = gCPU.vr[vrA].w[i] - gCPU.vr[vrB].w[i];
4602
4603 // We do this to prevent us from having to do 64-bit math
4604 if (res > gCPU.vr[vrA].w[i]) {
4605 res = 0;
4606 gCPU.vscr |= VSCR_SAT;
4607 }
4608
4609 /* 64-bit math | 32-bit hack
4610 * ------------------------+-------------------------------------
4611 * sub, subb (a+b) | sub (a+b)
4612 * sub, subb (r>ub) | sub (r<a)
4613 */
4614
4615 gCPU.vr[vrD].w[i] = res;
4616 }
4617 }
4618 JITCFlow ppc_opc_gen_vsubuws()
4619 {
4620 #if 0
4621 ppc_opc_gen_interpret(ppc_opc_vsubuws);
4622 return flowEndBlock;
4623 #else
4624 int vrD, vrA, vrB;
4625 modrm_o modrm;
4626 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4627
4628 jitcFlushClientVectorRegister(vrA);
4629 jitcFlushClientVectorRegister(vrB);
4630 jitcDropClientVectorRegister(vrD);
4631
4632 jitcClobberCarryAndFlags();
4633 NativeReg reg = jitcAllocRegister();
4634
4635 for (int i=0; i<16; i += 4) {
4636 vec_getWord(reg, vrA, i);
4637
4638 asmALU(X86_SUB, reg,
4639 x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
4640
4641 NativeAddress skip = asmJxxFixup(X86_NC);
4642
4643 asmALU(X86_XOR, reg, reg);
4644
4645 asmResolveFixup(skip);
4646
4647 vec_setWord(vrD, i, reg);
4648 }
4649
4650 return flowContinue;
4651 #endif
4652 }
4653
4654 /* vsubsws Vector Subtract Signed Word Saturate
4655 * v.264
4656 */
4657 void ppc_opc_vsubsws()
4658 {
4659 VECTOR_DEBUG;
4660 int vrD, vrA, vrB;
4661 uint32 res, tmp;
4662 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4663
4664 for (int i=0; i<4; i++) {
4665 tmp = -gCPU.vr[vrB].w[i];
4666 res = gCPU.vr[vrA].w[i] + tmp;
4667
4668 // We do this to prevent us from having to do 64-bit math
4669 if (((gCPU.vr[vrA].w[i] ^ tmp) & SIGN32) == 0) {
4670 // the signs of both operands are the same
4671
4672 if (((res ^ tmp) & SIGN32) != 0) {
4673 // sign of result != sign of operands
4674
4675 // if res is negative, should have been positive
4676 res = (res & SIGN32) ? (SIGN32 - 1) : SIGN32;
4677 gCPU.vscr |= VSCR_SAT;
4678 }
4679 }
4680
4681 /* 64-bit math | 32-bit hack
4682 * ------------------------+-------------------------------------
4683 * sub, subc (a+b) | neg, add (a-b)
4684 * sub, subb (r>ub) | xor, and (sign == sign)
4685 * sub, subb (r<lb) | xor, and (sign != sign)
4686 * | and (which)
4687 */
4688
4689 gCPU.vr[vrD].w[i] = res;
4690 }
4691 }
4692 JITCFlow ppc_opc_gen_vsubsws()
4693 {
4694 #if 0
4695 ppc_opc_gen_interpret(ppc_opc_vsubsws);
4696 return flowEndBlock;
4697 #else
4698 int vrD, vrA, vrB;
4699 modrm_o modrm;
4700 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4701
4702 jitcFlushClientVectorRegister(vrA);
4703 jitcFlushClientVectorRegister(vrB);
4704 jitcDropClientVectorRegister(vrD);
4705
4706 jitcClobberCarryAndFlags();
4707 NativeReg reg = jitcAllocRegister();
4708
4709 for (int i=0; i<16; i += 4) {
4710 vec_getWord(reg, vrA, i);
4711
4712 asmALU(X86_SUB, reg,
4713 x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
4714
4715 NativeAddress skip = asmJxxFixup(X86_NO);
4716
4717 asmMOV_NoFlags(reg, 0x80000000);
4718
4719 asmResolveFixup(skip);
4720
4721 vec_setWord(vrD, i, reg);
4722 }
4723
4724 return flowContinue;
4725 #endif
4726 }
4727
4728 /* vmuleub Vector Multiply Even Unsigned Byte
4729 * v.209
4730 */
4731 void ppc_opc_vmuleub()
4732 {
4733 VECTOR_DEBUG;
4734 int vrD, vrA, vrB;
4735 uint16 res;
4736 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4737
4738 for (int i=0; i<8; i++) {
4739 res = (uint16)gCPU.vr[vrA].b[VECT_EVEN(i)] *
4740 (uint16)gCPU.vr[vrB].b[VECT_EVEN(i)];
4741
4742 gCPU.vr[vrD].h[i] = res;
4743 }
4744 }
4745 JITCFlow ppc_opc_gen_vmuleub()
4746 {
4747 #if 0
4748 ppc_opc_gen_interpret(ppc_opc_vmuleub);
4749 return flowEndBlock;
4750 #else
4751 int vrD, vrA, vrB;
4752 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4753
4754 jitcFlushClientVectorRegister(vrA);
4755 jitcFlushClientVectorRegister(vrB);
4756 jitcDropClientVectorRegister(vrD);
4757
4758 jitcClobberCarryAndFlags();
4759 NativeReg reg1 = jitcAllocRegister();
4760 NativeReg reg2 = jitcAllocRegister();
4761
4762 for (int i=0; i<16; i+=2) {
4763 vec_getByteZ(reg1, vrA, i+1);
4764 vec_getByteZ(reg2, vrB, i+1);
4765
4766 asmIMUL(reg1, reg2);
4767
4768 vec_setHalf(vrD, i, (NativeReg16)reg1);
4769 }
4770
4771 return flowContinue;
4772 #endif
4773 }
4774
4775 /* vmulesb Vector Multiply Even Signed Byte
4776 * v.207
4777 */
4778 void ppc_opc_vmulesb()
4779 {
4780 VECTOR_DEBUG;
4781 int vrD, vrA, vrB;
4782 sint16 res;
4783 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4784
4785 for (int i=0; i<8; i++) {
4786 res = (sint16)gCPU.vr[vrA].sb[VECT_EVEN(i)] *
4787 (sint16)gCPU.vr[vrB].sb[VECT_EVEN(i)];
4788
4789 gCPU.vr[vrD].sh[i] = res;
4790 }
4791 }
4792 JITCFlow ppc_opc_gen_vmulesb()
4793 {
4794 #if 0
4795 ppc_opc_gen_interpret(ppc_opc_vmulesb);
4796 return flowEndBlock;
4797 #else
4798 int vrD, vrA, vrB;
4799 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4800
4801 jitcFlushClientVectorRegister(vrA);
4802 jitcFlushClientVectorRegister(vrB);
4803 jitcDropClientVectorRegister(vrD);
4804
4805 jitcClobberCarryAndFlags();
4806 NativeReg reg1 = jitcAllocRegister();
4807 NativeReg reg2 = jitcAllocRegister();
4808
4809 for (int i=0; i<16; i+=2) {
4810 vec_getByteS(reg1, vrA, i+1);
4811 vec_getByteS(reg2, vrB, i+1);
4812
4813 asmIMUL(reg1, reg2);
4814
4815 vec_setHalf(vrD, i, (NativeReg16)reg1);
4816 }
4817
4818 return flowContinue;
4819 #endif
4820 }
4821
4822 /* vmuleuh Vector Multiply Even Unsigned Half Word
4823 * v.210
4824 */
4825 void ppc_opc_vmuleuh()
4826 {
4827 VECTOR_DEBUG;
4828 int vrD, vrA, vrB;
4829 uint32 res;
4830 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4831
4832 for (int i=0; i<4; i++) {
4833 res = (uint32)gCPU.vr[vrA].h[VECT_EVEN(i)] *
4834 (uint32)gCPU.vr[vrB].h[VECT_EVEN(i)];
4835
4836 gCPU.vr[vrD].w[i] = res;
4837 }
4838 }
4839 JITCFlow ppc_opc_gen_vmuleuh()
4840 {
4841 #if 0
4842 ppc_opc_gen_interpret(ppc_opc_vmuleuh);
4843 return flowEndBlock;
4844 #else
4845 int vrD, vrA, vrB;
4846 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4847
4848 jitcFlushClientVectorRegister(vrA);
4849 jitcFlushClientVectorRegister(vrB);
4850 jitcDropClientVectorRegister(vrD);
4851
4852 jitcClobberCarryAndFlags();
4853 NativeReg reg1 = jitcAllocRegister();
4854 NativeReg reg2 = jitcAllocRegister();
4855
4856 for (int i=0; i<16; i+=4) {
4857 vec_getHalfZ(reg1, vrA, i+2);
4858 vec_getHalfZ(reg2, vrB, i+2);
4859
4860 asmIMUL(reg1, reg2);
4861
4862 vec_setWord(vrD, i, reg1);
4863 }
4864
4865 return flowContinue;
4866 #endif
4867 }
4868
4869 /* vmulesh Vector Multiply Even Signed Half Word
4870 * v.208
4871 */
4872 void ppc_opc_vmulesh()
4873 {
4874 VECTOR_DEBUG;
4875 int vrD, vrA, vrB;
4876 sint32 res;
4877 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4878
4879 for (int i=0; i<4; i++) {
4880 res = (sint32)gCPU.vr[vrA].sh[VECT_EVEN(i)] *
4881 (sint32)gCPU.vr[vrB].sh[VECT_EVEN(i)];
4882
4883 gCPU.vr[vrD].sw[i] = res;
4884 }
4885 }
4886 JITCFlow ppc_opc_gen_vmulesh()
4887 {
4888 #if 0
4889 ppc_opc_gen_interpret(ppc_opc_vmulesh);
4890 return flowEndBlock;
4891 #else
4892 int vrD, vrA, vrB;
4893 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4894
4895 jitcFlushClientVectorRegister(vrA);
4896 jitcFlushClientVectorRegister(vrB);
4897 jitcDropClientVectorRegister(vrD);
4898
4899 jitcClobberCarryAndFlags();
4900 NativeReg reg1 = jitcAllocRegister();
4901 NativeReg reg2 = jitcAllocRegister();
4902
4903 for (int i=0; i<16; i+=4) {
4904 vec_getHalfS(reg1, vrA, i+2);
4905 vec_getHalfS(reg2, vrB, i+2);
4906
4907 asmIMUL(reg1, reg2);
4908
4909 vec_setWord(vrD, i, reg1);
4910 }
4911
4912 return flowContinue;
4913 #endif
4914 }
4915
4916 /* vmuloub Vector Multiply Odd Unsigned Byte
4917 * v.213
4918 */
4919 void ppc_opc_vmuloub()
4920 {
4921 VECTOR_DEBUG;
4922 int vrD, vrA, vrB;
4923 uint16 res;
4924 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4925
4926 for (int i=0; i<8; i++) {
4927 res = (uint16)gCPU.vr[vrA].b[VECT_ODD(i)] *
4928 (uint16)gCPU.vr[vrB].b[VECT_ODD(i)];
4929
4930 gCPU.vr[vrD].h[i] = res;
4931 }
4932 }
4933 JITCFlow ppc_opc_gen_vmuloub()
4934 {
4935 #if 0
4936 ppc_opc_gen_interpret(ppc_opc_vmuloub);
4937 return flowEndBlock;
4938 #else
4939 int vrD, vrA, vrB;
4940 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4941
4942 jitcFlushClientVectorRegister(vrA);
4943 jitcFlushClientVectorRegister(vrB);
4944 jitcDropClientVectorRegister(vrD);
4945
4946 jitcClobberCarryAndFlags();
4947 NativeReg reg1 = jitcAllocRegister();
4948 NativeReg reg2 = jitcAllocRegister();
4949
4950 for (int i=0; i<16; i+=2) {
4951 vec_getByteZ(reg1, vrA, i);
4952 vec_getByteZ(reg2, vrB, i);
4953
4954 asmIMUL(reg1, reg2);
4955
4956 vec_setHalf(vrD, i, (NativeReg16)reg1);
4957 }
4958
4959 return flowContinue;
4960 #endif
4961 }
4962
4963 /* vmulosb Vector Multiply Odd Signed Byte
4964 * v.211
4965 */
4966 void ppc_opc_vmulosb()
4967 {
4968 VECTOR_DEBUG;
4969 int vrD, vrA, vrB;
4970 sint16 res;
4971 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
4972
4973 for (int i=0; i<8; i++) {
4974 res = (sint16)gCPU.vr[vrA].sb[VECT_ODD(i)] *
4975 (sint16)gCPU.vr[vrB].sb[VECT_ODD(i)];
4976
4977 gCPU.vr[vrD].sh[i] = res;
4978 }
4979 }
4980 JITCFlow ppc_opc_gen_vmulosb()
4981 {
4982 #if 0
4983 ppc_opc_gen_interpret(ppc_opc_vmulosb);
4984 return flowEndBlock;
4985 #else
4986 int vrD, vrA, vrB;
4987 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
4988
4989 jitcFlushClientVectorRegister(vrA);
4990 jitcFlushClientVectorRegister(vrB);
4991 jitcDropClientVectorRegister(vrD);
4992
4993 jitcClobberCarryAndFlags();
4994 NativeReg reg1 = jitcAllocRegister();
4995 NativeReg reg2 = jitcAllocRegister();
4996
4997 for (int i=0; i<16; i+=2) {
4998 vec_getByteS(reg1, vrA, i);
4999 vec_getByteS(reg2, vrB, i);
5000
5001 asmIMUL(reg1, reg2);
5002
5003 vec_setHalf(vrD, i, (NativeReg16)reg1);
5004 }
5005
5006 return flowContinue;
5007 #endif
5008 }
5009
5010 /* vmulouh Vector Multiply Odd Unsigned Half Word
5011 * v.214
5012 */
5013 void ppc_opc_vmulouh()
5014 {
5015 VECTOR_DEBUG;
5016 int vrD, vrA, vrB;
5017 uint32 res;
5018 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5019
5020 for (int i=0; i<4; i++) {
5021 res = (uint32)gCPU.vr[vrA].h[VECT_ODD(i)] *
5022 (uint32)gCPU.vr[vrB].h[VECT_ODD(i)];
5023
5024 gCPU.vr[vrD].w[i] = res;
5025 }
5026 }
5027 JITCFlow ppc_opc_gen_vmulouh()
5028 {
5029 #if 0
5030 ppc_opc_gen_interpret(ppc_opc_vmulouh);
5031 return flowEndBlock;
5032 #else
5033 int vrD, vrA, vrB;
5034 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
5035
5036 jitcFlushClientVectorRegister(vrA);
5037 jitcFlushClientVectorRegister(vrB);
5038 jitcDropClientVectorRegister(vrD);
5039
5040 jitcClobberCarryAndFlags();
5041 NativeReg reg1 = jitcAllocRegister();
5042 NativeReg reg2 = jitcAllocRegister();
5043
5044 for (int i=0; i<16; i+=4) {
5045 vec_getHalfZ(reg1, vrA, i);
5046 vec_getHalfZ(reg2, vrB, i);
5047
5048 asmIMUL(reg1, reg2);
5049
5050 vec_setWord(vrD, i, reg1);
5051 }
5052
5053 return flowContinue;
5054 #endif
5055 }
5056
5057 /* vmulosh Vector Multiply Odd Signed Half Word
5058 * v.212
5059 */
5060 void ppc_opc_vmulosh()
5061 {
5062 VECTOR_DEBUG;
5063 int vrD, vrA, vrB;
5064 sint32 res;
5065 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5066
5067 for (int i=0; i<4; i++) {
5068 res = (sint32)gCPU.vr[vrA].sh[VECT_ODD(i)] *
5069 (sint32)gCPU.vr[vrB].sh[VECT_ODD(i)];
5070
5071 gCPU.vr[vrD].sw[i] = res;
5072 }
5073 }
5074 JITCFlow ppc_opc_gen_vmulosh()
5075 {
5076 #if 0
5077 ppc_opc_gen_interpret(ppc_opc_vmulosh);
5078 return flowEndBlock;
5079 #else
5080 int vrD, vrA, vrB;
5081 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
5082
5083 jitcFlushClientVectorRegister(vrA);
5084 jitcFlushClientVectorRegister(vrB);
5085 jitcDropClientVectorRegister(vrD);
5086
5087 jitcClobberCarryAndFlags();
5088 NativeReg reg1 = jitcAllocRegister();
5089 NativeReg reg2 = jitcAllocRegister();
5090
5091 for (int i=0; i<16; i+=4) {
5092 vec_getHalfS(reg1, vrA, i);
5093 vec_getHalfS(reg2, vrB, i);
5094
5095 asmIMUL(reg1, reg2);
5096
5097 vec_setWord(vrD, i, reg1);
5098 }
5099
5100 return flowContinue;
5101 #endif
5102 }
5103
5104 /* vmaddfp Vector Multiply Add Floating Point
5105 * v.177
5106 */
5107 void ppc_opc_vmaddfp()
5108 {
5109 VECTOR_DEBUG;
5110 int vrD, vrA, vrB, vrC;
5111 double res;
5112 PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
5113
5114 for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
5115 res = (double)gCPU.vr[vrA].f[i] * (double)gCPU.vr[vrC].f[i];
5116
5117 res = (double)gCPU.vr[vrB].f[i] + res;
5118
5119 gCPU.vr[vrD].f[i] = (float)res;
5120 }
5121 }
5122 JITCFlow ppc_opc_gen_vmaddfp()
5123 {
5124 #if 0
5125 ppc_opc_gen_interpret(ppc_opc_vmaddfp);
5126 return flowEndBlock;
5127 #else
5128 int vrD, vrA, vrB, vrC;
5129 PPC_OPC_TEMPL_A(gJITC.current_opc, vrD, vrA, vrB, vrC);
5130
5131 if (SSE_AVAIL) {
5132 commutative_operation(X86_MULPS, vrT, vrA, vrC);
5133 commutative_operation(X86_ADDPS, vrD, vrB, vrT);
5134 return flowContinue;
5135 }
5136
5137 ppc_opc_gen_interpret(ppc_opc_vmaddfp);
5138 return flowEndBlock;
5139 #endif
5140 }
5141
5142 /* vmhaddshs Vector Multiply High and Add Signed Half Word Saturate
5143 * v.185
5144 */
5145 void ppc_opc_vmhaddshs()
5146 {
5147 VECTOR_DEBUG;
5148 int vrD, vrA, vrB, vrC;
5149 sint32 prod;
5150 PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
5151
5152 for (int i=0; i<8; i++) {
5153 prod = (sint32)gCPU.vr[vrA].sh[i] * (sint32)gCPU.vr[vrB].sh[i];
5154
5155 prod = (prod >> 15) + (sint32)gCPU.vr[vrC].sh[i];
5156
5157 gCPU.vr[vrD].sh[i] = SATURATE_SH(prod);
5158 }
5159 }
5160 JITCFlow ppc_opc_gen_vmhaddshs()
5161 {
5162 #if 0
5163 ppc_opc_gen_interpret(ppc_opc_vmhaddshs);
5164 return flowEndBlock;
5165 #else
5166 int vrD, vrA, vrB, vrC;
5167 PPC_OPC_TEMPL_A(gJITC.current_opc, vrD, vrA, vrB, vrC);
5168
5169 jitcFlushClientVectorRegister(vrA);
5170 jitcFlushClientVectorRegister(vrB);
5171 jitcFlushClientVectorRegister(vrC);
5172 jitcDropClientVectorRegister(vrD);
5173
5174 jitcClobberCarryAndFlags();
5175 NativeReg reg1 = jitcAllocRegister();
5176 NativeReg reg2 = jitcAllocRegister();
5177
5178 for (int i=0; i<16; i+=2) {
5179 vec_getHalfS(reg1, vrA, i);
5180 vec_getHalfS(reg2, vrB, i);
5181
5182 asmIMUL(reg1, reg2);
5183 asmShift(X86_SAR, reg1, 15);
5184
5185 vec_getHalfS(reg2, vrC, i);
5186 asmALU(X86_ADD, reg1, reg2);
5187
5188 vec_saturateSH(reg1, reg2);
5189
5190 vec_setHalf(vrD, i, (NativeReg16)reg1);
5191 }
5192
5193 return flowContinue;
5194 #endif
5195 }
5196
5197 /* vmladduhm Vector Multiply Low and Add Unsigned Half Word Modulo
5198 * v.194
5199 */
5200 void ppc_opc_vmladduhm()
5201 {
5202 VECTOR_DEBUG;
5203 int vrD, vrA, vrB, vrC;
5204 uint32 prod;
5205 PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
5206
5207 for (int i=0; i<8; i++) {
5208 prod = (uint32)gCPU.vr[vrA].h[i] * (uint32)gCPU.vr[vrB].h[i];
5209
5210 prod = prod + (uint32)gCPU.vr[vrC].h[i];
5211
5212 gCPU.vr[vrD].h[i] = prod;
5213 }
5214 }
5215 JITCFlow ppc_opc_gen_vmladduhm()
5216 {
5217 #if 0
5218 ppc_opc_gen_interpret(ppc_opc_vmladduhm);
5219 return flowEndBlock;
5220 #else
5221 int vrD, vrA, vrB, vrC;
5222 modrm_o modrm;
5223 PPC_OPC_TEMPL_A(gJITC.current_opc, vrD, vrA, vrB, vrC);
5224
5225 if (SSE2_AVAIL) {
5226 commutative_operation(X86_PMULLW, vrT, vrA, vrB);
5227 commutative_operation(PALUW(X86_PADD), vrD, vrT, vrC);
5228 return flowContinue;
5229 }
5230
5231 jitcFlushClientVectorRegister(vrA);
5232 jitcFlushClientVectorRegister(vrB);
5233 jitcFlushClientVectorRegister(vrC);
5234 jitcDropClientVectorRegister(vrD);
5235
5236 jitcClobberCarryAndFlags();
5237 NativeReg16 reg1 = (NativeReg16)jitcAllocRegister();
5238 NativeReg reg2 = jitcAllocRegister();
5239
5240 for (int i=0; i<16; i+=2) {
5241 vec_getHalfZ((NativeReg)reg1, vrA, i);
5242 vec_getHalfZ(reg2, vrB, i);
5243
5244 asmIMUL((NativeReg)reg1, reg2);
5245
5246 if (vrC == vrD) {
5247 asmALU(X86_ADD,
5248 x86_mem2(modrm, &(gCPU.vr[vrD].b[i])), reg1);
5249 } else {
5250 asmALU(X86_ADD, reg1,
5251 x86_mem2(modrm, &(gCPU.vr[vrC].b[i])));
5252
5253 vec_setHalf(vrD, i, reg1);
5254 }
5255 }
5256
5257 return flowContinue;
5258 #endif
5259 }
5260
5261 /* vmhraddshs Vector Multiply High Round and Add Signed Half Word Saturate
5262 * v.186
5263 */
5264 void ppc_opc_vmhraddshs()
5265 {
5266 VECTOR_DEBUG;
5267 int vrD, vrA, vrB, vrC;
5268 sint32 prod;
5269 PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
5270
5271 for (int i=0; i<8; i++) {
5272 prod = (sint32)gCPU.vr[vrA].sh[i] * (sint32)gCPU.vr[vrB].sh[i];
5273
5274 prod += 0x4000;
5275 prod = (prod >> 15) + (sint32)gCPU.vr[vrC].sh[i];
5276
5277 gCPU.vr[vrD].sh[i] = SATURATE_SH(prod);
5278 }
5279 }
5280 JITCFlow ppc_opc_gen_vmhraddshs()
5281 {
5282 #if 0
5283 ppc_opc_gen_interpret(ppc_opc_vmhraddshs);
5284 return flowEndBlock;
5285 #else
5286 int vrD, vrA, vrB, vrC;
5287 PPC_OPC_TEMPL_A(gJITC.current_opc, vrD, vrA, vrB, vrC);
5288
5289 jitcFlushClientVectorRegister(vrA);
5290 jitcFlushClientVectorRegister(vrB);
5291 jitcFlushClientVectorRegister(vrC);
5292 jitcDropClientVectorRegister(vrD);
5293
5294 jitcClobberCarryAndFlags();
5295 NativeReg reg1 = jitcAllocRegister();
5296 NativeReg reg2 = jitcAllocRegister();
5297
5298 for (int i=0; i<16; i+=2) {
5299 vec_getHalfS(reg1, vrA, i);
5300 vec_getHalfS(reg2, vrB, i);
5301
5302 asmIMUL(reg1, reg2);
5303 asmALU(X86_ADD, reg1, 0x4000);
5304 asmShift(X86_SAR, reg1, 15);
5305
5306 vec_getHalfS(reg2, vrC, i);
5307 asmALU(X86_ADD, reg1, reg2);
5308
5309 vec_saturateSH(reg1, reg2);
5310
5311 vec_setHalf(vrD, i, (NativeReg16)reg1);
5312 }
5313
5314 return flowContinue;
5315 #endif
5316 }
5317
5318 /* vmsumubm Vector Multiply Sum Unsigned Byte Modulo
5319 * v.204
5320 */
5321 void ppc_opc_vmsumubm()
5322 {
5323 VECTOR_DEBUG;
5324 int vrD, vrA, vrB, vrC;
5325 uint32 temp;
5326 PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
5327
5328 for (int i=0; i<4; i++) {
5329 temp = gCPU.vr[vrC].w[i];
5330
5331 temp += (uint16)gCPU.vr[vrA].b[i<<2] *
5332 (uint16)gCPU.vr[vrB].b[i<<2];
5333
5334 temp += (uint16)gCPU.vr[vrA].b[(i<<2)+1] *
5335 (uint16)gCPU.vr[vrB].b[(i<<2)+1];
5336
5337 temp += (uint16)gCPU.vr[vrA].b[(i<<2)+2] *
5338 (uint16)gCPU.vr[vrB].b[(i<<2)+2];
5339
5340 temp += (uint16)gCPU.vr[vrA].b[(i<<2)+3] *
5341 (uint16)gCPU.vr[vrB].b[(i<<2)+3];
5342
5343 gCPU.vr[vrD].w[i] = temp;
5344 }
5345 }
5346 JITCFlow ppc_opc_gen_vmsumubm()
5347 {
5348 ppc_opc_gen_interpret(ppc_opc_vmsumubm);
5349 return flowEndBlock;
5350 }
5351
5352 /* vmsumuhm Vector Multiply Sum Unsigned Half Word Modulo
5353 * v.205
5354 */
5355 void ppc_opc_vmsumuhm()
5356 {
5357 VECTOR_DEBUG;
5358 int vrD, vrA, vrB, vrC;
5359 uint32 temp;
5360 PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
5361
5362 for (int i=0; i<4; i++) {
5363 temp = gCPU.vr[vrC].w[i];
5364
5365 temp += (uint32)gCPU.vr[vrA].h[i<<1] *
5366 (uint32)gCPU.vr[vrB].h[i<<1];
5367 temp += (uint32)gCPU.vr[vrA].h[(i<<1)+1] *
5368 (uint32)gCPU.vr[vrB].h[(i<<1)+1];
5369
5370 gCPU.vr[vrD].w[i] = temp;
5371 }
5372 }
5373 JITCFlow ppc_opc_gen_vmsumuhm()
5374 {
5375 ppc_opc_gen_interpret(ppc_opc_vmsumuhm);
5376 return flowEndBlock;
5377 }
5378
5379 /* vmsummbm Vector Multiply Sum Mixed-Sign Byte Modulo
5380 * v.201
5381 */
5382 void ppc_opc_vmsummbm()
5383 {
5384 VECTOR_DEBUG;
5385 int vrD, vrA, vrB, vrC;
5386 sint32 temp;
5387 PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
5388
5389 for (int i=0; i<4; i++) {
5390 temp = gCPU.vr[vrC].sw[i];
5391
5392 temp += (sint16)gCPU.vr[vrA].sb[i<<2] *
5393 (uint16)gCPU.vr[vrB].b[i<<2];
5394 temp += (sint16)gCPU.vr[vrA].sb[(i<<2)+1] *
5395 (uint16)gCPU.vr[vrB].b[(i<<2)+1];
5396 temp += (sint16)gCPU.vr[vrA].sb[(i<<2)+2] *
5397 (uint16)gCPU.vr[vrB].b[(i<<2)+2];
5398 temp += (sint16)gCPU.vr[vrA].sb[(i<<2)+3] *
5399 (uint16)gCPU.vr[vrB].b[(i<<2)+3];
5400
5401 gCPU.vr[vrD].sw[i] = temp;
5402 }
5403 }
5404 JITCFlow ppc_opc_gen_vmsummbm()
5405 {
5406 #if 0
5407 ppc_opc_gen_interpret(ppc_opc_vmsummbm);
5408 return flowEndBlock;
5409 #else
5410 int vrD, vrA, vrB, vrC;
5411 PPC_OPC_TEMPL_A(gJITC.current_opc, vrD, vrA, vrB, vrC);
5412
5413 jitcFlushClientVectorRegister(vrA);
5414 jitcFlushClientVectorRegister(vrB);
5415 jitcFlushClientVectorRegister(vrC);
5416 jitcDropClientVectorRegister(vrD);
5417
5418 jitcClobberCarryAndFlags();
5419 NativeReg reg1 = jitcAllocRegister();
5420 NativeReg reg2 = jitcAllocRegister();
5421 NativeReg reg3 = jitcAllocRegister();
5422
5423 for (int i=0; i<16; i+=4) {
5424 vec_getWord(reg1, vrC, i);
5425
5426 for (int j=0; j<4; j++) {
5427 vec_getByteS(reg2, vrA, i+j);
5428 vec_getByteZ(reg3, vrB, i+j);
5429
5430 asmIMUL(reg2, reg3);
5431 asmALU(X86_ADD, reg1, reg2);
5432 }
5433
5434 vec_setWord(vrD, i, reg1);
5435 }
5436
5437 return flowContinue;
5438 #endif
5439 }
5440
5441 /* vmsumshm Vector Multiply Sum Signed Half Word Modulo
5442 * v.202
5443 */
5444 void ppc_opc_vmsumshm()
5445 {
5446 VECTOR_DEBUG;
5447 int vrD, vrA, vrB, vrC;
5448 sint32 temp;
5449 PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
5450
5451 for (int i=0; i<4; i++) {
5452 temp = gCPU.vr[vrC].sw[i];
5453
5454 temp += (sint32)gCPU.vr[vrA].sh[i<<1] *
5455 (sint32)gCPU.vr[vrB].sh[i<<1];
5456 temp += (sint32)gCPU.vr[vrA].sh[(i<<1)+1] *
5457 (sint32)gCPU.vr[vrB].sh[(i<<1)+1];
5458
5459 gCPU.vr[vrD].sw[i] = temp;
5460 }
5461 }
5462 JITCFlow ppc_opc_gen_vmsumshm()
5463 {
5464 ppc_opc_gen_interpret(ppc_opc_vmsumshm);
5465 return flowEndBlock;
5466 }
5467
5468 /* vmsumuhs Vector Multiply Sum Unsigned Half Word Saturate
5469 * v.206
5470 */
5471 void ppc_opc_vmsumuhs()
5472 {
5473 VECTOR_DEBUG;
5474 int vrD, vrA, vrB, vrC;
5475 uint64 temp;
5476 PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
5477
5478 /* For this, there's no way to get around 64-bit math. If we use
5479 * the hacks used before, then we have to do it so often, that
5480 * we'll outpace the 64-bit math in execution time.
5481 */
5482 for (int i=0; i<4; i++) {
5483 temp = gCPU.vr[vrC].w[i];
5484
5485 temp += (uint32)gCPU.vr[vrA].h[i<<1] *
5486 (uint32)gCPU.vr[vrB].h[i<<1];
5487
5488 temp += (uint32)gCPU.vr[vrA].h[(i<<1)+1] *
5489 (uint32)gCPU.vr[vrB].h[(i<<1)+1];
5490
5491 gCPU.vr[vrD].w[i] = SATURATE_UW(temp);
5492 }
5493 }
5494 JITCFlow ppc_opc_gen_vmsumuhs()
5495 {
5496 ppc_opc_gen_interpret(ppc_opc_vmsumuhs);
5497 return flowEndBlock;
5498 }
5499
5500 /* vmsumshs Vector Multiply Sum Signed Half Word Saturate
5501 * v.203
5502 */
5503 void ppc_opc_vmsumshs()
5504 {
5505 VECTOR_DEBUG;
5506 int vrD, vrA, vrB, vrC;
5507 sint64 temp;
5508 PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
5509
5510 /* For this, there's no way to get around 64-bit math. If we use
5511 * the hacks used before, then we have to do it so often, that
5512 * we'll outpace the 64-bit math in execution time.
5513 */
5514
5515 for (int i=0; i<4; i++) {
5516 temp = gCPU.vr[vrC].sw[i];
5517
5518 temp += (sint32)gCPU.vr[vrA].sh[i<<1] *
5519 (sint32)gCPU.vr[vrB].sh[i<<1];
5520 temp += (sint32)gCPU.vr[vrA].sh[(i<<1)+1] *
5521 (sint32)gCPU.vr[vrB].sh[(i<<1)+1];
5522
5523 gCPU.vr[vrD].sw[i] = SATURATE_SW(temp);
5524 }
5525 }
5526 JITCFlow ppc_opc_gen_vmsumshs()
5527 {
5528 ppc_opc_gen_interpret(ppc_opc_vmsumshs);
5529 return flowEndBlock;
5530 }
5531
5532 /* vsum4ubs Vector Sum Across Partial (1/4) Unsigned Byte Saturate
5533 * v.275
5534 */
5535 void ppc_opc_vsum4ubs()
5536 {
5537 VECTOR_DEBUG;
5538 int vrD, vrA, vrB;
5539 uint64 res;
5540 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5541
5542 /* For this, there's no way to get around 64-bit math. If we use
5543 * the hacks used before, then we have to do it so often, that
5544 * we'll outpace the 64-bit math in execution time.
5545 */
5546
5547 for (int i=0; i<4; i++) {
5548 res = (uint64)gCPU.vr[vrB].w[i];
5549
5550 res += (uint64)gCPU.vr[vrA].b[(i<<2)];
5551 res += (uint64)gCPU.vr[vrA].b[(i<<2)+1];
5552 res += (uint64)gCPU.vr[vrA].b[(i<<2)+2];
5553 res += (uint64)gCPU.vr[vrA].b[(i<<2)+3];
5554
5555 gCPU.vr[vrD].w[i] = SATURATE_UW(res);
5556 }
5557 }
5558 JITCFlow ppc_opc_gen_vsum4ubs()
5559 {
5560 ppc_opc_gen_interpret(ppc_opc_vsum4ubs);
5561 return flowEndBlock;
5562 }
5563
5564 /* vsum4sbs Vector Sum Across Partial (1/4) Signed Byte Saturate
5565 * v.273
5566 */
5567 void ppc_opc_vsum4sbs()
5568 {
5569 VECTOR_DEBUG;
5570 int vrD, vrA, vrB;
5571 sint64 res;
5572 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5573
5574 for (int i=0; i<4; i++) {
5575 res = (sint64)gCPU.vr[vrB].sw[i];
5576
5577 res += (sint64)gCPU.vr[vrA].sb[(i<<2)];
5578 res += (sint64)gCPU.vr[vrA].sb[(i<<2)+1];
5579 res += (sint64)gCPU.vr[vrA].sb[(i<<2)+2];
5580 res += (sint64)gCPU.vr[vrA].sb[(i<<2)+3];
5581
5582 gCPU.vr[vrD].sw[i] = SATURATE_SW(res);
5583 }
5584 }
5585 JITCFlow ppc_opc_gen_vsum4sbs()
5586 {
5587 ppc_opc_gen_interpret(ppc_opc_vsum4sbs);
5588 return flowEndBlock;
5589 }
5590
5591 /* vsum4shs Vector Sum Across Partial (1/4) Signed Half Word Saturate
5592 * v.274
5593 */
5594 void ppc_opc_vsum4shs()
5595 {
5596 VECTOR_DEBUG;
5597 int vrD, vrA, vrB;
5598 sint64 res;
5599 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5600
5601 for (int i=0; i<4; i++) {
5602 res = (sint64)gCPU.vr[vrB].sw[i];
5603
5604 res += (sint64)gCPU.vr[vrA].sh[(i<<1)];
5605 res += (sint64)gCPU.vr[vrA].sh[(i<<1)+1];
5606
5607 gCPU.vr[vrD].sw[i] = SATURATE_SW(res);
5608 }
5609 }
5610 JITCFlow ppc_opc_gen_vsum4shs()
5611 {
5612 ppc_opc_gen_interpret(ppc_opc_vsum4shs);
5613 return flowEndBlock;
5614 }
5615
5616 /* vsum2sws Vector Sum Across Partial (1/2) Signed Word Saturate
5617 * v.272
5618 */
5619 void ppc_opc_vsum2sws()
5620 {
5621 VECTOR_DEBUG;
5622 int vrD, vrA, vrB;
5623 sint64 res;
5624 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5625
5626 res = (sint64)gCPU.vr[vrA].sw[0] + (sint64)gCPU.vr[vrA].sw[1];
5627 res += (sint64)gCPU.vr[vrB].sw[VECT_ODD(0)];
5628
5629 gCPU.vr[vrD].w[VECT_ODD(0)] = SATURATE_SW(res);
5630 gCPU.vr[vrD].w[VECT_EVEN(0)] = 0;
5631
5632 res = (sint64)gCPU.vr[vrA].sw[2] + (sint64)gCPU.vr[vrA].sw[3];
5633 res += (sint64)gCPU.vr[vrB].sw[VECT_ODD(1)];
5634
5635 gCPU.vr[vrD].w[VECT_ODD(1)] = SATURATE_SW(res);
5636 gCPU.vr[vrD].w[VECT_EVEN(1)] = 0;
5637 }
5638 JITCFlow ppc_opc_gen_vsum2sws()
5639 {
5640 ppc_opc_gen_interpret(ppc_opc_vsum2sws);
5641 return flowEndBlock;
5642 }
5643
5644 /* vsumsws Vector Sum Across Signed Word Saturate
5645 * v.271
5646 */
5647 void ppc_opc_vsumsws()
5648 {
5649 VECTOR_DEBUG;
5650 int vrD, vrA, vrB;
5651 sint64 res;
5652 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5653
5654 res = (sint64)gCPU.vr[vrA].sw[0] + (sint64)gCPU.vr[vrA].sw[1];
5655 res += (sint64)gCPU.vr[vrA].sw[2] + (sint64)gCPU.vr[vrA].sw[3];
5656
5657 res += (sint64)VECT_W(gCPU.vr[vrB], 3);
5658
5659 VECT_W(gCPU.vr[vrD], 3) = SATURATE_SW(res);
5660 VECT_W(gCPU.vr[vrD], 2) = 0;
5661 VECT_W(gCPU.vr[vrD], 1) = 0;
5662 VECT_W(gCPU.vr[vrD], 0) = 0;
5663 }
5664 JITCFlow ppc_opc_gen_vsumsws()
5665 {
5666 ppc_opc_gen_interpret(ppc_opc_vsumsws);
5667 return flowEndBlock;
5668 }
5669
5670 /* vnmsubfp Vector Negative Multiply-Subtract Floating Point
5671 * v.215
5672 */
5673 void ppc_opc_vnmsubfp()
5674 {
5675 VECTOR_DEBUG;
5676 int vrD, vrA, vrB, vrC;
5677 double res;
5678 PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
5679
5680 for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
5681 res = (double)gCPU.vr[vrA].f[i] * (double)gCPU.vr[vrC].f[i];
5682
5683 res = (double)gCPU.vr[vrB].f[i] - res;
5684
5685 gCPU.vr[vrD].f[i] = (float)res;
5686 }
5687 }
5688 JITCFlow ppc_opc_gen_vnmsubfp()
5689 {
5690 #if 0
5691 ppc_opc_gen_interpret(ppc_opc_vnmsubfp);
5692 return flowEndBlock;
5693 #else
5694 int vrD, vrA, vrB, vrC;
5695 PPC_OPC_TEMPL_A(gJITC.current_opc, vrD, vrA, vrB, vrC);
5696
5697 if (SSE_AVAIL) {
5698 commutative_operation(X86_MULPS, vrT, vrA, vrC);
5699 noncommutative_operation(X86_SUBPS, vrD, vrB, vrT);
5700 return flowContinue;
5701 }
5702
5703 ppc_opc_gen_interpret(ppc_opc_vnmsubfp);
5704 return flowEndBlock;
5705 #endif
5706 }
5707
5708 /* vavgub Vector Average Unsigned Byte
5709 * v.152
5710 */
5711 void ppc_opc_vavgub()
5712 {
5713 VECTOR_DEBUG;
5714 int vrD, vrA, vrB;
5715 uint16 res;
5716 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5717
5718 for (int i=0; i<16; i++) {
5719 res = (uint16)gCPU.vr[vrA].b[i] +
5720 (uint16)gCPU.vr[vrB].b[i] + 1;
5721
5722 gCPU.vr[vrD].b[i] = (res >> 1);
5723 }
5724 }
5725 JITCFlow ppc_opc_gen_vavgub()
5726 {
5727 #if 0
5728 ppc_opc_gen_interpret(ppc_opc_vavgub);
5729 return flowEndBlock;
5730 #else
5731 int vrD, vrA, vrB;
5732 modrm_o modrma, modrmb;
5733 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
5734
5735 if (SSE2_AVAIL) {
5736 commutative_operation(X86_PAVGB, vrD, vrA, vrB);
5737 return flowContinue;
5738 }
5739
5740 jitcFlushClientVectorRegister(vrA);
5741 jitcFlushClientVectorRegister(vrB);
5742 jitcDropClientVectorRegister(vrD);
5743
5744 jitcClobberCarryAndFlags();
5745 NativeReg reg = jitcAllocRegister();
5746 NativeReg reg2 = jitcAllocRegister();
5747 NativeReg reg3 = jitcAllocRegister();
5748
5749 for (int i=0; i<16; i += 4) {
5750 x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
5751 x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
5752
5753 asmALU(X86_MOV, reg, modrma);
5754 asmALU(X86_MOV, reg2, modrmb);
5755 asmALU(X86_MOV, reg3, reg);
5756
5757 asmALU(X86_OR, reg3, reg2);
5758 asmALU(X86_AND, reg3, 0x01010101);
5759
5760 asmShift(X86_SHR, reg, 1);
5761 asmShift(X86_SHR, reg2, 1);
5762
5763 asmALU(X86_AND, reg, 0x7f7f7f7f);
5764 asmALU(X86_AND, reg2, 0x7f7f7f7f);
5765
5766 asmALU(X86_ADD, reg, reg2);
5767 asmALU(X86_ADD, reg, reg3);
5768
5769 vec_setWord(vrD, i, reg);
5770 }
5771
5772 return flowContinue;
5773 #endif
5774 }
5775
5776 /* vavguh Vector Average Unsigned Half Word
5777 * v.153
5778 */
5779 void ppc_opc_vavguh()
5780 {
5781 VECTOR_DEBUG;
5782 int vrD, vrA, vrB;
5783 uint32 res;
5784 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5785
5786 for (int i=0; i<8; i++) {
5787 res = (uint32)gCPU.vr[vrA].h[i] +
5788 (uint32)gCPU.vr[vrB].h[i] + 1;
5789
5790 gCPU.vr[vrD].h[i] = (res >> 1);
5791 }
5792 }
5793 JITCFlow ppc_opc_gen_vavguh()
5794 {
5795 #if 0
5796 ppc_opc_gen_interpret(ppc_opc_vavguh);
5797 return flowEndBlock;
5798 #else
5799 int vrD, vrA, vrB;
5800 modrm_o modrma, modrmb;
5801 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
5802
5803 if (SSE2_AVAIL) {
5804 commutative_operation(X86_PAVGW, vrD, vrA, vrB);
5805 return flowContinue;
5806 }
5807
5808 jitcFlushClientVectorRegister(vrA);
5809 jitcFlushClientVectorRegister(vrB);
5810 jitcDropClientVectorRegister(vrD);
5811
5812 jitcClobberCarryAndFlags();
5813 NativeReg reg = jitcAllocRegister();
5814 NativeReg reg2 = jitcAllocRegister();
5815 NativeReg reg3 = jitcAllocRegister();
5816
5817 for (int i=0; i<16; i += 4) {
5818 x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
5819 x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
5820
5821 asmALU(X86_MOV, reg, modrma);
5822 asmALU(X86_MOV, reg2, modrmb);
5823 asmALU(X86_MOV, reg3, reg);
5824
5825 asmALU(X86_OR, reg3, reg2);
5826 asmALU(X86_AND, reg3, 0x00010001);
5827
5828 asmShift(X86_SHR, reg, 1);
5829 asmShift(X86_SHR, reg2, 1);
5830
5831 asmALU(X86_AND, reg, 0x7fff7fff);
5832 asmALU(X86_AND, reg2, 0x7fff7fff);
5833
5834 asmALU(X86_ADD, reg, reg2);
5835 asmALU(X86_ADD, reg, reg3);
5836
5837 vec_setWord(vrD, i, reg);
5838 }
5839
5840 return flowContinue;
5841 #endif
5842 }
5843
5844 /* vavguw Vector Average Unsigned Word
5845 * v.154
5846 */
5847 void ppc_opc_vavguw()
5848 {
5849 VECTOR_DEBUG;
5850 int vrD, vrA, vrB;
5851 uint64 res;
5852 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5853
5854 for (int i=0; i<4; i++) {
5855 res = (uint64)gCPU.vr[vrA].w[i] +
5856 (uint64)gCPU.vr[vrB].w[i] + 1;
5857
5858 gCPU.vr[vrD].w[i] = (res >> 1);
5859 }
5860 }
5861 JITCFlow ppc_opc_gen_vavguw()
5862 {
5863 #if 0
5864 ppc_opc_gen_interpret(ppc_opc_vavguw);
5865 return flowEndBlock;
5866 #else
5867 int vrD, vrA, vrB;
5868 modrm_o modrma, modrmb;
5869 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
5870
5871 jitcFlushClientVectorRegister(vrA);
5872 jitcFlushClientVectorRegister(vrB);
5873 jitcDropClientVectorRegister(vrD);
5874
5875 jitcClobberCarryAndFlags();
5876 NativeReg reg = jitcAllocRegister();
5877 NativeReg reg2 = jitcAllocRegister();
5878 NativeReg reg3 = jitcAllocRegister();
5879
5880 for (int i=0; i<16; i += 4) {
5881 x86_mem2(modrma, &(gCPU.vr[vrA].b[i]));
5882 x86_mem2(modrmb, &(gCPU.vr[vrB].b[i]));
5883
5884 asmALU(X86_MOV, reg, modrma);
5885 asmALU(X86_MOV, reg2, modrmb);
5886 asmALU(X86_MOV, reg3, reg);
5887
5888 asmALU(X86_OR, reg3, reg2);
5889 asmALU(X86_AND, reg3, 0x00000001);
5890
5891 asmShift(X86_SHR, reg, 1);
5892 asmShift(X86_SHR, reg2, 1);
5893
5894 asmALU(X86_ADD, reg, reg2);
5895 asmALU(X86_ADD, reg, reg3);
5896
5897 vec_setWord(vrD, i, reg);
5898 }
5899
5900 return flowContinue;
5901 #endif
5902 }
5903
5904 /* vavgsb Vector Average Signed Byte
5905 * v.149
5906 */
5907 void ppc_opc_vavgsb()
5908 {
5909 VECTOR_DEBUG;
5910 int vrD, vrA, vrB;
5911 sint16 res;
5912 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5913
5914 for (int i=0; i<16; i++) {
5915 res = (sint16)gCPU.vr[vrA].sb[i] +
5916 (sint16)gCPU.vr[vrB].sb[i] + 1;
5917
5918 gCPU.vr[vrD].sb[i] = (res >> 1);
5919 }
5920 }
5921 JITCFlow ppc_opc_gen_vavgsb()
5922 {
5923 ppc_opc_gen_interpret(ppc_opc_vavgsb);
5924 return flowEndBlock;
5925 }
5926
5927 /* vavgsh Vector Average Signed Half Word
5928 * v.150
5929 */
5930 void ppc_opc_vavgsh()
5931 {
5932 VECTOR_DEBUG;
5933 int vrD, vrA, vrB;
5934 sint32 res;
5935 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5936
5937 for (int i=0; i<8; i++) {
5938 res = (sint32)gCPU.vr[vrA].sh[i] +
5939 (sint32)gCPU.vr[vrB].sh[i] + 1;
5940
5941 gCPU.vr[vrD].sh[i] = (res >> 1);
5942 }
5943 }
5944 JITCFlow ppc_opc_gen_vavgsh()
5945 {
5946 ppc_opc_gen_interpret(ppc_opc_vavgsh);
5947 return flowEndBlock;
5948 }
5949
5950 /* vavgsw Vector Average Signed Word
5951 * v.151
5952 */
5953 void ppc_opc_vavgsw()
5954 {
5955 VECTOR_DEBUG;
5956 int vrD, vrA, vrB;
5957 sint64 res;
5958 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5959
5960 for (int i=0; i<4; i++) {
5961 res = (sint64)gCPU.vr[vrA].sw[i] +
5962 (sint64)gCPU.vr[vrB].sw[i] + 1;
5963
5964 gCPU.vr[vrD].sw[i] = (res >> 1);
5965 }
5966 }
5967 JITCFlow ppc_opc_gen_vavgsw()
5968 {
5969 ppc_opc_gen_interpret(ppc_opc_vavgsw);
5970 return flowEndBlock;
5971 }
5972
5973 /* vmaxub Vector Maximum Unsigned Byte
5974 * v.182
5975 */
5976 void ppc_opc_vmaxub()
5977 {
5978 VECTOR_DEBUG;
5979 int vrD, vrA, vrB;
5980 uint8 res;
5981 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
5982
5983 for (int i=0; i<16; i++) {
5984 res = gCPU.vr[vrA].b[i];
5985
5986 if (res < gCPU.vr[vrB].b[i])
5987 res = gCPU.vr[vrB].b[i];
5988
5989 gCPU.vr[vrD].b[i] = res;
5990 }
5991 }
5992 JITCFlow ppc_opc_gen_vmaxub()
5993 {
5994 #if 0
5995 ppc_opc_gen_interpret(ppc_opc_vmaxub);
5996 return flowEndBlock;
5997 #else
5998 int vrD, vrA, vrB;
5999 modrm_o modrm;
6000 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
6001
6002 if (vrA == vrD && vrB == vrD) {
6003 return flowContinue;
6004 } else if (vrA == vrB) {
6005 vec_Copy(vrD, vrA);
6006
6007 return flowContinue;
6008 }
6009
6010 if (SSE2_AVAIL) {
6011 commutative_operation(X86_PMAXUB, vrD, vrA, vrB);
6012 return flowContinue;
6013 }
6014
6015 jitcFlushClientVectorRegister(vrA);
6016 jitcFlushClientVectorRegister(vrB);
6017 jitcDropClientVectorRegister(vrD);
6018
6019 jitcClobberCarryAndFlags();
6020 NativeReg8 reg = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
6021
6022 for (int i=0; i<16; i++) {
6023 vec_getByte(reg, vrA, REG_NO, i);
6024
6025 x86_mem2(modrm, &(gCPU.vr[vrB].b[i]));
6026
6027 asmALU(X86_CMP, reg, modrm);
6028 asmCMOV(X86_B, (NativeReg)reg, modrm);
6029
6030 vec_setByte(vrD, REG_NO, i, reg);
6031 }
6032
6033 return flowContinue;
6034 #endif
6035 }
6036
6037 /* vmaxuh Vector Maximum Unsigned Half Word
6038 * v.183
6039 */
6040 void ppc_opc_vmaxuh()
6041 {
6042 VECTOR_DEBUG;
6043 int vrD, vrA, vrB;
6044 uint16 res;
6045 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6046
6047 for (int i=0; i<8; i++) {
6048 res = gCPU.vr[vrA].h[i];
6049
6050 if (res < gCPU.vr[vrB].h[i])
6051 res = gCPU.vr[vrB].h[i];
6052
6053 gCPU.vr[vrD].h[i] = res;
6054 }
6055 }
6056 JITCFlow ppc_opc_gen_vmaxuh()
6057 {
6058 ppc_opc_gen_interpret(ppc_opc_vmaxuh);
6059 return flowEndBlock;
6060 }
6061
6062 /* vmaxuw Vector Maximum Unsigned Word
6063 * v.184
6064 */
6065 void ppc_opc_vmaxuw()
6066 {
6067 VECTOR_DEBUG;
6068 int vrD, vrA, vrB;
6069 uint32 res;
6070 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6071
6072 for (int i=0; i<4; i++) {
6073 res = gCPU.vr[vrA].w[i];
6074
6075 if (res < gCPU.vr[vrB].w[i])
6076 res = gCPU.vr[vrB].w[i];
6077
6078 gCPU.vr[vrD].w[i] = res;
6079 }
6080 }
6081 JITCFlow ppc_opc_gen_vmaxuw()
6082 {
6083 ppc_opc_gen_interpret(ppc_opc_vmaxuw);
6084 return flowEndBlock;
6085 }
6086
6087 /* vmaxsb Vector Maximum Signed Byte
6088 * v.179
6089 */
6090 void ppc_opc_vmaxsb()
6091 {
6092 VECTOR_DEBUG;
6093 int vrD, vrA, vrB;
6094 sint8 res;
6095 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6096
6097 for (int i=0; i<16; i++) {
6098 res = gCPU.vr[vrA].sb[i];
6099
6100 if (res < gCPU.vr[vrB].sb[i])
6101 res = gCPU.vr[vrB].sb[i];
6102
6103 gCPU.vr[vrD].sb[i] = res;
6104 }
6105 }
6106 JITCFlow ppc_opc_gen_vmaxsb()
6107 {
6108 ppc_opc_gen_interpret(ppc_opc_vmaxsb);
6109 return flowEndBlock;
6110 }
6111
6112 /* vmaxsh Vector Maximum Signed Half Word
6113 * v.180
6114 */
6115 void ppc_opc_vmaxsh()
6116 {
6117 VECTOR_DEBUG;
6118 int vrD, vrA, vrB;
6119 sint16 res;
6120 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6121
6122 for (int i=0; i<8; i++) {
6123 res = gCPU.vr[vrA].sh[i];
6124
6125 if (res < gCPU.vr[vrB].sh[i])
6126 res = gCPU.vr[vrB].sh[i];
6127
6128 gCPU.vr[vrD].sh[i] = res;
6129 }
6130 }
6131 JITCFlow ppc_opc_gen_vmaxsh()
6132 {
6133 ppc_opc_gen_interpret(ppc_opc_vmaxsh);
6134 return flowEndBlock;
6135 }
6136
6137 /* vmaxsw Vector Maximum Signed Word
6138 * v.181
6139 */
6140 void ppc_opc_vmaxsw()
6141 {
6142 VECTOR_DEBUG;
6143 int vrD, vrA, vrB;
6144 sint32 res;
6145 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6146
6147 for (int i=0; i<4; i++) {
6148 res = gCPU.vr[vrA].sw[i];
6149
6150 if (res < gCPU.vr[vrB].sw[i])
6151 res = gCPU.vr[vrB].sw[i];
6152
6153 gCPU.vr[vrD].sw[i] = res;
6154 }
6155 }
6156 JITCFlow ppc_opc_gen_vmaxsw()
6157 {
6158 ppc_opc_gen_interpret(ppc_opc_vmaxsw);
6159 return flowEndBlock;
6160 }
6161
6162 /* vmaxfp Vector Maximum Floating Point
6163 * v.178
6164 */
6165 void ppc_opc_vmaxfp()
6166 {
6167 VECTOR_DEBUG;
6168 int vrD, vrA, vrB;
6169 float res;
6170 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6171
6172 for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6173 res = gCPU.vr[vrA].f[i];
6174
6175 if (res < gCPU.vr[vrB].f[i])
6176 res = gCPU.vr[vrB].f[i];
6177
6178 gCPU.vr[vrD].f[i] = res;
6179 }
6180 }
6181 JITCFlow ppc_opc_gen_vmaxfp()
6182 {
6183 #if 0
6184 ppc_opc_gen_interpret(ppc_opc_vmaxfp);
6185 return flowEndBlock;
6186 #else
6187 int vrD, vrA, vrB;
6188 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
6189
6190 if (vrA == vrD && vrB == vrD) {
6191 return flowContinue;
6192 } else if (vrA == vrB) {
6193 vec_Copy(vrD, vrA);
6194
6195 return flowContinue;
6196 }
6197
6198 if (SSE_AVAIL) {
6199 commutative_operation(X86_MAXPS, vrD, vrA, vrB);
6200 return flowContinue;
6201 }
6202
6203 ppc_opc_gen_interpret(ppc_opc_vmaxfp);
6204 return flowEndBlock;
6205 #endif
6206 }
6207
6208 /* vminub Vector Minimum Unsigned Byte
6209 * v.191
6210 */
6211 void ppc_opc_vminub()
6212 {
6213 VECTOR_DEBUG;
6214 int vrD, vrA, vrB;
6215 uint8 res;
6216 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6217
6218 for (int i=0; i<16; i++) {
6219 res = gCPU.vr[vrA].b[i];
6220
6221 if (res > gCPU.vr[vrB].b[i])
6222 res = gCPU.vr[vrB].b[i];
6223
6224 gCPU.vr[vrD].b[i] = res;
6225 }
6226 }
6227 JITCFlow ppc_opc_gen_vminub()
6228 {
6229 #if 0
6230 ppc_opc_gen_interpret(ppc_opc_vminub);
6231 return flowEndBlock;
6232 #else
6233 int vrD, vrA, vrB;
6234 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
6235
6236 if (vrA == vrD && vrB == vrD) {
6237 return flowContinue;
6238 } else if (vrA == vrB) {
6239 vec_Copy(vrD, vrA);
6240
6241 return flowContinue;
6242 }
6243
6244 if (SSE2_AVAIL) {
6245 commutative_operation(X86_PMINUB, vrD, vrA, vrB);
6246 return flowContinue;
6247 }
6248
6249 jitcFlushClientVectorRegister(vrA);
6250 jitcFlushClientVectorRegister(vrB);
6251 jitcDropClientVectorRegister(vrD);
6252
6253 jitcClobberCarryAndFlags();
6254 NativeReg8 reg = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
6255 NativeReg8 reg2 = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
6256
6257 for (int i=0; i<16; i++) {
6258 vec_getByte(reg, vrA, REG_NO, i);
6259 vec_getByte(reg2, vrB, REG_NO, i);
6260
6261 asmALU(X86_CMP, reg, reg2);
6262 asmCMOV(X86_A, (NativeReg)reg, (NativeReg)reg2);
6263
6264 vec_setByte(vrD, REG_NO, i, reg);
6265 }
6266
6267 return flowContinue;
6268 #endif
6269 }
6270
6271 /* vminuh Vector Minimum Unsigned Half Word
6272 * v.192
6273 */
6274 void ppc_opc_vminuh()
6275 {
6276 VECTOR_DEBUG;
6277 int vrD, vrA, vrB;
6278 uint16 res;
6279 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6280
6281 for (int i=0; i<8; i++) {
6282 res = gCPU.vr[vrA].h[i];
6283
6284 if (res > gCPU.vr[vrB].h[i])
6285 res = gCPU.vr[vrB].h[i];
6286
6287 gCPU.vr[vrD].h[i] = res;
6288 }
6289 }
6290 JITCFlow ppc_opc_gen_vminuh()
6291 {
6292 ppc_opc_gen_interpret(ppc_opc_vminuh);
6293 return flowEndBlock;
6294 }
6295
6296 /* vminuw Vector Minimum Unsigned Word
6297 * v.193
6298 */
6299 void ppc_opc_vminuw()
6300 {
6301 VECTOR_DEBUG;
6302 int vrD, vrA, vrB;
6303 uint32 res;
6304 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6305
6306 for (int i=0; i<4; i++) {
6307 res = gCPU.vr[vrA].w[i];
6308
6309 if (res > gCPU.vr[vrB].w[i])
6310 res = gCPU.vr[vrB].w[i];
6311
6312 gCPU.vr[vrD].w[i] = res;
6313 }
6314 }
6315 JITCFlow ppc_opc_gen_vminuw()
6316 {
6317 ppc_opc_gen_interpret(ppc_opc_vminuw);
6318 return flowEndBlock;
6319 }
6320
6321 /* vminsb Vector Minimum Signed Byte
6322 * v.188
6323 */
6324 void ppc_opc_vminsb()
6325 {
6326 VECTOR_DEBUG;
6327 int vrD, vrA, vrB;
6328 sint8 res;
6329 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6330
6331 for (int i=0; i<16; i++) {
6332 res = gCPU.vr[vrA].sb[i];
6333
6334 if (res > gCPU.vr[vrB].sb[i])
6335 res = gCPU.vr[vrB].sb[i];
6336
6337 gCPU.vr[vrD].sb[i] = res;
6338 }
6339 }
6340 JITCFlow ppc_opc_gen_vminsb()
6341 {
6342 ppc_opc_gen_interpret(ppc_opc_vminsb);
6343 return flowEndBlock;
6344 }
6345
6346 /* vminsh Vector Minimum Signed Half Word
6347 * v.189
6348 */
6349 void ppc_opc_vminsh()
6350 {
6351 VECTOR_DEBUG;
6352 int vrD, vrA, vrB;
6353 sint16 res;
6354 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6355
6356 for (int i=0; i<8; i++) {
6357 res = gCPU.vr[vrA].sh[i];
6358
6359 if (res > gCPU.vr[vrB].sh[i])
6360 res = gCPU.vr[vrB].sh[i];
6361
6362 gCPU.vr[vrD].sh[i] = res;
6363 }
6364 }
6365 JITCFlow ppc_opc_gen_vminsh()
6366 {
6367 ppc_opc_gen_interpret(ppc_opc_vminsh);
6368 return flowEndBlock;
6369 }
6370
6371 /* vminsw Vector Minimum Signed Word
6372 * v.190
6373 */
6374 void ppc_opc_vminsw()
6375 {
6376 VECTOR_DEBUG;
6377 int vrD, vrA, vrB;
6378 sint32 res;
6379 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6380
6381 for (int i=0; i<4; i++) {
6382 res = gCPU.vr[vrA].sw[i];
6383
6384 if (res > gCPU.vr[vrB].sw[i])
6385 res = gCPU.vr[vrB].sw[i];
6386
6387 gCPU.vr[vrD].sw[i] = res;
6388 }
6389 }
6390 JITCFlow ppc_opc_gen_vminsw()
6391 {
6392 ppc_opc_gen_interpret(ppc_opc_vminsw);
6393 return flowEndBlock;
6394 }
6395
6396 /* vminfp Vector Minimum Floating Point
6397 * v.187
6398 */
6399 void ppc_opc_vminfp()
6400 {
6401 VECTOR_DEBUG;
6402 int vrD, vrA, vrB;
6403 float res;
6404 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6405
6406 for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6407 res = gCPU.vr[vrA].f[i];
6408
6409 if (res > gCPU.vr[vrB].f[i])
6410 res = gCPU.vr[vrB].f[i];
6411
6412 gCPU.vr[vrD].f[i] = res;
6413 }
6414 }
6415 JITCFlow ppc_opc_gen_vminfp()
6416 {
6417 #if 0
6418 ppc_opc_gen_interpret(ppc_opc_vminfp);
6419 return flowEndBlock;
6420 #else
6421 int vrD, vrA, vrB;
6422 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
6423
6424 if (vrA == vrD && vrB == vrD) {
6425 return flowContinue;
6426 } else if (vrA == vrB) {
6427 vec_Copy(vrD, vrA);
6428
6429 return flowContinue;
6430 }
6431
6432 if (SSE_AVAIL) {
6433 commutative_operation(X86_MINPS, vrD, vrA, vrB);
6434 return flowContinue;
6435 }
6436
6437 ppc_opc_gen_interpret(ppc_opc_vminfp);
6438 return flowEndBlock;
6439 #endif
6440 }
6441
/* Rounding mode values handed to set_roundmode(); all lie within mask 0x0c00 */
#define RND_NEAREST	0x0000	// round to nearest
#define RND_MINF	0x0400	// round to -INF
#define RND_PINF	0x0800	// round to +INF
#define RND_ZERO	0x0c00	// round to zero
6446
6447 /* This functionality was borrowed from glibc 2.3.3 */
6448 static inline void set_roundmode(int mode)
6449 {
6450 modrm_o modrm;
6451
6452 jitcClobberCarryAndFlags();
6453 NativeReg reg = jitcAllocRegister();
6454
6455 x86_mem2(modrm, &gCPU.vfcw_save);
6456
6457 asmFSTCW(x86_mem2(modrm, &gCPU.vfcw_save));
6458
6459 asmMOV_NoFlags(reg, mode); // avoid flag clobber
6460 asmALU(X86_OR, reg, modrm);
6461
6462 if (mode != RND_ZERO) {
6463 asmALU(X86_AND, reg, (0xffff ^ (~mode & 0x0c00)));
6464 }
6465
6466 x86_mem2(modrm, &gCPU.vfcw);
6467
6468 asmALU(X86_MOV, modrm, reg);
6469
6470 asmFLDCW(modrm);
6471 }
6472
6473 static inline void restore_roundmode(void)
6474 {
6475 modrm_o modrm;
6476
6477 asmFLDCW(x86_mem2(modrm, &gCPU.vfcw_save));
6478 }
6479
6480 /* vrfin Vector Round to Floating-Point Integer Nearest
6481 * v.231
6482 */
6483 void ppc_opc_vrfin()
6484 {
6485 VECTOR_DEBUG;
6486 int vrD, vrA, vrB;
6487 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6488 PPC_OPC_ASSERT(vrA==0);
6489
6490 /* Documentation doesn't dictate how this instruction should
6491 * round from a middle point. With a test on a real G4, it was
6492 * found to be round to nearest, with bias to even if equidistant.
6493 *
6494 * This is covered by the function rint()
6495 */
6496 for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6497 gCPU.vr[vrD].f[i] = rintf(gCPU.vr[vrB].f[i]);
6498 }
6499 }
6500 JITCFlow ppc_opc_gen_vrfin()
6501 {
6502 #if 0
6503 ppc_opc_gen_interpret(ppc_opc_vrfin);
6504 return flowEndBlock;
6505 #else
6506 int vrD, vrA, vrB;
6507 modrm_o modrm;
6508 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
6509
6510 jitcFlushClientVectorRegister(vrB);
6511 jitcDropClientVectorRegister(vrD);
6512
6513 jitcFloatRegisterClobberAll();
6514 set_roundmode(RND_NEAREST);
6515
6516 for (int i=0; i<16; i+=4) { //FIXME: This might not comply with Java FP
6517 asmFLD_Single(x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
6518
6519 asmFSimple(FRNDINT);
6520
6521 asmFSTP_Single(x86_mem2(modrm, &(gCPU.vr[vrD].b[i])));
6522 }
6523
6524 restore_roundmode();
6525
6526 return flowContinue;
6527 #endif
6528 }
6529
6530 /* vrfip Vector Round to Floating-Point Integer toward Plus Infinity
6531 * v.232
6532 */
6533 void ppc_opc_vrfip()
6534 {
6535 VECTOR_DEBUG;
6536 int vrD, vrA, vrB;
6537 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6538 PPC_OPC_ASSERT(vrA==0);
6539
6540 for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6541 gCPU.vr[vrD].f[i] = ceilf(gCPU.vr[vrB].f[i]);
6542 }
6543 }
6544 JITCFlow ppc_opc_gen_vrfip()
6545 {
6546 #if 0
6547 ppc_opc_gen_interpret(ppc_opc_vrfip);
6548 return flowEndBlock;
6549 #else
6550 int vrD, vrA, vrB;
6551 modrm_o modrm;
6552 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
6553
6554 jitcFlushClientVectorRegister(vrB);
6555 jitcDropClientVectorRegister(vrD);
6556
6557 jitcFloatRegisterClobberAll();
6558 set_roundmode(RND_PINF);
6559
6560 for (int i=0; i<16; i+=4) { //FIXME: This might not comply with Java FP
6561 asmFLD_Single(x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
6562
6563 asmFSimple(FRNDINT);
6564
6565 asmFSTP_Single(x86_mem2(modrm, &(gCPU.vr[vrD].b[i])));
6566 }
6567
6568 restore_roundmode();
6569
6570 return flowContinue;
6571 #endif
6572 }
6573
6574 /* vrfim Vector Round to Floating-Point Integer toward Minus Infinity
6575 * v.230
6576 */
6577 void ppc_opc_vrfim()
6578 {
6579 VECTOR_DEBUG;
6580 int vrD, vrA, vrB;
6581 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6582 PPC_OPC_ASSERT(vrA==0);
6583
6584 for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6585 gCPU.vr[vrD].f[i] = floorf(gCPU.vr[vrB].f[i]);
6586 }
6587 }
6588 JITCFlow ppc_opc_gen_vrfim()
6589 {
6590 #if 0
6591 ppc_opc_gen_interpret(ppc_opc_vrfim);
6592 return flowEndBlock;
6593 #else
6594 int vrD, vrA, vrB;
6595 modrm_o modrm;
6596 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
6597
6598 jitcFlushClientVectorRegister(vrB);
6599 jitcDropClientVectorRegister(vrD);
6600
6601 jitcFloatRegisterClobberAll();
6602 set_roundmode(RND_MINF);
6603
6604 for (int i=0; i<16; i+=4) { //FIXME: This might not comply with Java FP
6605 asmFLD_Single(x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
6606
6607 asmFSimple(FRNDINT);
6608
6609 asmFSTP_Single(x86_mem2(modrm, &(gCPU.vr[vrD].b[i])));
6610 }
6611
6612 restore_roundmode();
6613
6614 return flowContinue;
6615 #endif
6616 }
6617
6618 /* vrfiz Vector Round to Floating-Point Integer toward Zero
6619 * v.233
6620 */
6621 void ppc_opc_vrfiz()
6622 {
6623 VECTOR_DEBUG;
6624 int vrD, vrA, vrB;
6625 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6626 PPC_OPC_ASSERT(vrA==0);
6627
6628 for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6629 gCPU.vr[vrD].f[i] = truncf(gCPU.vr[vrD].f[i]);
6630 }
6631 }
6632 JITCFlow ppc_opc_gen_vrfiz()
6633 {
6634 #if 0
6635 ppc_opc_gen_interpret(ppc_opc_vrfiz);
6636 return flowEndBlock;
6637 #else
6638 int vrD, vrA, vrB;
6639 modrm_o modrm;
6640 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
6641
6642 jitcFlushClientVectorRegister(vrB);
6643 jitcDropClientVectorRegister(vrD);
6644
6645 jitcFloatRegisterClobberAll();
6646 set_roundmode(RND_ZERO);
6647
6648 for (int i=0; i<16; i+=4) { //FIXME: This might not comply with Java FP
6649 asmFLD_Single(x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
6650
6651 asmFSimple(FRNDINT);
6652
6653 asmFSTP_Single(x86_mem2(modrm, &(gCPU.vr[vrD].b[i])));
6654 }
6655
6656 restore_roundmode();
6657
6658 return flowContinue;
6659 #endif
6660 }
6661
6662 /* vrefp Vector Reciprocal Estimate Floating Point
6663 * v.228
6664 */
6665 void ppc_opc_vrefp()
6666 {
6667 VECTOR_DEBUG;
6668 int vrD, vrA, vrB;
6669 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6670 PPC_OPC_ASSERT(vrA==0);
6671
6672 /* This emulation generates an exact value, instead of an estimate.
6673 * This is technically within specs, but some test-suites expect the
6674 * exact estimate value returned by G4s. These anomolous failures
6675 * should be ignored.
6676 */
6677
6678 for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6679 gCPU.vr[vrD].f[i] = 1 / gCPU.vr[vrB].f[i];
6680 }
6681 }
6682 JITCFlow ppc_opc_gen_vrefp()
6683 {
6684 #if 0
6685 ppc_opc_gen_interpret(ppc_opc_vrefp);
6686 return flowEndBlock;
6687 #else
6688 int vrD, vrA, vrB;
6689 modrm_o modrm;
6690 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
6691
6692 if (!(SSE_AVAIL)) {
6693 ppc_opc_gen_interpret(ppc_opc_vrefp);
6694 return flowEndBlock;
6695 }
6696
6697 if (vrB == vrD) {
6698 NativeVectorReg reg = jitcGetClientVectorRegisterDirty(vrB);
6699
6700 asmALUPS(X86_RCPPS, reg, reg);
6701
6702 return flowContinue;
6703 }
6704
6705 NativeVectorReg reg1 = jitcAllocVectorRegister();
6706 NativeVectorReg reg2 = jitcGetClientVectorRegisterMapping(vrB);
6707
6708 if (reg2 == VECTREG_NO) {
6709 asmALUPS(X86_RCPPS, reg1, x86_mem2(modrm, &gCPU.vr[vrB]));
6710 } else {
6711 asmALUPS(X86_RCPPS, reg1, reg2);
6712 }
6713
6714 jitcRenameVectorRegisterDirty(reg1, vrD);
6715
6716 return flowContinue;
6717 #endif
6718 }
6719
6720 /* vrsqrtefp Vector Reciprocal Square Root Estimate Floating Point
6721 * v.237
6722 */
6723 void ppc_opc_vrsqrtefp()
6724 {
6725 VECTOR_DEBUG;
6726 int vrD, vrA, vrB;
6727 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6728 PPC_OPC_ASSERT(vrA==0);
6729
6730 /* This emulation generates an exact value, instead of an estimate.
6731 * This is technically within specs, but some test-suites expect the
6732 * exact estimate value returned by G4s. These anomolous failures
6733 * should be ignored.
6734 */
6735
6736 for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6737 gCPU.vr[vrD].f[i] = 1 / sqrt(gCPU.vr[vrB].f[i]);
6738 }
6739 }
6740 JITCFlow ppc_opc_gen_vrsqrtefp()
6741 {
6742 #if 0
6743 ppc_opc_gen_interpret(ppc_opc_vrsqrtefp);
6744 return flowEndBlock;
6745 #else
6746 int vrD, vrA, vrB;
6747 modrm_o modrm;
6748 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
6749
6750 if (!(SSE_AVAIL)) {
6751 ppc_opc_gen_interpret(ppc_opc_vrsqrtefp);
6752 return flowEndBlock;
6753 }
6754
6755 if (vrB == vrD) {
6756 NativeVectorReg reg = jitcGetClientVectorRegisterDirty(vrB);
6757
6758 asmALUPS(X86_SQRTPS, reg, reg);
6759 asmALUPS(X86_RCPPS, reg, reg);
6760
6761 return flowContinue;
6762 }
6763
6764 NativeVectorReg reg1 = jitcAllocVectorRegister();
6765 NativeVectorReg reg2 = jitcGetClientVectorRegisterMapping(vrB);
6766
6767 if (reg2 == VECTREG_NO) {
6768 asmALUPS(X86_SQRTPS, reg1, x86_mem2(modrm, &gCPU.vr[vrB]));
6769 } else {
6770 asmALUPS(X86_SQRTPS, reg1, reg2);
6771 }
6772 asmALUPS(X86_RCPPS, reg1, reg1);
6773
6774 jitcRenameVectorRegisterDirty(reg1, vrD);
6775
6776 return flowContinue;
6777 #endif
6778 }
6779
6780 /* vlogefp Vector Log2 Estimate Floating Point
6781 * v.175
6782 */
6783 void ppc_opc_vlogefp()
6784 {
6785 VECTOR_DEBUG;
6786 int vrD, vrA, vrB;
6787 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6788 PPC_OPC_ASSERT(vrA==0);
6789
6790 /* This emulation generates an exact value, instead of an estimate.
6791 * This is technically within specs, but some test-suites expect the
6792 * exact estimate value returned by G4s. These anomolous failures
6793 * should be ignored.
6794 */
6795
6796 for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6797 gCPU.vr[vrD].f[i] = log2(gCPU.vr[vrB].f[i]);
6798 }
6799 }
6800 JITCFlow ppc_opc_gen_vlogefp()
6801 {
6802 #if 0
6803 ppc_opc_gen_interpret(ppc_opc_vlogefp);
6804 return flowEndBlock;
6805 #else
6806 int vrD, vrA, vrB;
6807 modrm_o modrm;
6808 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
6809
6810 jitcFlushClientVectorRegister(vrB);
6811 jitcDropClientVectorRegister(vrD);
6812
6813 jitcFloatRegisterClobberAll();
6814
6815 for (int i=0; i<16; i+=4) { //FIXME: This might not comply with Java FP
6816 asmFSimple(FLD1);
6817
6818 asmFLD_Single(x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
6819
6820 asmFSimple(FYL2X);
6821
6822 asmFSTP_Single(x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
6823 }
6824
6825 return flowContinue;
6826 #endif
6827 }
6828
6829 /* vexptefp Vector 2 Raised to the Exponent Estimate Floating Point
6830 * v.173
6831 */
6832 void ppc_opc_vexptefp()
6833 {
6834 VECTOR_DEBUG;
6835 int vrD, vrA, vrB;
6836 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
6837 PPC_OPC_ASSERT(vrA==0);
6838
6839 /* This emulation generates an exact value, instead of an estimate.
6840 * This is technically within specs, but some test-suites expect the
6841 * exact estimate value returned by G4s. These anomolous failures
6842 * should be ignored.
6843 */
6844
6845 for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6846 gCPU.vr[vrD].f[i] = exp2(gCPU.vr[vrB].f[i]);
6847 }
6848 }
6849 JITCFlow ppc_opc_gen_vexptefp()
6850 {
6851 #if 0
6852 ppc_opc_gen_interpret(ppc_opc_vexptefp);
6853 return flowEndBlock;
6854 #else
6855 int vrD, vrA, vrB;
6856 modrm_o modrm;
6857 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
6858
6859 jitcFlushClientVectorRegister(vrB);
6860 jitcDropClientVectorRegister(vrD);
6861
6862 jitcFloatRegisterClobberAll();
6863
6864 for (int i=0; i<16; i+=4) { //FIXME: This might not comply with Java FP
6865 asmFSimple(FLD1);
6866
6867 asmFLD_Single(x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
6868
6869 asmFSimple(F2XM1);
6870 asmFArithP_STi(X86_FADD, Float_ST1);
6871
6872 asmFSTP_Single(x86_mem2(modrm, &(gCPU.vr[vrD].b[i])));
6873 }
6874
6875 return flowContinue;
6876 #endif
6877 }
6878
6879 /* vcfux Vector Convert from Unsigned Fixed-Point Word
6880 * v.156
6881 */
6882 void ppc_opc_vcfux()
6883 {
6884 VECTOR_DEBUG;
6885 int vrD, vrB;
6886 uint32 uimm;
6887 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, uimm, vrB);
6888
6889 for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6890 gCPU.vr[vrD].f[i] = ((float)gCPU.vr[vrB].w[i]) / (1 << uimm);
6891 }
6892 }
6893 JITCFlow ppc_opc_gen_vcfux()
6894 {
6895 ppc_opc_gen_interpret(ppc_opc_vcfux);
6896 return flowEndBlock;
6897 }
6898
6899 /* vcfsx Vector Convert from Signed Fixed-Point Word
6900 * v.155
6901 */
6902 void ppc_opc_vcfsx()
6903 {
6904 VECTOR_DEBUG;
6905 int vrD, vrB;
6906 uint32 uimm;
6907 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, uimm, vrB);
6908
6909 for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6910 gCPU.vr[vrD].f[i] = ((float)gCPU.vr[vrB].sw[i]) / (1 << uimm);
6911 }
6912 }
6913 JITCFlow ppc_opc_gen_vcfsx()
6914 {
6915 #if 0
6916 ppc_opc_gen_interpret(ppc_opc_vcfsx);
6917 return flowEndBlock;
6918 #else
6919 int vrD, vrB;
6920 uint32 uimm;
6921 modrm_o modrm;
6922 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, uimm, vrB);
6923
6924 jitcFlushClientVectorRegister(vrB);
6925 jitcDropClientVectorRegister(vrD);
6926
6927 jitcClobberCarryAndFlags();
6928 jitcClobberRegister(NATIVE_REG | EAX);
6929
6930 jitcFloatRegisterClobberAll();
6931
6932 if (uimm == 1) {
6933 asmFSimple(FLD1);
6934 } else if (uimm > 1) {
6935 asmALU_D(X86_MOV, x86_mem2(modrm, &gCPU.vtemp), uimm);
6936 asmFILD_D(modrm);
6937 }
6938
6939 if (uimm) asmFSimple(FCHS);
6940
6941 for (int i=0; i<16; i+=4) { //FIXME: This might not comply with Java FP
6942 asmFILD_D(x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
6943
6944 if (uimm) {
6945 asmFSimple(FSCALE);
6946 }
6947
6948 asmFSTP_Single(x86_mem2(modrm, &(gCPU.vr[vrD].b[i])));
6949 }
6950
6951 if (uimm) asmFSTP(Float_ST0);
6952
6953 return flowContinue;
6954 #endif
6955 }
6956
6957 /* vctsxs Vector Convert To Signed Fixed-Point Word Saturate
6958 * v.171
6959 */
6960 void ppc_opc_vctsxs()
6961 {
6962 VECTOR_DEBUG;
6963 int vrD, vrB;
6964 uint32 uimm;
6965 float ftmp;
6966 sint32 tmp;
6967 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, uimm, vrB);
6968
6969 for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
6970 ftmp = gCPU.vr[vrB].f[i] * (float)(1 << uimm);
6971 ftmp = truncf(ftmp);
6972
6973 tmp = (sint32)ftmp;
6974
6975 if (ftmp > 2147483647.0) {
6976 tmp = 2147483647; // 0x7fffffff
6977 gCPU.vscr |= VSCR_SAT;
6978 } else if (ftmp < -2147483648.0) {
6979 tmp = -2147483648LL; // 0x80000000
6980 gCPU.vscr |= VSCR_SAT;
6981 }
6982
6983 gCPU.vr[vrD].sw[i] = tmp;
6984 }
6985 }
6986 JITCFlow ppc_opc_gen_vctsxs()
6987 {
6988 #if 0
6989 ppc_opc_gen_interpret(ppc_opc_vctsxs);
6990 return flowEndBlock;
6991 #else
6992 int vrD, vrB;
6993 uint32 uimm;
6994 modrm_o modrm, dest_modrm;
6995 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, uimm, vrB);
6996
6997 jitcFlushClientVectorRegister(vrB);
6998 jitcDropClientVectorRegister(vrD);
6999
7000 jitcClobberCarryAndFlags();
7001 jitcClobberRegister(NATIVE_REG | EAX);
7002
7003 jitcFloatRegisterClobberAll();
7004 set_roundmode(RND_ZERO);
7005
7006 if (uimm == 1) {
7007 asmFSimple(FLD1);
7008 } else if (uimm > 1) {
7009 asmALU_D(X86_MOV, x86_mem2(modrm, &gCPU.vtemp), uimm);
7010 asmFILD_D(modrm);
7011 }
7012
7013 for (int i=0; i<16; i+=4) { //FIXME: This might not comply with Java FP
7014 asmFLD_Single(x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
7015
7016 if (uimm) {
7017 asmFSimple(FSCALE);
7018 }
7019
7020 asmFSimple(FRNDINT);
7021
7022 asmFIComp(X86_FICOM32, x86_mem2(modrm, &sint32_max));
7023 asmFSTSW_EAX();
7024 asmALU(X86_TEST, EAX, 0x4100);
7025 NativeAddress of_high = asmJxxFixup(X86_Z);
7026
7027 asmFIComp(X86_FICOM32, x86_mem2(modrm, &sint32_min));
7028 asmFSTSW_EAX();
7029 asmALU(X86_TEST, EAX, 0x0100);
7030 NativeAddress of_low = asmJxxFixup(X86_NZ);
7031
7032 asmFISTP_D(x86_mem2(dest_modrm, &(gCPU.vr[vrD].b[i])));
7033 NativeAddress skip_end = asmJMPFixup();
7034
7035 asmResolveFixup(of_high);
7036 asmALU_D(X86_MOV, dest_modrm, 2147483647);
7037 NativeAddress skip1 = asmJMPFixup();
7038
7039 asmResolveFixup(of_low);
7040 asmALU_D(X86_MOV, dest_modrm, -2147483648);
7041
7042 asmResolveFixup(skip1);
7043 vec_raise_saturate();
7044
7045 asmResolveFixup(skip_end);
7046 }
7047
7048 if (uimm) asmFSTP(Float_ST0);
7049
7050 restore_roundmode();
7051
7052 return flowContinue;
7053 #endif
7054 }
7055
7056 /* vctuxs Vector Convert to Unsigned Fixed-Point Word Saturate
7057 * v.172
7058 */
7059 void ppc_opc_vctuxs()
7060 {
7061 VECTOR_DEBUG;
7062 int vrD, vrB;
7063 uint32 tmp, uimm;
7064 float ftmp;
7065 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, uimm, vrB);
7066
7067 for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
7068 ftmp = gCPU.vr[vrB].f[i] * (float)(1 << uimm);
7069 ftmp = truncf(ftmp);
7070
7071 tmp = (uint32)ftmp;
7072
7073 if (ftmp > 4294967295.0) {
7074 tmp = 0xffffffff;
7075 gCPU.vscr |= VSCR_SAT;
7076 } else if (ftmp < 0) {
7077 tmp = 0;
7078 gCPU.vscr |= VSCR_SAT;
7079 }
7080
7081 gCPU.vr[vrD].w[i] = tmp;
7082 }
7083 }
7084 JITCFlow ppc_opc_gen_vctuxs()
7085 {
7086 #if 0
7087 ppc_opc_gen_interpret(ppc_opc_vctuxs);
7088 return flowEndBlock;
7089 #else
7090 int vrD, vrB;
7091 uint32 uimm;
7092 modrm_o modrm, dest_modrm;
7093 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, uimm, vrB);
7094
7095 jitcFlushClientVectorRegister(vrB);
7096 jitcDropClientVectorRegister(vrD);
7097
7098 jitcClobberCarryAndFlags();
7099 jitcClobberRegister(NATIVE_REG | EAX);
7100
7101 jitcFloatRegisterClobberAll();
7102 set_roundmode(RND_ZERO);
7103
7104 NativeReg reg = jitcAllocRegister();
7105
7106 if (uimm == 1) {
7107 asmFSimple(FLD1);
7108 } else if (uimm > 1) {
7109 asmALU_D(X86_MOV, x86_mem2(modrm, &gCPU.vtemp), uimm);
7110 asmFILD_D(modrm);
7111 }
7112
7113 for (int i=0; i<16; i+=4) { //FIXME: This might not comply with Java FP
7114 asmFLD_Single(x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
7115
7116 if (uimm) {
7117 asmFSimple(FSCALE);
7118 }
7119
7120 asmFSimple(FRNDINT);
7121
7122 asmFISTP_Q(x86_mem2(modrm, &gCPU.vtemp64));
7123
7124 asmMOV(reg, (byte *)&gCPU.vtemp64+4);
7125
7126 asmALU(X86_TEST, reg, 0x80000000);
7127 NativeAddress of_low = asmJxxFixup(X86_NZ);
7128
7129 asmALU(X86_TEST, reg, 0xffffffff);
7130 NativeAddress of_high = asmJxxFixup(X86_NZ);
7131
7132 asmMOV(reg, &gCPU.vtemp64);
7133
7134 x86_mem2(dest_modrm, &(gCPU.vr[vrD].b[i]));
7135
7136 asmALU(X86_MOV, dest_modrm, reg);
7137 NativeAddress skip_end = asmJMPFixup();
7138
7139 asmResolveFixup(of_high);
7140 asmALU_D(X86_MOV, dest_modrm, 0xffffffff);
7141 NativeAddress skip1 = asmJMPFixup();
7142
7143 asmResolveFixup(of_low);
7144 asmALU_D(X86_MOV, dest_modrm, 0);
7145
7146 asmResolveFixup(skip1);
7147 vec_raise_saturate();
7148
7149 asmResolveFixup(skip_end);
7150 }
7151
7152 if (uimm) asmFSTP(Float_ST0);
7153
7154 restore_roundmode();
7155
7156 return flowContinue;
7157 #endif
7158 }
7159
7160 static inline void commutative_operation(X86ALUPSopc opc, int vrD, int vrA, int vrB)
7161 {
7162 NativeVectorReg d, s;
7163 modrm_o modrm;
7164 int vrS;
7165
7166 if (vrA == vrD) {
7167 d = jitcGetClientVectorRegisterDirty(vrD);
7168 s = jitcGetClientVectorRegisterMapping(vrB);
7169 vrS = vrB;
7170 } else if (vrB == vrD) {
7171 d = jitcGetClientVectorRegisterDirty(vrD);
7172 s = jitcGetClientVectorRegisterMapping(vrA);
7173 vrS = vrA;
7174 } else {
7175 d = jitcMapClientVectorRegisterDirty(vrD);
7176 NativeVectorReg a = jitcGetClientVectorRegisterMapping(vrA);
7177 s = jitcGetClientVectorRegisterMapping(vrB);
7178 vrS = vrB;
7179
7180 if (a == VECTREG_NO)
7181 asmMOVAPS(d, &gCPU.vr[vrA]);
7182 else
7183 asmALUPS(X86_MOVAPS, d, a);
7184 }
7185
7186 if (s == VECTREG_NO) {
7187 asmALUPS(opc, d, x86_mem2(modrm, &gCPU.vr[vrS]));
7188 } else
7189 asmALUPS(opc, d, s);
7190 }
7191
7192 static inline void commutative_operation(X86PALUopc opc, int vrD, int vrA, int vrB)
7193 {
7194 NativeVectorReg d, s;
7195 modrm_o modrm;
7196 int vrS;
7197
7198 if (vrA == vrD) {
7199 d = jitcGetClientVectorRegisterDirty(vrD);
7200 s = jitcGetClientVectorRegisterMapping(vrB);
7201 vrS = vrB;
7202 } else if (vrB == vrD) {
7203 d = jitcGetClientVectorRegisterDirty(vrD);
7204 s = jitcGetClientVectorRegisterMapping(vrA);
7205 vrS = vrA;
7206 } else {
7207 d = jitcMapClientVectorRegisterDirty(vrD);
7208 NativeVectorReg a = jitcGetClientVectorRegisterMapping(vrA);
7209 s = jitcGetClientVectorRegisterMapping(vrB);
7210 vrS = vrB;
7211
7212 if (a == VECTREG_NO)
7213 asmMOVAPS(d, &gCPU.vr[vrA]);
7214 else
7215 asmALUPS(X86_MOVAPS, d, a);
7216 }
7217
7218 if (s == VECTREG_NO) {
7219 asmPALU(opc, d, x86_mem2(modrm, &gCPU.vr[vrS]));
7220 } else
7221 asmPALU(opc, d, s);
7222 }
7223
7224 static inline void noncommutative_operation(X86ALUPSopc opc, int vrD, int vrA, int vrB)
7225 {
7226 NativeVectorReg d, s;
7227 modrm_o modrm;
7228
7229 if (vrA == vrD) {
7230 d = jitcGetClientVectorRegisterDirty(vrA);
7231 s = jitcGetClientVectorRegisterMapping(vrB);
7232 } else {
7233 if (vrB == vrD)
7234 d = jitcAllocVectorRegister();
7235 else
7236 d = jitcMapClientVectorRegisterDirty(vrD);
7237
7238 NativeVectorReg a = jitcGetClientVectorRegisterMapping(vrA);
7239 s = jitcGetClientVectorRegisterMapping(vrB);
7240
7241 if (a == VECTREG_NO)
7242 asmMOVAPS(d, &gCPU.vr[vrA]);
7243 else
7244 asmALUPS(X86_MOVAPS, d, a);
7245 }
7246
7247 if (s == VECTREG_NO) {
7248 asmALUPS(opc, d, x86_mem2(modrm, &gCPU.vr[vrB]));
7249 } else
7250 asmALUPS(opc, d, s);
7251
7252 if (vrB == vrD)
7253 jitcRenameVectorRegisterDirty(d, vrD);
7254 }
7255
7256 static inline void noncommutative_operation(X86PALUopc opc, int vrD, int vrA, int vrB)
7257 {
7258 NativeVectorReg d, s;
7259 modrm_o modrm;
7260
7261 if (vrA == vrD) {
7262 d = jitcGetClientVectorRegisterDirty(vrA);
7263 s = jitcGetClientVectorRegisterMapping(vrB);
7264 } else {
7265 if (vrB == vrD)
7266 d = jitcAllocVectorRegister();
7267 else
7268 d = jitcMapClientVectorRegisterDirty(vrD);
7269
7270 NativeVectorReg a = jitcGetClientVectorRegisterMapping(vrA);
7271 s = jitcGetClientVectorRegisterMapping(vrB);
7272
7273 if (a == VECTREG_NO)
7274 asmMOVAPS(d, &gCPU.vr[vrA]);
7275 else
7276 asmALUPS(X86_MOVAPS, d, a);
7277 }
7278
7279 if (s == VECTREG_NO) {
7280 asmPALU(opc, d, x86_mem2(modrm, &gCPU.vr[vrB]));
7281 } else
7282 asmPALU(opc, d, s);
7283
7284 if (vrB == vrD)
7285 jitcRenameVectorRegisterDirty(d, vrD);
7286 }
7287
7288 /* vand Vector Logical AND
7289 * v.147
7290 */
7291 void ppc_opc_vand()
7292 {
7293 VECTOR_DEBUG;
7294 int vrD, vrA, vrB;
7295 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
7296
7297 gCPU.vr[vrD].d[0] = gCPU.vr[vrA].d[0] & gCPU.vr[vrB].d[0];
7298 gCPU.vr[vrD].d[1] = gCPU.vr[vrA].d[1] & gCPU.vr[vrB].d[1];
7299 }
7300 JITCFlow ppc_opc_gen_vand()
7301 {
7302 #if 0
7303 ppc_opc_gen_interpret(ppc_opc_vand);
7304 return flowEndBlock;
7305 #else
7306 int vrD, vrA, vrB;
7307 modrm_o modrm;
7308 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
7309
7310 if (vrA == vrB) {
7311 vec_Copy(vrD, vrA);
7312 return flowContinue;
7313 }
7314
7315 if (SSE_AVAIL) {
7316 commutative_operation(X86_ANDPS, vrD, vrA, vrB);
7317 return flowContinue;
7318 }
7319
7320 jitcFlushClientVectorRegister(vrA);
7321 jitcFlushClientVectorRegister(vrB);
7322 jitcDropClientVectorRegister(vrD);
7323
7324 jitcClobberCarryAndFlags();
7325 NativeReg reg = jitcAllocRegister();
7326
7327 vec_getWord(reg, vrA, 0);
7328 asmALU(X86_AND, reg, x86_mem2(modrm, &(gCPU.vr[vrB])));
7329 vec_setWord(vrD, 0, reg);
7330
7331 vec_getWord(reg, vrA, 4);
7332 asmALU(X86_AND, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[4])));
7333 vec_setWord(vrD, 4, reg);
7334
7335 vec_getWord(reg, vrA, 8);
7336 asmALU(X86_AND, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[8])));
7337 vec_setWord(vrD, 8, reg);
7338
7339 vec_getWord(reg, vrA,12);
7340 asmALU(X86_AND, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[12])));
7341 vec_setWord(vrD,12, reg);
7342
7343 return flowContinue;
7344 #endif
7345 }
7346
7347 /* vandc Vector Logical AND with Complement
7348 * v.148
7349 */
7350 void ppc_opc_vandc()
7351 {
7352 VECTOR_DEBUG;
7353 int vrD, vrA, vrB;
7354 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
7355
7356 gCPU.vr[vrD].d[0] = gCPU.vr[vrA].d[0] & ~gCPU.vr[vrB].d[0];
7357 gCPU.vr[vrD].d[1] = gCPU.vr[vrA].d[1] & ~gCPU.vr[vrB].d[1];
7358 }
7359 JITCFlow ppc_opc_gen_vandc()
7360 {
7361 #if 0
7362 ppc_opc_gen_interpret(ppc_opc_vandc);
7363 return flowEndBlock;
7364 #else
7365 int vrD, vrA, vrB;
7366 modrm_o modrm;
7367 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
7368
7369 if (vrA == vrB) {
7370 vec_Zero(vrD);
7371 return flowContinue;
7372 }
7373
7374 if (SSE_AVAIL) {
7375 noncommutative_operation(X86_ANDNPS, vrD, vrB, vrA);
7376 return flowContinue;
7377 }
7378
7379 jitcFlushClientVectorRegister(vrA);
7380 jitcFlushClientVectorRegister(vrB);
7381 jitcDropClientVectorRegister(vrD);
7382
7383 jitcClobberCarryAndFlags();
7384 NativeReg reg = jitcAllocRegister();
7385
7386 vec_getWord(reg, vrB, 0);
7387 asmALU(X86_NOT, reg);
7388 asmALU(X86_AND, reg, x86_mem2(modrm, &gCPU.vr[vrB]));
7389 vec_setWord(vrD, 0, reg);
7390
7391 vec_getWord(reg, vrB, 4);
7392 asmALU(X86_NOT, reg);
7393 asmALU(X86_AND, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[4])));
7394 vec_setWord(vrD, 4, reg);
7395
7396 vec_getWord(reg, vrB, 8);
7397 asmALU(X86_NOT, reg);
7398 asmALU(X86_AND, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[8])));
7399 vec_setWord(vrD, 8, reg);
7400
7401 vec_getWord(reg, vrB,12);
7402 asmALU(X86_NOT, reg);
7403 asmALU(X86_AND, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[12])));
7404 vec_setWord(vrD,12, reg);
7405
7406 return flowContinue;
7407 #endif
7408 }
7409
7410 /* vor Vector Logical OR
7411 * v.217
7412 */
7413 void ppc_opc_vor()
7414 {
7415 VECTOR_DEBUG_COMMON;
7416 int vrD, vrA, vrB;
7417 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
7418
7419 gCPU.vr[vrD].d[0] = gCPU.vr[vrA].d[0] | gCPU.vr[vrB].d[0];
7420 gCPU.vr[vrD].d[1] = gCPU.vr[vrA].d[1] | gCPU.vr[vrB].d[1];
7421 }
7422 JITCFlow ppc_opc_gen_vor()
7423 {
7424 #if 0
7425 ppc_opc_gen_interpret(ppc_opc_vor);
7426 return flowEndBlock;
7427 #else
7428 int vrD, vrA, vrB;
7429 modrm_o modrm;
7430 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
7431
7432 if (vrA == vrB) {
7433 vec_Copy(vrD, vrA);
7434 return flowContinue;
7435 }
7436
7437 if (SSE_AVAIL) {
7438 commutative_operation(X86_ORPS, vrD, vrA, vrB);
7439 return flowContinue;
7440 }
7441
7442 jitcFlushClientVectorRegister(vrA);
7443 jitcFlushClientVectorRegister(vrB);
7444 jitcDropClientVectorRegister(vrD);
7445
7446 jitcClobberCarryAndFlags();
7447 NativeReg reg = jitcAllocRegister();
7448
7449 vec_getWord(reg, vrA, 0);
7450 asmALU(X86_OR, reg, x86_mem2(modrm, &gCPU.vr[vrB]));
7451 vec_setWord(vrD, 0, reg);
7452
7453 vec_getWord(reg, vrA, 4);
7454 asmALU(X86_OR, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[4])));
7455 vec_setWord(vrD, 4, reg);
7456
7457 vec_getWord(reg, vrA, 8);
7458 asmALU(X86_OR, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[8])));
7459 vec_setWord(vrD, 8, reg);
7460
7461 vec_getWord(reg, vrA,12);
7462 asmALU(X86_OR, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[12])));
7463 vec_setWord(vrD,12, reg);
7464
7465 return flowContinue;
7466 #endif
7467 }
7468
7469 /* vnor Vector Logical NOR
7470 * v.216
7471 */
7472 void ppc_opc_vnor()
7473 {
7474 VECTOR_DEBUG;
7475 int vrD, vrA, vrB;
7476 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
7477
7478 gCPU.vr[vrD].d[0] = ~(gCPU.vr[vrA].d[0] | gCPU.vr[vrB].d[0]);
7479 gCPU.vr[vrD].d[1] = ~(gCPU.vr[vrA].d[1] | gCPU.vr[vrB].d[1]);
7480 }
7481 JITCFlow ppc_opc_gen_vnor()
7482 {
7483 #if 0
7484 ppc_opc_gen_interpret(ppc_opc_vnor);
7485 return flowEndBlock;
7486 #else
7487 int vrD, vrA, vrB;
7488 modrm_o modrm;
7489 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
7490
7491 if (vrA == vrB) {
7492 vec_Not(vrD, vrA);
7493 return flowContinue;
7494 }
7495
7496 if (SSE_AVAIL) {
7497 commutative_operation(X86_ORPS, vrD, vrA, vrB);
7498 vec_Not(vrD, vrD);
7499 return flowContinue;
7500 }
7501
7502 jitcFlushClientVectorRegister(vrA);
7503 jitcFlushClientVectorRegister(vrB);
7504 jitcDropClientVectorRegister(vrD);
7505
7506 jitcClobberCarryAndFlags();
7507 NativeReg reg = jitcAllocRegister();
7508
7509 vec_getWord(reg, vrA, 0);
7510 asmALU(X86_OR, reg, x86_mem2(modrm, &gCPU.vr[vrB]));
7511 asmALU(X86_NOT, reg);
7512 vec_setWord(vrD, 0, reg);
7513
7514 vec_getWord(reg, vrA, 4);
7515 asmALU(X86_OR, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[4])));
7516 asmALU(X86_NOT, reg);
7517 vec_setWord(vrD, 4, reg);
7518
7519 vec_getWord(reg, vrA, 8);
7520 asmALU(X86_OR, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[8])));
7521 asmALU(X86_NOT, reg);
7522 vec_setWord(vrD, 8, reg);
7523
7524 vec_getWord(reg, vrA,12);
7525 asmALU(X86_OR, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[12])));
7526 asmALU(X86_NOT, reg);
7527 vec_setWord(vrD,12, reg);
7528
7529 return flowContinue;
7530 #endif
7531 }
7532
7533 /* vxor Vector Logical XOR
7534 * v.282
7535 */
7536 void ppc_opc_vxor()
7537 {
7538 VECTOR_DEBUG_COMMON;
7539 int vrD, vrA, vrB;
7540 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
7541
7542 gCPU.vr[vrD].d[0] = gCPU.vr[vrA].d[0] ^ gCPU.vr[vrB].d[0];
7543 gCPU.vr[vrD].d[1] = gCPU.vr[vrA].d[1] ^ gCPU.vr[vrB].d[1];
7544 }
7545 JITCFlow ppc_opc_gen_vxor()
7546 {
7547 #if 0
7548 ppc_opc_gen_interpret(ppc_opc_vxor);
7549 return flowEndBlock;
7550 #else
7551 int vrD, vrA, vrB;
7552 modrm_o modrm;
7553 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
7554
7555 if (vrA == vrB) {
7556 vec_Zero(vrD);
7557 return flowContinue;
7558 }
7559
7560 if (SSE_AVAIL) {
7561 commutative_operation(X86_XORPS, vrD, vrA, vrB);
7562 return flowContinue;
7563 }
7564
7565 jitcFlushClientVectorRegister(vrA);
7566 jitcFlushClientVectorRegister(vrB);
7567 jitcDropClientVectorRegister(vrD);
7568
7569 jitcClobberCarryAndFlags();
7570 NativeReg reg = jitcAllocRegister();
7571
7572 vec_getWord(reg, vrA, 0);
7573 asmALU(X86_XOR, reg, x86_mem2(modrm, &gCPU.vr[vrB]));
7574 vec_setWord(vrD, 0, reg);
7575
7576 vec_getWord(reg, vrA, 4);
7577 asmALU(X86_XOR, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[4])));
7578 vec_setWord(vrD, 4, reg);
7579
7580 vec_getWord(reg, vrA, 8);
7581 asmALU(X86_XOR, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[8])));
7582 vec_setWord(vrD, 8, reg);
7583
7584 vec_getWord(reg, vrA,12);
7585 asmALU(X86_XOR, reg, x86_mem2(modrm, &(gCPU.vr[vrB].b[12])));
7586 vec_setWord(vrD,12, reg);
7587
7588 return flowContinue;
7589 #endif
7590 }
7591
7592
/*
 *	CR6 field of gCPU.cr as written by the record forms of the vector
 *	compare instructions below.
 *
 *	CR_CR6          mask covering the whole field
 *	CR_CR6_EQ       survives only if every element compared true
 *	CR_CR6_NE_SOME  set as soon as one element compared false
 *	CR_CR6_NE       survives only if every element compared false
 *	CR_CR6_EQ_SOME  set as soon as one element compared true
 *
 *	NOTE(review): the AltiVec PEM appears to define CR6 as
 *	all_true || 0 || all_false || 0; the two *_SOME bits sit in the
 *	positions documented as zero -- confirm this is intended.
 */
#define CR_CR6		(0x00f0)
#define CR_CR6_EQ	(0x0080)
#define CR_CR6_NE_SOME	(0x0040)
#define CR_CR6_NE	(0x0020)
#define CR_CR6_EQ_SOME	(0x0010)
7598
7599 /* vcmpequbx Vector Compare Equal-to Unsigned Byte
7600 * v.160
7601 */
7602 void ppc_opc_vcmpequbx()
7603 {
7604 VECTOR_DEBUG;
7605 int vrD, vrA, vrB;
7606 int tf=CR_CR6_EQ | CR_CR6_NE;
7607 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
7608
7609 for (int i=0; i<16; i++) {
7610 if (gCPU.vr[vrA].b[i] == gCPU.vr[vrB].b[i]) {
7611 gCPU.vr[vrD].b[i] = 0xff;
7612 tf &= ~CR_CR6_NE;
7613 tf |= CR_CR6_EQ_SOME;
7614 } else {
7615 gCPU.vr[vrD].b[i] = 0;
7616 tf &= ~CR_CR6_EQ;
7617 tf |= CR_CR6_NE_SOME;
7618 }
7619 }
7620
7621 if (PPC_OPC_VRc & gCPU.current_opc) {
7622 gCPU.cr &= ~CR_CR6;
7623 gCPU.cr |= tf;
7624 }
7625 }
7626 JITCFlow ppc_opc_gen_vcmpequbx()
7627 {
7628 #if 0
7629 ppc_opc_gen_interpret(ppc_opc_vcmpequbx);
7630 return flowEndBlock;
7631 #else
7632 int vrD, vrA, vrB;
7633 modrm_o modrm;
7634 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
7635
7636 if (0 && vrA == vrB) {
7637 vec_Neg1(vrD);
7638
7639 if (PPC_OPC_VRc & gJITC.current_opc) {
7640 jitcClobberCarryAndFlags();
7641
7642 asmAND(&gCPU.cr, ~CR_CR6);
7643 asmOR(&gCPU.cr, CR_CR6_EQ | CR_CR6_EQ_SOME);
7644 }
7645
7646 return flowContinue;
7647 }
7648
7649 jitcFlushClientVectorRegister(vrA);
7650 jitcFlushClientVectorRegister(vrB);
7651 jitcDropClientVectorRegister(vrD);
7652
7653 jitcClobberCarryAndFlags();
7654 NativeReg reg = REG_NO;
7655 NativeReg8 reg2 = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
7656
7657 if (PPC_OPC_VRc & gJITC.current_opc) {
7658 reg = jitcAllocRegister();
7659
7660 asmMOV_NoFlags(reg, CR_CR6_EQ | CR_CR6_NE);
7661 }
7662
7663 for (int i=0; i<16; i++) {
7664 vec_getByte(reg2, vrA, REG_NO, i);
7665
7666 asmALU(X86_CMP, reg2, x86_mem2(modrm, &gCPU.vr[vrB]+i));
7667
7668 if (PPC_OPC_VRc & gJITC.current_opc) {
7669 NativeAddress skip1 = asmJxxFixup(X86_NE);
7670
7671 asmMOV_NoFlags((NativeReg)reg2, 0xff);
7672 asmALU(X86_AND, reg, ~CR_CR6_NE);
7673 asmALU(X86_OR, reg, CR_CR6_EQ_SOME);
7674 NativeAddress skip2 = asmJMPFixup();
7675
7676 asmResolveFixup(skip1);
7677 asmALU(X86_XOR, reg2, reg2);
7678 asmALU(X86_AND, reg, ~CR_CR6_EQ);
7679 asmALU(X86_OR, reg, CR_CR6_NE_SOME);
7680
7681 asmResolveFixup(skip2);
7682 } else {
7683 asmSET(X86_NE, reg2);
7684
7685 asmALU(X86_SUB, reg2, 1);
7686 }
7687
7688 vec_setByte(vrD, REG_NO, i, reg2);
7689 }
7690
7691 if (PPC_OPC_VRc & gJITC.current_opc) {
7692 asmAND(&gCPU.cr, ~CR_CR6);
7693 asmALU(X86_OR, x86_mem2(modrm, &gCPU.cr), reg);
7694 }
7695
7696 return flowContinue;
7697 #endif
7698 }
7699
7700 /* vcmpequhx Vector Compare Equal-to Unsigned Half Word
7701 * v.161
7702 */
7703 void ppc_opc_vcmpequhx()
7704 {
7705 VECTOR_DEBUG;
7706 int vrD, vrA, vrB;
7707 int tf=CR_CR6_EQ | CR_CR6_NE;
7708 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
7709
7710 for (int i=0; i<8; i++) {
7711 if (gCPU.vr[vrA].h[i] == gCPU.vr[vrB].h[i]) {
7712 gCPU.vr[vrD].h[i] = 0xffff;
7713 tf &= ~CR_CR6_NE;
7714 tf |= CR_CR6_EQ_SOME;
7715 } else {
7716 gCPU.vr[vrD].h[i] = 0;
7717 tf &= ~CR_CR6_EQ;
7718 tf |= CR_CR6_NE_SOME;
7719 }
7720 }
7721
7722 if (PPC_OPC_VRc & gCPU.current_opc) {
7723 gCPU.cr &= ~CR_CR6;
7724 gCPU.cr |= tf;
7725 }
7726 }
7727 JITCFlow ppc_opc_gen_vcmpequhx()
7728 {
7729 #if 0
7730 ppc_opc_gen_interpret(ppc_opc_vcmpequhx);
7731 return flowEndBlock;
7732 #else
7733 int vrD, vrA, vrB;
7734 modrm_o modrm;
7735 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
7736
7737 if (vrA == vrB) {
7738 vec_Neg1(vrD);
7739
7740 if (PPC_OPC_VRc & gJITC.current_opc) {
7741 jitcClobberCarryAndFlags();
7742
7743 asmAND(&gCPU.cr, ~CR_CR6);
7744 asmOR(&gCPU.cr, CR_CR6_EQ | CR_CR6_EQ_SOME);
7745 }
7746
7747 return flowContinue;
7748 }
7749
7750 jitcFlushClientVectorRegister(vrA);
7751 jitcFlushClientVectorRegister(vrB);
7752 jitcDropClientVectorRegister(vrD);
7753
7754 jitcClobberCarryAndFlags();
7755 NativeReg reg = REG_NO;
7756 NativeReg16 reg2 = (NativeReg16)jitcAllocRegister(NATIVE_REG_8);
7757
7758 if (PPC_OPC_VRc & gJITC.current_opc) {
7759 reg = jitcAllocRegister();
7760
7761 asmMOV_NoFlags(reg, CR_CR6_EQ | CR_CR6_NE);
7762 }
7763
7764 for (int i=0; i<16; i += 2) {
7765 vec_getHalf(reg2, vrA, i);
7766
7767 asmALU(X86_CMP, reg2, x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
7768
7769 if (PPC_OPC_VRc & gJITC.current_opc) {
7770 NativeAddress skip1 = asmJxxFixup(X86_NE);
7771
7772 asmMOV_NoFlags(reg2, 0xffff);
7773 asmALU(X86_AND, reg, ~CR_CR6_NE);
7774 asmALU(X86_OR, reg, CR_CR6_EQ_SOME);
7775 NativeAddress skip2 = asmJMPFixup();
7776
7777 asmResolveFixup(skip1);
7778 asmALU(X86_XOR, reg2, reg2);
7779 asmALU(X86_AND, reg, ~CR_CR6_EQ);
7780 asmALU(X86_OR, reg, CR_CR6_NE_SOME);
7781
7782 asmResolveFixup(skip2, asmHERE());
7783 } else {
7784 asmSET(X86_NE, (NativeReg8)reg2);
7785 asmMOVxx(X86_MOVZX, (NativeReg)reg2, (NativeReg8)reg2);
7786
7787 asmALU(X86_SUB, reg2, 1);
7788 }
7789
7790 vec_setHalf(vrD, i, reg2);
7791 }
7792
7793 if (PPC_OPC_VRc & gJITC.current_opc) {
7794 asmAND(&gCPU.cr, ~CR_CR6);
7795 asmALU(X86_OR, x86_mem2(modrm, &gCPU.cr), reg);
7796 }
7797
7798 return flowContinue;
7799 #endif
7800 }
7801
7802 /* vcmpequwx Vector Compare Equal-to Unsigned Word
7803 * v.162
7804 */
7805 void ppc_opc_vcmpequwx()
7806 {
7807 VECTOR_DEBUG;
7808 int vrD, vrA, vrB;
7809 int tf=CR_CR6_EQ | CR_CR6_NE;
7810 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
7811
7812 for (int i=0; i<4; i++) {
7813 if (gCPU.vr[vrA].w[i] == gCPU.vr[vrB].w[i]) {
7814 gCPU.vr[vrD].w[i] = 0xffffffff;
7815 tf &= ~CR_CR6_NE;
7816 tf |= CR_CR6_EQ_SOME;
7817 } else {
7818 gCPU.vr[vrD].w[i] = 0;
7819 tf &= ~CR_CR6_EQ;
7820 tf |= CR_CR6_NE_SOME;
7821 }
7822 }
7823
7824 if (PPC_OPC_VRc & gCPU.current_opc) {
7825 gCPU.cr &= ~CR_CR6;
7826 gCPU.cr |= tf;
7827 }
7828 }
7829 JITCFlow ppc_opc_gen_vcmpequwx()
7830 {
7831 ppc_opc_gen_interpret(ppc_opc_vcmpequwx);
7832 return flowEndBlock;
7833 }
7834
7835 /* vcmpeqfpx Vector Compare Equal-to-Floating Point
7836 * v.159
7837 */
7838 void ppc_opc_vcmpeqfpx()
7839 {
7840 VECTOR_DEBUG;
7841 int vrD, vrA, vrB;
7842 int tf=CR_CR6_EQ | CR_CR6_NE;
7843 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
7844
7845 for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
7846 if (gCPU.vr[vrA].f[i] == gCPU.vr[vrB].f[i]) {
7847 gCPU.vr[vrD].w[i] = 0xffffffff;
7848 tf &= ~CR_CR6_NE;
7849 tf |= CR_CR6_EQ_SOME;
7850 } else {
7851 gCPU.vr[vrD].w[i] = 0;
7852 tf &= ~CR_CR6_EQ;
7853 tf |= CR_CR6_NE_SOME;
7854 }
7855 }
7856
7857 if (PPC_OPC_VRc & gCPU.current_opc) {
7858 gCPU.cr &= ~CR_CR6;
7859 gCPU.cr |= tf;
7860 }
7861 }
7862 JITCFlow ppc_opc_gen_vcmpeqfpx()
7863 {
7864 ppc_opc_gen_interpret(ppc_opc_vcmpeqfpx);
7865 return flowEndBlock;
7866 }
7867
7868 /* vcmpgtubx Vector Compare Greater-Than Unsigned Byte
7869 * v.168
7870 */
7871 void ppc_opc_vcmpgtubx()
7872 {
7873 VECTOR_DEBUG;
7874 int vrD, vrA, vrB;
7875 int tf=CR_CR6_EQ | CR_CR6_NE;
7876 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
7877
7878 for (int i=0; i<16; i++) {
7879 if (gCPU.vr[vrA].b[i] > gCPU.vr[vrB].b[i]) {
7880 gCPU.vr[vrD].b[i] = 0xff;
7881 tf &= ~CR_CR6_NE;
7882 tf |= CR_CR6_EQ_SOME;
7883 } else {
7884 gCPU.vr[vrD].b[i] = 0;
7885 tf &= ~CR_CR6_EQ;
7886 tf |= CR_CR6_NE_SOME;
7887 }
7888 }
7889
7890 if (PPC_OPC_VRc & gCPU.current_opc) {
7891 gCPU.cr &= ~CR_CR6;
7892 gCPU.cr |= tf;
7893 }
7894 }
7895 JITCFlow ppc_opc_gen_vcmpgtubx()
7896 {
7897 ppc_opc_gen_interpret(ppc_opc_vcmpgtubx);
7898 return flowEndBlock;
7899 }
7900
7901 /* vcmpgtsbx Vector Compare Greater-Than Signed Byte
7902 * v.165
7903 */
7904 void ppc_opc_vcmpgtsbx()
7905 {
7906 VECTOR_DEBUG;
7907 int vrD, vrA, vrB;
7908 int tf=CR_CR6_EQ | CR_CR6_NE;
7909 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
7910
7911 for (int i=0; i<16; i++) {
7912 if (gCPU.vr[vrA].sb[i] > gCPU.vr[vrB].sb[i]) {
7913 gCPU.vr[vrD].b[i] = 0xff;
7914 tf &= ~CR_CR6_NE;
7915 tf |= CR_CR6_EQ_SOME;
7916 } else {
7917 gCPU.vr[vrD].b[i] = 0;
7918 tf &= ~CR_CR6_EQ;
7919 tf |= CR_CR6_NE_SOME;
7920 }
7921 }
7922
7923 if (PPC_OPC_VRc & gCPU.current_opc) {
7924 gCPU.cr &= ~CR_CR6;
7925 gCPU.cr |= tf;
7926 }
7927 }
7928 JITCFlow ppc_opc_gen_vcmpgtsbx()
7929 {
7930 ppc_opc_gen_interpret(ppc_opc_vcmpgtsbx);
7931 return flowEndBlock;
7932 }
7933
7934 /* vcmpgtuhx Vector Compare Greater-Than Unsigned Half Word
7935 * v.169
7936 */
7937 void ppc_opc_vcmpgtuhx()
7938 {
7939 VECTOR_DEBUG;
7940 int vrD, vrA, vrB;
7941 int tf=CR_CR6_EQ | CR_CR6_NE;
7942 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
7943
7944 for (int i=0; i<8; i++) {
7945 if (gCPU.vr[vrA].h[i] > gCPU.vr[vrB].h[i]) {
7946 gCPU.vr[vrD].h[i] = 0xffff;
7947 tf &= ~CR_CR6_NE;
7948 tf |= CR_CR6_EQ_SOME;
7949 } else {
7950 gCPU.vr[vrD].h[i] = 0;
7951 tf &= ~CR_CR6_EQ;
7952 tf |= CR_CR6_NE_SOME;
7953 }
7954 }
7955
7956 if (PPC_OPC_VRc & gCPU.current_opc) {
7957 gCPU.cr &= ~CR_CR6;
7958 gCPU.cr |= tf;
7959 }
7960 }
7961 JITCFlow ppc_opc_gen_vcmpgtuhx()
7962 {
7963 #if 0
7964 ppc_opc_gen_interpret(ppc_opc_vcmpgtuhx);
7965 return flowEndBlock;
7966 #else
7967 int vrD, vrA, vrB;
7968 modrm_o modrm;
7969 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
7970
7971 if (vrA == vrB) {
7972 vec_Zero(vrD);
7973
7974 if (PPC_OPC_VRc & gJITC.current_opc) {
7975 jitcClobberCarryAndFlags();
7976
7977 asmAND(&gCPU.cr, ~CR_CR6);
7978 asmOR(&gCPU.cr, CR_CR6_NE | CR_CR6_NE_SOME);
7979 }
7980
7981 return flowContinue;
7982 }
7983
7984 jitcFlushClientVectorRegister(vrA);
7985 jitcFlushClientVectorRegister(vrB);
7986 jitcDropClientVectorRegister(vrD);
7987
7988 jitcClobberCarryAndFlags();
7989 NativeReg reg = REG_NO;
7990 NativeReg16 reg2 = (NativeReg16)jitcAllocRegister(NATIVE_REG_8);
7991
7992 if (PPC_OPC_VRc & gJITC.current_opc) {
7993 reg = jitcAllocRegister();
7994
7995 asmMOV_NoFlags(reg, CR_CR6_EQ | CR_CR6_NE);
7996 }
7997
7998 for (int i=0; i<16; i += 2) {
7999 vec_getHalf(reg2, vrA, i);
8000
8001 asmALU(X86_CMP, reg2, x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
8002
8003 if (PPC_OPC_VRc & gJITC.current_opc) {
8004 NativeAddress skip1 = asmJxxFixup(X86_BE);
8005
8006 asmMOV_NoFlags(reg2, 0xffff);
8007 asmALU(X86_AND, reg, ~CR_CR6_NE);
8008 asmALU(X86_OR, reg, CR_CR6_EQ_SOME);
8009 NativeAddress skip2 = asmJMPFixup();
8010
8011 asmResolveFixup(skip1);
8012 asmALU(X86_XOR, reg2, reg2);
8013 asmALU(X86_AND, reg, ~CR_CR6_EQ);
8014 asmALU(X86_OR, reg, CR_CR6_NE_SOME);
8015
8016 asmResolveFixup(skip2);
8017 } else {
8018 asmSET(X86_BE, (NativeReg8)reg2);
8019 asmMOVxx(X86_MOVZX, (NativeReg)reg2, (NativeReg8)reg2);
8020
8021 asmALU(X86_SUB, reg2, 1);
8022 }
8023
8024 vec_setHalf(vrD, i, reg2);
8025 }
8026
8027 if (PPC_OPC_VRc & gJITC.current_opc) {
8028 asmAND(&gCPU.cr, ~CR_CR6);
8029 asmALU(X86_OR, x86_mem2(modrm, &gCPU.cr), reg);
8030 }
8031
8032 return flowContinue;
8033 #endif
8034 }
8035
8036 /* vcmpgtshx Vector Compare Greater-Than Signed Half Word
8037 * v.166
8038 */
8039 void ppc_opc_vcmpgtshx()
8040 {
8041 VECTOR_DEBUG;
8042 int vrD, vrA, vrB;
8043 int tf=CR_CR6_EQ | CR_CR6_NE;
8044 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
8045
8046 for (int i=0; i<8; i++) {
8047 if (gCPU.vr[vrA].sh[i] > gCPU.vr[vrB].sh[i]) {
8048 gCPU.vr[vrD].h[i] = 0xffff;
8049 tf &= ~CR_CR6_NE;
8050 tf |= CR_CR6_EQ_SOME;
8051 } else {
8052 gCPU.vr[vrD].h[i] = 0;
8053 tf &= ~CR_CR6_EQ;
8054 tf |= CR_CR6_NE_SOME;
8055 }
8056 }
8057
8058 if (PPC_OPC_VRc & gCPU.current_opc) {
8059 gCPU.cr &= ~CR_CR6;
8060 gCPU.cr |= tf;
8061 }
8062 }
8063 JITCFlow ppc_opc_gen_vcmpgtshx()
8064 {
8065 ppc_opc_gen_interpret(ppc_opc_vcmpgtshx);
8066 return flowEndBlock;
8067 }
8068
8069 /* vcmpgtuwx Vector Compare Greater-Than Unsigned Word
8070 * v.170
8071 */
8072 void ppc_opc_vcmpgtuwx()
8073 {
8074 VECTOR_DEBUG;
8075 int vrD, vrA, vrB;
8076 int tf=CR_CR6_EQ | CR_CR6_NE;
8077 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
8078
8079 for (int i=0; i<4; i++) {
8080 if (gCPU.vr[vrA].w[i] > gCPU.vr[vrB].w[i]) {
8081 gCPU.vr[vrD].w[i] = 0xffffffff;
8082 tf &= ~CR_CR6_NE;
8083 tf |= CR_CR6_EQ_SOME;
8084 } else {
8085 gCPU.vr[vrD].w[i] = 0;
8086 tf &= ~CR_CR6_EQ;
8087 tf |= CR_CR6_NE_SOME;
8088 }
8089 }
8090
8091 if (PPC_OPC_VRc & gCPU.current_opc) {
8092 gCPU.cr &= ~CR_CR6;
8093 gCPU.cr |= tf;
8094 }
8095 }
8096 JITCFlow ppc_opc_gen_vcmpgtuwx()
8097 {
8098 ppc_opc_gen_interpret(ppc_opc_vcmpgtuwx);
8099 return flowEndBlock;
8100 }
8101
8102 /* vcmpgtswx Vector Compare Greater-Than Signed Word
8103 * v.167
8104 */
8105 void ppc_opc_vcmpgtswx()
8106 {
8107 VECTOR_DEBUG;
8108 int vrD, vrA, vrB;
8109 int tf=CR_CR6_EQ | CR_CR6_NE;
8110 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
8111
8112 for (int i=0; i<4; i++) {
8113 if (gCPU.vr[vrA].sw[i] > gCPU.vr[vrB].sw[i]) {
8114 gCPU.vr[vrD].w[i] = 0xffffffff;
8115 tf &= ~CR_CR6_NE;
8116 tf |= CR_CR6_EQ_SOME;
8117 } else {
8118 gCPU.vr[vrD].w[i] = 0;
8119 tf &= ~CR_CR6_EQ;
8120 tf |= CR_CR6_NE_SOME;
8121 }
8122 }
8123
8124 if (PPC_OPC_VRc & gCPU.current_opc) {
8125 gCPU.cr &= ~CR_CR6;
8126 gCPU.cr |= tf;
8127 }
8128 }
8129 JITCFlow ppc_opc_gen_vcmpgtswx()
8130 {
8131 ppc_opc_gen_interpret(ppc_opc_vcmpgtswx);
8132 return flowEndBlock;
8133 }
8134
8135 /* vcmpgtfpx Vector Compare Greater-Than Floating-Point
8136 * v.164
8137 */
8138 void ppc_opc_vcmpgtfpx()
8139 {
8140 VECTOR_DEBUG;
8141 int vrD, vrA, vrB;
8142 int tf=CR_CR6_EQ | CR_CR6_NE;
8143 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
8144
8145 for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
8146 if (gCPU.vr[vrA].f[i] > gCPU.vr[vrB].f[i]) {
8147 gCPU.vr[vrD].w[i] = 0xffffffff;
8148 tf &= ~CR_CR6_NE;
8149 tf |= CR_CR6_EQ_SOME;
8150 } else {
8151 gCPU.vr[vrD].w[i] = 0;
8152 tf &= ~CR_CR6_EQ;
8153 tf |= CR_CR6_NE_SOME;
8154 }
8155 }
8156
8157 if (PPC_OPC_VRc & gCPU.current_opc) {
8158 gCPU.cr &= ~CR_CR6;
8159 gCPU.cr |= tf;
8160 }
8161 }
8162 JITCFlow ppc_opc_gen_vcmpgtfpx()
8163 {
8164 ppc_opc_gen_interpret(ppc_opc_vcmpgtfpx);
8165 return flowEndBlock;
8166 }
8167
8168 /* vcmpgefpx Vector Compare Greater-Than-or-Equal-to Floating Point
8169 * v.163
8170 */
8171 void ppc_opc_vcmpgefpx()
8172 {
8173 VECTOR_DEBUG;
8174 int vrD, vrA, vrB;
8175 int tf=CR_CR6_EQ | CR_CR6_NE;
8176 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
8177
8178 for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
8179 if (gCPU.vr[vrA].f[i] >= gCPU.vr[vrB].f[i]) {
8180 gCPU.vr[vrD].w[i] = 0xffffffff;
8181 tf &= ~CR_CR6_NE;
8182 tf |= CR_CR6_EQ_SOME;
8183 } else {
8184 gCPU.vr[vrD].w[i] = 0;
8185 tf &= ~CR_CR6_EQ;
8186 tf |= CR_CR6_NE_SOME;
8187 }
8188 }
8189
8190 if (PPC_OPC_VRc & gCPU.current_opc) {
8191 gCPU.cr &= ~CR_CR6;
8192 gCPU.cr |= tf;
8193 }
8194 }
8195 JITCFlow ppc_opc_gen_vcmpgefpx()
8196 {
8197 ppc_opc_gen_interpret(ppc_opc_vcmpgefpx);
8198 return flowEndBlock;
8199 }
8200
8201 /* vcmpbfpx Vector Compare Bounds Floating Point
8202 * v.157
8203 */
8204 void ppc_opc_vcmpbfpx()
8205 {
8206 VECTOR_DEBUG;
8207 int vrD, vrA, vrB;
8208 int le, ge;
8209 int ib=CR_CR6_NE;
8210 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
8211
8212 for (int i=0; i<4; i++) { //FIXME: This might not comply with Java FP
8213 le = (gCPU.vr[vrA].f[i] <= gCPU.vr[vrB].f[i]) ? 0 : 0x80000000;
8214 ge = (gCPU.vr[vrA].f[i] >= -gCPU.vr[vrB].f[i]) ? 0 : 0x40000000;
8215
8216 gCPU.vr[vrD].w[i] = le | ge;
8217 if (le | ge) {
8218 ib = 0;
8219 }
8220 }
8221
8222 if (PPC_OPC_VRc & gCPU.current_opc) {
8223 gCPU.cr &= ~CR_CR6;
8224 gCPU.cr |= ib;
8225 }
8226 }
8227 JITCFlow ppc_opc_gen_vcmpbfpx()
8228 {
8229 ppc_opc_gen_interpret(ppc_opc_vcmpbfpx);
8230 return flowEndBlock;
8231 }

  ViewVC Help
Powered by ViewVC 1.1.26