/[pearpc]/src/cpu/cpu_jitc_x86/ppc_vec.cc
This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /src/cpu/cpu_jitc_x86/ppc_vec.cc

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1 - (show annotations)
Wed Sep 5 17:11:21 2007 UTC (12 years, 2 months ago) by dpavlin
File size: 177354 byte(s)
import upstream CVS
1 /*
2 * PearPC
3 * ppc_vec.cc
4 *
5 * Copyright (C) 2004 Daniel Foesch (dfoesch@cs.nmsu.edu)
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20
21 /* Pages marked: v.???
22 * From: IBM PowerPC MicroProcessor Family: Altivec(tm) Technology...
23 * Programming Environments Manual
24 */
25
26 #include <string.h>
27 #include <math.h>
28
/*
 * FIXME: put somewhere appropriate
 *
 * Fallback definitions used when HAS_LOG2 / HAS_EXP2 are not defined.
 * The whole expansion of log2() is parenthesized so that the macro behaves
 * like a function call inside a larger expression (e.g. a / log2(x) must
 * divide by the logarithm, not by log(x) alone).  exp2() uses a double
 * base and a parenthesized argument for the same reason.
 */
#ifndef HAS_LOG2
#define log2(x) (log(x) / log(2.0))
#endif /* HAS_LOG2 */

#ifndef HAS_EXP2
#define exp2(x) pow(2.0, (x))
#endif /* HAS_EXP2 */
39
/* ASSERT_FLUSHED(vrA) currently expands to nothing.  The commented-out
 * alternative on the next line forwards to
 * jitcAssertFlushedVectorRegister(vrA) instead.
 */
40 #define ASSERT_FLUSHED(vrA)
41 //#define ASSERT_FLUSHED(vrA) jitcAssertFlushedVectorRegister(vrA)
42
43 #include "debug/tracers.h"
44 #include "ppc_cpu.h"
45 #include "ppc_dec.h"
46 #include "ppc_fpu.h"
47 #include "ppc_vec.h"
48
/* Bit 31 of a 32-bit word. */
#define SIGN32 0x80000000

/* Converts the 8-bit register id RL to the id RL+4, cast to NativeReg8
 * (presumably the matching high-byte register; verify against the register
 * enum).  Argument and expansion are parenthesized so that expression
 * arguments and surrounding operators cannot change the result.
 */
#define RL2RH(RL) ((NativeReg8)((RL) + 4))

/* Short aliases for two JITC client vector registers used as a scratch
 * vector (vrT) and as the source of an all-ones pattern (vrNEG1).
 */
#define vrT JITC_VECTOR_TEMP
#define vrNEG1 JITC_VECTOR_NEG1
55
/* Forward declarations.  Each helper is overloaded once per opcode enum
 * type (X86ALUPSopc and X86PALUopc); the definitions are not in this part
 * of the file.
 */
56 static inline void commutative_operation(X86ALUPSopc opc, int vrD, int vrA, int vrB);
57 static inline void noncommutative_operation(X86ALUPSopc opc, int vrD, int vrA, int vrB);
58
59 static inline void commutative_operation(X86PALUopc opc, int vrD, int vrA, int vrB);
60 static inline void noncommutative_operation(X86PALUopc opc, int vrD, int vrA, int vrB);
61
62 /* PACK_PIXEL Packs a uint32 pixel to uint16 pixel
63 * v.219
64 */
65 static inline uint16 PACK_PIXEL(uint32 clr)
66 {
67 return (((clr & 0x000000f8) >> 3) | \
68 ((clr & 0x0000f800) >> 6) | \
69 ((clr & 0x01f80000) >> 9));
70 }
71
72 /* UNPACK_PIXEL Unpacks a uint16 pixel to uint32 pixel
73 * v.276 & v.279
74 */
75 static inline uint32 UNPACK_PIXEL(uint16 clr)
76 {
77 return (((uint32)(clr & 0x001f)) | \
78 ((uint32)(clr & 0x03E0) << 3) | \
79 ((uint32)(clr & 0x7c00) << 6) | \
80 (((clr) & 0x8000) ? 0xff000000 : 0));
81 }
82
83 static inline uint8 SATURATE_UB(uint16 val)
84 {
85 if (val & 0xff00) {
86 gCPU.vscr |= VSCR_SAT;
87 return 0xff;
88 }
89 return val;
90 }
91 static inline uint8 SATURATE_0B(uint16 val)
92 {
93 if (val & 0xff00) {
94 gCPU.vscr |= VSCR_SAT;
95 return 0;
96 }
97 return val;
98 }
99
100 static inline uint16 SATURATE_UH(uint32 val)
101 {
102 if (val & 0xffff0000) {
103 gCPU.vscr |= VSCR_SAT;
104 return 0xffff;
105 }
106 return val;
107 }
108
109 static inline uint16 SATURATE_0H(uint32 val)
110 {
111 if (val & 0xffff0000) {
112 gCPU.vscr |= VSCR_SAT;
113 return 0;
114 }
115 return val;
116 }
117
118 static inline sint8 SATURATE_SB(sint16 val)
119 {
120 if (val > 127) { // 0x7F
121 gCPU.vscr |= VSCR_SAT;
122 return 127;
123 } else if (val < -128) { // 0x80
124 gCPU.vscr |= VSCR_SAT;
125 return -128;
126 }
127 return val;
128 }
129
130 static inline uint8 SATURATE_USB(sint16 val)
131 {
132 if (val > 0xff) {
133 gCPU.vscr |= VSCR_SAT;
134 return 0xff;
135 } else if (val < 0) {
136 gCPU.vscr |= VSCR_SAT;
137 return 0;
138 }
139 return (uint8)val;
140 }
141
142 static inline sint16 SATURATE_SH(sint32 val)
143 {
144 if (val > 32767) { // 0x7fff
145 gCPU.vscr |= VSCR_SAT;
146 return 32767;
147 } else if (val < -32768) { // 0x8000
148 gCPU.vscr |= VSCR_SAT;
149 return -32768;
150 }
151 return val;
152 }
153
154 static inline uint16 SATURATE_USH(sint32 val)
155 {
156 if (val > 0xffff) {
157 gCPU.vscr |= VSCR_SAT;
158 return 0xffff;
159 } else if (val < 0) {
160 gCPU.vscr |= VSCR_SAT;
161 return 0;
162 }
163 return (uint16)val;
164 }
165
166 static inline sint32 SATURATE_UW(sint64 val)
167 {
168 if (val > 0xffffffffLL) {
169 gCPU.vscr |= VSCR_SAT;
170 return 0xffffffffLL;
171 }
172 return val;
173 }
174
175 static inline sint32 SATURATE_SW(sint64 val)
176 {
177 if (val > 2147483647LL) { // 0x7fffffff
178 gCPU.vscr |= VSCR_SAT;
179 return 2147483647LL;
180 } else if (val < -2147483648LL) { // 0x80000000
181 gCPU.vscr |= VSCR_SAT;
182 return -2147483648LL;
183 }
184 return val;
185 }
186
187 static inline void vec_raise_saturate(void)
188 {
189 asmOR(&gCPU.vscr, VSCR_SAT);
190 }
191
192 static inline void vec_saturateSB(NativeReg reg, NativeReg reg2)
193 {
194 jitcClobberCarryAndFlags();
195
196 asmMOV_NoFlags(reg2, 0x7f);
197 asmALU(X86_CMP, reg, reg2);
198 NativeAddress skip1 = asmJxxFixup(X86_G);
199
200 asmMOV_NoFlags(reg2, 0xffffff80);
201 asmALU(X86_CMP, reg, reg2);
202 NativeAddress skip2 = asmJxxFixup(X86_L);
203
204 NativeAddress skip3 = asmJMPFixup();
205
206 asmResolveFixup(skip1);
207 asmResolveFixup(skip2);
208
209 asmALU(X86_MOV, reg, reg2);
210 vec_raise_saturate();
211
212 asmResolveFixup(skip3);
213 }
214
215 static inline void vec_saturateSUB(NativeReg reg, NativeReg reg2)
216 {
217 jitcClobberCarryAndFlags();
218
219 asmMOV_NoFlags(reg2, 0xff);
220 asmALU(X86_CMP, reg, reg2);
221 NativeAddress skip1 = asmJxxFixup(X86_G);
222
223 asmALU(X86_XOR, reg2, reg2);
224 asmALU(X86_CMP, reg, reg2);
225 NativeAddress skip2 = asmJxxFixup(X86_L);
226
227 NativeAddress skip3 = asmJMPFixup();
228
229 asmResolveFixup(skip1);
230 asmResolveFixup(skip2);
231
232 asmALU(X86_MOV, reg, reg2);
233 vec_raise_saturate();
234
235 asmResolveFixup(skip3);
236 }
237
238 static inline void vec_saturateUB(NativeReg8 reg)
239 {
240 jitcClobberCarryAndFlags();
241
242 asmALU(X86_CMP, (NativeReg)reg, 0xff);
243 NativeAddress skip1 = asmJxxFixup(X86_BE);
244
245 asmSET(X86_S, reg);
246 asmALU(X86_SUB, reg, 1);
247
248 vec_raise_saturate();
249
250 asmResolveFixup(skip1);
251 }
252
253
254 static inline void vec_saturateSH(NativeReg reg, NativeReg reg2)
255 {
256 jitcClobberCarryAndFlags();
257
258 asmMOV_NoFlags(reg2, 0x7fff);
259 asmALU(X86_CMP, reg, reg2);
260 NativeAddress skip1 = asmJxxFixup(X86_G);
261
262 asmMOV_NoFlags(reg2, 0xffff8000);
263 asmALU(X86_CMP, reg, reg2);
264 NativeAddress skip2 = asmJxxFixup(X86_L);
265
266 NativeAddress skip3 = asmJMPFixup();
267
268 asmResolveFixup(skip1);
269 asmResolveFixup(skip2);
270
271 asmALU(X86_MOV, reg, reg2);
272 vec_raise_saturate();
273
274 asmResolveFixup(skip3);
275 }
276
277 static inline void vec_saturateSUH(NativeReg reg, NativeReg reg2)
278 {
279 jitcClobberCarryAndFlags();
280
281 asmMOV_NoFlags(reg2, 0xffff);
282 asmALU(X86_CMP, reg, reg2);
283 NativeAddress skip1 = asmJxxFixup(X86_G);
284
285 asmALU(X86_XOR, reg2, reg2);
286 asmALU(X86_CMP, reg, reg2);
287 NativeAddress skip2 = asmJxxFixup(X86_L);
288
289 NativeAddress skip3 = asmJMPFixup();
290
291 asmResolveFixup(skip1);
292 asmResolveFixup(skip2);
293
294 asmALU(X86_MOV, reg, reg2);
295 vec_raise_saturate();
296
297 asmResolveFixup(skip3);
298 }
299
300 static inline void vec_saturateUH(NativeReg8 reg)
301 {
302 jitcClobberCarryAndFlags();
303
304 asmALU(X86_CMP, (NativeReg)reg, 0xffff);
305 NativeAddress skip1 = asmJxxFixup(X86_BE);
306
307 asmSET(X86_S, reg);
308 asmMOVxx(X86_MOVZX, (NativeReg)reg, reg);
309 asmALU(X86_SUB, (NativeReg)reg, 1);
310
311 vec_raise_saturate();
312
313 asmResolveFixup(skip1);
314 }
315
316 static inline void vec_getByte(NativeReg8 dest, int src, NativeReg r, int i)
317 {
318 modrm_o modrm;
319
320 ASSERT_FLUSHED(src);
321
322 asmALU(X86_MOV, dest, x86_mem2(modrm, r, &(gCPU.vr[src].b[i])));
323 }
324
325 static inline void vec_setByte(int dest, NativeReg r, int i, NativeReg8 src)
326 {
327 modrm_o modrm;
328
329 ASSERT_FLUSHED(dest);
330
331 asmALU(X86_MOV, x86_mem2(modrm, r, &(gCPU.vr[dest].b[i])), src);
332 }
333
334 static inline void vec_getByteZ(NativeReg dest, int src, int i)
335 {
336 modrm_o modrm;
337
338 ASSERT_FLUSHED(src);
339
340 asmMOVxx_B(X86_MOVZX, dest, x86_mem2(modrm, &(gCPU.vr[src].b[i])));
341 }
342
343 static inline void vec_getByteS(NativeReg dest, int src, int i)
344 {
345 modrm_o modrm;
346
347 ASSERT_FLUSHED(src);
348
349 asmMOVxx_B(X86_MOVSX, dest, x86_mem2(modrm, &(gCPU.vr[src].b[i])));
350 }
351
352 static inline void vec_getHalf(NativeReg16 dest, int src, int i)
353 {
354 ASSERT_FLUSHED(src);
355
356 asmMOV(dest, &(gCPU.vr[src].b[i]));
357 }
358
359 static inline void vec_setHalf(int dest, int i, NativeReg16 src)
360 {
361 ASSERT_FLUSHED(dest);
362
363 asmMOV(&(gCPU.vr[dest].b[i]), src);
364 }
365
366 static inline void vec_getHalfZ(NativeReg dest, int src, int i)
367 {
368 modrm_o modrm;
369
370 ASSERT_FLUSHED(src);
371
372 asmMOVxx_W(X86_MOVZX, dest, x86_mem2(modrm, &(gCPU.vr[src].b[i])));
373 }
374
375 static inline void vec_getHalfS(NativeReg dest, int src, int i)
376 {
377 modrm_o modrm;
378
379 ASSERT_FLUSHED(src);
380
381 asmMOVxx_W(X86_MOVSX, dest, x86_mem2(modrm, &(gCPU.vr[src].b[i])));
382 }
383
384 static inline void vec_getWord(NativeReg dest, int src, int i)
385 {
386 ASSERT_FLUSHED(src);
387
388 asmMOV(dest, &(gCPU.vr[src].b[i]));
389 }
390
391 static inline void vec_setWord(int dest, int i, NativeReg src)
392 {
393 ASSERT_FLUSHED(dest);
394
395 asmMOV(&(gCPU.vr[dest].b[i]), src);
396 }
397
/* Host capability switches.  SSE_AVAIL / SSE2_AVAIL read the flags in
 * gJITC.hostCPUCaps; the commented-out alternatives force them to 0.
 * SSE_NO / SSE2_NO are constant 0.
 */
398 #define SSE_AVAIL gJITC.hostCPUCaps.sse
399 //#define SSE_AVAIL 0
400
401 #define SSE2_AVAIL gJITC.hostCPUCaps.sse2
402 //#define SSE2_AVAIL 0
403
404 #define SSE_NO 0
405 #define SSE2_NO 0
406
407 const static sint32 sint32_max = 2147483647;
408 const static sint32 sint32_min = -2147483648;
409 const static double uint32_max = 4294967295;
410 const static uint32 uint32_min = 0;
411
412 static inline void vec_Copy(int dest, int src)
413 {
414 if (dest == src);
415
416 else if (SSE_AVAIL) {
417 NativeVectorReg reg = jitcMapClientVectorRegisterDirty(dest);
418 NativeVectorReg reg2 = jitcGetClientVectorRegisterMapping(src);
419
420 if (reg2 == VECTREG_NO)
421 asmMOVAPS(reg, &gCPU.vr[src]);
422 else
423 asmALUPS(X86_MOVAPS, reg, reg2);
424 } else {
425 jitcFlushClientVectorRegister(src);
426 jitcDropClientVectorRegister(dest);
427
428 NativeReg reg = jitcAllocRegister();
429
430 for (int i=0; i<16; i+=4) {
431 vec_getWord(reg, src, i);
432 vec_setWord(dest, i, reg);
433 }
434 }
435 }
436
437 static inline void vec_Zero(int dest)
438 {
439 if (SSE_AVAIL) {
440 NativeVectorReg reg = jitcMapClientVectorRegisterDirty(dest);
441
442 asmALUPS(X86_XORPS, reg, reg);
443 } else {
444 jitcDropClientVectorRegister(dest);
445
446 NativeReg reg = jitcAllocRegister();
447
448 asmMOV_NoFlags(reg, 0); // avoid flag clobber
449
450 vec_setWord(dest, 0, reg);
451 vec_setWord(dest, 4, reg);
452 vec_setWord(dest, 8, reg);
453 vec_setWord(dest,12, reg);
454 }
455 }
456
457 static inline void vec_Neg1(int dest)
458 {
459 if (SSE_AVAIL) {
460 NativeVectorReg reg = jitcMapClientVectorRegisterDirty(dest);
461 NativeVectorReg n1 = jitcGetClientVectorRegister(vrNEG1);
462
463 asmALUPS(X86_MOVAPS, reg, n1);
464 } else {
465 jitcDropClientVectorRegister(dest);
466
467 NativeReg reg = jitcAllocRegister();
468
469 asmMOV_NoFlags(reg, 0xffffffff); // avoid flag clobber
470
471 vec_setWord(dest, 0, reg);
472 vec_setWord(dest, 4, reg);
473 vec_setWord(dest, 8, reg);
474 vec_setWord(dest,12, reg);
475 }
476 }
477
478 static inline void vec_Not(int dest, int src)
479 {
480 if (SSE_AVAIL) {
481 NativeVectorReg reg1;
482 NativeVectorReg reg2;
483
484 if (src == dest) {
485 reg1 = jitcGetClientVectorRegisterDirty(dest);
486 } else {
487 reg1 = jitcMapClientVectorRegisterDirty(dest);
488 reg2 = jitcGetClientVectorRegisterMapping(src);
489
490 if (reg2 == VECTREG_NO)
491 asmMOVAPS(reg1, &gCPU.vr[src]);
492 else
493 asmALUPS(X86_MOVAPS, reg1, reg2);
494 }
495
496 reg2 = jitcGetClientVectorRegister(vrNEG1);
497
498 asmALUPS(X86_XORPS, reg1, reg2);
499 } else {
500 jitcFlushClientVectorRegister(src);
501 jitcDropClientVectorRegister(dest);
502
503 // according to documentation NOT does not effect flags!
504 NativeReg reg = jitcAllocRegister();
505
506 for (int i=0; i<16; i+=4) {
507 vec_getWord(reg, src, i);
508 asmALU(X86_NOT, reg);
509 vec_setWord(dest, i, reg);
510 }
511 }
512 }
513
514 static inline void vec_SelectiveLoadByte(X86FlagTest flags, NativeReg8 reg1, NativeReg reg2, int vrA, int vrB)
515 {
516 modrm_o modrm;
517
518 NativeAddress skip = asmJxxFixup(flags);
519 asmALU(X86_MOV, (NativeReg)reg1,
520 x86_mem2(modrm, reg2, &(gCPU.vr[vrA].b[16])));
521
522 NativeAddress end = asmJMPFixup();
523
524 asmResolveFixup(skip);
525 asmALU(X86_MOV, (NativeReg)reg1,
526 x86_mem2(modrm, reg2, &(gCPU.vr[vrB].b[16])));
527
528 asmResolveFixup(end);
529 }
530
531 static inline void vec_PermuteQuad(NativeReg8 reg1, NativeReg reg2, NativeReg8 regT, NativeReg regA, NativeReg regB, int vrA, int vrB, int vrD, int i)
532 {
533 modrm_o modrm;
534
535 asmMOVxx(X86_MOVSX, regA, reg1);
536 asmMOVxx(X86_MOVSX, regB, RL2RH(reg1));
537
538 if (gJITC.hostCPUCaps.cmov) {
539 asmALU(X86_TEST, reg2, 0x10);
540 asmCMOV(X86_Z, (NativeReg)regT,
541 x86_mem2(modrm, regA, &(gCPU.vr[vrA].b[16])));
542 asmCMOV(X86_NZ, (NativeReg)regT,
543 x86_mem2(modrm, regA, &(gCPU.vr[vrB].b[16])));
544
545 vec_setByte(vrD, REG_NO, i, regT);
546
547 asmALU(X86_TEST, reg2, 0x1000);
548 asmCMOV(X86_Z, (NativeReg)regT,
549 x86_mem2(modrm, regB, &(gCPU.vr[vrA].b[16])));
550 asmCMOV(X86_NZ, (NativeReg)regT,
551 x86_mem2(modrm, regB, &(gCPU.vr[vrB].b[16])));
552
553 vec_setByte(vrD, REG_NO, i+1, regT);
554
555 asmShift(X86_SHR, (NativeReg)reg1, 16);
556
557 asmMOVxx(X86_MOVSX, regA, reg1);
558 asmMOVxx(X86_MOVSX, regB, RL2RH(reg1));
559
560 asmALU(X86_TEST, reg2, 0x100000);
561 asmCMOV(X86_Z, (NativeReg)regT,
562 x86_mem2(modrm, regA, &(gCPU.vr[vrA].b[16])));
563 asmCMOV(X86_NZ, (NativeReg)regT,
564 x86_mem2(modrm, regA, &(gCPU.vr[vrB].b[16])));
565
566 vec_setByte(vrD, REG_NO, i+2, regT);
567
568 asmALU(X86_TEST, reg2, 0x10000000);
569 asmCMOV(X86_Z, (NativeReg)regT,
570 x86_mem2(modrm, regB, &(gCPU.vr[vrA].b[16])));
571 asmCMOV(X86_NZ, (NativeReg)regT,
572 x86_mem2(modrm, regB, &(gCPU.vr[vrB].b[16])));
573
574 vec_setByte(vrD, REG_NO, i+3, regT);
575 } else {
576 asmALU(X86_TEST, reg2, 0x10);
577 vec_SelectiveLoadByte(X86_NZ, regT, regA, vrA, vrB);
578 vec_setByte(vrD, REG_NO, i, regT);
579
580 asmALU(X86_TEST, reg2, 0x1000);
581 vec_SelectiveLoadByte(X86_NZ, regT, regB, vrA, vrB);
582 vec_setByte(vrD, REG_NO, i+1, regT);
583
584 asmShift(X86_SHR, (NativeReg)reg1, 16);
585
586 asmMOVxx(X86_MOVSX, regA, reg1);
587 asmMOVxx(X86_MOVSX, regB, RL2RH(reg1));
588
589 asmALU(X86_TEST, reg2, 0x100000);
590 vec_SelectiveLoadByte(X86_NZ, regT, regA, vrA, vrB);
591 vec_setByte(vrD, REG_NO, i+2, regT);
592
593 asmALU(X86_TEST, reg2, 0x10000000);
594 vec_SelectiveLoadByte(X86_NZ, regT, regB, vrA, vrB);
595 vec_setByte(vrD, REG_NO, i+3, regT);
596 }
597 }
598
599 /* vperm Vector Permutation
600 * v.218
601 */
602 void ppc_opc_vperm()
603 {
604 VECTOR_DEBUG_COMMON;
605 int vrD, vrA, vrB, vrC;
606 int sel;
607 Vector_t r;
608 PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
609 for (int i=0; i<16; i++) {
610 sel = gCPU.vr[vrC].b[i];
611 if (sel & 0x10)
612 r.b[i] = VECT_B(gCPU.vr[vrB], sel & 0xf);
613 else
614 r.b[i] = VECT_B(gCPU.vr[vrA], sel & 0xf);
615 }
616
617 gCPU.vr[vrD] = r;
618 }
619 JITCFlow ppc_opc_gen_vperm()
620 {
621 #if 0
622 ppc_opc_gen_interpret(ppc_opc_vperm);
623 return flowEndBlock;
624 #else
625 int vrD, vrA, vrB, vrC, vrTMP;
626 PPC_OPC_TEMPL_A(gJITC.current_opc, vrD, vrA, vrB, vrC);
627
628 jitcFlushClientVectorRegister(vrA);
629 jitcFlushClientVectorRegister(vrB);
630 jitcFlushClientVectorRegister(vrC);
631
632 jitcClobberCarryAndFlags();
633 NativeReg8 reg1 = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
634 NativeReg8 regT = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
635 NativeReg reg2 = jitcAllocRegister();
636 NativeReg regA = jitcAllocRegister();
637 NativeReg regB = jitcAllocRegister();
638
639 vrTMP = ((vrD == vrA) || (vrD == vrB)) ? vrT : vrD;
640 jitcDropClientVectorRegister(vrTMP);
641
642 for (int i=0; i<16; i+=4) {
643 vec_getWord((NativeReg)reg1, vrC, i);
644
645 asmALU(X86_MOV, reg2, (NativeReg)reg1);
646
647 asmALU(X86_AND, (NativeReg)reg1, 0x0f0f0f0f);
648 asmALU(X86_NOT, (NativeReg)reg1);
649
650 // reg1 = original map (selector mask)
651 // reg2 = negative offsets - 1
652 // regT = spare use register (must be 8-bit)
653 // regA = spare use register
654 // regB = spare use register
655 vec_PermuteQuad(reg1, reg2, regT, regA, regB, vrA, vrB, vrTMP, i);
656 }
657
658 if (vrTMP == vrT) {
659 if (SSE_AVAIL) {
660 vec_Copy(vrD, vrT);
661 } else {
662 /* Since we already have a number of registers
663 * allocated, this is much faster than the non-SSE
664 * vec_Copy()
665 */
666 jitcDropClientVectorRegister(vrD);
667
668 vec_getWord((NativeReg)reg1, vrT, 0);
669 vec_getWord(reg2, vrT, 4);
670 vec_getWord(regA, vrT, 8);
671 vec_getWord(regB, vrT,12);
672
673 vec_setWord(vrD, 0, (NativeReg)reg1);
674 vec_setWord(vrD, 4, reg2);
675 vec_setWord(vrD, 8, regA);
676 vec_setWord(vrD,12, regB);
677 }
678 }
679
680 return flowContinue;
681 #endif
682 }
683
684 /* vsel Vector Select
685 * v.238
686 */
687 void ppc_opc_vsel()
688 {
689 VECTOR_DEBUG;
690 int vrD, vrA, vrB, vrC;
691 uint64 mask, val;
692 PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, vrC);
693
694 mask = gCPU.vr[vrC].d[0];
695 val = gCPU.vr[vrB].d[0] & mask;
696 val |= gCPU.vr[vrA].d[0] & ~mask;
697 gCPU.vr[vrD].d[0] = val;
698
699 mask = gCPU.vr[vrC].d[1];
700 val = gCPU.vr[vrB].d[1] & mask;
701 val |= gCPU.vr[vrA].d[1] & ~mask;
702 gCPU.vr[vrD].d[1] = val;
703 }
704 JITCFlow ppc_opc_gen_vsel()
705 {
706 #if 0
707 ppc_opc_gen_interpret(ppc_opc_vsel);
708 return flowEndBlock;
709 #else
710 int vrD, vrA, vrB, vrC;
711 PPC_OPC_TEMPL_A(gJITC.current_opc, vrD, vrA, vrB, vrC);
712
713 if (vrA == vrB) {
714 vec_Copy(vrD, vrB);
715 return flowContinue;
716 }
717
718 if (SSE_AVAIL) {
719 modrm_o modrm;
720
721 /* We cannot use jitcMapClientVectorRegisterDirty(vrD) here,
722 * because if vrD == vrC
723 */
724 NativeVectorReg d = jitcAllocVectorRegister();
725 NativeVectorReg c;
726
727 if (vrA == vrC) {
728 c = jitcGetClientVectorRegisterMapping(vrC);
729
730 if (c == VECTREG_NO) {
731 asmMOVAPS(d, &gCPU.vr[vrC]);
732 } else {
733 asmALUPS(X86_MOVAPS, d, c);
734 }
735 } else {
736 c = jitcGetClientVectorRegister(vrC);
737 jitcTrashVectorRegister(NATIVE_REG | c);
738
739 asmALUPS(X86_MOVAPS, d, c);
740 }
741
742 if (vrB != vrC) {
743 NativeVectorReg b = jitcGetClientVectorRegisterMapping(vrB);
744
745 if (b == VECTREG_NO) {
746 asmALUPS(X86_ANDPS, d,
747 x86_mem2(modrm, &gCPU.vr[vrB]));
748 } else
749 asmALUPS(X86_ANDPS, d, b);
750 }
751
752 if (vrA != vrC) {
753 NativeVectorReg a = jitcGetClientVectorRegisterMapping(vrA);
754
755 if (a == VECTREG_NO) {
756 asmALUPS(X86_ANDNPS, c,
757 x86_mem2(modrm, &gCPU.vr[vrA]));
758 } else
759 asmALUPS(X86_ANDNPS, c, a);
760
761 asmALUPS(X86_ORPS, d, c);
762 }
763
764 jitcRenameVectorRegisterDirty(d, vrD);
765
766 return flowContinue;
767 }
768
769 jitcFlushClientVectorRegister(vrA);
770 jitcFlushClientVectorRegister(vrB);
771 jitcFlushClientVectorRegister(vrC);
772 jitcDropClientVectorRegister(vrD);
773
774 jitcClobberCarryAndFlags();
775 NativeReg reg1 = jitcAllocRegister();
776 NativeReg reg2 = jitcAllocRegister();
777 modrm_o modrm;
778
779 for (int i=0; i<16; i+=4) {
780 vec_getWord(reg1, vrC, i);
781 asmALU(X86_MOV, reg2, reg1);
782 asmALU(X86_NOT, reg2);
783
784 asmALU(X86_AND, reg1, x86_mem2(modrm, &(gCPU.vr[vrB].b[i])));
785
786 asmALU(X86_AND, reg2, x86_mem2(modrm, &(gCPU.vr[vrA].b[i])));
787
788 asmALU(X86_OR, reg1, reg2);
789 vec_setWord(vrD, i, reg1);
790 }
791
792 return flowContinue;
793 #endif
794 }
795
796 /* vsrb Vector Shift Right Byte
797 * v.256
798 */
799 void ppc_opc_vsrb()
800 {
801 VECTOR_DEBUG;
802 int vrD, vrA, vrB;
803 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
804 for (int i=0; i<16; i++) {
805 gCPU.vr[vrD].b[i] = gCPU.vr[vrA].b[i] >> (gCPU.vr[vrB].b[i] & 0x7);
806 }
807 }
808 JITCFlow ppc_opc_gen_vsrb()
809 {
810 #if 0
811 ppc_opc_gen_interpret(ppc_opc_vsrb);
812 return flowEndBlock;
813 #else
814 int vrD, vrA, vrB;
815 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
816
817 jitcFlushClientVectorRegister(vrA);
818 jitcFlushClientVectorRegister(vrB);
819 jitcDropClientVectorRegister(vrD);
820
821 jitcClobberCarryAndFlags();
822 jitcAllocRegister(NATIVE_REG | ECX);
823 NativeReg8 reg = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
824
825 for (int i=0; i<16; i++) {
826 vec_getByte(reg, vrA, REG_NO, i);
827 vec_getByte(CL, vrB, REG_NO, i);
828
829 asmALU(X86_AND, CL, 0x7);
830 asmShift_CL(X86_SHR, reg);
831
832 vec_setByte(vrD, REG_NO, i, reg);
833 }
834
835 return flowContinue;
836 #endif
837 }
838
839 /* vsrh Vector Shift Right Half Word
840 * v.257
841 */
842 void ppc_opc_vsrh()
843 {
844 VECTOR_DEBUG;
845 int vrD, vrA, vrB;
846 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
847 for (int i=0; i<8; i++) {
848 gCPU.vr[vrD].h[i] = gCPU.vr[vrA].h[i] >> (gCPU.vr[vrB].h[i] & 0xf);
849 }
850 }
851 JITCFlow ppc_opc_gen_vsrh()
852 {
853 #if 0
854 ppc_opc_gen_interpret(ppc_opc_vsrh);
855 return flowEndBlock;
856 #else
857 int vrD, vrA, vrB;
858 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
859
860 jitcFlushClientVectorRegister(vrA);
861 jitcFlushClientVectorRegister(vrB);
862 jitcDropClientVectorRegister(vrD);
863
864 jitcClobberCarryAndFlags();
865 jitcAllocRegister(NATIVE_REG | ECX);
866 NativeReg16 reg = (NativeReg16)jitcAllocRegister();
867
868 for (int i=0; i<16; i+=2) {
869 vec_getHalf(reg, vrA, i);
870 vec_getHalf(CX, vrB, i);
871
872 asmALU(X86_AND, CL, 0x0f);
873 asmShift_CL(X86_SHR, reg);
874
875 vec_setHalf(vrD, i, reg);
876 }
877
878 return flowContinue;
879 #endif
880 }
881
882 /* vsrw Vector Shift Right Word
883 * v.259
884 */
885 void ppc_opc_vsrw()
886 {
887 VECTOR_DEBUG;
888 int vrD, vrA, vrB;
889 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
890 for (int i=0; i<4; i++) {
891 gCPU.vr[vrD].w[i] = gCPU.vr[vrA].w[i] >> (gCPU.vr[vrB].w[i] & 0x1f);
892 }
893 }
894 JITCFlow ppc_opc_gen_vsrw()
895 {
896 #if 0
897 ppc_opc_gen_interpret(ppc_opc_vsrw);
898 return flowEndBlock;
899 #else
900 int vrD, vrA, vrB;
901 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
902
903 jitcFlushClientVectorRegister(vrA);
904 jitcFlushClientVectorRegister(vrB);
905 jitcDropClientVectorRegister(vrD);
906
907 jitcClobberCarryAndFlags();
908 jitcAllocRegister(NATIVE_REG | ECX);
909 NativeReg reg = jitcAllocRegister();
910
911 for (int i=0; i<16; i+=4) {
912 vec_getWord(reg, vrA, i);
913 vec_getWord(ECX, vrB, i);
914
915 // this instruction auto-masks to 0x1f
916 asmShift_CL(X86_SHR, reg);
917
918 vec_setWord(vrD, i, reg);
919 }
920
921 return flowContinue;
922 #endif
923 }
924
925 /* vsrab Vector Shift Right Arithmetic Byte
926 * v.253
927 */
928 void ppc_opc_vsrab()
929 {
930 VECTOR_DEBUG;
931 int vrD, vrA, vrB;
932 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
933 for (int i=0; i<16; i++) {
934 gCPU.vr[vrD].sb[i] = gCPU.vr[vrA].sb[i] >> (gCPU.vr[vrB].b[i] & 0x7);
935 }
936 }
937 JITCFlow ppc_opc_gen_vsrab()
938 {
939 #if 0
940 ppc_opc_gen_interpret(ppc_opc_vsrab);
941 return flowEndBlock;
942 #else
943 int vrD, vrA, vrB;
944 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
945
946 jitcFlushClientVectorRegister(vrA);
947 jitcFlushClientVectorRegister(vrB);
948 jitcDropClientVectorRegister(vrD);
949
950 jitcClobberCarryAndFlags();
951 jitcAllocRegister(NATIVE_REG | ECX);
952 NativeReg8 reg = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
953
954 for (int i=0; i<16; i++) {
955 vec_getByte(reg, vrA, REG_NO, i);
956 vec_getByte(CL, vrB, REG_NO, i);
957
958 asmALU(X86_AND, CL, 0x7);
959 asmShift_CL(X86_SAR, reg);
960
961 vec_setByte(vrD, REG_NO, i, reg);
962 }
963
964 return flowContinue;
965 #endif
966 }
967
968 /* vsrah Vector Shift Right Arithmetic Half Word
969 * v.254
970 */
971 void ppc_opc_vsrah()
972 {
973 VECTOR_DEBUG;
974 int vrD, vrA, vrB;
975 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
976 for (int i=0; i<8; i++) {
977 gCPU.vr[vrD].sh[i] = gCPU.vr[vrA].sh[i] >> (gCPU.vr[vrB].h[i] & 0xf);
978 }
979 }
980 JITCFlow ppc_opc_gen_vsrah()
981 {
982 #if 0
983 ppc_opc_gen_interpret(ppc_opc_vsrah);
984 return flowEndBlock;
985 #else
986 int vrD, vrA, vrB;
987 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
988
989 jitcFlushClientVectorRegister(vrA);
990 jitcFlushClientVectorRegister(vrB);
991 jitcDropClientVectorRegister(vrD);
992
993 jitcClobberCarryAndFlags();
994 jitcAllocRegister(NATIVE_REG | ECX);
995 NativeReg16 reg = (NativeReg16)jitcAllocRegister();
996
997 for (int i=0; i<16; i+=2) {
998 vec_getHalf(reg, vrA, i);
999 vec_getHalf(CX, vrB, i);
1000
1001 asmALU(X86_AND, CL, 0x0f);
1002 asmShift_CL(X86_SAR, reg);
1003
1004 vec_setHalf(vrD, i, reg);
1005 }
1006
1007 return flowContinue;
1008 #endif
1009 }
1010
1011 /* vsraw Vector Shift Right Arithmetic Word
1012 * v.255
1013 */
1014 void ppc_opc_vsraw()
1015 {
1016 VECTOR_DEBUG;
1017 int vrD, vrA, vrB;
1018 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1019 for (int i=0; i<4; i++) {
1020 gCPU.vr[vrD].sw[i] = gCPU.vr[vrA].sw[i] >> (gCPU.vr[vrB].w[i] & 0x1f);
1021 }
1022 }
1023 JITCFlow ppc_opc_gen_vsraw()
1024 {
1025 #if 0
1026 ppc_opc_gen_interpret(ppc_opc_vsraw);
1027 return flowEndBlock;
1028 #else
1029 int vrD, vrA, vrB;
1030 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1031
1032 jitcFlushClientVectorRegister(vrB);
1033 jitcDropClientVectorRegister(vrD);
1034
1035 jitcClobberCarryAndFlags();
1036 jitcAllocRegister(NATIVE_REG | ECX);
1037 NativeReg reg = jitcAllocRegister();
1038
1039 for (int i=0; i<16; i+=4) {
1040 vec_getWord(reg, vrA, i);
1041 vec_getWord(ECX, vrB, i);
1042
1043 // this instruction auto-masks to 0x1f
1044 asmShift_CL(X86_SAR, reg);
1045
1046 vec_setWord(vrD, i, reg);
1047 }
1048
1049 return flowContinue;
1050 #endif
1051 }
1052
1053 /* vslb Vector Shift Left Byte
1054 * v.240
1055 */
1056 void ppc_opc_vslb()
1057 {
1058 VECTOR_DEBUG;
1059 int vrD, vrA, vrB;
1060 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1061 for (int i=0; i<16; i++) {
1062 gCPU.vr[vrD].b[i] = gCPU.vr[vrA].b[i] << (gCPU.vr[vrB].b[i] & 0x7);
1063 }
1064 }
1065 JITCFlow ppc_opc_gen_vslb()
1066 {
1067 #if 0
1068 ppc_opc_gen_interpret(ppc_opc_vslb);
1069 return flowEndBlock;
1070 #else
1071 int vrD, vrA, vrB;
1072 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1073
1074 jitcFlushClientVectorRegister(vrA);
1075 jitcFlushClientVectorRegister(vrB);
1076 jitcDropClientVectorRegister(vrD);
1077
1078 jitcClobberCarryAndFlags();
1079 jitcAllocRegister(NATIVE_REG | ECX);
1080 NativeReg8 reg = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
1081
1082 for (int i=0; i<16; i++) {
1083 vec_getByte(reg, vrA, REG_NO, i);
1084 vec_getByte(CL, vrB, REG_NO, i);
1085
1086 asmALU(X86_AND, CL, 0x7);
1087 asmShift_CL(X86_SHL, reg);
1088
1089 vec_setByte(vrD, REG_NO, i, reg);
1090 }
1091
1092 return flowContinue;
1093 #endif
1094 }
1095
1096 /* vslh Vector Shift Left Half Word
1097 * v.242
1098 */
1099 void ppc_opc_vslh()
1100 {
1101 VECTOR_DEBUG;
1102 int vrD, vrA, vrB;
1103 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1104 for (int i=0; i<8; i++) {
1105 gCPU.vr[vrD].h[i] = gCPU.vr[vrA].h[i] << (gCPU.vr[vrB].h[i] & 0xf);
1106 }
1107 }
1108 JITCFlow ppc_opc_gen_vslh()
1109 {
1110 #if 0
1111 ppc_opc_gen_interpret(ppc_opc_vslh);
1112 return flowEndBlock;
1113 #else
1114 int vrD, vrA, vrB;
1115 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1116
1117 jitcFlushClientVectorRegister(vrA);
1118 jitcFlushClientVectorRegister(vrB);
1119 jitcDropClientVectorRegister(vrD);
1120
1121 jitcClobberCarryAndFlags();
1122 jitcAllocRegister(NATIVE_REG | ECX);
1123 NativeReg16 reg = (NativeReg16)jitcAllocRegister();
1124
1125 for (int i=0; i<16; i+=2) {
1126 vec_getHalf(reg, vrA, i);
1127 vec_getHalf(CX, vrB, i);
1128
1129 asmALU(X86_AND, CL, 0x0f);
1130 asmShift_CL(X86_SHL, reg);
1131
1132 vec_setHalf(vrD, i, reg);
1133 }
1134
1135 return flowContinue;
1136 #endif
1137 }
1138
1139 /* vslw Vector Shift Left Word
1140 * v.244
1141 */
1142 void ppc_opc_vslw()
1143 {
1144 VECTOR_DEBUG;
1145 int vrD, vrA, vrB;
1146 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1147 for (int i=0; i<4; i++) {
1148 gCPU.vr[vrD].w[i] = gCPU.vr[vrA].w[i] << (gCPU.vr[vrB].w[i] & 0x1f);
1149 }
1150 }
1151 JITCFlow ppc_opc_gen_vslw()
1152 {
1153 #if 0
1154 ppc_opc_gen_interpret(ppc_opc_vslw);
1155 return flowEndBlock;
1156 #else
1157 int vrD, vrA, vrB;
1158 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1159
1160 jitcFlushClientVectorRegister(vrA);
1161 jitcFlushClientVectorRegister(vrB);
1162 jitcDropClientVectorRegister(vrD);
1163
1164 jitcClobberCarryAndFlags();
1165 jitcAllocRegister(NATIVE_REG | ECX);
1166 NativeReg reg = jitcAllocRegister();
1167
1168 for (int i=0; i<16; i+=4) {
1169 vec_getWord(reg, vrA, i);
1170 vec_getWord(ECX, vrB, i);
1171
1172 // this instruction auto-masks to 0x1f
1173 asmShift_CL(X86_SHL, reg);
1174
1175 vec_setWord(vrD, i, reg);
1176 }
1177
1178 return flowContinue;
1179 #endif
1180 }
1181
1182 /* vsr Vector Shift Right
1183 * v.251
1184 */
1185 void ppc_opc_vsr()
1186 {
1187 VECTOR_DEBUG;
1188 int vrD, vrA, vrB;
1189 Vector_t r;
1190 int shift;
1191 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1192
1193 /* Specs say that the low-order 3 bits of all byte elements in vB
1194 * must be the same, or the result is undefined. So we can just
1195 * use the same low-order 3 bits for all of our shifts.
1196 */
1197 shift = gCPU.vr[vrB].w[0] & 0x7;
1198
1199 r.d[0] = gCPU.vr[vrA].d[0] >> shift;
1200 r.d[1] = gCPU.vr[vrA].d[1] >> shift;
1201
1202 VECT_D(r, 1) |= VECT_D(gCPU.vr[vrA], 0) << (64 - shift);
1203
1204 gCPU.vr[vrD] = r;
1205 }
1206 JITCFlow ppc_opc_gen_vsr()
1207 {
1208 ppc_opc_gen_interpret(ppc_opc_vsr);
1209 return flowEndBlock;
1210 }
1211
1212 /* vsro Vector Shift Right Octet
1213 * v.258
1214 */
1215 void ppc_opc_vsro()
1216 {
1217 VECTOR_DEBUG;
1218 int vrD, vrA, vrB;
1219 Vector_t r;
1220 int shift, i;
1221 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1222
1223 shift = (gCPU.vr[vrB].w[0] >> 3) & 0xf;
1224 #if HOST_ENDIANESS == HOST_ENDIANESS_LE
1225 for (i=0; i<(16-shift); i++) {
1226 r.b[i] = gCPU.vr[vrA].b[i+shift];
1227 }
1228
1229 for (; i<16; i++) {
1230 r.b[i] = 0;
1231 }
1232 #elif HOST_ENDIANESS == HOST_ENDIANESS_BE
1233 for (i=0; i<shift; i++) {
1234 r.b[i] = 0;
1235 }
1236
1237 for (; i<16; i++) {
1238 r.b[i] = gCPU.vr[vrA].b[i-shift];
1239 }
1240 #else
1241 #error Endianess not supported!
1242 #endif
1243
1244 gCPU.vr[vrD] = r;
1245 }
1246 JITCFlow ppc_opc_gen_vsro()
1247 {
1248 ppc_opc_gen_interpret(ppc_opc_vsro);
1249 return flowEndBlock;
1250 }
1251
1252 /* vsl Vector Shift Left
1253 * v.239
1254 */
1255 void ppc_opc_vsl()
1256 {
1257 VECTOR_DEBUG;
1258 int vrD, vrA, vrB;
1259 Vector_t r;
1260 int shift;
1261 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1262
1263 /* Specs say that the low-order 3 bits of all byte elements in vB
1264 * must be the same, or the result is undefined. So we can just
1265 * use the same low-order 3 bits for all of our shifts.
1266 */
1267 shift = gCPU.vr[vrB].w[0] & 0x7;
1268
1269 r.d[0] = gCPU.vr[vrA].d[0] << shift;
1270 r.d[1] = gCPU.vr[vrA].d[1] << shift;
1271
1272 VECT_D(r, 0) |= VECT_D(gCPU.vr[vrA], 1) >> (64 - shift);
1273
1274 gCPU.vr[vrD] = r;
1275 }
1276 JITCFlow ppc_opc_gen_vsl()
1277 {
1278 ppc_opc_gen_interpret(ppc_opc_vsl);
1279 return flowEndBlock;
1280 }
1281
1282 /* vslo Vector Shift Left Octet
1283 * v.243
1284 */
1285 void ppc_opc_vslo()
1286 {
1287 VECTOR_DEBUG;
1288 int vrD, vrA, vrB;
1289 Vector_t r;
1290 int shift, i;
1291 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1292
1293 shift = (gCPU.vr[vrB].w[0] >> 3) & 0xf;
1294 #if HOST_ENDIANESS == HOST_ENDIANESS_LE
1295 for (i=0; i<shift; i++) {
1296 r.b[i] = 0;
1297 }
1298
1299 for (; i<16; i++) {
1300 r.b[i] = gCPU.vr[vrA].b[i-shift];
1301 }
1302 #elif HOST_ENDIANESS == HOST_ENDIANESS_BE
1303 for (i=0; i<(16-shift); i++) {
1304 r.b[i] = gCPU.vr[vrA].b[i+shift];
1305 }
1306
1307 for (; i<16; i++) {
1308 r.b[i] = 0;
1309 }
1310 #else
1311 #error Endianess not supported!
1312 #endif
1313
1314 gCPU.vr[vrD] = r;
1315 }
1316 JITCFlow ppc_opc_gen_vslo()
1317 {
1318 ppc_opc_gen_interpret(ppc_opc_vslo);
1319 return flowEndBlock;
1320 }
1321
1322 static inline void ppc_opc_gen_helper_vsldoi_left(int vrA, int vrD, int shift, int pshift, NativeReg reg)
1323 {
1324 for (int i=12; i>=shift; i-=4) {
1325 vec_getWord(reg, vrA, i-shift);
1326
1327 vec_setWord(vrD, i, reg);
1328 }
1329
1330 if (pshift) {
1331 for (int i=shift+4-pshift; i>=shift; i--) {
1332 vec_getByte((NativeReg8)reg, vrA, REG_NO, i-shift);
1333
1334 vec_setByte(vrD, REG_NO, i, (NativeReg8)reg);
1335 }
1336 }
1337 }
1338
1339 static inline void ppc_opc_gen_helper_vsldoi_right(int vrB, int vrD, int shift, int ashift, int bshift, NativeReg reg)
1340 {
1341 for (int i=0; i<bshift; i+=4) {
1342 vec_getWord(reg, vrB, i+ashift);
1343
1344 vec_setWord(vrD, i, reg);
1345 }
1346
1347 for (int i=bshift; i<shift; i++) {
1348 vec_getByte((NativeReg8)reg, vrB, REG_NO, i+ashift);
1349
1350 vec_setByte(vrD, REG_NO, i, (NativeReg8)reg);
1351 }
1352 }
1353
1354 /* vsldoi Vector Shift Left Double by Octet Immediate
1355 * v.241
1356 */
1357 void ppc_opc_vsldoi()
1358 {
1359 VECTOR_DEBUG_COMMON;
1360 int vrD, vrA, vrB, shift, ashift;
1361 int i;
1362 Vector_t r;
1363 PPC_OPC_TEMPL_A(gCPU.current_opc, vrD, vrA, vrB, shift);
1364
1365 shift &= 0xf;
1366 ashift = 16 - shift;
1367
1368 #if HOST_ENDIANESS == HOST_ENDIANESS_LE
1369 for (i=0; i<shift; i++) {
1370 r.b[i] = gCPU.vr[vrB].b[i+ashift];
1371 }
1372
1373 for (; i<16; i++) {
1374 r.b[i] = gCPU.vr[vrA].b[i-shift];
1375 }
1376 #elif HOST_ENDIANESS == HOST_ENDIANESS_BE
1377 for (i=0; i<ashift; i++) {
1378 r.b[i] = gCPU.vr[vrA].b[i+shift];
1379 }
1380
1381 for (; i<16; i++) {
1382 r.b[i] = gCPU.vr[vrB].b[i-ashift];
1383 }
1384 #else
1385 #error Endianess not supported!
1386 #endif
1387
1388 gCPU.vr[vrD] = r;
1389 }
1390 JITCFlow ppc_opc_gen_vsldoi()
1391 {
1392 #if 0
1393 ppc_opc_gen_interpret(ppc_opc_vsldoi);
1394 return flowEndBlock;
1395 #else
1396 int vrD, vrA, vrB;
1397 int shift, ashift, pshift, bshift;
1398 PPC_OPC_TEMPL_A(gJITC.current_opc, vrD, vrA, vrB, shift);
1399
1400 jitcFlushClientVectorRegister(vrA);
1401 jitcFlushClientVectorRegister(vrB);
1402 jitcDropClientVectorRegister(vrD);
1403
1404 shift &= 0xf;
1405 ashift = 16 - shift;
1406 bshift = (shift & ~0x03);
1407 pshift = shift & 0x03;
1408
1409 NativeReg reg = jitcAllocRegister(NATIVE_REG_8);
1410
1411 if ((vrD == vrA) && (vrD != vrB)) {
1412 ppc_opc_gen_helper_vsldoi_left(vrA, vrD, shift, pshift, reg);
1413
1414 ppc_opc_gen_helper_vsldoi_right(vrB, vrD, shift, ashift, bshift, reg);
1415
1416 return flowContinue;
1417 }
1418
1419 if (vrD == vrA) {
1420 jitcDropClientVectorRegister(vrT);
1421 ppc_opc_gen_helper_vsldoi_right(vrB, vrT, shift, ashift, bshift, reg);
1422 } else {
1423 ppc_opc_gen_helper_vsldoi_right(vrB, vrD, shift, ashift, bshift, reg);
1424 }
1425
1426 ppc_opc_gen_helper_vsldoi_left(vrA, vrD, shift, pshift, reg);
1427
1428 if (vrD == vrA) {
1429 for (int i=0; i<bshift; i+=4) {
1430 vec_getWord(reg, vrT, i);
1431 vec_setWord(vrD, i, reg);
1432 }
1433
1434 for (int i=bshift; i<shift; i++) {
1435 vec_getByte((NativeReg8)reg, vrT, REG_NO, i);
1436 vec_setByte(vrD, REG_NO, i, (NativeReg8)reg);
1437 }
1438 }
1439
1440 return flowContinue;
1441 #endif
1442 }
1443
1444 /* vrlb Vector Rotate Left Byte
1445 * v.234
1446 */
1447 void ppc_opc_vrlb()
1448 {
1449 VECTOR_DEBUG;
1450 int vrD, vrA, vrB, shift;
1451 Vector_t r;
1452 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1453
1454 for (int i=0; i<16; i++) {
1455 shift = (gCPU.vr[vrB].b[i] & 0x7);
1456
1457 r.b[i] = gCPU.vr[vrA].b[i] << shift;
1458 r.b[i] |= gCPU.vr[vrA].b[i] >> (8 - shift);
1459 }
1460
1461 gCPU.vr[vrD] = r;
1462 }
1463 JITCFlow ppc_opc_gen_vrlb()
1464 {
1465 #if 0
1466 ppc_opc_gen_interpret(ppc_opc_vrlb);
1467 return flowEndBlock;
1468 #else
1469
1470 int vrD, vrA, vrB;
1471 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1472
1473 jitcFlushClientVectorRegister(vrA);
1474 jitcFlushClientVectorRegister(vrB);
1475 jitcDropClientVectorRegister(vrD);
1476
1477 jitcClobberCarryAndFlags();
1478 jitcAllocRegister(NATIVE_REG | ECX);
1479 NativeReg8 reg = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
1480
1481 for (int i=0; i<16; i++) {
1482 vec_getByte(reg, vrA, REG_NO, i);
1483 vec_getByte(CL, vrB, REG_NO, i);
1484
1485 asmShift_CL(X86_ROL, reg);
1486
1487 vec_setByte(vrD, REG_NO, i, reg);
1488 }
1489
1490 return flowContinue;
1491 #endif
1492 }
1493
1494 /* vrlh Vector Rotate Left Half Word
1495 * v.235
1496 */
1497 void ppc_opc_vrlh()
1498 {
1499 VECTOR_DEBUG;
1500 int vrD, vrA, vrB, shift;
1501 Vector_t r;
1502 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1503
1504 for (int i=0; i<8; i++) {
1505 shift = (gCPU.vr[vrB].h[i] & 0xf);
1506
1507 r.h[i] = gCPU.vr[vrA].h[i] << shift;
1508 r.h[i] |= gCPU.vr[vrA].h[i] >> (16 - shift);
1509 }
1510
1511 gCPU.vr[vrD] = r;
1512 }
1513 JITCFlow ppc_opc_gen_vrlh()
1514 {
1515 ppc_opc_gen_interpret(ppc_opc_vrlh);
1516 return flowEndBlock;
1517 }
1518
1519 /* vrlw Vector Rotate Left Word
1520 * v.236
1521 */
1522 void ppc_opc_vrlw()
1523 {
1524 VECTOR_DEBUG;
1525 int vrD, vrA, vrB, shift;
1526 Vector_t r;
1527 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1528
1529 for (int i=0; i<4; i++) {
1530 shift = (gCPU.vr[vrB].w[i] & 0x1F);
1531
1532 r.w[i] = gCPU.vr[vrA].w[i] << shift;
1533 r.w[i] |= gCPU.vr[vrA].w[i] >> (32 - shift);
1534 }
1535
1536 gCPU.vr[vrD] = r;
1537 }
1538 JITCFlow ppc_opc_gen_vrlw()
1539 {
1540 ppc_opc_gen_interpret(ppc_opc_vrlw);
1541 return flowEndBlock;
1542 }
1543
1544 /* With the merges, I just don't see any point in risking that a compiler
1545 * might generate actual alu code to calculate anything when it's
1546 * compile-time known. Plus, it's easier to validate it like this.
1547 */
1548
1549 /* vmrghb Vector Merge High Byte
1550 * v.195
1551 */
1552 void ppc_opc_vmrghb()
1553 {
1554 VECTOR_DEBUG;
1555 int vrD, vrA, vrB;
1556 Vector_t r;
1557 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1558
1559 VECT_B(r, 0) = VECT_B(gCPU.vr[vrA], 0);
1560 VECT_B(r, 1) = VECT_B(gCPU.vr[vrB], 0);
1561 VECT_B(r, 2) = VECT_B(gCPU.vr[vrA], 1);
1562 VECT_B(r, 3) = VECT_B(gCPU.vr[vrB], 1);
1563 VECT_B(r, 4) = VECT_B(gCPU.vr[vrA], 2);
1564 VECT_B(r, 5) = VECT_B(gCPU.vr[vrB], 2);
1565 VECT_B(r, 6) = VECT_B(gCPU.vr[vrA], 3);
1566 VECT_B(r, 7) = VECT_B(gCPU.vr[vrB], 3);
1567 VECT_B(r, 8) = VECT_B(gCPU.vr[vrA], 4);
1568 VECT_B(r, 9) = VECT_B(gCPU.vr[vrB], 4);
1569 VECT_B(r,10) = VECT_B(gCPU.vr[vrA], 5);
1570 VECT_B(r,11) = VECT_B(gCPU.vr[vrB], 5);
1571 VECT_B(r,12) = VECT_B(gCPU.vr[vrA], 6);
1572 VECT_B(r,13) = VECT_B(gCPU.vr[vrB], 6);
1573 VECT_B(r,14) = VECT_B(gCPU.vr[vrA], 7);
1574 VECT_B(r,15) = VECT_B(gCPU.vr[vrB], 7);
1575
1576 gCPU.vr[vrD] = r;
1577 }
1578 JITCFlow ppc_opc_gen_vmrghb()
1579 {
1580 #if 0
1581 ppc_opc_gen_interpret(ppc_opc_vmrghb);
1582 return flowEndBlock;
1583 #else
1584 int vrD, vrA, vrB;
1585 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1586
1587 if (SSE2_AVAIL) {
1588 noncommutative_operation(PALUB(X86_PUNPCKH), vrD, vrB, vrA);
1589 return flowContinue;
1590 }
1591
1592 jitcFlushClientVectorRegister(vrA);
1593 jitcFlushClientVectorRegister(vrB);
1594 jitcDropClientVectorRegister(vrD);
1595
1596 NativeReg8 reg1 = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
1597 NativeReg8 reg2 = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
1598
1599 vec_getHalf((NativeReg16)reg1, vrB, 8);
1600 vec_getHalf((NativeReg16)reg2, vrA, 8);
1601 asmALU(X86_XCHG, RL2RH(reg1), reg2);
1602 vec_setHalf(vrD, 0, (NativeReg16)reg1);
1603 vec_setHalf(vrD, 2, (NativeReg16)reg2);
1604
1605 vec_getHalf((NativeReg16)reg1, vrB, 10);
1606 vec_getHalf((NativeReg16)reg2, vrA, 10);
1607 asmALU(X86_XCHG, RL2RH(reg1), reg2);
1608 vec_setHalf(vrD, 4, (NativeReg16)reg1);
1609 vec_setHalf(vrD, 6, (NativeReg16)reg2);
1610
1611 vec_getHalf((NativeReg16)reg1, vrB, 12);
1612 vec_getHalf((NativeReg16)reg2, vrA, 12);
1613 asmALU(X86_XCHG, RL2RH(reg1), reg2);
1614 vec_setHalf(vrD, 8, (NativeReg16)reg1);
1615 vec_setHalf(vrD, 10, (NativeReg16)reg2);
1616
1617 vec_getHalf((NativeReg16)reg1, vrB, 14);
1618 vec_getHalf((NativeReg16)reg2, vrA, 14);
1619 asmALU(X86_XCHG, RL2RH(reg1), reg2);
1620 vec_setHalf(vrD, 12, (NativeReg16)reg1);
1621 vec_setHalf(vrD, 14, (NativeReg16)reg2);
1622
1623 return flowContinue;
1624 #endif
1625 }
1626
1627 /* vmrghh Vector Merge High Half Word
1628 * v.196
1629 */
1630 void ppc_opc_vmrghh()
1631 {
1632 VECTOR_DEBUG;
1633 int vrD, vrA, vrB;
1634 Vector_t r;
1635 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1636
1637 VECT_H(r, 0) = VECT_H(gCPU.vr[vrA], 0);
1638 VECT_H(r, 1) = VECT_H(gCPU.vr[vrB], 0);
1639 VECT_H(r, 2) = VECT_H(gCPU.vr[vrA], 1);
1640 VECT_H(r, 3) = VECT_H(gCPU.vr[vrB], 1);
1641 VECT_H(r, 4) = VECT_H(gCPU.vr[vrA], 2);
1642 VECT_H(r, 5) = VECT_H(gCPU.vr[vrB], 2);
1643 VECT_H(r, 6) = VECT_H(gCPU.vr[vrA], 3);
1644 VECT_H(r, 7) = VECT_H(gCPU.vr[vrB], 3);
1645
1646 gCPU.vr[vrD] = r;
1647 }
1648 JITCFlow ppc_opc_gen_vmrghh()
1649 {
1650 #if 0
1651 ppc_opc_gen_interpret(ppc_opc_vmrghh);
1652 return flowEndBlock;
1653 #else
1654 int vrD, vrA, vrB;
1655 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1656
1657 if (SSE2_AVAIL) {
1658 noncommutative_operation(PALUW(X86_PUNPCKH), vrD, vrB, vrA);
1659 return flowContinue;
1660 }
1661
1662 jitcFlushClientVectorRegister(vrA);
1663 jitcFlushClientVectorRegister(vrB);
1664 jitcDropClientVectorRegister(vrD);
1665
1666 NativeReg16 reg1 = (NativeReg16)jitcAllocRegister();
1667
1668 vec_getHalf(reg1, vrB, 8);
1669 vec_setHalf(vrD, 0, reg1);
1670 vec_getHalf(reg1, vrB, 10);
1671 vec_setHalf(vrD, 4, reg1);
1672
1673 vec_getHalf(reg1, vrA, 8);
1674 vec_setHalf(vrD, 2, reg1);
1675 vec_getHalf(reg1, vrA, 10);
1676 vec_setHalf(vrD, 6, reg1);
1677
1678 vec_getHalf(reg1, vrB, 12);
1679 vec_setHalf(vrD, 8, reg1);
1680
1681 vec_getHalf(reg1, vrA, 12);
1682 vec_setHalf(vrD, 10, reg1);
1683
1684 vec_getHalf(reg1, vrB, 14);
1685 vec_setHalf(vrD, 12, reg1);
1686
1687 if (vrD != vrA) {
1688 vec_getHalf(reg1, vrA, 14);
1689 vec_setHalf(vrD, 14, reg1);
1690 }
1691
1692 return flowContinue;
1693 #endif
1694 }
1695
1696 /* vmrghw Vector Merge High Word
1697 * v.197
1698 */
1699 void ppc_opc_vmrghw()
1700 {
1701 VECTOR_DEBUG;
1702 int vrD, vrA, vrB;
1703 Vector_t r;
1704 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1705
1706 VECT_W(r, 0) = VECT_W(gCPU.vr[vrA], 0);
1707 VECT_W(r, 1) = VECT_W(gCPU.vr[vrB], 0);
1708 VECT_W(r, 2) = VECT_W(gCPU.vr[vrA], 1);
1709 VECT_W(r, 3) = VECT_W(gCPU.vr[vrB], 1);
1710
1711 gCPU.vr[vrD] = r;
1712 }
1713 JITCFlow ppc_opc_gen_vmrghw()
1714 {
1715 #if 0
1716 ppc_opc_gen_interpret(ppc_opc_vmrghw);
1717 return flowEndBlock;
1718 #else
1719 int vrD, vrA, vrB;
1720 modrm_o modrm;
1721 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1722
1723 if (SSE2_AVAIL) {
1724 noncommutative_operation(PALUD(X86_PUNPCKH), vrD, vrB, vrA);
1725 return flowContinue;
1726 }
1727
1728 if (SSE_AVAIL) {
1729 NativeVectorReg dest, src;
1730 int shuf_map = 0x72;
1731 int vrS = vrB;
1732
1733 if (vrA == vrD) {
1734 dest = jitcGetClientVectorRegisterDirty(vrA);
1735 src = jitcGetClientVectorRegisterMapping(vrB);
1736 } else if (vrB == vrD) {
1737 dest = jitcGetClientVectorRegisterDirty(vrB);
1738 src = jitcGetClientVectorRegisterMapping(vrA);
1739 shuf_map = 0xd8;
1740 vrS = vrA;
1741 } else {
1742 dest = jitcMapClientVectorRegisterDirty(vrD);
1743 src = jitcGetClientVectorRegisterMapping(vrA);
1744
1745 if (src == VECTREG_NO)
1746 asmMOVAPS(dest, &gCPU.vr[vrA]);
1747 else
1748 asmALUPS(X86_MOVAPS, dest, src);
1749
1750 src = jitcGetClientVectorRegisterMapping(vrB);
1751 }
1752
1753 if (src == VECTREG_NO) {
1754 asmSHUFPS(dest, x86_mem2(modrm, &gCPU.vr[vrS]), 0xee);
1755 } else {
1756 asmSHUFPS(dest, src, 0xee);
1757 }
1758
1759 asmSHUFPS(dest, dest, shuf_map);
1760
1761 return flowContinue;
1762 }
1763
1764 jitcFlushClientVectorRegister(vrA);
1765 jitcFlushClientVectorRegister(vrB);
1766 jitcDropClientVectorRegister(vrD);
1767
1768 NativeReg reg = jitcAllocRegister();
1769 JitcVectorReg vrTMP = vrD;
1770
1771 if (vrA == vrD || vrB == vrD) {
1772 vrTMP = vrT;
1773 }
1774
1775 vec_getWord(reg, vrA, 12);
1776 vec_setWord(vrTMP, 12, reg);
1777
1778 vec_getWord(reg, vrB, 12);
1779 vec_setWord(vrTMP, 8, reg);
1780
1781 vec_getWord(reg, vrA, 8);
1782 vec_setWord(vrTMP, 4, reg);
1783
1784 vec_getWord(reg, vrB, 8);
1785 vec_setWord(vrTMP, 0, reg);
1786
1787 vec_Copy(vrD, vrTMP);
1788
1789 return flowContinue;
1790 #endif
1791 }
1792
1793 /* vmrglb Vector Merge Low Byte
1794 * v.198
1795 */
1796 void ppc_opc_vmrglb()
1797 {
1798 VECTOR_DEBUG;
1799 int vrD, vrA, vrB;
1800 Vector_t r;
1801 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1802
1803 VECT_B(r, 0) = VECT_B(gCPU.vr[vrA], 8);
1804 VECT_B(r, 1) = VECT_B(gCPU.vr[vrB], 8);
1805 VECT_B(r, 2) = VECT_B(gCPU.vr[vrA], 9);
1806 VECT_B(r, 3) = VECT_B(gCPU.vr[vrB], 9);
1807 VECT_B(r, 4) = VECT_B(gCPU.vr[vrA],10);
1808 VECT_B(r, 5) = VECT_B(gCPU.vr[vrB],10);
1809 VECT_B(r, 6) = VECT_B(gCPU.vr[vrA],11);
1810 VECT_B(r, 7) = VECT_B(gCPU.vr[vrB],11);
1811 VECT_B(r, 8) = VECT_B(gCPU.vr[vrA],12);
1812 VECT_B(r, 9) = VECT_B(gCPU.vr[vrB],12);
1813 VECT_B(r,10) = VECT_B(gCPU.vr[vrA],13);
1814 VECT_B(r,11) = VECT_B(gCPU.vr[vrB],13);
1815 VECT_B(r,12) = VECT_B(gCPU.vr[vrA],14);
1816 VECT_B(r,13) = VECT_B(gCPU.vr[vrB],14);
1817 VECT_B(r,14) = VECT_B(gCPU.vr[vrA],15);
1818 VECT_B(r,15) = VECT_B(gCPU.vr[vrB],15);
1819
1820 gCPU.vr[vrD] = r;
1821 }
1822 JITCFlow ppc_opc_gen_vmrglb()
1823 {
1824 #if 0
1825 ppc_opc_gen_interpret(ppc_opc_vmrglb);
1826 return flowEndBlock;
1827 #else
1828 int vrD, vrA, vrB;
1829 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1830
1831 if (SSE2_AVAIL) {
1832 noncommutative_operation(PALUB(X86_PUNPCKL), vrD, vrB, vrA);
1833 return flowContinue;
1834 }
1835
1836 jitcFlushClientVectorRegister(vrA);
1837 jitcFlushClientVectorRegister(vrB);
1838 jitcDropClientVectorRegister(vrD);
1839
1840 NativeReg8 reg1 = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
1841 NativeReg8 reg2 = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
1842
1843 vec_getHalf((NativeReg16)reg1, vrB, 6);
1844 vec_getHalf((NativeReg16)reg2, vrA, 6);
1845 asmALU(X86_XCHG, RL2RH(reg1), reg2);
1846 vec_setHalf(vrD, 12, (NativeReg16)reg1);
1847 vec_setHalf(vrD, 14, (NativeReg16)reg2);
1848
1849 vec_getHalf((NativeReg16)reg1, vrB, 4);
1850 vec_getHalf((NativeReg16)reg2, vrA, 4);
1851 asmALU(X86_XCHG, RL2RH(reg1), reg2);
1852 vec_setHalf(vrD, 8, (NativeReg16)reg1);
1853 vec_setHalf(vrD, 10, (NativeReg16)reg2);
1854
1855 vec_getHalf((NativeReg16)reg1, vrB, 2);
1856 vec_getHalf((NativeReg16)reg2, vrA, 2);
1857 asmALU(X86_XCHG, RL2RH(reg1), reg2);
1858 vec_setHalf(vrD, 4, (NativeReg16)reg1);
1859 vec_setHalf(vrD, 6, (NativeReg16)reg2);
1860
1861 vec_getHalf((NativeReg16)reg1, vrB, 0);
1862 vec_getHalf((NativeReg16)reg2, vrA, 0);
1863 asmALU(X86_XCHG, RL2RH(reg1), reg2);
1864 vec_setHalf(vrD, 0, (NativeReg16)reg1);
1865 vec_setHalf(vrD, 2, (NativeReg16)reg2);
1866
1867 return flowContinue;
1868 #endif
1869 }
1870
1871 /* vmrglh Vector Merge Low Half Word
1872 * v.199
1873 */
1874 void ppc_opc_vmrglh()
1875 {
1876 VECTOR_DEBUG;
1877 int vrD, vrA, vrB;
1878 Vector_t r;
1879 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1880
1881 VECT_H(r, 0) = VECT_H(gCPU.vr[vrA], 4);
1882 VECT_H(r, 1) = VECT_H(gCPU.vr[vrB], 4);
1883 VECT_H(r, 2) = VECT_H(gCPU.vr[vrA], 5);
1884 VECT_H(r, 3) = VECT_H(gCPU.vr[vrB], 5);
1885 VECT_H(r, 4) = VECT_H(gCPU.vr[vrA], 6);
1886 VECT_H(r, 5) = VECT_H(gCPU.vr[vrB], 6);
1887 VECT_H(r, 6) = VECT_H(gCPU.vr[vrA], 7);
1888 VECT_H(r, 7) = VECT_H(gCPU.vr[vrB], 7);
1889
1890 gCPU.vr[vrD] = r;
1891 }
1892 JITCFlow ppc_opc_gen_vmrglh()
1893 {
1894 #if 0
1895 ppc_opc_gen_interpret(ppc_opc_vmrglh);
1896 return flowEndBlock;
1897 #else
1898 int vrD, vrA, vrB;
1899 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1900
1901 if (SSE2_AVAIL) {
1902 noncommutative_operation(PALUW(X86_PUNPCKL), vrD, vrB, vrA);
1903 return flowContinue;
1904 }
1905
1906 jitcFlushClientVectorRegister(vrA);
1907 jitcFlushClientVectorRegister(vrB);
1908 jitcDropClientVectorRegister(vrD);
1909
1910 NativeReg16 reg1 = (NativeReg16)jitcAllocRegister();
1911
1912 vec_getHalf(reg1, vrA, 6);
1913 vec_setHalf(vrD, 14, reg1);
1914 vec_getHalf(reg1, vrA, 4);
1915 vec_setHalf(vrD, 10, reg1);
1916
1917 vec_getHalf(reg1, vrB, 6);
1918 vec_setHalf(vrD, 12, reg1);
1919 vec_getHalf(reg1, vrB, 4);
1920 vec_setHalf(vrD, 8, reg1);
1921
1922 vec_getHalf(reg1, vrA, 2);
1923 vec_setHalf(vrD, 6, reg1);
1924
1925 vec_getHalf(reg1, vrB, 2);
1926 vec_setHalf(vrD, 4, reg1);
1927
1928 vec_getHalf(reg1, vrA, 0);
1929 vec_setHalf(vrD, 2, reg1);
1930
1931 if (vrD != vrB) {
1932 vec_getHalf(reg1, vrB, 0);
1933 vec_setHalf(vrD, 0, reg1);
1934 }
1935
1936 return flowContinue;
1937 #endif
1938 }
1939
1940 /* vmrglw Vector Merge Low Word
1941 * v.200
1942 */
1943 void ppc_opc_vmrglw()
1944 {
1945 VECTOR_DEBUG;
1946 int vrD, vrA, vrB;
1947 Vector_t r;
1948 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
1949
1950 VECT_W(r, 0) = VECT_W(gCPU.vr[vrA], 2);
1951 VECT_W(r, 1) = VECT_W(gCPU.vr[vrB], 2);
1952 VECT_W(r, 2) = VECT_W(gCPU.vr[vrA], 3);
1953 VECT_W(r, 3) = VECT_W(gCPU.vr[vrB], 3);
1954
1955 gCPU.vr[vrD] = r;
1956 }
1957 JITCFlow ppc_opc_gen_vmrglw()
1958 {
1959 #if 0
1960 ppc_opc_gen_interpret(ppc_opc_vmrglw);
1961 return flowEndBlock;
1962 #else
1963 int vrD, vrA, vrB;
1964 modrm_o modrm;
1965 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
1966
1967 if (SSE2_AVAIL) {
1968 noncommutative_operation(PALUD(X86_PUNPCKL), vrD, vrB, vrA);
1969 return flowContinue;
1970 }
1971
1972 if (SSE_AVAIL) {
1973 NativeVectorReg dest, src;
1974 int shuf_map = 0x72;
1975 int vrS = vrB;
1976
1977 if (vrA == vrD) {
1978 dest = jitcGetClientVectorRegisterDirty(vrA);
1979 src = jitcGetClientVectorRegisterMapping(vrB);
1980 } else if (vrB == vrD) {
1981 dest = jitcGetClientVectorRegisterDirty(vrB);
1982 src = jitcGetClientVectorRegisterMapping(vrA);
1983 shuf_map = 0xd8;
1984 vrS = vrA;
1985 } else {
1986 dest = jitcGetClientVectorRegister(vrA);
1987 jitcRenameVectorRegisterDirty(dest, vrD);
1988
1989 src = jitcGetClientVectorRegisterMapping(vrB);
1990 }
1991
1992 if (src == VECTREG_NO) {
1993 asmSHUFPS(dest, x86_mem2(modrm, &gCPU.vr[vrS]), 0x44);
1994 } else {
1995 asmSHUFPS(dest, src, 0x44);
1996 }
1997
1998 asmSHUFPS(dest, dest, shuf_map);
1999
2000 return flowContinue;
2001 }
2002
2003 NativeReg reg = jitcAllocRegister();
2004
2005 jitcFlushClientVectorRegister(vrA);
2006 jitcFlushClientVectorRegister(vrB);
2007 jitcDropClientVectorRegister(vrD);
2008
2009 vec_getWord(reg, vrA, 4);
2010 vec_setWord(vrD, 12, reg);
2011
2012 vec_getWord(reg, vrB, 4);
2013 vec_setWord(vrD, 8, reg);
2014
2015 vec_getWord(reg, vrA, 0);
2016 vec_setWord(vrD, 4, reg);
2017
2018 if (vrB != vrD) {
2019 vec_getWord(reg, vrB, 0);
2020 vec_setWord(vrD, 0, reg);
2021 }
2022
2023 return flowContinue;
2024 #endif
2025 }
2026
2027 /* vspltb Vector Splat Byte
2028 * v.245
2029 */
2030 void ppc_opc_vspltb()
2031 {
2032 VECTOR_DEBUG;
2033 int vrD, vrB;
2034 uint32 uimm;
2035 uint64 val;
2036 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, uimm, vrB);
2037
2038 /* The documentation doesn't stipulate what a value higher than 0xf
2039 * will do. Thus, this is by default an undefined value. We
2040 * are thus doing this the fastest way that won't crash us.
2041 */
2042 val = VECT_B(gCPU.vr[vrB], uimm & 0xf);
2043 val |= (val << 8);
2044 val |= (val << 16);
2045 val |= (val << 32);
2046
2047 gCPU.vr[vrD].d[0] = val;
2048 gCPU.vr[vrD].d[1] = val;
2049 }
2050 JITCFlow ppc_opc_gen_vspltb()
2051 {
2052 #if 0
2053 ppc_opc_gen_interpret(ppc_opc_vspltb);
2054 return flowEndBlock;
2055 #else
2056 int vrD, vrB;
2057 uint32 uimm;
2058 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, uimm, vrB);
2059
2060 jitcFlushClientVectorRegister(vrB);
2061 jitcDropClientVectorRegister(vrD);
2062
2063 jitcClobberCarryAndFlags();
2064 NativeReg reg1 = jitcAllocRegister(NATIVE_REG_8);
2065 NativeReg reg2 = jitcAllocRegister();
2066
2067 /* The documentation doesn't stipulate what a value higher than 0xf
2068 * will do. Thus, this is by default an undefined value. We
2069 * are thus doing this the fastest way that won't crash us.
2070 */
2071 vec_getByteZ(reg1, vrB, (15 - (uimm & 0xf)));
2072 asmALU(X86_MOV, RL2RH(reg1), (NativeReg8)reg1);
2073
2074 asmALU(X86_MOV, reg2, reg1);
2075 asmShift(X86_SHL, reg2, 16);
2076 asmALU(X86_OR, reg1, reg2);
2077
2078 vec_setWord(vrD, 0, reg1);
2079 vec_setWord(vrD, 4, reg1);
2080 vec_setWord(vrD, 8, reg1);
2081 vec_setWord(vrD, 12, reg1);
2082
2083 return flowContinue;
2084 #endif
2085 }
2086
2087 /* vsplth Vector Splat Half Word
2088 * v.246
2089 */
2090 void ppc_opc_vsplth()
2091 {
2092 VECTOR_DEBUG;
2093 int vrD, vrB;
2094 uint32 uimm;
2095 uint64 val;
2096 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, uimm, vrB);
2097
2098 /* The documentation doesn't stipulate what a value higher than 0x7
2099 * will do. Thus, this is by default an undefined value. We
2100 * are thus doing this the fastest way that won't crash us.
2101 */
2102 val = VECT_H(gCPU.vr[vrB], uimm & 0x7);
2103 val |= (val << 16);
2104 val |= (val << 32);
2105
2106 gCPU.vr[vrD].d[0] = val;
2107 gCPU.vr[vrD].d[1] = val;
2108 }
2109 JITCFlow ppc_opc_gen_vsplth()
2110 {
2111 #if 0
2112 ppc_opc_gen_interpret(ppc_opc_vsplth);
2113 return flowEndBlock;
2114 #else
2115 int vrD, vrB;
2116 uint32 uimm;
2117 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, uimm, vrB);
2118
2119 jitcFlushClientVectorRegister(vrB);
2120 jitcDropClientVectorRegister(vrD);
2121
2122 jitcClobberCarryAndFlags();
2123 NativeReg reg1 = jitcAllocRegister();
2124 NativeReg reg2 = jitcAllocRegister();
2125
2126 /* The documentation doesn't stipulate what a value higher than 0x7
2127 * will do. Thus, this is by default an undefined value. We
2128 * are thus doing this the fastest way that won't crash us.
2129 */
2130 vec_getHalfZ(reg1, vrB, (7 - (uimm & 0x7)) << 1);
2131
2132 asmALU(X86_MOV, reg2, reg1);
2133 asmShift(X86_SHL, reg2, 16);
2134 asmALU(X86_OR, reg1, reg2);
2135
2136 vec_setWord(vrD, 0, reg1);
2137 vec_setWord(vrD, 4, reg1);
2138 vec_setWord(vrD, 8, reg1);
2139 vec_setWord(vrD, 12, reg1);
2140
2141 return flowContinue;
2142 #endif
2143 }
2144
2145 /* vspltw Vector Splat Word
2146 * v.250
2147 */
2148 void ppc_opc_vspltw()
2149 {
2150 VECTOR_DEBUG;
2151 int vrD, vrB;
2152 uint32 uimm;
2153 uint64 val;
2154 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, uimm, vrB);
2155
2156 /* The documentation doesn't stipulate what a value higher than 0x3
2157 * will do. Thus, this is by default an undefined value. We
2158 * are thus doing this the fastest way that won't crash us.
2159 */
2160 val = VECT_W(gCPU.vr[vrB], uimm & 0x3);
2161 val |= (val << 32);
2162
2163 gCPU.vr[vrD].d[0] = val;
2164 gCPU.vr[vrD].d[1] = val;
2165 }
2166 JITCFlow ppc_opc_gen_vspltw()
2167 {
2168 #if 0
2169 ppc_opc_gen_interpret(ppc_opc_vspltw);
2170 return flowEndBlock;
2171 #else
2172 int vrD, vrB;
2173 uint32 uimm;
2174 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, uimm, vrB);
2175
2176 if (SSE_AVAIL) {
2177 NativeVectorReg reg;
2178
2179 if (vrD == vrB) {
2180 reg = jitcGetClientVectorRegisterDirty(vrD);
2181 } else {
2182 reg = jitcMapClientVectorRegisterDirty(vrD);
2183 NativeVectorReg reg2 = jitcGetClientVectorRegisterMapping(vrB);
2184
2185 if (reg2 == VECTREG_NO)
2186 asmMOVAPS(reg, &gCPU.vr[vrB]);
2187 else
2188 asmALUPS(X86_MOVAPS, reg, reg2);
2189 }
2190
2191 asmSHUFPS(reg, reg, 0xff - ((uimm & 0x3) * 0x55));
2192
2193 return flowContinue;
2194 }
2195
2196 jitcFlushClientVectorRegister(vrB);
2197 jitcDropClientVectorRegister(vrD);
2198
2199 jitcClobberCarryAndFlags();
2200 NativeReg reg1 = jitcAllocRegister();
2201
2202 /* The documentation doesn't stipulate what a value higher than 0xf
2203 * will do. Thus, this is by default an undefined value. We
2204 * are thus doing this the fastest way that won't crash us.
2205 */
2206 vec_getWord(reg1, vrB, (3 - (uimm & 0x3)) << 2);
2207
2208 vec_setWord(vrD, 0, reg1);
2209 vec_setWord(vrD, 4, reg1);
2210 vec_setWord(vrD, 8, reg1);
2211 vec_setWord(vrD, 12, reg1);
2212
2213 return flowContinue;
2214 #endif
2215 }
2216
2217 /* vspltisb Vector Splat Immediate Signed Byte
2218 * v.247
2219 */
2220 void ppc_opc_vspltisb()
2221 {
2222 VECTOR_DEBUG_COMMON;
2223 int vrD, vrB;
2224 uint32 simm;
2225 uint64 val;
2226 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, simm, vrB);
2227 PPC_OPC_ASSERT(vrB==0);
2228
2229 val = (simm & 0x10) ? (simm | 0xE0) : simm;
2230 val |= (val << 8);
2231 val |= (val << 16);
2232 val |= (val << 32);
2233
2234 gCPU.vr[vrD].d[0] = val;
2235 gCPU.vr[vrD].d[1] = val;
2236 }
2237 JITCFlow ppc_opc_gen_vspltisb()
2238 {
2239 #if 0
2240 ppc_opc_gen_interpret(ppc_opc_vspltisb);
2241 return flowEndBlock;
2242 #else
2243 int vrD, vrB;
2244 uint32 simm, val;
2245 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, simm, vrB);
2246
2247 if (simm == 0) {
2248 vec_Zero(vrD);
2249 return flowContinue;
2250 } else if (simm == 0x1f) {
2251 vec_Neg1(vrD);
2252 return flowContinue;
2253 }
2254
2255 val = (simm & 0x10) ? (simm | 0xE0) : simm;
2256 val |= (val << 8);
2257 val |= (val << 16);
2258
2259 if (SSE_AVAIL) {
2260 modrm_o modrm;
2261 NativeVectorReg reg1 = jitcMapClientVectorRegisterDirty(vrD);
2262
2263 asmALU_D(X86_MOV, x86_mem2(modrm, &gCPU.vtemp), val);
2264 asmMOVSS(reg1, &gCPU.vtemp);
2265 asmSHUFPS(reg1, reg1, 0x00);
2266
2267 return flowContinue;
2268 }
2269
2270 jitcDropClientVectorRegister(vrD);
2271
2272 NativeReg r1 = jitcAllocRegister();
2273 asmALU(X86_MOV, r1, val);
2274
2275 vec_setWord(vrD, 0, r1);
2276 vec_setWord(vrD, 4, r1);
2277 vec_setWord(vrD, 8, r1);
2278 vec_setWord(vrD,12, r1);
2279
2280 return flowContinue;
2281 #endif
2282 }
2283
2284 /* vspltish Vector Splat Immediate Signed Half Word
2285 * v.248
2286 */
2287 void ppc_opc_vspltish()
2288 {
2289 VECTOR_DEBUG_COMMON;
2290 int vrD, vrB;
2291 uint32 simm;
2292 uint64 val;
2293 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, simm, vrB);
2294 PPC_OPC_ASSERT(vrB==0);
2295
2296 val = (simm & 0x10) ? (simm | 0xFFE0) : simm;
2297 val |= (val << 16);
2298 val |= (val << 32);
2299
2300 gCPU.vr[vrD].d[0] = val;
2301 gCPU.vr[vrD].d[1] = val;
2302 }
2303 JITCFlow ppc_opc_gen_vspltish()
2304 {
2305 #if 0
2306 ppc_opc_gen_interpret(ppc_opc_vspltish);
2307 return flowEndBlock;
2308 #else
2309 int vrD, vrB;
2310 uint32 simm, val;
2311 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, simm, vrB);
2312
2313 if (simm == 0) {
2314 vec_Zero(vrD);
2315 return flowContinue;
2316 } else if (simm == 0x1f) {
2317 vec_Neg1(vrD);
2318 return flowContinue;
2319 }
2320
2321 val = (simm & 0x10) ? (simm | 0xFFE0) : simm;
2322 val |= (val << 16);
2323
2324 if (SSE_AVAIL) {
2325 modrm_o modrm;
2326 NativeVectorReg reg1 = jitcMapClientVectorRegisterDirty(vrD);
2327
2328 asmALU_D(X86_MOV, x86_mem2(modrm, &gCPU.vtemp), val);
2329 asmMOVSS(reg1, &gCPU.vtemp);
2330 asmSHUFPS(reg1, reg1, 0x00);
2331
2332 return flowContinue;
2333 }
2334
2335 jitcDropClientVectorRegister(vrD);
2336
2337 NativeReg r1 = jitcAllocRegister();
2338 asmALU(X86_MOV, r1, val);
2339
2340 vec_setWord(vrD, 0, r1);
2341 vec_setWord(vrD, 4, r1);
2342 vec_setWord(vrD, 8, r1);
2343 vec_setWord(vrD,12, r1);
2344
2345 return flowContinue;
2346 #endif
2347 }
2348
2349 /* vspltisw Vector Splat Immediate Signed Word
2350 * v.249
2351 */
2352 void ppc_opc_vspltisw()
2353 {
2354 VECTOR_DEBUG_COMMON;
2355 int vrD, vrB;
2356 uint32 simm;
2357 uint64 val;
2358 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, simm, vrB);
2359 PPC_OPC_ASSERT(vrB==0);
2360
2361 val = (simm & 0x10) ? (simm | 0xFFFFFFE0) : simm;
2362 val |= (val << 32);
2363
2364 gCPU.vr[vrD].d[0] = val;
2365 gCPU.vr[vrD].d[1] = val;
2366 }
2367 JITCFlow ppc_opc_gen_vspltisw()
2368 {
2369 #if 0
2370 ppc_opc_gen_interpret(ppc_opc_vspltisw);
2371 return flowEndBlock;
2372 #else
2373 int vrD, vrB;
2374 uint32 simm, val;
2375 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, simm, vrB);
2376
2377 if (simm == 0) {
2378 vec_Zero(vrD);
2379 return flowContinue;
2380 } else if (simm == 0x1f) {
2381 vec_Neg1(vrD);
2382 return flowContinue;
2383 }
2384
2385 val = (simm & 0x10) ? (simm | 0xFFFFFFE0) : simm;
2386
2387 if (SSE_AVAIL) {
2388 modrm_o modrm;
2389 NativeVectorReg reg1 = jitcMapClientVectorRegisterDirty(vrD);
2390
2391 asmALU_D(X86_MOV, x86_mem2(modrm, &gCPU.vtemp), val);
2392 asmMOVSS(reg1, &gCPU.vtemp);
2393 asmSHUFPS(reg1, reg1, 0x00);
2394
2395 return flowContinue;
2396 }
2397
2398 jitcDropClientVectorRegister(vrD);
2399
2400 NativeReg r1 = jitcAllocRegister();
2401 asmALU(X86_MOV, r1, val);
2402
2403 vec_setWord(vrD, 0, r1);
2404 vec_setWord(vrD, 4, r1);
2405 vec_setWord(vrD, 8, r1);
2406 vec_setWord(vrD,12, r1);
2407
2408 return flowContinue;
2409 #endif
2410 }
2411
2412 /* mfvscr Move from Vector Status and Control Register
2413 * v.129
2414 */
2415 void ppc_opc_mfvscr()
2416 {
2417 VECTOR_DEBUG_COMMON;
2418 int vrD, vrA, vrB;
2419 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2420 PPC_OPC_ASSERT(vrA==0);
2421 PPC_OPC_ASSERT(vrB==0);
2422
2423 VECT_W(gCPU.vr[vrD], 3) = gCPU.vscr;
2424 VECT_W(gCPU.vr[vrD], 2) = 0;
2425 VECT_D(gCPU.vr[vrD], 0) = 0;
2426 }
2427 JITCFlow ppc_opc_gen_mfvscr()
2428 {
2429 #if 0
2430 ppc_opc_gen_interpret(ppc_opc_mfvscr);
2431 return flowEndBlock;
2432 #else
2433 int vrD, vrA, vrB;
2434 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
2435
2436 if (SSE_AVAIL) {
2437 NativeVectorReg reg1 = jitcMapClientVectorRegisterDirty(vrD);
2438 NativeReg reg2 = jitcGetClientRegisterMapping(PPC_VSCR);
2439
2440 if (reg1 != VECTREG_NO)
2441 jitcFlushRegister(NATIVE_REG | reg2);
2442
2443 asmMOVSS(reg1, &gCPU.vscr);
2444 } else {
2445 jitcDropClientVectorRegister(vrD);
2446
2447 NativeReg r1 = jitcGetClientRegister(PPC_VSCR);
2448 vec_setWord(vrD, 0, r1);
2449
2450 jitcClobberRegister(NATIVE_REG | r1);
2451
2452 asmMOV_NoFlags(r1, 0); // avoid flag clobber
2453 vec_setWord(vrD, 4, r1);
2454 vec_setWord(vrD, 8, r1);
2455 vec_setWord(vrD,12, r1);
2456 }
2457
2458 return flowContinue;
2459 #endif
2460 }
2461
2462 /* mtvscr Move to Vector Status and Control Register
2463 * v.130
2464 */
2465 void ppc_opc_mtvscr()
2466 {
2467 VECTOR_DEBUG_COMMON;
2468 int vrD, vrA, vrB;
2469 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2470 PPC_OPC_ASSERT(vrA==0);
2471 PPC_OPC_ASSERT(vrD==0);
2472
2473 gCPU.vscr = VECT_W(gCPU.vr[vrB], 3);
2474 }
2475 JITCFlow ppc_opc_gen_mtvscr()
2476 {
2477 #if 0
2478 ppc_opc_gen_interpret(ppc_opc_mtvscr);
2479 return flowEndBlock;
2480 #else
2481 int vrD, vrA, vrB;
2482 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
2483
2484 if (SSE_AVAIL) {
2485 NativeVectorReg reg1 = jitcGetClientVectorRegisterMapping(vrB);
2486
2487 if (reg1 != VECTREG_NO) {
2488 NativeReg reg2 = jitcGetClientRegisterMapping(PPC_VSCR);
2489
2490 if (reg2 != REG_NO)
2491 jitcClobberRegister(NATIVE_REG | reg2);
2492
2493 asmMOVSS(&gCPU.vscr, reg1);
2494 return flowContinue;
2495 }
2496 }
2497
2498 jitcFlushClientVectorRegister(vrB);
2499
2500 NativeReg r1 = jitcMapClientRegisterDirty(PPC_VSCR);
2501
2502 vec_getWord(r1, vrB, 0);
2503 return flowContinue;
2504 #endif
2505 }
2506
2507 /* vpkuhum Vector Pack Unsigned Half Word Unsigned Modulo
2508 * v.224
2509 */
2510 void ppc_opc_vpkuhum()
2511 {
2512 VECTOR_DEBUG;
2513 int vrD, vrA, vrB;
2514 Vector_t r;
2515 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2516
2517 VECT_B(r, 0) = VECT_B(gCPU.vr[vrA], 1);
2518 VECT_B(r, 1) = VECT_B(gCPU.vr[vrA], 3);
2519 VECT_B(r, 2) = VECT_B(gCPU.vr[vrA], 5);
2520 VECT_B(r, 3) = VECT_B(gCPU.vr[vrA], 7);
2521 VECT_B(r, 4) = VECT_B(gCPU.vr[vrA], 9);
2522 VECT_B(r, 5) = VECT_B(gCPU.vr[vrA],11);
2523 VECT_B(r, 6) = VECT_B(gCPU.vr[vrA],13);
2524 VECT_B(r, 7) = VECT_B(gCPU.vr[vrA],15);
2525
2526 VECT_B(r, 8) = VECT_B(gCPU.vr[vrB], 1);
2527 VECT_B(r, 9) = VECT_B(gCPU.vr[vrB], 3);
2528 VECT_B(r,10) = VECT_B(gCPU.vr[vrB], 5);
2529 VECT_B(r,11) = VECT_B(gCPU.vr[vrB], 7);
2530 VECT_B(r,12) = VECT_B(gCPU.vr[vrB], 9);
2531 VECT_B(r,13) = VECT_B(gCPU.vr[vrB],11);
2532 VECT_B(r,14) = VECT_B(gCPU.vr[vrB],13);
2533 VECT_B(r,15) = VECT_B(gCPU.vr[vrB],15);
2534
2535 gCPU.vr[vrD] = r;
2536 }
2537 JITCFlow ppc_opc_gen_vpkuhum()
2538 {
2539 #if 0
2540 ppc_opc_gen_interpret(ppc_opc_vpkuhum);
2541 return flowEndBlock;
2542 #else
2543 int vrD, vrA, vrB;
2544 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
2545
2546 jitcFlushClientVectorRegister(vrA);
2547 jitcFlushClientVectorRegister(vrB);
2548 jitcDropClientVectorRegister(vrD);
2549
2550 jitcClobberCarryAndFlags();
2551 NativeReg8 reg1 = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
2552
2553 if (vrB == vrD && vrA == vrD) {
2554 for (int i=0; i<16; i += 2) {
2555 vec_getHalf((NativeReg16)reg1, vrA, i);
2556
2557 vec_setByte(vrT, REG_NO, (i >> 1), reg1);
2558 vec_setByte(vrT, REG_NO, 8+(i >> 1), reg1);
2559 }
2560
2561 vec_Copy(vrD, vrT);
2562 } else if (vrA == vrD) {
2563 for (int i=0; i<16; i += 2) {
2564 vec_getHalf((NativeReg16)reg1, vrA, 14 - i);
2565 vec_setByte(vrD, REG_NO, 15 - (i >> 1), reg1);
2566 }
2567
2568 for (int i=0; i<16; i += 2) {
2569 vec_getHalf((NativeReg16)reg1, vrB, i);
2570 vec_setByte(vrD, REG_NO, (i >> 1), reg1);
2571 }
2572 } else {
2573 for (int i=0; i<16; i += 2) {
2574 vec_getHalf((NativeReg16)reg1, vrB, i);
2575 vec_setByte(vrD, REG_NO, (i >> 1), reg1);
2576 }
2577
2578 for (int i=0; i<16; i += 2) {
2579 vec_getHalf((NativeReg16)reg1, vrA, i);
2580 vec_setByte(vrD, REG_NO, 8 + (i >> 1), reg1);
2581 }
2582 }
2583
2584 return flowContinue;
2585 #endif
2586 }
2587
2588 /* vpkuwum Vector Pack Unsigned Word Unsigned Modulo
2589 * v.226
2590 */
2591 void ppc_opc_vpkuwum()
2592 {
2593 VECTOR_DEBUG;
2594 int vrD, vrA, vrB;
2595 Vector_t r;
2596 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2597
2598 VECT_H(r, 0) = VECT_H(gCPU.vr[vrA], 1);
2599 VECT_H(r, 1) = VECT_H(gCPU.vr[vrA], 3);
2600 VECT_H(r, 2) = VECT_H(gCPU.vr[vrA], 5);
2601 VECT_H(r, 3) = VECT_H(gCPU.vr[vrA], 7);
2602
2603 VECT_H(r, 4) = VECT_H(gCPU.vr[vrB], 1);
2604 VECT_H(r, 5) = VECT_H(gCPU.vr[vrB], 3);
2605 VECT_H(r, 6) = VECT_H(gCPU.vr[vrB], 5);
2606 VECT_H(r, 7) = VECT_H(gCPU.vr[vrB], 7);
2607
2608 gCPU.vr[vrD] = r;
2609 }
2610 JITCFlow ppc_opc_gen_vpkuwum()
2611 {
2612 #if 0
2613 ppc_opc_gen_interpret(ppc_opc_vpkuwum);
2614 return flowEndBlock;
2615 #else
2616 int vrD, vrA, vrB;
2617 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
2618
2619 jitcFlushClientVectorRegister(vrA);
2620 jitcFlushClientVectorRegister(vrB);
2621 jitcDropClientVectorRegister(vrD);
2622
2623 jitcClobberCarryAndFlags();
2624 NativeReg16 reg1 = (NativeReg16)jitcAllocRegister();
2625
2626 if (vrB == vrD && vrA == vrD) {
2627 for (int i=0; i<16; i += 4) {
2628 vec_getWord((NativeReg)reg1, vrA, i);
2629
2630 vec_setHalf(vrT, (i >> 1), reg1);
2631 vec_setHalf(vrT, 8+(i >> 1), reg1);
2632 }
2633
2634 vec_Copy(vrD, vrT);
2635 } else if (vrA == vrD) {
2636 for (int i=0; i<16; i += 4) {
2637 vec_getWord((NativeReg)reg1, vrA, 12 - i);
2638 vec_setHalf(vrD, 14 - (i >> 1), reg1);
2639 }
2640
2641 for (int i=0; i<16; i += 4) {
2642 vec_getWord((NativeReg)reg1, vrB, i);
2643 vec_setHalf(vrD, (i >> 1), reg1);
2644 }
2645 } else {
2646 for (int i=0; i<16; i += 4) {
2647 vec_getWord((NativeReg)reg1, vrB, i);
2648 vec_setHalf(vrD, (i >> 1), reg1);
2649 }
2650
2651 for (int i=0; i<16; i += 4) {
2652 vec_getWord((NativeReg)reg1, vrA, i);
2653 vec_setHalf(vrD, 8 + (i >> 1), reg1);
2654 }
2655 }
2656
2657 return flowContinue;
2658 #endif
2659 }
2660
2661 static inline void ppc_opc_gen_helper_vpkpx(NativeReg reg1, NativeReg reg2, NativeReg reg3)
2662 {
2663 asmALU(X86_MOV, reg2, reg1);
2664 asmALU(X86_MOV, reg3, reg1);
2665
2666 asmALU(X86_AND, reg1, 0x000000f8);
2667 asmALU(X86_AND, reg2, 0x0000f800);
2668 asmALU(X86_AND, reg3, 0x01f80000);
2669
2670 asmShift(X86_SHR, reg1, 3);
2671 asmShift(X86_SHR, reg2, 6);
2672 asmShift(X86_SHR, reg3, 9);
2673
2674 asmALU(X86_OR, reg1, reg2);
2675 asmALU(X86_OR, reg1, reg3);
2676 }
2677
2678 /* vpkpx Vector Pack Pixel32
2679 * v.219
2680 */
2681 void ppc_opc_vpkpx()
2682 {
2683 VECTOR_DEBUG;
2684 int vrD, vrA, vrB;
2685 Vector_t r;
2686 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2687
2688 VECT_H(r, 0) = PACK_PIXEL(VECT_W(gCPU.vr[vrA], 0));
2689 VECT_H(r, 1) = PACK_PIXEL(VECT_W(gCPU.vr[vrA], 1));
2690 VECT_H(r, 2) = PACK_PIXEL(VECT_W(gCPU.vr[vrA], 2));
2691 VECT_H(r, 3) = PACK_PIXEL(VECT_W(gCPU.vr[vrA], 3));
2692
2693 VECT_H(r, 4) = PACK_PIXEL(VECT_W(gCPU.vr[vrB], 0));
2694 VECT_H(r, 5) = PACK_PIXEL(VECT_W(gCPU.vr[vrB], 1));
2695 VECT_H(r, 6) = PACK_PIXEL(VECT_W(gCPU.vr[vrB], 2));
2696 VECT_H(r, 7) = PACK_PIXEL(VECT_W(gCPU.vr[vrB], 3));
2697
2698 gCPU.vr[vrD] = r;
2699 }
2700 JITCFlow ppc_opc_gen_vpkpx()
2701 {
2702 #if 0
2703 ppc_opc_gen_interpret(ppc_opc_vpkpx);
2704 return flowEndBlock;
2705 #else
2706 int vrD, vrA, vrB;
2707 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
2708
2709 jitcFlushClientVectorRegister(vrA);
2710 jitcFlushClientVectorRegister(vrB);
2711 jitcDropClientVectorRegister(vrD);
2712
2713 jitcClobberCarryAndFlags();
2714 NativeReg reg1 = jitcAllocRegister();
2715 NativeReg reg2 = jitcAllocRegister();
2716 NativeReg reg3 = jitcAllocRegister();
2717
2718 if((vrA == vrB)) {
2719 for (int i=0; i<8; i += 2) {
2720 vec_getWord(reg1, vrB, i<<1);
2721 ppc_opc_gen_helper_vpkpx(reg1, reg2, reg3);
2722 vec_setHalf(vrD, i, (NativeReg16)reg1);
2723
2724 if (vrD != vrA)
2725 vec_setHalf(vrD, i+8, (NativeReg16)reg1);
2726 }
2727
2728 if (vrD == vrB) {
2729 vec_getWord(reg1, vrD, 0);
2730 vec_getWord(reg2, vrD, 4);
2731
2732 vec_setWord(vrD, 8, reg1);
2733 vec_setWord(vrD,12, reg2);
2734 }
2735 } else if (vrB == vrD) {
2736 for (int i=0; i<8; i += 2) {
2737 vec_getWord(reg1, vrB, i<<1);
2738 ppc_opc_gen_helper_vpkpx(reg1, reg2, reg3);
2739 vec_setHalf(vrD, i, (NativeReg16)reg1);
2740 }
2741
2742 for (int i=0; i<8; i += 2) {
2743 vec_getWord(reg1, vrA, i<<1);
2744 ppc_opc_gen_helper_vpkpx(reg1, reg2, reg3);
2745 vec_setHalf(vrD, i+8, (NativeReg16)reg1);
2746 }
2747 } else {
2748 for (int i=0; i<8; i += 2) {
2749 vec_getWord(reg1, vrA, 12-(i<<1));
2750 ppc_opc_gen_helper_vpkpx(reg1, reg2, reg3);
2751 vec_setHalf(vrD, 14-i, (NativeReg16)reg1);
2752 }
2753
2754 for (int i=0; i<8; i += 2) {
2755 vec_getWord(reg1, vrB, i<<1);
2756 ppc_opc_gen_helper_vpkpx(reg1, reg2, reg3);
2757 vec_setHalf(vrD, i, (NativeReg16)reg1);
2758 }
2759 }
2760
2761 return flowContinue;
2762 #endif
2763 }
2764
2765
2766 /* vpkuhus Vector Pack Unsigned Half Word Unsigned Saturate
2767 * v.225
2768 */
2769 void ppc_opc_vpkuhus()
2770 {
2771 VECTOR_DEBUG;
2772 int vrD, vrA, vrB;
2773 Vector_t r;
2774 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2775
2776 VECT_B(r, 0) = SATURATE_UB(VECT_H(gCPU.vr[vrA], 0));
2777 VECT_B(r, 1) = SATURATE_UB(VECT_H(gCPU.vr[vrA], 1));
2778 VECT_B(r, 2) = SATURATE_UB(VECT_H(gCPU.vr[vrA], 2));
2779 VECT_B(r, 3) = SATURATE_UB(VECT_H(gCPU.vr[vrA], 3));
2780 VECT_B(r, 4) = SATURATE_UB(VECT_H(gCPU.vr[vrA], 4));
2781 VECT_B(r, 5) = SATURATE_UB(VECT_H(gCPU.vr[vrA], 5));
2782 VECT_B(r, 6) = SATURATE_UB(VECT_H(gCPU.vr[vrA], 6));
2783 VECT_B(r, 7) = SATURATE_UB(VECT_H(gCPU.vr[vrA], 7));
2784
2785 VECT_B(r, 8) = SATURATE_UB(VECT_H(gCPU.vr[vrB], 0));
2786 VECT_B(r, 9) = SATURATE_UB(VECT_H(gCPU.vr[vrB], 1));
2787 VECT_B(r,10) = SATURATE_UB(VECT_H(gCPU.vr[vrB], 2));
2788 VECT_B(r,11) = SATURATE_UB(VECT_H(gCPU.vr[vrB], 3));
2789 VECT_B(r,12) = SATURATE_UB(VECT_H(gCPU.vr[vrB], 4));
2790 VECT_B(r,13) = SATURATE_UB(VECT_H(gCPU.vr[vrB], 5));
2791 VECT_B(r,14) = SATURATE_UB(VECT_H(gCPU.vr[vrB], 6));
2792 VECT_B(r,15) = SATURATE_UB(VECT_H(gCPU.vr[vrB], 7));
2793
2794 gCPU.vr[vrD] = r;
2795 }
2796 JITCFlow ppc_opc_gen_vpkuhus()
2797 {
2798 #if 0
2799 ppc_opc_gen_interpret(ppc_opc_vpkuhus);
2800 return flowEndBlock;
2801 #else
2802 int vrD, vrA, vrB;
2803 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
2804
2805 jitcFlushClientVectorRegister(vrA);
2806 jitcFlushClientVectorRegister(vrB);
2807 jitcDropClientVectorRegister(vrD);
2808
2809 jitcClobberCarryAndFlags();
2810 NativeReg8 reg1 = (NativeReg8)jitcAllocRegister(NATIVE_REG_8);
2811
2812 if (vrB == vrD && vrA == vrD) {
2813 for (int i=0; i<16; i += 2) {
2814 vec_getHalfZ((NativeReg)reg1, vrA, i);
2815
2816 vec_saturateUB(reg1);
2817
2818 vec_setByte(vrT, REG_NO, (i >> 1), reg1);
2819 vec_setByte(vrT, REG_NO, 8+(i >> 1), reg1);
2820 }
2821
2822 vec_Copy(vrD, vrT);
2823 } else if (vrA == vrD) {
2824 for (int i=0; i<16; i += 2) {
2825 vec_getHalfZ((NativeReg)reg1, vrA, 14 - i);
2826
2827 vec_saturateUB(reg1);
2828
2829 vec_setByte(vrD, REG_NO, 15 - (i >> 1), reg1);
2830 }
2831
2832 for (int i=0; i<16; i += 2) {
2833 vec_getHalfZ((NativeReg)reg1, vrB, i);
2834
2835 vec_saturateUB(reg1);
2836
2837 vec_setByte(vrD, REG_NO, (i >> 1), reg1);
2838 }
2839 } else {
2840 for (int i=0; i<16; i += 2) {
2841 vec_getHalfZ((NativeReg)reg1, vrB, i);
2842
2843 vec_saturateUB(reg1);
2844
2845 vec_setByte(vrD, REG_NO, (i >> 1), reg1);
2846 }
2847
2848 for (int i=0; i<16; i += 2) {
2849 vec_getHalfZ((NativeReg)reg1, vrA, i);
2850
2851 vec_saturateUB(reg1);
2852
2853 vec_setByte(vrD, REG_NO, 8 + (i >> 1), reg1);
2854 }
2855 }
2856
2857 return flowContinue;
2858 #endif
2859 }
2860
2861 /* vpkshss Vector Pack Signed Half Word Signed Saturate
2862 * v.220
2863 */
2864 void ppc_opc_vpkshss()
2865 {
2866 VECTOR_DEBUG;
2867 int vrD, vrA, vrB;
2868 Vector_t r;
2869 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2870
2871 VECT_B(r, 0) = SATURATE_SB(VECT_H(gCPU.vr[vrA], 0));
2872 VECT_B(r, 1) = SATURATE_SB(VECT_H(gCPU.vr[vrA], 1));
2873 VECT_B(r, 2) = SATURATE_SB(VECT_H(gCPU.vr[vrA], 2));
2874 VECT_B(r, 3) = SATURATE_SB(VECT_H(gCPU.vr[vrA], 3));
2875 VECT_B(r, 4) = SATURATE_SB(VECT_H(gCPU.vr[vrA], 4));
2876 VECT_B(r, 5) = SATURATE_SB(VECT_H(gCPU.vr[vrA], 5));
2877 VECT_B(r, 6) = SATURATE_SB(VECT_H(gCPU.vr[vrA], 6));
2878 VECT_B(r, 7) = SATURATE_SB(VECT_H(gCPU.vr[vrA], 7));
2879
2880 VECT_B(r, 8) = SATURATE_SB(VECT_H(gCPU.vr[vrB], 0));
2881 VECT_B(r, 9) = SATURATE_SB(VECT_H(gCPU.vr[vrB], 1));
2882 VECT_B(r,10) = SATURATE_SB(VECT_H(gCPU.vr[vrB], 2));
2883 VECT_B(r,11) = SATURATE_SB(VECT_H(gCPU.vr[vrB], 3));
2884 VECT_B(r,12) = SATURATE_SB(VECT_H(gCPU.vr[vrB], 4));
2885 VECT_B(r,13) = SATURATE_SB(VECT_H(gCPU.vr[vrB], 5));
2886 VECT_B(r,14) = SATURATE_SB(VECT_H(gCPU.vr[vrB], 6));
2887 VECT_B(r,15) = SATURATE_SB(VECT_H(gCPU.vr[vrB], 7));
2888
2889 gCPU.vr[vrD] = r;
2890 }
2891 JITCFlow ppc_opc_gen_vpkshss()
2892 {
2893 ppc_opc_gen_interpret(ppc_opc_vpkshss);
2894 return flowEndBlock;
2895 }
2896
2897 /* vpkuwus Vector Pack Unsigned Word Unsigned Saturate
2898 * v.227
2899 */
2900 void ppc_opc_vpkuwus()
2901 {
2902 VECTOR_DEBUG;
2903 int vrD, vrA, vrB;
2904 Vector_t r;
2905 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2906
2907 VECT_H(r, 0) = SATURATE_UH(VECT_W(gCPU.vr[vrA], 0));
2908 VECT_H(r, 1) = SATURATE_UH(VECT_W(gCPU.vr[vrA], 1));
2909 VECT_H(r, 2) = SATURATE_UH(VECT_W(gCPU.vr[vrA], 2));
2910 VECT_H(r, 3) = SATURATE_UH(VECT_W(gCPU.vr[vrA], 3));
2911
2912 VECT_H(r, 4) = SATURATE_UH(VECT_W(gCPU.vr[vrB], 0));
2913 VECT_H(r, 5) = SATURATE_UH(VECT_W(gCPU.vr[vrB], 1));
2914 VECT_H(r, 6) = SATURATE_UH(VECT_W(gCPU.vr[vrB], 2));
2915 VECT_H(r, 7) = SATURATE_UH(VECT_W(gCPU.vr[vrB], 3));
2916
2917 gCPU.vr[vrD] = r;
2918 }
2919 JITCFlow ppc_opc_gen_vpkuwus()
2920 {
2921 ppc_opc_gen_interpret(ppc_opc_vpkuwus);
2922 return flowEndBlock;
2923 }
2924
2925 /* vpkswss Vector Pack Signed Word Signed Saturate
2926 * v.222
2927 */
2928 void ppc_opc_vpkswss()
2929 {
2930 VECTOR_DEBUG;
2931 int vrD, vrA, vrB;
2932 Vector_t r;
2933 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
2934
2935 VECT_H(r, 0) = SATURATE_SH(VECT_W(gCPU.vr[vrA], 0));
2936 VECT_H(r, 1) = SATURATE_SH(VECT_W(gCPU.vr[vrA], 1));
2937 VECT_H(r, 2) = SATURATE_SH(VECT_W(gCPU.vr[vrA], 2));
2938 VECT_H(r, 3) = SATURATE_SH(VECT_W(gCPU.vr[vrA], 3));
2939
2940 VECT_H(r, 4) = SATURATE_SH(VECT_W(gCPU.vr[vrB], 0));
2941 VECT_H(r, 5) = SATURATE_SH(VECT_W(gCPU.vr[vrB], 1));
2942 VECT_H(r, 6) = SATURATE_SH(VECT_W(gCPU.vr[vrB], 2));
2943 VECT_H(r, 7) = SATURATE_SH(VECT_W(gCPU.vr[vrB], 3));
2944
2945 gCPU.vr[vrD] = r;
2946 }
2947 JITCFlow ppc_opc_gen_vpkswss()
2948 {
2949 #if 0
2950 ppc_opc_gen_interpret(ppc_opc_vpkswss);
2951 return flowEndBlock;
2952 #else
2953 int vrD, vrA, vrB;
2954 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
2955
2956 if (SSE2_AVAIL) {
2957 noncommutative_operation(X86_PACKSSDW, vrD, vrB, vrA);
2958 return flowContinue;
2959 }
2960
2961 jitcFlushClientVectorRegister(vrA);
2962 jitcFlushClientVectorRegister(vrB);
2963 jitcDropClientVectorRegister(vrD);
2964
2965 jitcClobberCarryAndFlags();
2966 NativeReg reg1 = jitcAllocRegister();
2967 NativeReg reg2 = jitcAllocRegister();
2968
2969 if (vrB == vrD && vrA == vrD) {
2970 for (int i=0; i<16; i += 4) {
2971 vec_getWord(reg1, vrA, i);
2972
2973 vec_saturateSH(reg1, reg2);
2974
2975 vec_setHalf(vrT, (i >> 1), (NativeReg16)reg1);
2976 vec_setHalf(vrT, 8+(i >> 1), (NativeReg16)reg1);
2977 }
2978
2979 vec_Copy(vrD, vrT);
2980 } else if (vrA == vrD) {
2981 for (int i=0; i<16; i += 4) {
2982 vec_getWord(reg1, vrA, 12 - i);
2983
2984 vec_saturateSH(reg1, reg2);
2985
2986 vec_setHalf(vrD, 14 - (i >> 1), (NativeReg16)reg1);
2987 }
2988
2989 for (int i=0; i<16; i += 4) {
2990 vec_getWord(reg1, vrB, i);
2991
2992 vec_saturateSH(reg1, reg2);
2993
2994 vec_setHalf(vrD, (i >> 1), (NativeReg16)reg1);
2995 }
2996 } else {
2997 for (int i=0; i<16; i += 4) {
2998 vec_getWord(reg1, vrB, i);
2999
3000 vec_saturateSH(reg1, reg2);
3001
3002 vec_setHalf(vrD, (i >> 1), (NativeReg16)reg1);
3003 }
3004
3005 for (int i=0; i<16; i += 4) {
3006 vec_getWord(reg1, vrA, i);
3007
3008 vec_saturateSH(reg1, reg2);
3009
3010 vec_setHalf(vrD, 8 + (i >> 1), (NativeReg16)reg1);
3011 }
3012 }
3013
3014 return flowContinue;
3015 #endif
3016 }
3017
3018 /* vpkshus Vector Pack Signed Half Word Unsigned Saturate
3019 * v.221
3020 */
3021 void ppc_opc_vpkshus()
3022 {
3023 VECTOR_DEBUG;
3024 int vrD, vrA, vrB;
3025 Vector_t r;
3026 PPC_OPC_TEMPL_X(gCPU.current_opc, vrD, vrA, vrB);
3027
3028 VECT_B(r, 0) = SATURATE_USB(VECT_H(gCPU.vr[vrA], 0));
3029 VECT_B(r, 1) = SATURATE_USB(VECT_H(gCPU.vr[vrA], 1));
3030 VECT_B(r, 2) = SATURATE_USB(VECT_H(gCPU.vr[vrA], 2));
3031 VECT_B(r, 3) = SATURATE_USB(VECT_H(gCPU.vr[vrA], 3));
3032 VECT_B(r, 4) = SATURATE_USB(VECT_H(gCPU.vr[vrA], 4));
3033 VECT_B(r, 5) = SATURATE_USB(VECT_H(gCPU.vr[vrA], 5));
3034 VECT_B(r, 6) = SATURATE_USB(VECT_H(gCPU.vr[vrA], 6));
3035 VECT_B(r, 7) = SATURATE_USB(VECT_H(gCPU.vr[vrA], 7));
3036
3037 VECT_B(r, 8) = SATURATE_USB(VECT_H(gCPU.vr[vrB], 0));
3038 VECT_B(r, 9) = SATURATE_USB(VECT_H(gCPU.vr[vrB], 1));
3039 VECT_B(r,10) = SATURATE_USB(VECT_H(gCPU.vr[vrB], 2));
3040 VECT_B(r,11) = SATURATE_USB(VECT_H(gCPU.vr[vrB], 3));
3041 VECT_B(r,12) = SATURATE_USB(VECT_H(gCPU.vr[vrB], 4));
3042 VECT_B(r,13) = SATURATE_USB(VECT_H(gCPU.vr[vrB], 5));
3043 VECT_B(r,14) = SATURATE_USB(VECT_H(gCPU.vr[vrB], 6));
3044 VECT_B(r,15) = SATURATE_USB(VECT_H(gCPU.vr[vrB], 7));
3045
3046 gCPU.vr[vrD] = r;
3047 }
3048 JITCFlow ppc_opc_gen_vpkshus()
3049 {
3050 #if 0
3051 ppc_opc_gen_interpret(ppc_opc_vpkshus);
3052 return flowEndBlock;
3053 #else
3054 int vrD, vrA, vrB;
3055 PPC_OPC_TEMPL_X(gJITC.current_opc, vrD, vrA, vrB);
3056
3057 if (SSE2_AVAIL) {
3058 noncommutative_operation(X86_PACKUSWB, vrD, vrB, vrA);
3059 return flowContinue;
3060 }
3061
3062 NativeReg reg1 = jitcAllocRegister(NATIVE_REG_8);
3063 NativeReg reg2 = jitcAllocRegister();
3064
3065 if (vrB == vrD && vrA == vrD) {
3066 for (int i=0; i<16; i += 2) {
3067 vec_getHalfS(reg1, vrA, i);
3068
3069 vec_saturateSUB(reg1, reg2);
3070
3071 vec_setByte(vrT, REG_NO, (i >> 1), (NativeReg8)reg1);
3072 vec_setByte(vrT, REG_NO, 8+(i >> 1), (NativeReg8)reg1);
3073 }
3074
3075 vec_Copy(vrD, vrT);
3076 } else if (vrA == vrD) {
3077 for (int i=0; i<16; i += 2) {
3078 vec_getHalfS(reg1, vrA, 14 - i);
3079
3080 vec_saturateSUB(reg1, reg2);
3081
3082 vec_setByte(vrD, REG_NO, 15 - (i >> 1), (