// gem5 v24.0.0.0 — vop3.cc: VEGA ISA VOP3 (three-operand vector ALU)
// instruction implementations.
1/*
2 * Copyright (c) 2024 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
#include "arch/amdgpu/vega/insts/instructions.hh"
36namespace gem5
37{
38
39namespace VegaISA
40{
41 // --- Inst_VOP3__V_CNDMASK_B32 class methods ---
42
44 : Inst_VOP3A(iFmt, "v_cndmask_b32", false)
45 {
46 setFlag(ALU);
47 setFlag(ReadsVCC);
48 } // Inst_VOP3__V_CNDMASK_B32
49
51 {
52 } // ~Inst_VOP3__V_CNDMASK_B32
53
54 // --- description from .arch file ---
55 // D.u = (VCC[i] ? S1.u : S0.u) (i = threadID in wave); VOP3: specify VCC
56 // as a scalar GPR in S2.
57 void
59 {
60 Wavefront *wf = gpuDynInst->wavefront();
61 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
62 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
63 ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2);
64 VecOperandU32 vdst(gpuDynInst, instData.VDST);
65
66 src0.readSrc();
67 src1.readSrc();
68 vcc.read();
69
70 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
71 if (wf->execMask(lane)) {
72 vdst[lane] = bits(vcc.rawData(), lane)
73 ? src1[lane] : src0[lane];
74 }
75 }
76
77 vdst.write();
78 } // execute
79 // --- Inst_VOP3__V_ADD_F32 class methods ---
80
82 : Inst_VOP3A(iFmt, "v_add_f32", false)
83 {
84 setFlag(ALU);
85 setFlag(F32);
86 } // Inst_VOP3__V_ADD_F32
87
89 {
90 } // ~Inst_VOP3__V_ADD_F32
91
92 // --- description from .arch file ---
93 // D.f = S0.f + S1.f.
94 void
96 {
97 Wavefront *wf = gpuDynInst->wavefront();
98 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
99 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
100 VecOperandF32 vdst(gpuDynInst, instData.VDST);
101
102 src0.readSrc();
103 src1.readSrc();
104
105 if (instData.ABS & 0x1) {
106 src0.absModifier();
107 }
108
109 if (instData.ABS & 0x2) {
110 src1.absModifier();
111 }
112
113 if (extData.NEG & 0x1) {
114 src0.negModifier();
115 }
116
117 if (extData.NEG & 0x2) {
118 src1.negModifier();
119 }
120
124 assert(!(instData.ABS & 0x4));
125 assert(!(extData.NEG & 0x4));
126
127 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
128 if (wf->execMask(lane)) {
129 vdst[lane] = src0[lane] + src1[lane];
130 }
131 }
132
133 vdst.write();
134 } // execute
135 // --- Inst_VOP3__V_SUB_F32 class methods ---
136
138 : Inst_VOP3A(iFmt, "v_sub_f32", false)
139 {
140 setFlag(ALU);
141 setFlag(F32);
142 } // Inst_VOP3__V_SUB_F32
143
145 {
146 } // ~Inst_VOP3__V_SUB_F32
147
148 // --- description from .arch file ---
149 // D.f = S0.f - S1.f.
150 // SQ translates to V_ADD_F32.
151 void
153 {
154 Wavefront *wf = gpuDynInst->wavefront();
155 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
156 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
157 VecOperandF32 vdst(gpuDynInst, instData.VDST);
158
159 src0.readSrc();
160 src1.readSrc();
161
162 if (instData.ABS & 0x1) {
163 src0.absModifier();
164 }
165
166 if (instData.ABS & 0x2) {
167 src1.absModifier();
168 }
169
170 if (extData.NEG & 0x1) {
171 src0.negModifier();
172 }
173
174 if (extData.NEG & 0x2) {
175 src1.negModifier();
176 }
177
181 assert(!(instData.ABS & 0x4));
182 assert(!(extData.NEG & 0x4));
183
184 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
185 if (wf->execMask(lane)) {
186 vdst[lane] = src0[lane] - src1[lane];
187 }
188 }
189
190 vdst.write();
191 } // execute
192 // --- Inst_VOP3__V_SUBREV_F32 class methods ---
193
195 : Inst_VOP3A(iFmt, "v_subrev_f32", false)
196 {
197 setFlag(ALU);
198 setFlag(F32);
199 } // Inst_VOP3__V_SUBREV_F32
200
202 {
203 } // ~Inst_VOP3__V_SUBREV_F32
204
205 // --- description from .arch file ---
206 // D.f = S1.f - S0.f.
207 // SQ translates to V_ADD_F32.
208 void
210 {
211 Wavefront *wf = gpuDynInst->wavefront();
212 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
213 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
214 VecOperandF32 vdst(gpuDynInst, instData.VDST);
215
216 src0.readSrc();
217 src1.readSrc();
218
219 if (instData.ABS & 0x1) {
220 src0.absModifier();
221 }
222
223 if (instData.ABS & 0x2) {
224 src1.absModifier();
225 }
226
227 if (extData.NEG & 0x1) {
228 src0.negModifier();
229 }
230
231 if (extData.NEG & 0x2) {
232 src1.negModifier();
233 }
234
238 assert(!(instData.ABS & 0x4));
239 assert(!(extData.NEG & 0x4));
240
241 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
242 if (wf->execMask(lane)) {
243 vdst[lane] = src1[lane] - src0[lane];
244 }
245 }
246
247 vdst.write();
248 } // execute
249 // --- Inst_VOP3__V_MUL_LEGACY_F32 class methods ---
250
252 : Inst_VOP3A(iFmt, "v_mul_legacy_f32", false)
253 {
254 setFlag(ALU);
255 setFlag(F32);
256 } // Inst_VOP3__V_MUL_LEGACY_F32
257
259 {
260 } // ~Inst_VOP3__V_MUL_LEGACY_F32
261
262 // --- description from .arch file ---
263 // D.f = S0.f * S1.f (DX9 rules, 0.0*x = 0.0).
264 void
266 {
267 Wavefront *wf = gpuDynInst->wavefront();
268 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
269 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
270 VecOperandF32 vdst(gpuDynInst, instData.VDST);
271
272 src0.readSrc();
273 src1.readSrc();
274
275 if (instData.ABS & 0x1) {
276 src0.absModifier();
277 }
278
279 if (instData.ABS & 0x2) {
280 src1.absModifier();
281 }
282
283 if (extData.NEG & 0x1) {
284 src0.negModifier();
285 }
286
287 if (extData.NEG & 0x2) {
288 src1.negModifier();
289 }
290
294 assert(!(instData.ABS & 0x4));
295 assert(!(extData.NEG & 0x4));
296
297 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
298 if (wf->execMask(lane)) {
299 if (std::isnan(src0[lane]) ||
300 std::isnan(src1[lane])) {
301 vdst[lane] = NAN;
302 } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
303 std::fpclassify(src0[lane]) == FP_ZERO) &&
304 !std::signbit(src0[lane])) {
305 if (std::isinf(src1[lane])) {
306 vdst[lane] = NAN;
307 } else if (!std::signbit(src1[lane])) {
308 vdst[lane] = +0.0;
309 } else {
310 vdst[lane] = -0.0;
311 }
312 } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
313 std::fpclassify(src0[lane]) == FP_ZERO) &&
314 std::signbit(src0[lane])) {
315 if (std::isinf(src1[lane])) {
316 vdst[lane] = NAN;
317 } else if (std::signbit(src1[lane])) {
318 vdst[lane] = +0.0;
319 } else {
320 vdst[lane] = -0.0;
321 }
322 } else if (std::isinf(src0[lane]) &&
323 !std::signbit(src0[lane])) {
324 if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
325 std::fpclassify(src1[lane]) == FP_ZERO) {
326 vdst[lane] = NAN;
327 } else if (!std::signbit(src1[lane])) {
328 vdst[lane] = +INFINITY;
329 } else {
330 vdst[lane] = -INFINITY;
331 }
332 } else if (std::isinf(src0[lane]) &&
333 std::signbit(src0[lane])) {
334 if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
335 std::fpclassify(src1[lane]) == FP_ZERO) {
336 vdst[lane] = NAN;
337 } else if (std::signbit(src1[lane])) {
338 vdst[lane] = +INFINITY;
339 } else {
340 vdst[lane] = -INFINITY;
341 }
342 } else {
343 vdst[lane] = src0[lane] * src1[lane];
344 }
345 }
346 }
347
348 vdst.write();
349 } // execute
350 // --- Inst_VOP3__V_MUL_F32 class methods ---
351
353 : Inst_VOP3A(iFmt, "v_mul_f32", false)
354 {
355 setFlag(ALU);
356 setFlag(F32);
357 } // Inst_VOP3__V_MUL_F32
358
360 {
361 } // ~Inst_VOP3__V_MUL_F32
362
363 // --- description from .arch file ---
364 // D.f = S0.f * S1.f.
365 void
367 {
368 Wavefront *wf = gpuDynInst->wavefront();
369 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
370 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
371 VecOperandF32 vdst(gpuDynInst, instData.VDST);
372
373 src0.readSrc();
374 src1.readSrc();
375
376 if (instData.ABS & 0x1) {
377 src0.absModifier();
378 }
379
380 if (instData.ABS & 0x2) {
381 src1.absModifier();
382 }
383
384 if (extData.NEG & 0x1) {
385 src0.negModifier();
386 }
387
388 if (extData.NEG & 0x2) {
389 src1.negModifier();
390 }
391
395 assert(!(instData.ABS & 0x4));
396 assert(!(extData.NEG & 0x4));
397
398 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
399 if (wf->execMask(lane)) {
400 if (std::isnan(src0[lane]) ||
401 std::isnan(src1[lane])) {
402 vdst[lane] = NAN;
403 } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
404 std::fpclassify(src0[lane]) == FP_ZERO) &&
405 !std::signbit(src0[lane])) {
406 if (std::isinf(src1[lane])) {
407 vdst[lane] = NAN;
408 } else if (!std::signbit(src1[lane])) {
409 vdst[lane] = +0.0;
410 } else {
411 vdst[lane] = -0.0;
412 }
413 } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
414 std::fpclassify(src0[lane]) == FP_ZERO) &&
415 std::signbit(src0[lane])) {
416 if (std::isinf(src1[lane])) {
417 vdst[lane] = NAN;
418 } else if (std::signbit(src1[lane])) {
419 vdst[lane] = +0.0;
420 } else {
421 vdst[lane] = -0.0;
422 }
423 } else if (std::isinf(src0[lane]) &&
424 !std::signbit(src0[lane])) {
425 if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
426 std::fpclassify(src1[lane]) == FP_ZERO) {
427 vdst[lane] = NAN;
428 } else if (!std::signbit(src1[lane])) {
429 vdst[lane] = +INFINITY;
430 } else {
431 vdst[lane] = -INFINITY;
432 }
433 } else if (std::isinf(src0[lane]) &&
434 std::signbit(src0[lane])) {
435 if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
436 std::fpclassify(src1[lane]) == FP_ZERO) {
437 vdst[lane] = NAN;
438 } else if (std::signbit(src1[lane])) {
439 vdst[lane] = +INFINITY;
440 } else {
441 vdst[lane] = -INFINITY;
442 }
443 } else {
444 vdst[lane] = src0[lane] * src1[lane];
445 }
446 }
447 }
448
449 vdst.write();
450 } // execute
451 // --- Inst_VOP3__V_MUL_I32_I24 class methods ---
452
454 : Inst_VOP3A(iFmt, "v_mul_i32_i24", false)
455 {
456 setFlag(ALU);
457 } // Inst_VOP3__V_MUL_I32_I24
458
460 {
461 } // ~Inst_VOP3__V_MUL_I32_I24
462
463 // --- description from .arch file ---
464 // D.i = S0.i[23:0] * S1.i[23:0].
465 void
467 {
468 Wavefront *wf = gpuDynInst->wavefront();
469 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
470 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
471 VecOperandI32 vdst(gpuDynInst, instData.VDST);
472
473 src0.readSrc();
474 src1.read();
475
479 assert(!(instData.ABS & 0x1));
480 assert(!(instData.ABS & 0x2));
481 assert(!(instData.ABS & 0x4));
482 assert(!(extData.NEG & 0x1));
483 assert(!(extData.NEG & 0x2));
484 assert(!(extData.NEG & 0x4));
485
486 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
487 if (wf->execMask(lane)) {
488 vdst[lane] = sext<24>(bits(src0[lane], 23, 0))
489 * sext<24>(bits(src1[lane], 23, 0));
490 }
491 }
492
493 vdst.write();
494 } // execute
495 // --- Inst_VOP3__V_MUL_HI_I32_I24 class methods ---
496
498 : Inst_VOP3A(iFmt, "v_mul_hi_i32_i24", false)
499 {
500 setFlag(ALU);
501 } // Inst_VOP3__V_MUL_HI_I32_I24
502
504 {
505 } // ~Inst_VOP3__V_MUL_HI_I32_I24
506
507 // --- description from .arch file ---
508 // D.i = (S0.i[23:0] * S1.i[23:0])>>32.
509 void
511 {
512 Wavefront *wf = gpuDynInst->wavefront();
513 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
514 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
515 VecOperandI32 vdst(gpuDynInst, instData.VDST);
516
517 src0.readSrc();
518 src1.readSrc();
519
523 assert(!(instData.ABS & 0x1));
524 assert(!(instData.ABS & 0x2));
525 assert(!(instData.ABS & 0x4));
526 assert(!(extData.NEG & 0x1));
527 assert(!(extData.NEG & 0x2));
528 assert(!(extData.NEG & 0x4));
529
530 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
531 if (wf->execMask(lane)) {
532 VecElemI64 tmp_src0
533 = (VecElemI64)sext<24>(bits(src0[lane], 23, 0));
534 VecElemI64 tmp_src1
535 = (VecElemI64)sext<24>(bits(src1[lane], 23, 0));
536
537 vdst[lane] = (VecElemI32)((tmp_src0 * tmp_src1) >> 32);
538 }
539 }
540
541 vdst.write();
542 } // execute
543 // --- Inst_VOP3__V_MUL_U32_U24 class methods ---
544
546 : Inst_VOP3A(iFmt, "v_mul_u32_u24", false)
547 {
548 setFlag(ALU);
549 } // Inst_VOP3__V_MUL_U32_U24
550
552 {
553 } // ~Inst_VOP3__V_MUL_U32_U24
554
555 // --- description from .arch file ---
556 // D.u = S0.u[23:0] * S1.u[23:0].
557 void
559 {
560 Wavefront *wf = gpuDynInst->wavefront();
561 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
562 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
563 VecOperandU32 vdst(gpuDynInst, instData.VDST);
564
565 src0.readSrc();
566 src1.readSrc();
567
571 assert(!(instData.ABS & 0x1));
572 assert(!(instData.ABS & 0x2));
573 assert(!(instData.ABS & 0x4));
574 assert(!(extData.NEG & 0x1));
575 assert(!(extData.NEG & 0x2));
576 assert(!(extData.NEG & 0x4));
577
578 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
579 if (wf->execMask(lane)) {
580 vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0);
581 }
582 }
583
584 vdst.write();
585 } // execute
586 // --- Inst_VOP3__V_MUL_HI_U32_U24 class methods ---
587
589 : Inst_VOP3A(iFmt, "v_mul_hi_u32_u24", false)
590 {
591 setFlag(ALU);
592 } // Inst_VOP3__V_MUL_HI_U32_U24
593
595 {
596 } // ~Inst_VOP3__V_MUL_HI_U32_U24
597
598 // --- description from .arch file ---
599 // D.i = (S0.u[23:0] * S1.u[23:0])>>32.
600 void
602 {
603 Wavefront *wf = gpuDynInst->wavefront();
604 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
605 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
606 VecOperandU32 vdst(gpuDynInst, instData.VDST);
607
608 src0.readSrc();
609 src1.readSrc();
610
614 assert(!(instData.ABS & 0x1));
615 assert(!(instData.ABS & 0x2));
616 assert(!(instData.ABS & 0x4));
617 assert(!(extData.NEG & 0x1));
618 assert(!(extData.NEG & 0x2));
619 assert(!(extData.NEG & 0x4));
620
621 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
622 if (wf->execMask(lane)) {
623 VecElemU64 tmp_src0 = (VecElemU64)bits(src0[lane], 23, 0);
624 VecElemU64 tmp_src1 = (VecElemU64)bits(src1[lane], 23, 0);
625 vdst[lane] = (VecElemU32)((tmp_src0 * tmp_src1) >> 32);
626 }
627 }
628
629 vdst.write();
630 } // execute
631 // --- Inst_VOP3__V_MIN_F32 class methods ---
632
634 : Inst_VOP3A(iFmt, "v_min_f32", false)
635 {
636 setFlag(ALU);
637 setFlag(F32);
638 } // Inst_VOP3__V_MIN_F32
639
641 {
642 } // ~Inst_VOP3__V_MIN_F32
643
644 // --- description from .arch file ---
645 // D.f = (S0.f < S1.f ? S0.f : S1.f).
646 void
648 {
649 Wavefront *wf = gpuDynInst->wavefront();
650 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
651 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
652 VecOperandF32 vdst(gpuDynInst, instData.VDST);
653
654 src0.readSrc();
655 src1.readSrc();
656
657 if (instData.ABS & 0x1) {
658 src0.absModifier();
659 }
660
661 if (instData.ABS & 0x2) {
662 src1.absModifier();
663 }
664
665 if (extData.NEG & 0x1) {
666 src0.negModifier();
667 }
668
669 if (extData.NEG & 0x2) {
670 src1.negModifier();
671 }
672
676 assert(!(instData.ABS & 0x4));
677 assert(!(extData.NEG & 0x4));
678
679 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
680 if (wf->execMask(lane)) {
681 vdst[lane] = std::fmin(src0[lane], src1[lane]);
682 }
683 }
684
685 vdst.write();
686 } // execute
687 // --- Inst_VOP3__V_MAX_F32 class methods ---
688
690 : Inst_VOP3A(iFmt, "v_max_f32", false)
691 {
692 setFlag(ALU);
693 setFlag(F32);
694 } // Inst_VOP3__V_MAX_F32
695
697 {
698 } // ~Inst_VOP3__V_MAX_F32
699
700 // --- description from .arch file ---
701 // D.f = (S0.f >= S1.f ? S0.f : S1.f).
702 void
704 {
705 Wavefront *wf = gpuDynInst->wavefront();
706 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
707 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
708 VecOperandF32 vdst(gpuDynInst, instData.VDST);
709
710 src0.readSrc();
711 src1.readSrc();
712
713 if (instData.ABS & 0x1) {
714 src0.absModifier();
715 }
716
717 if (instData.ABS & 0x2) {
718 src1.absModifier();
719 }
720
721 if (extData.NEG & 0x1) {
722 src0.negModifier();
723 }
724
725 if (extData.NEG & 0x2) {
726 src1.negModifier();
727 }
728
732 assert(!(instData.ABS & 0x4));
733 assert(!(extData.NEG & 0x4));
734
735 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
736 if (wf->execMask(lane)) {
737 vdst[lane] = std::fmax(src0[lane], src1[lane]);
738 }
739 }
740
741 vdst.write();
742 } // execute
743 // --- Inst_VOP3__V_MIN_I32 class methods ---
744
746 : Inst_VOP3A(iFmt, "v_min_i32", false)
747 {
748 setFlag(ALU);
749 } // Inst_VOP3__V_MIN_I32
750
752 {
753 } // ~Inst_VOP3__V_MIN_I32
754
755 // --- description from .arch file ---
756 // D.i = min(S0.i, S1.i).
757 void
759 {
760 Wavefront *wf = gpuDynInst->wavefront();
761 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
762 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
763 VecOperandI32 vdst(gpuDynInst, instData.VDST);
764
765 src0.readSrc();
766 src1.readSrc();
767
771 assert(!(instData.ABS & 0x1));
772 assert(!(instData.ABS & 0x2));
773 assert(!(instData.ABS & 0x4));
774 assert(!(extData.NEG & 0x1));
775 assert(!(extData.NEG & 0x2));
776 assert(!(extData.NEG & 0x4));
777
778 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
779 if (wf->execMask(lane)) {
780 vdst[lane] = std::min(src0[lane], src1[lane]);
781 }
782 }
783
784 vdst.write();
785 } // execute
786 // --- Inst_VOP3__V_MAX_I32 class methods ---
787
789 : Inst_VOP3A(iFmt, "v_max_i32", false)
790 {
791 setFlag(ALU);
792 } // Inst_VOP3__V_MAX_I32
793
795 {
796 } // ~Inst_VOP3__V_MAX_I32
797
798 // --- description from .arch file ---
799 // D.i = max(S0.i, S1.i).
800 void
802 {
803 Wavefront *wf = gpuDynInst->wavefront();
804 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
805 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
806 VecOperandI32 vdst(gpuDynInst, instData.VDST);
807
808 src0.readSrc();
809 src1.readSrc();
810
814 assert(!(instData.ABS & 0x1));
815 assert(!(instData.ABS & 0x2));
816 assert(!(instData.ABS & 0x4));
817 assert(!(extData.NEG & 0x1));
818 assert(!(extData.NEG & 0x2));
819 assert(!(extData.NEG & 0x4));
820
821 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
822 if (wf->execMask(lane)) {
823 vdst[lane] = std::max(src0[lane], src1[lane]);
824 }
825 }
826
827 vdst.write();
828 } // execute
829 // --- Inst_VOP3__V_MIN_U32 class methods ---
830
832 : Inst_VOP3A(iFmt, "v_min_u32", false)
833 {
834 setFlag(ALU);
835 } // Inst_VOP3__V_MIN_U32
836
838 {
839 } // ~Inst_VOP3__V_MIN_U32
840
841 // --- description from .arch file ---
842 // D.u = min(S0.u, S1.u).
843 void
845 {
846 Wavefront *wf = gpuDynInst->wavefront();
847 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
848 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
849 VecOperandU32 vdst(gpuDynInst, instData.VDST);
850
851 src0.readSrc();
852 src1.readSrc();
853
857 assert(!(instData.ABS & 0x1));
858 assert(!(instData.ABS & 0x2));
859 assert(!(instData.ABS & 0x4));
860 assert(!(extData.NEG & 0x1));
861 assert(!(extData.NEG & 0x2));
862 assert(!(extData.NEG & 0x4));
863
864 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
865 if (wf->execMask(lane)) {
866 vdst[lane] = std::min(src0[lane], src1[lane]);
867 }
868 }
869
870 vdst.write();
871 } // execute
872 // --- Inst_VOP3__V_MAX_U32 class methods ---
873
875 : Inst_VOP3A(iFmt, "v_max_u32", false)
876 {
877 setFlag(ALU);
878 } // Inst_VOP3__V_MAX_U32
879
881 {
882 } // ~Inst_VOP3__V_MAX_U32
883
884 // --- description from .arch file ---
885 // D.u = max(S0.u, S1.u).
886 void
888 {
889 Wavefront *wf = gpuDynInst->wavefront();
890 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
891 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
892 VecOperandU32 vdst(gpuDynInst, instData.VDST);
893
894 src0.readSrc();
895 src1.readSrc();
896
900 assert(!(instData.ABS & 0x1));
901 assert(!(instData.ABS & 0x2));
902 assert(!(instData.ABS & 0x4));
903 assert(!(extData.NEG & 0x1));
904 assert(!(extData.NEG & 0x2));
905 assert(!(extData.NEG & 0x4));
906
907 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
908 if (wf->execMask(lane)) {
909 vdst[lane] = std::max(src0[lane], src1[lane]);
910 }
911 }
912
913 vdst.write();
914 } // execute
915 // --- Inst_VOP3__V_LSHRREV_B32 class methods ---
916
918 : Inst_VOP3A(iFmt, "v_lshrrev_b32", false)
919 {
920 setFlag(ALU);
921 } // Inst_VOP3__V_LSHRREV_B32
922
924 {
925 } // ~Inst_VOP3__V_LSHRREV_B32
926
927 // --- description from .arch file ---
928 // D.u = S1.u >> S0.u[4:0].
929 // The vacated bits are set to zero.
930 // SQ translates this to an internal SP opcode.
931 void
933 {
934 Wavefront *wf = gpuDynInst->wavefront();
935 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
936 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
937 VecOperandU32 vdst(gpuDynInst, instData.VDST);
938
939 src0.readSrc();
940 src1.readSrc();
941
945 assert(!(instData.ABS & 0x1));
946 assert(!(instData.ABS & 0x2));
947 assert(!(instData.ABS & 0x4));
948 assert(!(extData.NEG & 0x1));
949 assert(!(extData.NEG & 0x2));
950 assert(!(extData.NEG & 0x4));
951
952 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
953 if (wf->execMask(lane)) {
954 vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0);
955 }
956 }
957
958 vdst.write();
959 } // execute
960 // --- Inst_VOP3__V_ASHRREV_I32 class methods ---
961
963 : Inst_VOP3A(iFmt, "v_ashrrev_i32", false)
964 {
965 setFlag(ALU);
966 } // Inst_VOP3__V_ASHRREV_I32
967
969 {
970 } // ~Inst_VOP3__V_ASHRREV_I32
971
972 // --- description from .arch file ---
973 // D.i = signext(S1.i) >> S0.i[4:0].
974 // The vacated bits are set to the sign bit of the input value.
975 // SQ translates this to an internal SP opcode.
976 void
978 {
979 Wavefront *wf = gpuDynInst->wavefront();
980 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
981 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
982 VecOperandI32 vdst(gpuDynInst, instData.VDST);
983
984 src0.readSrc();
985 src1.readSrc();
986
990 assert(!(instData.ABS & 0x1));
991 assert(!(instData.ABS & 0x2));
992 assert(!(instData.ABS & 0x4));
993 assert(!(extData.NEG & 0x1));
994 assert(!(extData.NEG & 0x2));
995 assert(!(extData.NEG & 0x4));
996
997 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
998 if (wf->execMask(lane)) {
999 vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0);
1000 }
1001 }
1002
1003 vdst.write();
1004 } // execute
1005 // --- Inst_VOP3__V_LSHLREV_B32 class methods ---
1006
1008 : Inst_VOP3A(iFmt, "v_lshlrev_b32", false)
1009 {
1010 setFlag(ALU);
1011 } // Inst_VOP3__V_LSHLREV_B32
1012
1014 {
1015 } // ~Inst_VOP3__V_LSHLREV_B32
1016
1017 // --- description from .arch file ---
1018 // D.u = S1.u << S0.u[4:0].
1019 // SQ translates this to an internal SP opcode.
1020 void
1022 {
1023 Wavefront *wf = gpuDynInst->wavefront();
1024 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1025 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1026 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1027
1028 src0.readSrc();
1029 src1.readSrc();
1030
1034 assert(!(instData.ABS & 0x1));
1035 assert(!(instData.ABS & 0x2));
1036 assert(!(instData.ABS & 0x4));
1037 assert(!(extData.NEG & 0x1));
1038 assert(!(extData.NEG & 0x2));
1039 assert(!(extData.NEG & 0x4));
1040
1041 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1042 if (wf->execMask(lane)) {
1043 vdst[lane] = src1[lane] << bits(src0[lane], 4, 0);
1044 }
1045 }
1046
1047 vdst.write();
1048 } // execute
1049 // --- Inst_VOP3__V_AND_B32 class methods ---
1050
1052 : Inst_VOP3A(iFmt, "v_and_b32", false)
1053 {
1054 setFlag(ALU);
1055 } // Inst_VOP3__V_AND_B32
1056
1058 {
1059 } // ~Inst_VOP3__V_AND_B32
1060
1061 // --- description from .arch file ---
1062 // D.u = S0.u & S1.u.
1063 // Input and output modifiers not supported.
1064 void
1066 {
1067 Wavefront *wf = gpuDynInst->wavefront();
1068 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1069 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1070 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1071
1072 src0.readSrc();
1073 src1.readSrc();
1074
1078 assert(!(instData.ABS & 0x1));
1079 assert(!(instData.ABS & 0x2));
1080 assert(!(instData.ABS & 0x4));
1081 assert(!(extData.NEG & 0x1));
1082 assert(!(extData.NEG & 0x2));
1083 assert(!(extData.NEG & 0x4));
1084
1085 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1086 if (wf->execMask(lane)) {
1087 vdst[lane] = src0[lane] & src1[lane];
1088 }
1089 }
1090
1091 vdst.write();
1092 } // execute
1093 // --- Inst_VOP3__V_OR_B32 class methods ---
1094
1096 : Inst_VOP3A(iFmt, "v_or_b32", false)
1097 {
1098 setFlag(ALU);
1099 } // Inst_VOP3__V_OR_B32
1100
1102 {
1103 } // ~Inst_VOP3__V_OR_B32
1104
1105 // --- description from .arch file ---
1106 // D.u = S0.u | S1.u.
1107 // Input and output modifiers not supported.
1108 void
1110 {
1111 Wavefront *wf = gpuDynInst->wavefront();
1112 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1113 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1114 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1115
1116 src0.readSrc();
1117 src1.readSrc();
1118
1122 assert(!(instData.ABS & 0x1));
1123 assert(!(instData.ABS & 0x2));
1124 assert(!(instData.ABS & 0x4));
1125 assert(!(extData.NEG & 0x1));
1126 assert(!(extData.NEG & 0x2));
1127 assert(!(extData.NEG & 0x4));
1128
1129 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1130 if (wf->execMask(lane)) {
1131 vdst[lane] = src0[lane] | src1[lane];
1132 }
1133 }
1134
1135 vdst.write();
1136 } // execute
1137 // --- Inst_VOP3__V_OR3_B32 class methods ---
1138
1140 : Inst_VOP3A(iFmt, "v_or3_b32", false)
1141 {
1142 setFlag(ALU);
1143 } // Inst_VOP3__V_OR3_B32
1144
1146 {
1147 } // ~Inst_VOP3__V_OR3_B32
1148
1149 // --- description from .arch file ---
1150 // D.u = S0.u | S1.u | S2.u.
1151 // Input and output modifiers not supported.
1152 void
1154 {
1155 Wavefront *wf = gpuDynInst->wavefront();
1156 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1157 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1158 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
1159 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1160
1161 src0.readSrc();
1162 src1.readSrc();
1163 src2.readSrc();
1164
1168 assert(!(instData.ABS & 0x1));
1169 assert(!(instData.ABS & 0x2));
1170 assert(!(instData.ABS & 0x4));
1171 assert(!(extData.NEG & 0x1));
1172 assert(!(extData.NEG & 0x2));
1173 assert(!(extData.NEG & 0x4));
1174
1175 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1176 if (wf->execMask(lane)) {
1177 vdst[lane] = src0[lane] | src1[lane] | src2[lane];
1178 }
1179 }
1180
1181 vdst.write();
1182 } // execute
1183 // --- Inst_VOP3__V_XOR_B32 class methods ---
1184
1186 : Inst_VOP3A(iFmt, "v_xor_b32", false)
1187 {
1188 setFlag(ALU);
1189 } // Inst_VOP3__V_XOR_B32
1190
1192 {
1193 } // ~Inst_VOP3__V_XOR_B32
1194
1195 // --- description from .arch file ---
1196 // D.u = S0.u ^ S1.u.
1197 // Input and output modifiers not supported.
1198 void
1200 {
1201 Wavefront *wf = gpuDynInst->wavefront();
1202 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1203 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1204 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1205
1206 src0.readSrc();
1207 src1.readSrc();
1208
1212 assert(!(instData.ABS & 0x1));
1213 assert(!(instData.ABS & 0x2));
1214 assert(!(instData.ABS & 0x4));
1215 assert(!(extData.NEG & 0x1));
1216 assert(!(extData.NEG & 0x2));
1217 assert(!(extData.NEG & 0x4));
1218
1219 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1220 if (wf->execMask(lane)) {
1221 vdst[lane] = src0[lane] ^ src1[lane];
1222 }
1223 }
1224
1225 vdst.write();
1226 } // execute
1227 // --- Inst_VOP3__V_MAC_F32 class methods ---
1228
1230 : Inst_VOP3A(iFmt, "v_mac_f32", false)
1231 {
1232 setFlag(ALU);
1233 setFlag(F32);
1234 setFlag(MAC);
1235 } // Inst_VOP3__V_MAC_F32
1236
1238 {
1239 } // ~Inst_VOP3__V_MAC_F32
1240
1241 // --- description from .arch file ---
1242 // D.f = S0.f * S1.f + D.f.
1243 // SQ translates to V_MAD_F32.
1244 void
1246 {
1247 Wavefront *wf = gpuDynInst->wavefront();
1248 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
1249 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
1250 VecOperandF32 vdst(gpuDynInst, instData.VDST);
1251
1252 src0.readSrc();
1253 src1.readSrc();
1254 vdst.read();
1255
1256 if (instData.ABS & 0x1) {
1257 src0.absModifier();
1258 }
1259
1260 if (instData.ABS & 0x2) {
1261 src1.absModifier();
1262 }
1263
1264 if (extData.NEG & 0x1) {
1265 src0.negModifier();
1266 }
1267
1268 if (extData.NEG & 0x2) {
1269 src1.negModifier();
1270 }
1271
1275 assert(!(instData.ABS & 0x4));
1276 assert(!(extData.NEG & 0x4));
1277
1278 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1279 if (wf->execMask(lane)) {
1280 vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]);
1281 }
1282 }
1283
1284 vdst.write();
1285 } // execute
1286 // --- Inst_VOP3__V_ADD_CO_U32 class methods ---
1287
1289 : Inst_VOP3B(iFmt, "v_add_co_u32")
1290 {
1291 setFlag(ALU);
1292 setFlag(WritesVCC);
1293 } // Inst_VOP3__V_ADD_CO_U32
1294
1296 {
1297 } // ~Inst_VOP3__V_ADD_CO_U32
1298
1299 // --- description from .arch file ---
1300 // D.u = S0.u + S1.u;
1301 // VCC[threadId] = (S0.u + S1.u >= 0x800000000ULL ? 1 : 0) is an UNSIGNED
1302 // --- overflow or carry-out for V_ADDC_U32.
1303 // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
1304 void
1306 {
1307 Wavefront *wf = gpuDynInst->wavefront();
1308 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1309 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1310 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1311 ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
1312
1313 src0.readSrc();
1314 src1.readSrc();
1315
1319 assert(!(extData.NEG & 0x1));
1320 assert(!(extData.NEG & 0x2));
1321 assert(!(extData.NEG & 0x4));
1322
1323 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1324 if (wf->execMask(lane)) {
1325 vdst[lane] = src0[lane] + src1[lane];
1326 vcc.setBit(lane, ((VecElemU64)src0[lane]
1327 + (VecElemU64)src1[lane]) >= 0x100000000ULL ? 1 : 0);
1328 }
1329 }
1330
1331 vdst.write();
1332 vcc.write();
1333 } // execute
1334 // --- Inst_VOP3__V_SUB_CO_U32 class methods ---
1335
1337 : Inst_VOP3B(iFmt, "v_sub_co_u32")
1338 {
1339 setFlag(ALU);
1340 setFlag(WritesVCC);
1341 } // Inst_VOP3__V_SUB_CO_U32
1342
1344 {
1345 } // ~Inst_VOP3__V_SUB_CO_U32
1346
1347 // --- description from .arch file ---
1348 // D.u = S0.u - S1.u;
1349 // VCC[threadId] = (S1.u > S0.u ? 1 : 0) is an UNSIGNED overflow or
1350 // carry-out for V_SUBB_U32.
1351 // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
1352 void
1354 {
1355 Wavefront *wf = gpuDynInst->wavefront();
1356 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1357 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1358 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1359 ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
1360
1361 src0.readSrc();
1362 src1.readSrc();
1363
1367 assert(!(extData.NEG & 0x1));
1368 assert(!(extData.NEG & 0x2));
1369 assert(!(extData.NEG & 0x4));
1370
1371 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1372 if (wf->execMask(lane)) {
1373 vdst[lane] = src0[lane] - src1[lane];
1374 vcc.setBit(lane, src1[lane] > src0[lane] ? 1 : 0);
1375 }
1376 }
1377
1378 vdst.write();
1379 vcc.write();
1380 } // execute
1381 // --- Inst_VOP3__V_SUBREV_CO_U32 class methods ---
1382
1384 InFmt_VOP3B *iFmt)
1385 : Inst_VOP3B(iFmt, "v_subrev_co_u32")
1386 {
1387 setFlag(ALU);
1388 setFlag(WritesVCC);
1389 } // Inst_VOP3__V_SUBREV_CO_U32
1390
1392 {
1393 } // ~Inst_VOP3__V_SUBREV_CO_U32
1394
1395 // --- description from .arch file ---
1396 // D.u = S1.u - S0.u;
1397 // VCC[threadId] = (S0.u > S1.u ? 1 : 0) is an UNSIGNED overflow or
1398 // carry-out for V_SUBB_U32.
1399 // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
1400 // SQ translates this to V_SUB_U32 with reversed operands.
1401 void
1403 {
1404 Wavefront *wf = gpuDynInst->wavefront();
1405 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1406 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1407 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1408 ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
1409
1410 src0.readSrc();
1411 src1.readSrc();
1412
1416 assert(!(extData.NEG & 0x1));
1417 assert(!(extData.NEG & 0x2));
1418 assert(!(extData.NEG & 0x4));
1419
1420 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1421 if (wf->execMask(lane)) {
1422 vdst[lane] = src1[lane] - src0[lane];
1423 vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
1424 }
1425 }
1426
1427 vdst.write();
1428 vcc.write();
1429 } // execute
1430 // --- Inst_VOP3__V_ADDC_CO_U32 class methods ---
1431
1433 : Inst_VOP3B(iFmt, "v_addc_co_u32")
1434 {
1435 setFlag(ALU);
1436 setFlag(WritesVCC);
1437 setFlag(ReadsVCC);
1438 } // Inst_VOP3__V_ADDC_CO_U32
1439
1441 {
1442 } // ~Inst_VOP3__V_ADDC_CO_U32
1443
1444 // --- description from .arch file ---
1445 // D.u = S0.u + S1.u + VCC[threadId];
 1446 // VCC[threadId] = (S0.u + S1.u + VCC[threadId] >= 0x100000000ULL ? 1 : 0)
1447 // is an UNSIGNED overflow.
1448 // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
1449 // source comes from the SGPR-pair at S2.u.
1450 void
1452 {
1453 Wavefront *wf = gpuDynInst->wavefront();
1454 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1455 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1456 ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2);
1457 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1458 ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
1459
1460 src0.readSrc();
1461 src1.readSrc();
1462 vcc.read();
1463
1467 assert(!(extData.NEG & 0x1));
1468 assert(!(extData.NEG & 0x2));
1469 assert(!(extData.NEG & 0x4));
1470
1471 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1472 if (wf->execMask(lane)) {
1473 vdst[lane] = src0[lane] + src1[lane]
1474 + bits(vcc.rawData(), lane);
1475 sdst.setBit(lane, ((VecElemU64)src0[lane]
1476 + (VecElemU64)src1[lane]
1477 + (VecElemU64)bits(vcc.rawData(), lane))
1478 >= 0x100000000 ? 1 : 0);
1479 }
1480 }
1481
1482 vdst.write();
1483 sdst.write();
1484 } // execute
1485 // --- Inst_VOP3__V_SUBB_CO_U32 class methods ---
1486
1488 : Inst_VOP3B(iFmt, "v_subb_co_u32")
1489 {
1490 setFlag(ALU);
1491 setFlag(WritesVCC);
1492 setFlag(ReadsVCC);
1493 } // Inst_VOP3__V_SUBB_CO_U32
1494
1496 {
1497 } // ~Inst_VOP3__V_SUBB_CO_U32
1498
1499 // --- description from .arch file ---
1500 // D.u = S0.u - S1.u - VCC[threadId];
1501 // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED
1502 // --- overflow.
1503 // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
1504 // --- source comes from the SGPR-pair at S2.u.
1505 void
1507 {
1508 Wavefront *wf = gpuDynInst->wavefront();
1509 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1510 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1511 ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2);
1512 ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
1513 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1514
1515 src0.readSrc();
1516 src1.readSrc();
1517 vcc.read();
1518
1522 assert(!(extData.NEG & 0x1));
1523 assert(!(extData.NEG & 0x2));
1524 assert(!(extData.NEG & 0x4));
1525
1526 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1527 if (wf->execMask(lane)) {
1528 vdst[lane] = src0[lane] - src1[lane]
1529 - bits(vcc.rawData(), lane);
1530 sdst.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane))
1531 > src0[lane] ? 1 : 0);
1532 }
1533 }
1534
1535 vdst.write();
1536 sdst.write();
1537 } // execute
1538 // --- Inst_VOP3__V_SUBBREV_CO_U32 class methods ---
1539
1541 InFmt_VOP3B *iFmt)
1542 : Inst_VOP3B(iFmt, "v_subbrev_co_u32")
1543 {
1544 setFlag(ALU);
1545 setFlag(WritesVCC);
1546 setFlag(ReadsVCC);
1547 } // Inst_VOP3__V_SUBBREV_CO_U32
1548
1550 {
1551 } // ~Inst_VOP3__V_SUBBREV_CO_U32
1552
1553 // --- description from .arch file ---
1554 // D.u = S1.u - S0.u - VCC[threadId];
1555 // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED
1556 // overflow.
1557 // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
1558 // source comes from the SGPR-pair at S2.u. SQ translates to V_SUBB_U32.
1559 void
1561 {
1562 Wavefront *wf = gpuDynInst->wavefront();
1563 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1564 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1565 ConstScalarOperandU64 sdst(gpuDynInst, instData.SDST);
1566 ScalarOperandU64 vcc(gpuDynInst, extData.SRC2);
1567 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1568
1569 src0.readSrc();
1570 src1.readSrc();
1571 vcc.read();
1572
1576 assert(!(extData.NEG & 0x1));
1577 assert(!(extData.NEG & 0x2));
1578 assert(!(extData.NEG & 0x4));
1579
1580 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1581 if (wf->execMask(lane)) {
1582 vdst[lane] = src1[lane] - src0[lane]
1583 - bits(vcc.rawData(), lane);
1584 sdst.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane))
1585 > src0[lane] ? 1 : 0);
1586 }
1587 }
1588
1589 vdst.write();
1590 sdst.write();
1591 } // execute
1592 // --- Inst_VOP3__V_ADD_F16 class methods ---
1593
1595 : Inst_VOP3A(iFmt, "v_add_f16", false)
1596 {
1597 setFlag(ALU);
1598 setFlag(F16);
1599 } // Inst_VOP3__V_ADD_F16
1600
1602 {
1603 } // ~Inst_VOP3__V_ADD_F16
1604
1605 // --- description from .arch file ---
1606 // D.f16 = S0.f16 + S1.f16.
1607 // Supports denormals, round mode, exception flags, saturation.
1608 void
1610 {
1612 } // execute
1613 // --- Inst_VOP3__V_SUB_F16 class methods ---
1614
1616 : Inst_VOP3A(iFmt, "v_sub_f16", false)
1617 {
1618 setFlag(ALU);
1619 setFlag(F16);
1620 } // Inst_VOP3__V_SUB_F16
1621
1623 {
1624 } // ~Inst_VOP3__V_SUB_F16
1625
1626 // --- description from .arch file ---
1627 // D.f16 = S0.f16 - S1.f16.
1628 // Supports denormals, round mode, exception flags, saturation.
1629 // SQ translates to V_ADD_F16.
1630 void
1632 {
1634 } // execute
1635 // --- Inst_VOP3__V_SUBREV_F16 class methods ---
1636
1638 : Inst_VOP3A(iFmt, "v_subrev_f16", false)
1639 {
1640 setFlag(ALU);
1641 setFlag(F16);
1642 } // Inst_VOP3__V_SUBREV_F16
1643
1645 {
1646 } // ~Inst_VOP3__V_SUBREV_F16
1647
1648 // --- description from .arch file ---
1649 // D.f16 = S1.f16 - S0.f16.
1650 // Supports denormals, round mode, exception flags, saturation.
1651 // SQ translates to V_ADD_F16.
1652 void
1654 {
1656 } // execute
1657 // --- Inst_VOP3__V_MUL_F16 class methods ---
1658
1660 : Inst_VOP3A(iFmt, "v_mul_f16", false)
1661 {
1662 setFlag(ALU);
1663 setFlag(F16);
1664 } // Inst_VOP3__V_MUL_F16
1665
1667 {
1668 } // ~Inst_VOP3__V_MUL_F16
1669
1670 // --- description from .arch file ---
1671 // D.f16 = S0.f16 * S1.f16.
1672 // Supports denormals, round mode, exception flags, saturation.
1673 void
1675 {
1677 } // execute
1678 // --- Inst_VOP3__V_MAC_F16 class methods ---
1679
1681 : Inst_VOP3A(iFmt, "v_mac_f16", false)
1682 {
1683 setFlag(ALU);
1684 setFlag(F16);
1685 setFlag(MAC);
1686 } // Inst_VOP3__V_MAC_F16
1687
1689 {
1690 } // ~Inst_VOP3__V_MAC_F16
1691
1692 // --- description from .arch file ---
1693 // D.f16 = S0.f16 * S1.f16 + D.f16.
1694 // Supports round mode, exception flags, saturation.
1695 // SQ translates this to V_MAD_F16.
1696 void
1698 {
1700 } // execute
1701 // --- Inst_VOP3__V_ADD_U16 class methods ---
1702
1704 : Inst_VOP3A(iFmt, "v_add_u16", false)
1705 {
1706 setFlag(ALU);
1707 } // Inst_VOP3__V_ADD_U16
1708
1710 {
1711 } // ~Inst_VOP3__V_ADD_U16
1712
1713 // --- description from .arch file ---
1714 // D.u16 = S0.u16 + S1.u16.
1715 // Supports saturation (unsigned 16-bit integer domain).
1716 void
1718 {
1719 Wavefront *wf = gpuDynInst->wavefront();
1720 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
1721 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
1722 VecOperandU16 vdst(gpuDynInst, instData.VDST);
1723
1724 src0.readSrc();
1725 src1.readSrc();
1726
1730 assert(!(instData.ABS & 0x1));
1731 assert(!(instData.ABS & 0x2));
1732 assert(!(instData.ABS & 0x4));
1733 assert(!(extData.NEG & 0x1));
1734 assert(!(extData.NEG & 0x2));
1735 assert(!(extData.NEG & 0x4));
1736
1737 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1738 if (wf->execMask(lane)) {
1739 vdst[lane] = src0[lane] + src1[lane];
1740 }
1741 }
1742
1743 vdst.write();
1744 } // execute
1745 // --- Inst_VOP3__V_SUB_U16 class methods ---
1746
1748 : Inst_VOP3A(iFmt, "v_sub_u16", false)
1749 {
1750 setFlag(ALU);
1751 } // Inst_VOP3__V_SUB_U16
1752
1754 {
1755 } // ~Inst_VOP3__V_SUB_U16
1756
1757 // --- description from .arch file ---
1758 // D.u16 = S0.u16 - S1.u16.
1759 // Supports saturation (unsigned 16-bit integer domain).
1760 void
1762 {
1763 Wavefront *wf = gpuDynInst->wavefront();
1764 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
1765 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
1766 VecOperandU16 vdst(gpuDynInst, instData.VDST);
1767
1768 src0.readSrc();
1769 src1.readSrc();
1770
1774 assert(!(instData.ABS & 0x1));
1775 assert(!(instData.ABS & 0x2));
1776 assert(!(instData.ABS & 0x4));
1777 assert(!(extData.NEG & 0x1));
1778 assert(!(extData.NEG & 0x2));
1779 assert(!(extData.NEG & 0x4));
1780
1781 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1782 if (wf->execMask(lane)) {
1783 vdst[lane] = src0[lane] - src1[lane];
1784 }
1785 }
1786
1787 vdst.write();
1788 } // execute
1789 // --- Inst_VOP3__V_SUBREV_U16 class methods ---
1790
1792 : Inst_VOP3A(iFmt, "v_subrev_u16", false)
1793 {
1794 setFlag(ALU);
1795 } // Inst_VOP3__V_SUBREV_U16
1796
1798 {
1799 } // ~Inst_VOP3__V_SUBREV_U16
1800
1801 // --- description from .arch file ---
1802 // D.u16 = S1.u16 - S0.u16.
1803 // Supports saturation (unsigned 16-bit integer domain).
1804 // SQ translates this to V_SUB_U16 with reversed operands.
1805 void
1807 {
1808 Wavefront *wf = gpuDynInst->wavefront();
1809 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
1810 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
1811 VecOperandU16 vdst(gpuDynInst, instData.VDST);
1812
1813 src0.readSrc();
1814 src1.readSrc();
1815
1819 assert(!(instData.ABS & 0x1));
1820 assert(!(instData.ABS & 0x2));
1821 assert(!(instData.ABS & 0x4));
1822 assert(!(extData.NEG & 0x1));
1823 assert(!(extData.NEG & 0x2));
1824 assert(!(extData.NEG & 0x4));
1825
1826 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1827 if (wf->execMask(lane)) {
1828 vdst[lane] = src1[lane] - src0[lane];
1829 }
1830 }
1831
1832 vdst.write();
1833 } // execute
1834 // --- Inst_VOP3__V_MUL_LO_U16 class methods ---
1835
1837 : Inst_VOP3A(iFmt, "v_mul_lo_u16", false)
1838 {
1839 setFlag(ALU);
1840 } // Inst_VOP3__V_MUL_LO_U16
1841
1843 {
1844 } // ~Inst_VOP3__V_MUL_LO_U16
1845
1846 // --- description from .arch file ---
1847 // D.u16 = S0.u16 * S1.u16.
1848 // Supports saturation (unsigned 16-bit integer domain).
1849 void
1851 {
1852 Wavefront *wf = gpuDynInst->wavefront();
1853 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
1854 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
1855 VecOperandU16 vdst(gpuDynInst, instData.VDST);
1856
1857 src0.readSrc();
1858 src1.readSrc();
1859
1863 assert(!(instData.ABS & 0x1));
1864 assert(!(instData.ABS & 0x2));
1865 assert(!(instData.ABS & 0x4));
1866 assert(!(extData.NEG & 0x1));
1867 assert(!(extData.NEG & 0x2));
1868 assert(!(extData.NEG & 0x4));
1869
1870 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1871 if (wf->execMask(lane)) {
1872 vdst[lane] = src0[lane] * src1[lane];
1873 }
1874 }
1875
1876 vdst.write();
1877 } // execute
1878 // --- Inst_VOP3__V_LSHLREV_B16 class methods ---
1879
1881 : Inst_VOP3A(iFmt, "v_lshlrev_b16", false)
1882 {
1883 setFlag(ALU);
1884 } // Inst_VOP3__V_LSHLREV_B16
1885
1887 {
1888 } // ~Inst_VOP3__V_LSHLREV_B16
1889
1890 // --- description from .arch file ---
1891 // D.u[15:0] = S1.u[15:0] << S0.u[3:0].
1892 // SQ translates this to an internal SP opcode.
1893 void
1895 {
1896 Wavefront *wf = gpuDynInst->wavefront();
1897 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
1898 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
1899 VecOperandU16 vdst(gpuDynInst, instData.VDST);
1900
1901 src0.readSrc();
1902 src1.readSrc();
1903
1907 assert(!(instData.ABS & 0x1));
1908 assert(!(instData.ABS & 0x2));
1909 assert(!(instData.ABS & 0x4));
1910 assert(!(extData.NEG & 0x1));
1911 assert(!(extData.NEG & 0x2));
1912 assert(!(extData.NEG & 0x4));
1913
1914 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1915 if (wf->execMask(lane)) {
1916 vdst[lane] = src1[lane] << bits(src0[lane], 3, 0);
1917 }
1918 }
1919
1920 vdst.write();
1921 } // execute
1922 // --- Inst_VOP3__V_LSHRREV_B16 class methods ---
1923
1925 : Inst_VOP3A(iFmt, "v_lshrrev_b16", false)
1926 {
1927 setFlag(ALU);
1928 } // Inst_VOP3__V_LSHRREV_B16
1929
1931 {
1932 } // ~Inst_VOP3__V_LSHRREV_B16
1933
1934 // --- description from .arch file ---
1935 // D.u[15:0] = S1.u[15:0] >> S0.u[3:0].
1936 // The vacated bits are set to zero.
1937 // SQ translates this to an internal SP opcode.
1938 void
1940 {
1941 Wavefront *wf = gpuDynInst->wavefront();
1942 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
1943 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
1944 VecOperandU16 vdst(gpuDynInst, instData.VDST);
1945
1946 src0.readSrc();
1947 src1.readSrc();
1948
1949 if (instData.ABS & 0x1) {
1950 src0.absModifier();
1951 }
1952
1953 if (instData.ABS & 0x2) {
1954 src1.absModifier();
1955 }
1956
1957 if (extData.NEG & 0x1) {
1958 src0.negModifier();
1959 }
1960
1961 if (extData.NEG & 0x2) {
1962 src1.negModifier();
1963 }
1964
1965 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1966 if (wf->execMask(lane)) {
1967 vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0);
1968 }
1969 }
1970
1971 vdst.write();
1972 } // execute
1973 // --- Inst_VOP3__V_ASHRREV_I16 class methods ---
1974
1976 : Inst_VOP3A(iFmt, "v_ashrrev_i16", false)
1977 {
1978 setFlag(ALU);
1979 } // Inst_VOP3__V_ASHRREV_I16
1980
1982 {
1983 } // ~Inst_VOP3__V_ASHRREV_I16
1984
1985 // --- description from .arch file ---
1986 // D.i[15:0] = signext(S1.i[15:0]) >> S0.i[3:0].
1987 // The vacated bits are set to the sign bit of the input value.
1988 // SQ translates this to an internal SP opcode.
1989 void
1991 {
1992 Wavefront *wf = gpuDynInst->wavefront();
1993 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
1994 ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
1995 VecOperandI16 vdst(gpuDynInst, instData.VDST);
1996
1997 src0.readSrc();
1998 src1.readSrc();
1999
2003 assert(!(instData.ABS & 0x1));
2004 assert(!(instData.ABS & 0x2));
2005 assert(!(instData.ABS & 0x4));
2006 assert(!(extData.NEG & 0x1));
2007 assert(!(extData.NEG & 0x2));
2008 assert(!(extData.NEG & 0x4));
2009
2010 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2011 if (wf->execMask(lane)) {
2012 vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0);
2013 }
2014 }
2015
2016 vdst.write();
2017 } // execute
2018 // --- Inst_VOP3__V_MAX_F16 class methods ---
2019
2021 : Inst_VOP3A(iFmt, "v_max_f16", false)
2022 {
2023 setFlag(ALU);
2024 setFlag(F16);
2025 } // Inst_VOP3__V_MAX_F16
2026
2028 {
2029 } // ~Inst_VOP3__V_MAX_F16
2030
2031 // --- description from .arch file ---
2032 // D.f16 = max(S0.f16, S1.f16).
2033 // IEEE compliant. Supports denormals, round mode, exception flags,
2034 // saturation.
2035 void
2037 {
2039 } // execute
2040 // --- Inst_VOP3__V_MIN_F16 class methods ---
2041
2043 : Inst_VOP3A(iFmt, "v_min_f16", false)
2044 {
2045 setFlag(ALU);
2046 setFlag(F16);
2047 } // Inst_VOP3__V_MIN_F16
2048
2050 {
2051 } // ~Inst_VOP3__V_MIN_F16
2052
2053 // --- description from .arch file ---
2054 // D.f16 = min(S0.f16, S1.f16).
2055 // IEEE compliant. Supports denormals, round mode, exception flags,
2056 // saturation.
2057 void
2059 {
2061 } // execute
2062 // --- Inst_VOP3__V_MAX_U16 class methods ---
2063
2065 : Inst_VOP3A(iFmt, "v_max_u16", false)
2066 {
2067 setFlag(ALU);
2068 } // Inst_VOP3__V_MAX_U16
2069
2071 {
2072 } // ~Inst_VOP3__V_MAX_U16
2073
2074 // --- description from .arch file ---
2075 // D.u[15:0] = max(S0.u[15:0], S1.u[15:0]).
2076 void
2078 {
2079 Wavefront *wf = gpuDynInst->wavefront();
2080 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
2081 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
2082 VecOperandU16 vdst(gpuDynInst, instData.VDST);
2083
2084 src0.readSrc();
2085 src1.readSrc();
2086
2087 if (instData.ABS & 0x1) {
2088 src0.absModifier();
2089 }
2090
2091 if (instData.ABS & 0x2) {
2092 src1.absModifier();
2093 }
2094
2095 if (extData.NEG & 0x1) {
2096 src0.negModifier();
2097 }
2098
2099 if (extData.NEG & 0x2) {
2100 src1.negModifier();
2101 }
2102
2103 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2104 if (wf->execMask(lane)) {
2105 vdst[lane] = std::max(src0[lane], src1[lane]);
2106 }
2107 }
2108
2109 vdst.write();
2110 } // execute
2111 // --- Inst_VOP3__V_MAX_I16 class methods ---
2112
2114 : Inst_VOP3A(iFmt, "v_max_i16", false)
2115 {
2116 setFlag(ALU);
2117 } // Inst_VOP3__V_MAX_I16
2118
2120 {
2121 } // ~Inst_VOP3__V_MAX_I16
2122
2123 // --- description from .arch file ---
2124 // D.i[15:0] = max(S0.i[15:0], S1.i[15:0]).
2125 void
2127 {
2128 Wavefront *wf = gpuDynInst->wavefront();
2129 ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
2130 ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
2131 VecOperandI16 vdst(gpuDynInst, instData.VDST);
2132
2133 src0.readSrc();
2134 src1.readSrc();
2135
2136 if (instData.ABS & 0x1) {
2137 src0.absModifier();
2138 }
2139
2140 if (instData.ABS & 0x2) {
2141 src1.absModifier();
2142 }
2143
2144 if (extData.NEG & 0x1) {
2145 src0.negModifier();
2146 }
2147
2148 if (extData.NEG & 0x2) {
2149 src1.negModifier();
2150 }
2151
2152 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2153 if (wf->execMask(lane)) {
2154 vdst[lane] = std::max(src0[lane], src1[lane]);
2155 }
2156 }
2157
2158 vdst.write();
2159 } // execute
2160 // --- Inst_VOP3__V_MIN_U16 class methods ---
2161
2163 : Inst_VOP3A(iFmt, "v_min_u16", false)
2164 {
2165 setFlag(ALU);
2166 } // Inst_VOP3__V_MIN_U16
2167
2169 {
2170 } // ~Inst_VOP3__V_MIN_U16
2171
2172 // --- description from .arch file ---
2173 // D.u[15:0] = min(S0.u[15:0], S1.u[15:0]).
2174 void
2176 {
2177 Wavefront *wf = gpuDynInst->wavefront();
2178 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
2179 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
2180 VecOperandU16 vdst(gpuDynInst, instData.VDST);
2181
2182 src0.readSrc();
2183 src1.readSrc();
2184
2185 if (instData.ABS & 0x1) {
2186 src0.absModifier();
2187 }
2188
2189 if (instData.ABS & 0x2) {
2190 src1.absModifier();
2191 }
2192
2193 if (extData.NEG & 0x1) {
2194 src0.negModifier();
2195 }
2196
2197 if (extData.NEG & 0x2) {
2198 src1.negModifier();
2199 }
2200
2201 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2202 if (wf->execMask(lane)) {
2203 vdst[lane] = std::min(src0[lane], src1[lane]);
2204 }
2205 }
2206
2207 vdst.write();
2208 } // execute
2209 // --- Inst_VOP3__V_MIN_I16 class methods ---
2210
2212 : Inst_VOP3A(iFmt, "v_min_i16", false)
2213 {
2214 setFlag(ALU);
2215 } // Inst_VOP3__V_MIN_I16
2216
2218 {
2219 } // ~Inst_VOP3__V_MIN_I16
2220
2221 // --- description from .arch file ---
2222 // D.i[15:0] = min(S0.i[15:0], S1.i[15:0]).
2223 void
2225 {
2226 Wavefront *wf = gpuDynInst->wavefront();
2227 ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
2228 ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
2229 VecOperandI16 vdst(gpuDynInst, instData.VDST);
2230
2231 src0.readSrc();
2232 src1.readSrc();
2233
2234 if (instData.ABS & 0x1) {
2235 src0.absModifier();
2236 }
2237
2238 if (instData.ABS & 0x2) {
2239 src1.absModifier();
2240 }
2241
2242 if (extData.NEG & 0x1) {
2243 src0.negModifier();
2244 }
2245
2246 if (extData.NEG & 0x2) {
2247 src1.negModifier();
2248 }
2249
2250 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2251 if (wf->execMask(lane)) {
2252 vdst[lane] = std::min(src0[lane], src1[lane]);
2253 }
2254 }
2255
2256 vdst.write();
2257 } // execute
2258 // --- Inst_VOP3__V_LDEXP_F16 class methods ---
2259
2261 : Inst_VOP3A(iFmt, "v_ldexp_f16", false)
2262 {
2263 setFlag(ALU);
2264 setFlag(F16);
2265 } // Inst_VOP3__V_LDEXP_F16
2266
2268 {
2269 } // ~Inst_VOP3__V_LDEXP_F16
2270
2271 // --- description from .arch file ---
2272 // D.f16 = S0.f16 * (2 ** S1.i16).
2273 void
2275 {
2277 } // execute
2278 // --- Inst_VOP3__V_ADD_U32 class methods ---
2279
2281 : Inst_VOP3A(iFmt, "v_add_u32", false)
2282 {
2283 setFlag(ALU);
2284 } // Inst_VOP3__V_ADD_U32
2285
2287 {
2288 } // ~Inst_VOP3__V_ADD_U32
2289
2290 // --- description from .arch file ---
2291 // D.u32 = S0.u32 + S1.u32.
2292 void
2294 {
2295 Wavefront *wf = gpuDynInst->wavefront();
2296 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
2297 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
2298 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2299
2300 src0.readSrc();
2301 src1.readSrc();
2302
2306 assert(!(instData.ABS & 0x1));
2307 assert(!(instData.ABS & 0x2));
2308 assert(!(instData.ABS & 0x4));
2309 assert(!(extData.NEG & 0x1));
2310 assert(!(extData.NEG & 0x2));
2311 assert(!(extData.NEG & 0x4));
2312
2313 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2314 if (wf->execMask(lane)) {
2315 vdst[lane] = src0[lane] + src1[lane];
2316 }
2317 }
2318
2319 vdst.write();
2320 } // execute
2321 // --- Inst_VOP3__V_SUB_U32 class methods ---
2322
2324 : Inst_VOP3A(iFmt, "v_sub_u32", false)
2325 {
2326 setFlag(ALU);
2327 } // Inst_VOP3__V_SUB_U32
2328
2330 {
2331 } // ~Inst_VOP3__V_SUB_U32
2332
2333 // --- description from .arch file ---
2334 // D.u32 = S0.u32 - S1.u32.
2335 void
2337 {
2338 Wavefront *wf = gpuDynInst->wavefront();
2339 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
2340 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
2341 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2342
2343 src0.readSrc();
2344 src1.readSrc();
2345
2349 assert(!(instData.ABS & 0x1));
2350 assert(!(instData.ABS & 0x2));
2351 assert(!(instData.ABS & 0x4));
2352 assert(!(extData.NEG & 0x1));
2353 assert(!(extData.NEG & 0x2));
2354 assert(!(extData.NEG & 0x4));
2355
2356 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2357 if (wf->execMask(lane)) {
2358 vdst[lane] = src0[lane] - src1[lane];
2359 }
2360 }
2361
2362 vdst.write();
2363 } // execute
2364 // --- Inst_VOP3__V_SUBREV_U32 class methods ---
2365
2367 : Inst_VOP3A(iFmt, "v_subrev_u32", false)
2368 {
2369 setFlag(ALU);
2370 } // Inst_VOP3__V_SUBREV_U32
2371
2373 {
2374 } // ~Inst_VOP3__V_SUBREV_U32
2375
2376 // --- description from .arch file ---
2377 // D.u32 = S1.u32 - S0.u32.
2378 void
2380 {
2381 Wavefront *wf = gpuDynInst->wavefront();
2382 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
2383 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
2384 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2385
2386 src0.readSrc();
2387 src1.readSrc();
2388
2392 assert(!(instData.ABS & 0x1));
2393 assert(!(instData.ABS & 0x2));
2394 assert(!(instData.ABS & 0x4));
2395 assert(!(extData.NEG & 0x1));
2396 assert(!(extData.NEG & 0x2));
2397 assert(!(extData.NEG & 0x4));
2398
2399 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2400 if (wf->execMask(lane)) {
2401 vdst[lane] = src1[lane] - src0[lane];
2402 }
2403 }
2404
2405 vdst.write();
2406 } // execute
2407 // --- Inst_VOP3__V_FMAC_F32 class methods ---
2408
2410 : Inst_VOP3A(iFmt, "v_fmac_f32", false)
2411 {
2412 setFlag(ALU);
2413 setFlag(F32);
2414 setFlag(FMA);
2415 } // Inst_VOP3__V_FMAC_F32
2416
2418 {
2419 } // ~Inst_VOP3__V_FMAC_F32
2420
2421 // --- description from .arch file ---
2422 // D.f = S0.f * S1.f + D.f.
2423 void
2425 {
2426 Wavefront *wf = gpuDynInst->wavefront();
2427 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
2428 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
2429 VecOperandF32 vdst(gpuDynInst, instData.VDST);
2430
2431 src0.readSrc();
2432 src1.readSrc();
2433 vdst.read();
2434
2435 panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
2436 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2437 panic_if(instData.OPSEL, "OPSEL not implemented for %s", _opcode);
2438
2439 if (instData.ABS & 0x1) {
2440 src0.absModifier();
2441 }
2442
2443 if (instData.ABS & 0x2) {
2444 src1.absModifier();
2445 }
2446
2447 if (instData.ABS & 0x4) {
2448 vdst.absModifier();
2449 }
2450
2451 if (extData.NEG & 0x1) {
2452 src0.negModifier();
2453 }
2454
2455 if (extData.NEG & 0x2) {
2456 src1.negModifier();
2457 }
2458
2459 if (extData.NEG & 0x4) {
2460 vdst.negModifier();
2461 }
2462
2463 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2464 if (wf->execMask(lane)) {
2465 float out = std::fma(src0[lane], src1[lane], vdst[lane]);
2466 out = omodModifier(out, extData.OMOD);
2467 out = std::clamp(vdst[lane], 0.0f, 1.0f);
2468 vdst[lane] = out;
2469 }
2470 }
2471
2472 vdst.write();
2473 } // execute
2474 // --- Inst_VOP3__V_NOP class methods ---
2475
2477 : Inst_VOP3A(iFmt, "v_nop", false)
2478 {
2479 setFlag(Nop);
2480 setFlag(ALU);
2481 } // Inst_VOP3__V_NOP
2482
2484 {
2485 } // ~Inst_VOP3__V_NOP
2486
2487 // --- description from .arch file ---
2488 // Do nothing.
2489 void
2491 {
2492 } // execute
2493 // --- Inst_VOP3__V_MOV_B32 class methods ---
2494
2496 : Inst_VOP3A(iFmt, "v_mov_b32", false)
2497 {
2498 setFlag(ALU);
2499 } // Inst_VOP3__V_MOV_B32
2500
2502 {
2503 } // ~Inst_VOP3__V_MOV_B32
2504
2505 // --- description from .arch file ---
2506 // D.u = S0.u.
2507 // Input and output modifiers not supported; this is an untyped operation.
2508 void
2510 {
2511 Wavefront *wf = gpuDynInst->wavefront();
2512 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
2513 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2514
2515 src.readSrc();
2516
2517 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2518 if (wf->execMask(lane)) {
2519 vdst[lane] = src[lane];
2520 }
2521 }
2522
2523 vdst.write();
2524 } // execute
2525 // --- Inst_VOP3__V_CVT_I32_F64 class methods ---
2526
2528 : Inst_VOP3A(iFmt, "v_cvt_i32_f64", false)
2529 {
2530 setFlag(ALU);
2531 setFlag(F64);
2532 } // Inst_VOP3__V_CVT_I32_F64
2533
2535 {
2536 } // ~Inst_VOP3__V_CVT_I32_F64
2537
2538 // --- description from .arch file ---
2539 // D.i = (int)S0.d.
2540 // Out-of-range floating point values (including infinity) saturate. NaN is
2541 // --- converted to 0.
2542 void
2544 {
2545 Wavefront *wf = gpuDynInst->wavefront();
2546 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
2547 VecOperandI32 vdst(gpuDynInst, instData.VDST);
2548
2549 src.readSrc();
2550
2551 if (instData.ABS & 0x1) {
2552 src.absModifier();
2553 }
2554
2555 if (extData.NEG & 0x1) {
2556 src.negModifier();
2557 }
2558
2559 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2560 if (wf->execMask(lane)) {
2561 int exp;
2562 std::frexp(src[lane],&exp);
2563 if (std::isnan(src[lane])) {
2564 vdst[lane] = 0;
2565 } else if (std::isinf(src[lane]) || exp > 30) {
2566 if (std::signbit(src[lane])) {
2567 vdst[lane] = INT_MIN;
2568 } else {
2569 vdst[lane] = INT_MAX;
2570 }
2571 } else {
2572 vdst[lane] = (VecElemI32)src[lane];
2573 }
2574 }
2575 }
2576
2577 vdst.write();
2578 } // execute
2579 // --- Inst_VOP3__V_CVT_F64_I32 class methods ---
2580
2582 : Inst_VOP3A(iFmt, "v_cvt_f64_i32", false)
2583 {
2584 setFlag(ALU);
2585 setFlag(F64);
2586 } // Inst_VOP3__V_CVT_F64_I32
2587
2589 {
2590 } // ~Inst_VOP3__V_CVT_F64_I32
2591
2592 // --- description from .arch file ---
2593 // D.d = (double)S0.i.
2594 void
2596 {
2597 Wavefront *wf = gpuDynInst->wavefront();
2598 ConstVecOperandI32 src(gpuDynInst, extData.SRC0);
2599 VecOperandF64 vdst(gpuDynInst, instData.VDST);
2600
2601 src.readSrc();
2602
2603 if (instData.ABS & 0x1) {
2604 src.absModifier();
2605 }
2606
2607 if (extData.NEG & 0x1) {
2608 src.negModifier();
2609 }
2610
2611 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2612 if (wf->execMask(lane)) {
2613 vdst[lane] = (VecElemF64)src[lane];
2614 }
2615 }
2616
2617 vdst.write();
2618 } // execute
2619 // --- Inst_VOP3__V_CVT_F32_I32 class methods ---
2620
2622 : Inst_VOP3A(iFmt, "v_cvt_f32_i32", false)
2623 {
2624 setFlag(ALU);
2625 setFlag(F32);
2626 } // Inst_VOP3__V_CVT_F32_I32
2627
2629 {
2630 } // ~Inst_VOP3__V_CVT_F32_I32
2631
2632 // --- description from .arch file ---
2633 // D.f = (float)S0.i.
2634 void
2636 {
2637 Wavefront *wf = gpuDynInst->wavefront();
2638 VecOperandI32 src(gpuDynInst, extData.SRC0);
2639 VecOperandF32 vdst(gpuDynInst, instData.VDST);
2640
2641 src.readSrc();
2642
2646 assert(!(instData.ABS & 0x1));
2647 assert(!(instData.ABS & 0x2));
2648 assert(!(instData.ABS & 0x4));
2649 assert(!(extData.NEG & 0x1));
2650 assert(!(extData.NEG & 0x2));
2651 assert(!(extData.NEG & 0x4));
2652
2653 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2654 if (wf->execMask(lane)) {
2655 vdst[lane] = (VecElemF32)src[lane];
2656 }
2657 }
2658
2659 vdst.write();
2660 } // execute
2661 // --- Inst_VOP3__V_CVT_F32_U32 class methods ---
2662
2664 : Inst_VOP3A(iFmt, "v_cvt_f32_u32", false)
2665 {
2666 setFlag(ALU);
2667 setFlag(F32);
2668 } // Inst_VOP3__V_CVT_F32_U32
2669
2671 {
2672 } // ~Inst_VOP3__V_CVT_F32_U32
2673
2674 // --- description from .arch file ---
2675 // D.f = (float)S0.u.
2676 void
2678 {
2679 Wavefront *wf = gpuDynInst->wavefront();
2680 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
2681 VecOperandF32 vdst(gpuDynInst, instData.VDST);
2682
2683 src.readSrc();
2684
2685 if (instData.ABS & 0x1) {
2686 src.absModifier();
2687 }
2688
2689 if (extData.NEG & 0x1) {
2690 src.negModifier();
2691 }
2692
2693 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2694 if (wf->execMask(lane)) {
2695 vdst[lane] = (VecElemF32)src[lane];
2696 }
2697 }
2698
2699 vdst.write();
2700 } // execute
2701 // --- Inst_VOP3__V_CVT_U32_F32 class methods ---
2702
2704 : Inst_VOP3A(iFmt, "v_cvt_u32_f32", false)
2705 {
2706 setFlag(ALU);
2707 setFlag(F32);
2708 } // Inst_VOP3__V_CVT_U32_F32
2709
2711 {
2712 } // ~Inst_VOP3__V_CVT_U32_F32
2713
2714 // --- description from .arch file ---
2715 // D.u = (unsigned)S0.f.
2716 // Out-of-range floating point values (including infinity) saturate. NaN is
2717 // --- converted to 0.
2718 void
2720 {
2721 Wavefront *wf = gpuDynInst->wavefront();
2722 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
2723 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2724
2725 src.readSrc();
2726
2727 if (instData.ABS & 0x1) {
2728 src.absModifier();
2729 }
2730
2731 if (extData.NEG & 0x1) {
2732 src.negModifier();
2733 }
2734
2735 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2736 if (wf->execMask(lane)) {
2737 int exp;
2738 std::frexp(src[lane],&exp);
2739 if (std::isnan(src[lane])) {
2740 vdst[lane] = 0;
2741 } else if (std::isinf(src[lane])) {
2742 if (std::signbit(src[lane])) {
2743 vdst[lane] = 0;
2744 } else {
2745 vdst[lane] = UINT_MAX;
2746 }
2747 } else if (exp > 31) {
2748 vdst[lane] = UINT_MAX;
2749 } else {
2750 vdst[lane] = (VecElemU32)src[lane];
2751 }
2752 }
2753 }
2754
2755 vdst.write();
2756 } // execute
2757 // --- Inst_VOP3__V_CVT_I32_F32 class methods ---
2758
2760 : Inst_VOP3A(iFmt, "v_cvt_i32_f32", false)
2761 {
2762 setFlag(ALU);
2763 setFlag(F32);
2764 } // Inst_VOP3__V_CVT_I32_F32
2765
2767 {
2768 } // ~Inst_VOP3__V_CVT_I32_F32
2769
2770 // --- description from .arch file ---
2771 // D.i = (int)S0.f.
2772 // Out-of-range floating point values (including infinity) saturate. NaN is
2773 // --- converted to 0.
2774 void
2776 {
2777 Wavefront *wf = gpuDynInst->wavefront();
2778 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
2779 VecOperandI32 vdst(gpuDynInst, instData.VDST);
2780
2781 src.readSrc();
2782
2783 if (instData.ABS & 0x1) {
2784 src.absModifier();
2785 }
2786
2787 if (extData.NEG & 0x1) {
2788 src.negModifier();
2789 }
2790
2794 assert(!(instData.ABS & 0x2));
2795 assert(!(instData.ABS & 0x4));
2796 assert(!(extData.NEG & 0x2));
2797 assert(!(extData.NEG & 0x4));
2798
2799 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2800 if (wf->execMask(lane)) {
2801 int exp;
2802 std::frexp(src[lane],&exp);
2803 if (std::isnan(src[lane])) {
2804 vdst[lane] = 0;
2805 } else if (std::isinf(src[lane]) || exp > 30) {
2806 if (std::signbit(src[lane])) {
2807 vdst[lane] = INT_MIN;
2808 } else {
2809 vdst[lane] = INT_MAX;
2810 }
2811 } else {
2812 vdst[lane] = (VecElemI32)src[lane];
2813 }
2814 }
2815 }
2816
2817 vdst.write();
2818 } // execute
2819 // --- Inst_VOP3__V_MOV_FED_B32 class methods ---
2820
2822 : Inst_VOP3A(iFmt, "v_mov_fed_b32", false)
2823 {
2824 setFlag(ALU);
2825 } // Inst_VOP3__V_MOV_FED_B32
2826
2828 {
2829 } // ~Inst_VOP3__V_MOV_FED_B32
2830
2831 // --- description from .arch file ---
2832 // D.u = S0.u;
2833 // Introduce EDC double error upon write to dest vgpr without causing an
2834 // --- exception.
2835 // Input and output modifiers not supported; this is an untyped operation.
2836 void
2838 {
2840 } // execute
2841 // --- Inst_VOP3__V_CVT_F16_F32 class methods ---
2842
2844 : Inst_VOP3A(iFmt, "v_cvt_f16_f32", false)
2845 {
2846 setFlag(ALU);
2847 setFlag(F32);
2848 } // Inst_VOP3__V_CVT_F16_F32
2849
2851 {
2852 } // ~Inst_VOP3__V_CVT_F16_F32
2853
2854 // --- description from .arch file ---
2855 // D.f16 = flt32_to_flt16(S0.f).
2856 // Supports input modifiers and creates FP16 denormals when appropriate.
2857 void
2859 {
2860 Wavefront *wf = gpuDynInst->wavefront();
2861 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
2862 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2863
2864 src0.readSrc();
2865 vdst.read();
2866
2867 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
2868 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2869
2870 unsigned abs = instData.ABS;
2871 unsigned neg = extData.NEG;
2872 int opsel = instData.OPSEL;
2873
2874 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2875 if (wf->execMask(lane)) {
2876 float tmp = src0[lane];
2877
2878 if ((abs & 1) && (tmp < 0)) tmp = -tmp;
2879 if (neg & 1) tmp = -tmp;
2880
2881 tmp = omodModifier(tmp, extData.OMOD);
2882 tmp = std::clamp(tmp, 0.0f, 1.0f);
2883
2884 AMDGPU::mxfloat16 out(tmp);
2885
2886 // If opsel[3] use upper 16-bits of dest, otherwise lower.
2887 if (opsel & 8) {
2888 replaceBits(vdst[lane], 31, 16, (out.data >> 16));
2889 } else {
2890 replaceBits(vdst[lane], 15, 0, (out.data >> 16));
2891 }
2892 }
2893 }
2894
2895 vdst.write();
2896 } // execute
2897 // --- Inst_VOP3__V_CVT_F32_F16 class methods ---
2898
2900 : Inst_VOP3A(iFmt, "v_cvt_f32_f16", false)
2901 {
2902 setFlag(ALU);
2903 setFlag(F32);
2904 } // Inst_VOP3__V_CVT_F32_F16
2905
2907 {
2908 } // ~Inst_VOP3__V_CVT_F32_F16
2909
2910 // --- description from .arch file ---
2911 // D.f = flt16_to_flt32(S0.f16).
2912 // FP16 denormal inputs are always accepted.
2913 void
2915 {
2916 Wavefront *wf = gpuDynInst->wavefront();
2917 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
2918 VecOperandF32 vdst(gpuDynInst, instData.VDST);
2919
2920 src0.readSrc();
2921
2922 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
2923 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2924 panic_if(instData.OPSEL, "OPSEL not implemented for %s", _opcode);
2925
2926 unsigned abs = instData.ABS;
2927 unsigned neg = extData.NEG;
2928
2929 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2930 if (wf->execMask(lane)) {
2931 AMDGPU::mxfloat16 tmp(src0[lane]);
2932
2933 if ((abs & 1) && (tmp < 0)) tmp = -tmp;
2934 if (neg & 1) tmp = -tmp;
2935
2936 float out = omodModifier(float(tmp), extData.OMOD);
2937 out = std::clamp(out, 0.0f, 1.0f);
2938
2939 vdst[lane] = out;
2940 }
2941 }
2942
2943 vdst.write();
2944 } // execute
2945 // --- Inst_VOP3__V_CVT_RPI_I32_F32 class methods ---
2946
2948 InFmt_VOP3A *iFmt)
2949 : Inst_VOP3A(iFmt, "v_cvt_rpi_i32_f32", false)
2950 {
2951 setFlag(ALU);
2952 setFlag(F32);
2953 } // Inst_VOP3__V_CVT_RPI_I32_F32
2954
2956 {
2957 } // ~Inst_VOP3__V_CVT_RPI_I32_F32
2958
2959 // --- description from .arch file ---
2960 // D.i = (int)floor(S0.f + 0.5).
2961 void
2963 {
2964 Wavefront *wf = gpuDynInst->wavefront();
2965 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
2966 VecOperandI32 vdst(gpuDynInst, instData.VDST);
2967
2968 src.readSrc();
2969
2970 if (instData.ABS & 0x1) {
2971 src.absModifier();
2972 }
2973
2974 if (extData.NEG & 0x1) {
2975 src.negModifier();
2976 }
2977
2978 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2979 if (wf->execMask(lane)) {
2980 vdst[lane] = (VecElemI32)std::floor(src[lane] + 0.5);
2981 }
2982 }
2983
2984 vdst.write();
2985 } // execute
2986 // --- Inst_VOP3__V_CVT_FLR_I32_F32 class methods ---
2987
2989 InFmt_VOP3A *iFmt)
2990 : Inst_VOP3A(iFmt, "v_cvt_flr_i32_f32", false)
2991 {
2992 setFlag(ALU);
2993 setFlag(F32);
2994 } // Inst_VOP3__V_CVT_FLR_I32_F32
2995
2997 {
2998 } // ~Inst_VOP3__V_CVT_FLR_I32_F32
2999
3000 // --- description from .arch file ---
3001 // D.i = (int)floor(S0.f).
3002 void
3004 {
3005 Wavefront *wf = gpuDynInst->wavefront();
3006 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3007 VecOperandI32 vdst(gpuDynInst, instData.VDST);
3008
3009 src.readSrc();
3010
3011 if (instData.ABS & 0x1) {
3012 src.absModifier();
3013 }
3014
3015 if (extData.NEG & 0x1) {
3016 src.negModifier();
3017 }
3018
3019 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3020 if (wf->execMask(lane)) {
3021 vdst[lane] = (VecElemI32)std::floor(src[lane]);
3022 }
3023 }
3024
3025 vdst.write();
3026 } // execute
3027 // --- Inst_VOP3__V_CVT_OFF_F32_I4 class methods ---
3028
3030 : Inst_VOP3A(iFmt, "v_cvt_off_f32_i4", false)
3031 {
3032 setFlag(ALU);
3033 setFlag(F32);
3034 } // Inst_VOP3__V_CVT_OFF_F32_I4
3035
3037 {
3038 } // ~Inst_VOP3__V_CVT_OFF_F32_I4
3039
3040 // --- description from .arch file ---
3041 // 4-bit signed int to 32-bit float. Used for interpolation in shader.
3042 void
3044 {
3045 // Could not parse sq_uc.arch desc field
3047 } // execute
3048 // --- Inst_VOP3__V_CVT_F32_F64 class methods ---
3049
3051 : Inst_VOP3A(iFmt, "v_cvt_f32_f64", false)
3052 {
3053 setFlag(ALU);
3054 setFlag(F64);
3055 } // Inst_VOP3__V_CVT_F32_F64
3056
3058 {
3059 } // ~Inst_VOP3__V_CVT_F32_F64
3060
3061 // --- description from .arch file ---
3062 // D.f = (float)S0.d.
3063 void
3065 {
3066 Wavefront *wf = gpuDynInst->wavefront();
3067 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
3068 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3069
3070 src.readSrc();
3071
3072 if (instData.ABS & 0x1) {
3073 src.absModifier();
3074 }
3075
3076 if (extData.NEG & 0x1) {
3077 src.negModifier();
3078 }
3079
3083 assert(!(instData.ABS & 0x2));
3084 assert(!(instData.ABS & 0x4));
3085 assert(!(extData.NEG & 0x2));
3086 assert(!(extData.NEG & 0x4));
3087
3088 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3089 if (wf->execMask(lane)) {
3090 vdst[lane] = (VecElemF32)src[lane];
3091 }
3092 }
3093
3094 vdst.write();
3095 } // execute
3096 // --- Inst_VOP3__V_CVT_F64_F32 class methods ---
3097
3099 : Inst_VOP3A(iFmt, "v_cvt_f64_f32", false)
3100 {
3101 setFlag(ALU);
3102 setFlag(F64);
3103 } // Inst_VOP3__V_CVT_F64_F32
3104
3106 {
3107 } // ~Inst_VOP3__V_CVT_F64_F32
3108
3109 // --- description from .arch file ---
3110 // D.d = (double)S0.f.
3111 void
3113 {
3114 Wavefront *wf = gpuDynInst->wavefront();
3115 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3116 VecOperandF64 vdst(gpuDynInst, instData.VDST);
3117
3118 src.readSrc();
3119
3120 if (instData.ABS & 0x1) {
3121 src.absModifier();
3122 }
3123
3124 if (extData.NEG & 0x1) {
3125 src.negModifier();
3126 }
3127
3131 assert(!(instData.ABS & 0x2));
3132 assert(!(instData.ABS & 0x4));
3133 assert(!(extData.NEG & 0x2));
3134 assert(!(extData.NEG & 0x4));
3135
3136 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3137 if (wf->execMask(lane)) {
3138 vdst[lane] = (VecElemF64)src[lane];
3139 }
3140 }
3141
3142 vdst.write();
3143 } // execute
3144 // --- Inst_VOP3__V_CVT_F32_UBYTE0 class methods ---
3145
3147 : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte0", false)
3148 {
3149 setFlag(ALU);
3150 setFlag(F32);
3151 } // Inst_VOP3__V_CVT_F32_UBYTE0
3152
3154 {
3155 } // ~Inst_VOP3__V_CVT_F32_UBYTE0
3156
3157 // --- description from .arch file ---
3158 // D.f = (float)(S0.u[7:0]).
3159 void
3161 {
3162 Wavefront *wf = gpuDynInst->wavefront();
3163 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
3164 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3165
3166 src.readSrc();
3167
3168 if (instData.ABS & 0x1) {
3169 src.absModifier();
3170 }
3171
3172 if (extData.NEG & 0x1) {
3173 src.negModifier();
3174 }
3175
3176 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3177 if (wf->execMask(lane)) {
3178 vdst[lane] = (VecElemF32)bits(src[lane], 7, 0);
3179 }
3180 }
3181
3182 vdst.write();
3183 } // execute
3184 // --- Inst_VOP3__V_CVT_F32_UBYTE1 class methods ---
3185
3187 : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte1", false)
3188 {
3189 setFlag(ALU);
3190 setFlag(F32);
3191 } // Inst_VOP3__V_CVT_F32_UBYTE1
3192
3194 {
3195 } // ~Inst_VOP3__V_CVT_F32_UBYTE1
3196
3197 // --- description from .arch file ---
3198 // D.f = (float)(S0.u[15:8]).
3199 void
3201 {
3202 Wavefront *wf = gpuDynInst->wavefront();
3203 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
3204 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3205
3206 src.readSrc();
3207
3208 if (instData.ABS & 0x1) {
3209 src.absModifier();
3210 }
3211
3212 if (extData.NEG & 0x1) {
3213 src.negModifier();
3214 }
3215
3216 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3217 if (wf->execMask(lane)) {
3218 vdst[lane] = (VecElemF32)bits(src[lane], 15, 8);
3219 }
3220 }
3221
3222 vdst.write();
3223 } // execute
3224 // --- Inst_VOP3__V_CVT_F32_UBYTE2 class methods ---
3225
3227 : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte2", false)
3228 {
3229 setFlag(ALU);
3230 setFlag(F32);
3231 } // Inst_VOP3__V_CVT_F32_UBYTE2
3232
3234 {
3235 } // ~Inst_VOP3__V_CVT_F32_UBYTE2
3236
3237 // --- description from .arch file ---
3238 // D.f = (float)(S0.u[23:16]).
3239 void
3241 {
3242 Wavefront *wf = gpuDynInst->wavefront();
3243 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
3244 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3245
3246 src.readSrc();
3247
3248 if (instData.ABS & 0x1) {
3249 src.absModifier();
3250 }
3251
3252 if (extData.NEG & 0x1) {
3253 src.negModifier();
3254 }
3255
3256 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3257 if (wf->execMask(lane)) {
3258 vdst[lane] = (VecElemF32)bits(src[lane], 23, 16);
3259 }
3260 }
3261
3262 vdst.write();
3263 } // execute
3264 // --- Inst_VOP3__V_CVT_F32_UBYTE3 class methods ---
3265
3267 : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte3", false)
3268 {
3269 setFlag(ALU);
3270 setFlag(F32);
3271 } // Inst_VOP3__V_CVT_F32_UBYTE3
3272
3274 {
3275 } // ~Inst_VOP3__V_CVT_F32_UBYTE3
3276
3277 // --- description from .arch file ---
3278 // D.f = (float)(S0.u[31:24]).
3279 void
3281 {
3282 Wavefront *wf = gpuDynInst->wavefront();
3283 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
3284 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3285
3286 src.readSrc();
3287
3288 if (instData.ABS & 0x1) {
3289 src.absModifier();
3290 }
3291
3292 if (extData.NEG & 0x1) {
3293 src.negModifier();
3294 }
3295
3296 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3297 if (wf->execMask(lane)) {
3298 vdst[lane] = (VecElemF32)bits(src[lane], 31, 24);
3299 }
3300 }
3301
3302 vdst.write();
3303 } // execute
3304 // --- Inst_VOP3__V_CVT_U32_F64 class methods ---
3305
3307 : Inst_VOP3A(iFmt, "v_cvt_u32_f64", false)
3308 {
3309 setFlag(ALU);
3310 setFlag(F64);
3311 } // Inst_VOP3__V_CVT_U32_F64
3312
3314 {
3315 } // ~Inst_VOP3__V_CVT_U32_F64
3316
3317 // --- description from .arch file ---
3318 // D.u = (unsigned)S0.d.
3319 // Out-of-range floating point values (including infinity) saturate. NaN is
3320 // --- converted to 0.
3321 void
3323 {
3324 Wavefront *wf = gpuDynInst->wavefront();
3325 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
3326 VecOperandU32 vdst(gpuDynInst, instData.VDST);
3327
3328 src.readSrc();
3329
3330 if (instData.ABS & 0x1) {
3331 src.absModifier();
3332 }
3333
3334 if (extData.NEG & 0x1) {
3335 src.negModifier();
3336 }
3337
3338 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3339 if (wf->execMask(lane)) {
3340 int exp;
3341 std::frexp(src[lane],&exp);
3342 if (std::isnan(src[lane])) {
3343 vdst[lane] = 0;
3344 } else if (std::isinf(src[lane])) {
3345 if (std::signbit(src[lane])) {
3346 vdst[lane] = 0;
3347 } else {
3348 vdst[lane] = UINT_MAX;
3349 }
3350 } else if (exp > 31) {
3351 vdst[lane] = UINT_MAX;
3352 } else {
3353 vdst[lane] = (VecElemU32)src[lane];
3354 }
3355 }
3356 }
3357
3358 vdst.write();
3359 } // execute
3360 // --- Inst_VOP3__V_CVT_F64_U32 class methods ---
3361
3363 : Inst_VOP3A(iFmt, "v_cvt_f64_u32", false)
3364 {
3365 setFlag(ALU);
3366 setFlag(F64);
3367 } // Inst_VOP3__V_CVT_F64_U32
3368
3370 {
3371 } // ~Inst_VOP3__V_CVT_F64_U32
3372
3373 // --- description from .arch file ---
3374 // D.d = (double)S0.u.
3375 void
3377 {
3378 Wavefront *wf = gpuDynInst->wavefront();
3379 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
3380 VecOperandF64 vdst(gpuDynInst, instData.VDST);
3381
3382 src.readSrc();
3383
3384 if (instData.ABS & 0x1) {
3385 src.absModifier();
3386 }
3387
3388 if (extData.NEG & 0x1) {
3389 src.negModifier();
3390 }
3391
3392 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3393 if (wf->execMask(lane)) {
3394 vdst[lane] = (VecElemF64)src[lane];
3395 }
3396 }
3397
3398 vdst.write();
3399 } // execute
3400 // --- Inst_VOP3__V_TRUNC_F64 class methods ---
3401
3403 : Inst_VOP3A(iFmt, "v_trunc_f64", false)
3404 {
3405 setFlag(ALU);
3406 setFlag(F64);
3407 } // Inst_VOP3__V_TRUNC_F64
3408
3410 {
3411 } // ~Inst_VOP3__V_TRUNC_F64
3412
3413 // --- description from .arch file ---
3414 // D.d = trunc(S0.d), return integer part of S0.d.
3415 void
3417 {
3418 Wavefront *wf = gpuDynInst->wavefront();
3419 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
3420 VecOperandF64 vdst(gpuDynInst, instData.VDST);
3421
3422 src.readSrc();
3423
3424 if (instData.ABS & 0x1) {
3425 src.absModifier();
3426 }
3427
3428 if (extData.NEG & 0x1) {
3429 src.negModifier();
3430 }
3431
3432 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3433 if (wf->execMask(lane)) {
3434 vdst[lane] = std::trunc(src[lane]);
3435 }
3436 }
3437
3438 vdst.write();
3439 } // execute
3440 // --- Inst_VOP3__V_CEIL_F64 class methods ---
3441
3443 : Inst_VOP3A(iFmt, "v_ceil_f64", false)
3444 {
3445 setFlag(ALU);
3446 setFlag(F64);
3447 } // Inst_VOP3__V_CEIL_F64
3448
3450 {
3451 } // ~Inst_VOP3__V_CEIL_F64
3452
3453 // --- description from .arch file ---
3454 // D.d = trunc(S0.d);
3455 // if (S0.d > 0.0 && S0.d != D.d) then D.d += 1.0.
3456 void
3458 {
3459 Wavefront *wf = gpuDynInst->wavefront();
3460 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
3461 VecOperandF64 vdst(gpuDynInst, instData.VDST);
3462
3463 src.readSrc();
3464
3465 if (instData.ABS & 0x1) {
3466 src.absModifier();
3467 }
3468
3469 if (extData.NEG & 0x1) {
3470 src.negModifier();
3471 }
3472
3473 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3474 if (wf->execMask(lane)) {
3475 vdst[lane] = std::ceil(src[lane]);
3476 }
3477 }
3478
3479 vdst.write();
3480 } // execute
3481 // --- Inst_VOP3__V_RNDNE_F64 class methods ---
3482
3484 : Inst_VOP3A(iFmt, "v_rndne_f64", false)
3485 {
3486 setFlag(ALU);
3487 setFlag(F64);
3488 } // Inst_VOP3__V_RNDNE_F64
3489
3491 {
3492 } // ~Inst_VOP3__V_RNDNE_F64
3493
3494 // --- description from .arch file ---
3495 // D.d = round_nearest_even(S0.d).
3496 void
3498 {
3499 Wavefront *wf = gpuDynInst->wavefront();
3500 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
3501 VecOperandF64 vdst(gpuDynInst, instData.VDST);
3502
3503 src.readSrc();
3504
3505 if (instData.ABS & 0x1) {
3506 src.absModifier();
3507 }
3508
3509 if (extData.NEG & 0x1) {
3510 src.negModifier();
3511 }
3512
3513 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3514 if (wf->execMask(lane)) {
3515 vdst[lane] = roundNearestEven(src[lane]);
3516 }
3517 }
3518
3519 vdst.write();
3520 } // execute
3521 // --- Inst_VOP3__V_FLOOR_F64 class methods ---
3522
3524 : Inst_VOP3A(iFmt, "v_floor_f64", false)
3525 {
3526 setFlag(ALU);
3527 setFlag(F64);
3528 } // Inst_VOP3__V_FLOOR_F64
3529
3531 {
3532 } // ~Inst_VOP3__V_FLOOR_F64
3533
3534 // --- description from .arch file ---
3535 // D.d = trunc(S0.d);
3536 // if (S0.d < 0.0 && S0.d != D.d) then D.d += -1.0.
3537 void
3539 {
3540 Wavefront *wf = gpuDynInst->wavefront();
3541 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
3542 VecOperandF64 vdst(gpuDynInst, instData.VDST);
3543
3544 src.readSrc();
3545
3546 if (instData.ABS & 0x1) {
3547 src.absModifier();
3548 }
3549
3550 if (extData.NEG & 0x1) {
3551 src.negModifier();
3552 }
3553
3554 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3555 if (wf->execMask(lane)) {
3556 vdst[lane] = std::floor(src[lane]);
3557 }
3558 }
3559
3560 vdst.write();
3561 } // execute
3562 // --- Inst_VOP3__V_FRACT_F32 class methods ---
3563
3565 : Inst_VOP3A(iFmt, "v_fract_f32", false)
3566 {
3567 setFlag(ALU);
3568 setFlag(F32);
3569 } // Inst_VOP3__V_FRACT_F32
3570
3572 {
3573 } // ~Inst_VOP3__V_FRACT_F32
3574
3575 // --- description from .arch file ---
3576 // D.f = S0.f - floor(S0.f).
3577 void
3579 {
3580 Wavefront *wf = gpuDynInst->wavefront();
3581 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3582 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3583
3584 src.readSrc();
3585
3586 if (instData.ABS & 0x1) {
3587 src.absModifier();
3588 }
3589
3590 if (extData.NEG & 0x1) {
3591 src.negModifier();
3592 }
3593
3594 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3595 if (wf->execMask(lane)) {
3596 VecElemF32 int_part(0.0);
3597 vdst[lane] = std::modf(src[lane], &int_part);
3598 }
3599 }
3600
3601 vdst.write();
3602 } // execute
3603 // --- Inst_VOP3__V_TRUNC_F32 class methods ---
3604
3606 : Inst_VOP3A(iFmt, "v_trunc_f32", false)
3607 {
3608 setFlag(ALU);
3609 setFlag(F32);
3610 } // Inst_VOP3__V_TRUNC_F32
3611
3613 {
3614 } // ~Inst_VOP3__V_TRUNC_F32
3615
3616 // --- description from .arch file ---
3617 // D.f = trunc(S0.f), return integer part of S0.f.
3618 void
3620 {
3621 Wavefront *wf = gpuDynInst->wavefront();
3622 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3623 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3624
3625 src.readSrc();
3626
3627 if (instData.ABS & 0x1) {
3628 src.absModifier();
3629 }
3630
3631 if (extData.NEG & 0x1) {
3632 src.negModifier();
3633 }
3634
3635 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3636 if (wf->execMask(lane)) {
3637 vdst[lane] = std::trunc(src[lane]);
3638 }
3639 }
3640
3641 vdst.write();
3642 } // execute
3643 // --- Inst_VOP3__V_CEIL_F32 class methods ---
3644
3646 : Inst_VOP3A(iFmt, "v_ceil_f32", false)
3647 {
3648 setFlag(ALU);
3649 setFlag(F32);
3650 } // Inst_VOP3__V_CEIL_F32
3651
3653 {
3654 } // ~Inst_VOP3__V_CEIL_F32
3655
3656 // --- description from .arch file ---
3657 // D.f = trunc(S0.f);
3658 // if (S0.f > 0.0 && S0.f != D.f) then D.f += 1.0.
3659 void
3661 {
3662 Wavefront *wf = gpuDynInst->wavefront();
3663 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3664 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3665
3666 src.readSrc();
3667
3668 if (instData.ABS & 0x1) {
3669 src.absModifier();
3670 }
3671
3672 if (extData.NEG & 0x1) {
3673 src.negModifier();
3674 }
3675
3676 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3677 if (wf->execMask(lane)) {
3678 vdst[lane] = std::ceil(src[lane]);
3679 }
3680 }
3681
3682 vdst.write();
3683 } // execute
3684 // --- Inst_VOP3__V_RNDNE_F32 class methods ---
3685
3687 : Inst_VOP3A(iFmt, "v_rndne_f32", false)
3688 {
3689 setFlag(ALU);
3690 setFlag(F32);
3691 } // Inst_VOP3__V_RNDNE_F32
3692
3694 {
3695 } // ~Inst_VOP3__V_RNDNE_F32
3696
3697 // --- description from .arch file ---
3698 // D.f = round_nearest_even(S0.f).
3699 void
3701 {
3702 Wavefront *wf = gpuDynInst->wavefront();
3703 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3704 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3705
3706 src.readSrc();
3707
3708 if (instData.ABS & 0x1) {
3709 src.absModifier();
3710 }
3711
3712 if (extData.NEG & 0x1) {
3713 src.negModifier();
3714 }
3715
3716 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3717 if (wf->execMask(lane)) {
3718 vdst[lane] = roundNearestEven(src[lane]);
3719 }
3720 }
3721
3722 vdst.write();
3723 } // execute
3724 // --- Inst_VOP3__V_FLOOR_F32 class methods ---
3725
3727 : Inst_VOP3A(iFmt, "v_floor_f32", false)
3728 {
3729 setFlag(ALU);
3730 setFlag(F32);
3731 } // Inst_VOP3__V_FLOOR_F32
3732
3734 {
3735 } // ~Inst_VOP3__V_FLOOR_F32
3736
3737 // --- description from .arch file ---
3738 // D.f = trunc(S0.f);
3739 // if (S0.f < 0.0 && S0.f != D.f) then D.f += -1.0.
3740 void
3742 {
3743 Wavefront *wf = gpuDynInst->wavefront();
3744 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3745 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3746
3747 src.readSrc();
3748
3749 if (instData.ABS & 0x1) {
3750 src.absModifier();
3751 }
3752
3753 if (extData.NEG & 0x1) {
3754 src.negModifier();
3755 }
3756
3757 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3758 if (wf->execMask(lane)) {
3759 vdst[lane] = std::floor(src[lane]);
3760 }
3761 }
3762
3763 vdst.write();
3764 } // execute
3765 // --- Inst_VOP3__V_EXP_F32 class methods ---
3766
3768 : Inst_VOP3A(iFmt, "v_exp_f32", false)
3769 {
3770 setFlag(ALU);
3771 setFlag(F32);
3772 } // Inst_VOP3__V_EXP_F32
3773
3775 {
3776 } // ~Inst_VOP3__V_EXP_F32
3777
3778 // --- description from .arch file ---
3779 // D.f = pow(2.0, S0.f).
3780 void
3782 {
3783 Wavefront *wf = gpuDynInst->wavefront();
3784 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3785 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3786
3787 src.readSrc();
3788
3789 if (instData.ABS & 0x1) {
3790 src.absModifier();
3791 }
3792
3793 if (extData.NEG & 0x1) {
3794 src.negModifier();
3795 }
3796
3797 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3798 if (wf->execMask(lane)) {
3799 vdst[lane] = std::pow(2.0, src[lane]);
3800 }
3801 }
3802
3803 vdst.write();
3804 } // execute
3805 // --- Inst_VOP3__V_LOG_F32 class methods ---
3806
3808 : Inst_VOP3A(iFmt, "v_log_f32", false)
3809 {
3810 setFlag(ALU);
3811 setFlag(F32);
3812 } // Inst_VOP3__V_LOG_F32
3813
3815 {
3816 } // ~Inst_VOP3__V_LOG_F32
3817
3818 // --- description from .arch file ---
3819 // D.f = log2(S0.f). Base 2 logarithm.
3820 void
3822 {
3823 Wavefront *wf = gpuDynInst->wavefront();
3824 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3825 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3826
3827 src.readSrc();
3828
3829 if (instData.ABS & 0x1) {
3830 src.absModifier();
3831 }
3832
3833 if (extData.NEG & 0x1) {
3834 src.negModifier();
3835 }
3836
3840 assert(!(instData.ABS & 0x2));
3841 assert(!(instData.ABS & 0x4));
3842 assert(!(extData.NEG & 0x2));
3843 assert(!(extData.NEG & 0x4));
3844
3845 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3846 if (wf->execMask(lane)) {
3847 vdst[lane] = std::log2(src[lane]);
3848 }
3849 }
3850
3851 vdst.write();
3852 } // execute
3853 // --- Inst_VOP3__V_RCP_F32 class methods ---
3854
3856 : Inst_VOP3A(iFmt, "v_rcp_f32", false)
3857 {
3858 setFlag(ALU);
3859 setFlag(F32);
3860 } // Inst_VOP3__V_RCP_F32
3861
3863 {
3864 } // ~Inst_VOP3__V_RCP_F32
3865
3866 // --- description from .arch file ---
3867 // D.f = 1.0 / S0.f. Reciprocal with IEEE rules and < 1ulp error.
3868 void
3870 {
3871 Wavefront *wf = gpuDynInst->wavefront();
3872 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3873 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3874
3875 src.readSrc();
3876
3877 if (instData.ABS & 0x1) {
3878 src.absModifier();
3879 }
3880
3881 if (extData.NEG & 0x1) {
3882 src.negModifier();
3883 }
3884
3885 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3886 if (wf->execMask(lane)) {
3887 vdst[lane] = 1.0 / src[lane];
3888 }
3889 }
3890
3891 vdst.write();
3892 } // execute
3893 // --- Inst_VOP3__V_RCP_IFLAG_F32 class methods ---
3894
3896 : Inst_VOP3A(iFmt, "v_rcp_iflag_f32", false)
3897 {
3898 setFlag(ALU);
3899 setFlag(F32);
3900 } // Inst_VOP3__V_RCP_IFLAG_F32
3901
3903 {
3904 } // ~Inst_VOP3__V_RCP_IFLAG_F32
3905
3906 // --- description from .arch file ---
3907 // D.f = 1.0 / S0.f. Reciprocal intended for integer division, can raise
3908 // --- integer DIV_BY_ZERO exception but cannot raise floating-point
3909 // --- exceptions.
3910 void
3912 {
3913 Wavefront *wf = gpuDynInst->wavefront();
3914 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3915 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3916
3917 src.readSrc();
3918
3919 if (instData.ABS & 0x1) {
3920 src.absModifier();
3921 }
3922
3923 if (extData.NEG & 0x1) {
3924 src.negModifier();
3925 }
3926
3927 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3928 if (wf->execMask(lane)) {
3929 vdst[lane] = 1.0 / src[lane];
3930 }
3931 }
3932
3933 vdst.write();
3934 } // execute
3935 // --- Inst_VOP3__V_RSQ_F32 class methods ---
3936
3938 : Inst_VOP3A(iFmt, "v_rsq_f32", false)
3939 {
3940 setFlag(ALU);
3941 setFlag(F32);
3942 } // Inst_VOP3__V_RSQ_F32
3943
3945 {
3946 } // ~Inst_VOP3__V_RSQ_F32
3947
3948 // --- description from .arch file ---
3949 // D.f = 1.0 / sqrt(S0.f). Reciprocal square root with IEEE rules.
3950 void
3952 {
3953 Wavefront *wf = gpuDynInst->wavefront();
3954 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3955 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3956
3957 src.readSrc();
3958
3959 if (instData.ABS & 0x1) {
3960 src.absModifier();
3961 }
3962
3963 if (extData.NEG & 0x1) {
3964 src.negModifier();
3965 }
3966
3967 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3968 if (wf->execMask(lane)) {
3969 vdst[lane] = 1.0 / std::sqrt(src[lane]);
3970 }
3971 }
3972
3973 vdst.write();
3974 } // execute
3975 // --- Inst_VOP3__V_RCP_F64 class methods ---
3976
3978 : Inst_VOP3A(iFmt, "v_rcp_f64", false)
3979 {
3980 setFlag(ALU);
3981 setFlag(F64);
3982 } // Inst_VOP3__V_RCP_F64
3983
3985 {
3986 } // ~Inst_VOP3__V_RCP_F64
3987
3988 // --- description from .arch file ---
3989 // D.d = 1.0 / S0.d.
3990 void
3992 {
3993 Wavefront *wf = gpuDynInst->wavefront();
3994 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
3995 VecOperandF64 vdst(gpuDynInst, instData.VDST);
3996
3997 src.readSrc();
3998
3999 if (instData.ABS & 0x1) {
4000 src.absModifier();
4001 }
4002
4003 if (extData.NEG & 0x1) {
4004 src.negModifier();
4005 }
4006
4007 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4008 if (wf->execMask(lane)) {
4009 if (std::fpclassify(src[lane]) == FP_ZERO) {
4010 vdst[lane] = +INFINITY;
4011 } else if (std::isnan(src[lane])) {
4012 vdst[lane] = NAN;
4013 } else if (std::isinf(src[lane])) {
4014 if (std::signbit(src[lane])) {
4015 vdst[lane] = -0.0;
4016 } else {
4017 vdst[lane] = 0.0;
4018 }
4019 } else {
4020 vdst[lane] = 1.0 / src[lane];
4021 }
4022 }
4023 }
4024
4025 vdst.write();
4026 } // execute
4027 // --- Inst_VOP3__V_RSQ_F64 class methods ---
4028
4030 : Inst_VOP3A(iFmt, "v_rsq_f64", false)
4031 {
4032 setFlag(ALU);
4033 setFlag(F64);
4034 } // Inst_VOP3__V_RSQ_F64
4035
4037 {
4038 } // ~Inst_VOP3__V_RSQ_F64
4039
4040 // --- description from .arch file ---
4041 // D.d = 1.0 / sqrt(S0.d). See V_RSQ_F32.
4042 void
4044 {
4045 Wavefront *wf = gpuDynInst->wavefront();
4046 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
4047 VecOperandF64 vdst(gpuDynInst, instData.VDST);
4048
4049 src.readSrc();
4050
4051 if (instData.ABS & 0x1) {
4052 src.absModifier();
4053 }
4054
4055 if (extData.NEG & 0x1) {
4056 src.negModifier();
4057 }
4058
4059 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4060 if (wf->execMask(lane)) {
4061 if (std::fpclassify(src[lane]) == FP_ZERO) {
4062 vdst[lane] = +INFINITY;
4063 } else if (std::isnan(src[lane])) {
4064 vdst[lane] = NAN;
4065 } else if (std::isinf(src[lane]) && !std::signbit(src[lane])) {
4066 vdst[lane] = 0.0;
4067 } else if (std::signbit(src[lane])) {
4068 vdst[lane] = NAN;
4069 } else {
4070 vdst[lane] = 1.0 / std::sqrt(src[lane]);
4071 }
4072 }
4073 }
4074
4075 vdst.write();
4076 } // execute
4077 // --- Inst_VOP3__V_SQRT_F32 class methods ---
4078
4080 : Inst_VOP3A(iFmt, "v_sqrt_f32", false)
4081 {
4082 setFlag(ALU);
4083 setFlag(F32);
4084 } // Inst_VOP3__V_SQRT_F32
4085
4087 {
4088 } // ~Inst_VOP3__V_SQRT_F32
4089
4090 // --- description from .arch file ---
4091 // D.f = sqrt(S0.f).
4092 void
4094 {
4095 Wavefront *wf = gpuDynInst->wavefront();
4096 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
4097 VecOperandF32 vdst(gpuDynInst, instData.VDST);
4098
4099 src.readSrc();
4100
4101 if (instData.ABS & 0x1) {
4102 src.absModifier();
4103 }
4104
4105 if (extData.NEG & 0x1) {
4106 src.negModifier();
4107 }
4108
4109 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4110 if (wf->execMask(lane)) {
4111 vdst[lane] = std::sqrt(src[lane]);
4112 }
4113 }
4114
4115 vdst.write();
4116 } // execute
4117 // --- Inst_VOP3__V_SQRT_F64 class methods ---
4118
4120 : Inst_VOP3A(iFmt, "v_sqrt_f64", false)
4121 {
4122 setFlag(ALU);
4123 setFlag(F64);
4124 } // Inst_VOP3__V_SQRT_F64
4125
4127 {
4128 } // ~Inst_VOP3__V_SQRT_F64
4129
4130 // --- description from .arch file ---
4131 // D.d = sqrt(S0.d).
4132 void
4134 {
4135 Wavefront *wf = gpuDynInst->wavefront();
4136 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
4137 VecOperandF64 vdst(gpuDynInst, instData.VDST);
4138
4139 src.readSrc();
4140
4141 if (instData.ABS & 0x1) {
4142 src.absModifier();
4143 }
4144
4145 if (extData.NEG & 0x1) {
4146 src.negModifier();
4147 }
4148
4149 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4150 if (wf->execMask(lane)) {
4151 vdst[lane] = std::sqrt(src[lane]);
4152 }
4153 }
4154
4155 vdst.write();
4156 } // execute
4157 // --- Inst_VOP3__V_SIN_F32 class methods ---
4158
4160 : Inst_VOP3A(iFmt, "v_sin_f32", false)
4161 {
4162 setFlag(ALU);
4163 setFlag(F32);
4164 } // Inst_VOP3__V_SIN_F32
4165
4167 {
4168 } // ~Inst_VOP3__V_SIN_F32
4169
4170 // --- description from .arch file ---
4171 // D.f = sin(S0.f * 2 * PI).
4172 // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in
4173 // float 0.0.
4174 void
4176 {
4177 Wavefront *wf = gpuDynInst->wavefront();
4178 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
4179 ConstScalarOperandF32 pi(gpuDynInst, REG_PI);
4180 VecOperandF32 vdst(gpuDynInst, instData.VDST);
4181
4182 src.readSrc();
4183 pi.read();
4184
4185 if (instData.ABS & 0x1) {
4186 src.absModifier();
4187 }
4188
4189 if (extData.NEG & 0x1) {
4190 src.negModifier();
4191 }
4192
4193 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4194 if (wf->execMask(lane)) {
4195 vdst[lane] = std::sin(src[lane] * 2 * pi.rawData());
4196 }
4197 }
4198
4199 vdst.write();
4200 } // execute
4201 // --- Inst_VOP3__V_COS_F32 class methods ---
4202
4204 : Inst_VOP3A(iFmt, "v_cos_f32", false)
4205 {
4206 setFlag(ALU);
4207 setFlag(F32);
4208 } // Inst_VOP3__V_COS_F32
4209
4211 {
4212 } // ~Inst_VOP3__V_COS_F32
4213
4214 // --- description from .arch file ---
4215 // D.f = cos(S0.f * 2 * PI).
4216 // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in
4217 // float 1.0.
4218 void
4220 {
4221 Wavefront *wf = gpuDynInst->wavefront();
4222 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
4223 ConstScalarOperandF32 pi(gpuDynInst, REG_PI);
4224 VecOperandF32 vdst(gpuDynInst, instData.VDST);
4225
4226 src.readSrc();
4227 pi.read();
4228
4229 if (instData.ABS & 0x1) {
4230 src.absModifier();
4231 }
4232
4233 if (extData.NEG & 0x1) {
4234 src.negModifier();
4235 }
4236
4237 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4238 if (wf->execMask(lane)) {
4239 vdst[lane] = std::cos(src[lane] * 2 * pi.rawData());
4240 }
4241 }
4242
4243 vdst.write();
4244 } // execute
4245 // --- Inst_VOP3__V_NOT_B32 class methods ---
4246
4248 : Inst_VOP3A(iFmt, "v_not_b32", false)
4249 {
4250 setFlag(ALU);
4251 } // Inst_VOP3__V_NOT_B32
4252
4254 {
4255 } // ~Inst_VOP3__V_NOT_B32
4256
4257 // --- description from .arch file ---
4258 // D.u = ~S0.u.
4259 // Input and output modifiers not supported.
4260 void
4262 {
4263 Wavefront *wf = gpuDynInst->wavefront();
4264 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
4265 VecOperandU32 vdst(gpuDynInst, instData.VDST);
4266
4267 src.readSrc();
4268
4269 if (instData.ABS & 0x1) {
4270 src.absModifier();
4271 }
4272
4273 if (extData.NEG & 0x1) {
4274 src.negModifier();
4275 }
4276
4277 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4278 if (wf->execMask(lane)) {
4279 vdst[lane] = ~src[lane];
4280 }
4281 }
4282
4283 vdst.write();
4284 } // execute
4285 // --- Inst_VOP3__V_BFREV_B32 class methods ---
4286
4288 : Inst_VOP3A(iFmt, "v_bfrev_b32", false)
4289 {
4290 setFlag(ALU);
4291 } // Inst_VOP3__V_BFREV_B32
4292
4294 {
4295 } // ~Inst_VOP3__V_BFREV_B32
4296
4297 // --- description from .arch file ---
4298 // D.u[31:0] = S0.u[0:31], bitfield reverse.
4299 // Input and output modifiers not supported.
4300 void
4302 {
4303 Wavefront *wf = gpuDynInst->wavefront();
4304 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
4305 VecOperandU32 vdst(gpuDynInst, instData.VDST);
4306
4307 src.readSrc();
4308
4309 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4310 if (wf->execMask(lane)) {
4311 vdst[lane] = reverseBits(src[lane]);
4312 }
4313 }
4314
4315 vdst.write();
4316 } // execute
4317 // --- Inst_VOP3__V_FFBH_U32 class methods ---
4318
4320 : Inst_VOP3A(iFmt, "v_ffbh_u32", false)
4321 {
4322 setFlag(ALU);
4323 } // Inst_VOP3__V_FFBH_U32
4324
4326 {
4327 } // ~Inst_VOP3__V_FFBH_U32
4328
4329 // --- description from .arch file ---
4330 // D.u = position of first 1 in S0.u from MSB;
4331 // D.u = 0xffffffff if S0.u == 0.
4332 void
4334 {
4335 Wavefront *wf = gpuDynInst->wavefront();
4336 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
4337 VecOperandU32 vdst(gpuDynInst, instData.VDST);
4338
4339 src.readSrc();
4340
4341 if (instData.ABS & 0x1) {
4342 src.absModifier();
4343 }
4344
4345 if (extData.NEG & 0x1) {
4346 src.negModifier();
4347 }
4348
4349 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4350 if (wf->execMask(lane)) {
4351 vdst[lane] = findFirstOneMsb(src[lane]);
4352 }
4353 }
4354
4355 vdst.write();
4356 } // execute
4357 // --- Inst_VOP3__V_FFBL_B32 class methods ---
4358
4360 : Inst_VOP3A(iFmt, "v_ffbl_b32", false)
4361 {
4362 setFlag(ALU);
4363 } // Inst_VOP3__V_FFBL_B32
4364
4366 {
4367 } // ~Inst_VOP3__V_FFBL_B32
4368
4369 // --- description from .arch file ---
4370 // D.u = position of first 1 in S0.u from LSB;
4371 // D.u = 0xffffffff if S0.u == 0.
4372 void
4374 {
4375 Wavefront *wf = gpuDynInst->wavefront();
4376 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
4377 VecOperandU32 vdst(gpuDynInst, instData.VDST);
4378
4379 src.readSrc();
4380
4381 if (instData.ABS & 0x1) {
4382 src.absModifier();
4383 }
4384
4385 if (extData.NEG & 0x1) {
4386 src.negModifier();
4387 }
4388
4389 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4390 if (wf->execMask(lane)) {
4391 vdst[lane] = findFirstOne(src[lane]);
4392 }
4393 }
4394
4395 vdst.write();
4396 } // execute
4397 // --- Inst_VOP3__V_FFBH_I32 class methods ---
4398
4400 : Inst_VOP3A(iFmt, "v_ffbh_i32", false)
4401 {
4402 setFlag(ALU);
4403 } // Inst_VOP3__V_FFBH_I32
4404
4406 {
4407 } // ~Inst_VOP3__V_FFBH_I32
4408
4409 // --- description from .arch file ---
4410 // D.u = position of first bit different from sign bit in S0.i from MSB;
4411 // D.u = 0xffffffff if S0.i == 0 or S0.i == 0xffffffff.
4412 void
4414 {
4415 Wavefront *wf = gpuDynInst->wavefront();
4416 ConstVecOperandI32 src(gpuDynInst, extData.SRC0);
4417 VecOperandU32 vdst(gpuDynInst, instData.VDST);
4418
4419 src.readSrc();
4420
4421 if (instData.ABS & 0x1) {
4422 src.absModifier();
4423 }
4424
4425 if (extData.NEG & 0x1) {
4426 src.negModifier();
4427 }
4428
4429 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4430 if (wf->execMask(lane)) {
4431 vdst[lane] = firstOppositeSignBit(src[lane]);
4432 }
4433 }
4434
4435 vdst.write();
4436 } // execute
4437 // --- Inst_VOP3__V_FREXP_EXP_I32_F64 class methods ---
4438
4440 InFmt_VOP3A *iFmt)
4441 : Inst_VOP3A(iFmt, "v_frexp_exp_i32_f64", false)
4442 {
4443 setFlag(ALU);
4444 setFlag(F64);
4445 } // Inst_VOP3__V_FREXP_EXP_I32_F64
4446
4448 {
4449 } // ~Inst_VOP3__V_FREXP_EXP_I32_F64
4450
4451 // --- description from .arch file ---
4452 // See V_FREXP_EXP_I32_F32.
4453 void
4455 {
4456 Wavefront *wf = gpuDynInst->wavefront();
4457 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
4458 VecOperandI32 vdst(gpuDynInst, instData.VDST);
4459
4460 src.readSrc();
4461
4462 if (instData.ABS & 0x1) {
4463 src.absModifier();
4464 }
4465
4466 if (extData.NEG & 0x1) {
4467 src.negModifier();
4468 }
4469
4470 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4471 if (wf->execMask(lane)) {
4472 if (std::isinf(src[lane]) || std::isnan(src[lane])) {
4473 vdst[lane] = 0;
4474 } else {
4475 VecElemI32 exp(0);
4476 std::frexp(src[lane], &exp);
4477 vdst[lane] = exp;
4478 }
4479 }
4480 }
4481
4482 vdst.write();
4483 } // execute
4484 // --- Inst_VOP3__V_FREXP_MANT_F64 class methods ---
4485
4487 : Inst_VOP3A(iFmt, "v_frexp_mant_f64", false)
4488 {
4489 setFlag(ALU);
4490 setFlag(F64);
4491 } // Inst_VOP3__V_FREXP_MANT_F64
4492
4494 {
4495 } // ~Inst_VOP3__V_FREXP_MANT_F64
4496
4497 // --- description from .arch file ---
4498 // See V_FREXP_MANT_F32.
4499 void
4501 {
4502 Wavefront *wf = gpuDynInst->wavefront();
4503 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
4504 VecOperandF64 vdst(gpuDynInst, instData.VDST);
4505
4506 src.readSrc();
4507
4508 if (instData.ABS & 0x1) {
4509 src.absModifier();
4510 }
4511
4512 if (extData.NEG & 0x1) {
4513 src.negModifier();
4514 }
4515
4516 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4517 if (wf->execMask(lane)) {
4518 VecElemI32 exp(0);
4519 vdst[lane] = std::frexp(src[lane], &exp);
4520 }
4521 }
4522
4523 vdst.write();
4524 } // execute
4525 // --- Inst_VOP3__V_FRACT_F64 class methods ---
4526
4528 : Inst_VOP3A(iFmt, "v_fract_f64", false)
4529 {
4530 setFlag(ALU);
4531 setFlag(F64);
4532 } // Inst_VOP3__V_FRACT_F64
4533
4535 {
4536 } // ~Inst_VOP3__V_FRACT_F64
4537
4538 // --- description from .arch file ---
4539 // See V_FRACT_F32.
4540 void
4542 {
4543 Wavefront *wf = gpuDynInst->wavefront();
4544 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
4545 VecOperandF64 vdst(gpuDynInst, instData.VDST);
4546
4547 src.readSrc();
4548
4549 if (instData.ABS & 0x1) {
4550 src.absModifier();
4551 }
4552
4553 if (extData.NEG & 0x1) {
4554 src.negModifier();
4555 }
4556
4557 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4558 if (wf->execMask(lane)) {
4559 VecElemF32 int_part(0.0);
4560 vdst[lane] = std::modf(src[lane], &int_part);
4561 }
4562 }
4563
4564 vdst.write();
4565 } // execute
4566 // --- Inst_VOP3__V_FREXP_EXP_I32_F32 class methods ---
4567
4569 InFmt_VOP3A *iFmt)
4570 : Inst_VOP3A(iFmt, "v_frexp_exp_i32_f32", false)
4571 {
4572 setFlag(ALU);
4573 setFlag(F32);
4574 } // Inst_VOP3__V_FREXP_EXP_I32_F32
4575
4577 {
4578 } // ~Inst_VOP3__V_FREXP_EXP_I32_F32
4579
4580 // --- description from .arch file ---
4581 // if (S0.f == INF || S0.f == NAN) then D.i = 0;
4582 // else D.i = TwosComplement(Exponent(S0.f) - 127 + 1).
4583 // Returns exponent of single precision float input, such that S0.f =
4584 // significand * (2 ** exponent). See also FREXP_MANT_F32, which returns
4585 // the significand.
4586 void
4588 {
4589 Wavefront *wf = gpuDynInst->wavefront();
4590 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
4591 VecOperandI32 vdst(gpuDynInst, instData.VDST);
4592
4593 src.readSrc();
4594
4595 if (instData.ABS & 0x1) {
4596 src.absModifier();
4597 }
4598
4599 if (extData.NEG & 0x1) {
4600 src.negModifier();
4601 }
4602
4603 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4604 if (wf->execMask(lane)) {
4605 if (std::isinf(src[lane])|| std::isnan(src[lane])) {
4606 vdst[lane] = 0;
4607 } else {
4608 VecElemI32 exp(0);
4609 std::frexp(src[lane], &exp);
4610 vdst[lane] = exp;
4611 }
4612 }
4613 }
4614
4615 vdst.write();
4616 } // execute
4617 // --- Inst_VOP3__V_FREXP_MANT_F32 class methods ---
4618
4620 : Inst_VOP3A(iFmt, "v_frexp_mant_f32", false)
4621 {
4622 setFlag(ALU);
4623 setFlag(F32);
4624 } // Inst_VOP3__V_FREXP_MANT_F32
4625
4627 {
4628 } // ~Inst_VOP3__V_FREXP_MANT_F32
4629
4630 // --- description from .arch file ---
4631 // if (S0.f == INF || S0.f == NAN) then D.f = S0.f;
4632 // else D.f = Mantissa(S0.f).
4633 // Result range is in (-1.0,-0.5][0.5,1.0) in normal cases. Returns binary
4634 // --- significand of single precision float input, such that S0.f =
4635 // --- significand * (2 ** exponent). See also FREXP_EXP_I32_F32, which
4636 // --- returns integer exponent.
4637 void
4639 {
4640 Wavefront *wf = gpuDynInst->wavefront();
4641 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
4642 VecOperandF32 vdst(gpuDynInst, instData.VDST);
4643
4644 src.readSrc();
4645
4646 if (instData.ABS & 0x1) {
4647 src.absModifier();
4648 }
4649
4650 if (extData.NEG & 0x1) {
4651 src.negModifier();
4652 }
4653
4654 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4655 if (wf->execMask(lane)) {
4656 if (std::isinf(src[lane]) || std::isnan(src[lane])) {
4657 vdst[lane] = src[lane];
4658 } else {
4659 VecElemI32 exp(0);
4660 vdst[lane] = std::frexp(src[lane], &exp);
4661 }
4662 }
4663 }
4664
4665 vdst.write();
4666 } // execute
4667 // --- Inst_VOP3__V_CLREXCP class methods ---
4668
4670 : Inst_VOP3A(iFmt, "v_clrexcp", false)
4671 {
4672 } // Inst_VOP3__V_CLREXCP
4673
4675 {
4676 } // ~Inst_VOP3__V_CLREXCP
4677
4678 // --- description from .arch file ---
4679 // Clear wave's exception state in SIMD (SP).
4680 void
4682 {
4684 } // execute
4685 // --- Inst_VOP3__V_CVT_F16_U16 class methods ---
4686
4688 : Inst_VOP3A(iFmt, "v_cvt_f16_u16", false)
4689 {
4690 setFlag(ALU);
4691 setFlag(F16);
4692 } // Inst_VOP3__V_CVT_F16_U16
4693
4695 {
4696 } // ~Inst_VOP3__V_CVT_F16_U16
4697
4698 // --- description from .arch file ---
4699 // D.f16 = uint16_to_flt16(S.u16).
4700 // Supports denormals, rounding, exception flags and saturation.
4701 void
4703 {
4705 } // execute
4706 // --- Inst_VOP3__V_CVT_F16_I16 class methods ---
4707
4709 : Inst_VOP3A(iFmt, "v_cvt_f16_i16", false)
4710 {
4711 setFlag(ALU);
4712 setFlag(F16);
4713 } // Inst_VOP3__V_CVT_F16_I16
4714
4716 {
4717 } // ~Inst_VOP3__V_CVT_F16_I16
4718
4719 // --- description from .arch file ---
4720 // D.f16 = int16_to_flt16(S.i16).
4721 // Supports denormals, rounding, exception flags and saturation.
4722 void
4724 {
4726 } // execute
4727 // --- Inst_VOP3__V_CVT_U16_F16 class methods ---
4728
4730 : Inst_VOP3A(iFmt, "v_cvt_u16_f16", false)
4731 {
4732 setFlag(ALU);
4733 setFlag(F16);
4734 } // Inst_VOP3__V_CVT_U16_F16
4735
4737 {
4738 } // ~Inst_VOP3__V_CVT_U16_F16
4739
4740 // --- description from .arch file ---
4741 // D.u16 = flt16_to_uint16(S.f16).
4742 // Supports rounding, exception flags and saturation.
4743 void
4745 {
4747 } // execute
4748 // --- Inst_VOP3__V_CVT_I16_F16 class methods ---
4749
4751 : Inst_VOP3A(iFmt, "v_cvt_i16_f16", false)
4752 {
4753 setFlag(ALU);
4754 setFlag(F16);
4755 } // Inst_VOP3__V_CVT_I16_F16
4756
4758 {
4759 } // ~Inst_VOP3__V_CVT_I16_F16
4760
4761 // --- description from .arch file ---
4762 // D.i16 = flt16_to_int16(S.f16).
4763 // Supports rounding, exception flags and saturation.
4764 void
4766 {
4768 } // execute
4769 // --- Inst_VOP3__V_RCP_F16 class methods ---
4770
4772 : Inst_VOP3A(iFmt, "v_rcp_f16", false)
4773 {
4774 setFlag(ALU);
4775 setFlag(F16);
4776 } // Inst_VOP3__V_RCP_F16
4777
4779 {
4780 } // ~Inst_VOP3__V_RCP_F16
4781
4782 // --- description from .arch file ---
4783 // if (S0.f16 == 1.0f)
4784 // D.f16 = 1.0f;
4785 // else
4786 // D.f16 = ApproximateRecip(S0.f16).
4787 void
4789 {
4791 } // execute
4792 // --- Inst_VOP3__V_SQRT_F16 class methods ---
4793
4795 : Inst_VOP3A(iFmt, "v_sqrt_f16", false)
4796 {
4797 setFlag(ALU);
4798 setFlag(F16);
4799 } // Inst_VOP3__V_SQRT_F16
4800
4802 {
4803 } // ~Inst_VOP3__V_SQRT_F16
4804
4805 // --- description from .arch file ---
4806 // if (S0.f16 == 1.0f)
4807 // D.f16 = 1.0f;
4808 // else
4809 // D.f16 = ApproximateSqrt(S0.f16).
4810 void
4812 {
4814 } // execute
4815 // --- Inst_VOP3__V_RSQ_F16 class methods ---
4816
4818 : Inst_VOP3A(iFmt, "v_rsq_f16", false)
4819 {
4820 setFlag(ALU);
4821 setFlag(F16);
4822 } // Inst_VOP3__V_RSQ_F16
4823
4825 {
4826 } // ~Inst_VOP3__V_RSQ_F16
4827
4828 // --- description from .arch file ---
4829 // if (S0.f16 == 1.0f)
4830 // D.f16 = 1.0f;
4831 // else
4832 // D.f16 = ApproximateRecipSqrt(S0.f16).
4833 void
4835 {
4837 } // execute
4838 // --- Inst_VOP3__V_LOG_F16 class methods ---
4839
4841 : Inst_VOP3A(iFmt, "v_log_f16", false)
4842 {
4843 setFlag(ALU);
4844 setFlag(F16);
4845 } // Inst_VOP3__V_LOG_F16
4846
4848 {
4849 } // ~Inst_VOP3__V_LOG_F16
4850
4851 // --- description from .arch file ---
4852 // if (S0.f16 == 1.0f)
4853 // D.f16 = 0.0f;
4854 // else
4855 // D.f16 = ApproximateLog2(S0.f16).
4856 void
4858 {
4860 } // execute
4861 // --- Inst_VOP3__V_EXP_F16 class methods ---
4862
4864 : Inst_VOP3A(iFmt, "v_exp_f16", false)
4865 {
4866 setFlag(ALU);
4867 setFlag(F16);
4868 } // Inst_VOP3__V_EXP_F16
4869
4871 {
4872 } // ~Inst_VOP3__V_EXP_F16
4873
4874 // --- description from .arch file ---
4875 // if (S0.f16 == 0.0f)
4876 // D.f16 = 1.0f;
4877 // else
4878 // D.f16 = Approximate2ToX(S0.f16).
4879 void
4881 {
4883 } // execute
4884 // --- Inst_VOP3__V_FREXP_MANT_F16 class methods ---
4885
4887 : Inst_VOP3A(iFmt, "v_frexp_mant_f16", false)
4888 {
4889 setFlag(ALU);
4890 setFlag(F16);
4891 } // Inst_VOP3__V_FREXP_MANT_F16
4892
4894 {
4895 } // ~Inst_VOP3__V_FREXP_MANT_F16
4896
4897 // --- description from .arch file ---
4898 // if (S0.f16 == +-INF || S0.f16 == NAN)
4899 // D.f16 = S0.f16;
4900 // else
4901 // D.f16 = mantissa(S0.f16).
4902 // Result range is (-1.0,-0.5][0.5,1.0).
4903 // C math library frexp function.
4904 // Returns binary significand of half precision float input, such that the
4905 // original single float = significand * (2 ** exponent).
4906 void
4908 {
4910 } // execute
4911 // --- Inst_VOP3__V_FREXP_EXP_I16_F16 class methods ---
4912
4914 InFmt_VOP3A *iFmt)
4915 : Inst_VOP3A(iFmt, "v_frexp_exp_i16_f16", false)
4916 {
4917 setFlag(ALU);
4918 setFlag(F16);
4919 } // Inst_VOP3__V_FREXP_EXP_I16_F16
4920
4922 {
4923 } // ~Inst_VOP3__V_FREXP_EXP_I16_F16
4924
4925 // --- description from .arch file ---
4926 // if (S0.f16 == +-INF || S0.f16 == NAN)
4927 // D.i16 = 0;
4928 // else
4929 // D.i16 = 2s_complement(exponent(S0.f16) - 15 + 1).
4930 // C math library frexp function.
4931 // Returns exponent of half precision float input, such that the
4932 // original single float = significand * (2 ** exponent).
4933 void
4938 // --- Inst_VOP3__V_FLOOR_F16 class methods ---
4939
4941 : Inst_VOP3A(iFmt, "v_floor_f16", false)
4942 {
4943 setFlag(ALU);
4944 setFlag(F16);
4945 } // Inst_VOP3__V_FLOOR_F16
4946
4948 {
4949 } // ~Inst_VOP3__V_FLOOR_F16
4950
4951 // --- description from .arch file ---
4952 // D.f16 = trunc(S0.f16);
4953 // if (S0.f16 < 0.0f && S0.f16 != D.f16) then D.f16 -= 1.0f.
4954 void
4956 {
4958 } // execute
4959 // --- Inst_VOP3__V_CEIL_F16 class methods ---
4960
4962 : Inst_VOP3A(iFmt, "v_ceil_f16", false)
4963 {
4964 setFlag(ALU);
4965 setFlag(F16);
4966 } // Inst_VOP3__V_CEIL_F16
4967
4969 {
4970 } // ~Inst_VOP3__V_CEIL_F16
4971
4972 // --- description from .arch file ---
4973 // D.f16 = trunc(S0.f16);
4974 // if (S0.f16 > 0.0f && S0.f16 != D.f16) then D.f16 += 1.0f.
4975 void
4977 {
4979 } // execute
4980 // --- Inst_VOP3__V_TRUNC_F16 class methods ---
4981
4983 : Inst_VOP3A(iFmt, "v_trunc_f16", false)
4984 {
4985 setFlag(ALU);
4986 setFlag(F16);
4987 } // Inst_VOP3__V_TRUNC_F16
4988
4990 {
4991 } // ~Inst_VOP3__V_TRUNC_F16
4992
4993 // --- description from .arch file ---
4994 // D.f16 = trunc(S0.f16).
4995 // Round-to-zero semantics.
4996 void
4998 {
5000 } // execute
5001 // --- Inst_VOP3__V_RNDNE_F16 class methods ---
5002
5004 : Inst_VOP3A(iFmt, "v_rndne_f16", false)
5005 {
5006 setFlag(ALU);
5007 setFlag(F16);
5008 } // Inst_VOP3__V_RNDNE_F16
5009
5011 {
5012 } // ~Inst_VOP3__V_RNDNE_F16
5013
5014 // --- description from .arch file ---
5015 // D.f16 = FLOOR(S0.f16 + 0.5f);
5016 // if (floor(S0.f16) is even && fract(S0.f16) == 0.5f) then D.f16 -= 1.0f.
5017 // Round-to-nearest-even semantics.
5018 void
5020 {
5022 } // execute
5023 // --- Inst_VOP3__V_FRACT_F16 class methods ---
5024
5026 : Inst_VOP3A(iFmt, "v_fract_f16", false)
5027 {
5028 setFlag(ALU);
5029 setFlag(F16);
5030 } // Inst_VOP3__V_FRACT_F16
5031
5033 {
5034 } // ~Inst_VOP3__V_FRACT_F16
5035
5036 // --- description from .arch file ---
5037 // D.f16 = S0.f16 + -floor(S0.f16).
5038 void
5040 {
5042 } // execute
5043 // --- Inst_VOP3__V_SIN_F16 class methods ---
5044
5046 : Inst_VOP3A(iFmt, "v_sin_f16", false)
5047 {
5048 setFlag(ALU);
5049 setFlag(F16);
5050 } // Inst_VOP3__V_SIN_F16
5051
5053 {
5054 } // ~Inst_VOP3__V_SIN_F16
5055
5056 // --- description from .arch file ---
5057 // D.f16 = sin(S0.f16 * 2 * PI).
5058 void
5060 {
5062 } // execute
5063 // --- Inst_VOP3__V_COS_F16 class methods ---
5064
5066 : Inst_VOP3A(iFmt, "v_cos_f16", false)
5067 {
5068 setFlag(ALU);
5069 setFlag(F16);
5070 } // Inst_VOP3__V_COS_F16
5071
5073 {
5074 } // ~Inst_VOP3__V_COS_F16
5075
5076 // --- description from .arch file ---
5077 // D.f16 = cos(S0.f16 * 2 * PI).
5078 void
5080 {
5082 } // execute
5083 // --- Inst_VOP3__V_EXP_LEGACY_F32 class methods ---
5084
5086 : Inst_VOP3A(iFmt, "v_exp_legacy_f32", false)
5087 {
5088 setFlag(ALU);
5089 setFlag(F32);
5090 } // Inst_VOP3__V_EXP_LEGACY_F32
5091
5093 {
5094 } // ~Inst_VOP3__V_EXP_LEGACY_F32
5095
5096 // --- description from .arch file ---
5097 // D.f = pow(2.0, S0.f) with legacy semantics.
5098 void
5100 {
5101 Wavefront *wf = gpuDynInst->wavefront();
5102 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
5103 VecOperandF32 vdst(gpuDynInst, instData.VDST);
5104
5105 src.readSrc();
5106
5107 if (instData.ABS & 0x1) {
5108 src.absModifier();
5109 }
5110
5111 if (extData.NEG & 0x1) {
5112 src.negModifier();
5113 }
5114
5118 assert(!(instData.ABS & 0x2));
5119 assert(!(instData.ABS & 0x4));
5120 assert(!(extData.NEG & 0x2));
5121 assert(!(extData.NEG & 0x4));
5122
5123 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5124 if (wf->execMask(lane)) {
5125 vdst[lane] = std::pow(2.0, src[lane]);
5126 }
5127 }
5128
5129 vdst.write();
5130 } // execute
5131 // --- Inst_VOP3__V_LOG_LEGACY_F32 class methods ---
5132
5134 : Inst_VOP3A(iFmt, "v_log_legacy_f32", false)
5135 {
5136 setFlag(ALU);
5137 setFlag(F32);
5138 } // Inst_VOP3__V_LOG_LEGACY_F32
5139
5141 {
5142 } // ~Inst_VOP3__V_LOG_LEGACY_F32
5143
5144 // --- description from .arch file ---
5145 // D.f = log2(S0.f). Base 2 logarithm with legacy semantics.
5146 void
5148 {
5149 Wavefront *wf = gpuDynInst->wavefront();
5150 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
5151 VecOperandF32 vdst(gpuDynInst, instData.VDST);
5152
5153 src.readSrc();
5154
5155 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5156 if (wf->execMask(lane)) {
5157 vdst[lane] = std::log2(src[lane]);
5158 }
5159 }
5160
5161 vdst.write();
5162 } // execute
5163 // --- Inst_VOP3__V_MAD_LEGACY_F32 class methods ---
5164
5166 : Inst_VOP3A(iFmt, "v_mad_legacy_f32", false)
5167 {
5168 setFlag(ALU);
5169 setFlag(F32);
5170 setFlag(MAD);
5171 } // Inst_VOP3__V_MAD_LEGACY_F32
5172
5174 {
5175 } // ~Inst_VOP3__V_MAD_LEGACY_F32
5176
5177 // --- description from .arch file ---
5178 // D.f = S0.f * S1.f + S2.f (DX9 rules, 0.0 * x = 0.0).
5179 void
5181 {
5182 Wavefront *wf = gpuDynInst->wavefront();
5183 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
5184 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
5185 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
5186 VecOperandF32 vdst(gpuDynInst, instData.VDST);
5187
5188 src0.readSrc();
5189 src1.readSrc();
5190 src2.readSrc();
5191
5192 if (instData.ABS & 0x1) {
5193 src0.absModifier();
5194 }
5195
5196 if (instData.ABS & 0x2) {
5197 src1.absModifier();
5198 }
5199
5200 if (instData.ABS & 0x4) {
5201 src2.absModifier();
5202 }
5203
5204 if (extData.NEG & 0x1) {
5205 src0.negModifier();
5206 }
5207
5208 if (extData.NEG & 0x2) {
5209 src1.negModifier();
5210 }
5211
5212 if (extData.NEG & 0x4) {
5213 src2.negModifier();
5214 }
5215
5216 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5217 if (wf->execMask(lane)) {
5218 vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
5219 }
5220 }
5221
5222 vdst.write();
5223 } // execute
5224 // --- Inst_VOP3__V_MAD_F32 class methods ---
5225
5227 : Inst_VOP3A(iFmt, "v_mad_f32", false)
5228 {
5229 setFlag(ALU);
5230 setFlag(F32);
5231 setFlag(MAD);
5232 } // Inst_VOP3__V_MAD_F32
5233
5235 {
5236 } // ~Inst_VOP3__V_MAD_F32
5237
5238 // --- description from .arch file ---
5239 // D.f = S0.f * S1.f + S2.f.
5240 void
5242 {
5243 Wavefront *wf = gpuDynInst->wavefront();
5244 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
5245 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
5246 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
5247 VecOperandF32 vdst(gpuDynInst, instData.VDST);
5248
5249 src0.readSrc();
5250 src1.readSrc();
5251 src2.readSrc();
5252
5253 if (instData.ABS & 0x1) {
5254 src0.absModifier();
5255 }
5256
5257 if (instData.ABS & 0x2) {
5258 src1.absModifier();
5259 }
5260
5261 if (instData.ABS & 0x4) {
5262 src2.absModifier();
5263 }
5264
5265 if (extData.NEG & 0x1) {
5266 src0.negModifier();
5267 }
5268
5269 if (extData.NEG & 0x2) {
5270 src1.negModifier();
5271 }
5272
5273 if (extData.NEG & 0x4) {
5274 src2.negModifier();
5275 }
5276
5277 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5278 if (wf->execMask(lane)) {
5279 vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
5280 }
5281 }
5282
5283 vdst.write();
5284 } // execute
5285 // --- Inst_VOP3__V_MAD_I32_I24 class methods ---
5286
5288 : Inst_VOP3A(iFmt, "v_mad_i32_i24", false)
5289 {
5290 setFlag(ALU);
5291 setFlag(MAD);
5292 } // Inst_VOP3__V_MAD_I32_I24
5293
5295 {
5296 } // ~Inst_VOP3__V_MAD_I32_I24
5297
5298 // --- description from .arch file ---
5299 // D.i = S0.i[23:0] * S1.i[23:0] + S2.i.
5300 void
5302 {
5303 Wavefront *wf = gpuDynInst->wavefront();
5304 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
5305 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
5306 ConstVecOperandI32 src2(gpuDynInst, extData.SRC2);
5307 VecOperandI32 vdst(gpuDynInst, instData.VDST);
5308
5309 src0.readSrc();
5310 src1.readSrc();
5311 src2.readSrc();
5312
5316 assert(!(instData.ABS & 0x1));
5317 assert(!(instData.ABS & 0x2));
5318 assert(!(instData.ABS & 0x4));
5319 assert(!(extData.NEG & 0x1));
5320 assert(!(extData.NEG & 0x2));
5321 assert(!(extData.NEG & 0x4));
5322
5323 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5324 if (wf->execMask(lane)) {
5325 vdst[lane] = sext<24>(bits(src0[lane], 23, 0))
5326 * sext<24>(bits(src1[lane], 23, 0)) + src2[lane];
5327 }
5328 }
5329
5330 vdst.write();
5331 } // execute
5332 // --- Inst_VOP3__V_MAD_U32_U24 class methods ---
5333
5335 : Inst_VOP3A(iFmt, "v_mad_u32_u24", false)
5336 {
5337 setFlag(ALU);
5338 setFlag(MAD);
5339 } // Inst_VOP3__V_MAD_U32_U24
5340
5342 {
5343 } // ~Inst_VOP3__V_MAD_U32_U24
5344
5345 // --- description from .arch file ---
5346 // D.u = S0.u[23:0] * S1.u[23:0] + S2.u.
5347 void
5349 {
5350 Wavefront *wf = gpuDynInst->wavefront();
5351 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
5352 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
5353 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
5354 VecOperandU32 vdst(gpuDynInst, instData.VDST);
5355
5356 src0.readSrc();
5357 src1.readSrc();
5358 src2.readSrc();
5359
5363 assert(!(instData.ABS & 0x1));
5364 assert(!(instData.ABS & 0x2));
5365 assert(!(instData.ABS & 0x4));
5366 assert(!(extData.NEG & 0x1));
5367 assert(!(extData.NEG & 0x2));
5368 assert(!(extData.NEG & 0x4));
5369
5370 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5371 if (wf->execMask(lane)) {
5372 vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0)
5373 + src2[lane];
5374 }
5375 }
5376
5377 vdst.write();
5378 } // execute
5379 // --- Inst_VOP3__V_CUBEID_F32 class methods ---
5380
5382 : Inst_VOP3A(iFmt, "v_cubeid_f32", false)
5383 {
5384 setFlag(ALU);
5385 setFlag(F32);
5386 } // Inst_VOP3__V_CUBEID_F32
5387
5389 {
5390 } // ~Inst_VOP3__V_CUBEID_F32
5391
5392 // --- description from .arch file ---
5393 // D.f = cubemap face ID ({0.0, 1.0, ..., 5.0}). XYZ coordinate is given in
5394 // --- (S0.f, S1.f, S2.f).
5395 void
5397 {
5399 } // execute
5400 // --- Inst_VOP3__V_CUBESC_F32 class methods ---
5401
5403 : Inst_VOP3A(iFmt, "v_cubesc_f32", false)
5404 {
5405 setFlag(ALU);
5406 setFlag(F32);
5407 } // Inst_VOP3__V_CUBESC_F32
5408
5410 {
5411 } // ~Inst_VOP3__V_CUBESC_F32
5412
5413 // --- description from .arch file ---
5414 // D.f = cubemap S coordinate. XYZ coordinate is given in (S0.f, S1.f,
5415 // S2.f).
5416 void
5418 {
5420 } // execute
5421 // --- Inst_VOP3__V_CUBETC_F32 class methods ---
5422
5424 : Inst_VOP3A(iFmt, "v_cubetc_f32", false)
5425 {
5426 setFlag(ALU);
5427 setFlag(F32);
5428 } // Inst_VOP3__V_CUBETC_F32
5429
5431 {
5432 } // ~Inst_VOP3__V_CUBETC_F32
5433
5434 // --- description from .arch file ---
5435 // D.f = cubemap T coordinate. XYZ coordinate is given in (S0.f, S1.f,
5436 // S2.f).
5437 void
5439 {
5441 } // execute
5442 // --- Inst_VOP3__V_CUBEMA_F32 class methods ---
5443
5445 : Inst_VOP3A(iFmt, "v_cubema_f32", false)
5446 {
5447 setFlag(ALU);
5448 setFlag(F32);
5449 } // Inst_VOP3__V_CUBEMA_F32
5450
5452 {
5453 } // ~Inst_VOP3__V_CUBEMA_F32
5454
5455 // --- description from .arch file ---
5456 // D.f = 2.0 * cubemap major axis. XYZ coordinate is given in (S0.f, S1.f,
5457 // --- S2.f).
5458 void
5460 {
5462 } // execute
5463 // --- Inst_VOP3__V_BFE_U32 class methods ---
5464
5466 : Inst_VOP3A(iFmt, "v_bfe_u32", false)
5467 {
5468 setFlag(ALU);
5469 } // Inst_VOP3__V_BFE_U32
5470
5472 {
5473 } // ~Inst_VOP3__V_BFE_U32
5474
5475 // --- description from .arch file ---
5476 // D.u = (S0.u>>S1.u[4:0]) & ((1<<S2.u[4:0])-1).
5477 // Bitfield extract with S0 = data, S1 = field_offset, S2 = field_width.
5478 void
5480 {
5481 Wavefront *wf = gpuDynInst->wavefront();
5482 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
5483 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
5484 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
5485 VecOperandU32 vdst(gpuDynInst, instData.VDST);
5486
5487 src0.readSrc();
5488 src1.readSrc();
5489 src2.readSrc();
5490
5494 assert(!(instData.ABS & 0x1));
5495 assert(!(instData.ABS & 0x2));
5496 assert(!(instData.ABS & 0x4));
5497 assert(!(extData.NEG & 0x1));
5498 assert(!(extData.NEG & 0x2));
5499 assert(!(extData.NEG & 0x4));
5500
5501 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5502 if (wf->execMask(lane)) {
5503 vdst[lane] = (src0[lane] >> bits(src1[lane], 4, 0))
5504 & ((1 << bits(src2[lane], 4, 0)) - 1);
5505 }
5506 }
5507
5508 vdst.write();
5509 } // execute
5510 // --- Inst_VOP3__V_BFE_I32 class methods ---
5511
5513 : Inst_VOP3A(iFmt, "v_bfe_i32", false)
5514 {
5515 setFlag(ALU);
5516 } // Inst_VOP3__V_BFE_I32
5517
5519 {
5520 } // ~Inst_VOP3__V_BFE_I32
5521
5522 // --- description from .arch file ---
5523 // D.i = (S0.i>>S1.u[4:0]) & ((1<<S2.u[4:0])-1).
5524 // Bitfield extract with S0 = data, S1 = field_offset, S2 = field_width.
5525 void
5527 {
5528 Wavefront *wf = gpuDynInst->wavefront();
5529 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
5530 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
5531 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
5532 VecOperandI32 vdst(gpuDynInst, instData.VDST);
5533
5534 src0.readSrc();
5535 src1.readSrc();
5536 src2.readSrc();
5537
5541 assert(!(instData.ABS & 0x1));
5542 assert(!(instData.ABS & 0x2));
5543 assert(!(instData.ABS & 0x4));
5544 assert(!(extData.NEG & 0x1));
5545 assert(!(extData.NEG & 0x2));
5546 assert(!(extData.NEG & 0x4));
5547
5548 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5549 if (wf->execMask(lane)) {
5550 vdst[lane] = (src0[lane] >> bits(src1[lane], 4, 0))
5551 & ((1 << bits(src2[lane], 4, 0)) - 1);
5552
5553 // Above extracted a signed int of size src2 bits which needs
5554 // to be signed-extended. Check if the MSB of our src2-bit
5555 // integer is 1, and sign extend it is.
5556 if (vdst[lane] >> (bits(src2[lane], 4, 0) - 1)) {
5557 vdst[lane] |= 0xffffffff << bits(src2[lane], 4, 0);
5558 }
5559 }
5560 }
5561
5562 vdst.write();
5563 } // execute
5564 // --- Inst_VOP3__V_BFI_B32 class methods ---
5565
5567 : Inst_VOP3A(iFmt, "v_bfi_b32", false)
5568 {
5569 setFlag(ALU);
5570 } // Inst_VOP3__V_BFI_B32
5571
5573 {
5574 } // ~Inst_VOP3__V_BFI_B32
5575
5576 // --- description from .arch file ---
5577 // D.u = (S0.u & S1.u) | (~S0.u & S2.u); bitfield insert.
5578 void
5580 {
5581 Wavefront *wf = gpuDynInst->wavefront();
5582 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
5583 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
5584 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
5585 VecOperandU32 vdst(gpuDynInst, instData.VDST);
5586
5587 src0.readSrc();
5588 src1.readSrc();
5589 src2.readSrc();
5590
5594 assert(!(instData.ABS & 0x1));
5595 assert(!(instData.ABS & 0x2));
5596 assert(!(instData.ABS & 0x4));
5597 assert(!(extData.NEG & 0x1));
5598 assert(!(extData.NEG & 0x2));
5599 assert(!(extData.NEG & 0x4));
5600
5601 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5602 if (wf->execMask(lane)) {
5603 vdst[lane] = (src0[lane] & src1[lane]) | (~src0[lane]
5604 & src2[lane]);
5605 }
5606 }
5607
5608 vdst.write();
5609 } // execute
5610 // --- Inst_VOP3__V_FMA_F32 class methods ---
5611
5613 : Inst_VOP3A(iFmt, "v_fma_f32", false)
5614 {
5615 setFlag(ALU);
5616 setFlag(F32);
5617 setFlag(FMA);
5618 } // Inst_VOP3__V_FMA_F32
5619
5621 {
5622 } // ~Inst_VOP3__V_FMA_F32
5623
5624 // --- description from .arch file ---
5625 // D.f = S0.f * S1.f + S2.f.
5626 void
5628 {
5629 Wavefront *wf = gpuDynInst->wavefront();
5630 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
5631 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
5632 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
5633 VecOperandF32 vdst(gpuDynInst, instData.VDST);
5634
5635 src0.readSrc();
5636 src1.readSrc();
5637 src2.readSrc();
5638
5639 if (instData.ABS & 0x1) {
5640 src0.absModifier();
5641 }
5642
5643 if (instData.ABS & 0x2) {
5644 src1.absModifier();
5645 }
5646
5647 if (instData.ABS & 0x4) {
5648 src2.absModifier();
5649 }
5650
5651 if (extData.NEG & 0x1) {
5652 src0.negModifier();
5653 }
5654
5655 if (extData.NEG & 0x2) {
5656 src1.negModifier();
5657 }
5658
5659 if (extData.NEG & 0x4) {
5660 src2.negModifier();
5661 }
5662
5663 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5664 if (wf->execMask(lane)) {
5665 vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
5666 }
5667 }
5668
5669 vdst.write();
5670 } // execute
5671 // --- Inst_VOP3__V_FMA_F64 class methods ---
5672
5674 : Inst_VOP3A(iFmt, "v_fma_f64", false)
5675 {
5676 setFlag(ALU);
5677 setFlag(F64);
5678 setFlag(FMA);
5679 } // Inst_VOP3__V_FMA_F64
5680
5682 {
5683 } // ~Inst_VOP3__V_FMA_F64
5684
5685 // --- description from .arch file ---
5686 // D.d = S0.d * S1.d + S2.d.
5687 void
5689 {
5690 Wavefront *wf = gpuDynInst->wavefront();
5691 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
5692 ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
5693 ConstVecOperandF64 src2(gpuDynInst, extData.SRC2);
5694 VecOperandF64 vdst(gpuDynInst, instData.VDST);
5695
5696 src0.readSrc();
5697 src1.readSrc();
5698 src2.readSrc();
5699
5700 if (instData.ABS & 0x1) {
5701 src0.absModifier();
5702 }
5703
5704 if (instData.ABS & 0x2) {
5705 src1.absModifier();
5706 }
5707
5708 if (instData.ABS & 0x4) {
5709 src2.absModifier();
5710 }
5711
5712 if (extData.NEG & 0x1) {
5713 src0.negModifier();
5714 }
5715
5716 if (extData.NEG & 0x2) {
5717 src1.negModifier();
5718 }
5719
5720 if (extData.NEG & 0x4) {
5721 src2.negModifier();
5722 }
5723
5724 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5725 if (wf->execMask(lane)) {
5726 vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
5727 }
5728 }
5729
5730 vdst.write();
5731 } // execute
5732 // --- Inst_VOP3__V_LERP_U8 class methods ---
5733
5735 : Inst_VOP3A(iFmt, "v_lerp_u8", false)
5736 {
5737 setFlag(ALU);
5738 } // Inst_VOP3__V_LERP_U8
5739
5741 {
5742 } // ~Inst_VOP3__V_LERP_U8
5743
5744 // --- description from .arch file ---
5745 // D.u = ((S0.u[31:24] + S1.u[31:24] + S2.u[24]) >> 1) << 24
5746 // D.u += ((S0.u[23:16] + S1.u[23:16] + S2.u[16]) >> 1) << 16;
5747 // D.u += ((S0.u[15:8] + S1.u[15:8] + S2.u[8]) >> 1) << 8;
5748 // D.u += ((S0.u[7:0] + S1.u[7:0] + S2.u[0]) >> 1).
5749 // Unsigned 8-bit pixel average on packed unsigned bytes (linear
5750 // --- interpolation). S2 acts as a round mode; if set, 0.5 rounds up,
5751 // --- otherwise 0.5 truncates.
5752 void
5754 {
5755 Wavefront *wf = gpuDynInst->wavefront();
5756 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
5757 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
5758 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
5759 VecOperandU32 vdst(gpuDynInst, instData.VDST);
5760
5761 src0.readSrc();
5762 src1.readSrc();
5763 src2.readSrc();
5764
5768 assert(!(instData.ABS & 0x1));
5769 assert(!(instData.ABS & 0x2));
5770 assert(!(instData.ABS & 0x4));
5771 assert(!(extData.NEG & 0x1));
5772 assert(!(extData.NEG & 0x2));
5773 assert(!(extData.NEG & 0x4));
5774
5775 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5776 if (wf->execMask(lane)) {
5777 vdst[lane] = ((bits(src0[lane], 31, 24)
5778 + bits(src1[lane], 31, 24) + bits(src2[lane], 24)) >> 1)
5779 << 24;
5780 vdst[lane] += ((bits(src0[lane], 23, 16)
5781 + bits(src1[lane], 23, 16) + bits(src2[lane], 16)) >> 1)
5782 << 16;
5783 vdst[lane] += ((bits(src0[lane], 15, 8)
5784 + bits(src1[lane], 15, 8) + bits(src2[lane], 8)) >> 1)
5785 << 8;
5786 vdst[lane] += ((bits(src0[lane], 7, 0) + bits(src1[lane], 7, 0)
5787 + bits(src2[lane], 0)) >> 1);
5788 }
5789 }
5790
5791 vdst.write();
5792 } // execute
5793 // --- Inst_VOP3__V_ALIGNBIT_B32 class methods ---
5794
5796 : Inst_VOP3A(iFmt, "v_alignbit_b32", false)
5797 {
5798 setFlag(ALU);
5799 } // Inst_VOP3__V_ALIGNBIT_B32
5800
5802 {
5803 } // ~Inst_VOP3__V_ALIGNBIT_B32
5804
5805 // --- description from .arch file ---
5806 // D.u = ({S0,S1} >> S2.u[4:0]) & 0xffffffff.
5807 void
5809 {
5810 Wavefront *wf = gpuDynInst->wavefront();
5811 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
5812 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
5813 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
5814 VecOperandU32 vdst(gpuDynInst, instData.VDST);
5815
5816 src0.readSrc();
5817 src1.readSrc();
5818 src2.readSrc();
5819
5823 assert(!(instData.ABS & 0x1));
5824 assert(!(instData.ABS & 0x2));
5825 assert(!(instData.ABS & 0x4));
5826 assert(!(extData.NEG & 0x1));
5827 assert(!(extData.NEG & 0x2));
5828 assert(!(extData.NEG & 0x4));
5829
5830 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5831 if (wf->execMask(lane)) {
5832 VecElemU64 src_0_1 = (((VecElemU64)src0[lane] << 32)
5833 | (VecElemU64)src1[lane]);
5834 vdst[lane] = (VecElemU32)((src_0_1
5835 >> (VecElemU64)bits(src2[lane], 4, 0)) & 0xffffffff);
5836 }
5837 }
5838
5839 vdst.write();
5840 } // execute
5841 // --- Inst_VOP3__V_ALIGNBYTE_B32 class methods ---
5842
5844 : Inst_VOP3A(iFmt, "v_alignbyte_b32", false)
5845 {
5846 setFlag(ALU);
5847 } // Inst_VOP3__V_ALIGNBYTE_B32
5848
5850 {
5851 } // ~Inst_VOP3__V_ALIGNBYTE_B32
5852
5853 // --- description from .arch file ---
5854 // D.u = ({S0,S1} >> (8*S2.u[4:0])) & 0xffffffff.
5855 void
5857 {
5858 Wavefront *wf = gpuDynInst->wavefront();
5859 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
5860 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
5861 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
5862 VecOperandU32 vdst(gpuDynInst, instData.VDST);
5863
5864 src0.readSrc();
5865 src1.readSrc();
5866 src2.readSrc();
5867
5871 assert(!(instData.ABS & 0x1));
5872 assert(!(instData.ABS & 0x2));
5873 assert(!(instData.ABS & 0x4));
5874 assert(!(extData.NEG & 0x1));
5875 assert(!(extData.NEG & 0x2));
5876 assert(!(extData.NEG & 0x4));
5877
5878 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5879 if (wf->execMask(lane)) {
5880 VecElemU64 src_0_1 = (((VecElemU64)src0[lane] << 32)
5881 | (VecElemU64)src1[lane]);
5882 vdst[lane] = (VecElemU32)((src_0_1
5883 >> (8ULL * (VecElemU64)bits(src2[lane], 4, 0)))
5884 & 0xffffffff);
5885 }
5886 }
5887
5888 vdst.write();
5889 } // execute
5890 // --- Inst_VOP3__V_MIN3_F32 class methods ---
5891
5893 : Inst_VOP3A(iFmt, "v_min3_f32", false)
5894 {
5895 setFlag(ALU);
5896 setFlag(F32);
5897 } // Inst_VOP3__V_MIN3_F32
5898
5900 {
5901 } // ~Inst_VOP3__V_MIN3_F32
5902
5903 // --- description from .arch file ---
5904 // D.f = min(S0.f, S1.f, S2.f).
5905 void
5907 {
5908 Wavefront *wf = gpuDynInst->wavefront();
5909 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
5910 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
5911 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
5912 VecOperandF32 vdst(gpuDynInst, instData.VDST);
5913
5914 src0.readSrc();
5915 src1.readSrc();
5916 src2.readSrc();
5917
5918 if (instData.ABS & 0x1) {
5919 src0.absModifier();
5920 }
5921
5922 if (instData.ABS & 0x2) {
5923 src1.absModifier();
5924 }
5925
5926 if (instData.ABS & 0x4) {
5927 src2.absModifier();
5928 }
5929
5930 if (extData.NEG & 0x1) {
5931 src0.negModifier();
5932 }
5933
5934 if (extData.NEG & 0x2) {
5935 src1.negModifier();
5936 }
5937
5938 if (extData.NEG & 0x4) {
5939 src2.negModifier();
5940 }
5941
5942 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5943 if (wf->execMask(lane)) {
5944 VecElemF32 min_0_1 = std::fmin(src0[lane], src1[lane]);
5945 vdst[lane] = std::fmin(min_0_1, src2[lane]);
5946 }
5947 }
5948
5949 vdst.write();
5950 } // execute
5951 // --- Inst_VOP3__V_MIN3_I32 class methods ---
5952
5954 : Inst_VOP3A(iFmt, "v_min3_i32", false)
5955 {
5956 setFlag(ALU);
5957 } // Inst_VOP3__V_MIN3_I32
5958
5960 {
5961 } // ~Inst_VOP3__V_MIN3_I32
5962
5963 // --- description from .arch file ---
5964 // D.i = min(S0.i, S1.i, S2.i).
5965 void
5967 {
5968 Wavefront *wf = gpuDynInst->wavefront();
5969 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
5970 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
5971 ConstVecOperandI32 src2(gpuDynInst, extData.SRC2);
5972 VecOperandI32 vdst(gpuDynInst, instData.VDST);
5973
5974 src0.readSrc();
5975 src1.readSrc();
5976 src2.readSrc();
5977
5981 assert(!(instData.ABS & 0x1));
5982 assert(!(instData.ABS & 0x2));
5983 assert(!(instData.ABS & 0x4));
5984 assert(!(extData.NEG & 0x1));
5985 assert(!(extData.NEG & 0x2));
5986 assert(!(extData.NEG & 0x4));
5987
5988 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5989 if (wf->execMask(lane)) {
5990 VecElemI32 min_0_1 = std::min(src0[lane], src1[lane]);
5991 vdst[lane] = std::min(min_0_1, src2[lane]);
5992 }
5993 }
5994
5995 vdst.write();
5996 } // execute
5997 // --- Inst_VOP3__V_MIN3_U32 class methods ---
5998
6000 : Inst_VOP3A(iFmt, "v_min3_u32", false)
6001 {
6002 setFlag(ALU);
6003 } // Inst_VOP3__V_MIN3_U32
6004
6006 {
6007 } // ~Inst_VOP3__V_MIN3_U32
6008
6009 // --- description from .arch file ---
6010 // D.u = min(S0.u, S1.u, S2.u).
6011 void
6013 {
6014 Wavefront *wf = gpuDynInst->wavefront();
6015 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
6016 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
6017 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
6018 VecOperandU32 vdst(gpuDynInst, instData.VDST);
6019
6020 src0.readSrc();
6021 src1.readSrc();
6022 src2.readSrc();
6023
6027 assert(!(instData.ABS & 0x1));
6028 assert(!(instData.ABS & 0x2));
6029 assert(!(instData.ABS & 0x4));
6030 assert(!(extData.NEG & 0x1));
6031 assert(!(extData.NEG & 0x2));
6032 assert(!(extData.NEG & 0x4));
6033
6034 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6035 if (wf->execMask(lane)) {
6036 VecElemU32 min_0_1 = std::min(src0[lane], src1[lane]);
6037 vdst[lane] = std::min(min_0_1, src2[lane]);
6038 }
6039 }
6040
6041 vdst.write();
6042 } // execute
6043 // --- Inst_VOP3__V_MAX3_F32 class methods ---
6044
6046 : Inst_VOP3A(iFmt, "v_max3_f32", false)
6047 {
6048 setFlag(ALU);
6049 setFlag(F32);
6050 } // Inst_VOP3__V_MAX3_F32
6051
6053 {
6054 } // ~Inst_VOP3__V_MAX3_F32
6055
6056 // --- description from .arch file ---
6057 // D.f = max(S0.f, S1.f, S2.f).
6058 void
6060 {
6061 Wavefront *wf = gpuDynInst->wavefront();
6062 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
6063 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
6064 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
6065 VecOperandF32 vdst(gpuDynInst, instData.VDST);
6066
6067 src0.readSrc();
6068 src1.readSrc();
6069 src2.readSrc();
6070
6071 if (instData.ABS & 0x1) {
6072 src0.absModifier();
6073 }
6074
6075 if (instData.ABS & 0x2) {
6076 src1.absModifier();
6077 }
6078
6079 if (instData.ABS & 0x4) {
6080 src2.absModifier();
6081 }
6082
6083 if (extData.NEG & 0x1) {
6084 src0.negModifier();
6085 }
6086
6087 if (extData.NEG & 0x2) {
6088 src1.negModifier();
6089 }
6090
6091 if (extData.NEG & 0x4) {
6092 src2.negModifier();
6093 }
6094
6095 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6096 if (wf->execMask(lane)) {
6097 VecElemF32 max_0_1 = std::fmax(src0[lane], src1[lane]);
6098 vdst[lane] = std::fmax(max_0_1, src2[lane]);
6099 }
6100 }
6101
6102 vdst.write();
6103 } // execute
6104 // --- Inst_VOP3__V_MAX3_I32 class methods ---
6105
6107 : Inst_VOP3A(iFmt, "v_max3_i32", false)
6108 {
6109 setFlag(ALU);
6110 } // Inst_VOP3__V_MAX3_I32
6111
6113 {
6114 } // ~Inst_VOP3__V_MAX3_I32
6115
6116 // --- description from .arch file ---
6117 // D.i = max(S0.i, S1.i, S2.i).
6118 void
6120 {
6121 Wavefront *wf = gpuDynInst->wavefront();
6122 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
6123 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
6124 ConstVecOperandI32 src2(gpuDynInst, extData.SRC2);
6125 VecOperandI32 vdst(gpuDynInst, instData.VDST);
6126
6127 src0.readSrc();
6128 src1.readSrc();
6129 src2.readSrc();
6130
6134 assert(!(instData.ABS & 0x1));
6135 assert(!(instData.ABS & 0x2));
6136 assert(!(instData.ABS & 0x4));
6137 assert(!(extData.NEG & 0x1));
6138 assert(!(extData.NEG & 0x2));
6139 assert(!(extData.NEG & 0x4));
6140
6141 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6142 if (wf->execMask(lane)) {
6143 VecElemI32 max_0_1 = std::max(src0[lane], src1[lane]);
6144 vdst[lane] = std::max(max_0_1, src2[lane]);
6145 }
6146 }
6147
6148 vdst.write();
6149 } // execute
6150 // --- Inst_VOP3__V_MAX3_U32 class methods ---
6151
6153 : Inst_VOP3A(iFmt, "v_max3_u32", false)
6154 {
6155 setFlag(ALU);
6156 } // Inst_VOP3__V_MAX3_U32
6157
6159 {
6160 } // ~Inst_VOP3__V_MAX3_U32
6161
6162 // --- description from .arch file ---
6163 // D.u = max(S0.u, S1.u, S2.u).
6164 void
6166 {
6167 Wavefront *wf = gpuDynInst->wavefront();
6168 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
6169 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
6170 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
6171 VecOperandU32 vdst(gpuDynInst, instData.VDST);
6172
6173 src0.readSrc();
6174 src1.readSrc();
6175 src2.readSrc();
6176
6180 assert(!(instData.ABS & 0x1));
6181 assert(!(instData.ABS & 0x2));
6182 assert(!(instData.ABS & 0x4));
6183 assert(!(extData.NEG & 0x1));
6184 assert(!(extData.NEG & 0x2));
6185 assert(!(extData.NEG & 0x4));
6186
6187 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6188 if (wf->execMask(lane)) {
6189 VecElemU32 max_0_1 = std::max(src0[lane], src1[lane]);
6190 vdst[lane] = std::max(max_0_1, src2[lane]);
6191 }
6192 }
6193
6194 vdst.write();
6195 } // execute
6196 // --- Inst_VOP3__V_MED3_F32 class methods ---
6197
6199 : Inst_VOP3A(iFmt, "v_med3_f32", false)
6200 {
6201 setFlag(ALU);
6202 setFlag(F32);
6203 } // Inst_VOP3__V_MED3_F32
6204
6206 {
6207 } // ~Inst_VOP3__V_MED3_F32
6208
6209 // --- description from .arch file ---
6210 // D.f = median(S0.f, S1.f, S2.f).
6211 void
6213 {
6214 Wavefront *wf = gpuDynInst->wavefront();
6215 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
6216 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
6217 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
6218 VecOperandF32 vdst(gpuDynInst, instData.VDST);
6219
6220 src0.readSrc();
6221 src1.readSrc();
6222 src2.readSrc();
6223
6224 if (instData.ABS & 0x1) {
6225 src0.absModifier();
6226 }
6227
6228 if (instData.ABS & 0x2) {
6229 src1.absModifier();
6230 }
6231
6232 if (instData.ABS & 0x4) {
6233 src2.absModifier();
6234 }
6235
6236 if (extData.NEG & 0x1) {
6237 src0.negModifier();
6238 }
6239
6240 if (extData.NEG & 0x2) {
6241 src1.negModifier();
6242 }
6243
6244 if (extData.NEG & 0x4) {
6245 src2.negModifier();
6246 }
6247
6248 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6249 if (wf->execMask(lane)) {
6250 vdst[lane] = median(src0[lane], src1[lane], src2[lane]);
6251 }
6252 }
6253
6254 vdst.write();
6255 } // execute
6256 // --- Inst_VOP3__V_MED3_I32 class methods ---
6257
6259 : Inst_VOP3A(iFmt, "v_med3_i32", false)
6260 {
6261 setFlag(ALU);
6262 } // Inst_VOP3__V_MED3_I32
6263
6265 {
6266 } // ~Inst_VOP3__V_MED3_I32
6267
6268 // --- description from .arch file ---
6269 // D.i = median(S0.i, S1.i, S2.i).
6270 void
6272 {
6273 Wavefront *wf = gpuDynInst->wavefront();
6274 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
6275 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
6276 ConstVecOperandI32 src2(gpuDynInst, extData.SRC2);
6277 VecOperandI32 vdst(gpuDynInst, instData.VDST);
6278
6279 src0.readSrc();
6280 src1.readSrc();
6281 src2.readSrc();
6282
6286 assert(!(instData.ABS & 0x1));
6287 assert(!(instData.ABS & 0x2));
6288 assert(!(instData.ABS & 0x4));
6289 assert(!(extData.NEG & 0x1));
6290 assert(!(extData.NEG & 0x2));
6291 assert(!(extData.NEG & 0x4));
6292
6293 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6294 if (wf->execMask(lane)) {
6295 vdst[lane] = median(src0[lane], src1[lane], src2[lane]);
6296 }
6297 }
6298
6299 vdst.write();
6300 } // execute
6301 // --- Inst_VOP3__V_MED3_U32 class methods ---
6302
6304 : Inst_VOP3A(iFmt, "v_med3_u32", false)
6305 {
6306 setFlag(ALU);
6307 } // Inst_VOP3__V_MED3_U32
6308
6310 {
6311 } // ~Inst_VOP3__V_MED3_U32
6312
6313 // --- description from .arch file ---
6314 // D.u = median(S0.u, S1.u, S2.u).
6315 void
6317 {
6318 Wavefront *wf = gpuDynInst->wavefront();
6319 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
6320 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
6321 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
6322 VecOperandU32 vdst(gpuDynInst, instData.VDST);
6323
6324 src0.readSrc();
6325 src1.readSrc();
6326 src2.readSrc();
6327
6331 assert(!(instData.ABS & 0x1));
6332 assert(!(instData.ABS & 0x2));
6333 assert(!(instData.ABS & 0x4));
6334 assert(!(extData.NEG & 0x1));
6335 assert(!(extData.NEG & 0x2));
6336 assert(!(extData.NEG & 0x4));
6337
6338 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6339 if (wf->execMask(lane)) {
6340 vdst[lane] = median(src0[lane], src1[lane], src2[lane]);
6341 }
6342 }
6343
6344 vdst.write();
6345 } // execute
6346 // --- Inst_VOP3__V_SAD_U8 class methods ---
6347
6349 : Inst_VOP3A(iFmt, "v_sad_u8", false)
6350 {
6351 setFlag(ALU);
6352 } // Inst_VOP3__V_SAD_U8
6353
6355 {
6356 } // ~Inst_VOP3__V_SAD_U8
6357
6358 // --- description from .arch file ---
6359 // D.u = abs(S0.i[31:24] - S1.i[31:24]) + abs(S0.i[23:16] - S1.i[23:16]) +
6360 // abs(S0.i[15:8] - S1.i[15:8]) + abs(S0.i[7:0] - S1.i[7:0]) + S2.u.
6361 // Sum of absolute differences with accumulation, overflow into upper bits
6362 // is allowed.
6363 void
6365 {
6366 Wavefront *wf = gpuDynInst->wavefront();
6367 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
6368 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
6369 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
6370 VecOperandU32 vdst(gpuDynInst, instData.VDST);
6371
6372 src0.readSrc();
6373 src1.readSrc();
6374 src2.readSrc();
6375
6379 assert(!(instData.ABS & 0x1));
6380 assert(!(instData.ABS & 0x2));
6381 assert(!(instData.ABS & 0x4));
6382 assert(!(extData.NEG & 0x1));
6383 assert(!(extData.NEG & 0x2));
6384 assert(!(extData.NEG & 0x4));
6385
6386 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6387 if (wf->execMask(lane)) {
6388 vdst[lane] = std::abs(bits(src0[lane], 31, 24)
6389 - bits(src1[lane], 31, 24))
6390 + std::abs(bits(src0[lane], 23, 16)
6391 - bits(src1[lane], 23, 16))
6392 + std::abs(bits(src0[lane], 15, 8)
6393 - bits(src1[lane], 15, 8))
6394 + std::abs(bits(src0[lane], 7, 0)
6395 - bits(src1[lane], 7, 0)) + src2[lane];
6396 }
6397 }
6398
6399 vdst.write();
6400 } // execute
6401 // --- Inst_VOP3__V_SAD_HI_U8 class methods ---
6402
6404 : Inst_VOP3A(iFmt, "v_sad_hi_u8", false)
6405 {
6406 setFlag(ALU);
6407 } // Inst_VOP3__V_SAD_HI_U8
6408
6410 {
6411 } // ~Inst_VOP3__V_SAD_HI_U8
6412
6413 // --- description from .arch file ---
6414 // D.u = (SAD_U8(S0, S1, 0) << 16) + S2.u.
6415 // Sum of absolute differences with accumulation, overflow is lost.
6416 void
6418 {
6419 Wavefront *wf = gpuDynInst->wavefront();
6420 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
6421 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
6422 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
6423 VecOperandU32 vdst(gpuDynInst, instData.VDST);
6424
6425 src0.readSrc();
6426 src1.readSrc();
6427 src2.readSrc();
6428
6432 assert(!(instData.ABS & 0x1));
6433 assert(!(instData.ABS & 0x2));
6434 assert(!(instData.ABS & 0x4));
6435 assert(!(extData.NEG & 0x1));
6436 assert(!(extData.NEG & 0x2));
6437 assert(!(extData.NEG & 0x4));
6438
6439 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6440 if (wf->execMask(lane)) {
6441 vdst[lane] = (((bits(src0[lane], 31, 24)
6442 - bits(src1[lane], 31, 24)) + (bits(src0[lane], 23, 16)
6443 - bits(src1[lane], 23, 16)) + (bits(src0[lane], 15, 8)
6444 - bits(src1[lane], 15, 8)) + (bits(src0[lane], 7, 0)
6445 - bits(src1[lane], 7, 0))) << 16) + src2[lane];
6446 }
6447 }
6448
6449 vdst.write();
6450 } // execute
6451 // --- Inst_VOP3__V_SAD_U16 class methods ---
6452
6454 : Inst_VOP3A(iFmt, "v_sad_u16", false)
6455 {
6456 setFlag(ALU);
6457 } // Inst_VOP3__V_SAD_U16
6458
6460 {
6461 } // ~Inst_VOP3__V_SAD_U16
6462
6463 // --- description from .arch file ---
6464 // D.u = abs(S0.i[31:16] - S1.i[31:16]) + abs(S0.i[15:0] - S1.i[15:0])
6465 // + S2.u.
6466 // Word SAD with accumulation.
6467 void
6469 {
6470 Wavefront *wf = gpuDynInst->wavefront();
6471 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
6472 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
6473 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
6474 VecOperandU32 vdst(gpuDynInst, instData.VDST);
6475
6476 src0.readSrc();
6477 src1.readSrc();
6478 src2.readSrc();
6479
6483 assert(!(instData.ABS & 0x1));
6484 assert(!(instData.ABS & 0x2));
6485 assert(!(instData.ABS & 0x4));
6486 assert(!(extData.NEG & 0x1));
6487 assert(!(extData.NEG & 0x2));
6488 assert(!(extData.NEG & 0x4));
6489
6490 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6491 if (wf->execMask(lane)) {
6492 vdst[lane] = std::abs(bits(src0[lane], 31, 16)
6493 - bits(src1[lane], 31, 16))
6494 + std::abs(bits(src0[lane], 15, 0)
6495 - bits(src1[lane], 15, 0)) + src2[lane];
6496 }
6497 }
6498
6499 vdst.write();
6500 } // execute
6501 // --- Inst_VOP3__V_SAD_U32 class methods ---
6502
6504 : Inst_VOP3A(iFmt, "v_sad_u32", false)
6505 {
6506 setFlag(ALU);
6507 } // Inst_VOP3__V_SAD_U32
6508
6510 {
6511 } // ~Inst_VOP3__V_SAD_U32
6512
6513 // --- description from .arch file ---
6514 // D.u = abs(S0.i - S1.i) + S2.u.
6515 // Dword SAD with accumulation.
6516 void
6518 {
6519 Wavefront *wf = gpuDynInst->wavefront();
6520 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
6521 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
6522 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
6523 VecOperandU32 vdst(gpuDynInst, instData.VDST);
6524
6525 src0.readSrc();
6526 src1.readSrc();
6527 src2.readSrc();
6528
6532 assert(!(instData.ABS & 0x1));
6533 assert(!(instData.ABS & 0x2));
6534 assert(!(instData.ABS & 0x4));
6535 assert(!(extData.NEG & 0x1));
6536 assert(!(extData.NEG & 0x2));
6537 assert(!(extData.NEG & 0x4));
6538
6539 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6540 if (wf->execMask(lane)) {
6541 vdst[lane] = std::abs(src0[lane] - src1[lane]) + src2[lane];
6542 } // if
6543 } // for
6544
6545 vdst.write();
6546 } // execute
6547 // --- Inst_VOP3__V_CVT_PK_U8_F32 class methods ---
6548
6550 : Inst_VOP3A(iFmt, "v_cvt_pk_u8_f32", false)
6551 {
6552 setFlag(ALU);
6553 setFlag(F32);
6554 } // Inst_VOP3__V_CVT_PK_U8_F32
6555
6557 {
6558 } // ~Inst_VOP3__V_CVT_PK_U8_F32
6559
6560 // --- description from .arch file ---
6561 // D.u = ((flt32_to_uint8(S0.f) & 0xff) << (8 * S1.u[1:0]))
6562 // | (S2.u & ~(0xff << (8 * S1.u[1:0]))).
6563 // Convert floating point value S0 to 8-bit unsigned integer and pack the
6564 // result into byte S1 of dword S2.
6565 void
6567 {
6568 Wavefront *wf = gpuDynInst->wavefront();
6569 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
6570 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
6571 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
6572 VecOperandU32 vdst(gpuDynInst, instData.VDST);
6573
6574 src0.readSrc();
6575 src1.readSrc();
6576 src2.readSrc();
6577
6578 if (instData.ABS & 0x1) {
6579 src0.absModifier();
6580 }
6581
6582
6583 if (extData.NEG & 0x1) {
6584 src0.negModifier();
6585 }
6586
6590 assert(!(instData.ABS & 0x2));
6591 assert(!(instData.ABS & 0x4));
6592 assert(!(extData.NEG & 0x2));
6593 assert(!(extData.NEG & 0x4));
6594
6595 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6596 if (wf->execMask(lane)) {
6597 vdst[lane] = (((VecElemU8)src0[lane] & 0xff)
6598 << (8 * bits(src1[lane], 1, 0)))
6599 | (src2[lane] & ~(0xff << (8 * bits(src1[lane], 1, 0))));
6600 }
6601 }
6602
6603 vdst.write();
6604 } // execute
6605 // --- Inst_VOP3__V_DIV_FIXUP_F32 class methods ---
6606
6608 : Inst_VOP3A(iFmt, "v_div_fixup_f32", false)
6609 {
6610 setFlag(ALU);
6611 setFlag(F32);
6612 } // Inst_VOP3__V_DIV_FIXUP_F32
6613
6615 {
6616 } // ~Inst_VOP3__V_DIV_FIXUP_F32
6617
6618 // --- description from .arch file ---
6619 // D.f = Divide fixup and flags -- s0.f = Quotient, s1.f = Denominator,
6620 // s2.f = Numerator. This opcode generates exceptions resulting from the
6621 // division operation.
6622 void
6624 {
6625 Wavefront *wf = gpuDynInst->wavefront();
6626 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
6627 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
6628 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
6629 VecOperandF32 vdst(gpuDynInst, instData.VDST);
6630
6631 src0.readSrc();
6632 src1.readSrc();
6633 src2.readSrc();
6634
6635 if (instData.ABS & 0x1) {
6636 src0.absModifier();
6637 }
6638
6639 if (instData.ABS & 0x2) {
6640 src1.absModifier();
6641 }
6642
6643 if (instData.ABS & 0x4) {
6644 src2.absModifier();
6645 }
6646
6647 if (extData.NEG & 0x1) {
6648 src0.negModifier();
6649 }
6650
6651 if (extData.NEG & 0x2) {
6652 src1.negModifier();
6653 }
6654
6655 if (extData.NEG & 0x4) {
6656 src2.negModifier();
6657 }
6658
6659 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6660 if (wf->execMask(lane)) {
6661 if (std::fpclassify(src1[lane]) == FP_ZERO) {
6662 if (std::signbit(src1[lane])) {
6663 vdst[lane] = -INFINITY;
6664 } else {
6665 vdst[lane] = +INFINITY;
6666 }
6667 } else if (std::isnan(src2[lane]) || std::isnan(src1[lane])) {
6668 vdst[lane] = NAN;
6669 } else if (std::isinf(src1[lane])) {
6670 if (std::signbit(src1[lane])) {
6671 vdst[lane] = -INFINITY;
6672 } else {
6673 vdst[lane] = +INFINITY;
6674 }
6675 } else {
6676 vdst[lane] = src2[lane] / src1[lane];
6677 }
6678 }
6679 }
6680
6681 vdst.write();
6682 } // execute
6683 // --- Inst_VOP3__V_DIV_FIXUP_F64 class methods ---
6684
6686 : Inst_VOP3A(iFmt, "v_div_fixup_f64", false)
6687 {
6688 setFlag(ALU);
6689 setFlag(F64);
6690 } // Inst_VOP3__V_DIV_FIXUP_F64
6691
6693 {
6694 } // ~Inst_VOP3__V_DIV_FIXUP_F64
6695
6696 // --- description from .arch file ---
6697 // D.d = Divide fixup and flags -- s0.d = Quotient, s1.d = Denominator,
6698 // s2.d = Numerator. This opcode generates exceptions resulting from the
6699 // division operation.
6700 void
6702 {
6703 Wavefront *wf = gpuDynInst->wavefront();
6704 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
6705 ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
6706 ConstVecOperandF64 src2(gpuDynInst, extData.SRC2);
6707 VecOperandF64 vdst(gpuDynInst, instData.VDST);
6708
6709 src0.readSrc();
6710 src1.readSrc();
6711 src2.readSrc();
6712
6713 if (instData.ABS & 0x1) {
6714 src0.absModifier();
6715 }
6716
6717 if (instData.ABS & 0x2) {
6718 src1.absModifier();
6719 }
6720
6721 if (instData.ABS & 0x4) {
6722 src2.absModifier();
6723 }
6724
6725 if (extData.NEG & 0x1) {
6726 src0.negModifier();
6727 }
6728
6729 if (extData.NEG & 0x2) {
6730 src1.negModifier();
6731 }
6732
6733 if (extData.NEG & 0x4) {
6734 src2.negModifier();
6735 }
6736
6737 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6738 if (wf->execMask(lane)) {
6739 int sign_out = std::signbit(src1[lane])
6740 ^ std::signbit(src2[lane]);
6741 int exp1(0);
6742 int exp2(0);
6743 std::frexp(src1[lane], &exp1);
6744 std::frexp(src2[lane], &exp2);
6745
6746 if (std::isnan(src1[lane]) || std::isnan(src2[lane])) {
6747 vdst[lane] = std::numeric_limits<VecElemF64>::quiet_NaN();
6748 } else if (std::fpclassify(src1[lane]) == FP_ZERO
6749 && std::fpclassify(src2[lane]) == FP_ZERO) {
6750 vdst[lane]
6751 = std::numeric_limits<VecElemF64>::signaling_NaN();
6752 } else if (std::isinf(src1[lane]) && std::isinf(src2[lane])) {
6753 vdst[lane]
6754 = std::numeric_limits<VecElemF64>::signaling_NaN();
6755 } else if (std::fpclassify(src1[lane]) == FP_ZERO
6756 || std::isinf(src2[lane])) {
6757 vdst[lane] = sign_out ? -INFINITY : +INFINITY;
6758 } else if (std::isinf(src1[lane])
6759 || std::fpclassify(src2[lane]) == FP_ZERO) {
6760 vdst[lane] = sign_out ? -0.0 : +0.0;
6761 } else if (exp2 - exp1 < -1075) {
6762 vdst[lane] = src0[lane];
6763 } else if (exp1 == 2047) {
6764 vdst[lane] = src0[lane];
6765 } else {
6766 vdst[lane] = sign_out ? -std::fabs(src0[lane])
6767 : std::fabs(src0[lane]);
6768 }
6769 }
6770 }
6771
6772 vdst.write();
6773 } // execute
6774 // --- Inst_VOP3__V_DIV_SCALE_F32 class methods ---
6775
6777 InFmt_VOP3B *iFmt)
6778 : Inst_VOP3B(iFmt, "v_div_scale_f32")
6779 {
6780 setFlag(ALU);
6781 setFlag(WritesVCC);
6782 setFlag(F32);
6783 } // Inst_VOP3__V_DIV_SCALE_F32
6784
6786 {
6787 } // ~Inst_VOP3__V_DIV_SCALE_F32
6788
6789 // --- description from .arch file ---
6790 // {vcc,D.f} = Divide preop and flags -- s0.f = Quotient, s1.f =
6791 // Denominator, s2.f = Numerator -- s0 must equal s1 or s2. Given a
6792 // numerator and denominator, this opcode will appropriately scale inputs
6793 // for division to avoid subnormal terms during Newton-Raphson correction
6794 // algorithm. This opcode producses a VCC flag for post-scale of quotient.
6795 void
6797 {
6798 Wavefront *wf = gpuDynInst->wavefront();
6799 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
6800 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
6801 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
6802 ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
6803 VecOperandF32 vdst(gpuDynInst, instData.VDST);
6804
6805 src0.readSrc();
6806 src1.readSrc();
6807 src2.readSrc();
6808
6809 if (extData.NEG & 0x1) {
6810 src0.negModifier();
6811 }
6812
6813 if (extData.NEG & 0x2) {
6814 src1.negModifier();
6815 }
6816
6817 if (extData.NEG & 0x4) {
6818 src2.negModifier();
6819 }
6820
6821 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6822 if (wf->execMask(lane)) {
6823 vdst[lane] = src0[lane];
6824 vcc.setBit(lane, 0);
6825 }
6826 }
6827
6828 vcc.write();
6829 vdst.write();
6830 } // execute
6831 // --- Inst_VOP3__V_DIV_SCALE_F64 class methods ---
6832
6834 InFmt_VOP3B *iFmt)
6835 : Inst_VOP3B(iFmt, "v_div_scale_f64")
6836 {
6837 setFlag(ALU);
6838 setFlag(WritesVCC);
6839 setFlag(F64);
6840 } // Inst_VOP3__V_DIV_SCALE_F64
6841
6843 {
6844 } // ~Inst_VOP3__V_DIV_SCALE_F64
6845
6846 // --- description from .arch file ---
6847 // {vcc,D.d} = Divide preop and flags -- s0.d = Quotient, s1.d =
6848 // Denominator, s2.d = Numerator -- s0 must equal s1 or s2. Given a
6849 // numerator and denominator, this opcode will appropriately scale inputs
6850 // for division to avoid subnormal terms during Newton-Raphson correction
6851 // algorithm. This opcode producses a VCC flag for post-scale of quotient.
6852 void
6854 {
6855 Wavefront *wf = gpuDynInst->wavefront();
6856 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
6857 ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
6858 ConstVecOperandF64 src2(gpuDynInst, extData.SRC2);
6859 ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
6860 VecOperandF64 vdst(gpuDynInst, instData.VDST);
6861
6862 src0.readSrc();
6863 src1.readSrc();
6864 src2.readSrc();
6865
// VOP3B encoding carries no ABS field, so only negation modifiers
// are applied to the three sources.
6866 if (extData.NEG & 0x1) {
6867 src0.negModifier();
6868 }
6869
6870 if (extData.NEG & 0x2) {
6871 src1.negModifier();
6872 }
6873
6874 if (extData.NEG & 0x4) {
6875 src2.negModifier();
6876 }
6877
6878 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6879 if (wf->execMask(lane)) {
// Binary exponents of denominator (exp1) and numerator (exp2),
// as extracted by frexp.
6880 int exp1(0);
6881 int exp2(0);
6882 std::frexp(src1[lane], &exp1);
6883 std::frexp(src2[lane], &exp2);
// Default: no post-scale of the quotient is required.
6884 vcc.setBit(lane, 0);
6885
// Zero numerator or denominator: the divide itself is
// degenerate, so flag the lane result as NaN.
6886 if (std::fpclassify(src1[lane]) == FP_ZERO
6887 || std::fpclassify(src2[lane]) == FP_ZERO) {
6888 vdst[lane] = NAN;
// Very large exponent gap: request quotient post-scale via
// VCC, and pre-scale S0 by 2^128 when it aliases the
// denominator (s0 must equal s1 or s2 per the spec).
6889 } else if (exp2 - exp1 >= 768) {
6890 vcc.setBit(lane, 1);
6891 if (src0[lane] == src1[lane]) {
6892 vdst[lane] = std::ldexp(src0[lane], 128);
6893 }
// Subnormal denominator: scale up by 2^128.
6894 } else if (std::fpclassify(src1[lane]) == FP_SUBNORMAL) {
6895 vdst[lane] = std::ldexp(src0[lane], 128);
// Both the reciprocal of the denominator and the quotient
// would be subnormal: flag post-scale and scale the
// denominator-aliased operand up.
6896 } else if (std::fpclassify(1.0 / src1[lane]) == FP_SUBNORMAL
6897 && std::fpclassify(src2[lane] / src1[lane])
6898 == FP_SUBNORMAL) {
6899 vcc.setBit(lane, 1);
6900 if (src0[lane] == src1[lane]) {
6901 vdst[lane] = std::ldexp(src0[lane], 128);
6902 }
// Only the reciprocal would be subnormal: scale down by 2^128.
6903 } else if (std::fpclassify(1.0 / src1[lane]) == FP_SUBNORMAL) {
6904 vdst[lane] = std::ldexp(src0[lane], -128);
// Only the quotient would be subnormal: flag post-scale and
// scale the numerator-aliased operand up.
6905 } else if (std::fpclassify(src2[lane] / src1[lane])
6906 == FP_SUBNORMAL) {
6907 vcc.setBit(lane, 1);
6908 if (src0[lane] == src2[lane]) {
6909 vdst[lane] = std::ldexp(src0[lane], 128);
6910 }
// Small numerator exponent: scale up by 2^128.
// NOTE(review): lanes that fall through every branch leave
// vdst unassigned before the write() below -- confirm the
// intended pass-through value for the normal case.
6911 } else if (exp2 <= 53) {
6912 vdst[lane] = std::ldexp(src0[lane], 128);
6913 }
6914 }
6915 }
6916
6917 vcc.write();
6918 vdst.write();
6919 } // execute
6920 // --- Inst_VOP3__V_DIV_FMAS_F32 class methods ---
6921
6923 : Inst_VOP3A(iFmt, "v_div_fmas_f32", false)
6924 {
6925 setFlag(ALU);
6926 setFlag(ReadsVCC);
6927 setFlag(F32);
6928 setFlag(FMA);
6929 } // Inst_VOP3__V_DIV_FMAS_F32
6930
6932 {
6933 } // ~Inst_VOP3__V_DIV_FMAS_F32
6934
6935 // --- description from .arch file ---
6936 // D.f = Special case divide FMA with scale and flags(s0.f = Quotient,
6937 // s1.f = Denominator, s2.f = Numerator)
6938 void
6940 {
6941 Wavefront *wf = gpuDynInst->wavefront();
6942 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
6943 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
6944 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
6945 VecOperandF64 vdst(gpuDynInst, instData.VDST);
6946
6947 src0.readSrc();
6948 src1.readSrc();
6949 src2.readSrc();
6950
6951 if (instData.ABS & 0x1) {
6952 src0.absModifier();
6953 }
6954
6955 if (instData.ABS & 0x2) {
6956 src1.absModifier();
6957 }
6958
6959 if (instData.ABS & 0x4) {
6960 src2.absModifier();
6961 }
6962
6963 if (extData.NEG & 0x1) {
6964 src0.negModifier();
6965 }
6966
6967 if (extData.NEG & 0x2) {
6968 src1.negModifier();
6969 }
6970
6971 if (extData.NEG & 0x4) {
6972 src2.negModifier();
6973 }
6974
6975 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6976 if (wf->execMask(lane)) {
6977 vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
6978 }
6979 }
6980
6981 //vdst.write();
6982 } // execute
 6983 // --- Inst_VOP3__V_DIV_FMAS_F64 class methods ---
 6984
// Constructor: v_div_fmas_f64 is a vector ALU FMA op that also reads VCC
// (flags set below). NOTE(review): the doc generator dropped the
// hyperlinked signature line for the constructor/destructor.
 6986 : Inst_VOP3A(iFmt, "v_div_fmas_f64")
 6987 {
 6988 setFlag(ALU);
 6989 setFlag(ReadsVCC);
 6990 setFlag(F64);
 6991 setFlag(FMA);
 6992 } // Inst_VOP3__V_DIV_FMAS_F64
 6993
// Destructor: nothing to release.
 6995 {
 6996 } // ~Inst_VOP3__V_DIV_FMAS_F64
 6997
6998 // --- description from .arch file ---
6999 // D.d = Special case divide FMA with scale and flags(s0.d = Quotient,
7000 // s1.d = Denominator, s2.d = Numerator)
7001 void
7003 {
7004 Wavefront *wf = gpuDynInst->wavefront();
7005 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
7006 ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
7007 ConstVecOperandF64 src2(gpuDynInst, extData.SRC2);
7008 VecOperandF64 vdst(gpuDynInst, instData.VDST);
7009 ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
7010
7011 src0.readSrc();
7012 src1.readSrc();
7013 src2.readSrc();
7014 vcc.read();
7015
7016 if (instData.ABS & 0x1) {
7017 src0.absModifier();
7018 }
7019
7020 if (instData.ABS & 0x2) {
7021 src1.absModifier();
7022 }
7023
7024 if (instData.ABS & 0x4) {
7025 src2.absModifier();
7026 }
7027
7028 if (extData.NEG & 0x1) {
7029 src0.negModifier();
7030 }
7031
7032 if (extData.NEG & 0x2) {
7033 src1.negModifier();
7034 }
7035
7036 if (extData.NEG & 0x4) {
7037 src2.negModifier();
7038 }
7039
7040 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7041 if (wf->execMask(lane)) {
7042 if (bits(vcc.rawData(), lane)) {
7043 vdst[lane] = std::pow(2, 64)
7044 * std::fma(src0[lane], src1[lane], src2[lane]);
7045 } else {
7046 vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
7047 }
7048 }
7049 }
7050
7051 vdst.write();
7052 } // execute
// The four SAD (sum-of-absolute-differences) opcodes below have empty-looking
// execute() bodies in this listing. NOTE(review): the doc generator drops
// hyperlinked lines, so the missing body line was most likely an
// unimplemented-panic call — confirm against the full source before relying
// on these opcodes.
 7053 // --- Inst_VOP3__V_MSAD_U8 class methods ---
 7054
 7056 : Inst_VOP3A(iFmt, "v_msad_u8", false)
 7057 {
 7058 setFlag(ALU);
 7059 } // Inst_VOP3__V_MSAD_U8
 7060
 7062 {
 7063 } // ~Inst_VOP3__V_MSAD_U8
 7064
 7065 // --- description from .arch file ---
 7066 // D.u = Masked Byte SAD with accum_lo(S0.u, S1.u, S2.u).
 7067 void
 7069 {
 7071 } // execute
 7072 // --- Inst_VOP3__V_QSAD_PK_U16_U8 class methods ---
 7073
 7075 : Inst_VOP3A(iFmt, "v_qsad_pk_u16_u8", false)
 7076 {
 7077 setFlag(ALU);
 7078 } // Inst_VOP3__V_QSAD_PK_U16_U8
 7079
 7081 {
 7082 } // ~Inst_VOP3__V_QSAD_PK_U16_U8
 7083
 7084 // --- description from .arch file ---
 7085 // D.u = Quad-Byte SAD with 16-bit packed accum_lo/hi(S0.u[63:0],
 7086 // S1.u[31:0], S2.u[63:0])
 7087 void
 7089 {
 7091 } // execute
 7092 // --- Inst_VOP3__V_MQSAD_PK_U16_U8 class methods ---
 7093
 7095 InFmt_VOP3A *iFmt)
 7096 : Inst_VOP3A(iFmt, "v_mqsad_pk_u16_u8", false)
 7097 {
 7098 setFlag(ALU);
 7099 } // Inst_VOP3__V_MQSAD_PK_U16_U8
 7100
 7102 {
 7103 } // ~Inst_VOP3__V_MQSAD_PK_U16_U8
 7104
 7105 // --- description from .arch file ---
 7106 // D.u = Masked Quad-Byte SAD with 16-bit packed accum_lo/hi(S0.u[63:0],
 7107 // --- S1.u[31:0], S2.u[63:0])
 7108 void
 7110 {
 7112 } // execute
 7113 // --- Inst_VOP3__V_MQSAD_U32_U8 class methods ---
 7114
 7116 : Inst_VOP3A(iFmt, "v_mqsad_u32_u8", false)
 7117 {
 7118 setFlag(ALU);
 7119 } // Inst_VOP3__V_MQSAD_U32_U8
 7120
 7122 {
 7123 } // ~Inst_VOP3__V_MQSAD_U32_U8
 7124
 7125 // --- description from .arch file ---
 7126 // D.u128 = Masked Quad-Byte SAD with 32-bit accum_lo/hi(S0.u[63:0],
 7127 // --- S1.u[31:0], S2.u[127:0])
 7128 void
 7130 {
 7132 } // execute
 7133 // --- Inst_VOP3__V_MAD_U64_U32 class methods ---
 7134
// Unsigned 32x32 -> 64-bit multiply-add; the per-lane carry-out goes to the
// SGPR pair named by SDST (VCC-style output, hence WritesVCC).
 7136 InFmt_VOP3B *iFmt)
 7137 : Inst_VOP3B(iFmt, "v_mad_u64_u32")
 7138 {
 7139 setFlag(ALU);
 7140 setFlag(WritesVCC);
 7141 setFlag(MAD);
 7142 } // Inst_VOP3__V_MAD_U64_U32
 7143
 7145 {
 7146 } // ~Inst_VOP3__V_MAD_U64_U32
 7147
 7148 // --- description from .arch file ---
 7149 // {vcc_out,D.u64} = S0.u32 * S1.u32 + S2.u64.
 7150 void
 7152 {
 7153 Wavefront *wf = gpuDynInst->wavefront();
 7154 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
 7155 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
 7156 ConstVecOperandU64 src2(gpuDynInst, extData.SRC2);
 7157 ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
 7158 VecOperandU64 vdst(gpuDynInst, instData.VDST);
 7159
 7160 src0.readSrc();
 7161 src1.readSrc();
 7162 src2.readSrc();
// NOTE(review): vdst.read() here looks unnecessary if muladd() fully
// overwrites vdst[lane]; the I64 variant below omits it — confirm which is
// intended and make the two consistent.
 7163 vdst.read();
 7164
// VOP3B has no ABS field; NEG modifiers are not supported for this op.
 7168 assert(!(extData.NEG & 0x1));
 7169 assert(!(extData.NEG & 0x2));
 7170 assert(!(extData.NEG & 0x4));
 7171
 7172 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 7173 if (wf->execMask(lane)) {
// muladd() is a project helper; presumably it stores s0*s1+s2 into its
// first argument and returns the carry/overflow bit — confirm in its
// definition.
 7174 vcc.setBit(lane, muladd(vdst[lane], src0[lane], src1[lane],
 7175 src2[lane]));
 7176 }
 7177 }
 7178
 7179 vcc.write();
 7180 vdst.write();
 7181 } // execute
 7182 // --- Inst_VOP3__V_MAD_I64_I32 class methods ---
 7183
// Signed counterpart of the op above: 32x32 -> 64-bit signed multiply-add
// with a per-lane flag written to the SDST SGPR pair.
 7185 InFmt_VOP3B *iFmt)
 7186 : Inst_VOP3B(iFmt, "v_mad_i64_i32")
 7187 {
 7188 setFlag(ALU);
 7189 setFlag(WritesVCC);
 7190 setFlag(MAD);
 7191 } // Inst_VOP3__V_MAD_I64_I32
 7192
 7194 {
 7195 } // ~Inst_VOP3__V_MAD_I64_I32
 7196
 7197 // --- description from .arch file ---
 7198 // {vcc_out,D.i64} = S0.i32 * S1.i32 + S2.i64.
 7199 void
 7201 {
 7202 Wavefront *wf = gpuDynInst->wavefront();
 7203 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
 7204 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
 7205 ConstVecOperandI64 src2(gpuDynInst, extData.SRC2);
 7206 ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
 7207 VecOperandI64 vdst(gpuDynInst, instData.VDST);
 7208
 7209 src0.readSrc();
 7210 src1.readSrc();
 7211 src2.readSrc();
 7212
// VOP3B has no ABS field; NEG modifiers are not supported for this op.
 7216 assert(!(extData.NEG & 0x1));
 7217 assert(!(extData.NEG & 0x2));
 7218 assert(!(extData.NEG & 0x4));
 7219
 7220 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 7221 if (wf->execMask(lane)) {
 7222 vcc.setBit(lane, muladd(vdst[lane], src0[lane], src1[lane],
 7223 src2[lane]));
 7224 }
 7225 }
 7226
 7227 vcc.write();
 7228 vdst.write();
 7229 } // execute
// The six opcodes below are simple three-operand 32-bit integer ALU ops.
// None of them supports the VOP3 ABS/NEG input modifiers, which is enforced
// with asserts rather than silently ignored.
 7230 // --- Inst_VOP3__V_XAD_U32 class methods ---
 7231
 7233 : Inst_VOP3A(iFmt, "v_xad_u32", false)
 7234 {
 7235 setFlag(ALU);
 7236 } // Inst_VOP3__V_XAD_U32
 7237
 7239 {
 7240 } // ~Inst_VOP3__V_XAD_U32
 7241
 7242 // --- description from .arch file ---
 7243 // D.u32 = (S0.u32 ^ S1.u32) + S2.u32.
 7244 void
 7246 {
 7247 Wavefront *wf = gpuDynInst->wavefront();
 7248 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
 7249 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
 7250 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
 7251 VecOperandU32 vdst(gpuDynInst, instData.VDST);
 7252
 7253 src0.readSrc();
 7254 src1.readSrc();
 7255 src2.readSrc();
 7256
 7260 assert(!(instData.ABS & 0x1));
 7261 assert(!(instData.ABS & 0x2));
 7262 assert(!(instData.ABS & 0x4));
 7263 assert(!(extData.NEG & 0x1));
 7264 assert(!(extData.NEG & 0x2));
 7265 assert(!(extData.NEG & 0x4));
 7266
 7267 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 7268 if (wf->execMask(lane)) {
 7269 vdst[lane] = (src0[lane] ^ src1[lane]) + src2[lane];
 7270 }
 7271 }
 7272
 7273 vdst.write();
 7274 } // execute
 7275 // --- Inst_VOP3__V_LSHL_ADD_U32 class methods ---
 7276
 7278 : Inst_VOP3A(iFmt, "v_lshl_add_u32", false)
 7279 {
 7280 setFlag(ALU);
 7281 } // Inst_VOP3__V_LSHL_ADD_U32
 7282
 7284 {
 7285 } // ~Inst_VOP3__V_LSHL_ADD_U32
 7286
 7287 // --- description from .arch file ---
 7288 // D.u = (S0.u << S1.u[4:0]) + S2.u.
 7289 void
 7291 {
 7292 Wavefront *wf = gpuDynInst->wavefront();
 7293 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
 7294 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
 7295 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
 7296 VecOperandU32 vdst(gpuDynInst, instData.VDST);
 7297
 7298 src0.readSrc();
 7299 src1.readSrc();
 7300 src2.readSrc();
 7301
 7305 assert(!(instData.ABS & 0x1));
 7306 assert(!(instData.ABS & 0x2));
 7307 assert(!(instData.ABS & 0x4));
 7308 assert(!(extData.NEG & 0x1));
 7309 assert(!(extData.NEG & 0x2));
 7310 assert(!(extData.NEG & 0x4));
 7311
 7312 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 7313 if (wf->execMask(lane)) {
// Only the low 5 bits of S1 form the shift amount, per the arch spec.
 7314 vdst[lane] = (src0[lane] << bits(src1[lane], 4, 0))
 7315 + src2[lane];
 7316 }
 7317 }
 7318
 7319 vdst.write();
 7320 } // execute
 7321 // --- Inst_VOP3__V_ADD_LSHL_U32 class methods ---
 7322
 7324 : Inst_VOP3A(iFmt, "v_add_lshl_u32", false)
 7325 {
 7326 setFlag(ALU);
 7327 } // Inst_VOP3__V_ADD_LSHL_U32
 7328
 7330 {
 7331 } // ~Inst_VOP3__V_ADD_LSHL_U32
 7332
 7333 // --- description from .arch file ---
 7334 // D.u = (S0.u + S1.u) << S2.u[4:0].
 7335 void
 7337 {
 7338 Wavefront *wf = gpuDynInst->wavefront();
 7339 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
 7340 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
 7341 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
 7342 VecOperandU32 vdst(gpuDynInst, instData.VDST);
 7343
 7344 src0.readSrc();
 7345 src1.readSrc();
 7346 src2.readSrc();
 7347
 7351 assert(!(instData.ABS & 0x1));
 7352 assert(!(instData.ABS & 0x2));
 7353 assert(!(instData.ABS & 0x4));
 7354 assert(!(extData.NEG & 0x1));
 7355 assert(!(extData.NEG & 0x2));
 7356 assert(!(extData.NEG & 0x4));
 7357
 7358 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 7359 if (wf->execMask(lane)) {
 7360 vdst[lane] =
 7361 (src0[lane] + src1[lane]) << bits(src2[lane], 4, 0);
 7362 }
 7363 }
 7364
 7365 vdst.write();
 7366 } // execute
 7367 // --- Inst_VOP3__V_ADD3_U32 class methods ---
 7368
 7370 : Inst_VOP3A(iFmt, "v_add3_u32", false)
 7371 {
 7372 setFlag(ALU);
 7373 } // Inst_VOP3__V_ADD3_U32
 7374
 7376 {
 7377 } // ~Inst_VOP3__V_ADD3_U32
 7378
 7379 // --- description from .arch file ---
 7380 // D.u = S0.u + S1.u + S2.u.
 7381 void
 7383 {
 7384 Wavefront *wf = gpuDynInst->wavefront();
 7385 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
 7386 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
 7387 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
 7388 VecOperandU32 vdst(gpuDynInst, instData.VDST);
 7389
 7390 src0.readSrc();
 7391 src1.readSrc();
 7392 src2.readSrc();
 7393
 7397 assert(!(instData.ABS & 0x1));
 7398 assert(!(instData.ABS & 0x2));
 7399 assert(!(instData.ABS & 0x4));
 7400 assert(!(extData.NEG & 0x1));
 7401 assert(!(extData.NEG & 0x2));
 7402 assert(!(extData.NEG & 0x4));
 7403
 7404 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 7405 if (wf->execMask(lane)) {
 7406 vdst[lane] = src0[lane] + src1[lane] + src2[lane];
 7407 }
 7408 }
 7409
 7410 vdst.write();
 7411 } // execute
 7412 // --- Inst_VOP3__V_LSHL_OR_B32 class methods ---
 7413
 7415 : Inst_VOP3A(iFmt, "v_lshl_or_b32", false)
 7416 {
 7417 setFlag(ALU);
 7418 } // Inst_VOP3__V_LSHL_OR_B32
 7419
 7421 {
 7422 } // ~Inst_VOP3__V_LSHL_OR_B32
 7423
 7424 // --- description from .arch file ---
 7425 // D.u = (S0.u << S1.u[4:0]) | S2.u.
 7426 void
 7428 {
 7429 Wavefront *wf = gpuDynInst->wavefront();
 7430 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
 7431 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
 7432 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
 7433 VecOperandU32 vdst(gpuDynInst, instData.VDST);
 7434
 7435 src0.readSrc();
 7436 src1.readSrc();
 7437 src2.readSrc();
 7438
 7442 assert(!(instData.ABS & 0x1));
 7443 assert(!(instData.ABS & 0x2));
 7444 assert(!(instData.ABS & 0x4));
 7445 assert(!(extData.NEG & 0x1));
 7446 assert(!(extData.NEG & 0x2));
 7447 assert(!(extData.NEG & 0x4));
 7448
 7449 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 7450 if (wf->execMask(lane)) {
 7451 vdst[lane] = (src0[lane] << bits(src1[lane], 4, 0))
 7452 | src2[lane];
 7453 }
 7454 }
 7455
 7456 vdst.write();
 7457 } // execute
 7458 // --- Inst_VOP3__V_AND_OR_B32 class methods ---
 7459
 7461 : Inst_VOP3A(iFmt, "v_and_or_b32", false)
 7462 {
 7463 setFlag(ALU);
 7464 } // Inst_VOP3__V_AND_OR_B32
 7465
 7467 {
 7468 } // ~Inst_VOP3__V_AND_OR_B32
 7469
 7470 // --- description from .arch file ---
 7471 // D.u = (S0.u & S1.u) | S2.u.
 7472 // Input and output modifiers not supported.
 7473 void
 7475 {
 7476 Wavefront *wf = gpuDynInst->wavefront();
 7477 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
 7478 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
 7479 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
 7480 VecOperandU32 vdst(gpuDynInst, instData.VDST);
 7481
 7482 src0.readSrc();
 7483 src1.readSrc();
 7484 src2.readSrc();
 7485
 7489 assert(!(instData.ABS & 0x1));
 7490 assert(!(instData.ABS & 0x2));
 7491 assert(!(instData.ABS & 0x4));
 7492 assert(!(extData.NEG & 0x1));
 7493 assert(!(extData.NEG & 0x2));
 7494 assert(!(extData.NEG & 0x4));
 7495
 7496 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 7497 if (wf->execMask(lane)) {
 7498 vdst[lane] = (src0[lane] & src1[lane]) | src2[lane];
 7499 }
 7500 }
 7501
 7502 vdst.write();
 7503 } // execute
 7504 // --- Inst_VOP3__V_MAD_F16 class methods ---
 7505
// v_mad_f16 is declared but its execute() body appears empty in this
// listing. NOTE(review): the doc generator drops hyperlinked lines; the
// missing body line was most likely an unimplemented-panic call — confirm.
 7507 : Inst_VOP3A(iFmt, "v_mad_f16", false)
 7508 {
 7509 setFlag(ALU);
 7510 setFlag(F16);
 7511 setFlag(MAD);
 7512 } // Inst_VOP3__V_MAD_F16
 7513
 7515 {
 7516 } // ~Inst_VOP3__V_MAD_F16
 7517
 7518 // --- description from .arch file ---
 7519 // D.f16 = S0.f16 * S1.f16 + S2.f16.
 7520 // Supports round mode, exception flags, saturation.
 7521 void
 7523 {
 7525 } // execute
 7526 // --- Inst_VOP3__V_MAD_U16 class methods ---
 7527
 7529 : Inst_VOP3A(iFmt, "v_mad_u16", false)
 7530 {
 7531 setFlag(ALU);
 7532 setFlag(MAD);
 7533 } // Inst_VOP3__V_MAD_U16
 7534
 7536 {
 7537 } // ~Inst_VOP3__V_MAD_U16
 7538
 7539 // --- description from .arch file ---
 7540 // D.u16 = S0.u16 * S1.u16 + S2.u16.
 7541 // Supports saturation (unsigned 16-bit integer domain).
 7542 void
 7544 {
 7545 Wavefront *wf = gpuDynInst->wavefront();
 7546 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
 7547 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
 7548 ConstVecOperandU16 src2(gpuDynInst, extData.SRC2);
 7549 VecOperandU16 vdst(gpuDynInst, instData.VDST);
 7550
 7551 src0.readSrc();
 7552 src1.readSrc();
 7553 src2.readSrc();
 7554
// ABS/NEG input modifiers are not supported for integer MADs.
 7558 assert(!(instData.ABS & 0x1));
 7559 assert(!(instData.ABS & 0x2));
 7560 assert(!(instData.ABS & 0x4));
 7561 assert(!(extData.NEG & 0x1));
 7562 assert(!(extData.NEG & 0x2));
 7563 assert(!(extData.NEG & 0x4));
 7564
 7565 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 7566 if (wf->execMask(lane)) {
 7567 vdst[lane] = src0[lane] * src1[lane] + src2[lane];
 7568 }
 7569 }
 7570
 7571 vdst.write();
 7572 } // execute
 7573 // --- Inst_VOP3__V_MAD_I16 class methods ---
 7574
 7576 : Inst_VOP3A(iFmt, "v_mad_i16", false)
 7577 {
 7578 setFlag(ALU);
 7579 setFlag(MAD);
 7580 } // Inst_VOP3__V_MAD_I16
 7581
 7583 {
 7584 } // ~Inst_VOP3__V_MAD_I16
 7585
 7586 // --- description from .arch file ---
 7587 // D.i16 = S0.i16 * S1.i16 + S2.i16.
 7588 // Supports saturation (signed 16-bit integer domain).
 7589 void
 7591 {
 7592 Wavefront *wf = gpuDynInst->wavefront();
 7593 ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
 7594 ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
 7595 ConstVecOperandI16 src2(gpuDynInst, extData.SRC2);
 7596 VecOperandI16 vdst(gpuDynInst, instData.VDST);
 7597
 7598 src0.readSrc();
 7599 src1.readSrc();
 7600 src2.readSrc();
 7601
// ABS/NEG input modifiers are not supported for integer MADs.
 7605 assert(!(instData.ABS & 0x1));
 7606 assert(!(instData.ABS & 0x2));
 7607 assert(!(instData.ABS & 0x4));
 7608 assert(!(extData.NEG & 0x1));
 7609 assert(!(extData.NEG & 0x2));
 7610 assert(!(extData.NEG & 0x4));
 7611
 7612 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 7613 if (wf->execMask(lane)) {
 7614 vdst[lane] = src0[lane] * src1[lane] + src2[lane];
 7615 }
 7616 }
 7617
 7618 vdst.write();
 7619 } // execute
 7620 // --- Inst_VOP3__V_PERM_B32 class methods ---
 7621
 7623 : Inst_VOP3A(iFmt, "v_perm_b32", false)
 7624 {
 7625 setFlag(ALU);
 7626 } // Inst_VOP3__V_PERM_B32
 7627
 7629 {
 7630 } // ~Inst_VOP3__V_PERM_B32
 7631
 7632 // --- description from .arch file ---
 7633 // D.u[31:24] = permute({S0.u, S1.u}, S2.u[31:24]);
 7634 // D.u[23:16] = permute({S0.u, S1.u}, S2.u[23:16]);
 7635 // D.u[15:8] = permute({S0.u, S1.u}, S2.u[15:8]);
 7636 // D.u[7:0] = permute({S0.u, S1.u}, S2.u[7:0]);
 7637 // byte permute(byte in[8], byte sel) {
 7638 // if (sel>=13) then return 0xff;
 7639 // elsif(sel==12) then return 0x00;
 7640 // elsif(sel==11) then return in[7][7] * 0xff;
 7641 // elsif(sel==10) then return in[5][7] * 0xff;
 7642 // elsif(sel==9) then return in[3][7] * 0xff;
 7643 // elsif(sel==8) then return in[1][7] * 0xff;
 7644 // else return in[sel];
 7645 // }
 7646 // Byte permute.
 7647 void
 7649 {
 7650 Wavefront *wf = gpuDynInst->wavefront();
 7651 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
 7652 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
 7653 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
 7654 VecOperandU32 vdst(gpuDynInst, instData.VDST);
 7655
 7656 src0.readSrc();
 7657 src1.readSrc();
 7658 src2.readSrc();
 7659
 7660 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 7661 if (wf->execMask(lane)) {
// Build the 64-bit byte pool {S0, S1}: S0 occupies the high 32 bits,
// S1 the low 32 bits, matching the {S0.u, S1.u} notation above.
 7662 VecElemU64 selector = (VecElemU64)src0[lane];
 7663 selector = (selector << 32) | (VecElemU64)src1[lane];
 7664 vdst[lane] = 0;
 7665
 7666 DPRINTF(VEGA, "Executing v_perm_b32 src_0 0x%08x, src_1 "
 7667 "0x%08x, src_2 0x%08x, vdst 0x%08x\n", src0[lane],
 7668 src1[lane], src2[lane], vdst[lane]);
 7669 DPRINTF(VEGA, "Selector: 0x%08x \n", selector);
 7670
// Each of the four control bytes in S2 picks one result byte via the
// permute() helper; assemble them into the destination dword.
 7671 for (int i = 0; i < 4 ; ++i) {
 7672 VecElemU32 permuted_val = permute(selector, 0xFF
 7673 & ((VecElemU32)src2[lane] >> (8 * i)));
 7674 vdst[lane] |= (permuted_val << (8 * i));
 7675 }
 7676
 7677 DPRINTF(VEGA, "v_perm result: 0x%08x\n", vdst[lane]);
 7678 }
 7679 }
 7680
 7681 vdst.write();
 7682 } // execute
// Both F16 opcodes below have empty-looking execute() bodies in this
// listing. NOTE(review): the doc generator drops hyperlinked lines; the
// missing body line was most likely an unimplemented-panic call — confirm.
 7683 // --- Inst_VOP3__V_FMA_F16 class methods ---
 7684
 7686 : Inst_VOP3A(iFmt, "v_fma_f16", false)
 7687 {
 7688 setFlag(ALU);
 7689 setFlag(F16);
 7690 setFlag(FMA);
 7691 } // Inst_VOP3__V_FMA_F16
 7692
 7694 {
 7695 } // ~Inst_VOP3__V_FMA_F16
 7696
 7697 // --- description from .arch file ---
 7698 // D.f16 = S0.f16 * S1.f16 + S2.f16.
 7699 // Fused half precision multiply add.
 7700 void
 7702 {
 7704 } // execute
 7705 // --- Inst_VOP3__V_DIV_FIXUP_F16 class methods ---
 7706
 7708 : Inst_VOP3A(iFmt, "v_div_fixup_f16", false)
 7709 {
 7710 setFlag(ALU);
 7711 setFlag(F16);
 7712 } // Inst_VOP3__V_DIV_FIXUP_F16
 7713
 7715 {
 7716 } // ~Inst_VOP3__V_DIV_FIXUP_F16
 7717
 7718 // --- description from .arch file ---
 7719 // sign_out = sign(S1.f16)^sign(S2.f16);
 7720 // if (S2.f16 == NAN)
 7721 // D.f16 = Quiet(S2.f16);
 7722 // else if (S1.f16 == NAN)
 7723 // D.f16 = Quiet(S1.f16);
 7724 // else if (S1.f16 == S2.f16 == 0)
 7725 // # 0/0
 7726 // D.f16 = pele_nan(0xfe00);
 7727 // else if (abs(S1.f16) == abs(S2.f16) == +-INF)
 7728 // # inf/inf
 7729 // D.f16 = pele_nan(0xfe00);
 7730 // else if (S1.f16 ==0 || abs(S2.f16) == +-INF)
 7731 // # x/0, or inf/y
 7732 // D.f16 = sign_out ? -INF : INF;
 7733 // else if (abs(S1.f16) == +-INF || S2.f16 == 0)
 7734 // # x/inf, 0/y
 7735 // D.f16 = sign_out ? -0 : 0;
 7736 // else if ((exp(S2.f16) - exp(S1.f16)) < -150)
 7737 // D.f16 = sign_out ? -underflow : underflow;
 7738 // else if (exp(S1.f16) == 255)
 7739 // D.f16 = sign_out ? -overflow : overflow;
 7740 // else
 7741 // D.f16 = sign_out ? -abs(S0.f16) : abs(S0.f16).
 7742 // Half precision division fixup.
 7743 // S0 = Quotient, S1 = Denominator, S3 = Numerator.
 7744 // Given a numerator, denominator, and quotient from a divide, this opcode
 7745 // will detect and apply special case numerics, touching up the quotient if
 7746 // necessary. This opcode also generates invalid, denorm and divide by
 7747 // zero exceptions caused by the division.
 7748 void
 7750 {
 7752 } // execute
 7753 // --- Inst_VOP3__V_LSHL_ADD_U64 class methods ---
 7754
 7756 : Inst_VOP3A(iFmt, "v_lshl_add_u64", false)
 7757 {
 7758 setFlag(ALU);
 7759 } // Inst_VOP3__V_LSHL_ADD_U64
 7760
 7762 {
 7763 } // ~Inst_VOP3__V_LSHL_ADD_U64
 7764
 7765 // --- description from .arch file ---
 7766 // D.u = (S0.u << S1.u[4:0]) + S2.u.
 7767 void
 7769 {
 7770 Wavefront *wf = gpuDynInst->wavefront();
 7771 ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
 7772 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
 7773 ConstVecOperandU64 src2(gpuDynInst, extData.SRC2);
 7774 VecOperandU64 vdst(gpuDynInst, instData.VDST);
 7775
 7776 src0.readSrc();
 7777 src1.readSrc();
 7778 src2.readSrc();
 7779
// ABS/NEG input modifiers are not supported for this op.
 7783 assert(!(instData.ABS & 0x1));
 7784 assert(!(instData.ABS & 0x2));
 7785 assert(!(instData.ABS & 0x4));
 7786 assert(!(extData.NEG & 0x1));
 7787 assert(!(extData.NEG & 0x2));
 7788 assert(!(extData.NEG & 0x4));
 7789
 7790 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 7791 if (wf->execMask(lane)) {
// NOTE(review): the code takes a 3-bit shift field (bits [2:0]) and maps
// values > 4 to a shift of 0, while the quoted arch comment above says
// S1.u[4:0] — verify this matches the hardware's handling of shift
// amounts 5-7 before changing either.
 7792 int shift_amount = bits(src1[lane], 2, 0);
 7793 shift_amount = shift_amount > 4 ? 0 : shift_amount;
 7794 vdst[lane] = (src0[lane] << shift_amount)
 7795 + src2[lane];
 7796 }
 7797 }
 7798
 7799 vdst.write();
 7800 } // execute
// The pack-accumulate and parameter-interpolation opcodes below are
// declared but their execute() bodies appear empty (or entirely absent) in
// this listing. NOTE(review): the doc generator drops hyperlinked lines;
// the missing body lines were most likely unimplemented-panic calls —
// confirm against the full source before relying on these opcodes.
 7801 // --- Inst_VOP3__V_CVT_PKACCUM_U8_F32 class methods ---
 7802
 7804 InFmt_VOP3A *iFmt)
 7805 : Inst_VOP3A(iFmt, "v_cvt_pkaccum_u8_f32", false)
 7806 {
 7807 setFlag(ALU);
 7808 setFlag(F32);
 7809 } // Inst_VOP3__V_CVT_PKACCUM_U8_F32
 7810
 7812 {
 7813 } // ~Inst_VOP3__V_CVT_PKACCUM_U8_F32
 7814
 7815 // --- description from .arch file ---
 7816 // byte = S1.u[1:0]; bit = byte * 8;
 7817 // D.u[bit+7:bit] = flt32_to_uint8(S0.f);
 7818 // Pack converted value of S0.f into byte S1 of the destination.
 7819 // SQ translates to V_CVT_PK_U8_F32.
 7820 // Note: this opcode uses src_c to pass destination in as a source.
 7821 void
 7826 // --- Inst_VOP3__V_INTERP_P1_F32 class methods ---
 7827
 7829 : Inst_VOP3A(iFmt, "v_interp_p1_f32", false)
 7830 {
 7831 setFlag(ALU);
 7832 setFlag(F32);
 7833 } // Inst_VOP3__V_INTERP_P1_F32
 7834
 7836 {
 7837 } // ~Inst_VOP3__V_INTERP_P1_F32
 7838
 7839 // --- description from .arch file ---
 7840 // D.f = P10 * S.f + P0; parameter interpolation (SQ translates to
 7841 // V_MAD_F32 for SP).
 7842 // CAUTION: when in HALF_LDS mode, D must not be the same GPR as S; if
 7843 // D == S then data corruption will occur.
 7844 // NOTE: In textual representations the I/J VGPR is the first source and
 7845 // the attribute is the second source; however in the VOP3 encoding the
 7846 // attribute is stored in the src0 field and the VGPR is stored in the
 7847 // src1 field.
 7848 void
 7850 {
 7852 } // execute
 7853 // --- Inst_VOP3__V_INTERP_P2_F32 class methods ---
 7854
 7856 : Inst_VOP3A(iFmt, "v_interp_p2_f32", false)
 7857 {
 7858 setFlag(ALU);
 7859 setFlag(F32);
 7860 } // Inst_VOP3__V_INTERP_P2_F32
 7861
 7863 {
 7864 } // ~Inst_VOP3__V_INTERP_P2_F32
 7865
 7866 // --- description from .arch file ---
 7867 // D.f = P20 * S.f + D.f; parameter interpolation (SQ translates to
 7868 // V_MAD_F32 for SP).
 7869 // NOTE: In textual representations the I/J VGPR is the first source and
 7870 // the attribute is the second source; however in the VOP3 encoding the
 7871 // attribute is stored in the src0 field and the VGPR is stored in the
 7872 // src1 field.
 7873 void
 7875 {
 7877 } // execute
 7878 // --- Inst_VOP3__V_INTERP_MOV_F32 class methods ---
 7879
 7881 : Inst_VOP3A(iFmt, "v_interp_mov_f32", false)
 7882 {
 7883 setFlag(ALU);
 7884 setFlag(F32);
 7885 } // Inst_VOP3__V_INTERP_MOV_F32
 7886
 7888 {
 7889 } // ~Inst_VOP3__V_INTERP_MOV_F32
 7890
 7891 // --- description from .arch file ---
 7892 // D.f = {P10,P20,P0}[S.u]; parameter load.
 7893 void
 7895 {
 7897 } // execute
 7898 // --- Inst_VOP3__V_INTERP_P1LL_F16 class methods ---
 7899
 7901 InFmt_VOP3A *iFmt)
 7902 : Inst_VOP3A(iFmt, "v_interp_p1ll_f16", false)
 7903 {
 7904 setFlag(ALU);
 7905 setFlag(F16);
 7906 } // Inst_VOP3__V_INTERP_P1LL_F16
 7907
 7909 {
 7910 } // ~Inst_VOP3__V_INTERP_P1LL_F16
 7911
 7912 // --- description from .arch file ---
 7913 // D.f32 = P10.f16 * S0.f32 + P0.f16.
 7914 // 'LL' stands for 'two LDS arguments'.
 7915 // attr_word selects the high or low half 16 bits of each LDS dword
 7916 // accessed.
 7917 // This opcode is available for 32-bank LDS only.
 7918 // NOTE: In textual representations the I/J VGPR is the first source and
 7919 // the attribute is the second source; however in the VOP3 encoding the
 7920 // attribute is stored in the src0 field and the VGPR is stored in the
 7921 // src1 field.
 7922 void
 7924 {
 7926 } // execute
 7927 // --- Inst_VOP3__V_INTERP_P1LV_F16 class methods ---
 7928
 7930 InFmt_VOP3A *iFmt)
 7931 : Inst_VOP3A(iFmt, "v_interp_p1lv_f16", false)
 7932 {
 7933 setFlag(ALU);
 7934 setFlag(F16);
 7935 } // Inst_VOP3__V_INTERP_P1LV_F16
 7936
 7938 {
 7939 } // ~Inst_VOP3__V_INTERP_P1LV_F16
 7940
 7941 // --- description from .arch file ---
 7942 // D.f32 = P10.f16 * S0.f32 + (S2.u32 >> (attr_word * 16)).f16.
 7943 // 'LV' stands for 'One LDS and one VGPR argument'.
 7944 // S2 holds two parameters, attr_word selects the high or low word of the
 7945 // VGPR for this calculation, as well as the high or low half of the LDS
 7946 // data.
 7947 // Meant for use with 16-bank LDS.
 7948 // NOTE: In textual representations the I/J VGPR is the first source and
 7949 // the attribute is the second source; however in the VOP3 encoding the
 7950 // attribute is stored in the src0 field and the VGPR is stored in the
 7951 // src1 field.
 7952 void
 7954 {
 7956 } // execute
 7957 // --- Inst_VOP3__V_INTERP_P2_F16 class methods ---
 7958
 7960 : Inst_VOP3A(iFmt, "v_interp_p2_f16", false)
 7961 {
 7962 setFlag(ALU);
 7963 setFlag(F16);
 7964 } // Inst_VOP3__V_INTERP_P2_F16
 7965
 7967 {
 7968 } // ~Inst_VOP3__V_INTERP_P2_F16
 7969
 7970 // --- description from .arch file ---
 7971 // D.f16 = P20.f16 * S0.f32 + S2.f32.
 7972 // Final computation. attr_word selects LDS high or low 16bits. Used for
 7973 // both 16- and 32-bank LDS.
 7974 // Result is always written to the 16 LSBs of the destination VGPR.
 7975 // NOTE: In textual representations the I/J VGPR is the first source and
 7976 // the attribute is the second source; however in the VOP3 encoding the
 7977 // attribute is stored in the src0 field and the VGPR is stored in the
 7978 // src1 field.
 7979 void
 7981 {
 7983 } // execute
 7984 // --- Inst_VOP3__V_ADD_F64 class methods ---
 7985
 7987 : Inst_VOP3A(iFmt, "v_add_f64", false)
 7988 {
 7989 setFlag(ALU);
 7990 setFlag(F64);
 7991 } // Inst_VOP3__V_ADD_F64
 7992
 7994 {
 7995 } // ~Inst_VOP3__V_ADD_F64
 7996
 7997 // --- description from .arch file ---
 7998 // D.d = S0.d + S1.d.
// Double-precision add with explicit special-case handling: NaN inputs
// propagate NAN, opposite-signed infinities produce NAN, and
// subnormal/zero inputs are treated as (signed) zero rather than added.
 7999 void
 8001 {
 8002 Wavefront *wf = gpuDynInst->wavefront();
 8003 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
 8004 ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
 8005 VecOperandF64 vdst(gpuDynInst, instData.VDST);
 8006
 8007 src0.readSrc();
 8008 src1.readSrc();
 8009
// Apply VOP3 input modifiers (only src0/src1 exist for this op).
 8010 if (instData.ABS & 0x1) {
 8011 src0.absModifier();
 8012 }
 8013
 8014 if (instData.ABS & 0x2) {
 8015 src1.absModifier();
 8016 }
 8017
 8018 if (extData.NEG & 0x1) {
 8019 src0.negModifier();
 8020 }
 8021
 8022 if (extData.NEG & 0x2) {
 8023 src1.negModifier();
 8024 }
 8025
// There is no third source; its modifier bits must be clear.
 8029 assert(!(instData.ABS & 0x4));
 8030 assert(!(extData.NEG & 0x4));
 8031
 8032 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 8033 if (wf->execMask(lane)) {
// Any NaN input -> NaN result.
 8034 if (std::isnan(src0[lane]) ||
 8035 std::isnan(src1[lane]) ) {
 8036 vdst[lane] = NAN;
// inf + (-inf) is NaN; like-signed infinities pass through.
 8037 } else if (std::isinf(src0[lane]) &&
 8038 std::isinf(src1[lane])) {
 8039 if (std::signbit(src0[lane]) !=
 8040 std::signbit(src1[lane])) {
 8041 vdst[lane] = NAN;
 8042 } else {
 8043 vdst[lane] = src0[lane];
 8044 }
 8045 } else if (std::isinf(src0[lane])) {
 8046 vdst[lane] = src0[lane];
 8047 } else if (std::isinf(src1[lane])) {
 8048 vdst[lane] = src1[lane];
// Subnormal/zero src0: treat as zero. If src1 is also subnormal/zero,
// the result is -0.0 only when both operands are negative (IEEE-style
// signed-zero addition); otherwise the other operand passes through.
 8049 } else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
 8050 std::fpclassify(src0[lane]) == FP_ZERO) {
 8051 if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
 8052 std::fpclassify(src1[lane]) == FP_ZERO) {
 8053 if (std::signbit(src0[lane]) &&
 8054 std::signbit(src1[lane])) {
 8055 vdst[lane] = -0.0;
 8056 } else {
 8057 vdst[lane] = 0.0;
 8058 }
 8059 } else {
 8060 vdst[lane] = src1[lane];
 8061 }
// Mirror case: subnormal/zero src1.
 8062 } else if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
 8063 std::fpclassify(src1[lane]) == FP_ZERO) {
 8064 if (std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
 8065 std::fpclassify(src0[lane]) == FP_ZERO) {
 8066 if (std::signbit(src0[lane]) &&
 8067 std::signbit(src1[lane])) {
 8068 vdst[lane] = -0.0;
 8069 } else {
 8070 vdst[lane] = 0.0;
 8071 }
 8072 } else {
 8073 vdst[lane] = src0[lane];
 8074 }
 8075 } else {
 8076 vdst[lane] = src0[lane] + src1[lane];
 8077 }
 8078 }
 8079 }
 8080
 8081 vdst.write();
 8082 } // execute
 8083 // --- Inst_VOP3__V_MUL_F64 class methods ---
 8084
 8086 : Inst_VOP3A(iFmt, "v_mul_f64", false)
 8087 {
 8088 setFlag(ALU);
 8089 setFlag(F64);
 8090 } // Inst_VOP3__V_MUL_F64
 8091
 8093 {
 8094 } // ~Inst_VOP3__V_MUL_F64
 8095
 8096 // --- description from .arch file ---
 8097 // D.d = S0.d * S1.d.
// Double-precision multiply with explicit special-case handling: NaN
// inputs propagate NAN, zero/subnormal times infinity is NAN, and the
// sign of zero/infinity results follows the XOR of the operand signs.
 8098 void
 8100 {
 8101 Wavefront *wf = gpuDynInst->wavefront();
 8102 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
 8103 ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
 8104 VecOperandF64 vdst(gpuDynInst, instData.VDST);
 8105
 8106 src0.readSrc();
 8107 src1.readSrc();
 8108
// Apply VOP3 input modifiers (only src0/src1 exist for this op).
 8109 if (instData.ABS & 0x1) {
 8110 src0.absModifier();
 8111 }
 8112
 8113 if (instData.ABS & 0x2) {
 8114 src1.absModifier();
 8115 }
 8116
 8117 if (extData.NEG & 0x1) {
 8118 src0.negModifier();
 8119 }
 8120
 8121 if (extData.NEG & 0x2) {
 8122 src1.negModifier();
 8123 }
 8124
// There is no third source; its modifier bits must be clear.
 8128 assert(!(instData.ABS & 0x4));
 8129 assert(!(extData.NEG & 0x4));
 8130
 8131 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 8132 if (wf->execMask(lane)) {
 8133 if (std::isnan(src0[lane]) ||
 8134 std::isnan(src1[lane])) {
 8135 vdst[lane] = NAN;
// +0 (or positive subnormal, flushed to zero) times src1: inf -> NaN,
// otherwise a zero whose sign is that of src1.
 8136 } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
 8137 std::fpclassify(src0[lane]) == FP_ZERO) &&
 8138 !std::signbit(src0[lane])) {
 8139 if (std::isinf(src1[lane])) {
 8140 vdst[lane] = NAN;
 8141 } else if (!std::signbit(src1[lane])) {
 8142 vdst[lane] = +0.0;
 8143 } else {
 8144 vdst[lane] = -0.0;
 8145 }
// -0 (or negative subnormal) times src1: sign of the zero result flips.
 8146 } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
 8147 std::fpclassify(src0[lane]) == FP_ZERO) &&
 8148 std::signbit(src0[lane])) {
 8149 if (std::isinf(src1[lane])) {
 8150 vdst[lane] = NAN;
 8151 } else if (std::signbit(src1[lane])) {
 8152 vdst[lane] = +0.0;
 8153 } else {
 8154 vdst[lane] = -0.0;
 8155 }
// +inf times src1: zero/subnormal -> NaN, otherwise signed infinity.
 8156 } else if (std::isinf(src0[lane]) &&
 8157 !std::signbit(src0[lane])) {
 8158 if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
 8159 std::fpclassify(src1[lane]) == FP_ZERO) {
 8160 vdst[lane] = NAN;
 8161 } else if (!std::signbit(src1[lane])) {
 8162 vdst[lane] = +INFINITY;
 8163 } else {
 8164 vdst[lane] = -INFINITY;
 8165 }
// -inf times src1: mirror of the case above.
 8166 } else if (std::isinf(src0[lane]) &&
 8167 std::signbit(src0[lane])) {
 8168 if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
 8169 std::fpclassify(src1[lane]) == FP_ZERO) {
 8170 vdst[lane] = NAN;
 8171 } else if (std::signbit(src1[lane])) {
 8172 vdst[lane] = +INFINITY;
 8173 } else {
 8174 vdst[lane] = -INFINITY;
 8175 }
 8176 } else {
 8177 vdst[lane] = src0[lane] * src1[lane];
 8178 }
 8179 }
 8180 }
 8181
 8182 vdst.write();
 8183 } // execute
 8184 // --- Inst_VOP3__V_MIN_F64 class methods ---
 8185
 8187 : Inst_VOP3A(iFmt, "v_min_f64", false)
 8188 {
 8189 setFlag(ALU);
 8190 setFlag(F64);
 8191 } // Inst_VOP3__V_MIN_F64
 8192
 8194 {
 8195 } // ~Inst_VOP3__V_MIN_F64
 8196
 8197 // --- description from .arch file ---
 8198 // D.d = min(S0.d, S1.d).
// Implemented with std::fmin, which returns the non-NaN operand when
// exactly one input is NaN.
 8199 void
 8201 {
 8202 Wavefront *wf = gpuDynInst->wavefront();
 8203 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
 8204 ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
 8205 VecOperandF64 vdst(gpuDynInst, instData.VDST);
 8206
 8207 src0.readSrc();
 8208 src1.readSrc();
 8209
// Apply VOP3 input modifiers (only src0/src1 exist for this op).
 8210 if (instData.ABS & 0x1) {
 8211 src0.absModifier();
 8212 }
 8213
 8214 if (instData.ABS & 0x2) {
 8215 src1.absModifier();
 8216 }
 8217
 8218 if (extData.NEG & 0x1) {
 8219 src0.negModifier();
 8220 }
 8221
 8222 if (extData.NEG & 0x2) {
 8223 src1.negModifier();
 8224 }
 8225
// There is no third source; its modifier bits must be clear.
 8229 assert(!(instData.ABS & 0x4));
 8230 assert(!(extData.NEG & 0x4));
 8231
 8232 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 8233 if (wf->execMask(lane)) {
 8234 vdst[lane] = std::fmin(src0[lane], src1[lane]);
 8235 }
 8236 }
 8237
 8238 vdst.write();
 8239 } // execute
 8240 // --- Inst_VOP3__V_MAX_F64 class methods ---
 8241
 8243 : Inst_VOP3A(iFmt, "v_max_f64", false)
 8244 {
 8245 setFlag(ALU);
 8246 setFlag(F64);
 8247 } // Inst_VOP3__V_MAX_F64
 8248
 8250 {
 8251 } // ~Inst_VOP3__V_MAX_F64
 8252
 8253 // --- description from .arch file ---
 8254 // D.d = max(S0.d, S1.d).
// Mirror of V_MIN_F64 above, using std::fmax.
 8255 void
 8257 {
 8258 Wavefront *wf = gpuDynInst->wavefront();
 8259 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
 8260 ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
 8261 VecOperandF64 vdst(gpuDynInst, instData.VDST);
 8262
 8263 src0.readSrc();
 8264 src1.readSrc();
 8265
// Apply VOP3 input modifiers (only src0/src1 exist for this op).
 8266 if (instData.ABS & 0x1) {
 8267 src0.absModifier();
 8268 }
 8269
 8270 if (instData.ABS & 0x2) {
 8271 src1.absModifier();
 8272 }
 8273
 8274 if (extData.NEG & 0x1) {
 8275 src0.negModifier();
 8276 }
 8277
 8278 if (extData.NEG & 0x2) {
 8279 src1.negModifier();
 8280 }
 8281
// There is no third source; its modifier bits must be clear.
 8285 assert(!(instData.ABS & 0x4));
 8286 assert(!(extData.NEG & 0x4));
 8287
 8288 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
 8289 if (wf->execMask(lane)) {
 8290 vdst[lane] = std::fmax(src0[lane], src1[lane]);
 8291 }
 8292 }
 8293
 8294 vdst.write();
 8295 } // execute
8296 // --- Inst_VOP3__V_LDEXP_F64 class methods ---
8297
8299 : Inst_VOP3A(iFmt, "v_ldexp_f64", false)
8300 {
8301 setFlag(ALU);
8302 setFlag(F64);
8303 } // Inst_VOP3__V_LDEXP_F64
8304
8306 {
8307 } // ~Inst_VOP3__V_LDEXP_F64
8308
8309 // --- description from .arch file ---
8310 // D.d = pow(S0.d, S1.i[31:0]).
8311 void
8313 {
8314 Wavefront *wf = gpuDynInst->wavefront();
8315 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
8316 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
8317 VecOperandF64 vdst(gpuDynInst, instData.VDST);
8318
8319 src0.readSrc();
8320 src1.readSrc();
8321
8322 if (instData.ABS & 0x1) {
8323 src0.absModifier();
8324 }
8325
8326 if (extData.NEG & 0x1) {
8327 src0.negModifier();
8328 }
8329
8333 assert(!(instData.ABS & 0x2));
8334 assert(!(instData.ABS & 0x4));
8335 assert(!(extData.NEG & 0x2));
8336 assert(!(extData.NEG & 0x4));
8337
8338 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8339 if (wf->execMask(lane)) {
8340 if (std::isnan(src0[lane]) || std::isinf(src0[lane])) {
8341 vdst[lane] = src0[lane];
8342 } else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
8343 || std::fpclassify(src0[lane]) == FP_ZERO) {
8344 if (std::signbit(src0[lane])) {
8345 vdst[lane] = -0.0;
8346 } else {
8347 vdst[lane] = +0.0;
8348 }
8349 } else {
8350 vdst[lane] = std::ldexp(src0[lane], src1[lane]);
8351 }
8352 }
8353 }
8354
8355 vdst.write();
8356 } // execute
8357 // --- Inst_VOP3__V_MUL_LO_U32 class methods ---
8358
8360 : Inst_VOP3A(iFmt, "v_mul_lo_u32", false)
8361 {
8362 setFlag(ALU);
8363 } // Inst_VOP3__V_MUL_LO_U32
8364
8366 {
8367 } // ~Inst_VOP3__V_MUL_LO_U32
8368
8369 // --- description from .arch file ---
8370 // D.u = S0.u * S1.u.
8371 void
8373 {
8374 Wavefront *wf = gpuDynInst->wavefront();
8375 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
8376 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
8377 VecOperandU32 vdst(gpuDynInst, instData.VDST);
8378
8379 src0.readSrc();
8380 src1.readSrc();
8381
8385 assert(!(instData.ABS & 0x1));
8386 assert(!(instData.ABS & 0x2));
8387 assert(!(instData.ABS & 0x4));
8388 assert(!(extData.NEG & 0x1));
8389 assert(!(extData.NEG & 0x2));
8390 assert(!(extData.NEG & 0x4));
8391
8392 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8393 if (wf->execMask(lane)) {
8394 VecElemI64 s0 = (VecElemI64)src0[lane];
8395 VecElemI64 s1 = (VecElemI64)src1[lane];
8396 vdst[lane] = (VecElemU32)((s0 * s1) & 0xffffffffLL);
8397 }
8398 }
8399
8400 vdst.write();
8401 } // execute
8402 // --- Inst_VOP3__V_MUL_HI_U32 class methods ---
8403
8405 : Inst_VOP3A(iFmt, "v_mul_hi_u32", false)
8406 {
8407 setFlag(ALU);
8408 } // Inst_VOP3__V_MUL_HI_U32
8409
8411 {
8412 } // ~Inst_VOP3__V_MUL_HI_U32
8413
8414 // --- description from .arch file ---
8415 // D.u = (S0.u * S1.u) >> 32.
8416 void
8418 {
8419 Wavefront *wf = gpuDynInst->wavefront();
8420 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
8421 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
8422 VecOperandU32 vdst(gpuDynInst, instData.VDST);
8423
8424 src0.readSrc();
8425 src1.readSrc();
8426
8430 assert(!(instData.ABS & 0x1));
8431 assert(!(instData.ABS & 0x2));
8432 assert(!(instData.ABS & 0x4));
8433 assert(!(extData.NEG & 0x1));
8434 assert(!(extData.NEG & 0x2));
8435 assert(!(extData.NEG & 0x4));
8436
8437 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8438 if (wf->execMask(lane)) {
8439 VecElemI64 s0 = (VecElemI64)src0[lane];
8440 VecElemI64 s1 = (VecElemI64)src1[lane];
8441 vdst[lane]
8442 = (VecElemU32)(((s0 * s1) >> 32) & 0xffffffffLL);
8443 }
8444 }
8445
8446 vdst.write();
8447 } // execute
8448 // --- Inst_VOP3__V_MUL_HI_I32 class methods ---
8449
8451 : Inst_VOP3A(iFmt, "v_mul_hi_i32", false)
8452 {
8453 setFlag(ALU);
8454 } // Inst_VOP3__V_MUL_HI_I32
8455
8457 {
8458 } // ~Inst_VOP3__V_MUL_HI_I32
8459
8460 // --- description from .arch file ---
8461 // D.i = (S0.i * S1.i) >> 32.
8462 void
8464 {
8465 Wavefront *wf = gpuDynInst->wavefront();
8466 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
8467 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
8468 VecOperandI32 vdst(gpuDynInst, instData.VDST);
8469
8470 src0.readSrc();
8471 src1.readSrc();
8472
8476 assert(!(instData.ABS & 0x1));
8477 assert(!(instData.ABS & 0x2));
8478 assert(!(instData.ABS & 0x4));
8479 assert(!(extData.NEG & 0x1));
8480 assert(!(extData.NEG & 0x2));
8481 assert(!(extData.NEG & 0x4));
8482
8483 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8484 if (wf->execMask(lane)) {
8485 VecElemI64 s0 = (VecElemI64)src0[lane];
8486 VecElemI64 s1 = (VecElemI64)src1[lane];
8487 vdst[lane]
8488 = (VecElemI32)(((s0 * s1) >> 32LL) & 0xffffffffLL);
8489 }
8490 }
8491
8492 vdst.write();
8493 } // execute
8494 // --- Inst_VOP3__V_LDEXP_F32 class methods ---
8495
8497 : Inst_VOP3A(iFmt, "v_ldexp_f32", false)
8498 {
8499 setFlag(ALU);
8500 setFlag(F32);
8501 } // Inst_VOP3__V_LDEXP_F32
8502
8504 {
8505 } // ~Inst_VOP3__V_LDEXP_F32
8506
8507 // --- description from .arch file ---
8508 // D.f = pow(S0.f, S1.i)
8509 void
8511 {
8512 Wavefront *wf = gpuDynInst->wavefront();
8513 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
8514 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
8515 VecOperandF32 vdst(gpuDynInst, instData.VDST);
8516
8517 src0.readSrc();
8518 src1.readSrc();
8519
8523 assert(!(instData.ABS & 0x2));
8524 assert(!(instData.ABS & 0x4));
8525 assert(!(extData.NEG & 0x2));
8526 assert(!(extData.NEG & 0x4));
8527
8528 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8529 if (wf->execMask(lane)) {
8530 vdst[lane] = std::ldexp(src0[lane], src1[lane]);
8531 }
8532 }
8533
8534 vdst.write();
8535 } // execute
8536 // --- Inst_VOP3__V_READLANE_B32 class methods ---
8537
8539 : Inst_VOP3A(iFmt, "v_readlane_b32", true)
8540 {
8541 setFlag(ALU);
8542 setFlag(IgnoreExec);
8543 } // Inst_VOP3__V_READLANE_B32
8544
8546 {
8547 } // ~Inst_VOP3__V_READLANE_B32
8548
8549 // --- description from .arch file ---
8550 // Copy one VGPR value to one SGPR. D = SGPR-dest, S0 = Source Data (VGPR#
8551 // or M0(lds-direct)), S1 = Lane Select (SGPR or M0). Ignores exec mask.
8552 // Input and output modifiers not supported; this is an untyped operation.
8553 void
8555 {
8556 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
8557 ConstScalarOperandU32 src1(gpuDynInst, extData.SRC1);
8558 ScalarOperandU32 sdst(gpuDynInst, instData.VDST);
8559
8560 src0.readSrc();
8561 src1.read();
8562
8563 sdst = src0[src1.rawData() & 0x3f];
8564
8565 sdst.write();
8566 } // execute
8567 // --- Inst_VOP3__V_WRITELANE_B32 class methods ---
8568
8570 : Inst_VOP3A(iFmt, "v_writelane_b32", false)
8571 {
8572 setFlag(ALU);
8573 setFlag(IgnoreExec);
8574 } // Inst_VOP3__V_WRITELANE_B32
8575
8577 {
8578 } // ~Inst_VOP3__V_WRITELANE_B32
8579
8580 // --- description from .arch file ---
8581 // Write value into one VGPR in one lane. D = VGPR-dest, S0 = Source Data
8582 // (sgpr, m0, exec or constants), S1 = Lane Select (SGPR or M0). Ignores
8583 // exec mask.
8584 // Input and output modifiers not supported; this is an untyped operation.
8585 // SQ translates to V_MOV_B32.
8586 void
8588 {
8589 ConstScalarOperandU32 src0(gpuDynInst, extData.SRC0);
8590 ConstScalarOperandU32 src1(gpuDynInst, extData.SRC1);
8591 VecOperandU32 vdst(gpuDynInst, instData.VDST);
8592
8593 src0.read();
8594 src1.read();
8595 vdst.read();
8596
8597 vdst[src1.rawData() & 0x3f] = src0.rawData();
8598
8599 vdst.write();
8600 } // execute
8601 // --- Inst_VOP3__V_BCNT_U32_B32 class methods ---
8602
8604 : Inst_VOP3A(iFmt, "v_bcnt_u32_b32", false)
8605 {
8606 setFlag(ALU);
8607 } // Inst_VOP3__V_BCNT_U32_B32
8608
8610 {
8611 } // ~Inst_VOP3__V_BCNT_U32_B32
8612
8613 // --- description from .arch file ---
8614 // D.u = CountOneBits(S0.u) + S1.u. Bit count.
8615 void
8617 {
8618 Wavefront *wf = gpuDynInst->wavefront();
8619 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
8620 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
8621 VecOperandU32 vdst(gpuDynInst, instData.VDST);
8622
8623 src0.readSrc();
8624 src1.readSrc();
8625
8629 assert(!(instData.ABS & 0x1));
8630 assert(!(instData.ABS & 0x2));
8631 assert(!(instData.ABS & 0x4));
8632 assert(!(extData.NEG & 0x1));
8633 assert(!(extData.NEG & 0x2));
8634 assert(!(extData.NEG & 0x4));
8635
8636 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8637 if (wf->execMask(lane)) {
8638 vdst[lane] = popCount(src0[lane]) + src1[lane];
8639 }
8640 }
8641
8642 vdst.write();
8643 } // execute
8644 // --- Inst_VOP3__V_MBCNT_LO_U32_B32 class methods ---
8645
8647 InFmt_VOP3A *iFmt)
8648 : Inst_VOP3A(iFmt, "v_mbcnt_lo_u32_b32", false)
8649 {
8650 setFlag(ALU);
8651 } // Inst_VOP3__V_MBCNT_LO_U32_B32
8652
8654 {
8655 } // ~Inst_VOP3__V_MBCNT_LO_U32_B32
8656
8657 // --- description from .arch file ---
8658 // ThreadMask = (1 << ThreadPosition) - 1;
8659 // D.u = CountOneBits(S0.u & ThreadMask[31:0]) + S1.u.
8660 // Masked bit count, ThreadPosition is the position of this thread in the
8661 // --- wavefront (in 0..63).
8662 void
8664 {
8665 Wavefront *wf = gpuDynInst->wavefront();
8666 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
8667 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
8668 VecOperandU32 vdst(gpuDynInst, instData.VDST);
8669 uint64_t threadMask = 0;
8670
8671 src0.readSrc();
8672 src1.readSrc();
8673
8677 assert(!(instData.ABS & 0x1));
8678 assert(!(instData.ABS & 0x2));
8679 assert(!(instData.ABS & 0x4));
8680 assert(!(extData.NEG & 0x1));
8681 assert(!(extData.NEG & 0x2));
8682 assert(!(extData.NEG & 0x4));
8683
8684 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8685 if (wf->execMask(lane)) {
8686 threadMask = ((1ULL << lane) - 1ULL);
8687 vdst[lane] = popCount(src0[lane] & bits(threadMask, 31, 0)) +
8688 src1[lane];
8689 }
8690 }
8691
8692 vdst.write();
8693 } // execute
8694 // --- Inst_VOP3__V_MBCNT_HI_U32_B32 class methods ---
8695
8697 InFmt_VOP3A *iFmt)
8698 : Inst_VOP3A(iFmt, "v_mbcnt_hi_u32_b32", false)
8699 {
8700 setFlag(ALU);
8701 } // Inst_VOP3__V_MBCNT_HI_U32_B32
8702
8704 {
8705 } // ~Inst_VOP3__V_MBCNT_HI_U32_B32
8706
8707 // --- description from .arch file ---
8708 // ThreadMask = (1 << ThreadPosition) - 1;
8709 // D.u = CountOneBits(S0.u & ThreadMask[63:32]) + S1.u.
8710 // Masked bit count, ThreadPosition is the position of this thread in the
8711 // --- wavefront (in 0..63).
8712 void
8714 {
8715 Wavefront *wf = gpuDynInst->wavefront();
8716 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
8717 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
8718 VecOperandU32 vdst(gpuDynInst, instData.VDST);
8719 uint64_t threadMask = 0;
8720
8721 src0.readSrc();
8722 src1.readSrc();
8723
8727 assert(!(instData.ABS & 0x1));
8728 assert(!(instData.ABS & 0x2));
8729 assert(!(instData.ABS & 0x4));
8730 assert(!(extData.NEG & 0x1));
8731 assert(!(extData.NEG & 0x2));
8732 assert(!(extData.NEG & 0x4));
8733
8734 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8735 if (wf->execMask(lane)) {
8736 threadMask = ((1ULL << lane) - 1ULL);
8737 vdst[lane] = popCount(src0[lane] & bits(threadMask, 63, 32)) +
8738 src1[lane];
8739 }
8740 }
8741
8742 vdst.write();
8743 } // execute
8744 // --- Inst_VOP3__V_LSHLREV_B64 class methods ---
8745
8747 : Inst_VOP3A(iFmt, "v_lshlrev_b64", false)
8748 {
8749 setFlag(ALU);
8750 } // Inst_VOP3__V_LSHLREV_B64
8751
8753 {
8754 } // ~Inst_VOP3__V_LSHLREV_B64
8755
8756 // --- description from .arch file ---
8757 // D.u64 = S1.u64 << S0.u[5:0].
8758 // SQ translates this to an internal SP opcode.
8759 void
8761 {
8762 Wavefront *wf = gpuDynInst->wavefront();
8763 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
8764 ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
8765 VecOperandU64 vdst(gpuDynInst, instData.VDST);
8766
8767 src0.readSrc();
8768 src1.readSrc();
8769
8773 assert(!(instData.ABS & 0x1));
8774 assert(!(instData.ABS & 0x2));
8775 assert(!(instData.ABS & 0x4));
8776 assert(!(extData.NEG & 0x1));
8777 assert(!(extData.NEG & 0x2));
8778 assert(!(extData.NEG & 0x4));
8779
8780 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8781 if (wf->execMask(lane)) {
8782 vdst[lane] = src1[lane] << bits(src0[lane], 5, 0);
8783 }
8784 }
8785
8786 vdst.write();
8787 } // execute
8788 // --- Inst_VOP3__V_LSHRREV_B64 class methods ---
8789
8791 : Inst_VOP3A(iFmt, "v_lshrrev_b64", false)
8792 {
8793 setFlag(ALU);
8794 } // Inst_VOP3__V_LSHRREV_B64
8795
8797 {
8798 } // ~Inst_VOP3__V_LSHRREV_B64
8799
8800 // --- description from .arch file ---
8801 // D.u64 = S1.u64 >> S0.u[5:0].
8802 // The vacated bits are set to zero.
8803 // SQ translates this to an internal SP opcode.
8804 void
8806 {
8807 Wavefront *wf = gpuDynInst->wavefront();
8808 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
8809 ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
8810 VecOperandU64 vdst(gpuDynInst, instData.VDST);
8811
8812 src0.readSrc();
8813 src1.readSrc();
8814
8818 assert(!(instData.ABS & 0x1));
8819 assert(!(instData.ABS & 0x2));
8820 assert(!(instData.ABS & 0x4));
8821 assert(!(extData.NEG & 0x1));
8822 assert(!(extData.NEG & 0x2));
8823 assert(!(extData.NEG & 0x4));
8824
8825 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8826 if (wf->execMask(lane)) {
8827 vdst[lane] = src1[lane] >> bits(src0[lane], 5, 0);
8828 }
8829 }
8830
8831 vdst.write();
8832 } // execute
8833 // --- Inst_VOP3__V_ASHRREV_I64 class methods ---
8834
8836 : Inst_VOP3A(iFmt, "v_ashrrev_i64", false)
8837 {
8838 setFlag(ALU);
8839 } // Inst_VOP3__V_ASHRREV_I64
8840
8842 {
8843 } // ~Inst_VOP3__V_ASHRREV_I64
8844
8845 // --- description from .arch file ---
8846 // D.u64 = signext(S1.u64) >> S0.u[5:0].
8847 // The vacated bits are set to the sign bit of the input value.
8848 // SQ translates this to an internal SP opcode.
8849 void
8851 {
8852 Wavefront *wf = gpuDynInst->wavefront();
8853 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
8854 ConstVecOperandI64 src1(gpuDynInst, extData.SRC1);
8855 VecOperandU64 vdst(gpuDynInst, instData.VDST);
8856
8857 src0.readSrc();
8858 src1.readSrc();
8859
8863 assert(!(instData.ABS & 0x1));
8864 assert(!(instData.ABS & 0x2));
8865 assert(!(instData.ABS & 0x4));
8866 assert(!(extData.NEG & 0x1));
8867 assert(!(extData.NEG & 0x2));
8868 assert(!(extData.NEG & 0x4));
8869
8870 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8871 if (wf->execMask(lane)) {
8872 vdst[lane]
8873 = src1[lane] >> bits(src0[lane], 5, 0);
8874 }
8875 }
8876
8877 vdst.write();
8878 } // execute
8879 // --- Inst_VOP3__V_TRIG_PREOP_F64 class methods ---
8880
8882 : Inst_VOP3A(iFmt, "v_trig_preop_f64", false)
8883 {
8884 setFlag(ALU);
8885 setFlag(F64);
8886 } // Inst_VOP3__V_TRIG_PREOP_F64
8887
8889 {
8890 } // ~Inst_VOP3__V_TRIG_PREOP_F64
8891
8892 // --- description from .arch file ---
8893 // D.d = Look Up 2/PI (S0.d) with segment select S1.u[4:0]. This operation
8894 // returns an aligned, double precision segment of 2/PI needed to do range
8895 // reduction on S0.d (double-precision value). Multiple segments can be
8896 // specified through S1.u[4:0]. Rounding is always round-to-zero. Large
8897 // inputs (exp > 1968) are scaled to avoid loss of precision through
8898 // denormalization.
8899 void
8901 {
8903 } // execute
8904 // --- Inst_VOP3__V_BFM_B32 class methods ---
8905
8907 : Inst_VOP3A(iFmt, "v_bfm_b32", false)
8908 {
8909 setFlag(ALU);
8910 } // Inst_VOP3__V_BFM_B32
8911
8913 {
8914 } // ~Inst_VOP3__V_BFM_B32
8915
8916 // --- description from .arch file ---
8917 // D.u = ((1<<S0.u[4:0])-1) << S1.u[4:0]; S0 is the bitfield width and S1
8918 // is the bitfield offset.
8919 void
8921 {
8922 Wavefront *wf = gpuDynInst->wavefront();
8923 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
8924 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
8925 VecOperandU32 vdst(gpuDynInst, instData.VDST);
8926
8927 src0.readSrc();
8928 src1.readSrc();
8929
8933 assert(!(instData.ABS & 0x1));
8934 assert(!(instData.ABS & 0x2));
8935 assert(!(instData.ABS & 0x4));
8936 assert(!(extData.NEG & 0x1));
8937 assert(!(extData.NEG & 0x2));
8938 assert(!(extData.NEG & 0x4));
8939
8940 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8941 if (wf->execMask(lane)) {
8942 vdst[lane] = ((1 << bits(src0[lane], 4, 0)) - 1)
8943 << bits(src1[lane], 4, 0);
8944 }
8945 }
8946
8947 vdst.write();
8948 } // execute
8949 // --- Inst_VOP3__V_CVT_PKNORM_I16_F32 class methods ---
8950
8952 InFmt_VOP3A *iFmt)
8953 : Inst_VOP3A(iFmt, "v_cvt_pknorm_i16_f32", false)
8954 {
8955 setFlag(ALU);
8956 setFlag(F32);
8957 } // Inst_VOP3__V_CVT_PKNORM_I16_F32
8958
8960 {
8961 } // ~Inst_VOP3__V_CVT_PKNORM_I16_F32
8962
8963 // --- description from .arch file ---
8964 // D = {(snorm)S1.f, (snorm)S0.f}.
8965 void
8970 // --- Inst_VOP3__V_CVT_PKNORM_U16_F32 class methods ---
8971
8973 InFmt_VOP3A *iFmt)
8974 : Inst_VOP3A(iFmt, "v_cvt_pknorm_u16_f32", false)
8975 {
8976 setFlag(ALU);
8977 setFlag(F32);
8978 } // Inst_VOP3__V_CVT_PKNORM_U16_F32
8979
8981 {
8982 } // ~Inst_VOP3__V_CVT_PKNORM_U16_F32
8983
8984 // --- description from .arch file ---
8985 // D = {(unorm)S1.f, (unorm)S0.f}.
8986 void
8991 // --- Inst_VOP3__V_CVT_PKRTZ_F16_F32 class methods ---
8992
8994 InFmt_VOP3A *iFmt)
8995 : Inst_VOP3A(iFmt, "v_cvt_pkrtz_f16_f32", false)
8996 {
8997 setFlag(ALU);
8998 setFlag(F32);
8999 } // Inst_VOP3__V_CVT_PKRTZ_F16_F32
9000
9002 {
9003 } // ~Inst_VOP3__V_CVT_PKRTZ_F16_F32
9004
9005 // --- description from .arch file ---
9006 // D = {flt32_to_flt16(S1.f),flt32_to_flt16(S0.f)}, with round-toward-zero
9007 // --- regardless of current round mode setting in hardware.
9008 // This opcode is intended for use with 16-bit compressed exports.
9009 // See V_CVT_F16_F32 for a version that respects the current rounding mode.
9010 void
9015 // --- Inst_VOP3__V_CVT_PK_U16_U32 class methods ---
9016
9018 : Inst_VOP3A(iFmt, "v_cvt_pk_u16_u32", false)
9019 {
9020 setFlag(ALU);
9021 } // Inst_VOP3__V_CVT_PK_U16_U32
9022
9024 {
9025 } // ~Inst_VOP3__V_CVT_PK_U16_U32
9026
9027 // --- description from .arch file ---
9028 // D = {uint32_to_uint16(S1.u), uint32_to_uint16(S0.u)}.
9029 void
9031 {
9033 } // execute
9034 // --- Inst_VOP3__V_CVT_PK_I16_I32 class methods ---
9035
9037 : Inst_VOP3A(iFmt, "v_cvt_pk_i16_i32", false)
9038 {
9039 setFlag(ALU);
9040 } // Inst_VOP3__V_CVT_PK_I16_I32
9041
9043 {
9044 } // ~Inst_VOP3__V_CVT_PK_I16_I32
9045
9046 // --- description from .arch file ---
9047 // D = {int32_to_int16(S1.i), int32_to_int16(S0.i)}.
9048 void
9050 {
9052 } // execute
9053 // --- Inst_VOP3__V_CVT_PK_FP8_F32 class methods ---
9054
9056 : Inst_VOP3A(iFmt, "v_cvt_pk_fp8_f32", false)
9057 {
9058 setFlag(ALU);
9059 } // Inst_VOP3__V_CVT_PK_FP8_F32
9060
9062 {
9063 } // ~Inst_VOP3__V_CVT_PK_FP8_F32
9064
9065 void
9067 {
9068 Wavefront *wf = gpuDynInst->wavefront();
9069 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
9070 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
9071 VecOperandU32 vdst(gpuDynInst, instData.VDST);
9072
9073 src0.readSrc();
9074 src1.readSrc();
9075 vdst.read(); // Preserve bits
9076
9077 panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
9078 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
9079 panic_if(instData.CLAMP, "CLAMP not supported for %s", _opcode);
9080 panic_if(extData.OMOD, "OMOD not supported for %s", _opcode);
9081
9082 unsigned opsel = instData.OPSEL;
9083 unsigned abs = instData.ABS;
9084 unsigned neg = extData.NEG;
9085
9086 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
9087 if (wf->execMask(lane)) {
9088 AMDGPU::mxfloat8 tmp0(src0[lane]), tmp1(src1[lane]);
9089
9090 if ((abs & 1) && (tmp0 < 0)) tmp0 = -tmp0;
9091 if ((abs & 2) && (tmp1 < 0)) tmp1 = -tmp1;
9092 if (neg & 1) tmp0 = -tmp0;
9093 if (neg & 2) tmp1 = -tmp1;
9094
9095 uint16_t packed_data = (bits(tmp0.data, 31, 24) << 8)
9096 | bits(tmp1.data, 31, 24);
9097
9098 if (opsel & 8) {
9099 replaceBits(vdst[lane], 31, 16, packed_data);
9100 } else {
9101 replaceBits(vdst[lane], 15, 0, packed_data);
9102 }
9103 }
9104 }
9105
9106 vdst.write();
9107 } // execute
9108} // namespace VegaISA
9109} // namespace gem5
#define DPRINTF(x,...)
Definition trace.hh:210
uint32_t data
Definition mxfp.hh:112
void setFlag(Flags flag)
const std::string _opcode
Nop class.
Definition nop.hh:49
T omodModifier(T val, unsigned omod)
void execute(GPUDynInstPtr) override
Definition vop3.cc:7382
void execute(GPUDynInstPtr) override
Definition vop3.cc:1451
void execute(GPUDynInstPtr) override
Definition vop3.cc:1305
void execute(GPUDynInstPtr) override
Definition vop3.cc:1609
Inst_VOP3__V_ADD_F16(InFmt_VOP3A *)
Definition vop3.cc:1594
Inst_VOP3__V_ADD_F32(InFmt_VOP3A *)
Definition vop3.cc:81
void execute(GPUDynInstPtr) override
Definition vop3.cc:95
void execute(GPUDynInstPtr) override
Definition vop3.cc:8000
Inst_VOP3__V_ADD_F64(InFmt_VOP3A *)
Definition vop3.cc:7986
void execute(GPUDynInstPtr) override
Definition vop3.cc:7336
Inst_VOP3__V_ADD_U16(InFmt_VOP3A *)
Definition vop3.cc:1703
void execute(GPUDynInstPtr) override
Definition vop3.cc:1717
void execute(GPUDynInstPtr) override
Definition vop3.cc:2293
Inst_VOP3__V_ADD_U32(InFmt_VOP3A *)
Definition vop3.cc:2280
void execute(GPUDynInstPtr) override
Definition vop3.cc:5808
void execute(GPUDynInstPtr) override
Definition vop3.cc:5856
void execute(GPUDynInstPtr) override
Definition vop3.cc:1065
Inst_VOP3__V_AND_B32(InFmt_VOP3A *)
Definition vop3.cc:1051
void execute(GPUDynInstPtr) override
Definition vop3.cc:7474
void execute(GPUDynInstPtr) override
Definition vop3.cc:1990
void execute(GPUDynInstPtr) override
Definition vop3.cc:977
void execute(GPUDynInstPtr) override
Definition vop3.cc:8850
void execute(GPUDynInstPtr) override
Definition vop3.cc:8616
Inst_VOP3__V_BFE_I32(InFmt_VOP3A *)
Definition vop3.cc:5512
void execute(GPUDynInstPtr) override
Definition vop3.cc:5526
void execute(GPUDynInstPtr) override
Definition vop3.cc:5479
Inst_VOP3__V_BFE_U32(InFmt_VOP3A *)
Definition vop3.cc:5465
Inst_VOP3__V_BFI_B32(InFmt_VOP3A *)
Definition vop3.cc:5566
void execute(GPUDynInstPtr) override
Definition vop3.cc:5579
Inst_VOP3__V_BFM_B32(InFmt_VOP3A *)
Definition vop3.cc:8906
void execute(GPUDynInstPtr) override
Definition vop3.cc:8920
void execute(GPUDynInstPtr) override
Definition vop3.cc:4301
void execute(GPUDynInstPtr) override
Definition vop3.cc:4976
void execute(GPUDynInstPtr) override
Definition vop3.cc:3660
void execute(GPUDynInstPtr) override
Definition vop3.cc:3457
void execute(GPUDynInstPtr) override
Definition vop3.cc:4681
Inst_VOP3__V_CLREXCP(InFmt_VOP3A *)
Definition vop3.cc:4669
Inst_VOP3__V_CNDMASK_B32(InFmt_VOP3A *)
Definition vop3.cc:43
void execute(GPUDynInstPtr) override
Definition vop3.cc:58
void execute(GPUDynInstPtr) override
Definition vop3.cc:5079
Inst_VOP3__V_COS_F16(InFmt_VOP3A *)
Definition vop3.cc:5065
void execute(GPUDynInstPtr) override
Definition vop3.cc:4219
Inst_VOP3__V_COS_F32(InFmt_VOP3A *)
Definition vop3.cc:4203
void execute(GPUDynInstPtr) override
Definition vop3.cc:5396
void execute(GPUDynInstPtr) override
Definition vop3.cc:5459
void execute(GPUDynInstPtr) override
Definition vop3.cc:5417
void execute(GPUDynInstPtr) override
Definition vop3.cc:5438
void execute(GPUDynInstPtr) override
Definition vop3.cc:2858
void execute(GPUDynInstPtr) override
Definition vop3.cc:4723
void execute(GPUDynInstPtr) override
Definition vop3.cc:4702
void execute(GPUDynInstPtr) override
Definition vop3.cc:2914
void execute(GPUDynInstPtr) override
Definition vop3.cc:3064
void execute(GPUDynInstPtr) override
Definition vop3.cc:2635
void execute(GPUDynInstPtr) override
Definition vop3.cc:2677
void execute(GPUDynInstPtr) override
Definition vop3.cc:3160
void execute(GPUDynInstPtr) override
Definition vop3.cc:3200
void execute(GPUDynInstPtr) override
Definition vop3.cc:3240
void execute(GPUDynInstPtr) override
Definition vop3.cc:3280
void execute(GPUDynInstPtr) override
Definition vop3.cc:3112
void execute(GPUDynInstPtr) override
Definition vop3.cc:2595
void execute(GPUDynInstPtr) override
Definition vop3.cc:3376
void execute(GPUDynInstPtr) override
Definition vop3.cc:3003
void execute(GPUDynInstPtr) override
Definition vop3.cc:4765
void execute(GPUDynInstPtr) override
Definition vop3.cc:2775
void execute(GPUDynInstPtr) override
Definition vop3.cc:2543
void execute(GPUDynInstPtr) override
Definition vop3.cc:3043
void execute(GPUDynInstPtr) override
Definition vop3.cc:7822
void execute(GPUDynInstPtr) override
Definition vop3.cc:8966
void execute(GPUDynInstPtr) override
Definition vop3.cc:8987
void execute(GPUDynInstPtr) override
Definition vop3.cc:9011
void execute(GPUDynInstPtr) override
Definition vop3.cc:9066
void execute(GPUDynInstPtr) override
Definition vop3.cc:9049
void execute(GPUDynInstPtr) override
Definition vop3.cc:9030
void execute(GPUDynInstPtr) override
Definition vop3.cc:6566
void execute(GPUDynInstPtr) override
Definition vop3.cc:2962
void execute(GPUDynInstPtr) override
Definition vop3.cc:4744
void execute(GPUDynInstPtr) override
Definition vop3.cc:2719
void execute(GPUDynInstPtr) override
Definition vop3.cc:3322
void execute(GPUDynInstPtr) override
Definition vop3.cc:7749
void execute(GPUDynInstPtr) override
Definition vop3.cc:6623
void execute(GPUDynInstPtr) override
Definition vop3.cc:6701
void execute(GPUDynInstPtr) override
Definition vop3.cc:6939
void execute(GPUDynInstPtr) override
Definition vop3.cc:7002
void execute(GPUDynInstPtr) override
Definition vop3.cc:6796
void execute(GPUDynInstPtr) override
Definition vop3.cc:6853
Inst_VOP3__V_EXP_F16(InFmt_VOP3A *)
Definition vop3.cc:4863
void execute(GPUDynInstPtr) override
Definition vop3.cc:4880
void execute(GPUDynInstPtr) override
Definition vop3.cc:3781
Inst_VOP3__V_EXP_F32(InFmt_VOP3A *)
Definition vop3.cc:3767
void execute(GPUDynInstPtr) override
Definition vop3.cc:5099
void execute(GPUDynInstPtr) override
Definition vop3.cc:4413
void execute(GPUDynInstPtr) override
Definition vop3.cc:4333
void execute(GPUDynInstPtr) override
Definition vop3.cc:4373
void execute(GPUDynInstPtr) override
Definition vop3.cc:4955
void execute(GPUDynInstPtr) override
Definition vop3.cc:3741
void execute(GPUDynInstPtr) override
Definition vop3.cc:3538
void execute(GPUDynInstPtr) override
Definition vop3.cc:2424
Inst_VOP3__V_FMA_F16(InFmt_VOP3A *)
Definition vop3.cc:7685
void execute(GPUDynInstPtr) override
Definition vop3.cc:7701
void execute(GPUDynInstPtr) override
Definition vop3.cc:5627
Inst_VOP3__V_FMA_F32(InFmt_VOP3A *)
Definition vop3.cc:5612
Inst_VOP3__V_FMA_F64(InFmt_VOP3A *)
Definition vop3.cc:5673
void execute(GPUDynInstPtr) override
Definition vop3.cc:5688
void execute(GPUDynInstPtr) override
Definition vop3.cc:5039
void execute(GPUDynInstPtr) override
Definition vop3.cc:3578
void execute(GPUDynInstPtr) override
Definition vop3.cc:4541
void execute(GPUDynInstPtr) override
Definition vop3.cc:4934
void execute(GPUDynInstPtr) override
Definition vop3.cc:4587
void execute(GPUDynInstPtr) override
Definition vop3.cc:4454
void execute(GPUDynInstPtr) override
Definition vop3.cc:4907
void execute(GPUDynInstPtr) override
Definition vop3.cc:4638
void execute(GPUDynInstPtr) override
Definition vop3.cc:4500
void execute(GPUDynInstPtr) override
Definition vop3.cc:7894
void execute(GPUDynInstPtr) override
Definition vop3.cc:7923
void execute(GPUDynInstPtr) override
Definition vop3.cc:7953
void execute(GPUDynInstPtr) override
Definition vop3.cc:7849
void execute(GPUDynInstPtr) override
Definition vop3.cc:7980
void execute(GPUDynInstPtr) override
Definition vop3.cc:7874
void execute(GPUDynInstPtr) override
Definition vop3.cc:2274
void execute(GPUDynInstPtr) override
Definition vop3.cc:8510
void execute(GPUDynInstPtr) override
Definition vop3.cc:8312
void execute(GPUDynInstPtr) override
Definition vop3.cc:5753
Inst_VOP3__V_LERP_U8(InFmt_VOP3A *)
Definition vop3.cc:5734
void execute(GPUDynInstPtr) override
Definition vop3.cc:4857
Inst_VOP3__V_LOG_F16(InFmt_VOP3A *)
Definition vop3.cc:4840
void execute(GPUDynInstPtr) override
Definition vop3.cc:3821
Inst_VOP3__V_LOG_F32(InFmt_VOP3A *)
Definition vop3.cc:3807
void execute(GPUDynInstPtr) override
Definition vop3.cc:5147
void execute(GPUDynInstPtr) override
Definition vop3.cc:1894
void execute(GPUDynInstPtr) override
Definition vop3.cc:1021
void execute(GPUDynInstPtr) override
Definition vop3.cc:8760
void execute(GPUDynInstPtr) override
Definition vop3.cc:7290
void execute(GPUDynInstPtr) override
Definition vop3.cc:7768
void execute(GPUDynInstPtr) override
Definition vop3.cc:7427
void execute(GPUDynInstPtr) override
Definition vop3.cc:1939
void execute(GPUDynInstPtr) override
Definition vop3.cc:932
void execute(GPUDynInstPtr) override
Definition vop3.cc:8805
void execute(GPUDynInstPtr) override
Definition vop3.cc:1697
Inst_VOP3__V_MAC_F16(InFmt_VOP3A *)
Definition vop3.cc:1680
Inst_VOP3__V_MAC_F32(InFmt_VOP3A *)
Definition vop3.cc:1229
void execute(GPUDynInstPtr) override
Definition vop3.cc:1245
void execute(GPUDynInstPtr) override
Definition vop3.cc:7522
Inst_VOP3__V_MAD_F16(InFmt_VOP3A *)
Definition vop3.cc:7506
void execute(GPUDynInstPtr) override
Definition vop3.cc:5241
Inst_VOP3__V_MAD_F32(InFmt_VOP3A *)
Definition vop3.cc:5226
Inst_VOP3__V_MAD_I16(InFmt_VOP3A *)
Definition vop3.cc:7575
void execute(GPUDynInstPtr) override
Definition vop3.cc:7590
void execute(GPUDynInstPtr) override
Definition vop3.cc:5301
void execute(GPUDynInstPtr) override
Definition vop3.cc:7200
void execute(GPUDynInstPtr) override
Definition vop3.cc:5180
void execute(GPUDynInstPtr) override
Definition vop3.cc:7543
Inst_VOP3__V_MAD_U16(InFmt_VOP3A *)
Definition vop3.cc:7528
void execute(GPUDynInstPtr) override
Definition vop3.cc:5348
void execute(GPUDynInstPtr) override
Definition vop3.cc:7151
void execute(GPUDynInstPtr) override
Definition vop3.cc:6059
void execute(GPUDynInstPtr) override
Definition vop3.cc:6119
void execute(GPUDynInstPtr) override
Definition vop3.cc:6165
void execute(GPUDynInstPtr) override
Definition vop3.cc:2036
Inst_VOP3__V_MAX_F16(InFmt_VOP3A *)
Definition vop3.cc:2020
void execute(GPUDynInstPtr) override
Definition vop3.cc:703
Inst_VOP3__V_MAX_F32(InFmt_VOP3A *)
Definition vop3.cc:689
Inst_VOP3__V_MAX_F64(InFmt_VOP3A *)
Definition vop3.cc:8242
void execute(GPUDynInstPtr) override
Definition vop3.cc:8256
void execute(GPUDynInstPtr) override
Definition vop3.cc:2126
Inst_VOP3__V_MAX_I16(InFmt_VOP3A *)
Definition vop3.cc:2113
void execute(GPUDynInstPtr) override
Definition vop3.cc:801
Inst_VOP3__V_MAX_I32(InFmt_VOP3A *)
Definition vop3.cc:788
Inst_VOP3__V_MAX_U16(InFmt_VOP3A *)
Definition vop3.cc:2064
void execute(GPUDynInstPtr) override
Definition vop3.cc:2077
void execute(GPUDynInstPtr) override
Definition vop3.cc:887
Inst_VOP3__V_MAX_U32(InFmt_VOP3A *)
Definition vop3.cc:874
void execute(GPUDynInstPtr) override
Definition vop3.cc:8713
void execute(GPUDynInstPtr) override
Definition vop3.cc:8663
void execute(GPUDynInstPtr) override
Definition vop3.cc:6212
void execute(GPUDynInstPtr) override
Definition vop3.cc:6271
void execute(GPUDynInstPtr) override
Definition vop3.cc:6316
void execute(GPUDynInstPtr) override
Definition vop3.cc:5906
void execute(GPUDynInstPtr) override
Definition vop3.cc:5966
void execute(GPUDynInstPtr) override
Definition vop3.cc:6012
void execute(GPUDynInstPtr) override
Definition vop3.cc:2058
Inst_VOP3__V_MIN_F16(InFmt_VOP3A *)
Definition vop3.cc:2042
void execute(GPUDynInstPtr) override
Definition vop3.cc:647
Inst_VOP3__V_MIN_F32(InFmt_VOP3A *)
Definition vop3.cc:633
Inst_VOP3__V_MIN_F64(InFmt_VOP3A *)
Definition vop3.cc:8186
void execute(GPUDynInstPtr) override
Definition vop3.cc:8200
Inst_VOP3__V_MIN_I16(InFmt_VOP3A *)
Definition vop3.cc:2211
void execute(GPUDynInstPtr) override
Definition vop3.cc:2224
void execute(GPUDynInstPtr) override
Definition vop3.cc:758
Inst_VOP3__V_MIN_I32(InFmt_VOP3A *)
Definition vop3.cc:745
void execute(GPUDynInstPtr) override
Definition vop3.cc:2175
Inst_VOP3__V_MIN_U16(InFmt_VOP3A *)
Definition vop3.cc:2162
Inst_VOP3__V_MIN_U32(InFmt_VOP3A *)
Definition vop3.cc:831
void execute(GPUDynInstPtr) override
Definition vop3.cc:844
void execute(GPUDynInstPtr) override
Definition vop3.cc:2509
Inst_VOP3__V_MOV_B32(InFmt_VOP3A *)
Definition vop3.cc:2495
void execute(GPUDynInstPtr) override
Definition vop3.cc:2837
void execute(GPUDynInstPtr) override
Definition vop3.cc:7109
void execute(GPUDynInstPtr) override
Definition vop3.cc:7129
void execute(GPUDynInstPtr) override
Definition vop3.cc:7068
Inst_VOP3__V_MSAD_U8(InFmt_VOP3A *)
Definition vop3.cc:7055
Inst_VOP3__V_MUL_F16(InFmt_VOP3A *)
Definition vop3.cc:1659
void execute(GPUDynInstPtr) override
Definition vop3.cc:1674
void execute(GPUDynInstPtr) override
Definition vop3.cc:366
Inst_VOP3__V_MUL_F32(InFmt_VOP3A *)
Definition vop3.cc:352
Inst_VOP3__V_MUL_F64(InFmt_VOP3A *)
Definition vop3.cc:8085
void execute(GPUDynInstPtr) override
Definition vop3.cc:8099
void execute(GPUDynInstPtr) override
Definition vop3.cc:510
void execute(GPUDynInstPtr) override
Definition vop3.cc:8463
void execute(GPUDynInstPtr) override
Definition vop3.cc:601
void execute(GPUDynInstPtr) override
Definition vop3.cc:8417
void execute(GPUDynInstPtr) override
Definition vop3.cc:466
void execute(GPUDynInstPtr) override
Definition vop3.cc:265
void execute(GPUDynInstPtr) override
Definition vop3.cc:1850
void execute(GPUDynInstPtr) override
Definition vop3.cc:8372
void execute(GPUDynInstPtr) override
Definition vop3.cc:558
void execute(GPUDynInstPtr) override
Definition vop3.cc:2490
Inst_VOP3__V_NOP(InFmt_VOP3A *)
Definition vop3.cc:2476
void execute(GPUDynInstPtr) override
Definition vop3.cc:4261
Inst_VOP3__V_NOT_B32(InFmt_VOP3A *)
Definition vop3.cc:4247
Inst_VOP3__V_OR3_B32(InFmt_VOP3A *)
Definition vop3.cc:1139
void execute(GPUDynInstPtr) override
Definition vop3.cc:1153
Inst_VOP3__V_OR_B32(InFmt_VOP3A *)
Definition vop3.cc:1095
void execute(GPUDynInstPtr) override
Definition vop3.cc:1109
uint8_t permute(uint64_t in_dword2x, uint32_t sel)
void execute(GPUDynInstPtr) override
Definition vop3.cc:7648
void execute(GPUDynInstPtr) override
Definition vop3.cc:7088
Inst_VOP3__V_RCP_F16(InFmt_VOP3A *)
Definition vop3.cc:4771
void execute(GPUDynInstPtr) override
Definition vop3.cc:4788
Inst_VOP3__V_RCP_F32(InFmt_VOP3A *)
Definition vop3.cc:3855
void execute(GPUDynInstPtr) override
Definition vop3.cc:3869
void execute(GPUDynInstPtr) override
Definition vop3.cc:3991
Inst_VOP3__V_RCP_F64(InFmt_VOP3A *)
Definition vop3.cc:3977
void execute(GPUDynInstPtr) override
Definition vop3.cc:3911
void execute(GPUDynInstPtr) override
Definition vop3.cc:8554
void execute(GPUDynInstPtr) override
Definition vop3.cc:5019
void execute(GPUDynInstPtr) override
Definition vop3.cc:3700
void execute(GPUDynInstPtr) override
Definition vop3.cc:3497
void execute(GPUDynInstPtr) override
Definition vop3.cc:4834
Inst_VOP3__V_RSQ_F16(InFmt_VOP3A *)
Definition vop3.cc:4817
Inst_VOP3__V_RSQ_F32(InFmt_VOP3A *)
Definition vop3.cc:3937
void execute(GPUDynInstPtr) override
Definition vop3.cc:3951
void execute(GPUDynInstPtr) override
Definition vop3.cc:4043
Inst_VOP3__V_RSQ_F64(InFmt_VOP3A *)
Definition vop3.cc:4029
void execute(GPUDynInstPtr) override
Definition vop3.cc:6417
Inst_VOP3__V_SAD_U16(InFmt_VOP3A *)
Definition vop3.cc:6453
void execute(GPUDynInstPtr) override
Definition vop3.cc:6468
Inst_VOP3__V_SAD_U32(InFmt_VOP3A *)
Definition vop3.cc:6503
void execute(GPUDynInstPtr) override
Definition vop3.cc:6517
void execute(GPUDynInstPtr) override
Definition vop3.cc:6364
Inst_VOP3__V_SAD_U8(InFmt_VOP3A *)
Definition vop3.cc:6348
void execute(GPUDynInstPtr) override
Definition vop3.cc:5059
Inst_VOP3__V_SIN_F16(InFmt_VOP3A *)
Definition vop3.cc:5045
Inst_VOP3__V_SIN_F32(InFmt_VOP3A *)
Definition vop3.cc:4159
void execute(GPUDynInstPtr) override
Definition vop3.cc:4175
void execute(GPUDynInstPtr) override
Definition vop3.cc:4811
void execute(GPUDynInstPtr) override
Definition vop3.cc:4093
void execute(GPUDynInstPtr) override
Definition vop3.cc:4133
void execute(GPUDynInstPtr) override
Definition vop3.cc:1560
void execute(GPUDynInstPtr) override
Definition vop3.cc:1506
void execute(GPUDynInstPtr) override
Definition vop3.cc:1402
void execute(GPUDynInstPtr) override
Definition vop3.cc:1653
void execute(GPUDynInstPtr) override
Definition vop3.cc:209
void execute(GPUDynInstPtr) override
Definition vop3.cc:1806
void execute(GPUDynInstPtr) override
Definition vop3.cc:2379
void execute(GPUDynInstPtr) override
Definition vop3.cc:1353
Inst_VOP3__V_SUB_F16(InFmt_VOP3A *)
Definition vop3.cc:1615
void execute(GPUDynInstPtr) override
Definition vop3.cc:1631
void execute(GPUDynInstPtr) override
Definition vop3.cc:152
Inst_VOP3__V_SUB_F32(InFmt_VOP3A *)
Definition vop3.cc:137
Inst_VOP3__V_SUB_U16(InFmt_VOP3A *)
Definition vop3.cc:1747
void execute(GPUDynInstPtr) override
Definition vop3.cc:1761
Inst_VOP3__V_SUB_U32(InFmt_VOP3A *)
Definition vop3.cc:2323
void execute(GPUDynInstPtr) override
Definition vop3.cc:2336
void execute(GPUDynInstPtr) override
Definition vop3.cc:8900
void execute(GPUDynInstPtr) override
Definition vop3.cc:4997
void execute(GPUDynInstPtr) override
Definition vop3.cc:3619
void execute(GPUDynInstPtr) override
Definition vop3.cc:3416
void execute(GPUDynInstPtr) override
Definition vop3.cc:8587
void execute(GPUDynInstPtr) override
Definition vop3.cc:7245
Inst_VOP3__V_XAD_U32(InFmt_VOP3A *)
Definition vop3.cc:7232
Inst_VOP3__V_XOR_B32(InFmt_VOP3A *)
Definition vop3.cc:1185
void execute(GPUDynInstPtr) override
Definition vop3.cc:1199
void read() override
read from and write to the underlying register(s) that this operand is referring to.
Definition operand.hh:409
std::enable_if< Condition, DataType >::type rawData() const
we store scalar data in a std::array, however if we need the full operand data we use this method to ...
Definition operand.hh:392
std::enable_if< Condition, void >::type setBit(int bit, int bit_val)
bit access to scalar data.
Definition operand.hh:491
void read() override
read from the vrf.
Definition operand.hh:147
void readSrc()
certain vector operands can read from the vrf/srf or constants.
Definition operand.hh:131
void write() override
write to the vrf.
Definition operand.hh:199
VectorMask & execMask()
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
Definition bitfield.hh:79
constexpr int popCount(uint64_t val)
Returns the number of set ones in the provided value.
Definition bitfield.hh:415
constexpr uint64_t sext(uint64_t val)
Sign-extend an N-bit value to 64 bits.
Definition bitfield.hh:129
constexpr void replaceBits(T &val, unsigned first, unsigned last, B bit_val)
A convenience function to replace bits first to last of val with bit_val in place.
Definition bitfield.hh:216
std::enable_if_t< std::is_integral_v< T >, T > reverseBits(T val, size_t size=sizeof(T))
Takes a value and returns the bit reversed version.
Definition bitfield.hh:255
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Definition logging.hh:214
Bitfield< 7 > i
Definition misc_types.hh:67
constexpr unsigned NumVecElemPerVecReg
Definition vec.hh:61
ScalarRegI32 firstOppositeSignBit(ScalarRegI32 val)
Definition inst_util.hh:174
ScalarRegI32 findFirstOne(T val)
Definition inst_util.hh:142
T median(T val_0, T val_1, T val_2)
Definition inst_util.hh:247
ScalarRegI32 findFirstOneMsb(T val)
Definition inst_util.hh:153
T roundNearestEven(T val)
Definition inst_util.hh:259
uint32_t VecElemU32
uint64_t VecElemU64
VecElemU32 muladd(VecElemU64 &dst, VecElemU32 val_0, VecElemU32 val_1, VecElemU64 val_2)
Definition inst_util.hh:272
Bitfield< 31, 16 > selector
Definition misc.hh:1038
Copyright (c) 2024 - Pranith Kumar Copyright (c) 2020 Inria All rights reserved.
Definition binary32.hh:36
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition misc.hh:49
constexpr bool isinf(gem5::AMDGPU::fp16_e5m10_info a)
Definition fp16_e5m10.hh:78
constexpr bool isnan(gem5::AMDGPU::fp16_e5m10_info a)
Definition fp16_e5m10.hh:83

Generated on Tue Jun 18 2024 16:23:51 for gem5 by doxygen 1.11.0