// gem5 [DEVELOP-FOR-25.1] -- arch/amdgpu/vega/insts/vop3.cc
// (Doxygen page chrome removed; source listing follows.)
/*
 * Copyright (c) 2024 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
// NOTE(review): include block reconstructed -- original lines 32-35 were
// lost in extraction. Verify against the repository copy of vop3.cc.
#include "arch/amdgpu/vega/insts/instructions.hh"

#include <cmath>

#include "arch/amdgpu/vega/insts/inst_util.hh"
36namespace gem5
37{
38
39namespace VegaISA
40{
41 // --- Inst_VOP3__V_CNDMASK_B32 class methods ---
42
44 : Inst_VOP3A(iFmt, "v_cndmask_b32", false)
45 {
46 setFlag(ALU);
47 setFlag(ReadsVCC);
48 } // Inst_VOP3__V_CNDMASK_B32
49
51 {
52 } // ~Inst_VOP3__V_CNDMASK_B32
53
54 // --- description from .arch file ---
55 // D.u = (VCC[i] ? S1.u : S0.u) (i = threadID in wave); VOP3: specify VCC
56 // as a scalar GPR in S2.
57 void
59 {
60 Wavefront *wf = gpuDynInst->wavefront();
61 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
62 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
63 ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2);
64 VecOperandU32 vdst(gpuDynInst, instData.VDST);
65
66 src0.readSrc();
67 src1.readSrc();
68 vcc.read();
69
70 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
71 if (wf->execMask(lane)) {
72 vdst[lane] = bits(vcc.rawData(), lane)
73 ? src1[lane] : src0[lane];
74 }
75 }
76
77 vdst.write();
78 } // execute
79 // --- Inst_VOP3__V_ADD_F32 class methods ---
80
82 : Inst_VOP3A(iFmt, "v_add_f32", false)
83 {
84 setFlag(ALU);
85 setFlag(F32);
86 } // Inst_VOP3__V_ADD_F32
87
89 {
90 } // ~Inst_VOP3__V_ADD_F32
91
92 // --- description from .arch file ---
93 // D.f = S0.f + S1.f.
94 void
96 {
97 Wavefront *wf = gpuDynInst->wavefront();
98 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
99 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
100 VecOperandF32 vdst(gpuDynInst, instData.VDST);
101
102 src0.readSrc();
103 src1.readSrc();
104
105 if (instData.ABS & 0x1) {
106 src0.absModifier();
107 }
108
109 if (instData.ABS & 0x2) {
110 src1.absModifier();
111 }
112
113 if (extData.NEG & 0x1) {
114 src0.negModifier();
115 }
116
117 if (extData.NEG & 0x2) {
118 src1.negModifier();
119 }
120
124 assert(!(instData.ABS & 0x4));
125 assert(!(extData.NEG & 0x4));
126
127 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
128 if (wf->execMask(lane)) {
129 vdst[lane] = src0[lane] + src1[lane];
130 }
131 }
132
133 vdst.write();
134 } // execute
135 // --- Inst_VOP3__V_SUB_F32 class methods ---
136
138 : Inst_VOP3A(iFmt, "v_sub_f32", false)
139 {
140 setFlag(ALU);
141 setFlag(F32);
142 } // Inst_VOP3__V_SUB_F32
143
145 {
146 } // ~Inst_VOP3__V_SUB_F32
147
148 // --- description from .arch file ---
149 // D.f = S0.f - S1.f.
150 // SQ translates to V_ADD_F32.
151 void
153 {
154 Wavefront *wf = gpuDynInst->wavefront();
155 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
156 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
157 VecOperandF32 vdst(gpuDynInst, instData.VDST);
158
159 src0.readSrc();
160 src1.readSrc();
161
162 if (instData.ABS & 0x1) {
163 src0.absModifier();
164 }
165
166 if (instData.ABS & 0x2) {
167 src1.absModifier();
168 }
169
170 if (extData.NEG & 0x1) {
171 src0.negModifier();
172 }
173
174 if (extData.NEG & 0x2) {
175 src1.negModifier();
176 }
177
181 assert(!(instData.ABS & 0x4));
182 assert(!(extData.NEG & 0x4));
183
184 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
185 if (wf->execMask(lane)) {
186 vdst[lane] = src0[lane] - src1[lane];
187 }
188 }
189
190 vdst.write();
191 } // execute
192 // --- Inst_VOP3__V_SUBREV_F32 class methods ---
193
195 : Inst_VOP3A(iFmt, "v_subrev_f32", false)
196 {
197 setFlag(ALU);
198 setFlag(F32);
199 } // Inst_VOP3__V_SUBREV_F32
200
202 {
203 } // ~Inst_VOP3__V_SUBREV_F32
204
205 // --- description from .arch file ---
206 // D.f = S1.f - S0.f.
207 // SQ translates to V_ADD_F32.
208 void
210 {
211 Wavefront *wf = gpuDynInst->wavefront();
212 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
213 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
214 VecOperandF32 vdst(gpuDynInst, instData.VDST);
215
216 src0.readSrc();
217 src1.readSrc();
218
219 if (instData.ABS & 0x1) {
220 src0.absModifier();
221 }
222
223 if (instData.ABS & 0x2) {
224 src1.absModifier();
225 }
226
227 if (extData.NEG & 0x1) {
228 src0.negModifier();
229 }
230
231 if (extData.NEG & 0x2) {
232 src1.negModifier();
233 }
234
238 assert(!(instData.ABS & 0x4));
239 assert(!(extData.NEG & 0x4));
240
241 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
242 if (wf->execMask(lane)) {
243 vdst[lane] = src1[lane] - src0[lane];
244 }
245 }
246
247 vdst.write();
248 } // execute
249 // --- Inst_VOP3__V_MUL_LEGACY_F32 class methods ---
250
252 : Inst_VOP3A(iFmt, "v_mul_legacy_f32", false)
253 {
254 setFlag(ALU);
255 setFlag(F32);
256 } // Inst_VOP3__V_MUL_LEGACY_F32
257
259 {
260 } // ~Inst_VOP3__V_MUL_LEGACY_F32
261
262 // --- description from .arch file ---
263 // D.f = S0.f * S1.f (DX9 rules, 0.0*x = 0.0).
264 void
266 {
267 Wavefront *wf = gpuDynInst->wavefront();
268 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
269 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
270 VecOperandF32 vdst(gpuDynInst, instData.VDST);
271
272 src0.readSrc();
273 src1.readSrc();
274
275 if (instData.ABS & 0x1) {
276 src0.absModifier();
277 }
278
279 if (instData.ABS & 0x2) {
280 src1.absModifier();
281 }
282
283 if (extData.NEG & 0x1) {
284 src0.negModifier();
285 }
286
287 if (extData.NEG & 0x2) {
288 src1.negModifier();
289 }
290
294 assert(!(instData.ABS & 0x4));
295 assert(!(extData.NEG & 0x4));
296
297 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
298 if (wf->execMask(lane)) {
299 if (std::isnan(src0[lane]) ||
300 std::isnan(src1[lane])) {
301 vdst[lane] = NAN;
302 } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
303 std::fpclassify(src0[lane]) == FP_ZERO) &&
304 !std::signbit(src0[lane])) {
305 if (std::isinf(src1[lane])) {
306 vdst[lane] = NAN;
307 } else if (!std::signbit(src1[lane])) {
308 vdst[lane] = +0.0;
309 } else {
310 vdst[lane] = -0.0;
311 }
312 } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
313 std::fpclassify(src0[lane]) == FP_ZERO) &&
314 std::signbit(src0[lane])) {
315 if (std::isinf(src1[lane])) {
316 vdst[lane] = NAN;
317 } else if (std::signbit(src1[lane])) {
318 vdst[lane] = +0.0;
319 } else {
320 vdst[lane] = -0.0;
321 }
322 } else if (std::isinf(src0[lane]) &&
323 !std::signbit(src0[lane])) {
324 if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
325 std::fpclassify(src1[lane]) == FP_ZERO) {
326 vdst[lane] = NAN;
327 } else if (!std::signbit(src1[lane])) {
328 vdst[lane] = +INFINITY;
329 } else {
330 vdst[lane] = -INFINITY;
331 }
332 } else if (std::isinf(src0[lane]) &&
333 std::signbit(src0[lane])) {
334 if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
335 std::fpclassify(src1[lane]) == FP_ZERO) {
336 vdst[lane] = NAN;
337 } else if (std::signbit(src1[lane])) {
338 vdst[lane] = +INFINITY;
339 } else {
340 vdst[lane] = -INFINITY;
341 }
342 } else {
343 vdst[lane] = src0[lane] * src1[lane];
344 }
345 }
346 }
347
348 vdst.write();
349 } // execute
350 // --- Inst_VOP3__V_MUL_F32 class methods ---
351
353 : Inst_VOP3A(iFmt, "v_mul_f32", false)
354 {
355 setFlag(ALU);
356 setFlag(F32);
357 } // Inst_VOP3__V_MUL_F32
358
360 {
361 } // ~Inst_VOP3__V_MUL_F32
362
363 // --- description from .arch file ---
364 // D.f = S0.f * S1.f.
365 void
367 {
368 Wavefront *wf = gpuDynInst->wavefront();
369 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
370 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
371 VecOperandF32 vdst(gpuDynInst, instData.VDST);
372
373 src0.readSrc();
374 src1.readSrc();
375
376 if (instData.ABS & 0x1) {
377 src0.absModifier();
378 }
379
380 if (instData.ABS & 0x2) {
381 src1.absModifier();
382 }
383
384 if (extData.NEG & 0x1) {
385 src0.negModifier();
386 }
387
388 if (extData.NEG & 0x2) {
389 src1.negModifier();
390 }
391
395 assert(!(instData.ABS & 0x4));
396 assert(!(extData.NEG & 0x4));
397
398 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
399 if (wf->execMask(lane)) {
400 if (std::isnan(src0[lane]) ||
401 std::isnan(src1[lane])) {
402 vdst[lane] = NAN;
403 } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
404 std::fpclassify(src0[lane]) == FP_ZERO) &&
405 !std::signbit(src0[lane])) {
406 if (std::isinf(src1[lane])) {
407 vdst[lane] = NAN;
408 } else if (!std::signbit(src1[lane])) {
409 vdst[lane] = +0.0;
410 } else {
411 vdst[lane] = -0.0;
412 }
413 } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
414 std::fpclassify(src0[lane]) == FP_ZERO) &&
415 std::signbit(src0[lane])) {
416 if (std::isinf(src1[lane])) {
417 vdst[lane] = NAN;
418 } else if (std::signbit(src1[lane])) {
419 vdst[lane] = +0.0;
420 } else {
421 vdst[lane] = -0.0;
422 }
423 } else if (std::isinf(src0[lane]) &&
424 !std::signbit(src0[lane])) {
425 if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
426 std::fpclassify(src1[lane]) == FP_ZERO) {
427 vdst[lane] = NAN;
428 } else if (!std::signbit(src1[lane])) {
429 vdst[lane] = +INFINITY;
430 } else {
431 vdst[lane] = -INFINITY;
432 }
433 } else if (std::isinf(src0[lane]) &&
434 std::signbit(src0[lane])) {
435 if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
436 std::fpclassify(src1[lane]) == FP_ZERO) {
437 vdst[lane] = NAN;
438 } else if (std::signbit(src1[lane])) {
439 vdst[lane] = +INFINITY;
440 } else {
441 vdst[lane] = -INFINITY;
442 }
443 } else {
444 vdst[lane] = src0[lane] * src1[lane];
445 }
446 }
447 }
448
449 vdst.write();
450 } // execute
451 // --- Inst_VOP3__V_MUL_I32_I24 class methods ---
452
454 : Inst_VOP3A(iFmt, "v_mul_i32_i24", false)
455 {
456 setFlag(ALU);
457 } // Inst_VOP3__V_MUL_I32_I24
458
460 {
461 } // ~Inst_VOP3__V_MUL_I32_I24
462
463 // --- description from .arch file ---
464 // D.i = S0.i[23:0] * S1.i[23:0].
465 void
467 {
468 Wavefront *wf = gpuDynInst->wavefront();
469 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
470 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
471 VecOperandI32 vdst(gpuDynInst, instData.VDST);
472
473 src0.readSrc();
474 src1.read();
475
479 assert(!(instData.ABS & 0x1));
480 assert(!(instData.ABS & 0x2));
481 assert(!(instData.ABS & 0x4));
482 assert(!(extData.NEG & 0x1));
483 assert(!(extData.NEG & 0x2));
484 assert(!(extData.NEG & 0x4));
485
486 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
487 if (wf->execMask(lane)) {
488 vdst[lane] = sext<24>(bits(src0[lane], 23, 0))
489 * sext<24>(bits(src1[lane], 23, 0));
490 }
491 }
492
493 vdst.write();
494 } // execute
495 // --- Inst_VOP3__V_MUL_HI_I32_I24 class methods ---
496
498 : Inst_VOP3A(iFmt, "v_mul_hi_i32_i24", false)
499 {
500 setFlag(ALU);
501 } // Inst_VOP3__V_MUL_HI_I32_I24
502
504 {
505 } // ~Inst_VOP3__V_MUL_HI_I32_I24
506
507 // --- description from .arch file ---
508 // D.i = (S0.i[23:0] * S1.i[23:0])>>32.
509 void
511 {
512 Wavefront *wf = gpuDynInst->wavefront();
513 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
514 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
515 VecOperandI32 vdst(gpuDynInst, instData.VDST);
516
517 src0.readSrc();
518 src1.readSrc();
519
523 assert(!(instData.ABS & 0x1));
524 assert(!(instData.ABS & 0x2));
525 assert(!(instData.ABS & 0x4));
526 assert(!(extData.NEG & 0x1));
527 assert(!(extData.NEG & 0x2));
528 assert(!(extData.NEG & 0x4));
529
530 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
531 if (wf->execMask(lane)) {
532 VecElemI64 tmp_src0
533 = (VecElemI64)sext<24>(bits(src0[lane], 23, 0));
534 VecElemI64 tmp_src1
535 = (VecElemI64)sext<24>(bits(src1[lane], 23, 0));
536
537 vdst[lane] = (VecElemI32)((tmp_src0 * tmp_src1) >> 32);
538 }
539 }
540
541 vdst.write();
542 } // execute
543 // --- Inst_VOP3__V_MUL_U32_U24 class methods ---
544
546 : Inst_VOP3A(iFmt, "v_mul_u32_u24", false)
547 {
548 setFlag(ALU);
549 } // Inst_VOP3__V_MUL_U32_U24
550
552 {
553 } // ~Inst_VOP3__V_MUL_U32_U24
554
555 // --- description from .arch file ---
556 // D.u = S0.u[23:0] * S1.u[23:0].
557 void
559 {
560 Wavefront *wf = gpuDynInst->wavefront();
561 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
562 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
563 VecOperandU32 vdst(gpuDynInst, instData.VDST);
564
565 src0.readSrc();
566 src1.readSrc();
567
571 assert(!(instData.ABS & 0x1));
572 assert(!(instData.ABS & 0x2));
573 assert(!(instData.ABS & 0x4));
574 assert(!(extData.NEG & 0x1));
575 assert(!(extData.NEG & 0x2));
576 assert(!(extData.NEG & 0x4));
577
578 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
579 if (wf->execMask(lane)) {
580 vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0);
581 }
582 }
583
584 vdst.write();
585 } // execute
586 // --- Inst_VOP3__V_MUL_HI_U32_U24 class methods ---
587
589 : Inst_VOP3A(iFmt, "v_mul_hi_u32_u24", false)
590 {
591 setFlag(ALU);
592 } // Inst_VOP3__V_MUL_HI_U32_U24
593
595 {
596 } // ~Inst_VOP3__V_MUL_HI_U32_U24
597
598 // --- description from .arch file ---
599 // D.i = (S0.u[23:0] * S1.u[23:0])>>32.
600 void
602 {
603 Wavefront *wf = gpuDynInst->wavefront();
604 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
605 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
606 VecOperandU32 vdst(gpuDynInst, instData.VDST);
607
608 src0.readSrc();
609 src1.readSrc();
610
614 assert(!(instData.ABS & 0x1));
615 assert(!(instData.ABS & 0x2));
616 assert(!(instData.ABS & 0x4));
617 assert(!(extData.NEG & 0x1));
618 assert(!(extData.NEG & 0x2));
619 assert(!(extData.NEG & 0x4));
620
621 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
622 if (wf->execMask(lane)) {
623 VecElemU64 tmp_src0 = (VecElemU64)bits(src0[lane], 23, 0);
624 VecElemU64 tmp_src1 = (VecElemU64)bits(src1[lane], 23, 0);
625 vdst[lane] = (VecElemU32)((tmp_src0 * tmp_src1) >> 32);
626 }
627 }
628
629 vdst.write();
630 } // execute
631 // --- Inst_VOP3__V_MIN_F32 class methods ---
632
634 : Inst_VOP3A(iFmt, "v_min_f32", false)
635 {
636 setFlag(ALU);
637 setFlag(F32);
638 } // Inst_VOP3__V_MIN_F32
639
641 {
642 } // ~Inst_VOP3__V_MIN_F32
643
644 // --- description from .arch file ---
645 // D.f = (S0.f < S1.f ? S0.f : S1.f).
646 void
648 {
649 Wavefront *wf = gpuDynInst->wavefront();
650 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
651 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
652 VecOperandF32 vdst(gpuDynInst, instData.VDST);
653
654 src0.readSrc();
655 src1.readSrc();
656
657 if (instData.ABS & 0x1) {
658 src0.absModifier();
659 }
660
661 if (instData.ABS & 0x2) {
662 src1.absModifier();
663 }
664
665 if (extData.NEG & 0x1) {
666 src0.negModifier();
667 }
668
669 if (extData.NEG & 0x2) {
670 src1.negModifier();
671 }
672
676 assert(!(instData.ABS & 0x4));
677 assert(!(extData.NEG & 0x4));
678
679 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
680 if (wf->execMask(lane)) {
681 vdst[lane] = std::fmin(src0[lane], src1[lane]);
682 }
683 }
684
685 vdst.write();
686 } // execute
687 // --- Inst_VOP3__V_MAX_F32 class methods ---
688
690 : Inst_VOP3A(iFmt, "v_max_f32", false)
691 {
692 setFlag(ALU);
693 setFlag(F32);
694 } // Inst_VOP3__V_MAX_F32
695
697 {
698 } // ~Inst_VOP3__V_MAX_F32
699
700 // --- description from .arch file ---
701 // D.f = (S0.f >= S1.f ? S0.f : S1.f).
702 void
704 {
705 Wavefront *wf = gpuDynInst->wavefront();
706 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
707 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
708 VecOperandF32 vdst(gpuDynInst, instData.VDST);
709
710 src0.readSrc();
711 src1.readSrc();
712
713 if (instData.ABS & 0x1) {
714 src0.absModifier();
715 }
716
717 if (instData.ABS & 0x2) {
718 src1.absModifier();
719 }
720
721 if (extData.NEG & 0x1) {
722 src0.negModifier();
723 }
724
725 if (extData.NEG & 0x2) {
726 src1.negModifier();
727 }
728
732 assert(!(instData.ABS & 0x4));
733 assert(!(extData.NEG & 0x4));
734
735 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
736 if (wf->execMask(lane)) {
737 vdst[lane] = std::fmax(src0[lane], src1[lane]);
738 }
739 }
740
741 vdst.write();
742 } // execute
743 // --- Inst_VOP3__V_MIN_I32 class methods ---
744
746 : Inst_VOP3A(iFmt, "v_min_i32", false)
747 {
748 setFlag(ALU);
749 } // Inst_VOP3__V_MIN_I32
750
752 {
753 } // ~Inst_VOP3__V_MIN_I32
754
755 // --- description from .arch file ---
756 // D.i = min(S0.i, S1.i).
757 void
759 {
760 Wavefront *wf = gpuDynInst->wavefront();
761 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
762 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
763 VecOperandI32 vdst(gpuDynInst, instData.VDST);
764
765 src0.readSrc();
766 src1.readSrc();
767
771 assert(!(instData.ABS & 0x1));
772 assert(!(instData.ABS & 0x2));
773 assert(!(instData.ABS & 0x4));
774 assert(!(extData.NEG & 0x1));
775 assert(!(extData.NEG & 0x2));
776 assert(!(extData.NEG & 0x4));
777
778 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
779 if (wf->execMask(lane)) {
780 vdst[lane] = std::min(src0[lane], src1[lane]);
781 }
782 }
783
784 vdst.write();
785 } // execute
786 // --- Inst_VOP3__V_MAX_I32 class methods ---
787
789 : Inst_VOP3A(iFmt, "v_max_i32", false)
790 {
791 setFlag(ALU);
792 } // Inst_VOP3__V_MAX_I32
793
795 {
796 } // ~Inst_VOP3__V_MAX_I32
797
798 // --- description from .arch file ---
799 // D.i = max(S0.i, S1.i).
800 void
802 {
803 Wavefront *wf = gpuDynInst->wavefront();
804 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
805 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
806 VecOperandI32 vdst(gpuDynInst, instData.VDST);
807
808 src0.readSrc();
809 src1.readSrc();
810
814 assert(!(instData.ABS & 0x1));
815 assert(!(instData.ABS & 0x2));
816 assert(!(instData.ABS & 0x4));
817 assert(!(extData.NEG & 0x1));
818 assert(!(extData.NEG & 0x2));
819 assert(!(extData.NEG & 0x4));
820
821 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
822 if (wf->execMask(lane)) {
823 vdst[lane] = std::max(src0[lane], src1[lane]);
824 }
825 }
826
827 vdst.write();
828 } // execute
829 // --- Inst_VOP3__V_MIN_U32 class methods ---
830
832 : Inst_VOP3A(iFmt, "v_min_u32", false)
833 {
834 setFlag(ALU);
835 } // Inst_VOP3__V_MIN_U32
836
838 {
839 } // ~Inst_VOP3__V_MIN_U32
840
841 // --- description from .arch file ---
842 // D.u = min(S0.u, S1.u).
843 void
845 {
846 Wavefront *wf = gpuDynInst->wavefront();
847 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
848 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
849 VecOperandU32 vdst(gpuDynInst, instData.VDST);
850
851 src0.readSrc();
852 src1.readSrc();
853
857 assert(!(instData.ABS & 0x1));
858 assert(!(instData.ABS & 0x2));
859 assert(!(instData.ABS & 0x4));
860 assert(!(extData.NEG & 0x1));
861 assert(!(extData.NEG & 0x2));
862 assert(!(extData.NEG & 0x4));
863
864 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
865 if (wf->execMask(lane)) {
866 vdst[lane] = std::min(src0[lane], src1[lane]);
867 }
868 }
869
870 vdst.write();
871 } // execute
872 // --- Inst_VOP3__V_MAX_U32 class methods ---
873
875 : Inst_VOP3A(iFmt, "v_max_u32", false)
876 {
877 setFlag(ALU);
878 } // Inst_VOP3__V_MAX_U32
879
881 {
882 } // ~Inst_VOP3__V_MAX_U32
883
884 // --- description from .arch file ---
885 // D.u = max(S0.u, S1.u).
886 void
888 {
889 Wavefront *wf = gpuDynInst->wavefront();
890 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
891 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
892 VecOperandU32 vdst(gpuDynInst, instData.VDST);
893
894 src0.readSrc();
895 src1.readSrc();
896
900 assert(!(instData.ABS & 0x1));
901 assert(!(instData.ABS & 0x2));
902 assert(!(instData.ABS & 0x4));
903 assert(!(extData.NEG & 0x1));
904 assert(!(extData.NEG & 0x2));
905 assert(!(extData.NEG & 0x4));
906
907 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
908 if (wf->execMask(lane)) {
909 vdst[lane] = std::max(src0[lane], src1[lane]);
910 }
911 }
912
913 vdst.write();
914 } // execute
915 // --- Inst_VOP3__V_LSHRREV_B32 class methods ---
916
918 : Inst_VOP3A(iFmt, "v_lshrrev_b32", false)
919 {
920 setFlag(ALU);
921 } // Inst_VOP3__V_LSHRREV_B32
922
924 {
925 } // ~Inst_VOP3__V_LSHRREV_B32
926
927 // --- description from .arch file ---
928 // D.u = S1.u >> S0.u[4:0].
929 // The vacated bits are set to zero.
930 // SQ translates this to an internal SP opcode.
931 void
933 {
934 Wavefront *wf = gpuDynInst->wavefront();
935 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
936 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
937 VecOperandU32 vdst(gpuDynInst, instData.VDST);
938
939 src0.readSrc();
940 src1.readSrc();
941
945 assert(!(instData.ABS & 0x1));
946 assert(!(instData.ABS & 0x2));
947 assert(!(instData.ABS & 0x4));
948 assert(!(extData.NEG & 0x1));
949 assert(!(extData.NEG & 0x2));
950 assert(!(extData.NEG & 0x4));
951
952 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
953 if (wf->execMask(lane)) {
954 vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0);
955 }
956 }
957
958 vdst.write();
959 } // execute
960 // --- Inst_VOP3__V_ASHRREV_I32 class methods ---
961
963 : Inst_VOP3A(iFmt, "v_ashrrev_i32", false)
964 {
965 setFlag(ALU);
966 } // Inst_VOP3__V_ASHRREV_I32
967
969 {
970 } // ~Inst_VOP3__V_ASHRREV_I32
971
972 // --- description from .arch file ---
973 // D.i = signext(S1.i) >> S0.i[4:0].
974 // The vacated bits are set to the sign bit of the input value.
975 // SQ translates this to an internal SP opcode.
976 void
978 {
979 Wavefront *wf = gpuDynInst->wavefront();
980 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
981 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
982 VecOperandI32 vdst(gpuDynInst, instData.VDST);
983
984 src0.readSrc();
985 src1.readSrc();
986
990 assert(!(instData.ABS & 0x1));
991 assert(!(instData.ABS & 0x2));
992 assert(!(instData.ABS & 0x4));
993 assert(!(extData.NEG & 0x1));
994 assert(!(extData.NEG & 0x2));
995 assert(!(extData.NEG & 0x4));
996
997 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
998 if (wf->execMask(lane)) {
999 vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0);
1000 }
1001 }
1002
1003 vdst.write();
1004 } // execute
1005 // --- Inst_VOP3__V_LSHLREV_B32 class methods ---
1006
1008 : Inst_VOP3A(iFmt, "v_lshlrev_b32", false)
1009 {
1010 setFlag(ALU);
1011 } // Inst_VOP3__V_LSHLREV_B32
1012
1014 {
1015 } // ~Inst_VOP3__V_LSHLREV_B32
1016
1017 // --- description from .arch file ---
1018 // D.u = S1.u << S0.u[4:0].
1019 // SQ translates this to an internal SP opcode.
1020 void
1022 {
1023 Wavefront *wf = gpuDynInst->wavefront();
1024 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1025 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1026 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1027
1028 src0.readSrc();
1029 src1.readSrc();
1030
1034 assert(!(instData.ABS & 0x1));
1035 assert(!(instData.ABS & 0x2));
1036 assert(!(instData.ABS & 0x4));
1037 assert(!(extData.NEG & 0x1));
1038 assert(!(extData.NEG & 0x2));
1039 assert(!(extData.NEG & 0x4));
1040
1041 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1042 if (wf->execMask(lane)) {
1043 vdst[lane] = src1[lane] << bits(src0[lane], 4, 0);
1044 }
1045 }
1046
1047 vdst.write();
1048 } // execute
1049 // --- Inst_VOP3__V_AND_B32 class methods ---
1050
1052 : Inst_VOP3A(iFmt, "v_and_b32", false)
1053 {
1054 setFlag(ALU);
1055 } // Inst_VOP3__V_AND_B32
1056
1058 {
1059 } // ~Inst_VOP3__V_AND_B32
1060
1061 // --- description from .arch file ---
1062 // D.u = S0.u & S1.u.
1063 // Input and output modifiers not supported.
1064 void
1066 {
1067 Wavefront *wf = gpuDynInst->wavefront();
1068 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1069 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1070 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1071
1072 src0.readSrc();
1073 src1.readSrc();
1074
1078 assert(!(instData.ABS & 0x1));
1079 assert(!(instData.ABS & 0x2));
1080 assert(!(instData.ABS & 0x4));
1081 assert(!(extData.NEG & 0x1));
1082 assert(!(extData.NEG & 0x2));
1083 assert(!(extData.NEG & 0x4));
1084
1085 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1086 if (wf->execMask(lane)) {
1087 vdst[lane] = src0[lane] & src1[lane];
1088 }
1089 }
1090
1091 vdst.write();
1092 } // execute
1093 // --- Inst_VOP3__V_OR_B32 class methods ---
1094
1096 : Inst_VOP3A(iFmt, "v_or_b32", false)
1097 {
1098 setFlag(ALU);
1099 } // Inst_VOP3__V_OR_B32
1100
1102 {
1103 } // ~Inst_VOP3__V_OR_B32
1104
1105 // --- description from .arch file ---
1106 // D.u = S0.u | S1.u.
1107 // Input and output modifiers not supported.
1108 void
1110 {
1111 Wavefront *wf = gpuDynInst->wavefront();
1112 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1113 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1114 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1115
1116 src0.readSrc();
1117 src1.readSrc();
1118
1122 assert(!(instData.ABS & 0x1));
1123 assert(!(instData.ABS & 0x2));
1124 assert(!(instData.ABS & 0x4));
1125 assert(!(extData.NEG & 0x1));
1126 assert(!(extData.NEG & 0x2));
1127 assert(!(extData.NEG & 0x4));
1128
1129 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1130 if (wf->execMask(lane)) {
1131 vdst[lane] = src0[lane] | src1[lane];
1132 }
1133 }
1134
1135 vdst.write();
1136 } // execute
1137 // --- Inst_VOP3__V_OR3_B32 class methods ---
1138
1140 : Inst_VOP3A(iFmt, "v_or3_b32", false)
1141 {
1142 setFlag(ALU);
1143 } // Inst_VOP3__V_OR3_B32
1144
1146 {
1147 } // ~Inst_VOP3__V_OR3_B32
1148
1149 // --- description from .arch file ---
1150 // D.u = S0.u | S1.u | S2.u.
1151 // Input and output modifiers not supported.
1152 void
1154 {
1155 Wavefront *wf = gpuDynInst->wavefront();
1156 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1157 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1158 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
1159 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1160
1161 src0.readSrc();
1162 src1.readSrc();
1163 src2.readSrc();
1164
1168 assert(!(instData.ABS & 0x1));
1169 assert(!(instData.ABS & 0x2));
1170 assert(!(instData.ABS & 0x4));
1171 assert(!(extData.NEG & 0x1));
1172 assert(!(extData.NEG & 0x2));
1173 assert(!(extData.NEG & 0x4));
1174
1175 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1176 if (wf->execMask(lane)) {
1177 vdst[lane] = src0[lane] | src1[lane] | src2[lane];
1178 }
1179 }
1180
1181 vdst.write();
1182 } // execute
1183 // --- Inst_VOP3__V_XOR_B32 class methods ---
1184
1186 : Inst_VOP3A(iFmt, "v_xor_b32", false)
1187 {
1188 setFlag(ALU);
1189 } // Inst_VOP3__V_XOR_B32
1190
1192 {
1193 } // ~Inst_VOP3__V_XOR_B32
1194
1195 // --- description from .arch file ---
1196 // D.u = S0.u ^ S1.u.
1197 // Input and output modifiers not supported.
1198 void
1200 {
1201 Wavefront *wf = gpuDynInst->wavefront();
1202 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1203 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1204 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1205
1206 src0.readSrc();
1207 src1.readSrc();
1208
1212 assert(!(instData.ABS & 0x1));
1213 assert(!(instData.ABS & 0x2));
1214 assert(!(instData.ABS & 0x4));
1215 assert(!(extData.NEG & 0x1));
1216 assert(!(extData.NEG & 0x2));
1217 assert(!(extData.NEG & 0x4));
1218
1219 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1220 if (wf->execMask(lane)) {
1221 vdst[lane] = src0[lane] ^ src1[lane];
1222 }
1223 }
1224
1225 vdst.write();
1226 } // execute
1227 // --- Inst_VOP3__V_DOT2C_F32_BF16 class methods ---
1228
1230 : Inst_VOP3A(iFmt, "v_dot2c_f32_bf16", false)
1231 {
1232 setFlag(ALU);
1233 } // Inst_VOP3__V_DOT2C_F32_BF16
1234
1236 {
1237 } // ~Inst_VOP3__V_DOT2C_F32_BF16
1238
1239 void
1241 {
1242 Wavefront *wf = gpuDynInst->wavefront();
1243 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1244 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1245 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1246
1247 src0.readSrc();
1248 src1.readSrc();
1249 vdst.read();
1250
1251 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1252 if (wf->execMask(lane)) {
1253 AMDGPU::mxbfloat16 a1, a2, b1, b2;
1254 a1.data = uint16_t(bits(src0[lane], 15, 0));
1255 a2.data = uint16_t(bits(src0[lane], 31, 16));
1256 b1.data = uint16_t(bits(src1[lane], 15, 0));
1257 b2.data = uint16_t(bits(src1[lane], 31, 16));
1258
1259 // ABS treated as NEG_HI
1260 if (instData.ABS & 0x1) a2 = -a2;
1261 if (instData.ABS & 0x2) b2 = -b2;
1262 if (extData.NEG & 0x1) a1 = -a1;
1263 if (extData.NEG & 0x2) b1 = -b1;
1264
1265 vdst[lane] += float(a1) * float(b1);
1266 vdst[lane] += float(a2) * float(b2);
1267 }
1268 }
1269
1270 vdst.write();
1271 } // execute
1272 // --- Inst_VOP3__V_MAC_F32 class methods ---
1273
1275 : Inst_VOP3A(iFmt, "v_mac_f32", false)
1276 {
1277 setFlag(ALU);
1278 setFlag(F32);
1279 setFlag(MAC);
1280 } // Inst_VOP3__V_MAC_F32
1281
1283 {
1284 } // ~Inst_VOP3__V_MAC_F32
1285
1286 // --- description from .arch file ---
1287 // D.f = S0.f * S1.f + D.f.
1288 // SQ translates to V_MAD_F32.
1289 void
1291 {
1292 Wavefront *wf = gpuDynInst->wavefront();
1293 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
1294 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
1295 VecOperandF32 vdst(gpuDynInst, instData.VDST);
1296
1297 src0.readSrc();
1298 src1.readSrc();
1299 vdst.read();
1300
1301 if (instData.ABS & 0x1) {
1302 src0.absModifier();
1303 }
1304
1305 if (instData.ABS & 0x2) {
1306 src1.absModifier();
1307 }
1308
1309 if (extData.NEG & 0x1) {
1310 src0.negModifier();
1311 }
1312
1313 if (extData.NEG & 0x2) {
1314 src1.negModifier();
1315 }
1316
1320 assert(!(instData.ABS & 0x4));
1321 assert(!(extData.NEG & 0x4));
1322
1323 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1324 if (wf->execMask(lane)) {
1325 vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]);
1326 }
1327 }
1328
1329 vdst.write();
1330 } // execute
1331 // --- Inst_VOP3__V_ADD_CO_U32 class methods ---
1332
1334 : Inst_VOP3B(iFmt, "v_add_co_u32")
1335 {
1336 setFlag(ALU);
1337 setFlag(WritesVCC);
1338 } // Inst_VOP3__V_ADD_CO_U32
1339
1341 {
1342 } // ~Inst_VOP3__V_ADD_CO_U32
1343
1344 // --- description from .arch file ---
1345 // D.u = S0.u + S1.u;
1346 // VCC[threadId] = (S0.u + S1.u >= 0x800000000ULL ? 1 : 0) is an UNSIGNED
1347 // --- overflow or carry-out for V_ADDC_U32.
1348 // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
1349 void
1351 {
1352 Wavefront *wf = gpuDynInst->wavefront();
1353 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1354 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1355 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1356 ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
1357
1358 src0.readSrc();
1359 src1.readSrc();
1360
1364 assert(!(extData.NEG & 0x1));
1365 assert(!(extData.NEG & 0x2));
1366 assert(!(extData.NEG & 0x4));
1367
1368 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1369 if (wf->execMask(lane)) {
1370 vdst[lane] = src0[lane] + src1[lane];
1371 vcc.setBit(lane, ((VecElemU64)src0[lane]
1372 + (VecElemU64)src1[lane]) >= 0x100000000ULL ? 1 : 0);
1373 }
1374 }
1375
1376 vdst.write();
1377 vcc.write();
1378 } // execute
1379 // --- Inst_VOP3__V_SUB_CO_U32 class methods ---
1380
1382 : Inst_VOP3B(iFmt, "v_sub_co_u32")
1383 {
1384 setFlag(ALU);
1385 setFlag(WritesVCC);
1386 } // Inst_VOP3__V_SUB_CO_U32
1387
1389 {
1390 } // ~Inst_VOP3__V_SUB_CO_U32
1391
1392 // --- description from .arch file ---
1393 // D.u = S0.u - S1.u;
1394 // VCC[threadId] = (S1.u > S0.u ? 1 : 0) is an UNSIGNED overflow or
1395 // carry-out for V_SUBB_U32.
1396 // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
1397 void
1399 {
1400 Wavefront *wf = gpuDynInst->wavefront();
1401 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1402 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1403 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1404 ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
1405
1406 src0.readSrc();
1407 src1.readSrc();
1408
1412 assert(!(extData.NEG & 0x1));
1413 assert(!(extData.NEG & 0x2));
1414 assert(!(extData.NEG & 0x4));
1415
1416 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1417 if (wf->execMask(lane)) {
1418 vdst[lane] = src0[lane] - src1[lane];
1419 vcc.setBit(lane, src1[lane] > src0[lane] ? 1 : 0);
1420 }
1421 }
1422
1423 vdst.write();
1424 vcc.write();
1425 } // execute
1426 // --- Inst_VOP3__V_SUBREV_CO_U32 class methods ---
1427
1429 InFmt_VOP3B *iFmt)
1430 : Inst_VOP3B(iFmt, "v_subrev_co_u32")
1431 {
1432 setFlag(ALU);
1433 setFlag(WritesVCC);
1434 } // Inst_VOP3__V_SUBREV_CO_U32
1435
1437 {
1438 } // ~Inst_VOP3__V_SUBREV_CO_U32
1439
1440 // --- description from .arch file ---
1441 // D.u = S1.u - S0.u;
1442 // VCC[threadId] = (S0.u > S1.u ? 1 : 0) is an UNSIGNED overflow or
1443 // carry-out for V_SUBB_U32.
1444 // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
1445 // SQ translates this to V_SUB_U32 with reversed operands.
1446 void
1448 {
1449 Wavefront *wf = gpuDynInst->wavefront();
1450 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1451 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1452 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1453 ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
1454
1455 src0.readSrc();
1456 src1.readSrc();
1457
1461 assert(!(extData.NEG & 0x1));
1462 assert(!(extData.NEG & 0x2));
1463 assert(!(extData.NEG & 0x4));
1464
1465 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1466 if (wf->execMask(lane)) {
1467 vdst[lane] = src1[lane] - src0[lane];
1468 vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
1469 }
1470 }
1471
1472 vdst.write();
1473 vcc.write();
1474 } // execute
1475 // --- Inst_VOP3__V_ADDC_CO_U32 class methods ---
1476
1478 : Inst_VOP3B(iFmt, "v_addc_co_u32")
1479 {
1480 setFlag(ALU);
1481 setFlag(WritesVCC);
1482 setFlag(ReadsVCC);
1483 } // Inst_VOP3__V_ADDC_CO_U32
1484
1486 {
1487 } // ~Inst_VOP3__V_ADDC_CO_U32
1488
1489 // --- description from .arch file ---
1490 // D.u = S0.u + S1.u + VCC[threadId];
1491 // VCC[threadId] = (S0.u + S1.u + VCC[threadId] >= 0x800000000ULL ? 1 : 0)
1492 // is an UNSIGNED overflow.
1493 // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
1494 // source comes from the SGPR-pair at S2.u.
1495 void
1497 {
1498 Wavefront *wf = gpuDynInst->wavefront();
1499 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1500 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1501 ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2);
1502 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1503 ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
1504
1505 src0.readSrc();
1506 src1.readSrc();
1507 vcc.read();
1508
1512 assert(!(extData.NEG & 0x1));
1513 assert(!(extData.NEG & 0x2));
1514 assert(!(extData.NEG & 0x4));
1515
1516 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1517 if (wf->execMask(lane)) {
1518 vdst[lane] = src0[lane] + src1[lane]
1519 + bits(vcc.rawData(), lane);
1520 sdst.setBit(lane, ((VecElemU64)src0[lane]
1521 + (VecElemU64)src1[lane]
1522 + (VecElemU64)bits(vcc.rawData(), lane))
1523 >= 0x100000000 ? 1 : 0);
1524 }
1525 }
1526
1527 vdst.write();
1528 sdst.write();
1529 } // execute
1530 // --- Inst_VOP3__V_SUBB_CO_U32 class methods ---
1531
1533 : Inst_VOP3B(iFmt, "v_subb_co_u32")
1534 {
1535 setFlag(ALU);
1536 setFlag(WritesVCC);
1537 setFlag(ReadsVCC);
1538 } // Inst_VOP3__V_SUBB_CO_U32
1539
1541 {
1542 } // ~Inst_VOP3__V_SUBB_CO_U32
1543
1544 // --- description from .arch file ---
1545 // D.u = S0.u - S1.u - VCC[threadId];
1546 // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED
1547 // --- overflow.
1548 // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
1549 // --- source comes from the SGPR-pair at S2.u.
1550 void
1552 {
1553 Wavefront *wf = gpuDynInst->wavefront();
1554 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1555 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1556 ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2);
1557 ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
1558 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1559
1560 src0.readSrc();
1561 src1.readSrc();
1562 vcc.read();
1563
1567 assert(!(extData.NEG & 0x1));
1568 assert(!(extData.NEG & 0x2));
1569 assert(!(extData.NEG & 0x4));
1570
1571 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1572 if (wf->execMask(lane)) {
1573 vdst[lane] = src0[lane] - src1[lane]
1574 - bits(vcc.rawData(), lane);
1575 sdst.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane))
1576 > src0[lane] ? 1 : 0);
1577 }
1578 }
1579
1580 vdst.write();
1581 sdst.write();
1582 } // execute
1583 // --- Inst_VOP3__V_SUBBREV_CO_U32 class methods ---
1584
1586 InFmt_VOP3B *iFmt)
1587 : Inst_VOP3B(iFmt, "v_subbrev_co_u32")
1588 {
1589 setFlag(ALU);
1590 setFlag(WritesVCC);
1591 setFlag(ReadsVCC);
1592 } // Inst_VOP3__V_SUBBREV_CO_U32
1593
1595 {
1596 } // ~Inst_VOP3__V_SUBBREV_CO_U32
1597
1598 // --- description from .arch file ---
1599 // D.u = S1.u - S0.u - VCC[threadId];
1600 // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED
1601 // overflow.
1602 // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
1603 // source comes from the SGPR-pair at S2.u. SQ translates to V_SUBB_U32.
1604 void
1606 {
1607 Wavefront *wf = gpuDynInst->wavefront();
1608 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1609 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1610 ConstScalarOperandU64 sdst(gpuDynInst, instData.SDST);
1611 ScalarOperandU64 vcc(gpuDynInst, extData.SRC2);
1612 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1613
1614 src0.readSrc();
1615 src1.readSrc();
1616 vcc.read();
1617
1621 assert(!(extData.NEG & 0x1));
1622 assert(!(extData.NEG & 0x2));
1623 assert(!(extData.NEG & 0x4));
1624
1625 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1626 if (wf->execMask(lane)) {
1627 vdst[lane] = src1[lane] - src0[lane]
1628 - bits(vcc.rawData(), lane);
1629 sdst.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane))
1630 > src0[lane] ? 1 : 0);
1631 }
1632 }
1633
1634 vdst.write();
1635 sdst.write();
1636 } // execute
1637 // --- Inst_VOP3__V_ADD_F16 class methods ---
1638
1640 : Inst_VOP3A(iFmt, "v_add_f16", false)
1641 {
1642 setFlag(ALU);
1643 setFlag(F16);
1644 } // Inst_VOP3__V_ADD_F16
1645
1647 {
1648 } // ~Inst_VOP3__V_ADD_F16
1649
1650 // --- description from .arch file ---
1651 // D.f16 = S0.f16 + S1.f16.
1652 // Supports denormals, round mode, exception flags, saturation.
1653 void
1655 {
1657 } // execute
1658 // --- Inst_VOP3__V_SUB_F16 class methods ---
1659
1661 : Inst_VOP3A(iFmt, "v_sub_f16", false)
1662 {
1663 setFlag(ALU);
1664 setFlag(F16);
1665 } // Inst_VOP3__V_SUB_F16
1666
1668 {
1669 } // ~Inst_VOP3__V_SUB_F16
1670
1671 // --- description from .arch file ---
1672 // D.f16 = S0.f16 - S1.f16.
1673 // Supports denormals, round mode, exception flags, saturation.
1674 // SQ translates to V_ADD_F16.
1675 void
1677 {
1679 } // execute
1680 // --- Inst_VOP3__V_SUBREV_F16 class methods ---
1681
1683 : Inst_VOP3A(iFmt, "v_subrev_f16", false)
1684 {
1685 setFlag(ALU);
1686 setFlag(F16);
1687 } // Inst_VOP3__V_SUBREV_F16
1688
1690 {
1691 } // ~Inst_VOP3__V_SUBREV_F16
1692
1693 // --- description from .arch file ---
1694 // D.f16 = S1.f16 - S0.f16.
1695 // Supports denormals, round mode, exception flags, saturation.
1696 // SQ translates to V_ADD_F16.
1697 void
1699 {
1701 } // execute
1702 // --- Inst_VOP3__V_MUL_F16 class methods ---
1703
1705 : Inst_VOP3A(iFmt, "v_mul_f16", false)
1706 {
1707 setFlag(ALU);
1708 setFlag(F16);
1709 } // Inst_VOP3__V_MUL_F16
1710
1712 {
1713 } // ~Inst_VOP3__V_MUL_F16
1714
1715 // --- description from .arch file ---
1716 // D.f16 = S0.f16 * S1.f16.
1717 // Supports denormals, round mode, exception flags, saturation.
1718 void
1720 {
1722 } // execute
1723 // --- Inst_VOP3__V_MAC_F16 class methods ---
1724
1726 : Inst_VOP3A(iFmt, "v_mac_f16", false)
1727 {
1728 setFlag(ALU);
1729 setFlag(F16);
1730 setFlag(MAC);
1731 } // Inst_VOP3__V_MAC_F16
1732
1734 {
1735 } // ~Inst_VOP3__V_MAC_F16
1736
1737 // --- description from .arch file ---
1738 // D.f16 = S0.f16 * S1.f16 + D.f16.
1739 // Supports round mode, exception flags, saturation.
1740 // SQ translates this to V_MAD_F16.
1741 void
1743 {
1745 } // execute
1746 // --- Inst_VOP3__V_ADD_U16 class methods ---
1747
1749 : Inst_VOP3A(iFmt, "v_add_u16", false)
1750 {
1751 setFlag(ALU);
1752 } // Inst_VOP3__V_ADD_U16
1753
1755 {
1756 } // ~Inst_VOP3__V_ADD_U16
1757
1758 // --- description from .arch file ---
1759 // D.u16 = S0.u16 + S1.u16.
1760 // Supports saturation (unsigned 16-bit integer domain).
1761 void
1763 {
1764 Wavefront *wf = gpuDynInst->wavefront();
1765 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
1766 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
1767 VecOperandU16 vdst(gpuDynInst, instData.VDST);
1768
1769 src0.readSrc();
1770 src1.readSrc();
1771
1775 assert(!(instData.ABS & 0x1));
1776 assert(!(instData.ABS & 0x2));
1777 assert(!(instData.ABS & 0x4));
1778 assert(!(extData.NEG & 0x1));
1779 assert(!(extData.NEG & 0x2));
1780 assert(!(extData.NEG & 0x4));
1781
1782 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1783 if (wf->execMask(lane)) {
1784 vdst[lane] = src0[lane] + src1[lane];
1785 }
1786 }
1787
1788 vdst.write();
1789 } // execute
1790 // --- Inst_VOP3__V_SUB_U16 class methods ---
1791
1793 : Inst_VOP3A(iFmt, "v_sub_u16", false)
1794 {
1795 setFlag(ALU);
1796 } // Inst_VOP3__V_SUB_U16
1797
1799 {
1800 } // ~Inst_VOP3__V_SUB_U16
1801
1802 // --- description from .arch file ---
1803 // D.u16 = S0.u16 - S1.u16.
1804 // Supports saturation (unsigned 16-bit integer domain).
1805 void
1807 {
1808 Wavefront *wf = gpuDynInst->wavefront();
1809 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
1810 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
1811 VecOperandU16 vdst(gpuDynInst, instData.VDST);
1812
1813 src0.readSrc();
1814 src1.readSrc();
1815
1819 assert(!(instData.ABS & 0x1));
1820 assert(!(instData.ABS & 0x2));
1821 assert(!(instData.ABS & 0x4));
1822 assert(!(extData.NEG & 0x1));
1823 assert(!(extData.NEG & 0x2));
1824 assert(!(extData.NEG & 0x4));
1825
1826 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1827 if (wf->execMask(lane)) {
1828 vdst[lane] = src0[lane] - src1[lane];
1829 }
1830 }
1831
1832 vdst.write();
1833 } // execute
1834 // --- Inst_VOP3__V_SUBREV_U16 class methods ---
1835
1837 : Inst_VOP3A(iFmt, "v_subrev_u16", false)
1838 {
1839 setFlag(ALU);
1840 } // Inst_VOP3__V_SUBREV_U16
1841
1843 {
1844 } // ~Inst_VOP3__V_SUBREV_U16
1845
1846 // --- description from .arch file ---
1847 // D.u16 = S1.u16 - S0.u16.
1848 // Supports saturation (unsigned 16-bit integer domain).
1849 // SQ translates this to V_SUB_U16 with reversed operands.
1850 void
1852 {
1853 Wavefront *wf = gpuDynInst->wavefront();
1854 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
1855 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
1856 VecOperandU16 vdst(gpuDynInst, instData.VDST);
1857
1858 src0.readSrc();
1859 src1.readSrc();
1860
1864 assert(!(instData.ABS & 0x1));
1865 assert(!(instData.ABS & 0x2));
1866 assert(!(instData.ABS & 0x4));
1867 assert(!(extData.NEG & 0x1));
1868 assert(!(extData.NEG & 0x2));
1869 assert(!(extData.NEG & 0x4));
1870
1871 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1872 if (wf->execMask(lane)) {
1873 vdst[lane] = src1[lane] - src0[lane];
1874 }
1875 }
1876
1877 vdst.write();
1878 } // execute
1879 // --- Inst_VOP3__V_MUL_LO_U16 class methods ---
1880
1882 : Inst_VOP3A(iFmt, "v_mul_lo_u16", false)
1883 {
1884 setFlag(ALU);
1885 } // Inst_VOP3__V_MUL_LO_U16
1886
1888 {
1889 } // ~Inst_VOP3__V_MUL_LO_U16
1890
1891 // --- description from .arch file ---
1892 // D.u16 = S0.u16 * S1.u16.
1893 // Supports saturation (unsigned 16-bit integer domain).
1894 void
1896 {
1897 Wavefront *wf = gpuDynInst->wavefront();
1898 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
1899 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
1900 VecOperandU16 vdst(gpuDynInst, instData.VDST);
1901
1902 src0.readSrc();
1903 src1.readSrc();
1904
1908 assert(!(instData.ABS & 0x1));
1909 assert(!(instData.ABS & 0x2));
1910 assert(!(instData.ABS & 0x4));
1911 assert(!(extData.NEG & 0x1));
1912 assert(!(extData.NEG & 0x2));
1913 assert(!(extData.NEG & 0x4));
1914
1915 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1916 if (wf->execMask(lane)) {
1917 vdst[lane] = src0[lane] * src1[lane];
1918 }
1919 }
1920
1921 vdst.write();
1922 } // execute
1923 // --- Inst_VOP3__V_LSHLREV_B16 class methods ---
1924
1926 : Inst_VOP3A(iFmt, "v_lshlrev_b16", false)
1927 {
1928 setFlag(ALU);
1929 } // Inst_VOP3__V_LSHLREV_B16
1930
1932 {
1933 } // ~Inst_VOP3__V_LSHLREV_B16
1934
1935 // --- description from .arch file ---
1936 // D.u[15:0] = S1.u[15:0] << S0.u[3:0].
1937 // SQ translates this to an internal SP opcode.
1938 void
1940 {
1941 Wavefront *wf = gpuDynInst->wavefront();
1942 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
1943 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
1944 VecOperandU16 vdst(gpuDynInst, instData.VDST);
1945
1946 src0.readSrc();
1947 src1.readSrc();
1948
1952 assert(!(instData.ABS & 0x1));
1953 assert(!(instData.ABS & 0x2));
1954 assert(!(instData.ABS & 0x4));
1955 assert(!(extData.NEG & 0x1));
1956 assert(!(extData.NEG & 0x2));
1957 assert(!(extData.NEG & 0x4));
1958
1959 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1960 if (wf->execMask(lane)) {
1961 vdst[lane] = src1[lane] << bits(src0[lane], 3, 0);
1962 }
1963 }
1964
1965 vdst.write();
1966 } // execute
1967 // --- Inst_VOP3__V_LSHRREV_B16 class methods ---
1968
1970 : Inst_VOP3A(iFmt, "v_lshrrev_b16", false)
1971 {
1972 setFlag(ALU);
1973 } // Inst_VOP3__V_LSHRREV_B16
1974
1976 {
1977 } // ~Inst_VOP3__V_LSHRREV_B16
1978
1979 // --- description from .arch file ---
1980 // D.u[15:0] = S1.u[15:0] >> S0.u[3:0].
1981 // The vacated bits are set to zero.
1982 // SQ translates this to an internal SP opcode.
1983 void
1985 {
1986 Wavefront *wf = gpuDynInst->wavefront();
1987 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
1988 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
1989 VecOperandU16 vdst(gpuDynInst, instData.VDST);
1990
1991 src0.readSrc();
1992 src1.readSrc();
1993
1994 if (instData.ABS & 0x1) {
1995 src0.absModifier();
1996 }
1997
1998 if (instData.ABS & 0x2) {
1999 src1.absModifier();
2000 }
2001
2002 if (extData.NEG & 0x1) {
2003 src0.negModifier();
2004 }
2005
2006 if (extData.NEG & 0x2) {
2007 src1.negModifier();
2008 }
2009
2010 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2011 if (wf->execMask(lane)) {
2012 vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0);
2013 }
2014 }
2015
2016 vdst.write();
2017 } // execute
2018 // --- Inst_VOP3__V_ASHRREV_I16 class methods ---
2019
2021 : Inst_VOP3A(iFmt, "v_ashrrev_i16", false)
2022 {
2023 setFlag(ALU);
2024 } // Inst_VOP3__V_ASHRREV_I16
2025
2027 {
2028 } // ~Inst_VOP3__V_ASHRREV_I16
2029
2030 // --- description from .arch file ---
2031 // D.i[15:0] = signext(S1.i[15:0]) >> S0.i[3:0].
2032 // The vacated bits are set to the sign bit of the input value.
2033 // SQ translates this to an internal SP opcode.
2034 void
2036 {
2037 Wavefront *wf = gpuDynInst->wavefront();
2038 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
2039 ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
2040 VecOperandI16 vdst(gpuDynInst, instData.VDST);
2041
2042 src0.readSrc();
2043 src1.readSrc();
2044
2048 assert(!(instData.ABS & 0x1));
2049 assert(!(instData.ABS & 0x2));
2050 assert(!(instData.ABS & 0x4));
2051 assert(!(extData.NEG & 0x1));
2052 assert(!(extData.NEG & 0x2));
2053 assert(!(extData.NEG & 0x4));
2054
2055 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2056 if (wf->execMask(lane)) {
2057 vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0);
2058 }
2059 }
2060
2061 vdst.write();
2062 } // execute
2063 // --- Inst_VOP3__V_MAX_F16 class methods ---
2064
2066 : Inst_VOP3A(iFmt, "v_max_f16", false)
2067 {
2068 setFlag(ALU);
2069 setFlag(F16);
2070 } // Inst_VOP3__V_MAX_F16
2071
2073 {
2074 } // ~Inst_VOP3__V_MAX_F16
2075
2076 // --- description from .arch file ---
2077 // D.f16 = max(S0.f16, S1.f16).
2078 // IEEE compliant. Supports denormals, round mode, exception flags,
2079 // saturation.
2080 void
2082 {
2084 } // execute
2085 // --- Inst_VOP3__V_MIN_F16 class methods ---
2086
2088 : Inst_VOP3A(iFmt, "v_min_f16", false)
2089 {
2090 setFlag(ALU);
2091 setFlag(F16);
2092 } // Inst_VOP3__V_MIN_F16
2093
2095 {
2096 } // ~Inst_VOP3__V_MIN_F16
2097
2098 // --- description from .arch file ---
2099 // D.f16 = min(S0.f16, S1.f16).
2100 // IEEE compliant. Supports denormals, round mode, exception flags,
2101 // saturation.
2102 void
2104 {
2106 } // execute
2107 // --- Inst_VOP3__V_MAX_U16 class methods ---
2108
2110 : Inst_VOP3A(iFmt, "v_max_u16", false)
2111 {
2112 setFlag(ALU);
2113 } // Inst_VOP3__V_MAX_U16
2114
2116 {
2117 } // ~Inst_VOP3__V_MAX_U16
2118
2119 // --- description from .arch file ---
2120 // D.u[15:0] = max(S0.u[15:0], S1.u[15:0]).
2121 void
2123 {
2124 Wavefront *wf = gpuDynInst->wavefront();
2125 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
2126 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
2127 VecOperandU16 vdst(gpuDynInst, instData.VDST);
2128
2129 src0.readSrc();
2130 src1.readSrc();
2131
2132 if (instData.ABS & 0x1) {
2133 src0.absModifier();
2134 }
2135
2136 if (instData.ABS & 0x2) {
2137 src1.absModifier();
2138 }
2139
2140 if (extData.NEG & 0x1) {
2141 src0.negModifier();
2142 }
2143
2144 if (extData.NEG & 0x2) {
2145 src1.negModifier();
2146 }
2147
2148 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2149 if (wf->execMask(lane)) {
2150 vdst[lane] = std::max(src0[lane], src1[lane]);
2151 }
2152 }
2153
2154 vdst.write();
2155 } // execute
2156 // --- Inst_VOP3__V_MAX_I16 class methods ---
2157
2159 : Inst_VOP3A(iFmt, "v_max_i16", false)
2160 {
2161 setFlag(ALU);
2162 } // Inst_VOP3__V_MAX_I16
2163
2165 {
2166 } // ~Inst_VOP3__V_MAX_I16
2167
2168 // --- description from .arch file ---
2169 // D.i[15:0] = max(S0.i[15:0], S1.i[15:0]).
2170 void
2172 {
2173 Wavefront *wf = gpuDynInst->wavefront();
2174 ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
2175 ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
2176 VecOperandI16 vdst(gpuDynInst, instData.VDST);
2177
2178 src0.readSrc();
2179 src1.readSrc();
2180
2181 if (instData.ABS & 0x1) {
2182 src0.absModifier();
2183 }
2184
2185 if (instData.ABS & 0x2) {
2186 src1.absModifier();
2187 }
2188
2189 if (extData.NEG & 0x1) {
2190 src0.negModifier();
2191 }
2192
2193 if (extData.NEG & 0x2) {
2194 src1.negModifier();
2195 }
2196
2197 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2198 if (wf->execMask(lane)) {
2199 vdst[lane] = std::max(src0[lane], src1[lane]);
2200 }
2201 }
2202
2203 vdst.write();
2204 } // execute
2205 // --- Inst_VOP3__V_MIN_U16 class methods ---
2206
2208 : Inst_VOP3A(iFmt, "v_min_u16", false)
2209 {
2210 setFlag(ALU);
2211 } // Inst_VOP3__V_MIN_U16
2212
2214 {
2215 } // ~Inst_VOP3__V_MIN_U16
2216
2217 // --- description from .arch file ---
2218 // D.u[15:0] = min(S0.u[15:0], S1.u[15:0]).
2219 void
2221 {
2222 Wavefront *wf = gpuDynInst->wavefront();
2223 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
2224 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
2225 VecOperandU16 vdst(gpuDynInst, instData.VDST);
2226
2227 src0.readSrc();
2228 src1.readSrc();
2229
2230 if (instData.ABS & 0x1) {
2231 src0.absModifier();
2232 }
2233
2234 if (instData.ABS & 0x2) {
2235 src1.absModifier();
2236 }
2237
2238 if (extData.NEG & 0x1) {
2239 src0.negModifier();
2240 }
2241
2242 if (extData.NEG & 0x2) {
2243 src1.negModifier();
2244 }
2245
2246 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2247 if (wf->execMask(lane)) {
2248 vdst[lane] = std::min(src0[lane], src1[lane]);
2249 }
2250 }
2251
2252 vdst.write();
2253 } // execute
2254 // --- Inst_VOP3__V_MIN_I16 class methods ---
2255
2257 : Inst_VOP3A(iFmt, "v_min_i16", false)
2258 {
2259 setFlag(ALU);
2260 } // Inst_VOP3__V_MIN_I16
2261
2263 {
2264 } // ~Inst_VOP3__V_MIN_I16
2265
2266 // --- description from .arch file ---
2267 // D.i[15:0] = min(S0.i[15:0], S1.i[15:0]).
2268 void
2270 {
2271 Wavefront *wf = gpuDynInst->wavefront();
2272 ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
2273 ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
2274 VecOperandI16 vdst(gpuDynInst, instData.VDST);
2275
2276 src0.readSrc();
2277 src1.readSrc();
2278
2279 if (instData.ABS & 0x1) {
2280 src0.absModifier();
2281 }
2282
2283 if (instData.ABS & 0x2) {
2284 src1.absModifier();
2285 }
2286
2287 if (extData.NEG & 0x1) {
2288 src0.negModifier();
2289 }
2290
2291 if (extData.NEG & 0x2) {
2292 src1.negModifier();
2293 }
2294
2295 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2296 if (wf->execMask(lane)) {
2297 vdst[lane] = std::min(src0[lane], src1[lane]);
2298 }
2299 }
2300
2301 vdst.write();
2302 } // execute
2303 // --- Inst_VOP3__V_LDEXP_F16 class methods ---
2304
2306 : Inst_VOP3A(iFmt, "v_ldexp_f16", false)
2307 {
2308 setFlag(ALU);
2309 setFlag(F16);
2310 } // Inst_VOP3__V_LDEXP_F16
2311
2313 {
2314 } // ~Inst_VOP3__V_LDEXP_F16
2315
2316 // --- description from .arch file ---
2317 // D.f16 = S0.f16 * (2 ** S1.i16).
2318 void
2320 {
2322 } // execute
2323 // --- Inst_VOP3__V_ADD_U32 class methods ---
2324
2326 : Inst_VOP3A(iFmt, "v_add_u32", false)
2327 {
2328 setFlag(ALU);
2329 } // Inst_VOP3__V_ADD_U32
2330
2332 {
2333 } // ~Inst_VOP3__V_ADD_U32
2334
2335 // --- description from .arch file ---
2336 // D.u32 = S0.u32 + S1.u32.
2337 void
2339 {
2340 Wavefront *wf = gpuDynInst->wavefront();
2341 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
2342 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
2343 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2344
2345 src0.readSrc();
2346 src1.readSrc();
2347
2351 assert(!(instData.ABS & 0x1));
2352 assert(!(instData.ABS & 0x2));
2353 assert(!(instData.ABS & 0x4));
2354 assert(!(extData.NEG & 0x1));
2355 assert(!(extData.NEG & 0x2));
2356 assert(!(extData.NEG & 0x4));
2357
2358 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2359 if (wf->execMask(lane)) {
2360 vdst[lane] = src0[lane] + src1[lane];
2361 }
2362 }
2363
2364 vdst.write();
2365 } // execute
2366 // --- Inst_VOP3__V_SUB_U32 class methods ---
2367
2369 : Inst_VOP3A(iFmt, "v_sub_u32", false)
2370 {
2371 setFlag(ALU);
2372 } // Inst_VOP3__V_SUB_U32
2373
2375 {
2376 } // ~Inst_VOP3__V_SUB_U32
2377
2378 // --- description from .arch file ---
2379 // D.u32 = S0.u32 - S1.u32.
2380 void
2382 {
2383 Wavefront *wf = gpuDynInst->wavefront();
2384 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
2385 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
2386 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2387
2388 src0.readSrc();
2389 src1.readSrc();
2390
2394 assert(!(instData.ABS & 0x1));
2395 assert(!(instData.ABS & 0x2));
2396 assert(!(instData.ABS & 0x4));
2397 assert(!(extData.NEG & 0x1));
2398 assert(!(extData.NEG & 0x2));
2399 assert(!(extData.NEG & 0x4));
2400
2401 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2402 if (wf->execMask(lane)) {
2403 vdst[lane] = src0[lane] - src1[lane];
2404 }
2405 }
2406
2407 vdst.write();
2408 } // execute
2409 // --- Inst_VOP3__V_SUBREV_U32 class methods ---
2410
2412 : Inst_VOP3A(iFmt, "v_subrev_u32", false)
2413 {
2414 setFlag(ALU);
2415 } // Inst_VOP3__V_SUBREV_U32
2416
2418 {
2419 } // ~Inst_VOP3__V_SUBREV_U32
2420
2421 // --- description from .arch file ---
2422 // D.u32 = S1.u32 - S0.u32.
2423 void
2425 {
2426 Wavefront *wf = gpuDynInst->wavefront();
2427 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
2428 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
2429 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2430
2431 src0.readSrc();
2432 src1.readSrc();
2433
2437 assert(!(instData.ABS & 0x1));
2438 assert(!(instData.ABS & 0x2));
2439 assert(!(instData.ABS & 0x4));
2440 assert(!(extData.NEG & 0x1));
2441 assert(!(extData.NEG & 0x2));
2442 assert(!(extData.NEG & 0x4));
2443
2444 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2445 if (wf->execMask(lane)) {
2446 vdst[lane] = src1[lane] - src0[lane];
2447 }
2448 }
2449
2450 vdst.write();
2451 } // execute
2452 // --- Inst_VOP3__V_FMAC_F32 class methods ---
2453
2455 : Inst_VOP3A(iFmt, "v_fmac_f32", false)
2456 {
2457 setFlag(ALU);
2458 setFlag(F32);
2459 setFlag(FMA);
2460 } // Inst_VOP3__V_FMAC_F32
2461
2463 {
2464 } // ~Inst_VOP3__V_FMAC_F32
2465
2466 // --- description from .arch file ---
2467 // D.f = S0.f * S1.f + D.f.
2468 void
2470 {
2471 Wavefront *wf = gpuDynInst->wavefront();
2472 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
2473 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
2474 VecOperandF32 vdst(gpuDynInst, instData.VDST);
2475
2476 src0.readSrc();
2477 src1.readSrc();
2478 vdst.read();
2479
2480 panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
2481 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2482 panic_if(instData.OPSEL, "OPSEL not implemented for %s", _opcode);
2483
2484 if (instData.ABS & 0x1) {
2485 src0.absModifier();
2486 }
2487
2488 if (instData.ABS & 0x2) {
2489 src1.absModifier();
2490 }
2491
2492 if (instData.ABS & 0x4) {
2493 vdst.absModifier();
2494 }
2495
2496 if (extData.NEG & 0x1) {
2497 src0.negModifier();
2498 }
2499
2500 if (extData.NEG & 0x2) {
2501 src1.negModifier();
2502 }
2503
2504 if (extData.NEG & 0x4) {
2505 vdst.negModifier();
2506 }
2507
2508 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2509 if (wf->execMask(lane)) {
2510 float out = std::fma(src0[lane], src1[lane], vdst[lane]);
2511 out = omodModifier(out, extData.OMOD);
2512 if (instData.CLAMP) {
2513 out = std::clamp(vdst[lane], 0.0f, 1.0f);
2514 }
2515 vdst[lane] = out;
2516 }
2517 }
2518
2519 vdst.write();
2520 } // execute
2521 // --- Inst_VOP3__V_NOP class methods ---
2522
2524 : Inst_VOP3A(iFmt, "v_nop", false)
2525 {
2526 setFlag(Nop);
2527 setFlag(ALU);
2528 } // Inst_VOP3__V_NOP
2529
2531 {
2532 } // ~Inst_VOP3__V_NOP
2533
2534 // --- description from .arch file ---
2535 // Do nothing.
2536 void
2538 {
2539 } // execute
2540 // --- Inst_VOP3__V_MOV_B32 class methods ---
2541
2543 : Inst_VOP3A(iFmt, "v_mov_b32", false)
2544 {
2545 setFlag(ALU);
2546 } // Inst_VOP3__V_MOV_B32
2547
2549 {
2550 } // ~Inst_VOP3__V_MOV_B32
2551
2552 // --- description from .arch file ---
2553 // D.u = S0.u.
2554 // Input and output modifiers not supported; this is an untyped operation.
2555 void
2557 {
2558 Wavefront *wf = gpuDynInst->wavefront();
2559 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
2560 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2561
2562 src.readSrc();
2563
2564 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2565 if (wf->execMask(lane)) {
2566 vdst[lane] = src[lane];
2567 }
2568 }
2569
2570 vdst.write();
2571 } // execute
2572 // --- Inst_VOP3__V_CVT_I32_F64 class methods ---
2573
2575 : Inst_VOP3A(iFmt, "v_cvt_i32_f64", false)
2576 {
2577 setFlag(ALU);
2578 setFlag(F64);
2579 } // Inst_VOP3__V_CVT_I32_F64
2580
2582 {
2583 } // ~Inst_VOP3__V_CVT_I32_F64
2584
2585 // --- description from .arch file ---
2586 // D.i = (int)S0.d.
2587 // Out-of-range floating point values (including infinity) saturate. NaN is
2588 // --- converted to 0.
2589 void
2591 {
2592 Wavefront *wf = gpuDynInst->wavefront();
2593 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
2594 VecOperandI32 vdst(gpuDynInst, instData.VDST);
2595
2596 src.readSrc();
2597
2598 if (instData.ABS & 0x1) {
2599 src.absModifier();
2600 }
2601
2602 if (extData.NEG & 0x1) {
2603 src.negModifier();
2604 }
2605
2606 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2607 if (wf->execMask(lane)) {
2608 int exp;
2609 std::frexp(src[lane],&exp);
2610 if (std::isnan(src[lane])) {
2611 vdst[lane] = 0;
2612 } else if (std::isinf(src[lane]) || exp > 30) {
2613 if (std::signbit(src[lane])) {
2614 vdst[lane] = INT_MIN;
2615 } else {
2616 vdst[lane] = INT_MAX;
2617 }
2618 } else {
2619 vdst[lane] = (VecElemI32)src[lane];
2620 }
2621 }
2622 }
2623
2624 vdst.write();
2625 } // execute
2626 // --- Inst_VOP3__V_CVT_F64_I32 class methods ---
2627
2629 : Inst_VOP3A(iFmt, "v_cvt_f64_i32", false)
2630 {
2631 setFlag(ALU);
2632 setFlag(F64);
2633 } // Inst_VOP3__V_CVT_F64_I32
2634
2636 {
2637 } // ~Inst_VOP3__V_CVT_F64_I32
2638
2639 // --- description from .arch file ---
2640 // D.d = (double)S0.i.
2641 void
2643 {
2644 Wavefront *wf = gpuDynInst->wavefront();
2645 ConstVecOperandI32 src(gpuDynInst, extData.SRC0);
2646 VecOperandF64 vdst(gpuDynInst, instData.VDST);
2647
2648 src.readSrc();
2649
2650 if (instData.ABS & 0x1) {
2651 src.absModifier();
2652 }
2653
2654 if (extData.NEG & 0x1) {
2655 src.negModifier();
2656 }
2657
2658 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2659 if (wf->execMask(lane)) {
2660 vdst[lane] = (VecElemF64)src[lane];
2661 }
2662 }
2663
2664 vdst.write();
2665 } // execute
2666 // --- Inst_VOP3__V_CVT_F32_I32 class methods ---
2667
2669 : Inst_VOP3A(iFmt, "v_cvt_f32_i32", false)
2670 {
2671 setFlag(ALU);
2672 setFlag(F32);
2673 } // Inst_VOP3__V_CVT_F32_I32
2674
2676 {
2677 } // ~Inst_VOP3__V_CVT_F32_I32
2678
2679 // --- description from .arch file ---
2680 // D.f = (float)S0.i.
2681 void
2683 {
2684 Wavefront *wf = gpuDynInst->wavefront();
2685 VecOperandI32 src(gpuDynInst, extData.SRC0);
2686 VecOperandF32 vdst(gpuDynInst, instData.VDST);
2687
2688 src.readSrc();
2689
2693 assert(!(instData.ABS & 0x1));
2694 assert(!(instData.ABS & 0x2));
2695 assert(!(instData.ABS & 0x4));
2696 assert(!(extData.NEG & 0x1));
2697 assert(!(extData.NEG & 0x2));
2698 assert(!(extData.NEG & 0x4));
2699
2700 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2701 if (wf->execMask(lane)) {
2702 vdst[lane] = (VecElemF32)src[lane];
2703 }
2704 }
2705
2706 vdst.write();
2707 } // execute
2708 // --- Inst_VOP3__V_CVT_F32_U32 class methods ---
2709
2711 : Inst_VOP3A(iFmt, "v_cvt_f32_u32", false)
2712 {
2713 setFlag(ALU);
2714 setFlag(F32);
2715 } // Inst_VOP3__V_CVT_F32_U32
2716
2718 {
2719 } // ~Inst_VOP3__V_CVT_F32_U32
2720
2721 // --- description from .arch file ---
2722 // D.f = (float)S0.u.
2723 void
2725 {
2726 Wavefront *wf = gpuDynInst->wavefront();
2727 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
2728 VecOperandF32 vdst(gpuDynInst, instData.VDST);
2729
2730 src.readSrc();
2731
2732 if (instData.ABS & 0x1) {
2733 src.absModifier();
2734 }
2735
2736 if (extData.NEG & 0x1) {
2737 src.negModifier();
2738 }
2739
2740 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2741 if (wf->execMask(lane)) {
2742 vdst[lane] = (VecElemF32)src[lane];
2743 }
2744 }
2745
2746 vdst.write();
2747 } // execute
2748 // --- Inst_VOP3__V_CVT_U32_F32 class methods ---
2749
2751 : Inst_VOP3A(iFmt, "v_cvt_u32_f32", false)
2752 {
2753 setFlag(ALU);
2754 setFlag(F32);
2755 } // Inst_VOP3__V_CVT_U32_F32
2756
2758 {
2759 } // ~Inst_VOP3__V_CVT_U32_F32
2760
2761 // --- description from .arch file ---
2762 // D.u = (unsigned)S0.f.
2763 // Out-of-range floating point values (including infinity) saturate. NaN is
2764 // --- converted to 0.
2765 void
2767 {
2768 Wavefront *wf = gpuDynInst->wavefront();
2769 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
2770 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2771
2772 src.readSrc();
2773
2774 if (instData.ABS & 0x1) {
2775 src.absModifier();
2776 }
2777
2778 if (extData.NEG & 0x1) {
2779 src.negModifier();
2780 }
2781
2782 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2783 if (wf->execMask(lane)) {
2784 int exp;
2785 std::frexp(src[lane],&exp);
2786 if (std::isnan(src[lane])) {
2787 vdst[lane] = 0;
2788 } else if (std::isinf(src[lane])) {
2789 if (std::signbit(src[lane])) {
2790 vdst[lane] = 0;
2791 } else {
2792 vdst[lane] = UINT_MAX;
2793 }
2794 } else if (exp > 31) {
2795 vdst[lane] = UINT_MAX;
2796 } else {
2797 vdst[lane] = (VecElemU32)src[lane];
2798 }
2799 }
2800 }
2801
2802 vdst.write();
2803 } // execute
2804 // --- Inst_VOP3__V_CVT_I32_F32 class methods ---
2805
2807 : Inst_VOP3A(iFmt, "v_cvt_i32_f32", false)
2808 {
2809 setFlag(ALU);
2810 setFlag(F32);
2811 } // Inst_VOP3__V_CVT_I32_F32
2812
2814 {
2815 } // ~Inst_VOP3__V_CVT_I32_F32
2816
2817 // --- description from .arch file ---
2818 // D.i = (int)S0.f.
2819 // Out-of-range floating point values (including infinity) saturate. NaN is
2820 // --- converted to 0.
2821 void
2823 {
2824 Wavefront *wf = gpuDynInst->wavefront();
2825 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
2826 VecOperandI32 vdst(gpuDynInst, instData.VDST);
2827
2828 src.readSrc();
2829
2830 if (instData.ABS & 0x1) {
2831 src.absModifier();
2832 }
2833
2834 if (extData.NEG & 0x1) {
2835 src.negModifier();
2836 }
2837
2841 assert(!(instData.ABS & 0x2));
2842 assert(!(instData.ABS & 0x4));
2843 assert(!(extData.NEG & 0x2));
2844 assert(!(extData.NEG & 0x4));
2845
2846 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2847 if (wf->execMask(lane)) {
2848 int exp;
2849 std::frexp(src[lane],&exp);
2850 if (std::isnan(src[lane])) {
2851 vdst[lane] = 0;
2852 } else if (std::isinf(src[lane]) || exp > 30) {
2853 if (std::signbit(src[lane])) {
2854 vdst[lane] = INT_MIN;
2855 } else {
2856 vdst[lane] = INT_MAX;
2857 }
2858 } else {
2859 vdst[lane] = (VecElemI32)src[lane];
2860 }
2861 }
2862 }
2863
2864 vdst.write();
2865 } // execute
2866 // --- Inst_VOP3__V_MOV_FED_B32 class methods ---
2867
2869 : Inst_VOP3A(iFmt, "v_mov_fed_b32", false)
2870 {
2871 setFlag(ALU);
2872 } // Inst_VOP3__V_MOV_FED_B32
2873
2875 {
2876 } // ~Inst_VOP3__V_MOV_FED_B32
2877
2878 // --- description from .arch file ---
2879 // D.u = S0.u;
2880 // Introduce EDC double error upon write to dest vgpr without causing an
2881 // --- exception.
2882 // Input and output modifiers not supported; this is an untyped operation.
// NOTE(review): the execute() signature and body lines (hyperlinked
// identifiers in the doxygen render) were dropped by the extraction, so the
// body below appears empty — confirm against the original vop3.cc. Nothing
// visible here performs the documented move.
2883 void
2885 {
2887 } // execute
2888 // --- Inst_VOP3__V_CVT_F16_F32 class methods ---
2889
2891 : Inst_VOP3A(iFmt, "v_cvt_f16_f32", false)
2892 {
2893 setFlag(ALU);
2894 setFlag(F32);
2895 } // Inst_VOP3__V_CVT_F16_F32
2896
2898 {
2899 } // ~Inst_VOP3__V_CVT_F16_F32
2900
2901 // --- description from .arch file ---
2902 // D.f16 = flt32_to_flt16(S0.f).
2903 // Supports input modifiers and creates FP16 denormals when appropriate.
2904 void
2906 {
2907 Wavefront *wf = gpuDynInst->wavefront();
2908 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
2909 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2910
2911 src0.readSrc();
2912 vdst.read();
2913
2914 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
2915 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2916
2917 unsigned abs = instData.ABS;
2918 unsigned neg = extData.NEG;
2919 int opsel = instData.OPSEL;
2920
2921 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2922 if (wf->execMask(lane)) {
2923 float tmp = src0[lane];
2924
2925 if ((abs & 1) && (tmp < 0)) tmp = -tmp;
2926 if (neg & 1) tmp = -tmp;
2927
2928 tmp = omodModifier(tmp, extData.OMOD);
2929 if (instData.CLAMP) {
2930 tmp = std::clamp(tmp, 0.0f, 1.0f);
2931 }
2932
2933 AMDGPU::mxfloat16 out(tmp);
2934
2935 // If opsel[3] use upper 16-bits of dest, otherwise lower.
2936 if (opsel & 8) {
2937 replaceBits(vdst[lane], 31, 16, (out.data >> 16));
2938 } else {
2939 replaceBits(vdst[lane], 15, 0, (out.data >> 16));
2940 }
2941 }
2942 }
2943
2944 vdst.write();
2945 } // execute
2946 // --- Inst_VOP3__V_CVT_F32_F16 class methods ---
2947
2949 : Inst_VOP3A(iFmt, "v_cvt_f32_f16", false)
2950 {
2951 setFlag(ALU);
2952 setFlag(F32);
2953 } // Inst_VOP3__V_CVT_F32_F16
2954
2956 {
2957 } // ~Inst_VOP3__V_CVT_F32_F16
2958
2959 // --- description from .arch file ---
2960 // D.f = flt16_to_flt32(S0.f16).
2961 // FP16 denormal inputs are always accepted.
2962 void
2964 {
2965 Wavefront *wf = gpuDynInst->wavefront();
2966 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
2967 VecOperandF32 vdst(gpuDynInst, instData.VDST);
2968
2969 src0.readSrc();
2970
2971 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
2972 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2973 panic_if(instData.OPSEL, "OPSEL not implemented for %s", _opcode);
2974
2975 unsigned abs = instData.ABS;
2976 unsigned neg = extData.NEG;
2977
2978 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2979 if (wf->execMask(lane)) {
2980 AMDGPU::mxfloat16 tmp(src0[lane]);
2981
2982 if ((abs & 1) && (tmp < 0)) tmp = -tmp;
2983 if (neg & 1) tmp = -tmp;
2984
2985 float out = omodModifier(float(tmp), extData.OMOD);
2986 if (instData.CLAMP) {
2987 out = std::clamp(out, 0.0f, 1.0f);
2988 }
2989
2990 vdst[lane] = out;
2991 }
2992 }
2993
2994 vdst.write();
2995 } // execute
2996 // --- Inst_VOP3__V_CVT_RPI_I32_F32 class methods ---
2997
2999 InFmt_VOP3A *iFmt)
3000 : Inst_VOP3A(iFmt, "v_cvt_rpi_i32_f32", false)
3001 {
3002 setFlag(ALU);
3003 setFlag(F32);
3004 } // Inst_VOP3__V_CVT_RPI_I32_F32
3005
3007 {
3008 } // ~Inst_VOP3__V_CVT_RPI_I32_F32
3009
3010 // --- description from .arch file ---
3011 // D.i = (int)floor(S0.f + 0.5).
3012 void
3014 {
3015 Wavefront *wf = gpuDynInst->wavefront();
3016 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3017 VecOperandI32 vdst(gpuDynInst, instData.VDST);
3018
3019 src.readSrc();
3020
3021 if (instData.ABS & 0x1) {
3022 src.absModifier();
3023 }
3024
3025 if (extData.NEG & 0x1) {
3026 src.negModifier();
3027 }
3028
3029 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3030 if (wf->execMask(lane)) {
3031 vdst[lane] = (VecElemI32)std::floor(src[lane] + 0.5);
3032 }
3033 }
3034
3035 vdst.write();
3036 } // execute
3037 // --- Inst_VOP3__V_CVT_FLR_I32_F32 class methods ---
3038
3040 InFmt_VOP3A *iFmt)
3041 : Inst_VOP3A(iFmt, "v_cvt_flr_i32_f32", false)
3042 {
3043 setFlag(ALU);
3044 setFlag(F32);
3045 } // Inst_VOP3__V_CVT_FLR_I32_F32
3046
3048 {
3049 } // ~Inst_VOP3__V_CVT_FLR_I32_F32
3050
3051 // --- description from .arch file ---
3052 // D.i = (int)floor(S0.f).
3053 void
3055 {
3056 Wavefront *wf = gpuDynInst->wavefront();
3057 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3058 VecOperandI32 vdst(gpuDynInst, instData.VDST);
3059
3060 src.readSrc();
3061
3062 if (instData.ABS & 0x1) {
3063 src.absModifier();
3064 }
3065
3066 if (extData.NEG & 0x1) {
3067 src.negModifier();
3068 }
3069
3070 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3071 if (wf->execMask(lane)) {
3072 vdst[lane] = (VecElemI32)std::floor(src[lane]);
3073 }
3074 }
3075
3076 vdst.write();
3077 } // execute
3078 // --- Inst_VOP3__V_CVT_OFF_F32_I4 class methods ---
3079
3081 : Inst_VOP3A(iFmt, "v_cvt_off_f32_i4", false)
3082 {
3083 setFlag(ALU);
3084 setFlag(F32);
3085 } // Inst_VOP3__V_CVT_OFF_F32_I4
3086
3088 {
3089 } // ~Inst_VOP3__V_CVT_OFF_F32_I4
3090
3091 // --- description from .arch file ---
3092 // 4-bit signed int to 32-bit float. Used for interpolation in shader.
// NOTE(review): the execute() signature and its single body statement were
// dropped by the doxygen extraction (they were hyperlinked identifiers), so
// the body below shows only the parse-failure comment — confirm against the
// original vop3.cc before relying on this listing.
3093 void
3095 {
3096 // Could not parse sq_uc.arch desc field
3098 } // execute
3099 // --- Inst_VOP3__V_CVT_F32_F64 class methods ---
3100
3102 : Inst_VOP3A(iFmt, "v_cvt_f32_f64", false)
3103 {
3104 setFlag(ALU);
3105 setFlag(F64);
3106 } // Inst_VOP3__V_CVT_F32_F64
3107
3109 {
3110 } // ~Inst_VOP3__V_CVT_F32_F64
3111
3112 // --- description from .arch file ---
3113 // D.f = (float)S0.d.
3114 void
3116 {
3117 Wavefront *wf = gpuDynInst->wavefront();
3118 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
3119 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3120
3121 src.readSrc();
3122
3123 if (instData.ABS & 0x1) {
3124 src.absModifier();
3125 }
3126
3127 if (extData.NEG & 0x1) {
3128 src.negModifier();
3129 }
3130
3134 assert(!(instData.ABS & 0x2));
3135 assert(!(instData.ABS & 0x4));
3136 assert(!(extData.NEG & 0x2));
3137 assert(!(extData.NEG & 0x4));
3138
3139 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3140 if (wf->execMask(lane)) {
3141 vdst[lane] = (VecElemF32)src[lane];
3142 }
3143 }
3144
3145 vdst.write();
3146 } // execute
3147 // --- Inst_VOP3__V_CVT_F64_F32 class methods ---
3148
3150 : Inst_VOP3A(iFmt, "v_cvt_f64_f32", false)
3151 {
3152 setFlag(ALU);
3153 setFlag(F64);
3154 } // Inst_VOP3__V_CVT_F64_F32
3155
3157 {
3158 } // ~Inst_VOP3__V_CVT_F64_F32
3159
3160 // --- description from .arch file ---
3161 // D.d = (double)S0.f.
3162 void
3164 {
3165 Wavefront *wf = gpuDynInst->wavefront();
3166 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3167 VecOperandF64 vdst(gpuDynInst, instData.VDST);
3168
3169 src.readSrc();
3170
3171 if (instData.ABS & 0x1) {
3172 src.absModifier();
3173 }
3174
3175 if (extData.NEG & 0x1) {
3176 src.negModifier();
3177 }
3178
3182 assert(!(instData.ABS & 0x2));
3183 assert(!(instData.ABS & 0x4));
3184 assert(!(extData.NEG & 0x2));
3185 assert(!(extData.NEG & 0x4));
3186
3187 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3188 if (wf->execMask(lane)) {
3189 vdst[lane] = (VecElemF64)src[lane];
3190 }
3191 }
3192
3193 vdst.write();
3194 } // execute
3195 // --- Inst_VOP3__V_CVT_F32_UBYTE0 class methods ---
3196
3198 : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte0", false)
3199 {
3200 setFlag(ALU);
3201 setFlag(F32);
3202 } // Inst_VOP3__V_CVT_F32_UBYTE0
3203
3205 {
3206 } // ~Inst_VOP3__V_CVT_F32_UBYTE0
3207
3208 // --- description from .arch file ---
3209 // D.f = (float)(S0.u[7:0]).
3210 void
3212 {
3213 Wavefront *wf = gpuDynInst->wavefront();
3214 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
3215 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3216
3217 src.readSrc();
3218
3219 if (instData.ABS & 0x1) {
3220 src.absModifier();
3221 }
3222
3223 if (extData.NEG & 0x1) {
3224 src.negModifier();
3225 }
3226
3227 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3228 if (wf->execMask(lane)) {
3229 vdst[lane] = (VecElemF32)bits(src[lane], 7, 0);
3230 }
3231 }
3232
3233 vdst.write();
3234 } // execute
3235 // --- Inst_VOP3__V_CVT_F32_UBYTE1 class methods ---
3236
3238 : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte1", false)
3239 {
3240 setFlag(ALU);
3241 setFlag(F32);
3242 } // Inst_VOP3__V_CVT_F32_UBYTE1
3243
3245 {
3246 } // ~Inst_VOP3__V_CVT_F32_UBYTE1
3247
3248 // --- description from .arch file ---
3249 // D.f = (float)(S0.u[15:8]).
3250 void
3252 {
3253 Wavefront *wf = gpuDynInst->wavefront();
3254 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
3255 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3256
3257 src.readSrc();
3258
3259 if (instData.ABS & 0x1) {
3260 src.absModifier();
3261 }
3262
3263 if (extData.NEG & 0x1) {
3264 src.negModifier();
3265 }
3266
3267 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3268 if (wf->execMask(lane)) {
3269 vdst[lane] = (VecElemF32)bits(src[lane], 15, 8);
3270 }
3271 }
3272
3273 vdst.write();
3274 } // execute
3275 // --- Inst_VOP3__V_CVT_F32_UBYTE2 class methods ---
3276
3278 : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte2", false)
3279 {
3280 setFlag(ALU);
3281 setFlag(F32);
3282 } // Inst_VOP3__V_CVT_F32_UBYTE2
3283
3285 {
3286 } // ~Inst_VOP3__V_CVT_F32_UBYTE2
3287
3288 // --- description from .arch file ---
3289 // D.f = (float)(S0.u[23:16]).
3290 void
3292 {
3293 Wavefront *wf = gpuDynInst->wavefront();
3294 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
3295 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3296
3297 src.readSrc();
3298
3299 if (instData.ABS & 0x1) {
3300 src.absModifier();
3301 }
3302
3303 if (extData.NEG & 0x1) {
3304 src.negModifier();
3305 }
3306
3307 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3308 if (wf->execMask(lane)) {
3309 vdst[lane] = (VecElemF32)bits(src[lane], 23, 16);
3310 }
3311 }
3312
3313 vdst.write();
3314 } // execute
3315 // --- Inst_VOP3__V_CVT_F32_UBYTE3 class methods ---
3316
3318 : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte3", false)
3319 {
3320 setFlag(ALU);
3321 setFlag(F32);
3322 } // Inst_VOP3__V_CVT_F32_UBYTE3
3323
3325 {
3326 } // ~Inst_VOP3__V_CVT_F32_UBYTE3
3327
3328 // --- description from .arch file ---
3329 // D.f = (float)(S0.u[31:24]).
3330 void
3332 {
3333 Wavefront *wf = gpuDynInst->wavefront();
3334 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
3335 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3336
3337 src.readSrc();
3338
3339 if (instData.ABS & 0x1) {
3340 src.absModifier();
3341 }
3342
3343 if (extData.NEG & 0x1) {
3344 src.negModifier();
3345 }
3346
3347 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3348 if (wf->execMask(lane)) {
3349 vdst[lane] = (VecElemF32)bits(src[lane], 31, 24);
3350 }
3351 }
3352
3353 vdst.write();
3354 } // execute
3355 // --- Inst_VOP3__V_CVT_U32_F64 class methods ---
3356
3358 : Inst_VOP3A(iFmt, "v_cvt_u32_f64", false)
3359 {
3360 setFlag(ALU);
3361 setFlag(F64);
3362 } // Inst_VOP3__V_CVT_U32_F64
3363
3365 {
3366 } // ~Inst_VOP3__V_CVT_U32_F64
3367
3368 // --- description from .arch file ---
3369 // D.u = (unsigned)S0.d.
3370 // Out-of-range floating point values (including infinity) saturate. NaN is
3371 // --- converted to 0.
3372 void
3374 {
3375 Wavefront *wf = gpuDynInst->wavefront();
3376 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
3377 VecOperandU32 vdst(gpuDynInst, instData.VDST);
3378
3379 src.readSrc();
3380
3381 if (instData.ABS & 0x1) {
3382 src.absModifier();
3383 }
3384
3385 if (extData.NEG & 0x1) {
3386 src.negModifier();
3387 }
3388
3389 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3390 if (wf->execMask(lane)) {
3391 int exp;
3392 std::frexp(src[lane],&exp);
3393 if (std::isnan(src[lane])) {
3394 vdst[lane] = 0;
3395 } else if (std::isinf(src[lane])) {
3396 if (std::signbit(src[lane])) {
3397 vdst[lane] = 0;
3398 } else {
3399 vdst[lane] = UINT_MAX;
3400 }
3401 } else if (exp > 31) {
3402 vdst[lane] = UINT_MAX;
3403 } else {
3404 vdst[lane] = (VecElemU32)src[lane];
3405 }
3406 }
3407 }
3408
3409 vdst.write();
3410 } // execute
3411 // --- Inst_VOP3__V_CVT_F64_U32 class methods ---
3412
3414 : Inst_VOP3A(iFmt, "v_cvt_f64_u32", false)
3415 {
3416 setFlag(ALU);
3417 setFlag(F64);
3418 } // Inst_VOP3__V_CVT_F64_U32
3419
3421 {
3422 } // ~Inst_VOP3__V_CVT_F64_U32
3423
3424 // --- description from .arch file ---
3425 // D.d = (double)S0.u.
3426 void
3428 {
3429 Wavefront *wf = gpuDynInst->wavefront();
3430 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
3431 VecOperandF64 vdst(gpuDynInst, instData.VDST);
3432
3433 src.readSrc();
3434
3435 if (instData.ABS & 0x1) {
3436 src.absModifier();
3437 }
3438
3439 if (extData.NEG & 0x1) {
3440 src.negModifier();
3441 }
3442
3443 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3444 if (wf->execMask(lane)) {
3445 vdst[lane] = (VecElemF64)src[lane];
3446 }
3447 }
3448
3449 vdst.write();
3450 } // execute
3451 // --- Inst_VOP3__V_TRUNC_F64 class methods ---
3452
3454 : Inst_VOP3A(iFmt, "v_trunc_f64", false)
3455 {
3456 setFlag(ALU);
3457 setFlag(F64);
3458 } // Inst_VOP3__V_TRUNC_F64
3459
3461 {
3462 } // ~Inst_VOP3__V_TRUNC_F64
3463
3464 // --- description from .arch file ---
3465 // D.d = trunc(S0.d), return integer part of S0.d.
3466 void
3468 {
3469 Wavefront *wf = gpuDynInst->wavefront();
3470 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
3471 VecOperandF64 vdst(gpuDynInst, instData.VDST);
3472
3473 src.readSrc();
3474
3475 if (instData.ABS & 0x1) {
3476 src.absModifier();
3477 }
3478
3479 if (extData.NEG & 0x1) {
3480 src.negModifier();
3481 }
3482
3483 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3484 if (wf->execMask(lane)) {
3485 vdst[lane] = std::trunc(src[lane]);
3486 }
3487 }
3488
3489 vdst.write();
3490 } // execute
3491 // --- Inst_VOP3__V_CEIL_F64 class methods ---
3492
3494 : Inst_VOP3A(iFmt, "v_ceil_f64", false)
3495 {
3496 setFlag(ALU);
3497 setFlag(F64);
3498 } // Inst_VOP3__V_CEIL_F64
3499
3501 {
3502 } // ~Inst_VOP3__V_CEIL_F64
3503
3504 // --- description from .arch file ---
3505 // D.d = trunc(S0.d);
3506 // if (S0.d > 0.0 && S0.d != D.d) then D.d += 1.0.
3507 void
3509 {
3510 Wavefront *wf = gpuDynInst->wavefront();
3511 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
3512 VecOperandF64 vdst(gpuDynInst, instData.VDST);
3513
3514 src.readSrc();
3515
3516 if (instData.ABS & 0x1) {
3517 src.absModifier();
3518 }
3519
3520 if (extData.NEG & 0x1) {
3521 src.negModifier();
3522 }
3523
3524 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3525 if (wf->execMask(lane)) {
3526 vdst[lane] = std::ceil(src[lane]);
3527 }
3528 }
3529
3530 vdst.write();
3531 } // execute
3532 // --- Inst_VOP3__V_RNDNE_F64 class methods ---
3533
3535 : Inst_VOP3A(iFmt, "v_rndne_f64", false)
3536 {
3537 setFlag(ALU);
3538 setFlag(F64);
3539 } // Inst_VOP3__V_RNDNE_F64
3540
3542 {
3543 } // ~Inst_VOP3__V_RNDNE_F64
3544
3545 // --- description from .arch file ---
3546 // D.d = round_nearest_even(S0.d).
3547 void
3549 {
3550 Wavefront *wf = gpuDynInst->wavefront();
3551 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
3552 VecOperandF64 vdst(gpuDynInst, instData.VDST);
3553
3554 src.readSrc();
3555
3556 if (instData.ABS & 0x1) {
3557 src.absModifier();
3558 }
3559
3560 if (extData.NEG & 0x1) {
3561 src.negModifier();
3562 }
3563
3564 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3565 if (wf->execMask(lane)) {
3566 vdst[lane] = roundNearestEven(src[lane]);
3567 }
3568 }
3569
3570 vdst.write();
3571 } // execute
3572 // --- Inst_VOP3__V_FLOOR_F64 class methods ---
3573
3575 : Inst_VOP3A(iFmt, "v_floor_f64", false)
3576 {
3577 setFlag(ALU);
3578 setFlag(F64);
3579 } // Inst_VOP3__V_FLOOR_F64
3580
3582 {
3583 } // ~Inst_VOP3__V_FLOOR_F64
3584
3585 // --- description from .arch file ---
3586 // D.d = trunc(S0.d);
3587 // if (S0.d < 0.0 && S0.d != D.d) then D.d += -1.0.
3588 void
3590 {
3591 Wavefront *wf = gpuDynInst->wavefront();
3592 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
3593 VecOperandF64 vdst(gpuDynInst, instData.VDST);
3594
3595 src.readSrc();
3596
3597 if (instData.ABS & 0x1) {
3598 src.absModifier();
3599 }
3600
3601 if (extData.NEG & 0x1) {
3602 src.negModifier();
3603 }
3604
3605 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3606 if (wf->execMask(lane)) {
3607 vdst[lane] = std::floor(src[lane]);
3608 }
3609 }
3610
3611 vdst.write();
3612 } // execute
3613 // --- Inst_VOP3__V_FRACT_F32 class methods ---
3614
3616 : Inst_VOP3A(iFmt, "v_fract_f32", false)
3617 {
3618 setFlag(ALU);
3619 setFlag(F32);
3620 } // Inst_VOP3__V_FRACT_F32
3621
3623 {
3624 } // ~Inst_VOP3__V_FRACT_F32
3625
3626 // --- description from .arch file ---
3627 // D.f = S0.f - floor(S0.f).
3628 void
3630 {
3631 Wavefront *wf = gpuDynInst->wavefront();
3632 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3633 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3634
3635 src.readSrc();
3636
3637 if (instData.ABS & 0x1) {
3638 src.absModifier();
3639 }
3640
3641 if (extData.NEG & 0x1) {
3642 src.negModifier();
3643 }
3644
3645 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3646 if (wf->execMask(lane)) {
3647 VecElemF32 int_part(0.0);
3648 vdst[lane] = std::modf(src[lane], &int_part);
3649 }
3650 }
3651
3652 vdst.write();
3653 } // execute
3654 // --- Inst_VOP3__V_TRUNC_F32 class methods ---
3655
3657 : Inst_VOP3A(iFmt, "v_trunc_f32", false)
3658 {
3659 setFlag(ALU);
3660 setFlag(F32);
3661 } // Inst_VOP3__V_TRUNC_F32
3662
3664 {
3665 } // ~Inst_VOP3__V_TRUNC_F32
3666
3667 // --- description from .arch file ---
3668 // D.f = trunc(S0.f), return integer part of S0.f.
3669 void
3671 {
3672 Wavefront *wf = gpuDynInst->wavefront();
3673 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3674 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3675
3676 src.readSrc();
3677
3678 if (instData.ABS & 0x1) {
3679 src.absModifier();
3680 }
3681
3682 if (extData.NEG & 0x1) {
3683 src.negModifier();
3684 }
3685
3686 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3687 if (wf->execMask(lane)) {
3688 vdst[lane] = std::trunc(src[lane]);
3689 }
3690 }
3691
3692 vdst.write();
3693 } // execute
3694 // --- Inst_VOP3__V_CEIL_F32 class methods ---
3695
3697 : Inst_VOP3A(iFmt, "v_ceil_f32", false)
3698 {
3699 setFlag(ALU);
3700 setFlag(F32);
3701 } // Inst_VOP3__V_CEIL_F32
3702
3704 {
3705 } // ~Inst_VOP3__V_CEIL_F32
3706
3707 // --- description from .arch file ---
3708 // D.f = trunc(S0.f);
3709 // if (S0.f > 0.0 && S0.f != D.f) then D.f += 1.0.
3710 void
3712 {
3713 Wavefront *wf = gpuDynInst->wavefront();
3714 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3715 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3716
3717 src.readSrc();
3718
3719 if (instData.ABS & 0x1) {
3720 src.absModifier();
3721 }
3722
3723 if (extData.NEG & 0x1) {
3724 src.negModifier();
3725 }
3726
3727 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3728 if (wf->execMask(lane)) {
3729 vdst[lane] = std::ceil(src[lane]);
3730 }
3731 }
3732
3733 vdst.write();
3734 } // execute
3735 // --- Inst_VOP3__V_RNDNE_F32 class methods ---
3736
3738 : Inst_VOP3A(iFmt, "v_rndne_f32", false)
3739 {
3740 setFlag(ALU);
3741 setFlag(F32);
3742 } // Inst_VOP3__V_RNDNE_F32
3743
3745 {
3746 } // ~Inst_VOP3__V_RNDNE_F32
3747
3748 // --- description from .arch file ---
3749 // D.f = round_nearest_even(S0.f).
3750 void
3752 {
3753 Wavefront *wf = gpuDynInst->wavefront();
3754 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3755 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3756
3757 src.readSrc();
3758
3759 if (instData.ABS & 0x1) {
3760 src.absModifier();
3761 }
3762
3763 if (extData.NEG & 0x1) {
3764 src.negModifier();
3765 }
3766
3767 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3768 if (wf->execMask(lane)) {
3769 vdst[lane] = roundNearestEven(src[lane]);
3770 }
3771 }
3772
3773 vdst.write();
3774 } // execute
3775 // --- Inst_VOP3__V_FLOOR_F32 class methods ---
3776
3778 : Inst_VOP3A(iFmt, "v_floor_f32", false)
3779 {
3780 setFlag(ALU);
3781 setFlag(F32);
3782 } // Inst_VOP3__V_FLOOR_F32
3783
3785 {
3786 } // ~Inst_VOP3__V_FLOOR_F32
3787
3788 // --- description from .arch file ---
3789 // D.f = trunc(S0.f);
3790 // if (S0.f < 0.0 && S0.f != D.f) then D.f += -1.0.
3791 void
3793 {
3794 Wavefront *wf = gpuDynInst->wavefront();
3795 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3796 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3797
3798 src.readSrc();
3799
3800 if (instData.ABS & 0x1) {
3801 src.absModifier();
3802 }
3803
3804 if (extData.NEG & 0x1) {
3805 src.negModifier();
3806 }
3807
3808 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3809 if (wf->execMask(lane)) {
3810 vdst[lane] = std::floor(src[lane]);
3811 }
3812 }
3813
3814 vdst.write();
3815 } // execute
3816 // --- Inst_VOP3__V_EXP_F32 class methods ---
3817
3819 : Inst_VOP3A(iFmt, "v_exp_f32", false)
3820 {
3821 setFlag(ALU);
3822 setFlag(F32);
3823 } // Inst_VOP3__V_EXP_F32
3824
3826 {
3827 } // ~Inst_VOP3__V_EXP_F32
3828
3829 // --- description from .arch file ---
3830 // D.f = pow(2.0, S0.f).
3831 void
3833 {
3834 Wavefront *wf = gpuDynInst->wavefront();
3835 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3836 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3837
3838 src.readSrc();
3839
3840 if (instData.ABS & 0x1) {
3841 src.absModifier();
3842 }
3843
3844 if (extData.NEG & 0x1) {
3845 src.negModifier();
3846 }
3847
3848 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3849 if (wf->execMask(lane)) {
3850 vdst[lane] = std::pow(2.0, src[lane]);
3851 }
3852 }
3853
3854 vdst.write();
3855 } // execute
3856 // --- Inst_VOP3__V_LOG_F32 class methods ---
3857
3859 : Inst_VOP3A(iFmt, "v_log_f32", false)
3860 {
3861 setFlag(ALU);
3862 setFlag(F32);
3863 } // Inst_VOP3__V_LOG_F32
3864
3866 {
3867 } // ~Inst_VOP3__V_LOG_F32
3868
3869 // --- description from .arch file ---
3870 // D.f = log2(S0.f). Base 2 logarithm.
3871 void
3873 {
3874 Wavefront *wf = gpuDynInst->wavefront();
3875 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3876 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3877
3878 src.readSrc();
3879
3880 if (instData.ABS & 0x1) {
3881 src.absModifier();
3882 }
3883
3884 if (extData.NEG & 0x1) {
3885 src.negModifier();
3886 }
3887
3891 assert(!(instData.ABS & 0x2));
3892 assert(!(instData.ABS & 0x4));
3893 assert(!(extData.NEG & 0x2));
3894 assert(!(extData.NEG & 0x4));
3895
3896 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3897 if (wf->execMask(lane)) {
3898 vdst[lane] = std::log2(src[lane]);
3899 }
3900 }
3901
3902 vdst.write();
3903 } // execute
3904 // --- Inst_VOP3__V_RCP_F32 class methods ---
3905
3907 : Inst_VOP3A(iFmt, "v_rcp_f32", false)
3908 {
3909 setFlag(ALU);
3910 setFlag(F32);
3911 } // Inst_VOP3__V_RCP_F32
3912
3914 {
3915 } // ~Inst_VOP3__V_RCP_F32
3916
3917 // --- description from .arch file ---
3918 // D.f = 1.0 / S0.f. Reciprocal with IEEE rules and < 1ulp error.
3919 void
3921 {
3922 Wavefront *wf = gpuDynInst->wavefront();
3923 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3924 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3925
3926 src.readSrc();
3927
3928 if (instData.ABS & 0x1) {
3929 src.absModifier();
3930 }
3931
3932 if (extData.NEG & 0x1) {
3933 src.negModifier();
3934 }
3935
3936 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3937 if (wf->execMask(lane)) {
3938 vdst[lane] = 1.0 / src[lane];
3939 }
3940 }
3941
3942 vdst.write();
3943 } // execute
3944 // --- Inst_VOP3__V_RCP_IFLAG_F32 class methods ---
3945
3947 : Inst_VOP3A(iFmt, "v_rcp_iflag_f32", false)
3948 {
3949 setFlag(ALU);
3950 setFlag(F32);
3951 } // Inst_VOP3__V_RCP_IFLAG_F32
3952
3954 {
3955 } // ~Inst_VOP3__V_RCP_IFLAG_F32
3956
3957 // --- description from .arch file ---
3958 // D.f = 1.0 / S0.f. Reciprocal intended for integer division, can raise
3959 // --- integer DIV_BY_ZERO exception but cannot raise floating-point
3960 // --- exceptions.
3961 void
3963 {
3964 Wavefront *wf = gpuDynInst->wavefront();
3965 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3966 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3967
3968 src.readSrc();
3969
3970 if (instData.ABS & 0x1) {
3971 src.absModifier();
3972 }
3973
3974 if (extData.NEG & 0x1) {
3975 src.negModifier();
3976 }
3977
3978 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3979 if (wf->execMask(lane)) {
3980 vdst[lane] = 1.0 / src[lane];
3981 }
3982 }
3983
3984 vdst.write();
3985 } // execute
3986 // --- Inst_VOP3__V_RSQ_F32 class methods ---
3987
3989 : Inst_VOP3A(iFmt, "v_rsq_f32", false)
3990 {
3991 setFlag(ALU);
3992 setFlag(F32);
3993 } // Inst_VOP3__V_RSQ_F32
3994
3996 {
3997 } // ~Inst_VOP3__V_RSQ_F32
3998
3999 // --- description from .arch file ---
4000 // D.f = 1.0 / sqrt(S0.f). Reciprocal square root with IEEE rules.
4001 void
4003 {
4004 Wavefront *wf = gpuDynInst->wavefront();
4005 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
4006 VecOperandF32 vdst(gpuDynInst, instData.VDST);
4007
4008 src.readSrc();
4009
4010 if (instData.ABS & 0x1) {
4011 src.absModifier();
4012 }
4013
4014 if (extData.NEG & 0x1) {
4015 src.negModifier();
4016 }
4017
4018 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4019 if (wf->execMask(lane)) {
4020 vdst[lane] = 1.0 / std::sqrt(src[lane]);
4021 }
4022 }
4023
4024 vdst.write();
4025 } // execute
4026 // --- Inst_VOP3__V_RCP_F64 class methods ---
4027
4029 : Inst_VOP3A(iFmt, "v_rcp_f64", false)
4030 {
4031 setFlag(ALU);
4032 setFlag(F64);
4033 } // Inst_VOP3__V_RCP_F64
4034
4036 {
4037 } // ~Inst_VOP3__V_RCP_F64
4038
4039 // --- description from .arch file ---
4040 // D.d = 1.0 / S0.d.
4041 void
4043 {
4044 Wavefront *wf = gpuDynInst->wavefront();
4045 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
4046 VecOperandF64 vdst(gpuDynInst, instData.VDST);
4047
4048 src.readSrc();
4049
4050 if (instData.ABS & 0x1) {
4051 src.absModifier();
4052 }
4053
4054 if (extData.NEG & 0x1) {
4055 src.negModifier();
4056 }
4057
4058 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4059 if (wf->execMask(lane)) {
4060 if (std::fpclassify(src[lane]) == FP_ZERO) {
4061 vdst[lane] = +INFINITY;
4062 } else if (std::isnan(src[lane])) {
4063 vdst[lane] = NAN;
4064 } else if (std::isinf(src[lane])) {
4065 if (std::signbit(src[lane])) {
4066 vdst[lane] = -0.0;
4067 } else {
4068 vdst[lane] = 0.0;
4069 }
4070 } else {
4071 vdst[lane] = 1.0 / src[lane];
4072 }
4073 }
4074 }
4075
4076 vdst.write();
4077 } // execute
4078 // --- Inst_VOP3__V_RSQ_F64 class methods ---
4079
4081 : Inst_VOP3A(iFmt, "v_rsq_f64", false)
4082 {
4083 setFlag(ALU);
4084 setFlag(F64);
4085 } // Inst_VOP3__V_RSQ_F64
4086
4088 {
4089 } // ~Inst_VOP3__V_RSQ_F64
4090
4091 // --- description from .arch file ---
4092 // D.d = 1.0 / sqrt(S0.d). See V_RSQ_F32.
4093 void
4095 {
4096 Wavefront *wf = gpuDynInst->wavefront();
4097 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
4098 VecOperandF64 vdst(gpuDynInst, instData.VDST);
4099
4100 src.readSrc();
4101
4102 if (instData.ABS & 0x1) {
4103 src.absModifier();
4104 }
4105
4106 if (extData.NEG & 0x1) {
4107 src.negModifier();
4108 }
4109
4110 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4111 if (wf->execMask(lane)) {
4112 if (std::fpclassify(src[lane]) == FP_ZERO) {
4113 vdst[lane] = +INFINITY;
4114 } else if (std::isnan(src[lane])) {
4115 vdst[lane] = NAN;
4116 } else if (std::isinf(src[lane]) && !std::signbit(src[lane])) {
4117 vdst[lane] = 0.0;
4118 } else if (std::signbit(src[lane])) {
4119 vdst[lane] = NAN;
4120 } else {
4121 vdst[lane] = 1.0 / std::sqrt(src[lane]);
4122 }
4123 }
4124 }
4125
4126 vdst.write();
4127 } // execute
4128 // --- Inst_VOP3__V_SQRT_F32 class methods ---
4129
4131 : Inst_VOP3A(iFmt, "v_sqrt_f32", false)
4132 {
4133 setFlag(ALU);
4134 setFlag(F32);
4135 } // Inst_VOP3__V_SQRT_F32
4136
4138 {
4139 } // ~Inst_VOP3__V_SQRT_F32
4140
4141 // --- description from .arch file ---
4142 // D.f = sqrt(S0.f).
4143 void
4145 {
4146 Wavefront *wf = gpuDynInst->wavefront();
4147 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
4148 VecOperandF32 vdst(gpuDynInst, instData.VDST);
4149
4150 src.readSrc();
4151
4152 if (instData.ABS & 0x1) {
4153 src.absModifier();
4154 }
4155
4156 if (extData.NEG & 0x1) {
4157 src.negModifier();
4158 }
4159
4160 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4161 if (wf->execMask(lane)) {
4162 vdst[lane] = std::sqrt(src[lane]);
4163 }
4164 }
4165
4166 vdst.write();
4167 } // execute
4168 // --- Inst_VOP3__V_SQRT_F64 class methods ---
4169
4171 : Inst_VOP3A(iFmt, "v_sqrt_f64", false)
4172 {
4173 setFlag(ALU);
4174 setFlag(F64);
4175 } // Inst_VOP3__V_SQRT_F64
4176
4178 {
4179 } // ~Inst_VOP3__V_SQRT_F64
4180
4181 // --- description from .arch file ---
4182 // D.d = sqrt(S0.d).
4183 void
4185 {
4186 Wavefront *wf = gpuDynInst->wavefront();
4187 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
4188 VecOperandF64 vdst(gpuDynInst, instData.VDST);
4189
4190 src.readSrc();
4191
4192 if (instData.ABS & 0x1) {
4193 src.absModifier();
4194 }
4195
4196 if (extData.NEG & 0x1) {
4197 src.negModifier();
4198 }
4199
4200 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4201 if (wf->execMask(lane)) {
4202 vdst[lane] = std::sqrt(src[lane]);
4203 }
4204 }
4205
4206 vdst.write();
4207 } // execute
4208 // --- Inst_VOP3__V_SIN_F32 class methods ---
4209
4211 : Inst_VOP3A(iFmt, "v_sin_f32", false)
4212 {
4213 setFlag(ALU);
4214 setFlag(F32);
4215 } // Inst_VOP3__V_SIN_F32
4216
4218 {
4219 } // ~Inst_VOP3__V_SIN_F32
4220
4221 // --- description from .arch file ---
4222 // D.f = sin(S0.f * 2 * PI).
4223 // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in
4224 // float 0.0.
4225 void
4227 {
4228 Wavefront *wf = gpuDynInst->wavefront();
4229 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
4230 ConstScalarOperandF32 pi(gpuDynInst, REG_PI);
4231 VecOperandF32 vdst(gpuDynInst, instData.VDST);
4232
4233 src.readSrc();
4234 pi.read();
4235
4236 if (instData.ABS & 0x1) {
4237 src.absModifier();
4238 }
4239
4240 if (extData.NEG & 0x1) {
4241 src.negModifier();
4242 }
4243
4244 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4245 if (wf->execMask(lane)) {
4246 vdst[lane] = std::sin(src[lane] * 2 * pi.rawData());
4247 }
4248 }
4249
4250 vdst.write();
4251 } // execute
4252 // --- Inst_VOP3__V_COS_F32 class methods ---
4253
4255 : Inst_VOP3A(iFmt, "v_cos_f32", false)
4256 {
4257 setFlag(ALU);
4258 setFlag(F32);
4259 } // Inst_VOP3__V_COS_F32
4260
4262 {
4263 } // ~Inst_VOP3__V_COS_F32
4264
4265 // --- description from .arch file ---
4266 // D.f = cos(S0.f * 2 * PI).
4267 // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in
4268 // float 1.0.
4269 void
4271 {
4272 Wavefront *wf = gpuDynInst->wavefront();
4273 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
4274 ConstScalarOperandF32 pi(gpuDynInst, REG_PI);
4275 VecOperandF32 vdst(gpuDynInst, instData.VDST);
4276
4277 src.readSrc();
4278 pi.read();
4279
4280 if (instData.ABS & 0x1) {
4281 src.absModifier();
4282 }
4283
4284 if (extData.NEG & 0x1) {
4285 src.negModifier();
4286 }
4287
4288 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4289 if (wf->execMask(lane)) {
4290 vdst[lane] = std::cos(src[lane] * 2 * pi.rawData());
4291 }
4292 }
4293
4294 vdst.write();
4295 } // execute
4296 // --- Inst_VOP3__V_NOT_B32 class methods ---
4297
4299 : Inst_VOP3A(iFmt, "v_not_b32", false)
4300 {
4301 setFlag(ALU);
4302 } // Inst_VOP3__V_NOT_B32
4303
4305 {
4306 } // ~Inst_VOP3__V_NOT_B32
4307
4308 // --- description from .arch file ---
4309 // D.u = ~S0.u.
4310 // Input and output modifiers not supported.
4311 void
4313 {
4314 Wavefront *wf = gpuDynInst->wavefront();
4315 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
4316 VecOperandU32 vdst(gpuDynInst, instData.VDST);
4317
4318 src.readSrc();
4319
4320 if (instData.ABS & 0x1) {
4321 src.absModifier();
4322 }
4323
4324 if (extData.NEG & 0x1) {
4325 src.negModifier();
4326 }
4327
4328 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4329 if (wf->execMask(lane)) {
4330 vdst[lane] = ~src[lane];
4331 }
4332 }
4333
4334 vdst.write();
4335 } // execute
4336 // --- Inst_VOP3__V_BFREV_B32 class methods ---
4337
4339 : Inst_VOP3A(iFmt, "v_bfrev_b32", false)
4340 {
4341 setFlag(ALU);
4342 } // Inst_VOP3__V_BFREV_B32
4343
4345 {
4346 } // ~Inst_VOP3__V_BFREV_B32
4347
4348 // --- description from .arch file ---
4349 // D.u[31:0] = S0.u[0:31], bitfield reverse.
4350 // Input and output modifiers not supported.
4351 void
4353 {
4354 Wavefront *wf = gpuDynInst->wavefront();
4355 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
4356 VecOperandU32 vdst(gpuDynInst, instData.VDST);
4357
4358 src.readSrc();
4359
4360 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4361 if (wf->execMask(lane)) {
4362 vdst[lane] = reverseBits(src[lane]);
4363 }
4364 }
4365
4366 vdst.write();
4367 } // execute
4368 // --- Inst_VOP3__V_FFBH_U32 class methods ---
4369
4371 : Inst_VOP3A(iFmt, "v_ffbh_u32", false)
4372 {
4373 setFlag(ALU);
4374 } // Inst_VOP3__V_FFBH_U32
4375
4377 {
4378 } // ~Inst_VOP3__V_FFBH_U32
4379
4380 // --- description from .arch file ---
4381 // D.u = position of first 1 in S0.u from MSB;
4382 // D.u = 0xffffffff if S0.u == 0.
4383 void
4385 {
4386 Wavefront *wf = gpuDynInst->wavefront();
4387 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
4388 VecOperandU32 vdst(gpuDynInst, instData.VDST);
4389
4390 src.readSrc();
4391
4392 if (instData.ABS & 0x1) {
4393 src.absModifier();
4394 }
4395
4396 if (extData.NEG & 0x1) {
4397 src.negModifier();
4398 }
4399
4400 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4401 if (wf->execMask(lane)) {
4402 vdst[lane] = findFirstOneMsb(src[lane]);
4403 }
4404 }
4405
4406 vdst.write();
4407 } // execute
4408 // --- Inst_VOP3__V_FFBL_B32 class methods ---
4409
4411 : Inst_VOP3A(iFmt, "v_ffbl_b32", false)
4412 {
4413 setFlag(ALU);
4414 } // Inst_VOP3__V_FFBL_B32
4415
4417 {
4418 } // ~Inst_VOP3__V_FFBL_B32
4419
4420 // --- description from .arch file ---
4421 // D.u = position of first 1 in S0.u from LSB;
4422 // D.u = 0xffffffff if S0.u == 0.
4423 void
4425 {
4426 Wavefront *wf = gpuDynInst->wavefront();
4427 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
4428 VecOperandU32 vdst(gpuDynInst, instData.VDST);
4429
4430 src.readSrc();
4431
4432 if (instData.ABS & 0x1) {
4433 src.absModifier();
4434 }
4435
4436 if (extData.NEG & 0x1) {
4437 src.negModifier();
4438 }
4439
4440 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4441 if (wf->execMask(lane)) {
4442 vdst[lane] = findFirstOne(src[lane]);
4443 }
4444 }
4445
4446 vdst.write();
4447 } // execute
4448 // --- Inst_VOP3__V_FFBH_I32 class methods ---
4449
4451 : Inst_VOP3A(iFmt, "v_ffbh_i32", false)
4452 {
4453 setFlag(ALU);
4454 } // Inst_VOP3__V_FFBH_I32
4455
4457 {
4458 } // ~Inst_VOP3__V_FFBH_I32
4459
4460 // --- description from .arch file ---
4461 // D.u = position of first bit different from sign bit in S0.i from MSB;
4462 // D.u = 0xffffffff if S0.i == 0 or S0.i == 0xffffffff.
4463 void
4465 {
4466 Wavefront *wf = gpuDynInst->wavefront();
4467 ConstVecOperandI32 src(gpuDynInst, extData.SRC0);
4468 VecOperandU32 vdst(gpuDynInst, instData.VDST);
4469
4470 src.readSrc();
4471
4472 if (instData.ABS & 0x1) {
4473 src.absModifier();
4474 }
4475
4476 if (extData.NEG & 0x1) {
4477 src.negModifier();
4478 }
4479
4480 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4481 if (wf->execMask(lane)) {
4482 vdst[lane] = firstOppositeSignBit(src[lane]);
4483 }
4484 }
4485
4486 vdst.write();
4487 } // execute
4488 // --- Inst_VOP3__V_FREXP_EXP_I32_F64 class methods ---
4489
4491 InFmt_VOP3A *iFmt)
4492 : Inst_VOP3A(iFmt, "v_frexp_exp_i32_f64", false)
4493 {
4494 setFlag(ALU);
4495 setFlag(F64);
4496 } // Inst_VOP3__V_FREXP_EXP_I32_F64
4497
4499 {
4500 } // ~Inst_VOP3__V_FREXP_EXP_I32_F64
4501
4502 // --- description from .arch file ---
4503 // See V_FREXP_EXP_I32_F32.
4504 void
4506 {
4507 Wavefront *wf = gpuDynInst->wavefront();
4508 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
4509 VecOperandI32 vdst(gpuDynInst, instData.VDST);
4510
4511 src.readSrc();
4512
4513 if (instData.ABS & 0x1) {
4514 src.absModifier();
4515 }
4516
4517 if (extData.NEG & 0x1) {
4518 src.negModifier();
4519 }
4520
4521 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4522 if (wf->execMask(lane)) {
4523 if (std::isinf(src[lane]) || std::isnan(src[lane])) {
4524 vdst[lane] = 0;
4525 } else {
4526 VecElemI32 exp(0);
4527 std::frexp(src[lane], &exp);
4528 vdst[lane] = exp;
4529 }
4530 }
4531 }
4532
4533 vdst.write();
4534 } // execute
4535 // --- Inst_VOP3__V_FREXP_MANT_F64 class methods ---
4536
4538 : Inst_VOP3A(iFmt, "v_frexp_mant_f64", false)
4539 {
4540 setFlag(ALU);
4541 setFlag(F64);
4542 } // Inst_VOP3__V_FREXP_MANT_F64
4543
4545 {
4546 } // ~Inst_VOP3__V_FREXP_MANT_F64
4547
4548 // --- description from .arch file ---
4549 // See V_FREXP_MANT_F32.
4550 void
4552 {
4553 Wavefront *wf = gpuDynInst->wavefront();
4554 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
4555 VecOperandF64 vdst(gpuDynInst, instData.VDST);
4556
4557 src.readSrc();
4558
4559 if (instData.ABS & 0x1) {
4560 src.absModifier();
4561 }
4562
4563 if (extData.NEG & 0x1) {
4564 src.negModifier();
4565 }
4566
4567 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4568 if (wf->execMask(lane)) {
4569 VecElemI32 exp(0);
4570 vdst[lane] = std::frexp(src[lane], &exp);
4571 }
4572 }
4573
4574 vdst.write();
4575 } // execute
4576 // --- Inst_VOP3__V_FRACT_F64 class methods ---
4577
4579 : Inst_VOP3A(iFmt, "v_fract_f64", false)
4580 {
4581 setFlag(ALU);
4582 setFlag(F64);
4583 } // Inst_VOP3__V_FRACT_F64
4584
4586 {
4587 } // ~Inst_VOP3__V_FRACT_F64
4588
4589 // --- description from .arch file ---
4590 // See V_FRACT_F32.
4591 void
4593 {
4594 Wavefront *wf = gpuDynInst->wavefront();
4595 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
4596 VecOperandF64 vdst(gpuDynInst, instData.VDST);
4597
4598 src.readSrc();
4599
4600 if (instData.ABS & 0x1) {
4601 src.absModifier();
4602 }
4603
4604 if (extData.NEG & 0x1) {
4605 src.negModifier();
4606 }
4607
4608 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4609 if (wf->execMask(lane)) {
4610 VecElemF32 int_part(0.0);
4611 vdst[lane] = std::modf(src[lane], &int_part);
4612 }
4613 }
4614
4615 vdst.write();
4616 } // execute
4617 // --- Inst_VOP3__V_FREXP_EXP_I32_F32 class methods ---
4618
4620 InFmt_VOP3A *iFmt)
4621 : Inst_VOP3A(iFmt, "v_frexp_exp_i32_f32", false)
4622 {
4623 setFlag(ALU);
4624 setFlag(F32);
4625 } // Inst_VOP3__V_FREXP_EXP_I32_F32
4626
4628 {
4629 } // ~Inst_VOP3__V_FREXP_EXP_I32_F32
4630
4631 // --- description from .arch file ---
4632 // if (S0.f == INF || S0.f == NAN) then D.i = 0;
4633 // else D.i = TwosComplement(Exponent(S0.f) - 127 + 1).
4634 // Returns exponent of single precision float input, such that S0.f =
4635 // significand * (2 ** exponent). See also FREXP_MANT_F32, which returns
4636 // the significand.
4637 void
4639 {
4640 Wavefront *wf = gpuDynInst->wavefront();
4641 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
4642 VecOperandI32 vdst(gpuDynInst, instData.VDST);
4643
4644 src.readSrc();
4645
4646 if (instData.ABS & 0x1) {
4647 src.absModifier();
4648 }
4649
4650 if (extData.NEG & 0x1) {
4651 src.negModifier();
4652 }
4653
4654 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4655 if (wf->execMask(lane)) {
4656 if (std::isinf(src[lane])|| std::isnan(src[lane])) {
4657 vdst[lane] = 0;
4658 } else {
4659 VecElemI32 exp(0);
4660 std::frexp(src[lane], &exp);
4661 vdst[lane] = exp;
4662 }
4663 }
4664 }
4665
4666 vdst.write();
4667 } // execute
4668 // --- Inst_VOP3__V_FREXP_MANT_F32 class methods ---
4669
4671 : Inst_VOP3A(iFmt, "v_frexp_mant_f32", false)
4672 {
4673 setFlag(ALU);
4674 setFlag(F32);
4675 } // Inst_VOP3__V_FREXP_MANT_F32
4676
4678 {
4679 } // ~Inst_VOP3__V_FREXP_MANT_F32
4680
4681 // --- description from .arch file ---
4682 // if (S0.f == INF || S0.f == NAN) then D.f = S0.f;
4683 // else D.f = Mantissa(S0.f).
4684 // Result range is in (-1.0,-0.5][0.5,1.0) in normal cases. Returns binary
4685 // --- significand of single precision float input, such that S0.f =
4686 // --- significand * (2 ** exponent). See also FREXP_EXP_I32_F32, which
4687 // --- returns integer exponent.
4688 void
4690 {
4691 Wavefront *wf = gpuDynInst->wavefront();
4692 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
4693 VecOperandF32 vdst(gpuDynInst, instData.VDST);
4694
4695 src.readSrc();
4696
4697 if (instData.ABS & 0x1) {
4698 src.absModifier();
4699 }
4700
4701 if (extData.NEG & 0x1) {
4702 src.negModifier();
4703 }
4704
4705 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4706 if (wf->execMask(lane)) {
4707 if (std::isinf(src[lane]) || std::isnan(src[lane])) {
4708 vdst[lane] = src[lane];
4709 } else {
4710 VecElemI32 exp(0);
4711 vdst[lane] = std::frexp(src[lane], &exp);
4712 }
4713 }
4714 }
4715
4716 vdst.write();
4717 } // execute
4718 // --- Inst_VOP3__V_CLREXCP class methods ---
4719
4721 : Inst_VOP3A(iFmt, "v_clrexcp", false)
4722 {
4723 } // Inst_VOP3__V_CLREXCP
4724
4726 {
4727 } // ~Inst_VOP3__V_CLREXCP
4728
4729 // --- description from .arch file ---
4730 // Clear wave's exception state in SIMD (SP).
4731 void
4733 {
4735 } // execute
4736 // --- Inst_VOP3__V_CVT_F16_U16 class methods ---
4737
4739 : Inst_VOP3A(iFmt, "v_cvt_f16_u16", false)
4740 {
4741 setFlag(ALU);
4742 setFlag(F16);
4743 } // Inst_VOP3__V_CVT_F16_U16
4744
4746 {
4747 } // ~Inst_VOP3__V_CVT_F16_U16
4748
4749 // --- description from .arch file ---
4750 // D.f16 = uint16_to_flt16(S.u16).
4751 // Supports denormals, rounding, exception flags and saturation.
4752 void
4754 {
4756 } // execute
4757 // --- Inst_VOP3__V_CVT_F16_I16 class methods ---
4758
4760 : Inst_VOP3A(iFmt, "v_cvt_f16_i16", false)
4761 {
4762 setFlag(ALU);
4763 setFlag(F16);
4764 } // Inst_VOP3__V_CVT_F16_I16
4765
4767 {
4768 } // ~Inst_VOP3__V_CVT_F16_I16
4769
4770 // --- description from .arch file ---
4771 // D.f16 = int16_to_flt16(S.i16).
4772 // Supports denormals, rounding, exception flags and saturation.
4773 void
4775 {
4777 } // execute
4778 // --- Inst_VOP3__V_CVT_U16_F16 class methods ---
4779
4781 : Inst_VOP3A(iFmt, "v_cvt_u16_f16", false)
4782 {
4783 setFlag(ALU);
4784 setFlag(F16);
4785 } // Inst_VOP3__V_CVT_U16_F16
4786
4788 {
4789 } // ~Inst_VOP3__V_CVT_U16_F16
4790
4791 // --- description from .arch file ---
4792 // D.u16 = flt16_to_uint16(S.f16).
4793 // Supports rounding, exception flags and saturation.
4794 void
4796 {
4798 } // execute
4799 // --- Inst_VOP3__V_CVT_I16_F16 class methods ---
4800
4802 : Inst_VOP3A(iFmt, "v_cvt_i16_f16", false)
4803 {
4804 setFlag(ALU);
4805 setFlag(F16);
4806 } // Inst_VOP3__V_CVT_I16_F16
4807
4809 {
4810 } // ~Inst_VOP3__V_CVT_I16_F16
4811
4812 // --- description from .arch file ---
4813 // D.i16 = flt16_to_int16(S.f16).
4814 // Supports rounding, exception flags and saturation.
4815 void
4817 {
4819 } // execute
4820 // --- Inst_VOP3__V_RCP_F16 class methods ---
4821
4823 : Inst_VOP3A(iFmt, "v_rcp_f16", false)
4824 {
4825 setFlag(ALU);
4826 setFlag(F16);
4827 } // Inst_VOP3__V_RCP_F16
4828
4830 {
4831 } // ~Inst_VOP3__V_RCP_F16
4832
4833 // --- description from .arch file ---
4834 // if (S0.f16 == 1.0f)
4835 // D.f16 = 1.0f;
4836 // else
4837 // D.f16 = ApproximateRecip(S0.f16).
4838 void
4840 {
4842 } // execute
4843 // --- Inst_VOP3__V_SQRT_F16 class methods ---
4844
4846 : Inst_VOP3A(iFmt, "v_sqrt_f16", false)
4847 {
4848 setFlag(ALU);
4849 setFlag(F16);
4850 } // Inst_VOP3__V_SQRT_F16
4851
4853 {
4854 } // ~Inst_VOP3__V_SQRT_F16
4855
4856 // --- description from .arch file ---
4857 // if (S0.f16 == 1.0f)
4858 // D.f16 = 1.0f;
4859 // else
4860 // D.f16 = ApproximateSqrt(S0.f16).
4861 void
4863 {
4865 } // execute
4866 // --- Inst_VOP3__V_RSQ_F16 class methods ---
4867
4869 : Inst_VOP3A(iFmt, "v_rsq_f16", false)
4870 {
4871 setFlag(ALU);
4872 setFlag(F16);
4873 } // Inst_VOP3__V_RSQ_F16
4874
4876 {
4877 } // ~Inst_VOP3__V_RSQ_F16
4878
4879 // --- description from .arch file ---
4880 // if (S0.f16 == 1.0f)
4881 // D.f16 = 1.0f;
4882 // else
4883 // D.f16 = ApproximateRecipSqrt(S0.f16).
4884 void
4886 {
4888 } // execute
4889 // --- Inst_VOP3__V_LOG_F16 class methods ---
4890
4892 : Inst_VOP3A(iFmt, "v_log_f16", false)
4893 {
4894 setFlag(ALU);
4895 setFlag(F16);
4896 } // Inst_VOP3__V_LOG_F16
4897
4899 {
4900 } // ~Inst_VOP3__V_LOG_F16
4901
4902 // --- description from .arch file ---
4903 // if (S0.f16 == 1.0f)
4904 // D.f16 = 0.0f;
4905 // else
4906 // D.f16 = ApproximateLog2(S0.f16).
4907 void
4909 {
4911 } // execute
4912 // --- Inst_VOP3__V_EXP_F16 class methods ---
4913
4915 : Inst_VOP3A(iFmt, "v_exp_f16", false)
4916 {
4917 setFlag(ALU);
4918 setFlag(F16);
4919 } // Inst_VOP3__V_EXP_F16
4920
4922 {
4923 } // ~Inst_VOP3__V_EXP_F16
4924
4925 // --- description from .arch file ---
4926 // if (S0.f16 == 0.0f)
4927 // D.f16 = 1.0f;
4928 // else
4929 // D.f16 = Approximate2ToX(S0.f16).
4930 void
4932 {
4934 } // execute
4935 // --- Inst_VOP3__V_FREXP_MANT_F16 class methods ---
4936
4938 : Inst_VOP3A(iFmt, "v_frexp_mant_f16", false)
4939 {
4940 setFlag(ALU);
4941 setFlag(F16);
4942 } // Inst_VOP3__V_FREXP_MANT_F16
4943
4945 {
4946 } // ~Inst_VOP3__V_FREXP_MANT_F16
4947
4948 // --- description from .arch file ---
4949 // if (S0.f16 == +-INF || S0.f16 == NAN)
4950 // D.f16 = S0.f16;
4951 // else
4952 // D.f16 = mantissa(S0.f16).
4953 // Result range is (-1.0,-0.5][0.5,1.0).
4954 // C math library frexp function.
4955 // Returns binary significand of half precision float input, such that the
4956 // original single float = significand * (2 ** exponent).
4957 void
4959 {
4961 } // execute
4962 // --- Inst_VOP3__V_FREXP_EXP_I16_F16 class methods ---
4963
4965 InFmt_VOP3A *iFmt)
4966 : Inst_VOP3A(iFmt, "v_frexp_exp_i16_f16", false)
4967 {
4968 setFlag(ALU);
4969 setFlag(F16);
4970 } // Inst_VOP3__V_FREXP_EXP_I16_F16
4971
4973 {
4974 } // ~Inst_VOP3__V_FREXP_EXP_I16_F16
4975
4976 // --- description from .arch file ---
4977 // if (S0.f16 == +-INF || S0.f16 == NAN)
4978 // D.i16 = 0;
4979 // else
4980 // D.i16 = 2s_complement(exponent(S0.f16) - 15 + 1).
4981 // C math library frexp function.
4982 // Returns exponent of half precision float input, such that the
4983 // original single float = significand * (2 ** exponent).
4984 void
4989 // --- Inst_VOP3__V_FLOOR_F16 class methods ---
4990
4992 : Inst_VOP3A(iFmt, "v_floor_f16", false)
4993 {
4994 setFlag(ALU);
4995 setFlag(F16);
4996 } // Inst_VOP3__V_FLOOR_F16
4997
4999 {
5000 } // ~Inst_VOP3__V_FLOOR_F16
5001
5002 // --- description from .arch file ---
5003 // D.f16 = trunc(S0.f16);
5004 // if (S0.f16 < 0.0f && S0.f16 != D.f16) then D.f16 -= 1.0f.
5005 void
5007 {
5009 } // execute
5010 // --- Inst_VOP3__V_CEIL_F16 class methods ---
5011
5013 : Inst_VOP3A(iFmt, "v_ceil_f16", false)
5014 {
5015 setFlag(ALU);
5016 setFlag(F16);
5017 } // Inst_VOP3__V_CEIL_F16
5018
5020 {
5021 } // ~Inst_VOP3__V_CEIL_F16
5022
5023 // --- description from .arch file ---
5024 // D.f16 = trunc(S0.f16);
5025 // if (S0.f16 > 0.0f && S0.f16 != D.f16) then D.f16 += 1.0f.
5026 void
5028 {
5030 } // execute
5031 // --- Inst_VOP3__V_TRUNC_F16 class methods ---
5032
5034 : Inst_VOP3A(iFmt, "v_trunc_f16", false)
5035 {
5036 setFlag(ALU);
5037 setFlag(F16);
5038 } // Inst_VOP3__V_TRUNC_F16
5039
5041 {
5042 } // ~Inst_VOP3__V_TRUNC_F16
5043
5044 // --- description from .arch file ---
5045 // D.f16 = trunc(S0.f16).
5046 // Round-to-zero semantics.
5047 void
5049 {
5051 } // execute
5052 // --- Inst_VOP3__V_RNDNE_F16 class methods ---
5053
5055 : Inst_VOP3A(iFmt, "v_rndne_f16", false)
5056 {
5057 setFlag(ALU);
5058 setFlag(F16);
5059 } // Inst_VOP3__V_RNDNE_F16
5060
5062 {
5063 } // ~Inst_VOP3__V_RNDNE_F16
5064
5065 // --- description from .arch file ---
5066 // D.f16 = FLOOR(S0.f16 + 0.5f);
5067 // if (floor(S0.f16) is even && fract(S0.f16) == 0.5f) then D.f16 -= 1.0f.
5068 // Round-to-nearest-even semantics.
5069 void
5071 {
5073 } // execute
5074 // --- Inst_VOP3__V_FRACT_F16 class methods ---
5075
5077 : Inst_VOP3A(iFmt, "v_fract_f16", false)
5078 {
5079 setFlag(ALU);
5080 setFlag(F16);
5081 } // Inst_VOP3__V_FRACT_F16
5082
5084 {
5085 } // ~Inst_VOP3__V_FRACT_F16
5086
5087 // --- description from .arch file ---
5088 // D.f16 = S0.f16 + -floor(S0.f16).
5089 void
5091 {
5093 } // execute
5094 // --- Inst_VOP3__V_SIN_F16 class methods ---
5095
5097 : Inst_VOP3A(iFmt, "v_sin_f16", false)
5098 {
5099 setFlag(ALU);
5100 setFlag(F16);
5101 } // Inst_VOP3__V_SIN_F16
5102
5104 {
5105 } // ~Inst_VOP3__V_SIN_F16
5106
5107 // --- description from .arch file ---
5108 // D.f16 = sin(S0.f16 * 2 * PI).
5109 void
5111 {
5113 } // execute
5114 // --- Inst_VOP3__V_COS_F16 class methods ---
5115
5117 : Inst_VOP3A(iFmt, "v_cos_f16", false)
5118 {
5119 setFlag(ALU);
5120 setFlag(F16);
5121 } // Inst_VOP3__V_COS_F16
5122
5124 {
5125 } // ~Inst_VOP3__V_COS_F16
5126
5127 // --- description from .arch file ---
5128 // D.f16 = cos(S0.f16 * 2 * PI).
5129 void
5131 {
5133 } // execute
5134 // --- Inst_VOP3__V_EXP_LEGACY_F32 class methods ---
5135
5137 : Inst_VOP3A(iFmt, "v_exp_legacy_f32", false)
5138 {
5139 setFlag(ALU);
5140 setFlag(F32);
5141 } // Inst_VOP3__V_EXP_LEGACY_F32
5142
5144 {
5145 } // ~Inst_VOP3__V_EXP_LEGACY_F32
5146
5147 // --- description from .arch file ---
5148 // D.f = pow(2.0, S0.f) with legacy semantics.
5149 void
5151 {
5152 Wavefront *wf = gpuDynInst->wavefront();
5153 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
5154 VecOperandF32 vdst(gpuDynInst, instData.VDST);
5155
5156 src.readSrc();
5157
5158 if (instData.ABS & 0x1) {
5159 src.absModifier();
5160 }
5161
5162 if (extData.NEG & 0x1) {
5163 src.negModifier();
5164 }
5165
5169 assert(!(instData.ABS & 0x2));
5170 assert(!(instData.ABS & 0x4));
5171 assert(!(extData.NEG & 0x2));
5172 assert(!(extData.NEG & 0x4));
5173
5174 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5175 if (wf->execMask(lane)) {
5176 vdst[lane] = std::pow(2.0, src[lane]);
5177 }
5178 }
5179
5180 vdst.write();
5181 } // execute
5182 // --- Inst_VOP3__V_LOG_LEGACY_F32 class methods ---
5183
5185 : Inst_VOP3A(iFmt, "v_log_legacy_f32", false)
5186 {
5187 setFlag(ALU);
5188 setFlag(F32);
5189 } // Inst_VOP3__V_LOG_LEGACY_F32
5190
5192 {
5193 } // ~Inst_VOP3__V_LOG_LEGACY_F32
5194
5195 // --- description from .arch file ---
5196 // D.f = log2(S0.f). Base 2 logarithm with legacy semantics.
5197 void
5199 {
5200 Wavefront *wf = gpuDynInst->wavefront();
5201 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
5202 VecOperandF32 vdst(gpuDynInst, instData.VDST);
5203
5204 src.readSrc();
5205
5206 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5207 if (wf->execMask(lane)) {
5208 vdst[lane] = std::log2(src[lane]);
5209 }
5210 }
5211
5212 vdst.write();
5213 } // execute
5214 // --- Inst_VOP3__V_PRNG_B32 class methods ---
5215
5217 : Inst_VOP3A(iFmt, "v_prng_b32", false)
5218 {
5219 setFlag(ALU);
5220 } // Inst_VOP3__V_PRNG_B32
5221
5223 {} // ~Inst_VOP3__V_PRNG_B32
5224
5225 // Generate a pseudorandom number using an LFSR (linear feedback shift
5226 // register) seeded with the vector input, then store the result into a
5227 // vector register.
5228 //
5229 // in = S0.u32;
5230 // D0.u32 = ((in << 1U) ^ (in[31] ? 197U : 0U))
5231 //
5232 // Notes: This function produces a sequence of pseudorandom numbers with
5233 // period 2**32 - 1 unless the input is zero, in which case the period is
5234 // 1.
5235 void
5237 {
5238 Wavefront *wf = gpuDynInst->wavefront();
5239 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
5240 VecOperandU32 vdst(gpuDynInst, instData.VDST);
5241
5242 src.readSrc();
5243
5244 panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
5245 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
5246 panic_if(instData.ABS, "ABS not supported for %s", _opcode);
5247 panic_if(instData.CLAMP, "CLAMP not supported for %s", _opcode);
5248 panic_if(extData.OMOD, "OMOD not supported for %s", _opcode);
5249 panic_if(extData.NEG, "NEG not supported for %s", _opcode);
5250 panic_if(instData.OPSEL, "OPSEL not supported for %s", _opcode);
5251
5252 auto randFunc = [](VecElemU32 in) {
5253 return ((in << 1) ^ (((in >> 31) & 1) ? 0xc5 : 0x00));
5254 };
5255
5256 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5257 if (wf->execMask(lane)) {
5258 vdst[lane] = randFunc(src[lane]);
5259 }
5260 }
5261
5262 vdst.write();
5263 } // execute
5264 // --- Inst_VOP3__V_CVT_F32_BF16 class methods ---
5265
5267 : Inst_VOP3A(iFmt, "v_cvt_f32_bf16", false)
5268 {
5269 setFlag(ALU);
5270 setFlag(F32);
5271 } // Inst_VOP3__V_CVT_F32_BF16
5272
5274 {
5275 } // ~Inst_VOP3__V_CVT_F32_BF16
5276
5277 void
5279 {
5280 Wavefront *wf = gpuDynInst->wavefront();
5281 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
5282 VecOperandF32 vdst(gpuDynInst, instData.VDST);
5283
5284 src.readSrc();
5285
5286 bool clamp = instData.CLAMP;
5287 unsigned abs = instData.ABS;
5288 unsigned opsel = instData.OPSEL;
5289 unsigned omod = extData.OMOD;
5290 unsigned neg = extData.NEG;
5291
5292 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5293 if (wf->execMask(lane)) {
5294 uint16_t s0 = (opsel & 1) ? bits(src[lane], 31, 16)
5295 : bits(src[lane], 15, 0);
5296
5298 tmp.data = s0;
5299
5300 float f32 = float(tmp);
5301
5302 if (abs & 1) f32 = std::fabs(f32);
5303 if (neg & 1) f32 = -f32;
5304 if (omod) f32 = omodModifier(f32, omod);
5305 if (clamp) f32 = std::clamp(f32, 0.0f, 1.0f);
5306
5307 vdst[lane] = f32;
5308 }
5309 }
5310
5311 vdst.write();
5312 } // execute
5313 // --- Inst_VOP3__V_MAD_LEGACY_F32 class methods ---
5314
5316 : Inst_VOP3A(iFmt, "v_mad_legacy_f32", false)
5317 {
5318 setFlag(ALU);
5319 setFlag(F32);
5320 setFlag(MAD);
5321 } // Inst_VOP3__V_MAD_LEGACY_F32
5322
5324 {
5325 } // ~Inst_VOP3__V_MAD_LEGACY_F32
5326
5327 // --- description from .arch file ---
5328 // D.f = S0.f * S1.f + S2.f (DX9 rules, 0.0 * x = 0.0).
5329 void
5331 {
5332 Wavefront *wf = gpuDynInst->wavefront();
5333 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
5334 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
5335 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
5336 VecOperandF32 vdst(gpuDynInst, instData.VDST);
5337
5338 src0.readSrc();
5339 src1.readSrc();
5340 src2.readSrc();
5341
5342 if (instData.ABS & 0x1) {
5343 src0.absModifier();
5344 }
5345
5346 if (instData.ABS & 0x2) {
5347 src1.absModifier();
5348 }
5349
5350 if (instData.ABS & 0x4) {
5351 src2.absModifier();
5352 }
5353
5354 if (extData.NEG & 0x1) {
5355 src0.negModifier();
5356 }
5357
5358 if (extData.NEG & 0x2) {
5359 src1.negModifier();
5360 }
5361
5362 if (extData.NEG & 0x4) {
5363 src2.negModifier();
5364 }
5365
5366 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5367 if (wf->execMask(lane)) {
5368 vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
5369 }
5370 }
5371
5372 vdst.write();
5373 } // execute
5374 // --- Inst_VOP3__V_MAD_F32 class methods ---
5375
5377 : Inst_VOP3A(iFmt, "v_mad_f32", false)
5378 {
5379 setFlag(ALU);
5380 setFlag(F32);
5381 setFlag(MAD);
5382 } // Inst_VOP3__V_MAD_F32
5383
5385 {
5386 } // ~Inst_VOP3__V_MAD_F32
5387
5388 // --- description from .arch file ---
5389 // D.f = S0.f * S1.f + S2.f.
5390 void
5392 {
5393 Wavefront *wf = gpuDynInst->wavefront();
5394 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
5395 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
5396 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
5397 VecOperandF32 vdst(gpuDynInst, instData.VDST);
5398
5399 src0.readSrc();
5400 src1.readSrc();
5401 src2.readSrc();
5402
5403 if (instData.ABS & 0x1) {
5404 src0.absModifier();
5405 }
5406
5407 if (instData.ABS & 0x2) {
5408 src1.absModifier();
5409 }
5410
5411 if (instData.ABS & 0x4) {
5412 src2.absModifier();
5413 }
5414
5415 if (extData.NEG & 0x1) {
5416 src0.negModifier();
5417 }
5418
5419 if (extData.NEG & 0x2) {
5420 src1.negModifier();
5421 }
5422
5423 if (extData.NEG & 0x4) {
5424 src2.negModifier();
5425 }
5426
5427 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5428 if (wf->execMask(lane)) {
5429 vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
5430 }
5431 }
5432
5433 vdst.write();
5434 } // execute
5435 // --- Inst_VOP3__V_MAD_I32_I24 class methods ---
5436
5438 : Inst_VOP3A(iFmt, "v_mad_i32_i24", false)
5439 {
5440 setFlag(ALU);
5441 setFlag(MAD);
5442 } // Inst_VOP3__V_MAD_I32_I24
5443
5445 {
5446 } // ~Inst_VOP3__V_MAD_I32_I24
5447
5448 // --- description from .arch file ---
5449 // D.i = S0.i[23:0] * S1.i[23:0] + S2.i.
5450 void
5452 {
5453 Wavefront *wf = gpuDynInst->wavefront();
5454 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
5455 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
5456 ConstVecOperandI32 src2(gpuDynInst, extData.SRC2);
5457 VecOperandI32 vdst(gpuDynInst, instData.VDST);
5458
5459 src0.readSrc();
5460 src1.readSrc();
5461 src2.readSrc();
5462
5466 assert(!(instData.ABS & 0x1));
5467 assert(!(instData.ABS & 0x2));
5468 assert(!(instData.ABS & 0x4));
5469 assert(!(extData.NEG & 0x1));
5470 assert(!(extData.NEG & 0x2));
5471 assert(!(extData.NEG & 0x4));
5472
5473 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5474 if (wf->execMask(lane)) {
5475 vdst[lane] = sext<24>(bits(src0[lane], 23, 0))
5476 * sext<24>(bits(src1[lane], 23, 0)) + src2[lane];
5477 }
5478 }
5479
5480 vdst.write();
5481 } // execute
5482 // --- Inst_VOP3__V_MAD_U32_U24 class methods ---
5483
5485 : Inst_VOP3A(iFmt, "v_mad_u32_u24", false)
5486 {
5487 setFlag(ALU);
5488 setFlag(MAD);
5489 } // Inst_VOP3__V_MAD_U32_U24
5490
5492 {
5493 } // ~Inst_VOP3__V_MAD_U32_U24
5494
5495 // --- description from .arch file ---
5496 // D.u = S0.u[23:0] * S1.u[23:0] + S2.u.
5497 void
5499 {
5500 Wavefront *wf = gpuDynInst->wavefront();
5501 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
5502 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
5503 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
5504 VecOperandU32 vdst(gpuDynInst, instData.VDST);
5505
5506 src0.readSrc();
5507 src1.readSrc();
5508 src2.readSrc();
5509
5513 assert(!(instData.ABS & 0x1));
5514 assert(!(instData.ABS & 0x2));
5515 assert(!(instData.ABS & 0x4));
5516 assert(!(extData.NEG & 0x1));
5517 assert(!(extData.NEG & 0x2));
5518 assert(!(extData.NEG & 0x4));
5519
5520 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5521 if (wf->execMask(lane)) {
5522 vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0)
5523 + src2[lane];
5524 }
5525 }
5526
5527 vdst.write();
5528 } // execute
5529 // --- Inst_VOP3__V_CUBEID_F32 class methods ---
5530
5532 : Inst_VOP3A(iFmt, "v_cubeid_f32", false)
5533 {
5534 setFlag(ALU);
5535 setFlag(F32);
5536 } // Inst_VOP3__V_CUBEID_F32
5537
5539 {
5540 } // ~Inst_VOP3__V_CUBEID_F32
5541
5542 // --- description from .arch file ---
5543 // D.f = cubemap face ID ({0.0, 1.0, ..., 5.0}). XYZ coordinate is given in
5544 // --- (S0.f, S1.f, S2.f).
5545 void
5547 {
5549 } // execute
5550 // --- Inst_VOP3__V_CUBESC_F32 class methods ---
5551
5553 : Inst_VOP3A(iFmt, "v_cubesc_f32", false)
5554 {
5555 setFlag(ALU);
5556 setFlag(F32);
5557 } // Inst_VOP3__V_CUBESC_F32
5558
5560 {
5561 } // ~Inst_VOP3__V_CUBESC_F32
5562
5563 // --- description from .arch file ---
5564 // D.f = cubemap S coordinate. XYZ coordinate is given in (S0.f, S1.f,
5565 // S2.f).
5566 void
5568 {
5570 } // execute
5571 // --- Inst_VOP3__V_CUBETC_F32 class methods ---
5572
5574 : Inst_VOP3A(iFmt, "v_cubetc_f32", false)
5575 {
5576 setFlag(ALU);
5577 setFlag(F32);
5578 } // Inst_VOP3__V_CUBETC_F32
5579
5581 {
5582 } // ~Inst_VOP3__V_CUBETC_F32
5583
5584 // --- description from .arch file ---
5585 // D.f = cubemap T coordinate. XYZ coordinate is given in (S0.f, S1.f,
5586 // S2.f).
5587 void
5589 {
5591 } // execute
5592 // --- Inst_VOP3__V_CUBEMA_F32 class methods ---
5593
5595 : Inst_VOP3A(iFmt, "v_cubema_f32", false)
5596 {
5597 setFlag(ALU);
5598 setFlag(F32);
5599 } // Inst_VOP3__V_CUBEMA_F32
5600
5602 {
5603 } // ~Inst_VOP3__V_CUBEMA_F32
5604
5605 // --- description from .arch file ---
5606 // D.f = 2.0 * cubemap major axis. XYZ coordinate is given in (S0.f, S1.f,
5607 // --- S2.f).
5608 void
5610 {
5612 } // execute
5613 // --- Inst_VOP3__V_BFE_U32 class methods ---
5614
5616 : Inst_VOP3A(iFmt, "v_bfe_u32", false)
5617 {
5618 setFlag(ALU);
5619 } // Inst_VOP3__V_BFE_U32
5620
5622 {
5623 } // ~Inst_VOP3__V_BFE_U32
5624
5625 // --- description from .arch file ---
5626 // D.u = (S0.u>>S1.u[4:0]) & ((1<<S2.u[4:0])-1).
5627 // Bitfield extract with S0 = data, S1 = field_offset, S2 = field_width.
5628 void
5630 {
5631 Wavefront *wf = gpuDynInst->wavefront();
5632 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
5633 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
5634 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
5635 VecOperandU32 vdst(gpuDynInst, instData.VDST);
5636
5637 src0.readSrc();
5638 src1.readSrc();
5639 src2.readSrc();
5640
5644 assert(!(instData.ABS & 0x1));
5645 assert(!(instData.ABS & 0x2));
5646 assert(!(instData.ABS & 0x4));
5647 assert(!(extData.NEG & 0x1));
5648 assert(!(extData.NEG & 0x2));
5649 assert(!(extData.NEG & 0x4));
5650
5651 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5652 if (wf->execMask(lane)) {
5653 vdst[lane] = (src0[lane] >> bits(src1[lane], 4, 0))
5654 & ((1 << bits(src2[lane], 4, 0)) - 1);
5655 }
5656 }
5657
5658 vdst.write();
5659 } // execute
5660 // --- Inst_VOP3__V_BFE_I32 class methods ---
5661
5663 : Inst_VOP3A(iFmt, "v_bfe_i32", false)
5664 {
5665 setFlag(ALU);
5666 } // Inst_VOP3__V_BFE_I32
5667
5669 {
5670 } // ~Inst_VOP3__V_BFE_I32
5671
5672 // --- description from .arch file ---
5673 // D.i = (S0.i>>S1.u[4:0]) & ((1<<S2.u[4:0])-1).
5674 // Bitfield extract with S0 = data, S1 = field_offset, S2 = field_width.
5675 void
5677 {
5678 Wavefront *wf = gpuDynInst->wavefront();
5679 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
5680 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
5681 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
5682 VecOperandI32 vdst(gpuDynInst, instData.VDST);
5683
5684 src0.readSrc();
5685 src1.readSrc();
5686 src2.readSrc();
5687
5691 assert(!(instData.ABS & 0x1));
5692 assert(!(instData.ABS & 0x2));
5693 assert(!(instData.ABS & 0x4));
5694 assert(!(extData.NEG & 0x1));
5695 assert(!(extData.NEG & 0x2));
5696 assert(!(extData.NEG & 0x4));
5697
5698 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5699 if (wf->execMask(lane)) {
5700 vdst[lane] = (src0[lane] >> bits(src1[lane], 4, 0))
5701 & ((1 << bits(src2[lane], 4, 0)) - 1);
5702
5703 // Above extracted a signed int of size src2 bits which needs
5704 // to be signed-extended. Check if the MSB of our src2-bit
5705 // integer is 1, and sign extend it is.
5706 if (vdst[lane] >> (bits(src2[lane], 4, 0) - 1)) {
5707 vdst[lane] |= 0xffffffff << bits(src2[lane], 4, 0);
5708 }
5709 }
5710 }
5711
5712 vdst.write();
5713 } // execute
5714 // --- Inst_VOP3__V_BFI_B32 class methods ---
5715
5717 : Inst_VOP3A(iFmt, "v_bfi_b32", false)
5718 {
5719 setFlag(ALU);
5720 } // Inst_VOP3__V_BFI_B32
5721
5723 {
5724 } // ~Inst_VOP3__V_BFI_B32
5725
5726 // --- description from .arch file ---
5727 // D.u = (S0.u & S1.u) | (~S0.u & S2.u); bitfield insert.
5728 void
5730 {
5731 Wavefront *wf = gpuDynInst->wavefront();
5732 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
5733 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
5734 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
5735 VecOperandU32 vdst(gpuDynInst, instData.VDST);
5736
5737 src0.readSrc();
5738 src1.readSrc();
5739 src2.readSrc();
5740
5744 assert(!(instData.ABS & 0x1));
5745 assert(!(instData.ABS & 0x2));
5746 assert(!(instData.ABS & 0x4));
5747 assert(!(extData.NEG & 0x1));
5748 assert(!(extData.NEG & 0x2));
5749 assert(!(extData.NEG & 0x4));
5750
5751 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5752 if (wf->execMask(lane)) {
5753 vdst[lane] = (src0[lane] & src1[lane]) | (~src0[lane]
5754 & src2[lane]);
5755 }
5756 }
5757
5758 vdst.write();
5759 } // execute
5760 // --- Inst_VOP3__V_FMA_F32 class methods ---
5761
5763 : Inst_VOP3A(iFmt, "v_fma_f32", false)
5764 {
5765 setFlag(ALU);
5766 setFlag(F32);
5767 setFlag(FMA);
5768 } // Inst_VOP3__V_FMA_F32
5769
5771 {
5772 } // ~Inst_VOP3__V_FMA_F32
5773
5774 // --- description from .arch file ---
5775 // D.f = S0.f * S1.f + S2.f.
5776 void
5778 {
5779 Wavefront *wf = gpuDynInst->wavefront();
5780 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
5781 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
5782 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
5783 VecOperandF32 vdst(gpuDynInst, instData.VDST);
5784
5785 src0.readSrc();
5786 src1.readSrc();
5787 src2.readSrc();
5788
5789 if (instData.ABS & 0x1) {
5790 src0.absModifier();
5791 }
5792
5793 if (instData.ABS & 0x2) {
5794 src1.absModifier();
5795 }
5796
5797 if (instData.ABS & 0x4) {
5798 src2.absModifier();
5799 }
5800
5801 if (extData.NEG & 0x1) {
5802 src0.negModifier();
5803 }
5804
5805 if (extData.NEG & 0x2) {
5806 src1.negModifier();
5807 }
5808
5809 if (extData.NEG & 0x4) {
5810 src2.negModifier();
5811 }
5812
5813 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5814 if (wf->execMask(lane)) {
5815 vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
5816 }
5817 }
5818
5819 vdst.write();
5820 } // execute
5821 // --- Inst_VOP3__V_FMA_F64 class methods ---
5822
5824 : Inst_VOP3A(iFmt, "v_fma_f64", false)
5825 {
5826 setFlag(ALU);
5827 setFlag(F64);
5828 setFlag(FMA);
5829 } // Inst_VOP3__V_FMA_F64
5830
5832 {
5833 } // ~Inst_VOP3__V_FMA_F64
5834
5835 // --- description from .arch file ---
5836 // D.d = S0.d * S1.d + S2.d.
5837 void
5839 {
5840 Wavefront *wf = gpuDynInst->wavefront();
5841 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
5842 ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
5843 ConstVecOperandF64 src2(gpuDynInst, extData.SRC2);
5844 VecOperandF64 vdst(gpuDynInst, instData.VDST);
5845
5846 src0.readSrc();
5847 src1.readSrc();
5848 src2.readSrc();
5849
5850 if (instData.ABS & 0x1) {
5851 src0.absModifier();
5852 }
5853
5854 if (instData.ABS & 0x2) {
5855 src1.absModifier();
5856 }
5857
5858 if (instData.ABS & 0x4) {
5859 src2.absModifier();
5860 }
5861
5862 if (extData.NEG & 0x1) {
5863 src0.negModifier();
5864 }
5865
5866 if (extData.NEG & 0x2) {
5867 src1.negModifier();
5868 }
5869
5870 if (extData.NEG & 0x4) {
5871 src2.negModifier();
5872 }
5873
5874 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5875 if (wf->execMask(lane)) {
5876 vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
5877 }
5878 }
5879
5880 vdst.write();
5881 } // execute
5882 // --- Inst_VOP3__V_LERP_U8 class methods ---
5883
5885 : Inst_VOP3A(iFmt, "v_lerp_u8", false)
5886 {
5887 setFlag(ALU);
5888 } // Inst_VOP3__V_LERP_U8
5889
5891 {
5892 } // ~Inst_VOP3__V_LERP_U8
5893
5894 // --- description from .arch file ---
5895 // D.u = ((S0.u[31:24] + S1.u[31:24] + S2.u[24]) >> 1) << 24
5896 // D.u += ((S0.u[23:16] + S1.u[23:16] + S2.u[16]) >> 1) << 16;
5897 // D.u += ((S0.u[15:8] + S1.u[15:8] + S2.u[8]) >> 1) << 8;
5898 // D.u += ((S0.u[7:0] + S1.u[7:0] + S2.u[0]) >> 1).
5899 // Unsigned 8-bit pixel average on packed unsigned bytes (linear
5900 // --- interpolation). S2 acts as a round mode; if set, 0.5 rounds up,
5901 // --- otherwise 0.5 truncates.
5902 void
5904 {
5905 Wavefront *wf = gpuDynInst->wavefront();
5906 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
5907 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
5908 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
5909 VecOperandU32 vdst(gpuDynInst, instData.VDST);
5910
5911 src0.readSrc();
5912 src1.readSrc();
5913 src2.readSrc();
5914
5918 assert(!(instData.ABS & 0x1));
5919 assert(!(instData.ABS & 0x2));
5920 assert(!(instData.ABS & 0x4));
5921 assert(!(extData.NEG & 0x1));
5922 assert(!(extData.NEG & 0x2));
5923 assert(!(extData.NEG & 0x4));
5924
5925 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5926 if (wf->execMask(lane)) {
5927 vdst[lane] = ((bits(src0[lane], 31, 24)
5928 + bits(src1[lane], 31, 24) + bits(src2[lane], 24)) >> 1)
5929 << 24;
5930 vdst[lane] += ((bits(src0[lane], 23, 16)
5931 + bits(src1[lane], 23, 16) + bits(src2[lane], 16)) >> 1)
5932 << 16;
5933 vdst[lane] += ((bits(src0[lane], 15, 8)
5934 + bits(src1[lane], 15, 8) + bits(src2[lane], 8)) >> 1)
5935 << 8;
5936 vdst[lane] += ((bits(src0[lane], 7, 0) + bits(src1[lane], 7, 0)
5937 + bits(src2[lane], 0)) >> 1);
5938 }
5939 }
5940
5941 vdst.write();
5942 } // execute
5943 // --- Inst_VOP3__V_ALIGNBIT_B32 class methods ---
5944
5946 : Inst_VOP3A(iFmt, "v_alignbit_b32", false)
5947 {
5948 setFlag(ALU);
5949 } // Inst_VOP3__V_ALIGNBIT_B32
5950
5952 {
5953 } // ~Inst_VOP3__V_ALIGNBIT_B32
5954
5955 // --- description from .arch file ---
5956 // D.u = ({S0,S1} >> S2.u[4:0]) & 0xffffffff.
5957 void
5959 {
5960 Wavefront *wf = gpuDynInst->wavefront();
5961 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
5962 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
5963 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
5964 VecOperandU32 vdst(gpuDynInst, instData.VDST);
5965
5966 src0.readSrc();
5967 src1.readSrc();
5968 src2.readSrc();
5969
5973 assert(!(instData.ABS & 0x1));
5974 assert(!(instData.ABS & 0x2));
5975 assert(!(instData.ABS & 0x4));
5976 assert(!(extData.NEG & 0x1));
5977 assert(!(extData.NEG & 0x2));
5978 assert(!(extData.NEG & 0x4));
5979
5980 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5981 if (wf->execMask(lane)) {
5982 VecElemU64 src_0_1 = (((VecElemU64)src0[lane] << 32)
5983 | (VecElemU64)src1[lane]);
5984 vdst[lane] = (VecElemU32)((src_0_1
5985 >> (VecElemU64)bits(src2[lane], 4, 0)) & 0xffffffff);
5986 }
5987 }
5988
5989 vdst.write();
5990 } // execute
5991 // --- Inst_VOP3__V_ALIGNBYTE_B32 class methods ---
5992
5994 : Inst_VOP3A(iFmt, "v_alignbyte_b32", false)
5995 {
5996 setFlag(ALU);
5997 } // Inst_VOP3__V_ALIGNBYTE_B32
5998
6000 {
6001 } // ~Inst_VOP3__V_ALIGNBYTE_B32
6002
6003 // --- description from .arch file ---
6004 // D.u = ({S0,S1} >> (8*S2.u[4:0])) & 0xffffffff.
6005 void
6007 {
6008 Wavefront *wf = gpuDynInst->wavefront();
6009 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
6010 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
6011 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
6012 VecOperandU32 vdst(gpuDynInst, instData.VDST);
6013
6014 src0.readSrc();
6015 src1.readSrc();
6016 src2.readSrc();
6017
6021 assert(!(instData.ABS & 0x1));
6022 assert(!(instData.ABS & 0x2));
6023 assert(!(instData.ABS & 0x4));
6024 assert(!(extData.NEG & 0x1));
6025 assert(!(extData.NEG & 0x2));
6026 assert(!(extData.NEG & 0x4));
6027
6028 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6029 if (wf->execMask(lane)) {
6030 VecElemU64 src_0_1 = (((VecElemU64)src0[lane] << 32)
6031 | (VecElemU64)src1[lane]);
6032 vdst[lane] = (VecElemU32)((src_0_1
6033 >> (8ULL * (VecElemU64)bits(src2[lane], 4, 0)))
6034 & 0xffffffff);
6035 }
6036 }
6037
6038 vdst.write();
6039 } // execute
6040 // --- Inst_VOP3__V_MIN3_F32 class methods ---
6041
6043 : Inst_VOP3A(iFmt, "v_min3_f32", false)
6044 {
6045 setFlag(ALU);
6046 setFlag(F32);
6047 } // Inst_VOP3__V_MIN3_F32
6048
6050 {
6051 } // ~Inst_VOP3__V_MIN3_F32
6052
6053 // --- description from .arch file ---
6054 // D.f = min(S0.f, S1.f, S2.f).
6055 void
6057 {
6058 Wavefront *wf = gpuDynInst->wavefront();
6059 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
6060 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
6061 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
6062 VecOperandF32 vdst(gpuDynInst, instData.VDST);
6063
6064 src0.readSrc();
6065 src1.readSrc();
6066 src2.readSrc();
6067
6068 if (instData.ABS & 0x1) {
6069 src0.absModifier();
6070 }
6071
6072 if (instData.ABS & 0x2) {
6073 src1.absModifier();
6074 }
6075
6076 if (instData.ABS & 0x4) {
6077 src2.absModifier();
6078 }
6079
6080 if (extData.NEG & 0x1) {
6081 src0.negModifier();
6082 }
6083
6084 if (extData.NEG & 0x2) {
6085 src1.negModifier();
6086 }
6087
6088 if (extData.NEG & 0x4) {
6089 src2.negModifier();
6090 }
6091
6092 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6093 if (wf->execMask(lane)) {
6094 VecElemF32 min_0_1 = std::fmin(src0[lane], src1[lane]);
6095 vdst[lane] = std::fmin(min_0_1, src2[lane]);
6096 }
6097 }
6098
6099 vdst.write();
6100 } // execute
6101 // --- Inst_VOP3__V_MIN3_I32 class methods ---
6102
6104 : Inst_VOP3A(iFmt, "v_min3_i32", false)
6105 {
6106 setFlag(ALU);
6107 } // Inst_VOP3__V_MIN3_I32
6108
6110 {
6111 } // ~Inst_VOP3__V_MIN3_I32
6112
6113 // --- description from .arch file ---
6114 // D.i = min(S0.i, S1.i, S2.i).
6115 void
6117 {
6118 Wavefront *wf = gpuDynInst->wavefront();
6119 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
6120 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
6121 ConstVecOperandI32 src2(gpuDynInst, extData.SRC2);
6122 VecOperandI32 vdst(gpuDynInst, instData.VDST);
6123
6124 src0.readSrc();
6125 src1.readSrc();
6126 src2.readSrc();
6127
6131 assert(!(instData.ABS & 0x1));
6132 assert(!(instData.ABS & 0x2));
6133 assert(!(instData.ABS & 0x4));
6134 assert(!(extData.NEG & 0x1));
6135 assert(!(extData.NEG & 0x2));
6136 assert(!(extData.NEG & 0x4));
6137
6138 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6139 if (wf->execMask(lane)) {
6140 VecElemI32 min_0_1 = std::min(src0[lane], src1[lane]);
6141 vdst[lane] = std::min(min_0_1, src2[lane]);
6142 }
6143 }
6144
6145 vdst.write();
6146 } // execute
6147 // --- Inst_VOP3__V_MIN3_U32 class methods ---
6148
6150 : Inst_VOP3A(iFmt, "v_min3_u32", false)
6151 {
6152 setFlag(ALU);
6153 } // Inst_VOP3__V_MIN3_U32
6154
6156 {
6157 } // ~Inst_VOP3__V_MIN3_U32
6158
6159 // --- description from .arch file ---
6160 // D.u = min(S0.u, S1.u, S2.u).
6161 void
6163 {
6164 Wavefront *wf = gpuDynInst->wavefront();
6165 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
6166 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
6167 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
6168 VecOperandU32 vdst(gpuDynInst, instData.VDST);
6169
6170 src0.readSrc();
6171 src1.readSrc();
6172 src2.readSrc();
6173
6177 assert(!(instData.ABS & 0x1));
6178 assert(!(instData.ABS & 0x2));
6179 assert(!(instData.ABS & 0x4));
6180 assert(!(extData.NEG & 0x1));
6181 assert(!(extData.NEG & 0x2));
6182 assert(!(extData.NEG & 0x4));
6183
6184 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6185 if (wf->execMask(lane)) {
6186 VecElemU32 min_0_1 = std::min(src0[lane], src1[lane]);
6187 vdst[lane] = std::min(min_0_1, src2[lane]);
6188 }
6189 }
6190
6191 vdst.write();
6192 } // execute
6193 // --- Inst_VOP3__V_MAX3_F32 class methods ---
6194
6196 : Inst_VOP3A(iFmt, "v_max3_f32", false)
6197 {
6198 setFlag(ALU);
6199 setFlag(F32);
6200 } // Inst_VOP3__V_MAX3_F32
6201
6203 {
6204 } // ~Inst_VOP3__V_MAX3_F32
6205
6206 // --- description from .arch file ---
6207 // D.f = max(S0.f, S1.f, S2.f).
6208 void
6210 {
6211 Wavefront *wf = gpuDynInst->wavefront();
6212 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
6213 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
6214 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
6215 VecOperandF32 vdst(gpuDynInst, instData.VDST);
6216
6217 src0.readSrc();
6218 src1.readSrc();
6219 src2.readSrc();
6220
6221 if (instData.ABS & 0x1) {
6222 src0.absModifier();
6223 }
6224
6225 if (instData.ABS & 0x2) {
6226 src1.absModifier();
6227 }
6228
6229 if (instData.ABS & 0x4) {
6230 src2.absModifier();
6231 }
6232
6233 if (extData.NEG & 0x1) {
6234 src0.negModifier();
6235 }
6236
6237 if (extData.NEG & 0x2) {
6238 src1.negModifier();
6239 }
6240
6241 if (extData.NEG & 0x4) {
6242 src2.negModifier();
6243 }
6244
6245 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6246 if (wf->execMask(lane)) {
6247 VecElemF32 max_0_1 = std::fmax(src0[lane], src1[lane]);
6248 vdst[lane] = std::fmax(max_0_1, src2[lane]);
6249 }
6250 }
6251
6252 vdst.write();
6253 } // execute
6254 // --- Inst_VOP3__V_MAX3_I32 class methods ---
6255
6257 : Inst_VOP3A(iFmt, "v_max3_i32", false)
6258 {
6259 setFlag(ALU);
6260 } // Inst_VOP3__V_MAX3_I32
6261
6263 {
6264 } // ~Inst_VOP3__V_MAX3_I32
6265
6266 // --- description from .arch file ---
6267 // D.i = max(S0.i, S1.i, S2.i).
6268 void
6270 {
6271 Wavefront *wf = gpuDynInst->wavefront();
6272 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
6273 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
6274 ConstVecOperandI32 src2(gpuDynInst, extData.SRC2);
6275 VecOperandI32 vdst(gpuDynInst, instData.VDST);
6276
6277 src0.readSrc();
6278 src1.readSrc();
6279 src2.readSrc();
6280
6284 assert(!(instData.ABS & 0x1));
6285 assert(!(instData.ABS & 0x2));
6286 assert(!(instData.ABS & 0x4));
6287 assert(!(extData.NEG & 0x1));
6288 assert(!(extData.NEG & 0x2));
6289 assert(!(extData.NEG & 0x4));
6290
6291 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6292 if (wf->execMask(lane)) {
6293 VecElemI32 max_0_1 = std::max(src0[lane], src1[lane]);
6294 vdst[lane] = std::max(max_0_1, src2[lane]);
6295 }
6296 }
6297
6298 vdst.write();
6299 } // execute
6300 // --- Inst_VOP3__V_MAX3_U32 class methods ---
6301
6303 : Inst_VOP3A(iFmt, "v_max3_u32", false)
6304 {
6305 setFlag(ALU);
6306 } // Inst_VOP3__V_MAX3_U32
6307
6309 {
6310 } // ~Inst_VOP3__V_MAX3_U32
6311
6312 // --- description from .arch file ---
6313 // D.u = max(S0.u, S1.u, S2.u).
6314 void
6316 {
6317 Wavefront *wf = gpuDynInst->wavefront();
6318 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
6319 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
6320 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
6321 VecOperandU32 vdst(gpuDynInst, instData.VDST);
6322
6323 src0.readSrc();
6324 src1.readSrc();
6325 src2.readSrc();
6326
6330 assert(!(instData.ABS & 0x1));
6331 assert(!(instData.ABS & 0x2));
6332 assert(!(instData.ABS & 0x4));
6333 assert(!(extData.NEG & 0x1));
6334 assert(!(extData.NEG & 0x2));
6335 assert(!(extData.NEG & 0x4));
6336
6337 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6338 if (wf->execMask(lane)) {
6339 VecElemU32 max_0_1 = std::max(src0[lane], src1[lane]);
6340 vdst[lane] = std::max(max_0_1, src2[lane]);
6341 }
6342 }
6343
6344 vdst.write();
6345 } // execute
6346 // --- Inst_VOP3__V_MED3_F32 class methods ---
6347
6349 : Inst_VOP3A(iFmt, "v_med3_f32", false)
6350 {
6351 setFlag(ALU);
6352 setFlag(F32);
6353 } // Inst_VOP3__V_MED3_F32
6354
6356 {
6357 } // ~Inst_VOP3__V_MED3_F32
6358
6359 // --- description from .arch file ---
6360 // D.f = median(S0.f, S1.f, S2.f).
6361 void
6363 {
6364 Wavefront *wf = gpuDynInst->wavefront();
6365 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
6366 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
6367 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
6368 VecOperandF32 vdst(gpuDynInst, instData.VDST);
6369
6370 src0.readSrc();
6371 src1.readSrc();
6372 src2.readSrc();
6373
6374 if (instData.ABS & 0x1) {
6375 src0.absModifier();
6376 }
6377
6378 if (instData.ABS & 0x2) {
6379 src1.absModifier();
6380 }
6381
6382 if (instData.ABS & 0x4) {
6383 src2.absModifier();
6384 }
6385
6386 if (extData.NEG & 0x1) {
6387 src0.negModifier();
6388 }
6389
6390 if (extData.NEG & 0x2) {
6391 src1.negModifier();
6392 }
6393
6394 if (extData.NEG & 0x4) {
6395 src2.negModifier();
6396 }
6397
6398 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6399 if (wf->execMask(lane)) {
6400 vdst[lane] = median(src0[lane], src1[lane], src2[lane]);
6401 }
6402 }
6403
6404 vdst.write();
6405 } // execute
6406 // --- Inst_VOP3__V_MED3_I32 class methods ---
6407
6409 : Inst_VOP3A(iFmt, "v_med3_i32", false)
6410 {
6411 setFlag(ALU);
6412 } // Inst_VOP3__V_MED3_I32
6413
6415 {
6416 } // ~Inst_VOP3__V_MED3_I32
6417
6418 // --- description from .arch file ---
6419 // D.i = median(S0.i, S1.i, S2.i).
6420 void
6422 {
6423 Wavefront *wf = gpuDynInst->wavefront();
6424 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
6425 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
6426 ConstVecOperandI32 src2(gpuDynInst, extData.SRC2);
6427 VecOperandI32 vdst(gpuDynInst, instData.VDST);
6428
6429 src0.readSrc();
6430 src1.readSrc();
6431 src2.readSrc();
6432
6436 assert(!(instData.ABS & 0x1));
6437 assert(!(instData.ABS & 0x2));
6438 assert(!(instData.ABS & 0x4));
6439 assert(!(extData.NEG & 0x1));
6440 assert(!(extData.NEG & 0x2));
6441 assert(!(extData.NEG & 0x4));
6442
6443 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6444 if (wf->execMask(lane)) {
6445 vdst[lane] = median(src0[lane], src1[lane], src2[lane]);
6446 }
6447 }
6448
6449 vdst.write();
6450 } // execute
6451 // --- Inst_VOP3__V_MED3_U32 class methods ---
6452
6454 : Inst_VOP3A(iFmt, "v_med3_u32", false)
6455 {
6456 setFlag(ALU);
6457 } // Inst_VOP3__V_MED3_U32
6458
6460 {
6461 } // ~Inst_VOP3__V_MED3_U32
6462
6463 // --- description from .arch file ---
6464 // D.u = median(S0.u, S1.u, S2.u).
6465 void
6467 {
6468 Wavefront *wf = gpuDynInst->wavefront();
6469 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
6470 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
6471 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
6472 VecOperandU32 vdst(gpuDynInst, instData.VDST);
6473
6474 src0.readSrc();
6475 src1.readSrc();
6476 src2.readSrc();
6477
6481 assert(!(instData.ABS & 0x1));
6482 assert(!(instData.ABS & 0x2));
6483 assert(!(instData.ABS & 0x4));
6484 assert(!(extData.NEG & 0x1));
6485 assert(!(extData.NEG & 0x2));
6486 assert(!(extData.NEG & 0x4));
6487
6488 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6489 if (wf->execMask(lane)) {
6490 vdst[lane] = median(src0[lane], src1[lane], src2[lane]);
6491 }
6492 }
6493
6494 vdst.write();
6495 } // execute
6496 // --- Inst_VOP3__V_SAD_U8 class methods ---
6497
6499 : Inst_VOP3A(iFmt, "v_sad_u8", false)
6500 {
6501 setFlag(ALU);
6502 } // Inst_VOP3__V_SAD_U8
6503
6505 {
6506 } // ~Inst_VOP3__V_SAD_U8
6507
6508 // --- description from .arch file ---
6509 // D.u = abs(S0.i[31:24] - S1.i[31:24]) + abs(S0.i[23:16] - S1.i[23:16]) +
6510 // abs(S0.i[15:8] - S1.i[15:8]) + abs(S0.i[7:0] - S1.i[7:0]) + S2.u.
6511 // Sum of absolute differences with accumulation, overflow into upper bits
6512 // is allowed.
6513 void
6515 {
6516 Wavefront *wf = gpuDynInst->wavefront();
6517 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
6518 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
6519 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
6520 VecOperandU32 vdst(gpuDynInst, instData.VDST);
6521
6522 src0.readSrc();
6523 src1.readSrc();
6524 src2.readSrc();
6525
6529 assert(!(instData.ABS & 0x1));
6530 assert(!(instData.ABS & 0x2));
6531 assert(!(instData.ABS & 0x4));
6532 assert(!(extData.NEG & 0x1));
6533 assert(!(extData.NEG & 0x2));
6534 assert(!(extData.NEG & 0x4));
6535
6536 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6537 if (wf->execMask(lane)) {
6538 vdst[lane] = std::abs(bits(src0[lane], 31, 24)
6539 - bits(src1[lane], 31, 24))
6540 + std::abs(bits(src0[lane], 23, 16)
6541 - bits(src1[lane], 23, 16))
6542 + std::abs(bits(src0[lane], 15, 8)
6543 - bits(src1[lane], 15, 8))
6544 + std::abs(bits(src0[lane], 7, 0)
6545 - bits(src1[lane], 7, 0)) + src2[lane];
6546 }
6547 }
6548
6549 vdst.write();
6550 } // execute
6551 // --- Inst_VOP3__V_SAD_HI_U8 class methods ---
6552
6554 : Inst_VOP3A(iFmt, "v_sad_hi_u8", false)
6555 {
6556 setFlag(ALU);
6557 } // Inst_VOP3__V_SAD_HI_U8
6558
6560 {
6561 } // ~Inst_VOP3__V_SAD_HI_U8
6562
6563 // --- description from .arch file ---
6564 // D.u = (SAD_U8(S0, S1, 0) << 16) + S2.u.
6565 // Sum of absolute differences with accumulation, overflow is lost.
6566 void
6568 {
6569 Wavefront *wf = gpuDynInst->wavefront();
6570 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
6571 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
6572 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
6573 VecOperandU32 vdst(gpuDynInst, instData.VDST);
6574
6575 src0.readSrc();
6576 src1.readSrc();
6577 src2.readSrc();
6578
6582 assert(!(instData.ABS & 0x1));
6583 assert(!(instData.ABS & 0x2));
6584 assert(!(instData.ABS & 0x4));
6585 assert(!(extData.NEG & 0x1));
6586 assert(!(extData.NEG & 0x2));
6587 assert(!(extData.NEG & 0x4));
6588
6589 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6590 if (wf->execMask(lane)) {
6591 vdst[lane] = (((bits(src0[lane], 31, 24)
6592 - bits(src1[lane], 31, 24)) + (bits(src0[lane], 23, 16)
6593 - bits(src1[lane], 23, 16)) + (bits(src0[lane], 15, 8)
6594 - bits(src1[lane], 15, 8)) + (bits(src0[lane], 7, 0)
6595 - bits(src1[lane], 7, 0))) << 16) + src2[lane];
6596 }
6597 }
6598
6599 vdst.write();
6600 } // execute
6601 // --- Inst_VOP3__V_SAD_U16 class methods ---
6602
6604 : Inst_VOP3A(iFmt, "v_sad_u16", false)
6605 {
6606 setFlag(ALU);
6607 } // Inst_VOP3__V_SAD_U16
6608
6610 {
6611 } // ~Inst_VOP3__V_SAD_U16
6612
6613 // --- description from .arch file ---
6614 // D.u = abs(S0.i[31:16] - S1.i[31:16]) + abs(S0.i[15:0] - S1.i[15:0])
6615 // + S2.u.
6616 // Word SAD with accumulation.
6617 void
6619 {
6620 Wavefront *wf = gpuDynInst->wavefront();
6621 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
6622 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
6623 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
6624 VecOperandU32 vdst(gpuDynInst, instData.VDST);
6625
6626 src0.readSrc();
6627 src1.readSrc();
6628 src2.readSrc();
6629
6633 assert(!(instData.ABS & 0x1));
6634 assert(!(instData.ABS & 0x2));
6635 assert(!(instData.ABS & 0x4));
6636 assert(!(extData.NEG & 0x1));
6637 assert(!(extData.NEG & 0x2));
6638 assert(!(extData.NEG & 0x4));
6639
6640 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6641 if (wf->execMask(lane)) {
6642 vdst[lane] = std::abs(bits(src0[lane], 31, 16)
6643 - bits(src1[lane], 31, 16))
6644 + std::abs(bits(src0[lane], 15, 0)
6645 - bits(src1[lane], 15, 0)) + src2[lane];
6646 }
6647 }
6648
6649 vdst.write();
6650 } // execute
6651 // --- Inst_VOP3__V_SAD_U32 class methods ---
6652
6654 : Inst_VOP3A(iFmt, "v_sad_u32", false)
6655 {
6656 setFlag(ALU);
6657 } // Inst_VOP3__V_SAD_U32
6658
6660 {
6661 } // ~Inst_VOP3__V_SAD_U32
6662
6663 // --- description from .arch file ---
6664 // D.u = abs(S0.i - S1.i) + S2.u.
6665 // Dword SAD with accumulation.
6666 void
6668 {
6669 Wavefront *wf = gpuDynInst->wavefront();
6670 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
6671 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
6672 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
6673 VecOperandU32 vdst(gpuDynInst, instData.VDST);
6674
6675 src0.readSrc();
6676 src1.readSrc();
6677 src2.readSrc();
6678
6682 assert(!(instData.ABS & 0x1));
6683 assert(!(instData.ABS & 0x2));
6684 assert(!(instData.ABS & 0x4));
6685 assert(!(extData.NEG & 0x1));
6686 assert(!(extData.NEG & 0x2));
6687 assert(!(extData.NEG & 0x4));
6688
6689 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6690 if (wf->execMask(lane)) {
6691 vdst[lane] = std::abs(src0[lane] - src1[lane]) + src2[lane];
6692 } // if
6693 } // for
6694
6695 vdst.write();
6696 } // execute
6697 // --- Inst_VOP3__V_CVT_PK_U8_F32 class methods ---
6698
6700 : Inst_VOP3A(iFmt, "v_cvt_pk_u8_f32", false)
6701 {
6702 setFlag(ALU);
6703 setFlag(F32);
6704 } // Inst_VOP3__V_CVT_PK_U8_F32
6705
6707 {
6708 } // ~Inst_VOP3__V_CVT_PK_U8_F32
6709
6710 // --- description from .arch file ---
6711 // D.u = ((flt32_to_uint8(S0.f) & 0xff) << (8 * S1.u[1:0]))
6712 // | (S2.u & ~(0xff << (8 * S1.u[1:0]))).
6713 // Convert floating point value S0 to 8-bit unsigned integer and pack the
6714 // result into byte S1 of dword S2.
6715 void
6717 {
6718 Wavefront *wf = gpuDynInst->wavefront();
6719 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
6720 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
6721 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
6722 VecOperandU32 vdst(gpuDynInst, instData.VDST);
6723
6724 src0.readSrc();
6725 src1.readSrc();
6726 src2.readSrc();
6727
6728 if (instData.ABS & 0x1) {
6729 src0.absModifier();
6730 }
6731
6732
6733 if (extData.NEG & 0x1) {
6734 src0.negModifier();
6735 }
6736
6740 assert(!(instData.ABS & 0x2));
6741 assert(!(instData.ABS & 0x4));
6742 assert(!(extData.NEG & 0x2));
6743 assert(!(extData.NEG & 0x4));
6744
6745 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6746 if (wf->execMask(lane)) {
6747 vdst[lane] = (((VecElemU8)src0[lane] & 0xff)
6748 << (8 * bits(src1[lane], 1, 0)))
6749 | (src2[lane] & ~(0xff << (8 * bits(src1[lane], 1, 0))));
6750 }
6751 }
6752
6753 vdst.write();
6754 } // execute
6755 // --- Inst_VOP3__V_DIV_FIXUP_F32 class methods ---
6756
6758 : Inst_VOP3A(iFmt, "v_div_fixup_f32", false)
6759 {
6760 setFlag(ALU);
6761 setFlag(F32);
6762 } // Inst_VOP3__V_DIV_FIXUP_F32
6763
6765 {
6766 } // ~Inst_VOP3__V_DIV_FIXUP_F32
6767
6768 // --- description from .arch file ---
6769 // D.f = Divide fixup and flags -- s0.f = Quotient, s1.f = Denominator,
6770 // s2.f = Numerator. This opcode generates exceptions resulting from the
6771 // division operation.
6772 void
6774 {
6775 Wavefront *wf = gpuDynInst->wavefront();
6776 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
6777 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
6778 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
6779 VecOperandF32 vdst(gpuDynInst, instData.VDST);
6780
6781 src0.readSrc();
6782 src1.readSrc();
6783 src2.readSrc();
6784
6785 if (instData.ABS & 0x1) {
6786 src0.absModifier();
6787 }
6788
6789 if (instData.ABS & 0x2) {
6790 src1.absModifier();
6791 }
6792
6793 if (instData.ABS & 0x4) {
6794 src2.absModifier();
6795 }
6796
6797 if (extData.NEG & 0x1) {
6798 src0.negModifier();
6799 }
6800
6801 if (extData.NEG & 0x2) {
6802 src1.negModifier();
6803 }
6804
6805 if (extData.NEG & 0x4) {
6806 src2.negModifier();
6807 }
6808
6809 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6810 if (wf->execMask(lane)) {
6811 if (std::fpclassify(src1[lane]) == FP_ZERO) {
6812 if (std::signbit(src1[lane])) {
6813 vdst[lane] = -INFINITY;
6814 } else {
6815 vdst[lane] = +INFINITY;
6816 }
6817 } else if (std::isnan(src2[lane]) || std::isnan(src1[lane])) {
6818 vdst[lane] = NAN;
6819 } else if (std::isinf(src1[lane])) {
6820 if (std::signbit(src1[lane])) {
6821 vdst[lane] = -INFINITY;
6822 } else {
6823 vdst[lane] = +INFINITY;
6824 }
6825 } else {
6826 vdst[lane] = src2[lane] / src1[lane];
6827 }
6828 }
6829 }
6830
6831 vdst.write();
6832 } // execute
6833 // --- Inst_VOP3__V_DIV_FIXUP_F64 class methods ---
6834
6836 : Inst_VOP3A(iFmt, "v_div_fixup_f64", false)
6837 {
6838 setFlag(ALU);
6839 setFlag(F64);
6840 } // Inst_VOP3__V_DIV_FIXUP_F64
6841
6843 {
6844 } // ~Inst_VOP3__V_DIV_FIXUP_F64
6845
6846 // --- description from .arch file ---
6847 // D.d = Divide fixup and flags -- s0.d = Quotient, s1.d = Denominator,
6848 // s2.d = Numerator. This opcode generates exceptions resulting from the
6849 // division operation.
6850 void
6852 {
6853 Wavefront *wf = gpuDynInst->wavefront();
6854 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
6855 ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
6856 ConstVecOperandF64 src2(gpuDynInst, extData.SRC2);
6857 VecOperandF64 vdst(gpuDynInst, instData.VDST);
6858
6859 src0.readSrc();
6860 src1.readSrc();
6861 src2.readSrc();
6862
6863 if (instData.ABS & 0x1) {
6864 src0.absModifier();
6865 }
6866
6867 if (instData.ABS & 0x2) {
6868 src1.absModifier();
6869 }
6870
6871 if (instData.ABS & 0x4) {
6872 src2.absModifier();
6873 }
6874
6875 if (extData.NEG & 0x1) {
6876 src0.negModifier();
6877 }
6878
6879 if (extData.NEG & 0x2) {
6880 src1.negModifier();
6881 }
6882
6883 if (extData.NEG & 0x4) {
6884 src2.negModifier();
6885 }
6886
6887 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6888 if (wf->execMask(lane)) {
6889 int sign_out = std::signbit(src1[lane])
6890 ^ std::signbit(src2[lane]);
6891 int exp1(0);
6892 int exp2(0);
6893 std::frexp(src1[lane], &exp1);
6894 std::frexp(src2[lane], &exp2);
6895
6896 if (std::isnan(src1[lane]) || std::isnan(src2[lane])) {
6897 vdst[lane] = std::numeric_limits<VecElemF64>::quiet_NaN();
6898 } else if (std::fpclassify(src1[lane]) == FP_ZERO
6899 && std::fpclassify(src2[lane]) == FP_ZERO) {
6900 vdst[lane]
6901 = std::numeric_limits<VecElemF64>::signaling_NaN();
6902 } else if (std::isinf(src1[lane]) && std::isinf(src2[lane])) {
6903 vdst[lane]
6904 = std::numeric_limits<VecElemF64>::signaling_NaN();
6905 } else if (std::fpclassify(src1[lane]) == FP_ZERO
6906 || std::isinf(src2[lane])) {
6907 vdst[lane] = sign_out ? -INFINITY : +INFINITY;
6908 } else if (std::isinf(src1[lane])
6909 || std::fpclassify(src2[lane]) == FP_ZERO) {
6910 vdst[lane] = sign_out ? -0.0 : +0.0;
6911 } else if (exp2 - exp1 < -1075) {
6912 vdst[lane] = src0[lane];
6913 } else if (exp1 == 2047) {
6914 vdst[lane] = src0[lane];
6915 } else {
6916 vdst[lane] = sign_out ? -std::fabs(src0[lane])
6917 : std::fabs(src0[lane]);
6918 }
6919 }
6920 }
6921
6922 vdst.write();
6923 } // execute
6924 // --- Inst_VOP3__V_DIV_SCALE_F32 class methods ---
6925
6927 InFmt_VOP3B *iFmt)
6928 : Inst_VOP3B(iFmt, "v_div_scale_f32")
6929 {
6930 setFlag(ALU);
6931 setFlag(WritesVCC);
6932 setFlag(F32);
6933 } // Inst_VOP3__V_DIV_SCALE_F32
6934
6936 {
6937 } // ~Inst_VOP3__V_DIV_SCALE_F32
6938
6939 // --- description from .arch file ---
6940 // {vcc,D.f} = Divide preop and flags -- s0.f = Quotient, s1.f =
6941 // Denominator, s2.f = Numerator -- s0 must equal s1 or s2. Given a
6942 // numerator and denominator, this opcode will appropriately scale inputs
6943 // for division to avoid subnormal terms during Newton-Raphson correction
6944 // algorithm. This opcode producses a VCC flag for post-scale of quotient.
6945 void
6947 {
6948 Wavefront *wf = gpuDynInst->wavefront();
6949 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
6950 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
6951 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
6952 ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
6953 VecOperandF32 vdst(gpuDynInst, instData.VDST);
6954
6955 src0.readSrc();
6956 src1.readSrc();
6957 src2.readSrc();
6958
6959 if (extData.NEG & 0x1) {
6960 src0.negModifier();
6961 }
6962
6963 if (extData.NEG & 0x2) {
6964 src1.negModifier();
6965 }
6966
6967 if (extData.NEG & 0x4) {
6968 src2.negModifier();
6969 }
6970
6971 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6972 if (wf->execMask(lane)) {
6973 vdst[lane] = src0[lane];
6974 vcc.setBit(lane, 0);
6975 }
6976 }
6977
6978 vcc.write();
6979 vdst.write();
6980 } // execute
6981 // --- Inst_VOP3__V_DIV_SCALE_F64 class methods ---
6982
6984 InFmt_VOP3B *iFmt)
6985 : Inst_VOP3B(iFmt, "v_div_scale_f64")
6986 {
6987 setFlag(ALU);
6988 setFlag(WritesVCC);
6989 setFlag(F64);
6990 } // Inst_VOP3__V_DIV_SCALE_F64
6991
6993 {
6994 } // ~Inst_VOP3__V_DIV_SCALE_F64
6995
6996 // --- description from .arch file ---
6997 // {vcc,D.d} = Divide preop and flags -- s0.d = Quotient, s1.d =
6998 // Denominator, s2.d = Numerator -- s0 must equal s1 or s2. Given a
6999 // numerator and denominator, this opcode will appropriately scale inputs
7000 // for division to avoid subnormal terms during Newton-Raphson correction
7001 // algorithm. This opcode producses a VCC flag for post-scale of quotient.
7002 void
7004 {
7005 Wavefront *wf = gpuDynInst->wavefront();
7006 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
7007 ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
7008 ConstVecOperandF64 src2(gpuDynInst, extData.SRC2);
7009 ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
7010 VecOperandF64 vdst(gpuDynInst, instData.VDST);
7011
7012 src0.readSrc();
7013 src1.readSrc();
7014 src2.readSrc();
7015
7016 if (extData.NEG & 0x1) {
7017 src0.negModifier();
7018 }
7019
7020 if (extData.NEG & 0x2) {
7021 src1.negModifier();
7022 }
7023
7024 if (extData.NEG & 0x4) {
7025 src2.negModifier();
7026 }
7027
7028 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7029 if (wf->execMask(lane)) {
7030 int exp1(0);
7031 int exp2(0);
7032 std::frexp(src1[lane], &exp1);
7033 std::frexp(src2[lane], &exp2);
7034 vcc.setBit(lane, 0);
7035
7036 if (std::fpclassify(src1[lane]) == FP_ZERO
7037 || std::fpclassify(src2[lane]) == FP_ZERO) {
7038 vdst[lane] = NAN;
7039 } else if (exp2 - exp1 >= 768) {
7040 vcc.setBit(lane, 1);
7041 if (src0[lane] == src1[lane]) {
7042 vdst[lane] = std::ldexp(src0[lane], 128);
7043 }
7044 } else if (std::fpclassify(src1[lane]) == FP_SUBNORMAL) {
7045 vdst[lane] = std::ldexp(src0[lane], 128);
7046 } else if (std::fpclassify(1.0 / src1[lane]) == FP_SUBNORMAL
7047 && std::fpclassify(src2[lane] / src1[lane])
7048 == FP_SUBNORMAL) {
7049 vcc.setBit(lane, 1);
7050 if (src0[lane] == src1[lane]) {
7051 vdst[lane] = std::ldexp(src0[lane], 128);
7052 }
7053 } else if (std::fpclassify(1.0 / src1[lane]) == FP_SUBNORMAL) {
7054 vdst[lane] = std::ldexp(src0[lane], -128);
7055 } else if (std::fpclassify(src2[lane] / src1[lane])
7056 == FP_SUBNORMAL) {
7057 vcc.setBit(lane, 1);
7058 if (src0[lane] == src2[lane]) {
7059 vdst[lane] = std::ldexp(src0[lane], 128);
7060 }
7061 } else if (exp2 <= 53) {
7062 vdst[lane] = std::ldexp(src0[lane], 128);
7063 }
7064 }
7065 }
7066
7067 vcc.write();
7068 vdst.write();
7069 } // execute
7070 // --- Inst_VOP3__V_DIV_FMAS_F32 class methods ---
7071
7073 : Inst_VOP3A(iFmt, "v_div_fmas_f32", false)
7074 {
7075 setFlag(ALU);
7076 setFlag(ReadsVCC);
7077 setFlag(F32);
7078 setFlag(FMA);
7079 } // Inst_VOP3__V_DIV_FMAS_F32
7080
7082 {
7083 } // ~Inst_VOP3__V_DIV_FMAS_F32
7084
7085 // --- description from .arch file ---
7086 // D.f = Special case divide FMA with scale and flags(s0.f = Quotient,
7087 // s1.f = Denominator, s2.f = Numerator)
7088 void
7090 {
7091 Wavefront *wf = gpuDynInst->wavefront();
7092 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
7093 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
7094 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
7095 VecOperandF64 vdst(gpuDynInst, instData.VDST);
7096
7097 src0.readSrc();
7098 src1.readSrc();
7099 src2.readSrc();
7100
7101 if (instData.ABS & 0x1) {
7102 src0.absModifier();
7103 }
7104
7105 if (instData.ABS & 0x2) {
7106 src1.absModifier();
7107 }
7108
7109 if (instData.ABS & 0x4) {
7110 src2.absModifier();
7111 }
7112
7113 if (extData.NEG & 0x1) {
7114 src0.negModifier();
7115 }
7116
7117 if (extData.NEG & 0x2) {
7118 src1.negModifier();
7119 }
7120
7121 if (extData.NEG & 0x4) {
7122 src2.negModifier();
7123 }
7124
7125 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7126 if (wf->execMask(lane)) {
7127 vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
7128 }
7129 }
7130
7131 //vdst.write();
7132 } // execute
7133 // --- Inst_VOP3__V_DIV_FMAS_F64 class methods ---
7134
7136 : Inst_VOP3A(iFmt, "v_div_fmas_f64", false)
7137 {
7138 setFlag(ALU);
7139 setFlag(ReadsVCC);
7140 setFlag(F64);
7141 setFlag(FMA);
7142 } // Inst_VOP3__V_DIV_FMAS_F64
7143
7145 {
7146 } // ~Inst_VOP3__V_DIV_FMAS_F64
7147
7148 // --- description from .arch file ---
7149 // D.d = Special case divide FMA with scale and flags(s0.d = Quotient,
7150 // s1.d = Denominator, s2.d = Numerator)
7151 void
7153 {
7154 Wavefront *wf = gpuDynInst->wavefront();
7155 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
7156 ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
7157 ConstVecOperandF64 src2(gpuDynInst, extData.SRC2);
7158 VecOperandF64 vdst(gpuDynInst, instData.VDST);
7159 ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
7160
7161 src0.readSrc();
7162 src1.readSrc();
7163 src2.readSrc();
7164 vcc.read();
7165
7166 if (instData.ABS & 0x1) {
7167 src0.absModifier();
7168 }
7169
7170 if (instData.ABS & 0x2) {
7171 src1.absModifier();
7172 }
7173
7174 if (instData.ABS & 0x4) {
7175 src2.absModifier();
7176 }
7177
7178 if (extData.NEG & 0x1) {
7179 src0.negModifier();
7180 }
7181
7182 if (extData.NEG & 0x2) {
7183 src1.negModifier();
7184 }
7185
7186 if (extData.NEG & 0x4) {
7187 src2.negModifier();
7188 }
7189
7190 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7191 if (wf->execMask(lane)) {
7192 if (bits(vcc.rawData(), lane)) {
7193 vdst[lane] = std::pow(2, 64)
7194 * std::fma(src0[lane], src1[lane], src2[lane]);
7195 } else {
7196 vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
7197 }
7198 }
7199 }
7200
7201 vdst.write();
7202 } // execute
7203 // --- Inst_VOP3__V_MSAD_U8 class methods ---
7204
7206 : Inst_VOP3A(iFmt, "v_msad_u8", false)
7207 {
7208 setFlag(ALU);
7209 } // Inst_VOP3__V_MSAD_U8
7210
7212 {
7213 } // ~Inst_VOP3__V_MSAD_U8
7214
7215 // --- description from .arch file ---
7216 // D.u = Masked Byte SAD with accum_lo(S0.u, S1.u, S2.u).
7217 void
7219 {
7221 } // execute
7222 // --- Inst_VOP3__V_QSAD_PK_U16_U8 class methods ---
7223
7225 : Inst_VOP3A(iFmt, "v_qsad_pk_u16_u8", false)
7226 {
7227 setFlag(ALU);
7228 } // Inst_VOP3__V_QSAD_PK_U16_U8
7229
7231 {
7232 } // ~Inst_VOP3__V_QSAD_PK_U16_U8
7233
7234 // --- description from .arch file ---
7235 // D.u = Quad-Byte SAD with 16-bit packed accum_lo/hi(S0.u[63:0],
7236 // S1.u[31:0], S2.u[63:0])
7237 void
7239 {
7241 } // execute
7242 // --- Inst_VOP3__V_MQSAD_PK_U16_U8 class methods ---
7243
7245 InFmt_VOP3A *iFmt)
7246 : Inst_VOP3A(iFmt, "v_mqsad_pk_u16_u8", false)
7247 {
7248 setFlag(ALU);
7249 } // Inst_VOP3__V_MQSAD_PK_U16_U8
7250
7252 {
7253 } // ~Inst_VOP3__V_MQSAD_PK_U16_U8
7254
7255 // --- description from .arch file ---
7256 // D.u = Masked Quad-Byte SAD with 16-bit packed accum_lo/hi(S0.u[63:0],
7257 // --- S1.u[31:0], S2.u[63:0])
7258 void
7260 {
7262 } // execute
7263 // --- Inst_VOP3__V_MQSAD_U32_U8 class methods ---
7264
7266 : Inst_VOP3A(iFmt, "v_mqsad_u32_u8", false)
7267 {
7268 setFlag(ALU);
7269 } // Inst_VOP3__V_MQSAD_U32_U8
7270
7272 {
7273 } // ~Inst_VOP3__V_MQSAD_U32_U8
7274
7275 // --- description from .arch file ---
7276 // D.u128 = Masked Quad-Byte SAD with 32-bit accum_lo/hi(S0.u[63:0],
7277 // --- S1.u[31:0], S2.u[127:0])
7278 void
7280 {
7282 } // execute
7283 // --- Inst_VOP3__V_MAD_U64_U32 class methods ---
7284
7286 InFmt_VOP3B *iFmt)
7287 : Inst_VOP3B(iFmt, "v_mad_u64_u32")
7288 {
7289 setFlag(ALU);
7290 setFlag(WritesVCC);
7291 setFlag(MAD);
7292 } // Inst_VOP3__V_MAD_U64_U32
7293
7295 {
7296 } // ~Inst_VOP3__V_MAD_U64_U32
7297
7298 // --- description from .arch file ---
7299 // {vcc_out,D.u64} = S0.u32 * S1.u32 + S2.u64.
7300 void
7302 {
7303 Wavefront *wf = gpuDynInst->wavefront();
7304 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
7305 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
7306 ConstVecOperandU64 src2(gpuDynInst, extData.SRC2);
7307 ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
7308 VecOperandU64 vdst(gpuDynInst, instData.VDST);
7309
7310 src0.readSrc();
7311 src1.readSrc();
7312 src2.readSrc();
7313 vdst.read();
7314
7318 assert(!(extData.NEG & 0x1));
7319 assert(!(extData.NEG & 0x2));
7320 assert(!(extData.NEG & 0x4));
7321
7322 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7323 if (wf->execMask(lane)) {
7324 vcc.setBit(lane, muladd(vdst[lane], src0[lane], src1[lane],
7325 src2[lane]));
7326 }
7327 }
7328
7329 vcc.write();
7330 vdst.write();
7331 } // execute
7332 // --- Inst_VOP3__V_MAD_I64_I32 class methods ---
7333
7335 InFmt_VOP3B *iFmt)
7336 : Inst_VOP3B(iFmt, "v_mad_i64_i32")
7337 {
7338 setFlag(ALU);
7339 setFlag(WritesVCC);
7340 setFlag(MAD);
7341 } // Inst_VOP3__V_MAD_I64_I32
7342
7344 {
7345 } // ~Inst_VOP3__V_MAD_I64_I32
7346
7347 // --- description from .arch file ---
7348 // {vcc_out,D.i64} = S0.i32 * S1.i32 + S2.i64.
7349 void
7351 {
7352 Wavefront *wf = gpuDynInst->wavefront();
7353 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
7354 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
7355 ConstVecOperandI64 src2(gpuDynInst, extData.SRC2);
7356 ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
7357 VecOperandI64 vdst(gpuDynInst, instData.VDST);
7358
7359 src0.readSrc();
7360 src1.readSrc();
7361 src2.readSrc();
7362
7366 assert(!(extData.NEG & 0x1));
7367 assert(!(extData.NEG & 0x2));
7368 assert(!(extData.NEG & 0x4));
7369
7370 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7371 if (wf->execMask(lane)) {
7372 vcc.setBit(lane, muladd(vdst[lane], src0[lane], src1[lane],
7373 src2[lane]));
7374 }
7375 }
7376
7377 vcc.write();
7378 vdst.write();
7379 } // execute
7380 // --- Inst_VOP3__V_XAD_U32 class methods ---
7381
7383 : Inst_VOP3A(iFmt, "v_xad_u32", false)
7384 {
7385 setFlag(ALU);
7386 } // Inst_VOP3__V_XAD_U32
7387
7389 {
7390 } // ~Inst_VOP3__V_XAD_U32
7391
7392 // --- description from .arch file ---
7393 // D.u32 = (S0.u32 ^ S1.u32) + S2.u32.
7394 void
7396 {
7397 Wavefront *wf = gpuDynInst->wavefront();
7398 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
7399 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
7400 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
7401 VecOperandU32 vdst(gpuDynInst, instData.VDST);
7402
7403 src0.readSrc();
7404 src1.readSrc();
7405 src2.readSrc();
7406
7410 assert(!(instData.ABS & 0x1));
7411 assert(!(instData.ABS & 0x2));
7412 assert(!(instData.ABS & 0x4));
7413 assert(!(extData.NEG & 0x1));
7414 assert(!(extData.NEG & 0x2));
7415 assert(!(extData.NEG & 0x4));
7416
7417 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7418 if (wf->execMask(lane)) {
7419 vdst[lane] = (src0[lane] ^ src1[lane]) + src2[lane];
7420 }
7421 }
7422
7423 vdst.write();
7424 } // execute
7425 // --- Inst_VOP3__V_LSHL_ADD_U32 class methods ---
7426
7428 : Inst_VOP3A(iFmt, "v_lshl_add_u32", false)
7429 {
7430 setFlag(ALU);
7431 } // Inst_VOP3__V_LSHL_ADD_U32
7432
7434 {
7435 } // ~Inst_VOP3__V_LSHL_ADD_U32
7436
7437 // --- description from .arch file ---
7438 // D.u = (S0.u << S1.u[4:0]) + S2.u.
7439 void
7441 {
7442 Wavefront *wf = gpuDynInst->wavefront();
7443 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
7444 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
7445 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
7446 VecOperandU32 vdst(gpuDynInst, instData.VDST);
7447
7448 src0.readSrc();
7449 src1.readSrc();
7450 src2.readSrc();
7451
7455 assert(!(instData.ABS & 0x1));
7456 assert(!(instData.ABS & 0x2));
7457 assert(!(instData.ABS & 0x4));
7458 assert(!(extData.NEG & 0x1));
7459 assert(!(extData.NEG & 0x2));
7460 assert(!(extData.NEG & 0x4));
7461
7462 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7463 if (wf->execMask(lane)) {
7464 vdst[lane] = (src0[lane] << bits(src1[lane], 4, 0))
7465 + src2[lane];
7466 }
7467 }
7468
7469 vdst.write();
7470 } // execute
7471 // --- Inst_VOP3__V_ADD_LSHL_U32 class methods ---
7472
7474 : Inst_VOP3A(iFmt, "v_add_lshl_u32", false)
7475 {
7476 setFlag(ALU);
7477 } // Inst_VOP3__V_ADD_LSHL_U32
7478
7480 {
7481 } // ~Inst_VOP3__V_ADD_LSHL_U32
7482
7483 // --- description from .arch file ---
7484 // D.u = (S0.u + S1.u) << S2.u[4:0].
7485 void
7487 {
7488 Wavefront *wf = gpuDynInst->wavefront();
7489 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
7490 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
7491 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
7492 VecOperandU32 vdst(gpuDynInst, instData.VDST);
7493
7494 src0.readSrc();
7495 src1.readSrc();
7496 src2.readSrc();
7497
7501 assert(!(instData.ABS & 0x1));
7502 assert(!(instData.ABS & 0x2));
7503 assert(!(instData.ABS & 0x4));
7504 assert(!(extData.NEG & 0x1));
7505 assert(!(extData.NEG & 0x2));
7506 assert(!(extData.NEG & 0x4));
7507
7508 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7509 if (wf->execMask(lane)) {
7510 vdst[lane] =
7511 (src0[lane] + src1[lane]) << bits(src2[lane], 4, 0);
7512 }
7513 }
7514
7515 vdst.write();
7516 } // execute
7517 // --- Inst_VOP3__V_ADD3_U32 class methods ---
7518
7520 : Inst_VOP3A(iFmt, "v_add3_u32", false)
7521 {
7522 setFlag(ALU);
7523 } // Inst_VOP3__V_ADD3_U32
7524
7526 {
7527 } // ~Inst_VOP3__V_ADD3_U32
7528
7529 // --- description from .arch file ---
7530 // D.u = S0.u + S1.u + S2.u.
7531 void
7533 {
7534 Wavefront *wf = gpuDynInst->wavefront();
7535 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
7536 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
7537 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
7538 VecOperandU32 vdst(gpuDynInst, instData.VDST);
7539
7540 src0.readSrc();
7541 src1.readSrc();
7542 src2.readSrc();
7543
7547 assert(!(instData.ABS & 0x1));
7548 assert(!(instData.ABS & 0x2));
7549 assert(!(instData.ABS & 0x4));
7550 assert(!(extData.NEG & 0x1));
7551 assert(!(extData.NEG & 0x2));
7552 assert(!(extData.NEG & 0x4));
7553
7554 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7555 if (wf->execMask(lane)) {
7556 vdst[lane] = src0[lane] + src1[lane] + src2[lane];
7557 }
7558 }
7559
7560 vdst.write();
7561 } // execute
7562 // --- Inst_VOP3__V_LSHL_OR_B32 class methods ---
7563
7565 : Inst_VOP3A(iFmt, "v_lshl_or_b32", false)
7566 {
7567 setFlag(ALU);
7568 } // Inst_VOP3__V_LSHL_OR_B32
7569
7571 {
7572 } // ~Inst_VOP3__V_LSHL_OR_B32
7573
7574 // --- description from .arch file ---
7575 // D.u = (S0.u << S1.u[4:0]) | S2.u.
7576 void
7578 {
7579 Wavefront *wf = gpuDynInst->wavefront();
7580 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
7581 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
7582 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
7583 VecOperandU32 vdst(gpuDynInst, instData.VDST);
7584
7585 src0.readSrc();
7586 src1.readSrc();
7587 src2.readSrc();
7588
7592 assert(!(instData.ABS & 0x1));
7593 assert(!(instData.ABS & 0x2));
7594 assert(!(instData.ABS & 0x4));
7595 assert(!(extData.NEG & 0x1));
7596 assert(!(extData.NEG & 0x2));
7597 assert(!(extData.NEG & 0x4));
7598
7599 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7600 if (wf->execMask(lane)) {
7601 vdst[lane] = (src0[lane] << bits(src1[lane], 4, 0))
7602 | src2[lane];
7603 }
7604 }
7605
7606 vdst.write();
7607 } // execute
7608 // --- Inst_VOP3__V_AND_OR_B32 class methods ---
7609
7611 : Inst_VOP3A(iFmt, "v_and_or_b32", false)
7612 {
7613 setFlag(ALU);
7614 } // Inst_VOP3__V_AND_OR_B32
7615
7617 {
7618 } // ~Inst_VOP3__V_AND_OR_B32
7619
7620 // --- description from .arch file ---
7621 // D.u = (S0.u & S1.u) | S2.u.
7622 // Input and output modifiers not supported.
7623 void
7625 {
7626 Wavefront *wf = gpuDynInst->wavefront();
7627 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
7628 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
7629 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
7630 VecOperandU32 vdst(gpuDynInst, instData.VDST);
7631
7632 src0.readSrc();
7633 src1.readSrc();
7634 src2.readSrc();
7635
7639 assert(!(instData.ABS & 0x1));
7640 assert(!(instData.ABS & 0x2));
7641 assert(!(instData.ABS & 0x4));
7642 assert(!(extData.NEG & 0x1));
7643 assert(!(extData.NEG & 0x2));
7644 assert(!(extData.NEG & 0x4));
7645
7646 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7647 if (wf->execMask(lane)) {
7648 vdst[lane] = (src0[lane] & src1[lane]) | src2[lane];
7649 }
7650 }
7651
7652 vdst.write();
7653 } // execute
7654 // --- Inst_VOP3__V_MAD_F16 class methods ---
7655
7657 : Inst_VOP3A(iFmt, "v_mad_f16", false)
7658 {
7659 setFlag(ALU);
7660 setFlag(F16);
7661 setFlag(MAD);
7662 } // Inst_VOP3__V_MAD_F16
7663
7665 {
7666 } // ~Inst_VOP3__V_MAD_F16
7667
7668 // --- description from .arch file ---
7669 // D.f16 = S0.f16 * S1.f16 + S2.f16.
7670 // Supports round mode, exception flags, saturation.
7671 void
7673 {
7675 } // execute
7676 // --- Inst_VOP3__V_MAD_U16 class methods ---
7677
7679 : Inst_VOP3A(iFmt, "v_mad_u16", false)
7680 {
7681 setFlag(ALU);
7682 setFlag(MAD);
7683 } // Inst_VOP3__V_MAD_U16
7684
7686 {
7687 } // ~Inst_VOP3__V_MAD_U16
7688
7689 // --- description from .arch file ---
7690 // D.u16 = S0.u16 * S1.u16 + S2.u16.
7691 // Supports saturation (unsigned 16-bit integer domain).
7692 void
7694 {
7695 Wavefront *wf = gpuDynInst->wavefront();
7696 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
7697 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
7698 ConstVecOperandU16 src2(gpuDynInst, extData.SRC2);
7699 VecOperandU16 vdst(gpuDynInst, instData.VDST);
7700
7701 src0.readSrc();
7702 src1.readSrc();
7703 src2.readSrc();
7704
7708 assert(!(instData.ABS & 0x1));
7709 assert(!(instData.ABS & 0x2));
7710 assert(!(instData.ABS & 0x4));
7711 assert(!(extData.NEG & 0x1));
7712 assert(!(extData.NEG & 0x2));
7713 assert(!(extData.NEG & 0x4));
7714
7715 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7716 if (wf->execMask(lane)) {
7717 vdst[lane] = src0[lane] * src1[lane] + src2[lane];
7718 }
7719 }
7720
7721 vdst.write();
7722 } // execute
7723 // --- Inst_VOP3__V_MAD_I16 class methods ---
7724
7726 : Inst_VOP3A(iFmt, "v_mad_i16", false)
7727 {
7728 setFlag(ALU);
7729 setFlag(MAD);
7730 } // Inst_VOP3__V_MAD_I16
7731
7733 {
7734 } // ~Inst_VOP3__V_MAD_I16
7735
7736 // --- description from .arch file ---
7737 // D.i16 = S0.i16 * S1.i16 + S2.i16.
7738 // Supports saturation (signed 16-bit integer domain).
7739 void
7741 {
7742 Wavefront *wf = gpuDynInst->wavefront();
7743 ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
7744 ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
7745 ConstVecOperandI16 src2(gpuDynInst, extData.SRC2);
7746 VecOperandI16 vdst(gpuDynInst, instData.VDST);
7747
7748 src0.readSrc();
7749 src1.readSrc();
7750 src2.readSrc();
7751
7755 assert(!(instData.ABS & 0x1));
7756 assert(!(instData.ABS & 0x2));
7757 assert(!(instData.ABS & 0x4));
7758 assert(!(extData.NEG & 0x1));
7759 assert(!(extData.NEG & 0x2));
7760 assert(!(extData.NEG & 0x4));
7761
7762 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7763 if (wf->execMask(lane)) {
7764 vdst[lane] = src0[lane] * src1[lane] + src2[lane];
7765 }
7766 }
7767
7768 vdst.write();
7769 } // execute
7770 // --- Inst_VOP3__V_PERM_B32 class methods ---
7771
7773 : Inst_VOP3A(iFmt, "v_perm_b32", false)
7774 {
7775 setFlag(ALU);
7776 } // Inst_VOP3__V_PERM_B32
7777
7779 {
7780 } // ~Inst_VOP3__V_PERM_B32
7781
7782 // --- description from .arch file ---
7783 // D.u[31:24] = permute({S0.u, S1.u}, S2.u[31:24]);
7784 // D.u[23:16] = permute({S0.u, S1.u}, S2.u[23:16]);
7785 // D.u[15:8] = permute({S0.u, S1.u}, S2.u[15:8]);
7786 // D.u[7:0] = permute({S0.u, S1.u}, S2.u[7:0]);
7787 // byte permute(byte in[8], byte sel) {
7788 // if (sel>=13) then return 0xff;
7789 // elsif(sel==12) then return 0x00;
7790 // elsif(sel==11) then return in[7][7] * 0xff;
7791 // elsif(sel==10) then return in[5][7] * 0xff;
7792 // elsif(sel==9) then return in[3][7] * 0xff;
7793 // elsif(sel==8) then return in[1][7] * 0xff;
7794 // else return in[sel];
7795 // }
7796 // Byte permute.
7797 void
7799 {
7800 Wavefront *wf = gpuDynInst->wavefront();
7801 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
7802 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
7803 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
7804 VecOperandU32 vdst(gpuDynInst, instData.VDST);
7805
7806 src0.readSrc();
7807 src1.readSrc();
7808 src2.readSrc();
7809
7810 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7811 if (wf->execMask(lane)) {
7812 VecElemU64 selector = (VecElemU64)src0[lane];
7813 selector = (selector << 32) | (VecElemU64)src1[lane];
7814 vdst[lane] = 0;
7815
7816 DPRINTF(VEGA, "Executing v_perm_b32 src_0 0x%08x, src_1 "
7817 "0x%08x, src_2 0x%08x, vdst 0x%08x\n", src0[lane],
7818 src1[lane], src2[lane], vdst[lane]);
7819 DPRINTF(VEGA, "Selector: 0x%08x \n", selector);
7820
7821 for (int i = 0; i < 4 ; ++i) {
7822 VecElemU32 permuted_val = permute(selector, 0xFF
7823 & ((VecElemU32)src2[lane] >> (8 * i)));
7824 vdst[lane] |= (permuted_val << (8 * i));
7825 }
7826
7827 DPRINTF(VEGA, "v_perm result: 0x%08x\n", vdst[lane]);
7828 }
7829 }
7830
7831 vdst.write();
7832 } // execute
7833 // --- Inst_VOP3__V_FMA_F16 class methods ---
7834
7836 : Inst_VOP3A(iFmt, "v_fma_f16", false)
7837 {
7838 setFlag(ALU);
7839 setFlag(F16);
7840 setFlag(FMA);
7841 } // Inst_VOP3__V_FMA_F16
7842
7844 {
7845 } // ~Inst_VOP3__V_FMA_F16
7846
7847 // --- description from .arch file ---
7848 // D.f16 = S0.f16 * S1.f16 + S2.f16.
7849 // Fused half precision multiply add.
7850 void
7852 {
7854 } // execute
7855 // --- Inst_VOP3__V_DIV_FIXUP_F16 class methods ---
7856
7858 : Inst_VOP3A(iFmt, "v_div_fixup_f16", false)
7859 {
7860 setFlag(ALU);
7861 setFlag(F16);
7862 } // Inst_VOP3__V_DIV_FIXUP_F16
7863
7865 {
7866 } // ~Inst_VOP3__V_DIV_FIXUP_F16
7867
7868 // --- description from .arch file ---
7869 // sign_out = sign(S1.f16)^sign(S2.f16);
7870 // if (S2.f16 == NAN)
7871 // D.f16 = Quiet(S2.f16);
7872 // else if (S1.f16 == NAN)
7873 // D.f16 = Quiet(S1.f16);
7874 // else if (S1.f16 == S2.f16 == 0)
7875 // # 0/0
7876 // D.f16 = pele_nan(0xfe00);
7877 // else if (abs(S1.f16) == abs(S2.f16) == +-INF)
7878 // # inf/inf
7879 // D.f16 = pele_nan(0xfe00);
7880 // else if (S1.f16 ==0 || abs(S2.f16) == +-INF)
7881 // # x/0, or inf/y
7882 // D.f16 = sign_out ? -INF : INF;
7883 // else if (abs(S1.f16) == +-INF || S2.f16 == 0)
7884 // # x/inf, 0/y
7885 // D.f16 = sign_out ? -0 : 0;
7886 // else if ((exp(S2.f16) - exp(S1.f16)) < -150)
7887 // D.f16 = sign_out ? -underflow : underflow;
7888 // else if (exp(S1.f16) == 255)
7889 // D.f16 = sign_out ? -overflow : overflow;
7890 // else
7891 // D.f16 = sign_out ? -abs(S0.f16) : abs(S0.f16).
7892 // Half precision division fixup.
7893 // S0 = Quotient, S1 = Denominator, S3 = Numerator.
7894 // Given a numerator, denominator, and quotient from a divide, this opcode
7895 // will detect and apply special case numerics, touching up the quotient if
7896 // necessary. This opcode also generates invalid, denorm and divide by
7897 // zero exceptions caused by the division.
7898 void
7900 {
7902 } // execute
7903 // --- Inst_VOP3__V_LSHL_ADD_U64 class methods ---
7904
7906 : Inst_VOP3A(iFmt, "v_lshl_add_u64", false)
7907 {
7908 setFlag(ALU);
7909 } // Inst_VOP3__V_LSHL_ADD_U64
7910
7912 {
7913 } // ~Inst_VOP3__V_LSHL_ADD_U64
7914
7915 // --- description from .arch file ---
7916 // D.u = (S0.u << S1.u[4:0]) + S2.u.
7917 void
7919 {
7920 Wavefront *wf = gpuDynInst->wavefront();
7921 ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
7922 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
7923 ConstVecOperandU64 src2(gpuDynInst, extData.SRC2);
7924 VecOperandU64 vdst(gpuDynInst, instData.VDST);
7925
7926 src0.readSrc();
7927 src1.readSrc();
7928 src2.readSrc();
7929
7933 assert(!(instData.ABS & 0x1));
7934 assert(!(instData.ABS & 0x2));
7935 assert(!(instData.ABS & 0x4));
7936 assert(!(extData.NEG & 0x1));
7937 assert(!(extData.NEG & 0x2));
7938 assert(!(extData.NEG & 0x4));
7939
7940 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7941 if (wf->execMask(lane)) {
7942 int shift_amount = bits(src1[lane], 2, 0);
7943 shift_amount = shift_amount > 4 ? 0 : shift_amount;
7944 vdst[lane] = (src0[lane] << shift_amount)
7945 + src2[lane];
7946 }
7947 }
7948
7949 vdst.write();
7950 } // execute
7951 // --- Inst_VOP3__V_CVT_PKACCUM_U8_F32 class methods ---
7952
7954 InFmt_VOP3A *iFmt)
7955 : Inst_VOP3A(iFmt, "v_cvt_pkaccum_u8_f32", false)
7956 {
7957 setFlag(ALU);
7958 setFlag(F32);
7959 } // Inst_VOP3__V_CVT_PKACCUM_U8_F32
7960
7962 {
7963 } // ~Inst_VOP3__V_CVT_PKACCUM_U8_F32
7964
7965 // --- description from .arch file ---
7966 // byte = S1.u[1:0]; bit = byte * 8;
7967 // D.u[bit+7:bit] = flt32_to_uint8(S0.f);
7968 // Pack converted value of S0.f into byte S1 of the destination.
7969 // SQ translates to V_CVT_PK_U8_F32.
7970 // Note: this opcode uses src_c to pass destination in as a source.
7971 void
7976 // --- Inst_VOP3__V_BITOP3_B16 class methods ---
7977
7979 InFmt_VOP3A *iFmt)
7980 : Inst_VOP3A(iFmt, "v_bitop3_b16", false)
7981 {
7982 setFlag(ALU);
7983 } // Inst_VOP3__V_BITOP3_B16
7984
7986 {
7987 } // ~Inst_VOP3__V_BITOP3_B16
7988
7989 void
7991 {
7992 Wavefront *wf = gpuDynInst->wavefront();
7993 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
7994 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
7995 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
7996 VecOperandU32 vdst(gpuDynInst, instData.VDST);
7997
7998 src0.readSrc();
7999 src1.readSrc();
8000 src2.readSrc();
8001 vdst.read();
8002
8003 panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
8004 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
8005 panic_if(instData.CLAMP, "Clamp not supported for %s", _opcode);
8006
8007 int opsel = instData.OPSEL;
8008
8009 uint8_t ttbl = 0;
8010 replaceBits(ttbl, 2, 0, extData.NEG & 0x7);
8011 replaceBits(ttbl, 5, 3, instData.ABS & 0x7);
8012 replaceBits(ttbl, 7, 6, extData.OMOD & 0x3);
8013
8014 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8015 if (wf->execMask(lane)) {
8016 uint16_t s0 = (opsel & 1) ? bits(src0[lane], 31, 16)
8017 : bits(src0[lane], 15, 0);
8018 uint16_t s1 = (opsel & 2) ? bits(src1[lane], 31, 16)
8019 : bits(src1[lane], 15, 0);
8020 uint16_t s2 = (opsel & 4) ? bits(src2[lane], 31, 16)
8021 : bits(src2[lane], 15, 0);
8022
8023 uint16_t tmp = 0;
8024 tmp |= (ttbl & 0x01) ? (~s0 & ~s1 & ~s2) : 0;
8025 tmp |= (ttbl & 0x02) ? (~s0 & ~s1 & s2) : 0;
8026 tmp |= (ttbl & 0x04) ? (~s0 & s1 & ~s2) : 0;
8027 tmp |= (ttbl & 0x08) ? (~s0 & s1 & s2) : 0;
8028 tmp |= (ttbl & 0x10) ? ( s0 & ~s1 & ~s2) : 0;
8029 tmp |= (ttbl & 0x20) ? ( s0 & ~s1 & s2) : 0;
8030 tmp |= (ttbl & 0x40) ? ( s0 & s1 & ~s2) : 0;
8031 tmp |= (ttbl & 0x80) ? ( s0 & s1 & s2) : 0;
8032
8033 if (opsel & 8) {
8034 replaceBits(vdst[lane], 31, 16, tmp);
8035 } else {
8036 replaceBits(vdst[lane], 15, 0, tmp);
8037 }
8038 }
8039 }
8040
8041 vdst.write();
8042 } // execute
8043 // --- Inst_VOP3__V_BITOP3_B32 class methods ---
8044
8046 InFmt_VOP3A *iFmt)
8047 : Inst_VOP3A(iFmt, "v_bitop3_b32", false)
8048 {
8049 setFlag(ALU);
8050 } // Inst_VOP3__V_BITOP3_B32
8051
8053 {
8054 } // ~Inst_VOP3__V_BITOP3_B32
8055
8056 void
8058 {
8059 Wavefront *wf = gpuDynInst->wavefront();
8060 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
8061 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
8062 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
8063 VecOperandU32 vdst(gpuDynInst, instData.VDST);
8064
8065 src0.readSrc();
8066 src1.readSrc();
8067 src2.readSrc();
8068
8069 panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
8070 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
8071 panic_if(instData.CLAMP, "Clamp not supported for %s", _opcode);
8072 panic_if(instData.OPSEL, "OP_SEL not supported for %s", _opcode);
8073
8074 uint8_t ttbl = 0;
8075 replaceBits(ttbl, 2, 0, extData.NEG & 0x7);
8076 replaceBits(ttbl, 5, 3, instData.ABS & 0x7);
8077 replaceBits(ttbl, 7, 6, extData.OMOD & 0x3);
8078
8079 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8080 if (wf->execMask(lane)) {
8081 uint32_t s0 = src0[lane];
8082 uint32_t s1 = src1[lane];
8083 uint32_t s2 = src2[lane];
8084
8085 uint16_t tmp = 0;
8086 tmp |= (ttbl & 0x01) ? (~s0 & ~s1 & ~s2) : 0;
8087 tmp |= (ttbl & 0x02) ? (~s0 & ~s1 & s2) : 0;
8088 tmp |= (ttbl & 0x04) ? (~s0 & s1 & ~s2) : 0;
8089 tmp |= (ttbl & 0x08) ? (~s0 & s1 & s2) : 0;
8090 tmp |= (ttbl & 0x10) ? ( s0 & ~s1 & ~s2) : 0;
8091 tmp |= (ttbl & 0x20) ? ( s0 & ~s1 & s2) : 0;
8092 tmp |= (ttbl & 0x40) ? ( s0 & s1 & ~s2) : 0;
8093 tmp |= (ttbl & 0x80) ? ( s0 & s1 & s2) : 0;
8094
8095 vdst[lane] = tmp;
8096 }
8097 }
8098
8099 vdst.write();
8100 } // execute
8101 // --- Inst_VOP3__V_ASHR_PK_I8_I32 class methods ---
8102
8104 : Inst_VOP3A(iFmt, "v_ashr_pk_i8_i32", false)
8105 {
8106 setFlag(ALU);
8107 } // Inst_VOP3__V_ASHR_PK_I8_I32
8108
8110 {
8111 } // ~Inst_VOP3__V_ASHR_PK_I8_I32
8112
8113 void
8115 {
8116 Wavefront *wf = gpuDynInst->wavefront();
8117 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
8118 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
8119 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
8120 VecOperandU32 vdst(gpuDynInst, instData.VDST);
8121
8122 src0.readSrc();
8123 src1.readSrc();
8124 src2.readSrc();
8125 vdst.read();
8126
8127 auto sat8 = [](int32_t n) -> uint8_t {
8128 if (n <= -128) return 0x80;
8129 else if (n >= 127) return 0x7f;
8130 else return n & 0xff;
8131 };
8132
8133 panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
8134 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
8135
8136 int opsel = instData.OPSEL;
8137 panic_if(opsel & 0x7, "Source OPSEL not supported for %s", _opcode);
8138
8139 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8140 if (wf->execMask(lane)) {
8141 uint8_t lower = sat8(src0[lane] >> bits(src2[lane], 4, 0));
8142 uint8_t upper = sat8(src0[lane] >> bits(src2[lane], 4, 0));
8143
8144 // Don't clobber unwritten bits according to pgm guide.
8145 uint16_t result = uint16_t(upper) << 8 | uint16_t(lower);
8146 if (opsel & 0x8) {
8147 replaceBits(vdst[lane], 31, 16, result);
8148 } else {
8149 replaceBits(vdst[lane], 15, 0, result);
8150 }
8151 }
8152 }
8153
8154 vdst.write();
8155
8156 } // execute
8157 // --- Inst_VOP3__V_ASHR_PK_U8_I32 class methods ---
8158
8160 : Inst_VOP3A(iFmt, "v_ashr_pk_u8_i32", false)
8161 {
8162 setFlag(ALU);
8163 } // Inst_VOP3__V_ASHR_PK_U8_I32
8164
8166 {
8167 } // ~Inst_VOP3__V_ASHR_PK_U8_I32
8168
8169 void
8171 {
8172 Wavefront *wf = gpuDynInst->wavefront();
8173 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
8174 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
8175 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
8176 VecOperandU32 vdst(gpuDynInst, instData.VDST);
8177
8178 src0.readSrc();
8179 src1.readSrc();
8180 src2.readSrc();
8181 vdst.read();
8182
8183 auto sat8 = [](int32_t n) -> uint8_t {
8184 if (n <= 0) return 0;
8185 else if (n >= 255) return 0xff;
8186 else return n & 0xff;
8187 };
8188
8189 panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
8190 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
8191
8192 int opsel = instData.OPSEL;
8193 panic_if(opsel & 0x7, "Source OPSEL not supported for %s", _opcode);
8194
8195 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8196 if (wf->execMask(lane)) {
8197 uint8_t lower = sat8(src0[lane] >> bits(src2[lane], 4, 0));
8198 uint8_t upper = sat8(src0[lane] >> bits(src2[lane], 4, 0));
8199
8200 // Don't clobber unwritten bits according to pgm guide.
8201 uint16_t result = uint16_t(upper) << 8 | uint16_t(lower);
8202 if (opsel & 0x8) {
8203 replaceBits(vdst[lane], 31, 16, result);
8204 } else {
8205 replaceBits(vdst[lane], 15, 0, result);
8206 }
8207 }
8208 }
8209
8210 vdst.write();
8211
8212 } // execute
8213 // --- Inst_VOP3__V_CVT_PK_F16_F32 class methods ---
8214
8217 : Inst_VOP3A(iFmt, "v_cvt_pk_f16_f32", false)
8218 {
8219 setFlag(ALU);
8220 } // Inst_VOP3__V_CVT_PK_F16_F32
8221
8223 {
8224 } // ~Inst_VOP3__V_CVT_PK_F16_F32
8225
8226 void
8228 {
8229 Wavefront *wf = gpuDynInst->wavefront();
8230 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
8231 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
8232 VecOperandU32 vdst(gpuDynInst, instData.VDST);
8233
8234 src0.readSrc();
8235 src1.readSrc();
8236 vdst.read();
8237
8238 panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
8239 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
8240
8241 bool clamp = instData.CLAMP;
8242 unsigned abs = instData.ABS;
8243 unsigned opsel = instData.OPSEL;
8244 unsigned omod = extData.OMOD;
8245 unsigned neg = extData.NEG;
8246
8247 // Unclear how opsel would work here.
8248 panic_if(opsel, "OPSEL not implemented for %s", _opcode);
8249
8250 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8251 if (wf->execMask(lane)) {
8252 AMDGPU::mxfloat16 tmp0(src0[lane]), tmp1(src1[lane]);
8253
8254 if (abs & 1) tmp0.fabs();
8255 if (abs & 2) tmp1.fabs();
8256 if (neg & 1) tmp0.neg();
8257 if (neg & 2) tmp1.neg();
8258 tmp0.omodModifier(omod);
8259 tmp1.omodModifier(omod);
8260 tmp0.clamp(clamp);
8261 tmp1.clamp(clamp);
8262
8263 uint32_t lower_word = tmp0.data;
8264 uint32_t upper_word = tmp1.data;
8265
8266 vdst[lane] = (upper_word << 16) | lower_word;
8267 }
8268 }
8269
8270 vdst.write();
8271 } // execute
8272 // --- Inst_VOP3__V_CVT_PK_BF16_F32 class methods ---
8273
8276 : Inst_VOP3A(iFmt, "v_cvt_pk_bf16_f32", false)
8277 {
8278 setFlag(ALU);
8279 } // Inst_VOP3__V_CVT_PK_BF16_F32
8280
8282 {
8283 } // ~Inst_VOP3__V_CVT_PK_BF16_F32
8284
8285 void
8287 {
8288 Wavefront *wf = gpuDynInst->wavefront();
8289 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
8290 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
8291 VecOperandU32 vdst(gpuDynInst, instData.VDST);
8292
8293 src0.readSrc();
8294 src1.readSrc();
8295 vdst.read();
8296
8297 panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
8298 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
8299
8300 bool clamp = instData.CLAMP;
8301 unsigned abs = instData.ABS;
8302 unsigned opsel = instData.OPSEL;
8303 unsigned omod = extData.OMOD;
8304 unsigned neg = extData.NEG;
8305
8306 // Unclear how opsel would work here.
8307 panic_if(opsel, "OPSEL not implemented for %s", _opcode);
8308
8309 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8310 if (wf->execMask(lane)) {
8311 AMDGPU::mxbfloat16 tmp0(src0[lane]), tmp1(src1[lane]);
8312
8313 if (abs & 1) tmp0.fabs();
8314 if (abs & 2) tmp1.fabs();
8315 if (neg & 1) tmp0.neg();
8316 if (neg & 2) tmp1.neg();
8317 tmp0.omodModifier(omod);
8318 tmp1.omodModifier(omod);
8319 tmp0.clamp(clamp);
8320 tmp1.clamp(clamp);
8321
8322 uint32_t lower_word = tmp0.data;
8323 uint32_t upper_word = tmp1.data;
8324
8325 vdst[lane] = (upper_word << 16) | lower_word;
8326 }
8327 }
8328
8329 vdst.write();
8330 } // execute
8331 // --- Inst_VOP3__V_INTERP_P1_F32 class methods ---
8332
8334 : Inst_VOP3A(iFmt, "v_interp_p1_f32", false)
8335 {
8336 setFlag(ALU);
8337 setFlag(F32);
8338 } // Inst_VOP3__V_INTERP_P1_F32
8339
8341 {
8342 } // ~Inst_VOP3__V_INTERP_P1_F32
8343
8344 // --- description from .arch file ---
8345 // D.f = P10 * S.f + P0; parameter interpolation (SQ translates to
8346 // V_MAD_F32 for SP).
8347 // CAUTION: when in HALF_LDS mode, D must not be the same GPR as S; if
8348 // D == S then data corruption will occur.
8349 // NOTE: In textual representations the I/J VGPR is the first source and
8350 // the attribute is the second source; however in the VOP3 encoding the
8351 // attribute is stored in the src0 field and the VGPR is stored in the
8352 // src1 field.
8353 void
8355 {
8357 } // execute
8358 // --- Inst_VOP3__V_INTERP_P2_F32 class methods ---
8359
8361 : Inst_VOP3A(iFmt, "v_interp_p2_f32", false)
8362 {
8363 setFlag(ALU);
8364 setFlag(F32);
8365 } // Inst_VOP3__V_INTERP_P2_F32
8366
8368 {
8369 } // ~Inst_VOP3__V_INTERP_P2_F32
8370
8371 // --- description from .arch file ---
8372 // D.f = P20 * S.f + D.f; parameter interpolation (SQ translates to
8373 // V_MAD_F32 for SP).
8374 // NOTE: In textual representations the I/J VGPR is the first source and
8375 // the attribute is the second source; however in the VOP3 encoding the
8376 // attribute is stored in the src0 field and the VGPR is stored in the
8377 // src1 field.
8378 void
8380 {
8382 } // execute
8383 // --- Inst_VOP3__V_INTERP_MOV_F32 class methods ---
8384
8386 : Inst_VOP3A(iFmt, "v_interp_mov_f32", false)
8387 {
8388 setFlag(ALU);
8389 setFlag(F32);
8390 } // Inst_VOP3__V_INTERP_MOV_F32
8391
8393 {
8394 } // ~Inst_VOP3__V_INTERP_MOV_F32
8395
8396 // --- description from .arch file ---
8397 // D.f = {P10,P20,P0}[S.u]; parameter load.
8398 void
8400 {
8402 } // execute
8403 // --- Inst_VOP3__V_INTERP_P1LL_F16 class methods ---
8404
8406 InFmt_VOP3A *iFmt)
8407 : Inst_VOP3A(iFmt, "v_interp_p1ll_f16", false)
8408 {
8409 setFlag(ALU);
8410 setFlag(F16);
8411 } // Inst_VOP3__V_INTERP_P1LL_F16
8412
8414 {
8415 } // ~Inst_VOP3__V_INTERP_P1LL_F16
8416
8417 // --- description from .arch file ---
8418 // D.f32 = P10.f16 * S0.f32 + P0.f16.
8419 // 'LL' stands for 'two LDS arguments'.
8420 // attr_word selects the high or low half 16 bits of each LDS dword
8421 // accessed.
8422 // This opcode is available for 32-bank LDS only.
8423 // NOTE: In textual representations the I/J VGPR is the first source and
8424 // the attribute is the second source; however in the VOP3 encoding the
8425 // attribute is stored in the src0 field and the VGPR is stored in the
8426 // src1 field.
8427 void
8429 {
8431 } // execute
8432 // --- Inst_VOP3__V_INTERP_P1LV_F16 class methods ---
8433
8435 InFmt_VOP3A *iFmt)
8436 : Inst_VOP3A(iFmt, "v_interp_p1lv_f16", false)
8437 {
8438 setFlag(ALU);
8439 setFlag(F16);
8440 } // Inst_VOP3__V_INTERP_P1LV_F16
8441
8443 {
8444 } // ~Inst_VOP3__V_INTERP_P1LV_F16
8445
8446 // --- description from .arch file ---
8447 // D.f32 = P10.f16 * S0.f32 + (S2.u32 >> (attr_word * 16)).f16.
8448 // 'LV' stands for 'One LDS and one VGPR argument'.
8449 // S2 holds two parameters, attr_word selects the high or low word of the
8450 // VGPR for this calculation, as well as the high or low half of the LDS
8451 // data.
8452 // Meant for use with 16-bank LDS.
8453 // NOTE: In textual representations the I/J VGPR is the first source and
8454 // the attribute is the second source; however in the VOP3 encoding the
8455 // attribute is stored in the src0 field and the VGPR is stored in the
8456 // src1 field.
8457 void
8459 {
8461 } // execute
8462 // --- Inst_VOP3__V_INTERP_P2_F16 class methods ---
8463
8465 : Inst_VOP3A(iFmt, "v_interp_p2_f16", false)
8466 {
8467 setFlag(ALU);
8468 setFlag(F16);
8469 } // Inst_VOP3__V_INTERP_P2_F16
8470
8472 {
8473 } // ~Inst_VOP3__V_INTERP_P2_F16
8474
8475 // --- description from .arch file ---
8476 // D.f16 = P20.f16 * S0.f32 + S2.f32.
8477 // Final computation. attr_word selects LDS high or low 16bits. Used for
8478 // both 16- and 32-bank LDS.
8479 // Result is always written to the 16 LSBs of the destination VGPR.
8480 // NOTE: In textual representations the I/J VGPR is the first source and
8481 // the attribute is the second source; however in the VOP3 encoding the
8482 // attribute is stored in the src0 field and the VGPR is stored in the
8483 // src1 field.
8484 void
8486 {
8488 } // execute
8489 // --- Inst_VOP3__V_ADD_F64 class methods ---
8490
8492 : Inst_VOP3A(iFmt, "v_add_f64", false)
8493 {
8494 setFlag(ALU);
8495 setFlag(F64);
8496 } // Inst_VOP3__V_ADD_F64
8497
8499 {
8500 } // ~Inst_VOP3__V_ADD_F64
8501
8502 // --- description from .arch file ---
8503 // D.d = S0.d + S1.d.
8504 void
8506 {
8507 Wavefront *wf = gpuDynInst->wavefront();
8508 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
8509 ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
8510 VecOperandF64 vdst(gpuDynInst, instData.VDST);
8511
8512 src0.readSrc();
8513 src1.readSrc();
8514
8515 if (instData.ABS & 0x1) {
8516 src0.absModifier();
8517 }
8518
8519 if (instData.ABS & 0x2) {
8520 src1.absModifier();
8521 }
8522
8523 if (extData.NEG & 0x1) {
8524 src0.negModifier();
8525 }
8526
8527 if (extData.NEG & 0x2) {
8528 src1.negModifier();
8529 }
8530
8534 assert(!(instData.ABS & 0x4));
8535 assert(!(extData.NEG & 0x4));
8536
8537 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8538 if (wf->execMask(lane)) {
8539 if (std::isnan(src0[lane]) ||
8540 std::isnan(src1[lane]) ) {
8541 vdst[lane] = NAN;
8542 } else if (std::isinf(src0[lane]) &&
8543 std::isinf(src1[lane])) {
8544 if (std::signbit(src0[lane]) !=
8545 std::signbit(src1[lane])) {
8546 vdst[lane] = NAN;
8547 } else {
8548 vdst[lane] = src0[lane];
8549 }
8550 } else if (std::isinf(src0[lane])) {
8551 vdst[lane] = src0[lane];
8552 } else if (std::isinf(src1[lane])) {
8553 vdst[lane] = src1[lane];
8554 } else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
8555 std::fpclassify(src0[lane]) == FP_ZERO) {
8556 if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
8557 std::fpclassify(src1[lane]) == FP_ZERO) {
8558 if (std::signbit(src0[lane]) &&
8559 std::signbit(src1[lane])) {
8560 vdst[lane] = -0.0;
8561 } else {
8562 vdst[lane] = 0.0;
8563 }
8564 } else {
8565 vdst[lane] = src1[lane];
8566 }
8567 } else if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
8568 std::fpclassify(src1[lane]) == FP_ZERO) {
8569 if (std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
8570 std::fpclassify(src0[lane]) == FP_ZERO) {
8571 if (std::signbit(src0[lane]) &&
8572 std::signbit(src1[lane])) {
8573 vdst[lane] = -0.0;
8574 } else {
8575 vdst[lane] = 0.0;
8576 }
8577 } else {
8578 vdst[lane] = src0[lane];
8579 }
8580 } else {
8581 vdst[lane] = src0[lane] + src1[lane];
8582 }
8583 }
8584 }
8585
8586 vdst.write();
8587 } // execute
8588 // --- Inst_VOP3__V_MUL_F64 class methods ---
8589
8591 : Inst_VOP3A(iFmt, "v_mul_f64", false)
8592 {
8593 setFlag(ALU);
8594 setFlag(F64);
8595 } // Inst_VOP3__V_MUL_F64
8596
8598 {
8599 } // ~Inst_VOP3__V_MUL_F64
8600
8601 // --- description from .arch file ---
8602 // D.d = S0.d * S1.d.
8603 void
8605 {
8606 Wavefront *wf = gpuDynInst->wavefront();
8607 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
8608 ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
8609 VecOperandF64 vdst(gpuDynInst, instData.VDST);
8610
8611 src0.readSrc();
8612 src1.readSrc();
8613
8614 if (instData.ABS & 0x1) {
8615 src0.absModifier();
8616 }
8617
8618 if (instData.ABS & 0x2) {
8619 src1.absModifier();
8620 }
8621
8622 if (extData.NEG & 0x1) {
8623 src0.negModifier();
8624 }
8625
8626 if (extData.NEG & 0x2) {
8627 src1.negModifier();
8628 }
8629
8633 assert(!(instData.ABS & 0x4));
8634 assert(!(extData.NEG & 0x4));
8635
8636 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8637 if (wf->execMask(lane)) {
8638 if (std::isnan(src0[lane]) ||
8639 std::isnan(src1[lane])) {
8640 vdst[lane] = NAN;
8641 } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
8642 std::fpclassify(src0[lane]) == FP_ZERO) &&
8643 !std::signbit(src0[lane])) {
8644 if (std::isinf(src1[lane])) {
8645 vdst[lane] = NAN;
8646 } else if (!std::signbit(src1[lane])) {
8647 vdst[lane] = +0.0;
8648 } else {
8649 vdst[lane] = -0.0;
8650 }
8651 } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
8652 std::fpclassify(src0[lane]) == FP_ZERO) &&
8653 std::signbit(src0[lane])) {
8654 if (std::isinf(src1[lane])) {
8655 vdst[lane] = NAN;
8656 } else if (std::signbit(src1[lane])) {
8657 vdst[lane] = +0.0;
8658 } else {
8659 vdst[lane] = -0.0;
8660 }
8661 } else if (std::isinf(src0[lane]) &&
8662 !std::signbit(src0[lane])) {
8663 if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
8664 std::fpclassify(src1[lane]) == FP_ZERO) {
8665 vdst[lane] = NAN;
8666 } else if (!std::signbit(src1[lane])) {
8667 vdst[lane] = +INFINITY;
8668 } else {
8669 vdst[lane] = -INFINITY;
8670 }
8671 } else if (std::isinf(src0[lane]) &&
8672 std::signbit(src0[lane])) {
8673 if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
8674 std::fpclassify(src1[lane]) == FP_ZERO) {
8675 vdst[lane] = NAN;
8676 } else if (std::signbit(src1[lane])) {
8677 vdst[lane] = +INFINITY;
8678 } else {
8679 vdst[lane] = -INFINITY;
8680 }
8681 } else {
8682 vdst[lane] = src0[lane] * src1[lane];
8683 }
8684 }
8685 }
8686
8687 vdst.write();
8688 } // execute
8689 // --- Inst_VOP3__V_MIN_F64 class methods ---
8690
8692 : Inst_VOP3A(iFmt, "v_min_f64", false)
8693 {
8694 setFlag(ALU);
8695 setFlag(F64);
8696 } // Inst_VOP3__V_MIN_F64
8697
8699 {
8700 } // ~Inst_VOP3__V_MIN_F64
8701
8702 // --- description from .arch file ---
8703 // D.d = min(S0.d, S1.d).
8704 void
8706 {
8707 Wavefront *wf = gpuDynInst->wavefront();
8708 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
8709 ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
8710 VecOperandF64 vdst(gpuDynInst, instData.VDST);
8711
8712 src0.readSrc();
8713 src1.readSrc();
8714
8715 if (instData.ABS & 0x1) {
8716 src0.absModifier();
8717 }
8718
8719 if (instData.ABS & 0x2) {
8720 src1.absModifier();
8721 }
8722
8723 if (extData.NEG & 0x1) {
8724 src0.negModifier();
8725 }
8726
8727 if (extData.NEG & 0x2) {
8728 src1.negModifier();
8729 }
8730
8734 assert(!(instData.ABS & 0x4));
8735 assert(!(extData.NEG & 0x4));
8736
8737 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8738 if (wf->execMask(lane)) {
8739 vdst[lane] = std::fmin(src0[lane], src1[lane]);
8740 }
8741 }
8742
8743 vdst.write();
8744 } // execute
8745 // --- Inst_VOP3__V_MAX_F64 class methods ---
8746
8748 : Inst_VOP3A(iFmt, "v_max_f64", false)
8749 {
8750 setFlag(ALU);
8751 setFlag(F64);
8752 } // Inst_VOP3__V_MAX_F64
8753
8755 {
8756 } // ~Inst_VOP3__V_MAX_F64
8757
8758 // --- description from .arch file ---
8759 // D.d = max(S0.d, S1.d).
8760 void
8762 {
8763 Wavefront *wf = gpuDynInst->wavefront();
8764 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
8765 ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
8766 VecOperandF64 vdst(gpuDynInst, instData.VDST);
8767
8768 src0.readSrc();
8769 src1.readSrc();
8770
8771 if (instData.ABS & 0x1) {
8772 src0.absModifier();
8773 }
8774
8775 if (instData.ABS & 0x2) {
8776 src1.absModifier();
8777 }
8778
8779 if (extData.NEG & 0x1) {
8780 src0.negModifier();
8781 }
8782
8783 if (extData.NEG & 0x2) {
8784 src1.negModifier();
8785 }
8786
8790 assert(!(instData.ABS & 0x4));
8791 assert(!(extData.NEG & 0x4));
8792
8793 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8794 if (wf->execMask(lane)) {
8795 vdst[lane] = std::fmax(src0[lane], src1[lane]);
8796 }
8797 }
8798
8799 vdst.write();
8800 } // execute
8801 // --- Inst_VOP3__V_LDEXP_F64 class methods ---
8802
8804 : Inst_VOP3A(iFmt, "v_ldexp_f64", false)
8805 {
8806 setFlag(ALU);
8807 setFlag(F64);
8808 } // Inst_VOP3__V_LDEXP_F64
8809
8811 {
8812 } // ~Inst_VOP3__V_LDEXP_F64
8813
8814 // --- description from .arch file ---
8815 // D.d = pow(S0.d, S1.i[31:0]).
8816 void
8818 {
8819 Wavefront *wf = gpuDynInst->wavefront();
8820 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
8821 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
8822 VecOperandF64 vdst(gpuDynInst, instData.VDST);
8823
8824 src0.readSrc();
8825 src1.readSrc();
8826
8827 if (instData.ABS & 0x1) {
8828 src0.absModifier();
8829 }
8830
8831 if (extData.NEG & 0x1) {
8832 src0.negModifier();
8833 }
8834
8838 assert(!(instData.ABS & 0x2));
8839 assert(!(instData.ABS & 0x4));
8840 assert(!(extData.NEG & 0x2));
8841 assert(!(extData.NEG & 0x4));
8842
8843 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8844 if (wf->execMask(lane)) {
8845 if (std::isnan(src0[lane]) || std::isinf(src0[lane])) {
8846 vdst[lane] = src0[lane];
8847 } else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
8848 || std::fpclassify(src0[lane]) == FP_ZERO) {
8849 if (std::signbit(src0[lane])) {
8850 vdst[lane] = -0.0;
8851 } else {
8852 vdst[lane] = +0.0;
8853 }
8854 } else {
8855 vdst[lane] = std::ldexp(src0[lane], src1[lane]);
8856 }
8857 }
8858 }
8859
8860 vdst.write();
8861 } // execute
8862 // --- Inst_VOP3__V_MUL_LO_U32 class methods ---
8863
8865 : Inst_VOP3A(iFmt, "v_mul_lo_u32", false)
8866 {
8867 setFlag(ALU);
8868 } // Inst_VOP3__V_MUL_LO_U32
8869
8871 {
8872 } // ~Inst_VOP3__V_MUL_LO_U32
8873
8874 // --- description from .arch file ---
8875 // D.u = S0.u * S1.u.
8876 void
8878 {
8879 Wavefront *wf = gpuDynInst->wavefront();
8880 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
8881 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
8882 VecOperandU32 vdst(gpuDynInst, instData.VDST);
8883
8884 src0.readSrc();
8885 src1.readSrc();
8886
8890 assert(!(instData.ABS & 0x1));
8891 assert(!(instData.ABS & 0x2));
8892 assert(!(instData.ABS & 0x4));
8893 assert(!(extData.NEG & 0x1));
8894 assert(!(extData.NEG & 0x2));
8895 assert(!(extData.NEG & 0x4));
8896
8897 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8898 if (wf->execMask(lane)) {
8899 VecElemI64 s0 = (VecElemI64)src0[lane];
8900 VecElemI64 s1 = (VecElemI64)src1[lane];
8901 vdst[lane] = (VecElemU32)((s0 * s1) & 0xffffffffLL);
8902 }
8903 }
8904
8905 vdst.write();
8906 } // execute
8907 // --- Inst_VOP3__V_MUL_HI_U32 class methods ---
8908
8910 : Inst_VOP3A(iFmt, "v_mul_hi_u32", false)
8911 {
8912 setFlag(ALU);
8913 } // Inst_VOP3__V_MUL_HI_U32
8914
8916 {
8917 } // ~Inst_VOP3__V_MUL_HI_U32
8918
8919 // --- description from .arch file ---
8920 // D.u = (S0.u * S1.u) >> 32.
8921 void
8923 {
8924 Wavefront *wf = gpuDynInst->wavefront();
8925 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
8926 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
8927 VecOperandU32 vdst(gpuDynInst, instData.VDST);
8928
8929 src0.readSrc();
8930 src1.readSrc();
8931
8935 assert(!(instData.ABS & 0x1));
8936 assert(!(instData.ABS & 0x2));
8937 assert(!(instData.ABS & 0x4));
8938 assert(!(extData.NEG & 0x1));
8939 assert(!(extData.NEG & 0x2));
8940 assert(!(extData.NEG & 0x4));
8941
8942 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8943 if (wf->execMask(lane)) {
8944 VecElemI64 s0 = (VecElemI64)src0[lane];
8945 VecElemI64 s1 = (VecElemI64)src1[lane];
8946 vdst[lane]
8947 = (VecElemU32)(((s0 * s1) >> 32) & 0xffffffffLL);
8948 }
8949 }
8950
8951 vdst.write();
8952 } // execute
8953 // --- Inst_VOP3__V_MUL_HI_I32 class methods ---
8954
8956 : Inst_VOP3A(iFmt, "v_mul_hi_i32", false)
8957 {
8958 setFlag(ALU);
8959 } // Inst_VOP3__V_MUL_HI_I32
8960
8962 {
8963 } // ~Inst_VOP3__V_MUL_HI_I32
8964
8965 // --- description from .arch file ---
8966 // D.i = (S0.i * S1.i) >> 32.
8967 void
8969 {
8970 Wavefront *wf = gpuDynInst->wavefront();
8971 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
8972 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
8973 VecOperandI32 vdst(gpuDynInst, instData.VDST);
8974
8975 src0.readSrc();
8976 src1.readSrc();
8977
8981 assert(!(instData.ABS & 0x1));
8982 assert(!(instData.ABS & 0x2));
8983 assert(!(instData.ABS & 0x4));
8984 assert(!(extData.NEG & 0x1));
8985 assert(!(extData.NEG & 0x2));
8986 assert(!(extData.NEG & 0x4));
8987
8988 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8989 if (wf->execMask(lane)) {
8990 VecElemI64 s0 = (VecElemI64)src0[lane];
8991 VecElemI64 s1 = (VecElemI64)src1[lane];
8992 vdst[lane]
8993 = (VecElemI32)(((s0 * s1) >> 32LL) & 0xffffffffLL);
8994 }
8995 }
8996
8997 vdst.write();
8998 } // execute
8999 // --- Inst_VOP3__V_LDEXP_F32 class methods ---
9000
9002 : Inst_VOP3A(iFmt, "v_ldexp_f32", false)
9003 {
9004 setFlag(ALU);
9005 setFlag(F32);
9006 } // Inst_VOP3__V_LDEXP_F32
9007
9009 {
9010 } // ~Inst_VOP3__V_LDEXP_F32
9011
9012 // --- description from .arch file ---
9013 // D.f = pow(S0.f, S1.i)
9014 void
9016 {
9017 Wavefront *wf = gpuDynInst->wavefront();
9018 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
9019 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
9020 VecOperandF32 vdst(gpuDynInst, instData.VDST);
9021
9022 src0.readSrc();
9023 src1.readSrc();
9024
9028 assert(!(instData.ABS & 0x2));
9029 assert(!(instData.ABS & 0x4));
9030 assert(!(extData.NEG & 0x2));
9031 assert(!(extData.NEG & 0x4));
9032
9033 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
9034 if (wf->execMask(lane)) {
9035 vdst[lane] = std::ldexp(src0[lane], src1[lane]);
9036 }
9037 }
9038
9039 vdst.write();
9040 } // execute
9041 // --- Inst_VOP3__V_READLANE_B32 class methods ---
9042
9044 : Inst_VOP3A(iFmt, "v_readlane_b32", true)
9045 {
9046 setFlag(ALU);
9047 setFlag(IgnoreExec);
9048 } // Inst_VOP3__V_READLANE_B32
9049
9051 {
9052 } // ~Inst_VOP3__V_READLANE_B32
9053
9054 // --- description from .arch file ---
9055 // Copy one VGPR value to one SGPR. D = SGPR-dest, S0 = Source Data (VGPR#
9056 // or M0(lds-direct)), S1 = Lane Select (SGPR or M0). Ignores exec mask.
9057 // Input and output modifiers not supported; this is an untyped operation.
9058 void
9060 {
9061 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
9062 ConstScalarOperandU32 src1(gpuDynInst, extData.SRC1);
9063 ScalarOperandU32 sdst(gpuDynInst, instData.VDST);
9064
9065 src0.readSrc();
9066 src1.read();
9067
9068 sdst = src0[src1.rawData() & 0x3f];
9069
9070 sdst.write();
9071 } // execute
9072 // --- Inst_VOP3__V_WRITELANE_B32 class methods ---
9073
9075 : Inst_VOP3A(iFmt, "v_writelane_b32", false)
9076 {
9077 setFlag(ALU);
9078 setFlag(IgnoreExec);
9079 } // Inst_VOP3__V_WRITELANE_B32
9080
9082 {
9083 } // ~Inst_VOP3__V_WRITELANE_B32
9084
9085 // --- description from .arch file ---
9086 // Write value into one VGPR in one lane. D = VGPR-dest, S0 = Source Data
9087 // (sgpr, m0, exec or constants), S1 = Lane Select (SGPR or M0). Ignores
9088 // exec mask.
9089 // Input and output modifiers not supported; this is an untyped operation.
9090 // SQ translates to V_MOV_B32.
9091 void
9093 {
9094 ConstScalarOperandU32 src0(gpuDynInst, extData.SRC0);
9095 ConstScalarOperandU32 src1(gpuDynInst, extData.SRC1);
9096 VecOperandU32 vdst(gpuDynInst, instData.VDST);
9097
9098 src0.read();
9099 src1.read();
9100 vdst.read();
9101
9102 vdst[src1.rawData() & 0x3f] = src0.rawData();
9103
9104 vdst.write();
9105 } // execute
9106 // --- Inst_VOP3__V_BCNT_U32_B32 class methods ---
9107
9109 : Inst_VOP3A(iFmt, "v_bcnt_u32_b32", false)
9110 {
9111 setFlag(ALU);
9112 } // Inst_VOP3__V_BCNT_U32_B32
9113
9115 {
9116 } // ~Inst_VOP3__V_BCNT_U32_B32
9117
9118 // --- description from .arch file ---
9119 // D.u = CountOneBits(S0.u) + S1.u. Bit count.
9120 void
9122 {
9123 Wavefront *wf = gpuDynInst->wavefront();
9124 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
9125 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
9126 VecOperandU32 vdst(gpuDynInst, instData.VDST);
9127
9128 src0.readSrc();
9129 src1.readSrc();
9130
9134 assert(!(instData.ABS & 0x1));
9135 assert(!(instData.ABS & 0x2));
9136 assert(!(instData.ABS & 0x4));
9137 assert(!(extData.NEG & 0x1));
9138 assert(!(extData.NEG & 0x2));
9139 assert(!(extData.NEG & 0x4));
9140
9141 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
9142 if (wf->execMask(lane)) {
9143 vdst[lane] = popCount(src0[lane]) + src1[lane];
9144 }
9145 }
9146
9147 vdst.write();
9148 } // execute
9149 // --- Inst_VOP3__V_MBCNT_LO_U32_B32 class methods ---
9150
9152 InFmt_VOP3A *iFmt)
9153 : Inst_VOP3A(iFmt, "v_mbcnt_lo_u32_b32", false)
9154 {
9155 setFlag(ALU);
9156 } // Inst_VOP3__V_MBCNT_LO_U32_B32
9157
9159 {
9160 } // ~Inst_VOP3__V_MBCNT_LO_U32_B32
9161
9162 // --- description from .arch file ---
9163 // ThreadMask = (1 << ThreadPosition) - 1;
9164 // D.u = CountOneBits(S0.u & ThreadMask[31:0]) + S1.u.
9165 // Masked bit count, ThreadPosition is the position of this thread in the
9166 // --- wavefront (in 0..63).
9167 void
9169 {
9170 Wavefront *wf = gpuDynInst->wavefront();
9171 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
9172 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
9173 VecOperandU32 vdst(gpuDynInst, instData.VDST);
9174 uint64_t threadMask = 0;
9175
9176 src0.readSrc();
9177 src1.readSrc();
9178
9182 assert(!(instData.ABS & 0x1));
9183 assert(!(instData.ABS & 0x2));
9184 assert(!(instData.ABS & 0x4));
9185 assert(!(extData.NEG & 0x1));
9186 assert(!(extData.NEG & 0x2));
9187 assert(!(extData.NEG & 0x4));
9188
9189 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
9190 if (wf->execMask(lane)) {
9191 threadMask = ((1ULL << lane) - 1ULL);
9192 vdst[lane] = popCount(src0[lane] & bits(threadMask, 31, 0)) +
9193 src1[lane];
9194 }
9195 }
9196
9197 vdst.write();
9198 } // execute
9199 // --- Inst_VOP3__V_MBCNT_HI_U32_B32 class methods ---
9200
9202 InFmt_VOP3A *iFmt)
9203 : Inst_VOP3A(iFmt, "v_mbcnt_hi_u32_b32", false)
9204 {
9205 setFlag(ALU);
9206 } // Inst_VOP3__V_MBCNT_HI_U32_B32
9207
9209 {
9210 } // ~Inst_VOP3__V_MBCNT_HI_U32_B32
9211
9212 // --- description from .arch file ---
9213 // ThreadMask = (1 << ThreadPosition) - 1;
9214 // D.u = CountOneBits(S0.u & ThreadMask[63:32]) + S1.u.
9215 // Masked bit count, ThreadPosition is the position of this thread in the
9216 // --- wavefront (in 0..63).
9217 void
9219 {
9220 Wavefront *wf = gpuDynInst->wavefront();
9221 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
9222 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
9223 VecOperandU32 vdst(gpuDynInst, instData.VDST);
9224 uint64_t threadMask = 0;
9225
9226 src0.readSrc();
9227 src1.readSrc();
9228
9232 assert(!(instData.ABS & 0x1));
9233 assert(!(instData.ABS & 0x2));
9234 assert(!(instData.ABS & 0x4));
9235 assert(!(extData.NEG & 0x1));
9236 assert(!(extData.NEG & 0x2));
9237 assert(!(extData.NEG & 0x4));
9238
9239 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
9240 if (wf->execMask(lane)) {
9241 threadMask = ((1ULL << lane) - 1ULL);
9242 vdst[lane] = popCount(src0[lane] & bits(threadMask, 63, 32)) +
9243 src1[lane];
9244 }
9245 }
9246
9247 vdst.write();
9248 } // execute
9249 // --- Inst_VOP3__V_LSHLREV_B64 class methods ---
9250
9252 : Inst_VOP3A(iFmt, "v_lshlrev_b64", false)
9253 {
9254 setFlag(ALU);
9255 } // Inst_VOP3__V_LSHLREV_B64
9256
9258 {
9259 } // ~Inst_VOP3__V_LSHLREV_B64
9260
9261 // --- description from .arch file ---
9262 // D.u64 = S1.u64 << S0.u[5:0].
9263 // SQ translates this to an internal SP opcode.
9264 void
9266 {
9267 Wavefront *wf = gpuDynInst->wavefront();
9268 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
9269 ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
9270 VecOperandU64 vdst(gpuDynInst, instData.VDST);
9271
9272 src0.readSrc();
9273 src1.readSrc();
9274
9278 assert(!(instData.ABS & 0x1));
9279 assert(!(instData.ABS & 0x2));
9280 assert(!(instData.ABS & 0x4));
9281 assert(!(extData.NEG & 0x1));
9282 assert(!(extData.NEG & 0x2));
9283 assert(!(extData.NEG & 0x4));
9284
9285 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
9286 if (wf->execMask(lane)) {
9287 vdst[lane] = src1[lane] << bits(src0[lane], 5, 0);
9288 }
9289 }
9290
9291 vdst.write();
9292 } // execute
9293 // --- Inst_VOP3__V_LSHRREV_B64 class methods ---
9294
9296 : Inst_VOP3A(iFmt, "v_lshrrev_b64", false)
9297 {
9298 setFlag(ALU);
9299 } // Inst_VOP3__V_LSHRREV_B64
9300
9302 {
9303 } // ~Inst_VOP3__V_LSHRREV_B64
9304
9305 // --- description from .arch file ---
9306 // D.u64 = S1.u64 >> S0.u[5:0].
9307 // The vacated bits are set to zero.
9308 // SQ translates this to an internal SP opcode.
9309 void
9311 {
9312 Wavefront *wf = gpuDynInst->wavefront();
9313 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
9314 ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
9315 VecOperandU64 vdst(gpuDynInst, instData.VDST);
9316
9317 src0.readSrc();
9318 src1.readSrc();
9319
9323 assert(!(instData.ABS & 0x1));
9324 assert(!(instData.ABS & 0x2));
9325 assert(!(instData.ABS & 0x4));
9326 assert(!(extData.NEG & 0x1));
9327 assert(!(extData.NEG & 0x2));
9328 assert(!(extData.NEG & 0x4));
9329
9330 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
9331 if (wf->execMask(lane)) {
9332 vdst[lane] = src1[lane] >> bits(src0[lane], 5, 0);
9333 }
9334 }
9335
9336 vdst.write();
9337 } // execute
9338 // --- Inst_VOP3__V_ASHRREV_I64 class methods ---
9339
9341 : Inst_VOP3A(iFmt, "v_ashrrev_i64", false)
9342 {
9343 setFlag(ALU);
9344 } // Inst_VOP3__V_ASHRREV_I64
9345
9347 {
9348 } // ~Inst_VOP3__V_ASHRREV_I64
9349
9350 // --- description from .arch file ---
9351 // D.u64 = signext(S1.u64) >> S0.u[5:0].
9352 // The vacated bits are set to the sign bit of the input value.
9353 // SQ translates this to an internal SP opcode.
9354 void
9356 {
9357 Wavefront *wf = gpuDynInst->wavefront();
9358 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
9359 ConstVecOperandI64 src1(gpuDynInst, extData.SRC1);
9360 VecOperandU64 vdst(gpuDynInst, instData.VDST);
9361
9362 src0.readSrc();
9363 src1.readSrc();
9364
9368 assert(!(instData.ABS & 0x1));
9369 assert(!(instData.ABS & 0x2));
9370 assert(!(instData.ABS & 0x4));
9371 assert(!(extData.NEG & 0x1));
9372 assert(!(extData.NEG & 0x2));
9373 assert(!(extData.NEG & 0x4));
9374
9375 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
9376 if (wf->execMask(lane)) {
9377 vdst[lane]
9378 = src1[lane] >> bits(src0[lane], 5, 0);
9379 }
9380 }
9381
9382 vdst.write();
9383 } // execute
9384 // --- Inst_VOP3__V_TRIG_PREOP_F64 class methods ---
9385
9387 : Inst_VOP3A(iFmt, "v_trig_preop_f64", false)
9388 {
9389 setFlag(ALU);
9390 setFlag(F64);
9391 } // Inst_VOP3__V_TRIG_PREOP_F64
9392
9394 {
9395 } // ~Inst_VOP3__V_TRIG_PREOP_F64
9396
9397 // --- description from .arch file ---
9398 // D.d = Look Up 2/PI (S0.d) with segment select S1.u[4:0]. This operation
9399 // returns an aligned, double precision segment of 2/PI needed to do range
9400 // reduction on S0.d (double-precision value). Multiple segments can be
9401 // specified through S1.u[4:0]. Rounding is always round-to-zero. Large
9402 // inputs (exp > 1968) are scaled to avoid loss of precision through
9403 // denormalization.
9404 void
9406 {
9408 } // execute
9409 // --- Inst_VOP3__V_BFM_B32 class methods ---
9410
9412 : Inst_VOP3A(iFmt, "v_bfm_b32", false)
9413 {
9414 setFlag(ALU);
9415 } // Inst_VOP3__V_BFM_B32
9416
9418 {
9419 } // ~Inst_VOP3__V_BFM_B32
9420
9421 // --- description from .arch file ---
9422 // D.u = ((1<<S0.u[4:0])-1) << S1.u[4:0]; S0 is the bitfield width and S1
9423 // is the bitfield offset.
9424 void
9426 {
9427 Wavefront *wf = gpuDynInst->wavefront();
9428 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
9429 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
9430 VecOperandU32 vdst(gpuDynInst, instData.VDST);
9431
9432 src0.readSrc();
9433 src1.readSrc();
9434
9438 assert(!(instData.ABS & 0x1));
9439 assert(!(instData.ABS & 0x2));
9440 assert(!(instData.ABS & 0x4));
9441 assert(!(extData.NEG & 0x1));
9442 assert(!(extData.NEG & 0x2));
9443 assert(!(extData.NEG & 0x4));
9444
9445 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
9446 if (wf->execMask(lane)) {
9447 vdst[lane] = ((1 << bits(src0[lane], 4, 0)) - 1)
9448 << bits(src1[lane], 4, 0);
9449 }
9450 }
9451
9452 vdst.write();
9453 } // execute
9454 // --- Inst_VOP3__V_CVT_PKNORM_I16_F32 class methods ---
9455
9457 InFmt_VOP3A *iFmt)
9458 : Inst_VOP3A(iFmt, "v_cvt_pknorm_i16_f32", false)
9459 {
9460 setFlag(ALU);
9461 setFlag(F32);
9462 } // Inst_VOP3__V_CVT_PKNORM_I16_F32
9463
9465 {
9466 } // ~Inst_VOP3__V_CVT_PKNORM_I16_F32
9467
9468 // --- description from .arch file ---
9469 // D = {(snorm)S1.f, (snorm)S0.f}.
9470 void
9475 // --- Inst_VOP3__V_CVT_PKNORM_U16_F32 class methods ---
9476
9478 InFmt_VOP3A *iFmt)
9479 : Inst_VOP3A(iFmt, "v_cvt_pknorm_u16_f32", false)
9480 {
9481 setFlag(ALU);
9482 setFlag(F32);
9483 } // Inst_VOP3__V_CVT_PKNORM_U16_F32
9484
9486 {
9487 } // ~Inst_VOP3__V_CVT_PKNORM_U16_F32
9488
9489 // --- description from .arch file ---
9490 // D = {(unorm)S1.f, (unorm)S0.f}.
9491 void
9496 // --- Inst_VOP3__V_CVT_PKRTZ_F16_F32 class methods ---
9497
9499 InFmt_VOP3A *iFmt)
9500 : Inst_VOP3A(iFmt, "v_cvt_pkrtz_f16_f32", false)
9501 {
9502 setFlag(ALU);
9503 setFlag(F32);
9504 } // Inst_VOP3__V_CVT_PKRTZ_F16_F32
9505
9507 {
9508 } // ~Inst_VOP3__V_CVT_PKRTZ_F16_F32
9509
9510 // --- description from .arch file ---
9511 // D = {flt32_to_flt16(S1.f),flt32_to_flt16(S0.f)}, with round-toward-zero
9512 // --- regardless of current round mode setting in hardware.
9513 // This opcode is intended for use with 16-bit compressed exports.
9514 // See V_CVT_F16_F32 for a version that respects the current rounding mode.
9515 void
9520 // --- Inst_VOP3__V_CVT_PK_U16_U32 class methods ---
9521
9523 : Inst_VOP3A(iFmt, "v_cvt_pk_u16_u32", false)
9524 {
9525 setFlag(ALU);
9526 } // Inst_VOP3__V_CVT_PK_U16_U32
9527
9529 {
9530 } // ~Inst_VOP3__V_CVT_PK_U16_U32
9531
9532 // --- description from .arch file ---
9533 // D = {uint32_to_uint16(S1.u), uint32_to_uint16(S0.u)}.
9534 void
9536 {
9538 } // execute
9539 // --- Inst_VOP3__V_CVT_PK_I16_I32 class methods ---
9540
9542 : Inst_VOP3A(iFmt, "v_cvt_pk_i16_i32", false)
9543 {
9544 setFlag(ALU);
9545 } // Inst_VOP3__V_CVT_PK_I16_I32
9546
9548 {
9549 } // ~Inst_VOP3__V_CVT_PK_I16_I32
9550
9551 // --- description from .arch file ---
9552 // D = {int32_to_int16(S1.i), int32_to_int16(S0.i)}.
9553 void
9555 {
9557 } // execute
9558 // --- Inst_VOP3__V_CVT_PK_FP8_F32 class methods ---
9559
9561 : Inst_VOP3A(iFmt, "v_cvt_pk_fp8_f32", false)
9562 {
9563 setFlag(ALU);
9564 } // Inst_VOP3__V_CVT_PK_FP8_F32
9565
9567 {
9568 } // ~Inst_VOP3__V_CVT_PK_FP8_F32
9569
9570 void
9572 {
9573 Wavefront *wf = gpuDynInst->wavefront();
9574 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
9575 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
9576 VecOperandU32 vdst(gpuDynInst, instData.VDST);
9577
9578 src0.readSrc();
9579 src1.readSrc();
9580 vdst.read(); // Preserve bits
9581
9582 panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
9583 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
9584 panic_if(instData.CLAMP, "CLAMP not supported for %s", _opcode);
9585 panic_if(extData.OMOD, "OMOD not supported for %s", _opcode);
9586
9587 unsigned opsel = instData.OPSEL;
9588 unsigned abs = instData.ABS;
9589 unsigned neg = extData.NEG;
9590
9591 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
9592 if (wf->execMask(lane)) {
9593 AMDGPU::mxfloat8 tmp0(src0[lane]), tmp1(src1[lane]);
9594
9595 if (abs & 1) tmp0.fabs();
9596 if (abs & 2) tmp1.fabs();
9597 if (neg & 1) tmp0.neg();
9598 if (neg & 2) tmp1.neg();
9599
9600 uint16_t packed_data = (bits(tmp1.data, 31, 24) << 8)
9601 | bits(tmp0.data, 31, 24);
9602
9603 if (opsel & 8) {
9604 replaceBits(vdst[lane], 31, 16, packed_data);
9605 } else {
9606 replaceBits(vdst[lane], 15, 0, packed_data);
9607 }
9608 }
9609 }
9610
9611 vdst.write();
9612 } // execute
9613 // --- Inst_VOP3__V_CVT_PK_BF8_F32 class methods ---
9614
9616 : Inst_VOP3A(iFmt, "v_cvt_pk_bf8_f32", false)
9617 {
9618 setFlag(ALU);
9619 } // Inst_VOP3__V_CVT_PK_BF8_F32
9620
9622 {
9623 } // ~Inst_VOP3__V_CVT_PK_BF8_F32
9624
9625 void
9627 {
9628 Wavefront *wf = gpuDynInst->wavefront();
9629 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
9630 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
9631 VecOperandU32 vdst(gpuDynInst, instData.VDST);
9632
9633 src0.readSrc();
9634 src1.readSrc();
9635 vdst.read(); // Preserve bits
9636
9637 panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
9638 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
9639 panic_if(instData.CLAMP, "CLAMP not supported for %s", _opcode);
9640 panic_if(extData.OMOD, "OMOD not supported for %s", _opcode);
9641
9642 unsigned opsel = instData.OPSEL;
9643 unsigned abs = instData.ABS;
9644 unsigned neg = extData.NEG;
9645
9646 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
9647 if (wf->execMask(lane)) {
9648 AMDGPU::mxbfloat8 tmp0(src0[lane]), tmp1(src1[lane]);
9649
9650 if (abs & 1) tmp0.fabs();
9651 if (abs & 2) tmp1.fabs();
9652 if (neg & 1) tmp0.neg();
9653 if (neg & 2) tmp1.neg();
9654
9655 uint16_t packed_data = (bits(tmp1.data, 31, 24) << 8)
9656 | bits(tmp0.data, 31, 24);
9657
9658 if (opsel & 8) {
9659 replaceBits(vdst[lane], 31, 16, packed_data);
9660 } else {
9661 replaceBits(vdst[lane], 15, 0, packed_data);
9662 }
9663 }
9664 }
9665
9666 vdst.write();
9667 } // execute
9668 // --- Inst_VOP3__V_CVT_SR_FP8_F32 class methods ---
9669
9672 : Inst_VOP3A(iFmt, "v_cvt_sr_fp8_f32", false)
9673 {
9674 setFlag(ALU);
9675 } // Inst_VOP3__V_CVT_SR_FP8_F32
9676
9678 {
9679 } // ~Inst_VOP3__V_CVT_SR_FP8_F32
9680
9681 void
9683 {
9684 Wavefront *wf = gpuDynInst->wavefront();
9685 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
9686 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
9687 VecOperandU32 vdst(gpuDynInst, instData.VDST);
9688
9689 src0.readSrc();
9690 src1.readSrc();
9691 vdst.read();
9692
9693 panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
9694 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
9695 panic_if(instData.CLAMP, "CLAMP not supported for %s", _opcode);
9696 panic_if(extData.OMOD, "OMOD not supported for %s", _opcode);
9697
9698 unsigned abs = instData.ABS;
9699 unsigned neg = extData.NEG;
9700 int opsel = instData.OPSEL;
9701 panic_if(opsel & 0x3, "Source OPSEL not supported for %s", _opcode);
9702 opsel = bits(opsel, 3, 2);
9703
9704 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
9705 if (wf->execMask(lane)) {
9706 AMDGPU::mxfloat32 in(src0[lane]);
9707 AMDGPU::mxfloat8 cvt;
9708
9709 if (abs & 1) in = std::fabs(src0[lane]);
9710 if (neg & 1) in = -in;
9711
9712 using sInfo = decltype(in.getFmt());
9713 using dInfo = decltype(cvt.getFmt());
9714 dInfo cvt_info = AMDGPU::convertMXFP<dInfo, sInfo>(
9715 in.getFmt(), AMDGPU::roundStochastic, src1[lane]
9716 );
9717 cvt.setFmt(cvt_info);
9718
9719 if (opsel == 0) {
9720 replaceBits(vdst[lane], 7, 0, cvt);
9721 } else if (opsel == 1) {
9722 replaceBits(vdst[lane], 15, 8, cvt);
9723 } else if (opsel == 2) {
9724 replaceBits(vdst[lane], 23, 16, cvt);
9725 } else {
9726 replaceBits(vdst[lane], 31, 24, cvt);
9727 }
9728 }
9729 }
9730
9731 vdst.write();
9732 } // execute
9733 // --- Inst_VOP3__V_CVT_SR_BF8_F32 class methods ---
9734
9737 : Inst_VOP3A(iFmt, "v_cvt_sr_fp8_f32", false)
9738 {
9739 setFlag(ALU);
9740 } // Inst_VOP3__V_CVT_SR_BF8_F32
9741
9743 {
9744 } // ~Inst_VOP3__V_CVT_SR_BF8_F32
9745
9746 void
9748 {
9749 Wavefront *wf = gpuDynInst->wavefront();
9750 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
9751 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
9752 VecOperandU32 vdst(gpuDynInst, instData.VDST);
9753
9754 src0.readSrc();
9755 src1.readSrc();
9756 vdst.read();
9757
9758 panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
9759 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
9760 panic_if(instData.CLAMP, "CLAMP not supported for %s", _opcode);
9761 panic_if(extData.OMOD, "OMOD not supported for %s", _opcode);
9762
9763 unsigned abs = instData.ABS;
9764 unsigned neg = extData.NEG;
9765 int opsel = instData.OPSEL;
9766 panic_if(opsel & 0x3, "Source OPSEL not supported for %s", _opcode);
9767 opsel = bits(opsel, 3, 2);
9768
9769 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
9770 if (wf->execMask(lane)) {
9771 AMDGPU::mxfloat32 in(src0[lane]);
9773
9774 if (abs & 1) in = std::fabs(src0[lane]);
9775 if (neg & 1) in = -in;
9776
9777 using sInfo = decltype(in.getFmt());
9778 using dInfo = decltype(cvt.getFmt());
9779 dInfo cvt_info = AMDGPU::convertMXFP<dInfo, sInfo>(
9780 in.getFmt(), AMDGPU::roundStochastic, src1[lane]
9781 );
9782 cvt.setFmt(cvt_info);
9783
9784 if (opsel == 0) {
9785 replaceBits(vdst[lane], 7, 0, cvt);
9786 } else if (opsel == 1) {
9787 replaceBits(vdst[lane], 15, 8, cvt);
9788 } else if (opsel == 2) {
9789 replaceBits(vdst[lane], 23, 16, cvt);
9790 } else {
9791 replaceBits(vdst[lane], 31, 24, cvt);
9792 }
9793 }
9794 }
9795
9796 vdst.write();
9797 } // execute
9798 // --- Inst_VOP3__V_CVT_SR_F16_F32 class methods ---
9799
9802 : Inst_VOP3A(iFmt, "v_cvt_sr_f16_f32", false)
9803 {
9804 setFlag(ALU);
9805 } // Inst_VOP3__V_CVT_SR_F16_F32
9806
9808 {
9809 } // ~Inst_VOP3__V_CVT_SR_F16_F32
9810
9811 void
9813 {
9814 Wavefront *wf = gpuDynInst->wavefront();
9815 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
9816 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
9817 VecOperandU32 vdst(gpuDynInst, instData.VDST);
9818
9819 src0.readSrc();
9820 src1.readSrc();
9821 vdst.read();
9822
9823 panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
9824 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
9825 panic_if(instData.CLAMP, "CLAMP not supported for %s", _opcode);
9826 panic_if(extData.OMOD, "OMOD not supported for %s", _opcode);
9827
9828 unsigned abs = instData.ABS;
9829 unsigned neg = extData.NEG;
9830 int opsel = instData.OPSEL;
9831 panic_if(opsel & 0x7, "Source OPSEL not supported for %s", _opcode);
9832
9833 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
9834 if (wf->execMask(lane)) {
9835 AMDGPU::mxfloat32 in = src0[lane];
9837
9838 if (abs & 1) in = std::fabs(src0[lane]);
9839 if (neg & 1) in = -in;
9840
9841 using sInfo = decltype(in.getFmt());
9842 using dInfo = decltype(cvt.getFmt());
9843 dInfo cvt_info = AMDGPU::convertMXFP<dInfo, sInfo>(
9844 in.getFmt(), AMDGPU::roundStochastic, src1[lane]
9845 );
9846 cvt.setFmt(cvt_info);
9847
9848 if (opsel & 8) {
9849 replaceBits(vdst[lane], 31, 16, cvt.data >> 16);
9850 } else {
9851 replaceBits(vdst[lane], 15, 0, cvt.data >> 16);
9852 }
9853 }
9854 }
9855
9856 vdst.write();
9857 } // execute
9858 // --- Inst_VOP3__V_CVT_SR_BF16_F32 class methods ---
9859
9862 : Inst_VOP3A(iFmt, "v_cvt_sr_bf16_f32", false)
9863 {
9864 setFlag(ALU);
9865 } // Inst_VOP3__V_CVT_SR_BF16_F32
9866
9868 {
9869 } // ~Inst_VOP3__V_CVT_SR_BF16_F32
9870
9871 // --- description from .arch file ---
9872 // D = {int32_to_int16(S1.i), int32_to_int16(S0.i)}.
9873 void
9875 {
9876 Wavefront *wf = gpuDynInst->wavefront();
9877 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
9878 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
9879 VecOperandU32 vdst(gpuDynInst, instData.VDST);
9880
9881 src0.readSrc();
9882 src1.readSrc();
9883 vdst.read(); // Preserve bits
9884
9885 panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
9886 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
9887 panic_if(instData.CLAMP, "CLAMP not supported for %s", _opcode);
9888 panic_if(extData.OMOD, "OMOD not supported for %s", _opcode);
9889
9890 unsigned abs = instData.ABS;
9891 unsigned neg = extData.NEG;
9892 int opsel = instData.OPSEL;
9893 panic_if(opsel & 0x7, "Source OPSEL not supported for %s", _opcode);
9894
9895 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
9896 if (wf->execMask(lane)) {
9897 AMDGPU::mxfloat32 in = src0[lane];
9899
9900 if (abs & 1) in = std::fabs(src0[lane]);
9901 if (neg & 1) in = -in;
9902
9903 using sInfo = decltype(in.getFmt());
9904 using dInfo = decltype(cvt.getFmt());
9905 dInfo cvt_info = AMDGPU::convertMXFP<dInfo, sInfo>(
9906 in.getFmt(), AMDGPU::roundStochastic, src1[lane]
9907 );
9908 cvt.setFmt(cvt_info);
9909
9910 if (opsel & 8) {
9911 replaceBits(vdst[lane], 31, 16, cvt.data >> 16);
9912 } else {
9913 replaceBits(vdst[lane], 15, 0, cvt.data >> 16);
9914 }
9915 }
9916 }
9917
9918 vdst.write();
9919 } // execute
9920 // --- Inst_VOP3__V_PERMLANE16_SWAP_B32 class methods ---
9921
9923 InFmt_VOP3A *iFmt)
9924 : Inst_VOP3A(iFmt, "v_permlane16_swap_b32", false)
9925 {
9926 setFlag(ALU);
9927 } // Inst_VOP3__V_PERMLANE16_SWAP_B32
9928
9930 {} // ~Inst_VOP3__V_PERMLANE16_SWAP_B32
9931
9932 // Swap data between two vector registers. Odd rows of the first operand
9933 // are swapped with even rows of the second operand (one row is 16 lanes).
9934 //
9935 // Notes: ABS, NEG and OMOD modifiers should all be zeroed for this
9936 // instruction. This instruction is useful for BFP data conversions.
9937 void
9939 {
9940 VecOperandU32 src0(gpuDynInst, extData.SRC0);
9941 VecOperandU32 vdst(gpuDynInst, instData.VDST);
9942
9943 src0.read();
9944 vdst.read();
9945
9946 panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
9947 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
9948 panic_if(instData.CLAMP, "CLAMP not supported for %s", _opcode);
9949 panic_if(extData.OMOD, "OMOD not supported for %s", _opcode);
9950 panic_if(instData.ABS, "ABS not supported for %s", _opcode);
9951 panic_if(extData.NEG, "NEG not supported for %s", _opcode);
9952
9953 // Ignores EXEC MASK
9954 for (int pass = 0; pass < 2; ++pass) {
9955 for (int lane = 0; lane < 16; ++lane) {
9956 int dlane = pass * 32 + lane + 16;
9957 int slane = pass * 32 + lane;
9958
9959 VecElemU32 tmp = src0[slane];
9960 src0[slane] = vdst[dlane];
9961 vdst[dlane] = tmp;
9962 }
9963 }
9964
9965 src0.write();
9966 vdst.write();
9967 } // execute
9968 // --- Inst_VOP3__V_PERMLANE32_SWAP_B32 class methods ---
9969
9971 InFmt_VOP3A *iFmt)
9972 : Inst_VOP3A(iFmt, "v_permlane32_swap_b32", false)
9973 {
9974 setFlag(ALU);
9975 } // Inst_VOP3__V_PERMLANE32_SWAP_B32
9976
9978 {} // ~Inst_VOP3__V_PERMLANE32_SWAP_B32
9979
9980 void
9982 {
9983 VecOperandU32 src0(gpuDynInst, extData.SRC0);
9984 VecOperandU32 vdst(gpuDynInst, instData.VDST);
9985
9986 src0.read();
9987 vdst.read();
9988
9989 panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
9990 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
9991 panic_if(instData.CLAMP, "CLAMP not supported for %s", _opcode);
9992 panic_if(extData.OMOD, "OMOD not supported for %s", _opcode);
9993 panic_if(instData.ABS, "ABS not supported for %s", _opcode);
9994 panic_if(extData.NEG, "NEG not supported for %s", _opcode);
9995
9996 // Ignores EXEC MASK
9997 for (int lane = 0; lane < 32; ++lane) {
9998 VecElemU32 tmp = src0[lane];
9999 src0[lane] = vdst[lane + 32];
10000 vdst[lane + 32] = tmp;
10001 }
10002
10003 src0.write();
10004 vdst.write();
10005 } // execute
10006} // namespace VegaISA
10007} // namespace gem5
#define DPRINTF(x,...)
Definition trace.hh:209
void clamp(bool do_clamp)
Definition mxfp.hh:228
FMT getFmt() const
Definition mxfp.hh:115
void setFmt(FMT in)
Definition mxfp.hh:123
void omodModifier(unsigned omod)
Definition mxfp.hh:209
uint32_t data
Definition mxfp.hh:112
void setFlag(Flags flag)
const std::string _opcode
Nop class.
Definition nop.hh:49
Inst_VOP3A(InFmt_VOP3A *, const std::string &opcode, bool sgpr_dst)
T omodModifier(T val, unsigned omod)
Inst_VOP3B(InFmt_VOP3B *, const std::string &opcode)
void execute(GPUDynInstPtr) override
Definition vop3.cc:7532
void execute(GPUDynInstPtr) override
Definition vop3.cc:1496
void execute(GPUDynInstPtr) override
Definition vop3.cc:1350
void execute(GPUDynInstPtr) override
Definition vop3.cc:1654
Inst_VOP3__V_ADD_F16(InFmt_VOP3A *)
Definition vop3.cc:1639
Inst_VOP3__V_ADD_F32(InFmt_VOP3A *)
Definition vop3.cc:81
void execute(GPUDynInstPtr) override
Definition vop3.cc:95
void execute(GPUDynInstPtr) override
Definition vop3.cc:8505
Inst_VOP3__V_ADD_F64(InFmt_VOP3A *)
Definition vop3.cc:8491
void execute(GPUDynInstPtr) override
Definition vop3.cc:7486
Inst_VOP3__V_ADD_U16(InFmt_VOP3A *)
Definition vop3.cc:1748
void execute(GPUDynInstPtr) override
Definition vop3.cc:1762
void execute(GPUDynInstPtr) override
Definition vop3.cc:2338
Inst_VOP3__V_ADD_U32(InFmt_VOP3A *)
Definition vop3.cc:2325
void execute(GPUDynInstPtr) override
Definition vop3.cc:5958
void execute(GPUDynInstPtr) override
Definition vop3.cc:6006
void execute(GPUDynInstPtr) override
Definition vop3.cc:1065
Inst_VOP3__V_AND_B32(InFmt_VOP3A *)
Definition vop3.cc:1051
void execute(GPUDynInstPtr) override
Definition vop3.cc:7624
void execute(GPUDynInstPtr) override
Definition vop3.cc:2035
void execute(GPUDynInstPtr) override
Definition vop3.cc:977
void execute(GPUDynInstPtr) override
Definition vop3.cc:9355
void execute(GPUDynInstPtr) override
Definition vop3.cc:8114
void execute(GPUDynInstPtr) override
Definition vop3.cc:8170
void execute(GPUDynInstPtr) override
Definition vop3.cc:9121
Inst_VOP3__V_BFE_I32(InFmt_VOP3A *)
Definition vop3.cc:5662
void execute(GPUDynInstPtr) override
Definition vop3.cc:5676
void execute(GPUDynInstPtr) override
Definition vop3.cc:5629
Inst_VOP3__V_BFE_U32(InFmt_VOP3A *)
Definition vop3.cc:5615
Inst_VOP3__V_BFI_B32(InFmt_VOP3A *)
Definition vop3.cc:5716
void execute(GPUDynInstPtr) override
Definition vop3.cc:5729
Inst_VOP3__V_BFM_B32(InFmt_VOP3A *)
Definition vop3.cc:9411
void execute(GPUDynInstPtr) override
Definition vop3.cc:9425
void execute(GPUDynInstPtr) override
Definition vop3.cc:4352
void execute(GPUDynInstPtr) override
Definition vop3.cc:7990
void execute(GPUDynInstPtr) override
Definition vop3.cc:8057
void execute(GPUDynInstPtr) override
Definition vop3.cc:5027
void execute(GPUDynInstPtr) override
Definition vop3.cc:3711
void execute(GPUDynInstPtr) override
Definition vop3.cc:3508
void execute(GPUDynInstPtr) override
Definition vop3.cc:4732
Inst_VOP3__V_CLREXCP(InFmt_VOP3A *)
Definition vop3.cc:4720
Inst_VOP3__V_CNDMASK_B32(InFmt_VOP3A *)
Definition vop3.cc:43
void execute(GPUDynInstPtr) override
Definition vop3.cc:58
void execute(GPUDynInstPtr) override
Definition vop3.cc:5130
Inst_VOP3__V_COS_F16(InFmt_VOP3A *)
Definition vop3.cc:5116
void execute(GPUDynInstPtr) override
Definition vop3.cc:4270
Inst_VOP3__V_COS_F32(InFmt_VOP3A *)
Definition vop3.cc:4254
void execute(GPUDynInstPtr) override
Definition vop3.cc:5546
void execute(GPUDynInstPtr) override
Definition vop3.cc:5609
void execute(GPUDynInstPtr) override
Definition vop3.cc:5567
void execute(GPUDynInstPtr) override
Definition vop3.cc:5588
void execute(GPUDynInstPtr) override
Definition vop3.cc:2905
void execute(GPUDynInstPtr) override
Definition vop3.cc:4774
void execute(GPUDynInstPtr) override
Definition vop3.cc:4753
void execute(GPUDynInstPtr) override
Definition vop3.cc:5278
void execute(GPUDynInstPtr) override
Definition vop3.cc:2963
void execute(GPUDynInstPtr) override
Definition vop3.cc:3115
void execute(GPUDynInstPtr) override
Definition vop3.cc:2682
void execute(GPUDynInstPtr) override
Definition vop3.cc:2724
void execute(GPUDynInstPtr) override
Definition vop3.cc:3211
void execute(GPUDynInstPtr) override
Definition vop3.cc:3251
void execute(GPUDynInstPtr) override
Definition vop3.cc:3291
void execute(GPUDynInstPtr) override
Definition vop3.cc:3331
void execute(GPUDynInstPtr) override
Definition vop3.cc:3163
void execute(GPUDynInstPtr) override
Definition vop3.cc:2642
void execute(GPUDynInstPtr) override
Definition vop3.cc:3427
void execute(GPUDynInstPtr) override
Definition vop3.cc:3054
void execute(GPUDynInstPtr) override
Definition vop3.cc:4816
void execute(GPUDynInstPtr) override
Definition vop3.cc:2822
void execute(GPUDynInstPtr) override
Definition vop3.cc:2590
void execute(GPUDynInstPtr) override
Definition vop3.cc:3094
void execute(GPUDynInstPtr) override
Definition vop3.cc:7972
void execute(GPUDynInstPtr) override
Definition vop3.cc:9471
void execute(GPUDynInstPtr) override
Definition vop3.cc:9492
void execute(GPUDynInstPtr) override
Definition vop3.cc:9516
void execute(GPUDynInstPtr) override
Definition vop3.cc:8286
void execute(GPUDynInstPtr) override
Definition vop3.cc:9626
void execute(GPUDynInstPtr) override
Definition vop3.cc:8227
void execute(GPUDynInstPtr) override
Definition vop3.cc:9571
void execute(GPUDynInstPtr) override
Definition vop3.cc:9554
void execute(GPUDynInstPtr) override
Definition vop3.cc:9535
void execute(GPUDynInstPtr) override
Definition vop3.cc:6716
void execute(GPUDynInstPtr) override
Definition vop3.cc:3013
void execute(GPUDynInstPtr) override
Definition vop3.cc:9874
void execute(GPUDynInstPtr) override
Definition vop3.cc:9747
void execute(GPUDynInstPtr) override
Definition vop3.cc:9812
void execute(GPUDynInstPtr) override
Definition vop3.cc:9682
void execute(GPUDynInstPtr) override
Definition vop3.cc:4795
void execute(GPUDynInstPtr) override
Definition vop3.cc:2766
void execute(GPUDynInstPtr) override
Definition vop3.cc:3373
void execute(GPUDynInstPtr) override
Definition vop3.cc:7899
void execute(GPUDynInstPtr) override
Definition vop3.cc:6773
void execute(GPUDynInstPtr) override
Definition vop3.cc:6851
void execute(GPUDynInstPtr) override
Definition vop3.cc:7089
void execute(GPUDynInstPtr) override
Definition vop3.cc:7152
void execute(GPUDynInstPtr) override
Definition vop3.cc:6946
void execute(GPUDynInstPtr) override
Definition vop3.cc:7003
void execute(GPUDynInstPtr) override
Definition vop3.cc:1240
Inst_VOP3__V_EXP_F16(InFmt_VOP3A *)
Definition vop3.cc:4914
void execute(GPUDynInstPtr) override
Definition vop3.cc:4931
void execute(GPUDynInstPtr) override
Definition vop3.cc:3832
Inst_VOP3__V_EXP_F32(InFmt_VOP3A *)
Definition vop3.cc:3818
void execute(GPUDynInstPtr) override
Definition vop3.cc:5150
void execute(GPUDynInstPtr) override
Definition vop3.cc:4464
void execute(GPUDynInstPtr) override
Definition vop3.cc:4384
void execute(GPUDynInstPtr) override
Definition vop3.cc:4424
void execute(GPUDynInstPtr) override
Definition vop3.cc:5006
void execute(GPUDynInstPtr) override
Definition vop3.cc:3792
void execute(GPUDynInstPtr) override
Definition vop3.cc:3589
void execute(GPUDynInstPtr) override
Definition vop3.cc:2469
Inst_VOP3__V_FMA_F16(InFmt_VOP3A *)
Definition vop3.cc:7835
void execute(GPUDynInstPtr) override
Definition vop3.cc:7851
void execute(GPUDynInstPtr) override
Definition vop3.cc:5777
Inst_VOP3__V_FMA_F32(InFmt_VOP3A *)
Definition vop3.cc:5762
Inst_VOP3__V_FMA_F64(InFmt_VOP3A *)
Definition vop3.cc:5823
void execute(GPUDynInstPtr) override
Definition vop3.cc:5838
void execute(GPUDynInstPtr) override
Definition vop3.cc:5090
void execute(GPUDynInstPtr) override
Definition vop3.cc:3629
void execute(GPUDynInstPtr) override
Definition vop3.cc:4592
void execute(GPUDynInstPtr) override
Definition vop3.cc:4985
void execute(GPUDynInstPtr) override
Definition vop3.cc:4638
void execute(GPUDynInstPtr) override
Definition vop3.cc:4505
void execute(GPUDynInstPtr) override
Definition vop3.cc:4958
void execute(GPUDynInstPtr) override
Definition vop3.cc:4689
void execute(GPUDynInstPtr) override
Definition vop3.cc:4551
void execute(GPUDynInstPtr) override
Definition vop3.cc:8399
void execute(GPUDynInstPtr) override
Definition vop3.cc:8428
void execute(GPUDynInstPtr) override
Definition vop3.cc:8458
void execute(GPUDynInstPtr) override
Definition vop3.cc:8354
void execute(GPUDynInstPtr) override
Definition vop3.cc:8485
void execute(GPUDynInstPtr) override
Definition vop3.cc:8379
void execute(GPUDynInstPtr) override
Definition vop3.cc:2319
void execute(GPUDynInstPtr) override
Definition vop3.cc:9015
void execute(GPUDynInstPtr) override
Definition vop3.cc:8817
void execute(GPUDynInstPtr) override
Definition vop3.cc:5903
Inst_VOP3__V_LERP_U8(InFmt_VOP3A *)
Definition vop3.cc:5884
void execute(GPUDynInstPtr) override
Definition vop3.cc:4908
Inst_VOP3__V_LOG_F16(InFmt_VOP3A *)
Definition vop3.cc:4891
void execute(GPUDynInstPtr) override
Definition vop3.cc:3872
Inst_VOP3__V_LOG_F32(InFmt_VOP3A *)
Definition vop3.cc:3858
void execute(GPUDynInstPtr) override
Definition vop3.cc:5198
void execute(GPUDynInstPtr) override
Definition vop3.cc:1939
void execute(GPUDynInstPtr) override
Definition vop3.cc:1021
void execute(GPUDynInstPtr) override
Definition vop3.cc:9265
void execute(GPUDynInstPtr) override
Definition vop3.cc:7440
void execute(GPUDynInstPtr) override
Definition vop3.cc:7918
void execute(GPUDynInstPtr) override
Definition vop3.cc:7577
void execute(GPUDynInstPtr) override
Definition vop3.cc:1984
void execute(GPUDynInstPtr) override
Definition vop3.cc:932
void execute(GPUDynInstPtr) override
Definition vop3.cc:9310
void execute(GPUDynInstPtr) override
Definition vop3.cc:1742
Inst_VOP3__V_MAC_F16(InFmt_VOP3A *)
Definition vop3.cc:1725
Inst_VOP3__V_MAC_F32(InFmt_VOP3A *)
Definition vop3.cc:1274
void execute(GPUDynInstPtr) override
Definition vop3.cc:1290
void execute(GPUDynInstPtr) override
Definition vop3.cc:7672
Inst_VOP3__V_MAD_F16(InFmt_VOP3A *)
Definition vop3.cc:7656
void execute(GPUDynInstPtr) override
Definition vop3.cc:5391
Inst_VOP3__V_MAD_F32(InFmt_VOP3A *)
Definition vop3.cc:5376
Inst_VOP3__V_MAD_I16(InFmt_VOP3A *)
Definition vop3.cc:7725
void execute(GPUDynInstPtr) override
Definition vop3.cc:7740
void execute(GPUDynInstPtr) override
Definition vop3.cc:5451
void execute(GPUDynInstPtr) override
Definition vop3.cc:7350
void execute(GPUDynInstPtr) override
Definition vop3.cc:5330
void execute(GPUDynInstPtr) override
Definition vop3.cc:7693
Inst_VOP3__V_MAD_U16(InFmt_VOP3A *)
Definition vop3.cc:7678
void execute(GPUDynInstPtr) override
Definition vop3.cc:5498
void execute(GPUDynInstPtr) override
Definition vop3.cc:7301
void execute(GPUDynInstPtr) override
Definition vop3.cc:6209
void execute(GPUDynInstPtr) override
Definition vop3.cc:6269
void execute(GPUDynInstPtr) override
Definition vop3.cc:6315
void execute(GPUDynInstPtr) override
Definition vop3.cc:2081
Inst_VOP3__V_MAX_F16(InFmt_VOP3A *)
Definition vop3.cc:2065
void execute(GPUDynInstPtr) override
Definition vop3.cc:703
Inst_VOP3__V_MAX_F32(InFmt_VOP3A *)
Definition vop3.cc:689
Inst_VOP3__V_MAX_F64(InFmt_VOP3A *)
Definition vop3.cc:8747
void execute(GPUDynInstPtr) override
Definition vop3.cc:8761
void execute(GPUDynInstPtr) override
Definition vop3.cc:2171
Inst_VOP3__V_MAX_I16(InFmt_VOP3A *)
Definition vop3.cc:2158
void execute(GPUDynInstPtr) override
Definition vop3.cc:801
Inst_VOP3__V_MAX_I32(InFmt_VOP3A *)
Definition vop3.cc:788
Inst_VOP3__V_MAX_U16(InFmt_VOP3A *)
Definition vop3.cc:2109
void execute(GPUDynInstPtr) override
Definition vop3.cc:2122
void execute(GPUDynInstPtr) override
Definition vop3.cc:887
Inst_VOP3__V_MAX_U32(InFmt_VOP3A *)
Definition vop3.cc:874
void execute(GPUDynInstPtr) override
Definition vop3.cc:9218
void execute(GPUDynInstPtr) override
Definition vop3.cc:9168
void execute(GPUDynInstPtr) override
Definition vop3.cc:6362
void execute(GPUDynInstPtr) override
Definition vop3.cc:6421
void execute(GPUDynInstPtr) override
Definition vop3.cc:6466
void execute(GPUDynInstPtr) override
Definition vop3.cc:6056
void execute(GPUDynInstPtr) override
Definition vop3.cc:6116
void execute(GPUDynInstPtr) override
Definition vop3.cc:6162
void execute(GPUDynInstPtr) override
Definition vop3.cc:2103
Inst_VOP3__V_MIN_F16(InFmt_VOP3A *)
Definition vop3.cc:2087
void execute(GPUDynInstPtr) override
Definition vop3.cc:647
Inst_VOP3__V_MIN_F32(InFmt_VOP3A *)
Definition vop3.cc:633
Inst_VOP3__V_MIN_F64(InFmt_VOP3A *)
Definition vop3.cc:8691
void execute(GPUDynInstPtr) override
Definition vop3.cc:8705
Inst_VOP3__V_MIN_I16(InFmt_VOP3A *)
Definition vop3.cc:2256
void execute(GPUDynInstPtr) override
Definition vop3.cc:2269
void execute(GPUDynInstPtr) override
Definition vop3.cc:758
Inst_VOP3__V_MIN_I32(InFmt_VOP3A *)
Definition vop3.cc:745
void execute(GPUDynInstPtr) override
Definition vop3.cc:2220
Inst_VOP3__V_MIN_U16(InFmt_VOP3A *)
Definition vop3.cc:2207
Inst_VOP3__V_MIN_U32(InFmt_VOP3A *)
Definition vop3.cc:831
void execute(GPUDynInstPtr) override
Definition vop3.cc:844
void execute(GPUDynInstPtr) override
Definition vop3.cc:2556
Inst_VOP3__V_MOV_B32(InFmt_VOP3A *)
Definition vop3.cc:2542
void execute(GPUDynInstPtr) override
Definition vop3.cc:2884
void execute(GPUDynInstPtr) override
Definition vop3.cc:7259
void execute(GPUDynInstPtr) override
Definition vop3.cc:7279
void execute(GPUDynInstPtr) override
Definition vop3.cc:7218
Inst_VOP3__V_MSAD_U8(InFmt_VOP3A *)
Definition vop3.cc:7205
Inst_VOP3__V_MUL_F16(InFmt_VOP3A *)
Definition vop3.cc:1704
void execute(GPUDynInstPtr) override
Definition vop3.cc:1719
void execute(GPUDynInstPtr) override
Definition vop3.cc:366
Inst_VOP3__V_MUL_F32(InFmt_VOP3A *)
Definition vop3.cc:352
Inst_VOP3__V_MUL_F64(InFmt_VOP3A *)
Definition vop3.cc:8590
void execute(GPUDynInstPtr) override
Definition vop3.cc:8604
void execute(GPUDynInstPtr) override
Definition vop3.cc:510
void execute(GPUDynInstPtr) override
Definition vop3.cc:8968
void execute(GPUDynInstPtr) override
Definition vop3.cc:601
void execute(GPUDynInstPtr) override
Definition vop3.cc:8922
void execute(GPUDynInstPtr) override
Definition vop3.cc:466
void execute(GPUDynInstPtr) override
Definition vop3.cc:265
void execute(GPUDynInstPtr) override
Definition vop3.cc:1895
void execute(GPUDynInstPtr) override
Definition vop3.cc:8877
void execute(GPUDynInstPtr) override
Definition vop3.cc:558
void execute(GPUDynInstPtr) override
Definition vop3.cc:2537
Inst_VOP3__V_NOP(InFmt_VOP3A *)
Definition vop3.cc:2523
void execute(GPUDynInstPtr) override
Definition vop3.cc:4312
Inst_VOP3__V_NOT_B32(InFmt_VOP3A *)
Definition vop3.cc:4298
Inst_VOP3__V_OR3_B32(InFmt_VOP3A *)
Definition vop3.cc:1139
void execute(GPUDynInstPtr) override
Definition vop3.cc:1153
Inst_VOP3__V_OR_B32(InFmt_VOP3A *)
Definition vop3.cc:1095
void execute(GPUDynInstPtr) override
Definition vop3.cc:1109
void execute(GPUDynInstPtr) override
Definition vop3.cc:9938
void execute(GPUDynInstPtr) override
Definition vop3.cc:9981
uint8_t permute(uint64_t in_dword2x, uint32_t sel)
void execute(GPUDynInstPtr) override
Definition vop3.cc:7798
void execute(GPUDynInstPtr) override
Definition vop3.cc:5236
void execute(GPUDynInstPtr) override
Definition vop3.cc:7238
Inst_VOP3__V_RCP_F16(InFmt_VOP3A *)
Definition vop3.cc:4822
void execute(GPUDynInstPtr) override
Definition vop3.cc:4839
Inst_VOP3__V_RCP_F32(InFmt_VOP3A *)
Definition vop3.cc:3906
void execute(GPUDynInstPtr) override
Definition vop3.cc:3920
void execute(GPUDynInstPtr) override
Definition vop3.cc:4042
Inst_VOP3__V_RCP_F64(InFmt_VOP3A *)
Definition vop3.cc:4028
void execute(GPUDynInstPtr) override
Definition vop3.cc:3962
void execute(GPUDynInstPtr) override
Definition vop3.cc:9059
void execute(GPUDynInstPtr) override
Definition vop3.cc:5070
void execute(GPUDynInstPtr) override
Definition vop3.cc:3751
void execute(GPUDynInstPtr) override
Definition vop3.cc:3548
void execute(GPUDynInstPtr) override
Definition vop3.cc:4885
Inst_VOP3__V_RSQ_F16(InFmt_VOP3A *)
Definition vop3.cc:4868
Inst_VOP3__V_RSQ_F32(InFmt_VOP3A *)
Definition vop3.cc:3988
void execute(GPUDynInstPtr) override
Definition vop3.cc:4002
void execute(GPUDynInstPtr) override
Definition vop3.cc:4094
Inst_VOP3__V_RSQ_F64(InFmt_VOP3A *)
Definition vop3.cc:4080
void execute(GPUDynInstPtr) override
Definition vop3.cc:6567
Inst_VOP3__V_SAD_U16(InFmt_VOP3A *)
Definition vop3.cc:6603
void execute(GPUDynInstPtr) override
Definition vop3.cc:6618
Inst_VOP3__V_SAD_U32(InFmt_VOP3A *)
Definition vop3.cc:6653
void execute(GPUDynInstPtr) override
Definition vop3.cc:6667
void execute(GPUDynInstPtr) override
Definition vop3.cc:6514
Inst_VOP3__V_SAD_U8(InFmt_VOP3A *)
Definition vop3.cc:6498
void execute(GPUDynInstPtr) override
Definition vop3.cc:5110
Inst_VOP3__V_SIN_F16(InFmt_VOP3A *)
Definition vop3.cc:5096
Inst_VOP3__V_SIN_F32(InFmt_VOP3A *)
Definition vop3.cc:4210
void execute(GPUDynInstPtr) override
Definition vop3.cc:4226
void execute(GPUDynInstPtr) override
Definition vop3.cc:4862
void execute(GPUDynInstPtr) override
Definition vop3.cc:4144
void execute(GPUDynInstPtr) override
Definition vop3.cc:4184
void execute(GPUDynInstPtr) override
Definition vop3.cc:1605
void execute(GPUDynInstPtr) override
Definition vop3.cc:1551
void execute(GPUDynInstPtr) override
Definition vop3.cc:1447
void execute(GPUDynInstPtr) override
Definition vop3.cc:1698
void execute(GPUDynInstPtr) override
Definition vop3.cc:209
void execute(GPUDynInstPtr) override
Definition vop3.cc:1851
void execute(GPUDynInstPtr) override
Definition vop3.cc:2424
void execute(GPUDynInstPtr) override
Definition vop3.cc:1398
Inst_VOP3__V_SUB_F16(InFmt_VOP3A *)
Definition vop3.cc:1660
void execute(GPUDynInstPtr) override
Definition vop3.cc:1676
void execute(GPUDynInstPtr) override
Definition vop3.cc:152
Inst_VOP3__V_SUB_F32(InFmt_VOP3A *)
Definition vop3.cc:137
Inst_VOP3__V_SUB_U16(InFmt_VOP3A *)
Definition vop3.cc:1792
void execute(GPUDynInstPtr) override
Definition vop3.cc:1806
Inst_VOP3__V_SUB_U32(InFmt_VOP3A *)
Definition vop3.cc:2368
void execute(GPUDynInstPtr) override
Definition vop3.cc:2381
void execute(GPUDynInstPtr) override
Definition vop3.cc:9405
void execute(GPUDynInstPtr) override
Definition vop3.cc:5048
void execute(GPUDynInstPtr) override
Definition vop3.cc:3670
void execute(GPUDynInstPtr) override
Definition vop3.cc:3467
void execute(GPUDynInstPtr) override
Definition vop3.cc:9092
void execute(GPUDynInstPtr) override
Definition vop3.cc:7395
Inst_VOP3__V_XAD_U32(InFmt_VOP3A *)
Definition vop3.cc:7382
Inst_VOP3__V_XOR_B32(InFmt_VOP3A *)
Definition vop3.cc:1185
void execute(GPUDynInstPtr) override
Definition vop3.cc:1199
void read() override
read from and write to the underlying register(s) that this operand is referring to.
Definition operand.hh:419
std::enable_if< Condition, DataType >::type rawData() const
we store scalar data in a std::array, however if we need the full operand data we use this method to ...
Definition operand.hh:402
std::enable_if< Condition, void >::type setBit(int bit, int bit_val)
bit access to scalar data.
Definition operand.hh:507
void read() override
read from the vrf.
Definition operand.hh:148
void readSrc()
certain vector operands can read from the vrf/srf or constants.
Definition operand.hh:132
void write() override
write to the vrf.
Definition operand.hh:203
VectorMask & execMask()
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
Definition bitfield.hh:79
constexpr int popCount(uint64_t val)
Returns the number of set ones in the provided value.
Definition bitfield.hh:415
constexpr uint64_t sext(uint64_t val)
Sign-extend an N-bit value to 64 bits.
Definition bitfield.hh:129
constexpr void replaceBits(T &val, unsigned first, unsigned last, B bit_val)
A convenience function to replace bits first to last of val with bit_val in place.
Definition bitfield.hh:216
std::enable_if_t< std::is_integral_v< T >, T > reverseBits(T val, size_t size=sizeof(T))
Takes a value and returns the bit reversed version.
Definition bitfield.hh:255
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Definition logging.hh:246
mxfp< fp8_e4m3_info > mxfloat8
Definition mxfp_types.hh:47
mxfp< binary32 > mxfloat32
Definition mxfp_types.hh:52
mxfp< fp16_e8m7_info > mxbfloat16
Definition mxfp_types.hh:49
mxfp< fp16_e5m10_info > mxfloat16
Definition mxfp_types.hh:50
dFMT convertMXFP(sFMT in, mxfpRoundingMode mode=roundTiesToEven, uint32_t seed=0)
mxfp< fp8_e5m2_info > mxbfloat8
Definition mxfp_types.hh:46
Bitfield< 22 > a1
Bitfield< 31 > n
Bitfield< 7 > i
Definition misc_types.hh:67
Bitfield< 7, 4 > b1
Definition qarma.hh:65
Bitfield< 11, 8 > b2
Definition qarma.hh:64
static constexpr float32_t f32(uint32_t v)
Definition float.hh:98
classes that represnt vector/scalar operands in VEGA ISA.
Definition faults.cc:39
ScalarOperand< ScalarRegU64, false > ScalarOperandU64
Definition operand.hh:804
VecOperand< VecElemF32, true > ConstVecOperandF32
Definition operand.hh:846
ScalarRegI32 firstOppositeSignBit(ScalarRegI32 val)
Definition inst_util.hh:174
VecOperand< VecElemU32, false > VecOperandU32
Definition operand.hh:829
ScalarOperand< ScalarRegU32, true > ConstScalarOperandU32
Definition operand.hh:815
VecOperand< VecElemF64, true > ConstVecOperandF64
Definition operand.hh:849
VecOperand< VecElemI16, false, 1 > VecOperandI16
Definition operand.hh:828
VecOperand< VecElemI32, true > ConstVecOperandI32
Definition operand.hh:845
VecOperand< VecElemU32, true > ConstVecOperandU32
Definition operand.hh:844
ScalarRegI32 findFirstOne(T val)
Definition inst_util.hh:142
T median(T val_0, T val_1, T val_2)
Definition inst_util.hh:247
ScalarRegI32 findFirstOneMsb(T val)
Definition inst_util.hh:153
T roundNearestEven(T val)
Definition inst_util.hh:259
VecOperand< VecElemI64, true > ConstVecOperandI64
Definition operand.hh:848
uint32_t VecElemU32
VecOperand< VecElemU16, false, 1 > VecOperandU16
Definition operand.hh:827
ScalarOperand< ScalarRegU64, true > ConstScalarOperandU64
Definition operand.hh:818
VecOperand< VecElemU16, true, 1 > ConstVecOperandU16
Definition operand.hh:842
ScalarOperand< ScalarRegF32, true > ConstScalarOperandF32
Definition operand.hh:817
ScalarOperand< ScalarRegU32, false > ScalarOperandU32
Definition operand.hh:801
VecOperand< VecElemI64, false > VecOperandI64
Definition operand.hh:834
const int NumVecElemPerVecReg(64)
uint64_t VecElemU64
VecOperand< VecElemU64, false > VecOperandU64
Definition operand.hh:832
VecOperand< VecElemI32, false > VecOperandI32
Definition operand.hh:830
VecElemU32 muladd(VecElemU64 &dst, VecElemU32 val_0, VecElemU32 val_1, VecElemU64 val_2)
Definition inst_util.hh:272
VecOperand< VecElemI16, true, 1 > ConstVecOperandI16
Definition operand.hh:843
VecOperand< VecElemU64, true > ConstVecOperandU64
Definition operand.hh:847
VecOperand< VecElemF64, false > VecOperandF64
Definition operand.hh:833
VecOperand< VecElemF32, false > VecOperandF32
Definition operand.hh:831
Bitfield< 31, 16 > selector
Definition misc.hh:1038
Copyright (c) 2024 Arm Limited All rights reserved.
Definition binary32.hh:36
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition misc.hh:49
constexpr bool isinf(gem5::AMDGPU::fp16_e5m10_info a)
Definition fp16_e5m10.hh:78
constexpr bool isnan(gem5::AMDGPU::fp16_e5m10_info a)
Definition fp16_e5m10.hh:83

Generated on Mon Oct 27 2025 04:12:47 for gem5 by doxygen 1.14.0