gem5 v24.1.0.1
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
vop3.cc
Go to the documentation of this file.
1/*
2 * Copyright (c) 2024 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
35
36namespace gem5
37{
38
39namespace VegaISA
40{
41 // --- Inst_VOP3__V_CNDMASK_B32 class methods ---
42
44 : Inst_VOP3A(iFmt, "v_cndmask_b32", false)
45 {
46 setFlag(ALU);
47 setFlag(ReadsVCC);
48 } // Inst_VOP3__V_CNDMASK_B32
49
51 {
52 } // ~Inst_VOP3__V_CNDMASK_B32
53
54 // --- description from .arch file ---
55 // D.u = (VCC[i] ? S1.u : S0.u) (i = threadID in wave); VOP3: specify VCC
56 // as a scalar GPR in S2.
57 void
59 {
60 Wavefront *wf = gpuDynInst->wavefront();
61 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
62 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
63 ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2);
64 VecOperandU32 vdst(gpuDynInst, instData.VDST);
65
66 src0.readSrc();
67 src1.readSrc();
68 vcc.read();
69
70 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
71 if (wf->execMask(lane)) {
72 vdst[lane] = bits(vcc.rawData(), lane)
73 ? src1[lane] : src0[lane];
74 }
75 }
76
77 vdst.write();
78 } // execute
79 // --- Inst_VOP3__V_ADD_F32 class methods ---
80
82 : Inst_VOP3A(iFmt, "v_add_f32", false)
83 {
84 setFlag(ALU);
85 setFlag(F32);
86 } // Inst_VOP3__V_ADD_F32
87
89 {
90 } // ~Inst_VOP3__V_ADD_F32
91
92 // --- description from .arch file ---
93 // D.f = S0.f + S1.f.
94 void
96 {
97 Wavefront *wf = gpuDynInst->wavefront();
98 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
99 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
100 VecOperandF32 vdst(gpuDynInst, instData.VDST);
101
102 src0.readSrc();
103 src1.readSrc();
104
105 if (instData.ABS & 0x1) {
106 src0.absModifier();
107 }
108
109 if (instData.ABS & 0x2) {
110 src1.absModifier();
111 }
112
113 if (extData.NEG & 0x1) {
114 src0.negModifier();
115 }
116
117 if (extData.NEG & 0x2) {
118 src1.negModifier();
119 }
120
124 assert(!(instData.ABS & 0x4));
125 assert(!(extData.NEG & 0x4));
126
127 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
128 if (wf->execMask(lane)) {
129 vdst[lane] = src0[lane] + src1[lane];
130 }
131 }
132
133 vdst.write();
134 } // execute
135 // --- Inst_VOP3__V_SUB_F32 class methods ---
136
138 : Inst_VOP3A(iFmt, "v_sub_f32", false)
139 {
140 setFlag(ALU);
141 setFlag(F32);
142 } // Inst_VOP3__V_SUB_F32
143
145 {
146 } // ~Inst_VOP3__V_SUB_F32
147
148 // --- description from .arch file ---
149 // D.f = S0.f - S1.f.
150 // SQ translates to V_ADD_F32.
151 void
153 {
154 Wavefront *wf = gpuDynInst->wavefront();
155 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
156 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
157 VecOperandF32 vdst(gpuDynInst, instData.VDST);
158
159 src0.readSrc();
160 src1.readSrc();
161
162 if (instData.ABS & 0x1) {
163 src0.absModifier();
164 }
165
166 if (instData.ABS & 0x2) {
167 src1.absModifier();
168 }
169
170 if (extData.NEG & 0x1) {
171 src0.negModifier();
172 }
173
174 if (extData.NEG & 0x2) {
175 src1.negModifier();
176 }
177
181 assert(!(instData.ABS & 0x4));
182 assert(!(extData.NEG & 0x4));
183
184 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
185 if (wf->execMask(lane)) {
186 vdst[lane] = src0[lane] - src1[lane];
187 }
188 }
189
190 vdst.write();
191 } // execute
192 // --- Inst_VOP3__V_SUBREV_F32 class methods ---
193
195 : Inst_VOP3A(iFmt, "v_subrev_f32", false)
196 {
197 setFlag(ALU);
198 setFlag(F32);
199 } // Inst_VOP3__V_SUBREV_F32
200
202 {
203 } // ~Inst_VOP3__V_SUBREV_F32
204
205 // --- description from .arch file ---
206 // D.f = S1.f - S0.f.
207 // SQ translates to V_ADD_F32.
208 void
210 {
211 Wavefront *wf = gpuDynInst->wavefront();
212 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
213 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
214 VecOperandF32 vdst(gpuDynInst, instData.VDST);
215
216 src0.readSrc();
217 src1.readSrc();
218
219 if (instData.ABS & 0x1) {
220 src0.absModifier();
221 }
222
223 if (instData.ABS & 0x2) {
224 src1.absModifier();
225 }
226
227 if (extData.NEG & 0x1) {
228 src0.negModifier();
229 }
230
231 if (extData.NEG & 0x2) {
232 src1.negModifier();
233 }
234
238 assert(!(instData.ABS & 0x4));
239 assert(!(extData.NEG & 0x4));
240
241 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
242 if (wf->execMask(lane)) {
243 vdst[lane] = src1[lane] - src0[lane];
244 }
245 }
246
247 vdst.write();
248 } // execute
249 // --- Inst_VOP3__V_MUL_LEGACY_F32 class methods ---
250
252 : Inst_VOP3A(iFmt, "v_mul_legacy_f32", false)
253 {
254 setFlag(ALU);
255 setFlag(F32);
256 } // Inst_VOP3__V_MUL_LEGACY_F32
257
259 {
260 } // ~Inst_VOP3__V_MUL_LEGACY_F32
261
262 // --- description from .arch file ---
263 // D.f = S0.f * S1.f (DX9 rules, 0.0*x = 0.0).
264 void
266 {
267 Wavefront *wf = gpuDynInst->wavefront();
268 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
269 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
270 VecOperandF32 vdst(gpuDynInst, instData.VDST);
271
272 src0.readSrc();
273 src1.readSrc();
274
275 if (instData.ABS & 0x1) {
276 src0.absModifier();
277 }
278
279 if (instData.ABS & 0x2) {
280 src1.absModifier();
281 }
282
283 if (extData.NEG & 0x1) {
284 src0.negModifier();
285 }
286
287 if (extData.NEG & 0x2) {
288 src1.negModifier();
289 }
290
294 assert(!(instData.ABS & 0x4));
295 assert(!(extData.NEG & 0x4));
296
297 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
298 if (wf->execMask(lane)) {
299 if (std::isnan(src0[lane]) ||
300 std::isnan(src1[lane])) {
301 vdst[lane] = NAN;
302 } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
303 std::fpclassify(src0[lane]) == FP_ZERO) &&
304 !std::signbit(src0[lane])) {
305 if (std::isinf(src1[lane])) {
306 vdst[lane] = NAN;
307 } else if (!std::signbit(src1[lane])) {
308 vdst[lane] = +0.0;
309 } else {
310 vdst[lane] = -0.0;
311 }
312 } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
313 std::fpclassify(src0[lane]) == FP_ZERO) &&
314 std::signbit(src0[lane])) {
315 if (std::isinf(src1[lane])) {
316 vdst[lane] = NAN;
317 } else if (std::signbit(src1[lane])) {
318 vdst[lane] = +0.0;
319 } else {
320 vdst[lane] = -0.0;
321 }
322 } else if (std::isinf(src0[lane]) &&
323 !std::signbit(src0[lane])) {
324 if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
325 std::fpclassify(src1[lane]) == FP_ZERO) {
326 vdst[lane] = NAN;
327 } else if (!std::signbit(src1[lane])) {
328 vdst[lane] = +INFINITY;
329 } else {
330 vdst[lane] = -INFINITY;
331 }
332 } else if (std::isinf(src0[lane]) &&
333 std::signbit(src0[lane])) {
334 if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
335 std::fpclassify(src1[lane]) == FP_ZERO) {
336 vdst[lane] = NAN;
337 } else if (std::signbit(src1[lane])) {
338 vdst[lane] = +INFINITY;
339 } else {
340 vdst[lane] = -INFINITY;
341 }
342 } else {
343 vdst[lane] = src0[lane] * src1[lane];
344 }
345 }
346 }
347
348 vdst.write();
349 } // execute
350 // --- Inst_VOP3__V_MUL_F32 class methods ---
351
353 : Inst_VOP3A(iFmt, "v_mul_f32", false)
354 {
355 setFlag(ALU);
356 setFlag(F32);
357 } // Inst_VOP3__V_MUL_F32
358
360 {
361 } // ~Inst_VOP3__V_MUL_F32
362
363 // --- description from .arch file ---
364 // D.f = S0.f * S1.f.
365 void
367 {
368 Wavefront *wf = gpuDynInst->wavefront();
369 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
370 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
371 VecOperandF32 vdst(gpuDynInst, instData.VDST);
372
373 src0.readSrc();
374 src1.readSrc();
375
376 if (instData.ABS & 0x1) {
377 src0.absModifier();
378 }
379
380 if (instData.ABS & 0x2) {
381 src1.absModifier();
382 }
383
384 if (extData.NEG & 0x1) {
385 src0.negModifier();
386 }
387
388 if (extData.NEG & 0x2) {
389 src1.negModifier();
390 }
391
395 assert(!(instData.ABS & 0x4));
396 assert(!(extData.NEG & 0x4));
397
398 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
399 if (wf->execMask(lane)) {
400 if (std::isnan(src0[lane]) ||
401 std::isnan(src1[lane])) {
402 vdst[lane] = NAN;
403 } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
404 std::fpclassify(src0[lane]) == FP_ZERO) &&
405 !std::signbit(src0[lane])) {
406 if (std::isinf(src1[lane])) {
407 vdst[lane] = NAN;
408 } else if (!std::signbit(src1[lane])) {
409 vdst[lane] = +0.0;
410 } else {
411 vdst[lane] = -0.0;
412 }
413 } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
414 std::fpclassify(src0[lane]) == FP_ZERO) &&
415 std::signbit(src0[lane])) {
416 if (std::isinf(src1[lane])) {
417 vdst[lane] = NAN;
418 } else if (std::signbit(src1[lane])) {
419 vdst[lane] = +0.0;
420 } else {
421 vdst[lane] = -0.0;
422 }
423 } else if (std::isinf(src0[lane]) &&
424 !std::signbit(src0[lane])) {
425 if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
426 std::fpclassify(src1[lane]) == FP_ZERO) {
427 vdst[lane] = NAN;
428 } else if (!std::signbit(src1[lane])) {
429 vdst[lane] = +INFINITY;
430 } else {
431 vdst[lane] = -INFINITY;
432 }
433 } else if (std::isinf(src0[lane]) &&
434 std::signbit(src0[lane])) {
435 if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
436 std::fpclassify(src1[lane]) == FP_ZERO) {
437 vdst[lane] = NAN;
438 } else if (std::signbit(src1[lane])) {
439 vdst[lane] = +INFINITY;
440 } else {
441 vdst[lane] = -INFINITY;
442 }
443 } else {
444 vdst[lane] = src0[lane] * src1[lane];
445 }
446 }
447 }
448
449 vdst.write();
450 } // execute
451 // --- Inst_VOP3__V_MUL_I32_I24 class methods ---
452
454 : Inst_VOP3A(iFmt, "v_mul_i32_i24", false)
455 {
456 setFlag(ALU);
457 } // Inst_VOP3__V_MUL_I32_I24
458
460 {
461 } // ~Inst_VOP3__V_MUL_I32_I24
462
463 // --- description from .arch file ---
464 // D.i = S0.i[23:0] * S1.i[23:0].
465 void
467 {
468 Wavefront *wf = gpuDynInst->wavefront();
469 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
470 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
471 VecOperandI32 vdst(gpuDynInst, instData.VDST);
472
473 src0.readSrc();
474 src1.read();
475
479 assert(!(instData.ABS & 0x1));
480 assert(!(instData.ABS & 0x2));
481 assert(!(instData.ABS & 0x4));
482 assert(!(extData.NEG & 0x1));
483 assert(!(extData.NEG & 0x2));
484 assert(!(extData.NEG & 0x4));
485
486 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
487 if (wf->execMask(lane)) {
488 vdst[lane] = sext<24>(bits(src0[lane], 23, 0))
489 * sext<24>(bits(src1[lane], 23, 0));
490 }
491 }
492
493 vdst.write();
494 } // execute
495 // --- Inst_VOP3__V_MUL_HI_I32_I24 class methods ---
496
498 : Inst_VOP3A(iFmt, "v_mul_hi_i32_i24", false)
499 {
500 setFlag(ALU);
501 } // Inst_VOP3__V_MUL_HI_I32_I24
502
504 {
505 } // ~Inst_VOP3__V_MUL_HI_I32_I24
506
507 // --- description from .arch file ---
508 // D.i = (S0.i[23:0] * S1.i[23:0])>>32.
509 void
511 {
512 Wavefront *wf = gpuDynInst->wavefront();
513 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
514 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
515 VecOperandI32 vdst(gpuDynInst, instData.VDST);
516
517 src0.readSrc();
518 src1.readSrc();
519
523 assert(!(instData.ABS & 0x1));
524 assert(!(instData.ABS & 0x2));
525 assert(!(instData.ABS & 0x4));
526 assert(!(extData.NEG & 0x1));
527 assert(!(extData.NEG & 0x2));
528 assert(!(extData.NEG & 0x4));
529
530 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
531 if (wf->execMask(lane)) {
532 VecElemI64 tmp_src0
533 = (VecElemI64)sext<24>(bits(src0[lane], 23, 0));
534 VecElemI64 tmp_src1
535 = (VecElemI64)sext<24>(bits(src1[lane], 23, 0));
536
537 vdst[lane] = (VecElemI32)((tmp_src0 * tmp_src1) >> 32);
538 }
539 }
540
541 vdst.write();
542 } // execute
543 // --- Inst_VOP3__V_MUL_U32_U24 class methods ---
544
546 : Inst_VOP3A(iFmt, "v_mul_u32_u24", false)
547 {
548 setFlag(ALU);
549 } // Inst_VOP3__V_MUL_U32_U24
550
552 {
553 } // ~Inst_VOP3__V_MUL_U32_U24
554
555 // --- description from .arch file ---
556 // D.u = S0.u[23:0] * S1.u[23:0].
557 void
559 {
560 Wavefront *wf = gpuDynInst->wavefront();
561 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
562 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
563 VecOperandU32 vdst(gpuDynInst, instData.VDST);
564
565 src0.readSrc();
566 src1.readSrc();
567
571 assert(!(instData.ABS & 0x1));
572 assert(!(instData.ABS & 0x2));
573 assert(!(instData.ABS & 0x4));
574 assert(!(extData.NEG & 0x1));
575 assert(!(extData.NEG & 0x2));
576 assert(!(extData.NEG & 0x4));
577
578 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
579 if (wf->execMask(lane)) {
580 vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0);
581 }
582 }
583
584 vdst.write();
585 } // execute
586 // --- Inst_VOP3__V_MUL_HI_U32_U24 class methods ---
587
589 : Inst_VOP3A(iFmt, "v_mul_hi_u32_u24", false)
590 {
591 setFlag(ALU);
592 } // Inst_VOP3__V_MUL_HI_U32_U24
593
595 {
596 } // ~Inst_VOP3__V_MUL_HI_U32_U24
597
598 // --- description from .arch file ---
599 // D.i = (S0.u[23:0] * S1.u[23:0])>>32.
600 void
602 {
603 Wavefront *wf = gpuDynInst->wavefront();
604 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
605 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
606 VecOperandU32 vdst(gpuDynInst, instData.VDST);
607
608 src0.readSrc();
609 src1.readSrc();
610
614 assert(!(instData.ABS & 0x1));
615 assert(!(instData.ABS & 0x2));
616 assert(!(instData.ABS & 0x4));
617 assert(!(extData.NEG & 0x1));
618 assert(!(extData.NEG & 0x2));
619 assert(!(extData.NEG & 0x4));
620
621 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
622 if (wf->execMask(lane)) {
623 VecElemU64 tmp_src0 = (VecElemU64)bits(src0[lane], 23, 0);
624 VecElemU64 tmp_src1 = (VecElemU64)bits(src1[lane], 23, 0);
625 vdst[lane] = (VecElemU32)((tmp_src0 * tmp_src1) >> 32);
626 }
627 }
628
629 vdst.write();
630 } // execute
631 // --- Inst_VOP3__V_MIN_F32 class methods ---
632
634 : Inst_VOP3A(iFmt, "v_min_f32", false)
635 {
636 setFlag(ALU);
637 setFlag(F32);
638 } // Inst_VOP3__V_MIN_F32
639
641 {
642 } // ~Inst_VOP3__V_MIN_F32
643
644 // --- description from .arch file ---
645 // D.f = (S0.f < S1.f ? S0.f : S1.f).
646 void
648 {
649 Wavefront *wf = gpuDynInst->wavefront();
650 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
651 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
652 VecOperandF32 vdst(gpuDynInst, instData.VDST);
653
654 src0.readSrc();
655 src1.readSrc();
656
657 if (instData.ABS & 0x1) {
658 src0.absModifier();
659 }
660
661 if (instData.ABS & 0x2) {
662 src1.absModifier();
663 }
664
665 if (extData.NEG & 0x1) {
666 src0.negModifier();
667 }
668
669 if (extData.NEG & 0x2) {
670 src1.negModifier();
671 }
672
676 assert(!(instData.ABS & 0x4));
677 assert(!(extData.NEG & 0x4));
678
679 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
680 if (wf->execMask(lane)) {
681 vdst[lane] = std::fmin(src0[lane], src1[lane]);
682 }
683 }
684
685 vdst.write();
686 } // execute
687 // --- Inst_VOP3__V_MAX_F32 class methods ---
688
690 : Inst_VOP3A(iFmt, "v_max_f32", false)
691 {
692 setFlag(ALU);
693 setFlag(F32);
694 } // Inst_VOP3__V_MAX_F32
695
697 {
698 } // ~Inst_VOP3__V_MAX_F32
699
700 // --- description from .arch file ---
701 // D.f = (S0.f >= S1.f ? S0.f : S1.f).
702 void
704 {
705 Wavefront *wf = gpuDynInst->wavefront();
706 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
707 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
708 VecOperandF32 vdst(gpuDynInst, instData.VDST);
709
710 src0.readSrc();
711 src1.readSrc();
712
713 if (instData.ABS & 0x1) {
714 src0.absModifier();
715 }
716
717 if (instData.ABS & 0x2) {
718 src1.absModifier();
719 }
720
721 if (extData.NEG & 0x1) {
722 src0.negModifier();
723 }
724
725 if (extData.NEG & 0x2) {
726 src1.negModifier();
727 }
728
732 assert(!(instData.ABS & 0x4));
733 assert(!(extData.NEG & 0x4));
734
735 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
736 if (wf->execMask(lane)) {
737 vdst[lane] = std::fmax(src0[lane], src1[lane]);
738 }
739 }
740
741 vdst.write();
742 } // execute
743 // --- Inst_VOP3__V_MIN_I32 class methods ---
744
746 : Inst_VOP3A(iFmt, "v_min_i32", false)
747 {
748 setFlag(ALU);
749 } // Inst_VOP3__V_MIN_I32
750
752 {
753 } // ~Inst_VOP3__V_MIN_I32
754
755 // --- description from .arch file ---
756 // D.i = min(S0.i, S1.i).
757 void
759 {
760 Wavefront *wf = gpuDynInst->wavefront();
761 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
762 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
763 VecOperandI32 vdst(gpuDynInst, instData.VDST);
764
765 src0.readSrc();
766 src1.readSrc();
767
771 assert(!(instData.ABS & 0x1));
772 assert(!(instData.ABS & 0x2));
773 assert(!(instData.ABS & 0x4));
774 assert(!(extData.NEG & 0x1));
775 assert(!(extData.NEG & 0x2));
776 assert(!(extData.NEG & 0x4));
777
778 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
779 if (wf->execMask(lane)) {
780 vdst[lane] = std::min(src0[lane], src1[lane]);
781 }
782 }
783
784 vdst.write();
785 } // execute
786 // --- Inst_VOP3__V_MAX_I32 class methods ---
787
789 : Inst_VOP3A(iFmt, "v_max_i32", false)
790 {
791 setFlag(ALU);
792 } // Inst_VOP3__V_MAX_I32
793
795 {
796 } // ~Inst_VOP3__V_MAX_I32
797
798 // --- description from .arch file ---
799 // D.i = max(S0.i, S1.i).
800 void
802 {
803 Wavefront *wf = gpuDynInst->wavefront();
804 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
805 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
806 VecOperandI32 vdst(gpuDynInst, instData.VDST);
807
808 src0.readSrc();
809 src1.readSrc();
810
814 assert(!(instData.ABS & 0x1));
815 assert(!(instData.ABS & 0x2));
816 assert(!(instData.ABS & 0x4));
817 assert(!(extData.NEG & 0x1));
818 assert(!(extData.NEG & 0x2));
819 assert(!(extData.NEG & 0x4));
820
821 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
822 if (wf->execMask(lane)) {
823 vdst[lane] = std::max(src0[lane], src1[lane]);
824 }
825 }
826
827 vdst.write();
828 } // execute
829 // --- Inst_VOP3__V_MIN_U32 class methods ---
830
832 : Inst_VOP3A(iFmt, "v_min_u32", false)
833 {
834 setFlag(ALU);
835 } // Inst_VOP3__V_MIN_U32
836
838 {
839 } // ~Inst_VOP3__V_MIN_U32
840
841 // --- description from .arch file ---
842 // D.u = min(S0.u, S1.u).
843 void
845 {
846 Wavefront *wf = gpuDynInst->wavefront();
847 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
848 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
849 VecOperandU32 vdst(gpuDynInst, instData.VDST);
850
851 src0.readSrc();
852 src1.readSrc();
853
857 assert(!(instData.ABS & 0x1));
858 assert(!(instData.ABS & 0x2));
859 assert(!(instData.ABS & 0x4));
860 assert(!(extData.NEG & 0x1));
861 assert(!(extData.NEG & 0x2));
862 assert(!(extData.NEG & 0x4));
863
864 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
865 if (wf->execMask(lane)) {
866 vdst[lane] = std::min(src0[lane], src1[lane]);
867 }
868 }
869
870 vdst.write();
871 } // execute
872 // --- Inst_VOP3__V_MAX_U32 class methods ---
873
875 : Inst_VOP3A(iFmt, "v_max_u32", false)
876 {
877 setFlag(ALU);
878 } // Inst_VOP3__V_MAX_U32
879
881 {
882 } // ~Inst_VOP3__V_MAX_U32
883
884 // --- description from .arch file ---
885 // D.u = max(S0.u, S1.u).
886 void
888 {
889 Wavefront *wf = gpuDynInst->wavefront();
890 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
891 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
892 VecOperandU32 vdst(gpuDynInst, instData.VDST);
893
894 src0.readSrc();
895 src1.readSrc();
896
900 assert(!(instData.ABS & 0x1));
901 assert(!(instData.ABS & 0x2));
902 assert(!(instData.ABS & 0x4));
903 assert(!(extData.NEG & 0x1));
904 assert(!(extData.NEG & 0x2));
905 assert(!(extData.NEG & 0x4));
906
907 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
908 if (wf->execMask(lane)) {
909 vdst[lane] = std::max(src0[lane], src1[lane]);
910 }
911 }
912
913 vdst.write();
914 } // execute
915 // --- Inst_VOP3__V_LSHRREV_B32 class methods ---
916
918 : Inst_VOP3A(iFmt, "v_lshrrev_b32", false)
919 {
920 setFlag(ALU);
921 } // Inst_VOP3__V_LSHRREV_B32
922
924 {
925 } // ~Inst_VOP3__V_LSHRREV_B32
926
927 // --- description from .arch file ---
928 // D.u = S1.u >> S0.u[4:0].
929 // The vacated bits are set to zero.
930 // SQ translates this to an internal SP opcode.
931 void
933 {
934 Wavefront *wf = gpuDynInst->wavefront();
935 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
936 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
937 VecOperandU32 vdst(gpuDynInst, instData.VDST);
938
939 src0.readSrc();
940 src1.readSrc();
941
945 assert(!(instData.ABS & 0x1));
946 assert(!(instData.ABS & 0x2));
947 assert(!(instData.ABS & 0x4));
948 assert(!(extData.NEG & 0x1));
949 assert(!(extData.NEG & 0x2));
950 assert(!(extData.NEG & 0x4));
951
952 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
953 if (wf->execMask(lane)) {
954 vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0);
955 }
956 }
957
958 vdst.write();
959 } // execute
960 // --- Inst_VOP3__V_ASHRREV_I32 class methods ---
961
963 : Inst_VOP3A(iFmt, "v_ashrrev_i32", false)
964 {
965 setFlag(ALU);
966 } // Inst_VOP3__V_ASHRREV_I32
967
969 {
970 } // ~Inst_VOP3__V_ASHRREV_I32
971
972 // --- description from .arch file ---
973 // D.i = signext(S1.i) >> S0.i[4:0].
974 // The vacated bits are set to the sign bit of the input value.
975 // SQ translates this to an internal SP opcode.
976 void
978 {
979 Wavefront *wf = gpuDynInst->wavefront();
980 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
981 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
982 VecOperandI32 vdst(gpuDynInst, instData.VDST);
983
984 src0.readSrc();
985 src1.readSrc();
986
990 assert(!(instData.ABS & 0x1));
991 assert(!(instData.ABS & 0x2));
992 assert(!(instData.ABS & 0x4));
993 assert(!(extData.NEG & 0x1));
994 assert(!(extData.NEG & 0x2));
995 assert(!(extData.NEG & 0x4));
996
997 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
998 if (wf->execMask(lane)) {
999 vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0);
1000 }
1001 }
1002
1003 vdst.write();
1004 } // execute
1005 // --- Inst_VOP3__V_LSHLREV_B32 class methods ---
1006
1008 : Inst_VOP3A(iFmt, "v_lshlrev_b32", false)
1009 {
1010 setFlag(ALU);
1011 } // Inst_VOP3__V_LSHLREV_B32
1012
1014 {
1015 } // ~Inst_VOP3__V_LSHLREV_B32
1016
1017 // --- description from .arch file ---
1018 // D.u = S1.u << S0.u[4:0].
1019 // SQ translates this to an internal SP opcode.
1020 void
1022 {
1023 Wavefront *wf = gpuDynInst->wavefront();
1024 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1025 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1026 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1027
1028 src0.readSrc();
1029 src1.readSrc();
1030
1034 assert(!(instData.ABS & 0x1));
1035 assert(!(instData.ABS & 0x2));
1036 assert(!(instData.ABS & 0x4));
1037 assert(!(extData.NEG & 0x1));
1038 assert(!(extData.NEG & 0x2));
1039 assert(!(extData.NEG & 0x4));
1040
1041 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1042 if (wf->execMask(lane)) {
1043 vdst[lane] = src1[lane] << bits(src0[lane], 4, 0);
1044 }
1045 }
1046
1047 vdst.write();
1048 } // execute
1049 // --- Inst_VOP3__V_AND_B32 class methods ---
1050
1052 : Inst_VOP3A(iFmt, "v_and_b32", false)
1053 {
1054 setFlag(ALU);
1055 } // Inst_VOP3__V_AND_B32
1056
1058 {
1059 } // ~Inst_VOP3__V_AND_B32
1060
1061 // --- description from .arch file ---
1062 // D.u = S0.u & S1.u.
1063 // Input and output modifiers not supported.
1064 void
1066 {
1067 Wavefront *wf = gpuDynInst->wavefront();
1068 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1069 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1070 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1071
1072 src0.readSrc();
1073 src1.readSrc();
1074
1078 assert(!(instData.ABS & 0x1));
1079 assert(!(instData.ABS & 0x2));
1080 assert(!(instData.ABS & 0x4));
1081 assert(!(extData.NEG & 0x1));
1082 assert(!(extData.NEG & 0x2));
1083 assert(!(extData.NEG & 0x4));
1084
1085 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1086 if (wf->execMask(lane)) {
1087 vdst[lane] = src0[lane] & src1[lane];
1088 }
1089 }
1090
1091 vdst.write();
1092 } // execute
1093 // --- Inst_VOP3__V_OR_B32 class methods ---
1094
1096 : Inst_VOP3A(iFmt, "v_or_b32", false)
1097 {
1098 setFlag(ALU);
1099 } // Inst_VOP3__V_OR_B32
1100
1102 {
1103 } // ~Inst_VOP3__V_OR_B32
1104
1105 // --- description from .arch file ---
1106 // D.u = S0.u | S1.u.
1107 // Input and output modifiers not supported.
1108 void
1110 {
1111 Wavefront *wf = gpuDynInst->wavefront();
1112 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1113 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1114 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1115
1116 src0.readSrc();
1117 src1.readSrc();
1118
1122 assert(!(instData.ABS & 0x1));
1123 assert(!(instData.ABS & 0x2));
1124 assert(!(instData.ABS & 0x4));
1125 assert(!(extData.NEG & 0x1));
1126 assert(!(extData.NEG & 0x2));
1127 assert(!(extData.NEG & 0x4));
1128
1129 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1130 if (wf->execMask(lane)) {
1131 vdst[lane] = src0[lane] | src1[lane];
1132 }
1133 }
1134
1135 vdst.write();
1136 } // execute
1137 // --- Inst_VOP3__V_OR3_B32 class methods ---
1138
1140 : Inst_VOP3A(iFmt, "v_or3_b32", false)
1141 {
1142 setFlag(ALU);
1143 } // Inst_VOP3__V_OR3_B32
1144
1146 {
1147 } // ~Inst_VOP3__V_OR3_B32
1148
1149 // --- description from .arch file ---
1150 // D.u = S0.u | S1.u | S2.u.
1151 // Input and output modifiers not supported.
1152 void
1154 {
1155 Wavefront *wf = gpuDynInst->wavefront();
1156 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1157 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1158 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
1159 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1160
1161 src0.readSrc();
1162 src1.readSrc();
1163 src2.readSrc();
1164
1168 assert(!(instData.ABS & 0x1));
1169 assert(!(instData.ABS & 0x2));
1170 assert(!(instData.ABS & 0x4));
1171 assert(!(extData.NEG & 0x1));
1172 assert(!(extData.NEG & 0x2));
1173 assert(!(extData.NEG & 0x4));
1174
1175 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1176 if (wf->execMask(lane)) {
1177 vdst[lane] = src0[lane] | src1[lane] | src2[lane];
1178 }
1179 }
1180
1181 vdst.write();
1182 } // execute
1183 // --- Inst_VOP3__V_XOR_B32 class methods ---
1184
1186 : Inst_VOP3A(iFmt, "v_xor_b32", false)
1187 {
1188 setFlag(ALU);
1189 } // Inst_VOP3__V_XOR_B32
1190
1192 {
1193 } // ~Inst_VOP3__V_XOR_B32
1194
1195 // --- description from .arch file ---
1196 // D.u = S0.u ^ S1.u.
1197 // Input and output modifiers not supported.
1198 void
1200 {
1201 Wavefront *wf = gpuDynInst->wavefront();
1202 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1203 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1204 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1205
1206 src0.readSrc();
1207 src1.readSrc();
1208
1212 assert(!(instData.ABS & 0x1));
1213 assert(!(instData.ABS & 0x2));
1214 assert(!(instData.ABS & 0x4));
1215 assert(!(extData.NEG & 0x1));
1216 assert(!(extData.NEG & 0x2));
1217 assert(!(extData.NEG & 0x4));
1218
1219 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1220 if (wf->execMask(lane)) {
1221 vdst[lane] = src0[lane] ^ src1[lane];
1222 }
1223 }
1224
1225 vdst.write();
1226 } // execute
1227 // --- Inst_VOP3__V_MAC_F32 class methods ---
1228
1230 : Inst_VOP3A(iFmt, "v_mac_f32", false)
1231 {
1232 setFlag(ALU);
1233 setFlag(F32);
1234 setFlag(MAC);
1235 } // Inst_VOP3__V_MAC_F32
1236
1238 {
1239 } // ~Inst_VOP3__V_MAC_F32
1240
1241 // --- description from .arch file ---
1242 // D.f = S0.f * S1.f + D.f.
1243 // SQ translates to V_MAD_F32.
1244 void
1246 {
1247 Wavefront *wf = gpuDynInst->wavefront();
1248 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
1249 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
1250 VecOperandF32 vdst(gpuDynInst, instData.VDST);
1251
1252 src0.readSrc();
1253 src1.readSrc();
1254 vdst.read();
1255
1256 if (instData.ABS & 0x1) {
1257 src0.absModifier();
1258 }
1259
1260 if (instData.ABS & 0x2) {
1261 src1.absModifier();
1262 }
1263
1264 if (extData.NEG & 0x1) {
1265 src0.negModifier();
1266 }
1267
1268 if (extData.NEG & 0x2) {
1269 src1.negModifier();
1270 }
1271
1275 assert(!(instData.ABS & 0x4));
1276 assert(!(extData.NEG & 0x4));
1277
1278 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1279 if (wf->execMask(lane)) {
1280 vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]);
1281 }
1282 }
1283
1284 vdst.write();
1285 } // execute
1286 // --- Inst_VOP3__V_ADD_CO_U32 class methods ---
1287
1289 : Inst_VOP3B(iFmt, "v_add_co_u32")
1290 {
1291 setFlag(ALU);
1292 setFlag(WritesVCC);
1293 } // Inst_VOP3__V_ADD_CO_U32
1294
1296 {
1297 } // ~Inst_VOP3__V_ADD_CO_U32
1298
1299 // --- description from .arch file ---
1300 // D.u = S0.u + S1.u;
1301 // VCC[threadId] = (S0.u + S1.u >= 0x800000000ULL ? 1 : 0) is an UNSIGNED
1302 // --- overflow or carry-out for V_ADDC_U32.
1303 // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
1304 void
1306 {
1307 Wavefront *wf = gpuDynInst->wavefront();
1308 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1309 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1310 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1311 ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
1312
1313 src0.readSrc();
1314 src1.readSrc();
1315
1319 assert(!(extData.NEG & 0x1));
1320 assert(!(extData.NEG & 0x2));
1321 assert(!(extData.NEG & 0x4));
1322
1323 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1324 if (wf->execMask(lane)) {
1325 vdst[lane] = src0[lane] + src1[lane];
1326 vcc.setBit(lane, ((VecElemU64)src0[lane]
1327 + (VecElemU64)src1[lane]) >= 0x100000000ULL ? 1 : 0);
1328 }
1329 }
1330
1331 vdst.write();
1332 vcc.write();
1333 } // execute
1334 // --- Inst_VOP3__V_SUB_CO_U32 class methods ---
1335
1337 : Inst_VOP3B(iFmt, "v_sub_co_u32")
1338 {
1339 setFlag(ALU);
1340 setFlag(WritesVCC);
1341 } // Inst_VOP3__V_SUB_CO_U32
1342
1344 {
1345 } // ~Inst_VOP3__V_SUB_CO_U32
1346
1347 // --- description from .arch file ---
1348 // D.u = S0.u - S1.u;
1349 // VCC[threadId] = (S1.u > S0.u ? 1 : 0) is an UNSIGNED overflow or
1350 // carry-out for V_SUBB_U32.
1351 // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
1352 void
1354 {
1355 Wavefront *wf = gpuDynInst->wavefront();
1356 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1357 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1358 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1359 ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
1360
1361 src0.readSrc();
1362 src1.readSrc();
1363
1367 assert(!(extData.NEG & 0x1));
1368 assert(!(extData.NEG & 0x2));
1369 assert(!(extData.NEG & 0x4));
1370
1371 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1372 if (wf->execMask(lane)) {
1373 vdst[lane] = src0[lane] - src1[lane];
1374 vcc.setBit(lane, src1[lane] > src0[lane] ? 1 : 0);
1375 }
1376 }
1377
1378 vdst.write();
1379 vcc.write();
1380 } // execute
1381 // --- Inst_VOP3__V_SUBREV_CO_U32 class methods ---
1382
1384 InFmt_VOP3B *iFmt)
1385 : Inst_VOP3B(iFmt, "v_subrev_co_u32")
1386 {
1387 setFlag(ALU);
1388 setFlag(WritesVCC);
1389 } // Inst_VOP3__V_SUBREV_CO_U32
1390
1392 {
1393 } // ~Inst_VOP3__V_SUBREV_CO_U32
1394
1395 // --- description from .arch file ---
1396 // D.u = S1.u - S0.u;
1397 // VCC[threadId] = (S0.u > S1.u ? 1 : 0) is an UNSIGNED overflow or
1398 // carry-out for V_SUBB_U32.
1399 // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
1400 // SQ translates this to V_SUB_U32 with reversed operands.
1401 void
1403 {
1404 Wavefront *wf = gpuDynInst->wavefront();
1405 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1406 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1407 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1408 ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
1409
1410 src0.readSrc();
1411 src1.readSrc();
1412
1416 assert(!(extData.NEG & 0x1));
1417 assert(!(extData.NEG & 0x2));
1418 assert(!(extData.NEG & 0x4));
1419
1420 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1421 if (wf->execMask(lane)) {
1422 vdst[lane] = src1[lane] - src0[lane];
1423 vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
1424 }
1425 }
1426
1427 vdst.write();
1428 vcc.write();
1429 } // execute
1430 // --- Inst_VOP3__V_ADDC_CO_U32 class methods ---
1431
1433 : Inst_VOP3B(iFmt, "v_addc_co_u32")
1434 {
1435 setFlag(ALU);
1436 setFlag(WritesVCC);
1437 setFlag(ReadsVCC);
1438 } // Inst_VOP3__V_ADDC_CO_U32
1439
1441 {
1442 } // ~Inst_VOP3__V_ADDC_CO_U32
1443
1444 // --- description from .arch file ---
1445 // D.u = S0.u + S1.u + VCC[threadId];
1446 // VCC[threadId] = (S0.u + S1.u + VCC[threadId] >= 0x800000000ULL ? 1 : 0)
1447 // is an UNSIGNED overflow.
1448 // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
1449 // source comes from the SGPR-pair at S2.u.
1450 void
1452 {
1453 Wavefront *wf = gpuDynInst->wavefront();
1454 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1455 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1456 ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2);
1457 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1458 ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
1459
1460 src0.readSrc();
1461 src1.readSrc();
1462 vcc.read();
1463
1467 assert(!(extData.NEG & 0x1));
1468 assert(!(extData.NEG & 0x2));
1469 assert(!(extData.NEG & 0x4));
1470
1471 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1472 if (wf->execMask(lane)) {
1473 vdst[lane] = src0[lane] + src1[lane]
1474 + bits(vcc.rawData(), lane);
1475 sdst.setBit(lane, ((VecElemU64)src0[lane]
1476 + (VecElemU64)src1[lane]
1477 + (VecElemU64)bits(vcc.rawData(), lane))
1478 >= 0x100000000 ? 1 : 0);
1479 }
1480 }
1481
1482 vdst.write();
1483 sdst.write();
1484 } // execute
1485 // --- Inst_VOP3__V_SUBB_CO_U32 class methods ---
1486
1488 : Inst_VOP3B(iFmt, "v_subb_co_u32")
1489 {
1490 setFlag(ALU);
1491 setFlag(WritesVCC);
1492 setFlag(ReadsVCC);
1493 } // Inst_VOP3__V_SUBB_CO_U32
1494
1496 {
1497 } // ~Inst_VOP3__V_SUBB_CO_U32
1498
1499 // --- description from .arch file ---
1500 // D.u = S0.u - S1.u - VCC[threadId];
1501 // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED
1502 // --- overflow.
1503 // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
1504 // --- source comes from the SGPR-pair at S2.u.
1505 void
1507 {
1508 Wavefront *wf = gpuDynInst->wavefront();
1509 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1510 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1511 ConstScalarOperandU64 vcc(gpuDynInst, extData.SRC2);
1512 ScalarOperandU64 sdst(gpuDynInst, instData.SDST);
1513 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1514
1515 src0.readSrc();
1516 src1.readSrc();
1517 vcc.read();
1518
1522 assert(!(extData.NEG & 0x1));
1523 assert(!(extData.NEG & 0x2));
1524 assert(!(extData.NEG & 0x4));
1525
1526 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1527 if (wf->execMask(lane)) {
1528 vdst[lane] = src0[lane] - src1[lane]
1529 - bits(vcc.rawData(), lane);
1530 sdst.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane))
1531 > src0[lane] ? 1 : 0);
1532 }
1533 }
1534
1535 vdst.write();
1536 sdst.write();
1537 } // execute
1538 // --- Inst_VOP3__V_SUBBREV_CO_U32 class methods ---
1539
1541 InFmt_VOP3B *iFmt)
1542 : Inst_VOP3B(iFmt, "v_subbrev_co_u32")
1543 {
1544 setFlag(ALU);
1545 setFlag(WritesVCC);
1546 setFlag(ReadsVCC);
1547 } // Inst_VOP3__V_SUBBREV_CO_U32
1548
1550 {
1551 } // ~Inst_VOP3__V_SUBBREV_CO_U32
1552
1553 // --- description from .arch file ---
1554 // D.u = S1.u - S0.u - VCC[threadId];
1555 // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED
1556 // overflow.
1557 // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
1558 // source comes from the SGPR-pair at S2.u. SQ translates to V_SUBB_U32.
1559 void
1561 {
1562 Wavefront *wf = gpuDynInst->wavefront();
1563 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
1564 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
1565 ConstScalarOperandU64 sdst(gpuDynInst, instData.SDST);
1566 ScalarOperandU64 vcc(gpuDynInst, extData.SRC2);
1567 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1568
1569 src0.readSrc();
1570 src1.readSrc();
1571 vcc.read();
1572
1576 assert(!(extData.NEG & 0x1));
1577 assert(!(extData.NEG & 0x2));
1578 assert(!(extData.NEG & 0x4));
1579
1580 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1581 if (wf->execMask(lane)) {
1582 vdst[lane] = src1[lane] - src0[lane]
1583 - bits(vcc.rawData(), lane);
1584 sdst.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane))
1585 > src0[lane] ? 1 : 0);
1586 }
1587 }
1588
1589 vdst.write();
1590 sdst.write();
1591 } // execute
1592 // --- Inst_VOP3__V_ADD_F16 class methods ---
1593
1595 : Inst_VOP3A(iFmt, "v_add_f16", false)
1596 {
1597 setFlag(ALU);
1598 setFlag(F16);
1599 } // Inst_VOP3__V_ADD_F16
1600
1602 {
1603 } // ~Inst_VOP3__V_ADD_F16
1604
1605 // --- description from .arch file ---
1606 // D.f16 = S0.f16 + S1.f16.
1607 // Supports denormals, round mode, exception flags, saturation.
1608 void
1610 {
1612 } // execute
1613 // --- Inst_VOP3__V_SUB_F16 class methods ---
1614
1616 : Inst_VOP3A(iFmt, "v_sub_f16", false)
1617 {
1618 setFlag(ALU);
1619 setFlag(F16);
1620 } // Inst_VOP3__V_SUB_F16
1621
1623 {
1624 } // ~Inst_VOP3__V_SUB_F16
1625
1626 // --- description from .arch file ---
1627 // D.f16 = S0.f16 - S1.f16.
1628 // Supports denormals, round mode, exception flags, saturation.
1629 // SQ translates to V_ADD_F16.
1630 void
1632 {
1634 } // execute
1635 // --- Inst_VOP3__V_SUBREV_F16 class methods ---
1636
1638 : Inst_VOP3A(iFmt, "v_subrev_f16", false)
1639 {
1640 setFlag(ALU);
1641 setFlag(F16);
1642 } // Inst_VOP3__V_SUBREV_F16
1643
1645 {
1646 } // ~Inst_VOP3__V_SUBREV_F16
1647
1648 // --- description from .arch file ---
1649 // D.f16 = S1.f16 - S0.f16.
1650 // Supports denormals, round mode, exception flags, saturation.
1651 // SQ translates to V_ADD_F16.
1652 void
1654 {
1656 } // execute
1657 // --- Inst_VOP3__V_MUL_F16 class methods ---
1658
1660 : Inst_VOP3A(iFmt, "v_mul_f16", false)
1661 {
1662 setFlag(ALU);
1663 setFlag(F16);
1664 } // Inst_VOP3__V_MUL_F16
1665
1667 {
1668 } // ~Inst_VOP3__V_MUL_F16
1669
1670 // --- description from .arch file ---
1671 // D.f16 = S0.f16 * S1.f16.
1672 // Supports denormals, round mode, exception flags, saturation.
1673 void
1675 {
1677 } // execute
1678 // --- Inst_VOP3__V_MAC_F16 class methods ---
1679
1681 : Inst_VOP3A(iFmt, "v_mac_f16", false)
1682 {
1683 setFlag(ALU);
1684 setFlag(F16);
1685 setFlag(MAC);
1686 } // Inst_VOP3__V_MAC_F16
1687
1689 {
1690 } // ~Inst_VOP3__V_MAC_F16
1691
1692 // --- description from .arch file ---
1693 // D.f16 = S0.f16 * S1.f16 + D.f16.
1694 // Supports round mode, exception flags, saturation.
1695 // SQ translates this to V_MAD_F16.
1696 void
1698 {
1700 } // execute
1701 // --- Inst_VOP3__V_ADD_U16 class methods ---
1702
1704 : Inst_VOP3A(iFmt, "v_add_u16", false)
1705 {
1706 setFlag(ALU);
1707 } // Inst_VOP3__V_ADD_U16
1708
1710 {
1711 } // ~Inst_VOP3__V_ADD_U16
1712
1713 // --- description from .arch file ---
1714 // D.u16 = S0.u16 + S1.u16.
1715 // Supports saturation (unsigned 16-bit integer domain).
1716 void
1718 {
1719 Wavefront *wf = gpuDynInst->wavefront();
1720 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
1721 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
1722 VecOperandU16 vdst(gpuDynInst, instData.VDST);
1723
1724 src0.readSrc();
1725 src1.readSrc();
1726
1730 assert(!(instData.ABS & 0x1));
1731 assert(!(instData.ABS & 0x2));
1732 assert(!(instData.ABS & 0x4));
1733 assert(!(extData.NEG & 0x1));
1734 assert(!(extData.NEG & 0x2));
1735 assert(!(extData.NEG & 0x4));
1736
1737 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1738 if (wf->execMask(lane)) {
1739 vdst[lane] = src0[lane] + src1[lane];
1740 }
1741 }
1742
1743 vdst.write();
1744 } // execute
1745 // --- Inst_VOP3__V_SUB_U16 class methods ---
1746
1748 : Inst_VOP3A(iFmt, "v_sub_u16", false)
1749 {
1750 setFlag(ALU);
1751 } // Inst_VOP3__V_SUB_U16
1752
1754 {
1755 } // ~Inst_VOP3__V_SUB_U16
1756
1757 // --- description from .arch file ---
1758 // D.u16 = S0.u16 - S1.u16.
1759 // Supports saturation (unsigned 16-bit integer domain).
1760 void
1762 {
1763 Wavefront *wf = gpuDynInst->wavefront();
1764 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
1765 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
1766 VecOperandU16 vdst(gpuDynInst, instData.VDST);
1767
1768 src0.readSrc();
1769 src1.readSrc();
1770
1774 assert(!(instData.ABS & 0x1));
1775 assert(!(instData.ABS & 0x2));
1776 assert(!(instData.ABS & 0x4));
1777 assert(!(extData.NEG & 0x1));
1778 assert(!(extData.NEG & 0x2));
1779 assert(!(extData.NEG & 0x4));
1780
1781 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1782 if (wf->execMask(lane)) {
1783 vdst[lane] = src0[lane] - src1[lane];
1784 }
1785 }
1786
1787 vdst.write();
1788 } // execute
1789 // --- Inst_VOP3__V_SUBREV_U16 class methods ---
1790
1792 : Inst_VOP3A(iFmt, "v_subrev_u16", false)
1793 {
1794 setFlag(ALU);
1795 } // Inst_VOP3__V_SUBREV_U16
1796
1798 {
1799 } // ~Inst_VOP3__V_SUBREV_U16
1800
1801 // --- description from .arch file ---
1802 // D.u16 = S1.u16 - S0.u16.
1803 // Supports saturation (unsigned 16-bit integer domain).
1804 // SQ translates this to V_SUB_U16 with reversed operands.
1805 void
1807 {
1808 Wavefront *wf = gpuDynInst->wavefront();
1809 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
1810 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
1811 VecOperandU16 vdst(gpuDynInst, instData.VDST);
1812
1813 src0.readSrc();
1814 src1.readSrc();
1815
1819 assert(!(instData.ABS & 0x1));
1820 assert(!(instData.ABS & 0x2));
1821 assert(!(instData.ABS & 0x4));
1822 assert(!(extData.NEG & 0x1));
1823 assert(!(extData.NEG & 0x2));
1824 assert(!(extData.NEG & 0x4));
1825
1826 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1827 if (wf->execMask(lane)) {
1828 vdst[lane] = src1[lane] - src0[lane];
1829 }
1830 }
1831
1832 vdst.write();
1833 } // execute
1834 // --- Inst_VOP3__V_MUL_LO_U16 class methods ---
1835
1837 : Inst_VOP3A(iFmt, "v_mul_lo_u16", false)
1838 {
1839 setFlag(ALU);
1840 } // Inst_VOP3__V_MUL_LO_U16
1841
1843 {
1844 } // ~Inst_VOP3__V_MUL_LO_U16
1845
1846 // --- description from .arch file ---
1847 // D.u16 = S0.u16 * S1.u16.
1848 // Supports saturation (unsigned 16-bit integer domain).
1849 void
1851 {
1852 Wavefront *wf = gpuDynInst->wavefront();
1853 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
1854 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
1855 VecOperandU16 vdst(gpuDynInst, instData.VDST);
1856
1857 src0.readSrc();
1858 src1.readSrc();
1859
1863 assert(!(instData.ABS & 0x1));
1864 assert(!(instData.ABS & 0x2));
1865 assert(!(instData.ABS & 0x4));
1866 assert(!(extData.NEG & 0x1));
1867 assert(!(extData.NEG & 0x2));
1868 assert(!(extData.NEG & 0x4));
1869
1870 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1871 if (wf->execMask(lane)) {
1872 vdst[lane] = src0[lane] * src1[lane];
1873 }
1874 }
1875
1876 vdst.write();
1877 } // execute
1878 // --- Inst_VOP3__V_LSHLREV_B16 class methods ---
1879
1881 : Inst_VOP3A(iFmt, "v_lshlrev_b16", false)
1882 {
1883 setFlag(ALU);
1884 } // Inst_VOP3__V_LSHLREV_B16
1885
1887 {
1888 } // ~Inst_VOP3__V_LSHLREV_B16
1889
1890 // --- description from .arch file ---
1891 // D.u[15:0] = S1.u[15:0] << S0.u[3:0].
1892 // SQ translates this to an internal SP opcode.
1893 void
1895 {
1896 Wavefront *wf = gpuDynInst->wavefront();
1897 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
1898 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
1899 VecOperandU16 vdst(gpuDynInst, instData.VDST);
1900
1901 src0.readSrc();
1902 src1.readSrc();
1903
1907 assert(!(instData.ABS & 0x1));
1908 assert(!(instData.ABS & 0x2));
1909 assert(!(instData.ABS & 0x4));
1910 assert(!(extData.NEG & 0x1));
1911 assert(!(extData.NEG & 0x2));
1912 assert(!(extData.NEG & 0x4));
1913
1914 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1915 if (wf->execMask(lane)) {
1916 vdst[lane] = src1[lane] << bits(src0[lane], 3, 0);
1917 }
1918 }
1919
1920 vdst.write();
1921 } // execute
1922 // --- Inst_VOP3__V_LSHRREV_B16 class methods ---
1923
1925 : Inst_VOP3A(iFmt, "v_lshrrev_b16", false)
1926 {
1927 setFlag(ALU);
1928 } // Inst_VOP3__V_LSHRREV_B16
1929
1931 {
1932 } // ~Inst_VOP3__V_LSHRREV_B16
1933
1934 // --- description from .arch file ---
1935 // D.u[15:0] = S1.u[15:0] >> S0.u[3:0].
1936 // The vacated bits are set to zero.
1937 // SQ translates this to an internal SP opcode.
1938 void
1940 {
1941 Wavefront *wf = gpuDynInst->wavefront();
1942 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
1943 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
1944 VecOperandU16 vdst(gpuDynInst, instData.VDST);
1945
1946 src0.readSrc();
1947 src1.readSrc();
1948
1949 if (instData.ABS & 0x1) {
1950 src0.absModifier();
1951 }
1952
1953 if (instData.ABS & 0x2) {
1954 src1.absModifier();
1955 }
1956
1957 if (extData.NEG & 0x1) {
1958 src0.negModifier();
1959 }
1960
1961 if (extData.NEG & 0x2) {
1962 src1.negModifier();
1963 }
1964
1965 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1966 if (wf->execMask(lane)) {
1967 vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0);
1968 }
1969 }
1970
1971 vdst.write();
1972 } // execute
1973 // --- Inst_VOP3__V_ASHRREV_I16 class methods ---
1974
1976 : Inst_VOP3A(iFmt, "v_ashrrev_i16", false)
1977 {
1978 setFlag(ALU);
1979 } // Inst_VOP3__V_ASHRREV_I16
1980
1982 {
1983 } // ~Inst_VOP3__V_ASHRREV_I16
1984
1985 // --- description from .arch file ---
1986 // D.i[15:0] = signext(S1.i[15:0]) >> S0.i[3:0].
1987 // The vacated bits are set to the sign bit of the input value.
1988 // SQ translates this to an internal SP opcode.
1989 void
1991 {
1992 Wavefront *wf = gpuDynInst->wavefront();
1993 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
1994 ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
1995 VecOperandI16 vdst(gpuDynInst, instData.VDST);
1996
1997 src0.readSrc();
1998 src1.readSrc();
1999
2003 assert(!(instData.ABS & 0x1));
2004 assert(!(instData.ABS & 0x2));
2005 assert(!(instData.ABS & 0x4));
2006 assert(!(extData.NEG & 0x1));
2007 assert(!(extData.NEG & 0x2));
2008 assert(!(extData.NEG & 0x4));
2009
2010 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2011 if (wf->execMask(lane)) {
2012 vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0);
2013 }
2014 }
2015
2016 vdst.write();
2017 } // execute
2018 // --- Inst_VOP3__V_MAX_F16 class methods ---
2019
2021 : Inst_VOP3A(iFmt, "v_max_f16", false)
2022 {
2023 setFlag(ALU);
2024 setFlag(F16);
2025 } // Inst_VOP3__V_MAX_F16
2026
2028 {
2029 } // ~Inst_VOP3__V_MAX_F16
2030
2031 // --- description from .arch file ---
2032 // D.f16 = max(S0.f16, S1.f16).
2033 // IEEE compliant. Supports denormals, round mode, exception flags,
2034 // saturation.
2035 void
2037 {
2039 } // execute
2040 // --- Inst_VOP3__V_MIN_F16 class methods ---
2041
2043 : Inst_VOP3A(iFmt, "v_min_f16", false)
2044 {
2045 setFlag(ALU);
2046 setFlag(F16);
2047 } // Inst_VOP3__V_MIN_F16
2048
2050 {
2051 } // ~Inst_VOP3__V_MIN_F16
2052
2053 // --- description from .arch file ---
2054 // D.f16 = min(S0.f16, S1.f16).
2055 // IEEE compliant. Supports denormals, round mode, exception flags,
2056 // saturation.
2057 void
2059 {
2061 } // execute
2062 // --- Inst_VOP3__V_MAX_U16 class methods ---
2063
2065 : Inst_VOP3A(iFmt, "v_max_u16", false)
2066 {
2067 setFlag(ALU);
2068 } // Inst_VOP3__V_MAX_U16
2069
2071 {
2072 } // ~Inst_VOP3__V_MAX_U16
2073
2074 // --- description from .arch file ---
2075 // D.u[15:0] = max(S0.u[15:0], S1.u[15:0]).
2076 void
2078 {
2079 Wavefront *wf = gpuDynInst->wavefront();
2080 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
2081 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
2082 VecOperandU16 vdst(gpuDynInst, instData.VDST);
2083
2084 src0.readSrc();
2085 src1.readSrc();
2086
2087 if (instData.ABS & 0x1) {
2088 src0.absModifier();
2089 }
2090
2091 if (instData.ABS & 0x2) {
2092 src1.absModifier();
2093 }
2094
2095 if (extData.NEG & 0x1) {
2096 src0.negModifier();
2097 }
2098
2099 if (extData.NEG & 0x2) {
2100 src1.negModifier();
2101 }
2102
2103 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2104 if (wf->execMask(lane)) {
2105 vdst[lane] = std::max(src0[lane], src1[lane]);
2106 }
2107 }
2108
2109 vdst.write();
2110 } // execute
2111 // --- Inst_VOP3__V_MAX_I16 class methods ---
2112
2114 : Inst_VOP3A(iFmt, "v_max_i16", false)
2115 {
2116 setFlag(ALU);
2117 } // Inst_VOP3__V_MAX_I16
2118
2120 {
2121 } // ~Inst_VOP3__V_MAX_I16
2122
2123 // --- description from .arch file ---
2124 // D.i[15:0] = max(S0.i[15:0], S1.i[15:0]).
2125 void
2127 {
2128 Wavefront *wf = gpuDynInst->wavefront();
2129 ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
2130 ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
2131 VecOperandI16 vdst(gpuDynInst, instData.VDST);
2132
2133 src0.readSrc();
2134 src1.readSrc();
2135
2136 if (instData.ABS & 0x1) {
2137 src0.absModifier();
2138 }
2139
2140 if (instData.ABS & 0x2) {
2141 src1.absModifier();
2142 }
2143
2144 if (extData.NEG & 0x1) {
2145 src0.negModifier();
2146 }
2147
2148 if (extData.NEG & 0x2) {
2149 src1.negModifier();
2150 }
2151
2152 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2153 if (wf->execMask(lane)) {
2154 vdst[lane] = std::max(src0[lane], src1[lane]);
2155 }
2156 }
2157
2158 vdst.write();
2159 } // execute
2160 // --- Inst_VOP3__V_MIN_U16 class methods ---
2161
2163 : Inst_VOP3A(iFmt, "v_min_u16", false)
2164 {
2165 setFlag(ALU);
2166 } // Inst_VOP3__V_MIN_U16
2167
2169 {
2170 } // ~Inst_VOP3__V_MIN_U16
2171
2172 // --- description from .arch file ---
2173 // D.u[15:0] = min(S0.u[15:0], S1.u[15:0]).
2174 void
2176 {
2177 Wavefront *wf = gpuDynInst->wavefront();
2178 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
2179 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
2180 VecOperandU16 vdst(gpuDynInst, instData.VDST);
2181
2182 src0.readSrc();
2183 src1.readSrc();
2184
2185 if (instData.ABS & 0x1) {
2186 src0.absModifier();
2187 }
2188
2189 if (instData.ABS & 0x2) {
2190 src1.absModifier();
2191 }
2192
2193 if (extData.NEG & 0x1) {
2194 src0.negModifier();
2195 }
2196
2197 if (extData.NEG & 0x2) {
2198 src1.negModifier();
2199 }
2200
2201 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2202 if (wf->execMask(lane)) {
2203 vdst[lane] = std::min(src0[lane], src1[lane]);
2204 }
2205 }
2206
2207 vdst.write();
2208 } // execute
2209 // --- Inst_VOP3__V_MIN_I16 class methods ---
2210
2212 : Inst_VOP3A(iFmt, "v_min_i16", false)
2213 {
2214 setFlag(ALU);
2215 } // Inst_VOP3__V_MIN_I16
2216
2218 {
2219 } // ~Inst_VOP3__V_MIN_I16
2220
2221 // --- description from .arch file ---
2222 // D.i[15:0] = min(S0.i[15:0], S1.i[15:0]).
2223 void
2225 {
2226 Wavefront *wf = gpuDynInst->wavefront();
2227 ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
2228 ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
2229 VecOperandI16 vdst(gpuDynInst, instData.VDST);
2230
2231 src0.readSrc();
2232 src1.readSrc();
2233
2234 if (instData.ABS & 0x1) {
2235 src0.absModifier();
2236 }
2237
2238 if (instData.ABS & 0x2) {
2239 src1.absModifier();
2240 }
2241
2242 if (extData.NEG & 0x1) {
2243 src0.negModifier();
2244 }
2245
2246 if (extData.NEG & 0x2) {
2247 src1.negModifier();
2248 }
2249
2250 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2251 if (wf->execMask(lane)) {
2252 vdst[lane] = std::min(src0[lane], src1[lane]);
2253 }
2254 }
2255
2256 vdst.write();
2257 } // execute
2258 // --- Inst_VOP3__V_LDEXP_F16 class methods ---
2259
2261 : Inst_VOP3A(iFmt, "v_ldexp_f16", false)
2262 {
2263 setFlag(ALU);
2264 setFlag(F16);
2265 } // Inst_VOP3__V_LDEXP_F16
2266
2268 {
2269 } // ~Inst_VOP3__V_LDEXP_F16
2270
2271 // --- description from .arch file ---
2272 // D.f16 = S0.f16 * (2 ** S1.i16).
2273 void
2275 {
2277 } // execute
2278 // --- Inst_VOP3__V_ADD_U32 class methods ---
2279
2281 : Inst_VOP3A(iFmt, "v_add_u32", false)
2282 {
2283 setFlag(ALU);
2284 } // Inst_VOP3__V_ADD_U32
2285
2287 {
2288 } // ~Inst_VOP3__V_ADD_U32
2289
2290 // --- description from .arch file ---
2291 // D.u32 = S0.u32 + S1.u32.
2292 void
2294 {
2295 Wavefront *wf = gpuDynInst->wavefront();
2296 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
2297 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
2298 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2299
2300 src0.readSrc();
2301 src1.readSrc();
2302
2306 assert(!(instData.ABS & 0x1));
2307 assert(!(instData.ABS & 0x2));
2308 assert(!(instData.ABS & 0x4));
2309 assert(!(extData.NEG & 0x1));
2310 assert(!(extData.NEG & 0x2));
2311 assert(!(extData.NEG & 0x4));
2312
2313 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2314 if (wf->execMask(lane)) {
2315 vdst[lane] = src0[lane] + src1[lane];
2316 }
2317 }
2318
2319 vdst.write();
2320 } // execute
2321 // --- Inst_VOP3__V_SUB_U32 class methods ---
2322
2324 : Inst_VOP3A(iFmt, "v_sub_u32", false)
2325 {
2326 setFlag(ALU);
2327 } // Inst_VOP3__V_SUB_U32
2328
2330 {
2331 } // ~Inst_VOP3__V_SUB_U32
2332
2333 // --- description from .arch file ---
2334 // D.u32 = S0.u32 - S1.u32.
2335 void
2337 {
2338 Wavefront *wf = gpuDynInst->wavefront();
2339 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
2340 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
2341 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2342
2343 src0.readSrc();
2344 src1.readSrc();
2345
2349 assert(!(instData.ABS & 0x1));
2350 assert(!(instData.ABS & 0x2));
2351 assert(!(instData.ABS & 0x4));
2352 assert(!(extData.NEG & 0x1));
2353 assert(!(extData.NEG & 0x2));
2354 assert(!(extData.NEG & 0x4));
2355
2356 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2357 if (wf->execMask(lane)) {
2358 vdst[lane] = src0[lane] - src1[lane];
2359 }
2360 }
2361
2362 vdst.write();
2363 } // execute
2364 // --- Inst_VOP3__V_SUBREV_U32 class methods ---
2365
2367 : Inst_VOP3A(iFmt, "v_subrev_u32", false)
2368 {
2369 setFlag(ALU);
2370 } // Inst_VOP3__V_SUBREV_U32
2371
2373 {
2374 } // ~Inst_VOP3__V_SUBREV_U32
2375
2376 // --- description from .arch file ---
2377 // D.u32 = S1.u32 - S0.u32.
2378 void
2380 {
2381 Wavefront *wf = gpuDynInst->wavefront();
2382 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
2383 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
2384 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2385
2386 src0.readSrc();
2387 src1.readSrc();
2388
2392 assert(!(instData.ABS & 0x1));
2393 assert(!(instData.ABS & 0x2));
2394 assert(!(instData.ABS & 0x4));
2395 assert(!(extData.NEG & 0x1));
2396 assert(!(extData.NEG & 0x2));
2397 assert(!(extData.NEG & 0x4));
2398
2399 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2400 if (wf->execMask(lane)) {
2401 vdst[lane] = src1[lane] - src0[lane];
2402 }
2403 }
2404
2405 vdst.write();
2406 } // execute
2407 // --- Inst_VOP3__V_FMAC_F32 class methods ---
2408
2410 : Inst_VOP3A(iFmt, "v_fmac_f32", false)
2411 {
2412 setFlag(ALU);
2413 setFlag(F32);
2414 setFlag(FMA);
2415 } // Inst_VOP3__V_FMAC_F32
2416
2418 {
2419 } // ~Inst_VOP3__V_FMAC_F32
2420
2421 // --- description from .arch file ---
2422 // D.f = S0.f * S1.f + D.f.
2423 void
2425 {
2426 Wavefront *wf = gpuDynInst->wavefront();
2427 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
2428 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
2429 VecOperandF32 vdst(gpuDynInst, instData.VDST);
2430
2431 src0.readSrc();
2432 src1.readSrc();
2433 vdst.read();
2434
2435 panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
2436 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2437 panic_if(instData.OPSEL, "OPSEL not implemented for %s", _opcode);
2438
2439 if (instData.ABS & 0x1) {
2440 src0.absModifier();
2441 }
2442
2443 if (instData.ABS & 0x2) {
2444 src1.absModifier();
2445 }
2446
2447 if (instData.ABS & 0x4) {
2448 vdst.absModifier();
2449 }
2450
2451 if (extData.NEG & 0x1) {
2452 src0.negModifier();
2453 }
2454
2455 if (extData.NEG & 0x2) {
2456 src1.negModifier();
2457 }
2458
2459 if (extData.NEG & 0x4) {
2460 vdst.negModifier();
2461 }
2462
2463 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2464 if (wf->execMask(lane)) {
2465 float out = std::fma(src0[lane], src1[lane], vdst[lane]);
2466 out = omodModifier(out, extData.OMOD);
2467 if (instData.CLAMP) {
2468 out = std::clamp(vdst[lane], 0.0f, 1.0f);
2469 }
2470 vdst[lane] = out;
2471 }
2472 }
2473
2474 vdst.write();
2475 } // execute
2476 // --- Inst_VOP3__V_NOP class methods ---
2477
2479 : Inst_VOP3A(iFmt, "v_nop", false)
2480 {
2481 setFlag(Nop);
2482 setFlag(ALU);
2483 } // Inst_VOP3__V_NOP
2484
2486 {
2487 } // ~Inst_VOP3__V_NOP
2488
2489 // --- description from .arch file ---
2490 // Do nothing.
2491 void
2493 {
2494 } // execute
2495 // --- Inst_VOP3__V_MOV_B32 class methods ---
2496
2498 : Inst_VOP3A(iFmt, "v_mov_b32", false)
2499 {
2500 setFlag(ALU);
2501 } // Inst_VOP3__V_MOV_B32
2502
2504 {
2505 } // ~Inst_VOP3__V_MOV_B32
2506
2507 // --- description from .arch file ---
2508 // D.u = S0.u.
2509 // Input and output modifiers not supported; this is an untyped operation.
2510 void
2512 {
2513 Wavefront *wf = gpuDynInst->wavefront();
2514 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
2515 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2516
2517 src.readSrc();
2518
2519 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2520 if (wf->execMask(lane)) {
2521 vdst[lane] = src[lane];
2522 }
2523 }
2524
2525 vdst.write();
2526 } // execute
2527 // --- Inst_VOP3__V_CVT_I32_F64 class methods ---
2528
2530 : Inst_VOP3A(iFmt, "v_cvt_i32_f64", false)
2531 {
2532 setFlag(ALU);
2533 setFlag(F64);
2534 } // Inst_VOP3__V_CVT_I32_F64
2535
2537 {
2538 } // ~Inst_VOP3__V_CVT_I32_F64
2539
2540 // --- description from .arch file ---
2541 // D.i = (int)S0.d.
2542 // Out-of-range floating point values (including infinity) saturate. NaN is
2543 // --- converted to 0.
2544 void
2546 {
2547 Wavefront *wf = gpuDynInst->wavefront();
2548 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
2549 VecOperandI32 vdst(gpuDynInst, instData.VDST);
2550
2551 src.readSrc();
2552
2553 if (instData.ABS & 0x1) {
2554 src.absModifier();
2555 }
2556
2557 if (extData.NEG & 0x1) {
2558 src.negModifier();
2559 }
2560
2561 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2562 if (wf->execMask(lane)) {
2563 int exp;
2564 std::frexp(src[lane],&exp);
2565 if (std::isnan(src[lane])) {
2566 vdst[lane] = 0;
2567 } else if (std::isinf(src[lane]) || exp > 30) {
2568 if (std::signbit(src[lane])) {
2569 vdst[lane] = INT_MIN;
2570 } else {
2571 vdst[lane] = INT_MAX;
2572 }
2573 } else {
2574 vdst[lane] = (VecElemI32)src[lane];
2575 }
2576 }
2577 }
2578
2579 vdst.write();
2580 } // execute
2581 // --- Inst_VOP3__V_CVT_F64_I32 class methods ---
2582
2584 : Inst_VOP3A(iFmt, "v_cvt_f64_i32", false)
2585 {
2586 setFlag(ALU);
2587 setFlag(F64);
2588 } // Inst_VOP3__V_CVT_F64_I32
2589
2591 {
2592 } // ~Inst_VOP3__V_CVT_F64_I32
2593
2594 // --- description from .arch file ---
2595 // D.d = (double)S0.i.
2596 void
2598 {
2599 Wavefront *wf = gpuDynInst->wavefront();
2600 ConstVecOperandI32 src(gpuDynInst, extData.SRC0);
2601 VecOperandF64 vdst(gpuDynInst, instData.VDST);
2602
2603 src.readSrc();
2604
2605 if (instData.ABS & 0x1) {
2606 src.absModifier();
2607 }
2608
2609 if (extData.NEG & 0x1) {
2610 src.negModifier();
2611 }
2612
2613 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2614 if (wf->execMask(lane)) {
2615 vdst[lane] = (VecElemF64)src[lane];
2616 }
2617 }
2618
2619 vdst.write();
2620 } // execute
2621 // --- Inst_VOP3__V_CVT_F32_I32 class methods ---
2622
2624 : Inst_VOP3A(iFmt, "v_cvt_f32_i32", false)
2625 {
2626 setFlag(ALU);
2627 setFlag(F32);
2628 } // Inst_VOP3__V_CVT_F32_I32
2629
2631 {
2632 } // ~Inst_VOP3__V_CVT_F32_I32
2633
2634 // --- description from .arch file ---
2635 // D.f = (float)S0.i.
2636 void
2638 {
2639 Wavefront *wf = gpuDynInst->wavefront();
2640 VecOperandI32 src(gpuDynInst, extData.SRC0);
2641 VecOperandF32 vdst(gpuDynInst, instData.VDST);
2642
2643 src.readSrc();
2644
2648 assert(!(instData.ABS & 0x1));
2649 assert(!(instData.ABS & 0x2));
2650 assert(!(instData.ABS & 0x4));
2651 assert(!(extData.NEG & 0x1));
2652 assert(!(extData.NEG & 0x2));
2653 assert(!(extData.NEG & 0x4));
2654
2655 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2656 if (wf->execMask(lane)) {
2657 vdst[lane] = (VecElemF32)src[lane];
2658 }
2659 }
2660
2661 vdst.write();
2662 } // execute
2663 // --- Inst_VOP3__V_CVT_F32_U32 class methods ---
2664
2666 : Inst_VOP3A(iFmt, "v_cvt_f32_u32", false)
2667 {
2668 setFlag(ALU);
2669 setFlag(F32);
2670 } // Inst_VOP3__V_CVT_F32_U32
2671
2673 {
2674 } // ~Inst_VOP3__V_CVT_F32_U32
2675
2676 // --- description from .arch file ---
2677 // D.f = (float)S0.u.
2678 void
2680 {
2681 Wavefront *wf = gpuDynInst->wavefront();
2682 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
2683 VecOperandF32 vdst(gpuDynInst, instData.VDST);
2684
2685 src.readSrc();
2686
2687 if (instData.ABS & 0x1) {
2688 src.absModifier();
2689 }
2690
2691 if (extData.NEG & 0x1) {
2692 src.negModifier();
2693 }
2694
2695 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2696 if (wf->execMask(lane)) {
2697 vdst[lane] = (VecElemF32)src[lane];
2698 }
2699 }
2700
2701 vdst.write();
2702 } // execute
2703 // --- Inst_VOP3__V_CVT_U32_F32 class methods ---
2704
2706 : Inst_VOP3A(iFmt, "v_cvt_u32_f32", false)
2707 {
2708 setFlag(ALU);
2709 setFlag(F32);
2710 } // Inst_VOP3__V_CVT_U32_F32
2711
2713 {
2714 } // ~Inst_VOP3__V_CVT_U32_F32
2715
2716 // --- description from .arch file ---
2717 // D.u = (unsigned)S0.f.
2718 // Out-of-range floating point values (including infinity) saturate. NaN is
2719 // --- converted to 0.
2720 void
2722 {
2723 Wavefront *wf = gpuDynInst->wavefront();
2724 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
2725 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2726
2727 src.readSrc();
2728
2729 if (instData.ABS & 0x1) {
2730 src.absModifier();
2731 }
2732
2733 if (extData.NEG & 0x1) {
2734 src.negModifier();
2735 }
2736
2737 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2738 if (wf->execMask(lane)) {
2739 int exp;
2740 std::frexp(src[lane],&exp);
2741 if (std::isnan(src[lane])) {
2742 vdst[lane] = 0;
2743 } else if (std::isinf(src[lane])) {
2744 if (std::signbit(src[lane])) {
2745 vdst[lane] = 0;
2746 } else {
2747 vdst[lane] = UINT_MAX;
2748 }
2749 } else if (exp > 31) {
2750 vdst[lane] = UINT_MAX;
2751 } else {
2752 vdst[lane] = (VecElemU32)src[lane];
2753 }
2754 }
2755 }
2756
2757 vdst.write();
2758 } // execute
2759 // --- Inst_VOP3__V_CVT_I32_F32 class methods ---
2760
2762 : Inst_VOP3A(iFmt, "v_cvt_i32_f32", false)
2763 {
2764 setFlag(ALU);
2765 setFlag(F32);
2766 } // Inst_VOP3__V_CVT_I32_F32
2767
2769 {
2770 } // ~Inst_VOP3__V_CVT_I32_F32
2771
2772 // --- description from .arch file ---
2773 // D.i = (int)S0.f.
2774 // Out-of-range floating point values (including infinity) saturate. NaN is
2775 // --- converted to 0.
2776 void
2778 {
2779 Wavefront *wf = gpuDynInst->wavefront();
2780 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
2781 VecOperandI32 vdst(gpuDynInst, instData.VDST);
2782
2783 src.readSrc();
2784
2785 if (instData.ABS & 0x1) {
2786 src.absModifier();
2787 }
2788
2789 if (extData.NEG & 0x1) {
2790 src.negModifier();
2791 }
2792
2796 assert(!(instData.ABS & 0x2));
2797 assert(!(instData.ABS & 0x4));
2798 assert(!(extData.NEG & 0x2));
2799 assert(!(extData.NEG & 0x4));
2800
2801 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2802 if (wf->execMask(lane)) {
2803 int exp;
2804 std::frexp(src[lane],&exp);
2805 if (std::isnan(src[lane])) {
2806 vdst[lane] = 0;
2807 } else if (std::isinf(src[lane]) || exp > 30) {
2808 if (std::signbit(src[lane])) {
2809 vdst[lane] = INT_MIN;
2810 } else {
2811 vdst[lane] = INT_MAX;
2812 }
2813 } else {
2814 vdst[lane] = (VecElemI32)src[lane];
2815 }
2816 }
2817 }
2818
2819 vdst.write();
2820 } // execute
2821 // --- Inst_VOP3__V_MOV_FED_B32 class methods ---
2822
2824 : Inst_VOP3A(iFmt, "v_mov_fed_b32", false)
2825 {
2826 setFlag(ALU);
2827 } // Inst_VOP3__V_MOV_FED_B32
2828
2830 {
2831 } // ~Inst_VOP3__V_MOV_FED_B32
2832
2833 // --- description from .arch file ---
2834 // D.u = S0.u;
2835 // Introduce EDC double error upon write to dest vgpr without causing an
2836 // --- exception.
2837 // Input and output modifiers not supported; this is an untyped operation.
2838 void
2840 {
2842 } // execute
2843 // --- Inst_VOP3__V_CVT_F16_F32 class methods ---
2844
2846 : Inst_VOP3A(iFmt, "v_cvt_f16_f32", false)
2847 {
2848 setFlag(ALU);
2849 setFlag(F32);
2850 } // Inst_VOP3__V_CVT_F16_F32
2851
2853 {
2854 } // ~Inst_VOP3__V_CVT_F16_F32
2855
2856 // --- description from .arch file ---
2857 // D.f16 = flt32_to_flt16(S0.f).
2858 // Supports input modifiers and creates FP16 denormals when appropriate.
2859 void
2861 {
2862 Wavefront *wf = gpuDynInst->wavefront();
2863 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
2864 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2865
2866 src0.readSrc();
2867 vdst.read();
2868
2869 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
2870 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2871
2872 unsigned abs = instData.ABS;
2873 unsigned neg = extData.NEG;
2874 int opsel = instData.OPSEL;
2875
2876 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2877 if (wf->execMask(lane)) {
2878 float tmp = src0[lane];
2879
2880 if ((abs & 1) && (tmp < 0)) tmp = -tmp;
2881 if (neg & 1) tmp = -tmp;
2882
2883 tmp = omodModifier(tmp, extData.OMOD);
2884 if (instData.CLAMP) {
2885 tmp = std::clamp(tmp, 0.0f, 1.0f);
2886 }
2887
2888 AMDGPU::mxfloat16 out(tmp);
2889
2890 // If opsel[3] use upper 16-bits of dest, otherwise lower.
2891 if (opsel & 8) {
2892 replaceBits(vdst[lane], 31, 16, (out.data >> 16));
2893 } else {
2894 replaceBits(vdst[lane], 15, 0, (out.data >> 16));
2895 }
2896 }
2897 }
2898
2899 vdst.write();
2900 } // execute
2901 // --- Inst_VOP3__V_CVT_F32_F16 class methods ---
2902
2904 : Inst_VOP3A(iFmt, "v_cvt_f32_f16", false)
2905 {
2906 setFlag(ALU);
2907 setFlag(F32);
2908 } // Inst_VOP3__V_CVT_F32_F16
2909
2911 {
2912 } // ~Inst_VOP3__V_CVT_F32_F16
2913
2914 // --- description from .arch file ---
2915 // D.f = flt16_to_flt32(S0.f16).
2916 // FP16 denormal inputs are always accepted.
2917 void
2919 {
2920 Wavefront *wf = gpuDynInst->wavefront();
2921 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
2922 VecOperandF32 vdst(gpuDynInst, instData.VDST);
2923
2924 src0.readSrc();
2925
2926 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
2927 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2928 panic_if(instData.OPSEL, "OPSEL not implemented for %s", _opcode);
2929
2930 unsigned abs = instData.ABS;
2931 unsigned neg = extData.NEG;
2932
2933 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2934 if (wf->execMask(lane)) {
2935 AMDGPU::mxfloat16 tmp(src0[lane]);
2936
2937 if ((abs & 1) && (tmp < 0)) tmp = -tmp;
2938 if (neg & 1) tmp = -tmp;
2939
2940 float out = omodModifier(float(tmp), extData.OMOD);
2941 if (instData.CLAMP) {
2942 out = std::clamp(out, 0.0f, 1.0f);
2943 }
2944
2945 vdst[lane] = out;
2946 }
2947 }
2948
2949 vdst.write();
2950 } // execute
2951 // --- Inst_VOP3__V_CVT_RPI_I32_F32 class methods ---
2952
2954 InFmt_VOP3A *iFmt)
2955 : Inst_VOP3A(iFmt, "v_cvt_rpi_i32_f32", false)
2956 {
2957 setFlag(ALU);
2958 setFlag(F32);
2959 } // Inst_VOP3__V_CVT_RPI_I32_F32
2960
2962 {
2963 } // ~Inst_VOP3__V_CVT_RPI_I32_F32
2964
2965 // --- description from .arch file ---
2966 // D.i = (int)floor(S0.f + 0.5).
2967 void
2969 {
2970 Wavefront *wf = gpuDynInst->wavefront();
2971 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
2972 VecOperandI32 vdst(gpuDynInst, instData.VDST);
2973
2974 src.readSrc();
2975
2976 if (instData.ABS & 0x1) {
2977 src.absModifier();
2978 }
2979
2980 if (extData.NEG & 0x1) {
2981 src.negModifier();
2982 }
2983
2984 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2985 if (wf->execMask(lane)) {
2986 vdst[lane] = (VecElemI32)std::floor(src[lane] + 0.5);
2987 }
2988 }
2989
2990 vdst.write();
2991 } // execute
2992 // --- Inst_VOP3__V_CVT_FLR_I32_F32 class methods ---
2993
2995 InFmt_VOP3A *iFmt)
2996 : Inst_VOP3A(iFmt, "v_cvt_flr_i32_f32", false)
2997 {
2998 setFlag(ALU);
2999 setFlag(F32);
3000 } // Inst_VOP3__V_CVT_FLR_I32_F32
3001
3003 {
3004 } // ~Inst_VOP3__V_CVT_FLR_I32_F32
3005
3006 // --- description from .arch file ---
3007 // D.i = (int)floor(S0.f).
3008 void
3010 {
3011 Wavefront *wf = gpuDynInst->wavefront();
3012 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3013 VecOperandI32 vdst(gpuDynInst, instData.VDST);
3014
3015 src.readSrc();
3016
3017 if (instData.ABS & 0x1) {
3018 src.absModifier();
3019 }
3020
3021 if (extData.NEG & 0x1) {
3022 src.negModifier();
3023 }
3024
3025 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3026 if (wf->execMask(lane)) {
3027 vdst[lane] = (VecElemI32)std::floor(src[lane]);
3028 }
3029 }
3030
3031 vdst.write();
3032 } // execute
3033 // --- Inst_VOP3__V_CVT_OFF_F32_I4 class methods ---
3034
3036 : Inst_VOP3A(iFmt, "v_cvt_off_f32_i4", false)
3037 {
3038 setFlag(ALU);
3039 setFlag(F32);
3040 } // Inst_VOP3__V_CVT_OFF_F32_I4
3041
3043 {
3044 } // ~Inst_VOP3__V_CVT_OFF_F32_I4
3045
3046 // --- description from .arch file ---
3047 // 4-bit signed int to 32-bit float. Used for interpolation in shader.
3048 void
3050 {
3051 // Could not parse sq_uc.arch desc field
3053 } // execute
3054 // --- Inst_VOP3__V_CVT_F32_F64 class methods ---
3055
3057 : Inst_VOP3A(iFmt, "v_cvt_f32_f64", false)
3058 {
3059 setFlag(ALU);
3060 setFlag(F64);
3061 } // Inst_VOP3__V_CVT_F32_F64
3062
3064 {
3065 } // ~Inst_VOP3__V_CVT_F32_F64
3066
3067 // --- description from .arch file ---
3068 // D.f = (float)S0.d.
3069 void
3071 {
3072 Wavefront *wf = gpuDynInst->wavefront();
3073 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
3074 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3075
3076 src.readSrc();
3077
3078 if (instData.ABS & 0x1) {
3079 src.absModifier();
3080 }
3081
3082 if (extData.NEG & 0x1) {
3083 src.negModifier();
3084 }
3085
3089 assert(!(instData.ABS & 0x2));
3090 assert(!(instData.ABS & 0x4));
3091 assert(!(extData.NEG & 0x2));
3092 assert(!(extData.NEG & 0x4));
3093
3094 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3095 if (wf->execMask(lane)) {
3096 vdst[lane] = (VecElemF32)src[lane];
3097 }
3098 }
3099
3100 vdst.write();
3101 } // execute
3102 // --- Inst_VOP3__V_CVT_F64_F32 class methods ---
3103
3105 : Inst_VOP3A(iFmt, "v_cvt_f64_f32", false)
3106 {
3107 setFlag(ALU);
3108 setFlag(F64);
3109 } // Inst_VOP3__V_CVT_F64_F32
3110
3112 {
3113 } // ~Inst_VOP3__V_CVT_F64_F32
3114
3115 // --- description from .arch file ---
3116 // D.d = (double)S0.f.
3117 void
3119 {
3120 Wavefront *wf = gpuDynInst->wavefront();
3121 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3122 VecOperandF64 vdst(gpuDynInst, instData.VDST);
3123
3124 src.readSrc();
3125
3126 if (instData.ABS & 0x1) {
3127 src.absModifier();
3128 }
3129
3130 if (extData.NEG & 0x1) {
3131 src.negModifier();
3132 }
3133
3137 assert(!(instData.ABS & 0x2));
3138 assert(!(instData.ABS & 0x4));
3139 assert(!(extData.NEG & 0x2));
3140 assert(!(extData.NEG & 0x4));
3141
3142 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3143 if (wf->execMask(lane)) {
3144 vdst[lane] = (VecElemF64)src[lane];
3145 }
3146 }
3147
3148 vdst.write();
3149 } // execute
3150 // --- Inst_VOP3__V_CVT_F32_UBYTE0 class methods ---
3151
3153 : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte0", false)
3154 {
3155 setFlag(ALU);
3156 setFlag(F32);
3157 } // Inst_VOP3__V_CVT_F32_UBYTE0
3158
3160 {
3161 } // ~Inst_VOP3__V_CVT_F32_UBYTE0
3162
3163 // --- description from .arch file ---
3164 // D.f = (float)(S0.u[7:0]).
3165 void
3167 {
3168 Wavefront *wf = gpuDynInst->wavefront();
3169 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
3170 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3171
3172 src.readSrc();
3173
3174 if (instData.ABS & 0x1) {
3175 src.absModifier();
3176 }
3177
3178 if (extData.NEG & 0x1) {
3179 src.negModifier();
3180 }
3181
3182 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3183 if (wf->execMask(lane)) {
3184 vdst[lane] = (VecElemF32)bits(src[lane], 7, 0);
3185 }
3186 }
3187
3188 vdst.write();
3189 } // execute
3190 // --- Inst_VOP3__V_CVT_F32_UBYTE1 class methods ---
3191
3193 : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte1", false)
3194 {
3195 setFlag(ALU);
3196 setFlag(F32);
3197 } // Inst_VOP3__V_CVT_F32_UBYTE1
3198
3200 {
3201 } // ~Inst_VOP3__V_CVT_F32_UBYTE1
3202
3203 // --- description from .arch file ---
3204 // D.f = (float)(S0.u[15:8]).
3205 void
3207 {
3208 Wavefront *wf = gpuDynInst->wavefront();
3209 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
3210 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3211
3212 src.readSrc();
3213
3214 if (instData.ABS & 0x1) {
3215 src.absModifier();
3216 }
3217
3218 if (extData.NEG & 0x1) {
3219 src.negModifier();
3220 }
3221
3222 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3223 if (wf->execMask(lane)) {
3224 vdst[lane] = (VecElemF32)bits(src[lane], 15, 8);
3225 }
3226 }
3227
3228 vdst.write();
3229 } // execute
3230 // --- Inst_VOP3__V_CVT_F32_UBYTE2 class methods ---
3231
3233 : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte2", false)
3234 {
3235 setFlag(ALU);
3236 setFlag(F32);
3237 } // Inst_VOP3__V_CVT_F32_UBYTE2
3238
3240 {
3241 } // ~Inst_VOP3__V_CVT_F32_UBYTE2
3242
3243 // --- description from .arch file ---
3244 // D.f = (float)(S0.u[23:16]).
3245 void
3247 {
3248 Wavefront *wf = gpuDynInst->wavefront();
3249 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
3250 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3251
3252 src.readSrc();
3253
3254 if (instData.ABS & 0x1) {
3255 src.absModifier();
3256 }
3257
3258 if (extData.NEG & 0x1) {
3259 src.negModifier();
3260 }
3261
3262 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3263 if (wf->execMask(lane)) {
3264 vdst[lane] = (VecElemF32)bits(src[lane], 23, 16);
3265 }
3266 }
3267
3268 vdst.write();
3269 } // execute
3270 // --- Inst_VOP3__V_CVT_F32_UBYTE3 class methods ---
3271
3273 : Inst_VOP3A(iFmt, "v_cvt_f32_ubyte3", false)
3274 {
3275 setFlag(ALU);
3276 setFlag(F32);
3277 } // Inst_VOP3__V_CVT_F32_UBYTE3
3278
3280 {
3281 } // ~Inst_VOP3__V_CVT_F32_UBYTE3
3282
3283 // --- description from .arch file ---
3284 // D.f = (float)(S0.u[31:24]).
3285 void
3287 {
3288 Wavefront *wf = gpuDynInst->wavefront();
3289 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
3290 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3291
3292 src.readSrc();
3293
3294 if (instData.ABS & 0x1) {
3295 src.absModifier();
3296 }
3297
3298 if (extData.NEG & 0x1) {
3299 src.negModifier();
3300 }
3301
3302 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3303 if (wf->execMask(lane)) {
3304 vdst[lane] = (VecElemF32)bits(src[lane], 31, 24);
3305 }
3306 }
3307
3308 vdst.write();
3309 } // execute
3310 // --- Inst_VOP3__V_CVT_U32_F64 class methods ---
3311
3313 : Inst_VOP3A(iFmt, "v_cvt_u32_f64", false)
3314 {
3315 setFlag(ALU);
3316 setFlag(F64);
3317 } // Inst_VOP3__V_CVT_U32_F64
3318
3320 {
3321 } // ~Inst_VOP3__V_CVT_U32_F64
3322
3323 // --- description from .arch file ---
3324 // D.u = (unsigned)S0.d.
3325 // Out-of-range floating point values (including infinity) saturate. NaN is
3326 // --- converted to 0.
3327 void
3329 {
3330 Wavefront *wf = gpuDynInst->wavefront();
3331 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
3332 VecOperandU32 vdst(gpuDynInst, instData.VDST);
3333
3334 src.readSrc();
3335
3336 if (instData.ABS & 0x1) {
3337 src.absModifier();
3338 }
3339
3340 if (extData.NEG & 0x1) {
3341 src.negModifier();
3342 }
3343
3344 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3345 if (wf->execMask(lane)) {
3346 int exp;
3347 std::frexp(src[lane],&exp);
3348 if (std::isnan(src[lane])) {
3349 vdst[lane] = 0;
3350 } else if (std::isinf(src[lane])) {
3351 if (std::signbit(src[lane])) {
3352 vdst[lane] = 0;
3353 } else {
3354 vdst[lane] = UINT_MAX;
3355 }
3356 } else if (exp > 31) {
3357 vdst[lane] = UINT_MAX;
3358 } else {
3359 vdst[lane] = (VecElemU32)src[lane];
3360 }
3361 }
3362 }
3363
3364 vdst.write();
3365 } // execute
3366 // --- Inst_VOP3__V_CVT_F64_U32 class methods ---
3367
3369 : Inst_VOP3A(iFmt, "v_cvt_f64_u32", false)
3370 {
3371 setFlag(ALU);
3372 setFlag(F64);
3373 } // Inst_VOP3__V_CVT_F64_U32
3374
3376 {
3377 } // ~Inst_VOP3__V_CVT_F64_U32
3378
3379 // --- description from .arch file ---
3380 // D.d = (double)S0.u.
3381 void
3383 {
3384 Wavefront *wf = gpuDynInst->wavefront();
3385 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
3386 VecOperandF64 vdst(gpuDynInst, instData.VDST);
3387
3388 src.readSrc();
3389
3390 if (instData.ABS & 0x1) {
3391 src.absModifier();
3392 }
3393
3394 if (extData.NEG & 0x1) {
3395 src.negModifier();
3396 }
3397
3398 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3399 if (wf->execMask(lane)) {
3400 vdst[lane] = (VecElemF64)src[lane];
3401 }
3402 }
3403
3404 vdst.write();
3405 } // execute
3406 // --- Inst_VOP3__V_TRUNC_F64 class methods ---
3407
3409 : Inst_VOP3A(iFmt, "v_trunc_f64", false)
3410 {
3411 setFlag(ALU);
3412 setFlag(F64);
3413 } // Inst_VOP3__V_TRUNC_F64
3414
3416 {
3417 } // ~Inst_VOP3__V_TRUNC_F64
3418
3419 // --- description from .arch file ---
3420 // D.d = trunc(S0.d), return integer part of S0.d.
3421 void
3423 {
3424 Wavefront *wf = gpuDynInst->wavefront();
3425 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
3426 VecOperandF64 vdst(gpuDynInst, instData.VDST);
3427
3428 src.readSrc();
3429
3430 if (instData.ABS & 0x1) {
3431 src.absModifier();
3432 }
3433
3434 if (extData.NEG & 0x1) {
3435 src.negModifier();
3436 }
3437
3438 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3439 if (wf->execMask(lane)) {
3440 vdst[lane] = std::trunc(src[lane]);
3441 }
3442 }
3443
3444 vdst.write();
3445 } // execute
3446 // --- Inst_VOP3__V_CEIL_F64 class methods ---
3447
3449 : Inst_VOP3A(iFmt, "v_ceil_f64", false)
3450 {
3451 setFlag(ALU);
3452 setFlag(F64);
3453 } // Inst_VOP3__V_CEIL_F64
3454
3456 {
3457 } // ~Inst_VOP3__V_CEIL_F64
3458
3459 // --- description from .arch file ---
3460 // D.d = trunc(S0.d);
3461 // if (S0.d > 0.0 && S0.d != D.d) then D.d += 1.0.
3462 void
3464 {
3465 Wavefront *wf = gpuDynInst->wavefront();
3466 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
3467 VecOperandF64 vdst(gpuDynInst, instData.VDST);
3468
3469 src.readSrc();
3470
3471 if (instData.ABS & 0x1) {
3472 src.absModifier();
3473 }
3474
3475 if (extData.NEG & 0x1) {
3476 src.negModifier();
3477 }
3478
3479 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3480 if (wf->execMask(lane)) {
3481 vdst[lane] = std::ceil(src[lane]);
3482 }
3483 }
3484
3485 vdst.write();
3486 } // execute
3487 // --- Inst_VOP3__V_RNDNE_F64 class methods ---
3488
3490 : Inst_VOP3A(iFmt, "v_rndne_f64", false)
3491 {
3492 setFlag(ALU);
3493 setFlag(F64);
3494 } // Inst_VOP3__V_RNDNE_F64
3495
3497 {
3498 } // ~Inst_VOP3__V_RNDNE_F64
3499
3500 // --- description from .arch file ---
3501 // D.d = round_nearest_even(S0.d).
3502 void
3504 {
3505 Wavefront *wf = gpuDynInst->wavefront();
3506 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
3507 VecOperandF64 vdst(gpuDynInst, instData.VDST);
3508
3509 src.readSrc();
3510
3511 if (instData.ABS & 0x1) {
3512 src.absModifier();
3513 }
3514
3515 if (extData.NEG & 0x1) {
3516 src.negModifier();
3517 }
3518
3519 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3520 if (wf->execMask(lane)) {
3521 vdst[lane] = roundNearestEven(src[lane]);
3522 }
3523 }
3524
3525 vdst.write();
3526 } // execute
3527 // --- Inst_VOP3__V_FLOOR_F64 class methods ---
3528
3530 : Inst_VOP3A(iFmt, "v_floor_f64", false)
3531 {
3532 setFlag(ALU);
3533 setFlag(F64);
3534 } // Inst_VOP3__V_FLOOR_F64
3535
3537 {
3538 } // ~Inst_VOP3__V_FLOOR_F64
3539
3540 // --- description from .arch file ---
3541 // D.d = trunc(S0.d);
3542 // if (S0.d < 0.0 && S0.d != D.d) then D.d += -1.0.
3543 void
3545 {
3546 Wavefront *wf = gpuDynInst->wavefront();
3547 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
3548 VecOperandF64 vdst(gpuDynInst, instData.VDST);
3549
3550 src.readSrc();
3551
3552 if (instData.ABS & 0x1) {
3553 src.absModifier();
3554 }
3555
3556 if (extData.NEG & 0x1) {
3557 src.negModifier();
3558 }
3559
3560 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3561 if (wf->execMask(lane)) {
3562 vdst[lane] = std::floor(src[lane]);
3563 }
3564 }
3565
3566 vdst.write();
3567 } // execute
3568 // --- Inst_VOP3__V_FRACT_F32 class methods ---
3569
3571 : Inst_VOP3A(iFmt, "v_fract_f32", false)
3572 {
3573 setFlag(ALU);
3574 setFlag(F32);
3575 } // Inst_VOP3__V_FRACT_F32
3576
3578 {
3579 } // ~Inst_VOP3__V_FRACT_F32
3580
3581 // --- description from .arch file ---
3582 // D.f = S0.f - floor(S0.f).
3583 void
3585 {
3586 Wavefront *wf = gpuDynInst->wavefront();
3587 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3588 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3589
3590 src.readSrc();
3591
3592 if (instData.ABS & 0x1) {
3593 src.absModifier();
3594 }
3595
3596 if (extData.NEG & 0x1) {
3597 src.negModifier();
3598 }
3599
3600 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3601 if (wf->execMask(lane)) {
3602 VecElemF32 int_part(0.0);
3603 vdst[lane] = std::modf(src[lane], &int_part);
3604 }
3605 }
3606
3607 vdst.write();
3608 } // execute
3609 // --- Inst_VOP3__V_TRUNC_F32 class methods ---
3610
3612 : Inst_VOP3A(iFmt, "v_trunc_f32", false)
3613 {
3614 setFlag(ALU);
3615 setFlag(F32);
3616 } // Inst_VOP3__V_TRUNC_F32
3617
3619 {
3620 } // ~Inst_VOP3__V_TRUNC_F32
3621
3622 // --- description from .arch file ---
3623 // D.f = trunc(S0.f), return integer part of S0.f.
3624 void
3626 {
3627 Wavefront *wf = gpuDynInst->wavefront();
3628 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3629 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3630
3631 src.readSrc();
3632
3633 if (instData.ABS & 0x1) {
3634 src.absModifier();
3635 }
3636
3637 if (extData.NEG & 0x1) {
3638 src.negModifier();
3639 }
3640
3641 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3642 if (wf->execMask(lane)) {
3643 vdst[lane] = std::trunc(src[lane]);
3644 }
3645 }
3646
3647 vdst.write();
3648 } // execute
3649 // --- Inst_VOP3__V_CEIL_F32 class methods ---
3650
3652 : Inst_VOP3A(iFmt, "v_ceil_f32", false)
3653 {
3654 setFlag(ALU);
3655 setFlag(F32);
3656 } // Inst_VOP3__V_CEIL_F32
3657
3659 {
3660 } // ~Inst_VOP3__V_CEIL_F32
3661
3662 // --- description from .arch file ---
3663 // D.f = trunc(S0.f);
3664 // if (S0.f > 0.0 && S0.f != D.f) then D.f += 1.0.
3665 void
3667 {
3668 Wavefront *wf = gpuDynInst->wavefront();
3669 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3670 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3671
3672 src.readSrc();
3673
3674 if (instData.ABS & 0x1) {
3675 src.absModifier();
3676 }
3677
3678 if (extData.NEG & 0x1) {
3679 src.negModifier();
3680 }
3681
3682 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3683 if (wf->execMask(lane)) {
3684 vdst[lane] = std::ceil(src[lane]);
3685 }
3686 }
3687
3688 vdst.write();
3689 } // execute
3690 // --- Inst_VOP3__V_RNDNE_F32 class methods ---
3691
3693 : Inst_VOP3A(iFmt, "v_rndne_f32", false)
3694 {
3695 setFlag(ALU);
3696 setFlag(F32);
3697 } // Inst_VOP3__V_RNDNE_F32
3698
3700 {
3701 } // ~Inst_VOP3__V_RNDNE_F32
3702
3703 // --- description from .arch file ---
3704 // D.f = round_nearest_even(S0.f).
3705 void
3707 {
3708 Wavefront *wf = gpuDynInst->wavefront();
3709 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3710 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3711
3712 src.readSrc();
3713
3714 if (instData.ABS & 0x1) {
3715 src.absModifier();
3716 }
3717
3718 if (extData.NEG & 0x1) {
3719 src.negModifier();
3720 }
3721
3722 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3723 if (wf->execMask(lane)) {
3724 vdst[lane] = roundNearestEven(src[lane]);
3725 }
3726 }
3727
3728 vdst.write();
3729 } // execute
3730 // --- Inst_VOP3__V_FLOOR_F32 class methods ---
3731
3733 : Inst_VOP3A(iFmt, "v_floor_f32", false)
3734 {
3735 setFlag(ALU);
3736 setFlag(F32);
3737 } // Inst_VOP3__V_FLOOR_F32
3738
3740 {
3741 } // ~Inst_VOP3__V_FLOOR_F32
3742
3743 // --- description from .arch file ---
3744 // D.f = trunc(S0.f);
3745 // if (S0.f < 0.0 && S0.f != D.f) then D.f += -1.0.
3746 void
3748 {
3749 Wavefront *wf = gpuDynInst->wavefront();
3750 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3751 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3752
3753 src.readSrc();
3754
3755 if (instData.ABS & 0x1) {
3756 src.absModifier();
3757 }
3758
3759 if (extData.NEG & 0x1) {
3760 src.negModifier();
3761 }
3762
3763 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3764 if (wf->execMask(lane)) {
3765 vdst[lane] = std::floor(src[lane]);
3766 }
3767 }
3768
3769 vdst.write();
3770 } // execute
3771 // --- Inst_VOP3__V_EXP_F32 class methods ---
3772
3774 : Inst_VOP3A(iFmt, "v_exp_f32", false)
3775 {
3776 setFlag(ALU);
3777 setFlag(F32);
3778 } // Inst_VOP3__V_EXP_F32
3779
3781 {
3782 } // ~Inst_VOP3__V_EXP_F32
3783
3784 // --- description from .arch file ---
3785 // D.f = pow(2.0, S0.f).
3786 void
3788 {
3789 Wavefront *wf = gpuDynInst->wavefront();
3790 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3791 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3792
3793 src.readSrc();
3794
3795 if (instData.ABS & 0x1) {
3796 src.absModifier();
3797 }
3798
3799 if (extData.NEG & 0x1) {
3800 src.negModifier();
3801 }
3802
3803 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3804 if (wf->execMask(lane)) {
3805 vdst[lane] = std::pow(2.0, src[lane]);
3806 }
3807 }
3808
3809 vdst.write();
3810 } // execute
3811 // --- Inst_VOP3__V_LOG_F32 class methods ---
3812
3814 : Inst_VOP3A(iFmt, "v_log_f32", false)
3815 {
3816 setFlag(ALU);
3817 setFlag(F32);
3818 } // Inst_VOP3__V_LOG_F32
3819
3821 {
3822 } // ~Inst_VOP3__V_LOG_F32
3823
3824 // --- description from .arch file ---
3825 // D.f = log2(S0.f). Base 2 logarithm.
3826 void
3828 {
3829 Wavefront *wf = gpuDynInst->wavefront();
3830 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3831 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3832
3833 src.readSrc();
3834
3835 if (instData.ABS & 0x1) {
3836 src.absModifier();
3837 }
3838
3839 if (extData.NEG & 0x1) {
3840 src.negModifier();
3841 }
3842
3846 assert(!(instData.ABS & 0x2));
3847 assert(!(instData.ABS & 0x4));
3848 assert(!(extData.NEG & 0x2));
3849 assert(!(extData.NEG & 0x4));
3850
3851 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3852 if (wf->execMask(lane)) {
3853 vdst[lane] = std::log2(src[lane]);
3854 }
3855 }
3856
3857 vdst.write();
3858 } // execute
3859 // --- Inst_VOP3__V_RCP_F32 class methods ---
3860
3862 : Inst_VOP3A(iFmt, "v_rcp_f32", false)
3863 {
3864 setFlag(ALU);
3865 setFlag(F32);
3866 } // Inst_VOP3__V_RCP_F32
3867
3869 {
3870 } // ~Inst_VOP3__V_RCP_F32
3871
3872 // --- description from .arch file ---
3873 // D.f = 1.0 / S0.f. Reciprocal with IEEE rules and < 1ulp error.
3874 void
3876 {
3877 Wavefront *wf = gpuDynInst->wavefront();
3878 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3879 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3880
3881 src.readSrc();
3882
3883 if (instData.ABS & 0x1) {
3884 src.absModifier();
3885 }
3886
3887 if (extData.NEG & 0x1) {
3888 src.negModifier();
3889 }
3890
3891 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3892 if (wf->execMask(lane)) {
3893 vdst[lane] = 1.0 / src[lane];
3894 }
3895 }
3896
3897 vdst.write();
3898 } // execute
3899 // --- Inst_VOP3__V_RCP_IFLAG_F32 class methods ---
3900
3902 : Inst_VOP3A(iFmt, "v_rcp_iflag_f32", false)
3903 {
3904 setFlag(ALU);
3905 setFlag(F32);
3906 } // Inst_VOP3__V_RCP_IFLAG_F32
3907
3909 {
3910 } // ~Inst_VOP3__V_RCP_IFLAG_F32
3911
3912 // --- description from .arch file ---
3913 // D.f = 1.0 / S0.f. Reciprocal intended for integer division, can raise
3914 // --- integer DIV_BY_ZERO exception but cannot raise floating-point
3915 // --- exceptions.
3916 void
3918 {
3919 Wavefront *wf = gpuDynInst->wavefront();
3920 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3921 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3922
3923 src.readSrc();
3924
3925 if (instData.ABS & 0x1) {
3926 src.absModifier();
3927 }
3928
3929 if (extData.NEG & 0x1) {
3930 src.negModifier();
3931 }
3932
3933 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3934 if (wf->execMask(lane)) {
3935 vdst[lane] = 1.0 / src[lane];
3936 }
3937 }
3938
3939 vdst.write();
3940 } // execute
3941 // --- Inst_VOP3__V_RSQ_F32 class methods ---
3942
3944 : Inst_VOP3A(iFmt, "v_rsq_f32", false)
3945 {
3946 setFlag(ALU);
3947 setFlag(F32);
3948 } // Inst_VOP3__V_RSQ_F32
3949
3951 {
3952 } // ~Inst_VOP3__V_RSQ_F32
3953
3954 // --- description from .arch file ---
3955 // D.f = 1.0 / sqrt(S0.f). Reciprocal square root with IEEE rules.
3956 void
3958 {
3959 Wavefront *wf = gpuDynInst->wavefront();
3960 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
3961 VecOperandF32 vdst(gpuDynInst, instData.VDST);
3962
3963 src.readSrc();
3964
3965 if (instData.ABS & 0x1) {
3966 src.absModifier();
3967 }
3968
3969 if (extData.NEG & 0x1) {
3970 src.negModifier();
3971 }
3972
3973 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
3974 if (wf->execMask(lane)) {
3975 vdst[lane] = 1.0 / std::sqrt(src[lane]);
3976 }
3977 }
3978
3979 vdst.write();
3980 } // execute
3981 // --- Inst_VOP3__V_RCP_F64 class methods ---
3982
3984 : Inst_VOP3A(iFmt, "v_rcp_f64", false)
3985 {
3986 setFlag(ALU);
3987 setFlag(F64);
3988 } // Inst_VOP3__V_RCP_F64
3989
3991 {
3992 } // ~Inst_VOP3__V_RCP_F64
3993
3994 // --- description from .arch file ---
3995 // D.d = 1.0 / S0.d.
3996 void
3998 {
3999 Wavefront *wf = gpuDynInst->wavefront();
4000 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
4001 VecOperandF64 vdst(gpuDynInst, instData.VDST);
4002
4003 src.readSrc();
4004
4005 if (instData.ABS & 0x1) {
4006 src.absModifier();
4007 }
4008
4009 if (extData.NEG & 0x1) {
4010 src.negModifier();
4011 }
4012
4013 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4014 if (wf->execMask(lane)) {
4015 if (std::fpclassify(src[lane]) == FP_ZERO) {
4016 vdst[lane] = +INFINITY;
4017 } else if (std::isnan(src[lane])) {
4018 vdst[lane] = NAN;
4019 } else if (std::isinf(src[lane])) {
4020 if (std::signbit(src[lane])) {
4021 vdst[lane] = -0.0;
4022 } else {
4023 vdst[lane] = 0.0;
4024 }
4025 } else {
4026 vdst[lane] = 1.0 / src[lane];
4027 }
4028 }
4029 }
4030
4031 vdst.write();
4032 } // execute
4033 // --- Inst_VOP3__V_RSQ_F64 class methods ---
4034
4036 : Inst_VOP3A(iFmt, "v_rsq_f64", false)
4037 {
4038 setFlag(ALU);
4039 setFlag(F64);
4040 } // Inst_VOP3__V_RSQ_F64
4041
4043 {
4044 } // ~Inst_VOP3__V_RSQ_F64
4045
4046 // --- description from .arch file ---
4047 // D.d = 1.0 / sqrt(S0.d). See V_RSQ_F32.
4048 void
4050 {
4051 Wavefront *wf = gpuDynInst->wavefront();
4052 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
4053 VecOperandF64 vdst(gpuDynInst, instData.VDST);
4054
4055 src.readSrc();
4056
4057 if (instData.ABS & 0x1) {
4058 src.absModifier();
4059 }
4060
4061 if (extData.NEG & 0x1) {
4062 src.negModifier();
4063 }
4064
4065 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4066 if (wf->execMask(lane)) {
4067 if (std::fpclassify(src[lane]) == FP_ZERO) {
4068 vdst[lane] = +INFINITY;
4069 } else if (std::isnan(src[lane])) {
4070 vdst[lane] = NAN;
4071 } else if (std::isinf(src[lane]) && !std::signbit(src[lane])) {
4072 vdst[lane] = 0.0;
4073 } else if (std::signbit(src[lane])) {
4074 vdst[lane] = NAN;
4075 } else {
4076 vdst[lane] = 1.0 / std::sqrt(src[lane]);
4077 }
4078 }
4079 }
4080
4081 vdst.write();
4082 } // execute
4083 // --- Inst_VOP3__V_SQRT_F32 class methods ---
4084
4086 : Inst_VOP3A(iFmt, "v_sqrt_f32", false)
4087 {
4088 setFlag(ALU);
4089 setFlag(F32);
4090 } // Inst_VOP3__V_SQRT_F32
4091
4093 {
4094 } // ~Inst_VOP3__V_SQRT_F32
4095
4096 // --- description from .arch file ---
4097 // D.f = sqrt(S0.f).
4098 void
4100 {
4101 Wavefront *wf = gpuDynInst->wavefront();
4102 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
4103 VecOperandF32 vdst(gpuDynInst, instData.VDST);
4104
4105 src.readSrc();
4106
4107 if (instData.ABS & 0x1) {
4108 src.absModifier();
4109 }
4110
4111 if (extData.NEG & 0x1) {
4112 src.negModifier();
4113 }
4114
4115 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4116 if (wf->execMask(lane)) {
4117 vdst[lane] = std::sqrt(src[lane]);
4118 }
4119 }
4120
4121 vdst.write();
4122 } // execute
4123 // --- Inst_VOP3__V_SQRT_F64 class methods ---
4124
4126 : Inst_VOP3A(iFmt, "v_sqrt_f64", false)
4127 {
4128 setFlag(ALU);
4129 setFlag(F64);
4130 } // Inst_VOP3__V_SQRT_F64
4131
4133 {
4134 } // ~Inst_VOP3__V_SQRT_F64
4135
4136 // --- description from .arch file ---
4137 // D.d = sqrt(S0.d).
4138 void
4140 {
4141 Wavefront *wf = gpuDynInst->wavefront();
4142 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
4143 VecOperandF64 vdst(gpuDynInst, instData.VDST);
4144
4145 src.readSrc();
4146
4147 if (instData.ABS & 0x1) {
4148 src.absModifier();
4149 }
4150
4151 if (extData.NEG & 0x1) {
4152 src.negModifier();
4153 }
4154
4155 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4156 if (wf->execMask(lane)) {
4157 vdst[lane] = std::sqrt(src[lane]);
4158 }
4159 }
4160
4161 vdst.write();
4162 } // execute
4163 // --- Inst_VOP3__V_SIN_F32 class methods ---
4164
4166 : Inst_VOP3A(iFmt, "v_sin_f32", false)
4167 {
4168 setFlag(ALU);
4169 setFlag(F32);
4170 } // Inst_VOP3__V_SIN_F32
4171
4173 {
4174 } // ~Inst_VOP3__V_SIN_F32
4175
4176 // --- description from .arch file ---
4177 // D.f = sin(S0.f * 2 * PI).
4178 // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in
4179 // float 0.0.
4180 void
4182 {
4183 Wavefront *wf = gpuDynInst->wavefront();
4184 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
4185 ConstScalarOperandF32 pi(gpuDynInst, REG_PI);
4186 VecOperandF32 vdst(gpuDynInst, instData.VDST);
4187
4188 src.readSrc();
4189 pi.read();
4190
4191 if (instData.ABS & 0x1) {
4192 src.absModifier();
4193 }
4194
4195 if (extData.NEG & 0x1) {
4196 src.negModifier();
4197 }
4198
4199 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4200 if (wf->execMask(lane)) {
4201 vdst[lane] = std::sin(src[lane] * 2 * pi.rawData());
4202 }
4203 }
4204
4205 vdst.write();
4206 } // execute
4207 // --- Inst_VOP3__V_COS_F32 class methods ---
4208
4210 : Inst_VOP3A(iFmt, "v_cos_f32", false)
4211 {
4212 setFlag(ALU);
4213 setFlag(F32);
4214 } // Inst_VOP3__V_COS_F32
4215
4217 {
4218 } // ~Inst_VOP3__V_COS_F32
4219
4220 // --- description from .arch file ---
4221 // D.f = cos(S0.f * 2 * PI).
4222 // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in
4223 // float 1.0.
4224 void
4226 {
4227 Wavefront *wf = gpuDynInst->wavefront();
4228 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
4229 ConstScalarOperandF32 pi(gpuDynInst, REG_PI);
4230 VecOperandF32 vdst(gpuDynInst, instData.VDST);
4231
4232 src.readSrc();
4233 pi.read();
4234
4235 if (instData.ABS & 0x1) {
4236 src.absModifier();
4237 }
4238
4239 if (extData.NEG & 0x1) {
4240 src.negModifier();
4241 }
4242
4243 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4244 if (wf->execMask(lane)) {
4245 vdst[lane] = std::cos(src[lane] * 2 * pi.rawData());
4246 }
4247 }
4248
4249 vdst.write();
4250 } // execute
4251 // --- Inst_VOP3__V_NOT_B32 class methods ---
4252
4254 : Inst_VOP3A(iFmt, "v_not_b32", false)
4255 {
4256 setFlag(ALU);
4257 } // Inst_VOP3__V_NOT_B32
4258
4260 {
4261 } // ~Inst_VOP3__V_NOT_B32
4262
4263 // --- description from .arch file ---
4264 // D.u = ~S0.u.
4265 // Input and output modifiers not supported.
4266 void
4268 {
4269 Wavefront *wf = gpuDynInst->wavefront();
4270 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
4271 VecOperandU32 vdst(gpuDynInst, instData.VDST);
4272
4273 src.readSrc();
4274
4275 if (instData.ABS & 0x1) {
4276 src.absModifier();
4277 }
4278
4279 if (extData.NEG & 0x1) {
4280 src.negModifier();
4281 }
4282
4283 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4284 if (wf->execMask(lane)) {
4285 vdst[lane] = ~src[lane];
4286 }
4287 }
4288
4289 vdst.write();
4290 } // execute
4291 // --- Inst_VOP3__V_BFREV_B32 class methods ---
4292
4294 : Inst_VOP3A(iFmt, "v_bfrev_b32", false)
4295 {
4296 setFlag(ALU);
4297 } // Inst_VOP3__V_BFREV_B32
4298
4300 {
4301 } // ~Inst_VOP3__V_BFREV_B32
4302
4303 // --- description from .arch file ---
4304 // D.u[31:0] = S0.u[0:31], bitfield reverse.
4305 // Input and output modifiers not supported.
4306 void
4308 {
4309 Wavefront *wf = gpuDynInst->wavefront();
4310 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
4311 VecOperandU32 vdst(gpuDynInst, instData.VDST);
4312
4313 src.readSrc();
4314
4315 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4316 if (wf->execMask(lane)) {
4317 vdst[lane] = reverseBits(src[lane]);
4318 }
4319 }
4320
4321 vdst.write();
4322 } // execute
4323 // --- Inst_VOP3__V_FFBH_U32 class methods ---
4324
4326 : Inst_VOP3A(iFmt, "v_ffbh_u32", false)
4327 {
4328 setFlag(ALU);
4329 } // Inst_VOP3__V_FFBH_U32
4330
4332 {
4333 } // ~Inst_VOP3__V_FFBH_U32
4334
4335 // --- description from .arch file ---
4336 // D.u = position of first 1 in S0.u from MSB;
4337 // D.u = 0xffffffff if S0.u == 0.
4338 void
4340 {
4341 Wavefront *wf = gpuDynInst->wavefront();
4342 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
4343 VecOperandU32 vdst(gpuDynInst, instData.VDST);
4344
4345 src.readSrc();
4346
4347 if (instData.ABS & 0x1) {
4348 src.absModifier();
4349 }
4350
4351 if (extData.NEG & 0x1) {
4352 src.negModifier();
4353 }
4354
4355 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4356 if (wf->execMask(lane)) {
4357 vdst[lane] = findFirstOneMsb(src[lane]);
4358 }
4359 }
4360
4361 vdst.write();
4362 } // execute
4363 // --- Inst_VOP3__V_FFBL_B32 class methods ---
4364
4366 : Inst_VOP3A(iFmt, "v_ffbl_b32", false)
4367 {
4368 setFlag(ALU);
4369 } // Inst_VOP3__V_FFBL_B32
4370
4372 {
4373 } // ~Inst_VOP3__V_FFBL_B32
4374
4375 // --- description from .arch file ---
4376 // D.u = position of first 1 in S0.u from LSB;
4377 // D.u = 0xffffffff if S0.u == 0.
4378 void
4380 {
4381 Wavefront *wf = gpuDynInst->wavefront();
4382 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
4383 VecOperandU32 vdst(gpuDynInst, instData.VDST);
4384
4385 src.readSrc();
4386
4387 if (instData.ABS & 0x1) {
4388 src.absModifier();
4389 }
4390
4391 if (extData.NEG & 0x1) {
4392 src.negModifier();
4393 }
4394
4395 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4396 if (wf->execMask(lane)) {
4397 vdst[lane] = findFirstOne(src[lane]);
4398 }
4399 }
4400
4401 vdst.write();
4402 } // execute
4403 // --- Inst_VOP3__V_FFBH_I32 class methods ---
4404
4406 : Inst_VOP3A(iFmt, "v_ffbh_i32", false)
4407 {
4408 setFlag(ALU);
4409 } // Inst_VOP3__V_FFBH_I32
4410
4412 {
4413 } // ~Inst_VOP3__V_FFBH_I32
4414
4415 // --- description from .arch file ---
4416 // D.u = position of first bit different from sign bit in S0.i from MSB;
4417 // D.u = 0xffffffff if S0.i == 0 or S0.i == 0xffffffff.
4418 void
4420 {
4421 Wavefront *wf = gpuDynInst->wavefront();
4422 ConstVecOperandI32 src(gpuDynInst, extData.SRC0);
4423 VecOperandU32 vdst(gpuDynInst, instData.VDST);
4424
4425 src.readSrc();
4426
4427 if (instData.ABS & 0x1) {
4428 src.absModifier();
4429 }
4430
4431 if (extData.NEG & 0x1) {
4432 src.negModifier();
4433 }
4434
4435 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4436 if (wf->execMask(lane)) {
4437 vdst[lane] = firstOppositeSignBit(src[lane]);
4438 }
4439 }
4440
4441 vdst.write();
4442 } // execute
4443 // --- Inst_VOP3__V_FREXP_EXP_I32_F64 class methods ---
4444
4446 InFmt_VOP3A *iFmt)
4447 : Inst_VOP3A(iFmt, "v_frexp_exp_i32_f64", false)
4448 {
4449 setFlag(ALU);
4450 setFlag(F64);
4451 } // Inst_VOP3__V_FREXP_EXP_I32_F64
4452
4454 {
4455 } // ~Inst_VOP3__V_FREXP_EXP_I32_F64
4456
4457 // --- description from .arch file ---
4458 // See V_FREXP_EXP_I32_F32.
4459 void
4461 {
4462 Wavefront *wf = gpuDynInst->wavefront();
4463 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
4464 VecOperandI32 vdst(gpuDynInst, instData.VDST);
4465
4466 src.readSrc();
4467
4468 if (instData.ABS & 0x1) {
4469 src.absModifier();
4470 }
4471
4472 if (extData.NEG & 0x1) {
4473 src.negModifier();
4474 }
4475
4476 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4477 if (wf->execMask(lane)) {
4478 if (std::isinf(src[lane]) || std::isnan(src[lane])) {
4479 vdst[lane] = 0;
4480 } else {
4481 VecElemI32 exp(0);
4482 std::frexp(src[lane], &exp);
4483 vdst[lane] = exp;
4484 }
4485 }
4486 }
4487
4488 vdst.write();
4489 } // execute
4490 // --- Inst_VOP3__V_FREXP_MANT_F64 class methods ---
4491
4493 : Inst_VOP3A(iFmt, "v_frexp_mant_f64", false)
4494 {
4495 setFlag(ALU);
4496 setFlag(F64);
4497 } // Inst_VOP3__V_FREXP_MANT_F64
4498
4500 {
4501 } // ~Inst_VOP3__V_FREXP_MANT_F64
4502
4503 // --- description from .arch file ---
4504 // See V_FREXP_MANT_F32.
4505 void
4507 {
4508 Wavefront *wf = gpuDynInst->wavefront();
4509 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
4510 VecOperandF64 vdst(gpuDynInst, instData.VDST);
4511
4512 src.readSrc();
4513
4514 if (instData.ABS & 0x1) {
4515 src.absModifier();
4516 }
4517
4518 if (extData.NEG & 0x1) {
4519 src.negModifier();
4520 }
4521
4522 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4523 if (wf->execMask(lane)) {
4524 VecElemI32 exp(0);
4525 vdst[lane] = std::frexp(src[lane], &exp);
4526 }
4527 }
4528
4529 vdst.write();
4530 } // execute
4531 // --- Inst_VOP3__V_FRACT_F64 class methods ---
4532
4534 : Inst_VOP3A(iFmt, "v_fract_f64", false)
4535 {
4536 setFlag(ALU);
4537 setFlag(F64);
4538 } // Inst_VOP3__V_FRACT_F64
4539
4541 {
4542 } // ~Inst_VOP3__V_FRACT_F64
4543
4544 // --- description from .arch file ---
4545 // See V_FRACT_F32.
4546 void
4548 {
4549 Wavefront *wf = gpuDynInst->wavefront();
4550 ConstVecOperandF64 src(gpuDynInst, extData.SRC0);
4551 VecOperandF64 vdst(gpuDynInst, instData.VDST);
4552
4553 src.readSrc();
4554
4555 if (instData.ABS & 0x1) {
4556 src.absModifier();
4557 }
4558
4559 if (extData.NEG & 0x1) {
4560 src.negModifier();
4561 }
4562
4563 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4564 if (wf->execMask(lane)) {
4565 VecElemF32 int_part(0.0);
4566 vdst[lane] = std::modf(src[lane], &int_part);
4567 }
4568 }
4569
4570 vdst.write();
4571 } // execute
4572 // --- Inst_VOP3__V_FREXP_EXP_I32_F32 class methods ---
4573
4575 InFmt_VOP3A *iFmt)
4576 : Inst_VOP3A(iFmt, "v_frexp_exp_i32_f32", false)
4577 {
4578 setFlag(ALU);
4579 setFlag(F32);
4580 } // Inst_VOP3__V_FREXP_EXP_I32_F32
4581
4583 {
4584 } // ~Inst_VOP3__V_FREXP_EXP_I32_F32
4585
4586 // --- description from .arch file ---
4587 // if (S0.f == INF || S0.f == NAN) then D.i = 0;
4588 // else D.i = TwosComplement(Exponent(S0.f) - 127 + 1).
4589 // Returns exponent of single precision float input, such that S0.f =
4590 // significand * (2 ** exponent). See also FREXP_MANT_F32, which returns
4591 // the significand.
4592 void
4594 {
4595 Wavefront *wf = gpuDynInst->wavefront();
4596 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
4597 VecOperandI32 vdst(gpuDynInst, instData.VDST);
4598
4599 src.readSrc();
4600
4601 if (instData.ABS & 0x1) {
4602 src.absModifier();
4603 }
4604
4605 if (extData.NEG & 0x1) {
4606 src.negModifier();
4607 }
4608
4609 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4610 if (wf->execMask(lane)) {
4611 if (std::isinf(src[lane])|| std::isnan(src[lane])) {
4612 vdst[lane] = 0;
4613 } else {
4614 VecElemI32 exp(0);
4615 std::frexp(src[lane], &exp);
4616 vdst[lane] = exp;
4617 }
4618 }
4619 }
4620
4621 vdst.write();
4622 } // execute
4623 // --- Inst_VOP3__V_FREXP_MANT_F32 class methods ---
4624
4626 : Inst_VOP3A(iFmt, "v_frexp_mant_f32", false)
4627 {
4628 setFlag(ALU);
4629 setFlag(F32);
4630 } // Inst_VOP3__V_FREXP_MANT_F32
4631
4633 {
4634 } // ~Inst_VOP3__V_FREXP_MANT_F32
4635
4636 // --- description from .arch file ---
4637 // if (S0.f == INF || S0.f == NAN) then D.f = S0.f;
4638 // else D.f = Mantissa(S0.f).
4639 // Result range is in (-1.0,-0.5][0.5,1.0) in normal cases. Returns binary
4640 // --- significand of single precision float input, such that S0.f =
4641 // --- significand * (2 ** exponent). See also FREXP_EXP_I32_F32, which
4642 // --- returns integer exponent.
4643 void
4645 {
4646 Wavefront *wf = gpuDynInst->wavefront();
4647 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
4648 VecOperandF32 vdst(gpuDynInst, instData.VDST);
4649
4650 src.readSrc();
4651
4652 if (instData.ABS & 0x1) {
4653 src.absModifier();
4654 }
4655
4656 if (extData.NEG & 0x1) {
4657 src.negModifier();
4658 }
4659
4660 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
4661 if (wf->execMask(lane)) {
4662 if (std::isinf(src[lane]) || std::isnan(src[lane])) {
4663 vdst[lane] = src[lane];
4664 } else {
4665 VecElemI32 exp(0);
4666 vdst[lane] = std::frexp(src[lane], &exp);
4667 }
4668 }
4669 }
4670
4671 vdst.write();
4672 } // execute
4673 // --- Inst_VOP3__V_CLREXCP class methods ---
4674
4676 : Inst_VOP3A(iFmt, "v_clrexcp", false)
4677 {
4678 } // Inst_VOP3__V_CLREXCP
4679
4681 {
4682 } // ~Inst_VOP3__V_CLREXCP
4683
4684 // --- description from .arch file ---
4685 // Clear wave's exception state in SIMD (SP).
4686 void
4688 {
4690 } // execute
4691 // --- Inst_VOP3__V_CVT_F16_U16 class methods ---
4692
4694 : Inst_VOP3A(iFmt, "v_cvt_f16_u16", false)
4695 {
4696 setFlag(ALU);
4697 setFlag(F16);
4698 } // Inst_VOP3__V_CVT_F16_U16
4699
4701 {
4702 } // ~Inst_VOP3__V_CVT_F16_U16
4703
4704 // --- description from .arch file ---
4705 // D.f16 = uint16_to_flt16(S.u16).
4706 // Supports denormals, rounding, exception flags and saturation.
4707 void
4709 {
4711 } // execute
4712 // --- Inst_VOP3__V_CVT_F16_I16 class methods ---
4713
4715 : Inst_VOP3A(iFmt, "v_cvt_f16_i16", false)
4716 {
4717 setFlag(ALU);
4718 setFlag(F16);
4719 } // Inst_VOP3__V_CVT_F16_I16
4720
4722 {
4723 } // ~Inst_VOP3__V_CVT_F16_I16
4724
4725 // --- description from .arch file ---
4726 // D.f16 = int16_to_flt16(S.i16).
4727 // Supports denormals, rounding, exception flags and saturation.
4728 void
4730 {
4732 } // execute
4733 // --- Inst_VOP3__V_CVT_U16_F16 class methods ---
4734
4736 : Inst_VOP3A(iFmt, "v_cvt_u16_f16", false)
4737 {
4738 setFlag(ALU);
4739 setFlag(F16);
4740 } // Inst_VOP3__V_CVT_U16_F16
4741
4743 {
4744 } // ~Inst_VOP3__V_CVT_U16_F16
4745
4746 // --- description from .arch file ---
4747 // D.u16 = flt16_to_uint16(S.f16).
4748 // Supports rounding, exception flags and saturation.
4749 void
4751 {
4753 } // execute
4754 // --- Inst_VOP3__V_CVT_I16_F16 class methods ---
4755
4757 : Inst_VOP3A(iFmt, "v_cvt_i16_f16", false)
4758 {
4759 setFlag(ALU);
4760 setFlag(F16);
4761 } // Inst_VOP3__V_CVT_I16_F16
4762
4764 {
4765 } // ~Inst_VOP3__V_CVT_I16_F16
4766
4767 // --- description from .arch file ---
4768 // D.i16 = flt16_to_int16(S.f16).
4769 // Supports rounding, exception flags and saturation.
4770 void
4772 {
4774 } // execute
4775 // --- Inst_VOP3__V_RCP_F16 class methods ---
4776
4778 : Inst_VOP3A(iFmt, "v_rcp_f16", false)
4779 {
4780 setFlag(ALU);
4781 setFlag(F16);
4782 } // Inst_VOP3__V_RCP_F16
4783
4785 {
4786 } // ~Inst_VOP3__V_RCP_F16
4787
4788 // --- description from .arch file ---
4789 // if (S0.f16 == 1.0f)
4790 // D.f16 = 1.0f;
4791 // else
4792 // D.f16 = ApproximateRecip(S0.f16).
4793 void
4795 {
4797 } // execute
4798 // --- Inst_VOP3__V_SQRT_F16 class methods ---
4799
4801 : Inst_VOP3A(iFmt, "v_sqrt_f16", false)
4802 {
4803 setFlag(ALU);
4804 setFlag(F16);
4805 } // Inst_VOP3__V_SQRT_F16
4806
4808 {
4809 } // ~Inst_VOP3__V_SQRT_F16
4810
4811 // --- description from .arch file ---
4812 // if (S0.f16 == 1.0f)
4813 // D.f16 = 1.0f;
4814 // else
4815 // D.f16 = ApproximateSqrt(S0.f16).
4816 void
4818 {
4820 } // execute
4821 // --- Inst_VOP3__V_RSQ_F16 class methods ---
4822
4824 : Inst_VOP3A(iFmt, "v_rsq_f16", false)
4825 {
4826 setFlag(ALU);
4827 setFlag(F16);
4828 } // Inst_VOP3__V_RSQ_F16
4829
4831 {
4832 } // ~Inst_VOP3__V_RSQ_F16
4833
4834 // --- description from .arch file ---
4835 // if (S0.f16 == 1.0f)
4836 // D.f16 = 1.0f;
4837 // else
4838 // D.f16 = ApproximateRecipSqrt(S0.f16).
4839 void
4841 {
4843 } // execute
4844 // --- Inst_VOP3__V_LOG_F16 class methods ---
4845
4847 : Inst_VOP3A(iFmt, "v_log_f16", false)
4848 {
4849 setFlag(ALU);
4850 setFlag(F16);
4851 } // Inst_VOP3__V_LOG_F16
4852
4854 {
4855 } // ~Inst_VOP3__V_LOG_F16
4856
4857 // --- description from .arch file ---
4858 // if (S0.f16 == 1.0f)
4859 // D.f16 = 0.0f;
4860 // else
4861 // D.f16 = ApproximateLog2(S0.f16).
4862 void
4864 {
4866 } // execute
4867 // --- Inst_VOP3__V_EXP_F16 class methods ---
4868
4870 : Inst_VOP3A(iFmt, "v_exp_f16", false)
4871 {
4872 setFlag(ALU);
4873 setFlag(F16);
4874 } // Inst_VOP3__V_EXP_F16
4875
4877 {
4878 } // ~Inst_VOP3__V_EXP_F16
4879
4880 // --- description from .arch file ---
4881 // if (S0.f16 == 0.0f)
4882 // D.f16 = 1.0f;
4883 // else
4884 // D.f16 = Approximate2ToX(S0.f16).
4885 void
4887 {
4889 } // execute
4890 // --- Inst_VOP3__V_FREXP_MANT_F16 class methods ---
4891
4893 : Inst_VOP3A(iFmt, "v_frexp_mant_f16", false)
4894 {
4895 setFlag(ALU);
4896 setFlag(F16);
4897 } // Inst_VOP3__V_FREXP_MANT_F16
4898
4900 {
4901 } // ~Inst_VOP3__V_FREXP_MANT_F16
4902
4903 // --- description from .arch file ---
4904 // if (S0.f16 == +-INF || S0.f16 == NAN)
4905 // D.f16 = S0.f16;
4906 // else
4907 // D.f16 = mantissa(S0.f16).
4908 // Result range is (-1.0,-0.5][0.5,1.0).
4909 // C math library frexp function.
4910 // Returns binary significand of half precision float input, such that the
4911 // original single float = significand * (2 ** exponent).
4912 void
4914 {
4916 } // execute
4917 // --- Inst_VOP3__V_FREXP_EXP_I16_F16 class methods ---
4918
4920 InFmt_VOP3A *iFmt)
4921 : Inst_VOP3A(iFmt, "v_frexp_exp_i16_f16", false)
4922 {
4923 setFlag(ALU);
4924 setFlag(F16);
4925 } // Inst_VOP3__V_FREXP_EXP_I16_F16
4926
4928 {
4929 } // ~Inst_VOP3__V_FREXP_EXP_I16_F16
4930
4931 // --- description from .arch file ---
4932 // if (S0.f16 == +-INF || S0.f16 == NAN)
4933 // D.i16 = 0;
4934 // else
4935 // D.i16 = 2s_complement(exponent(S0.f16) - 15 + 1).
4936 // C math library frexp function.
4937 // Returns exponent of half precision float input, such that the
4938 // original single float = significand * (2 ** exponent).
4939 void
4944 // --- Inst_VOP3__V_FLOOR_F16 class methods ---
4945
4947 : Inst_VOP3A(iFmt, "v_floor_f16", false)
4948 {
4949 setFlag(ALU);
4950 setFlag(F16);
4951 } // Inst_VOP3__V_FLOOR_F16
4952
4954 {
4955 } // ~Inst_VOP3__V_FLOOR_F16
4956
4957 // --- description from .arch file ---
4958 // D.f16 = trunc(S0.f16);
4959 // if (S0.f16 < 0.0f && S0.f16 != D.f16) then D.f16 -= 1.0f.
4960 void
4962 {
4964 } // execute
4965 // --- Inst_VOP3__V_CEIL_F16 class methods ---
4966
4968 : Inst_VOP3A(iFmt, "v_ceil_f16", false)
4969 {
4970 setFlag(ALU);
4971 setFlag(F16);
4972 } // Inst_VOP3__V_CEIL_F16
4973
4975 {
4976 } // ~Inst_VOP3__V_CEIL_F16
4977
4978 // --- description from .arch file ---
4979 // D.f16 = trunc(S0.f16);
4980 // if (S0.f16 > 0.0f && S0.f16 != D.f16) then D.f16 += 1.0f.
4981 void
4983 {
4985 } // execute
4986 // --- Inst_VOP3__V_TRUNC_F16 class methods ---
4987
4989 : Inst_VOP3A(iFmt, "v_trunc_f16", false)
4990 {
4991 setFlag(ALU);
4992 setFlag(F16);
4993 } // Inst_VOP3__V_TRUNC_F16
4994
4996 {
4997 } // ~Inst_VOP3__V_TRUNC_F16
4998
4999 // --- description from .arch file ---
5000 // D.f16 = trunc(S0.f16).
5001 // Round-to-zero semantics.
5002 void
5004 {
5006 } // execute
5007 // --- Inst_VOP3__V_RNDNE_F16 class methods ---
5008
5010 : Inst_VOP3A(iFmt, "v_rndne_f16", false)
5011 {
5012 setFlag(ALU);
5013 setFlag(F16);
5014 } // Inst_VOP3__V_RNDNE_F16
5015
5017 {
5018 } // ~Inst_VOP3__V_RNDNE_F16
5019
5020 // --- description from .arch file ---
5021 // D.f16 = FLOOR(S0.f16 + 0.5f);
5022 // if (floor(S0.f16) is even && fract(S0.f16) == 0.5f) then D.f16 -= 1.0f.
5023 // Round-to-nearest-even semantics.
5024 void
5026 {
5028 } // execute
5029 // --- Inst_VOP3__V_FRACT_F16 class methods ---
5030
5032 : Inst_VOP3A(iFmt, "v_fract_f16", false)
5033 {
5034 setFlag(ALU);
5035 setFlag(F16);
5036 } // Inst_VOP3__V_FRACT_F16
5037
5039 {
5040 } // ~Inst_VOP3__V_FRACT_F16
5041
5042 // --- description from .arch file ---
5043 // D.f16 = S0.f16 + -floor(S0.f16).
5044 void
5046 {
5048 } // execute
5049 // --- Inst_VOP3__V_SIN_F16 class methods ---
5050
5052 : Inst_VOP3A(iFmt, "v_sin_f16", false)
5053 {
5054 setFlag(ALU);
5055 setFlag(F16);
5056 } // Inst_VOP3__V_SIN_F16
5057
5059 {
5060 } // ~Inst_VOP3__V_SIN_F16
5061
5062 // --- description from .arch file ---
5063 // D.f16 = sin(S0.f16 * 2 * PI).
5064 void
5066 {
5068 } // execute
5069 // --- Inst_VOP3__V_COS_F16 class methods ---
5070
5072 : Inst_VOP3A(iFmt, "v_cos_f16", false)
5073 {
5074 setFlag(ALU);
5075 setFlag(F16);
5076 } // Inst_VOP3__V_COS_F16
5077
5079 {
5080 } // ~Inst_VOP3__V_COS_F16
5081
5082 // --- description from .arch file ---
5083 // D.f16 = cos(S0.f16 * 2 * PI).
5084 void
5086 {
5088 } // execute
5089 // --- Inst_VOP3__V_EXP_LEGACY_F32 class methods ---
5090
5092 : Inst_VOP3A(iFmt, "v_exp_legacy_f32", false)
5093 {
5094 setFlag(ALU);
5095 setFlag(F32);
5096 } // Inst_VOP3__V_EXP_LEGACY_F32
5097
5099 {
5100 } // ~Inst_VOP3__V_EXP_LEGACY_F32
5101
5102 // --- description from .arch file ---
5103 // D.f = pow(2.0, S0.f) with legacy semantics.
5104 void
5106 {
5107 Wavefront *wf = gpuDynInst->wavefront();
5108 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
5109 VecOperandF32 vdst(gpuDynInst, instData.VDST);
5110
5111 src.readSrc();
5112
5113 if (instData.ABS & 0x1) {
5114 src.absModifier();
5115 }
5116
5117 if (extData.NEG & 0x1) {
5118 src.negModifier();
5119 }
5120
5124 assert(!(instData.ABS & 0x2));
5125 assert(!(instData.ABS & 0x4));
5126 assert(!(extData.NEG & 0x2));
5127 assert(!(extData.NEG & 0x4));
5128
5129 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5130 if (wf->execMask(lane)) {
5131 vdst[lane] = std::pow(2.0, src[lane]);
5132 }
5133 }
5134
5135 vdst.write();
5136 } // execute
5137 // --- Inst_VOP3__V_LOG_LEGACY_F32 class methods ---
5138
5140 : Inst_VOP3A(iFmt, "v_log_legacy_f32", false)
5141 {
5142 setFlag(ALU);
5143 setFlag(F32);
5144 } // Inst_VOP3__V_LOG_LEGACY_F32
5145
5147 {
5148 } // ~Inst_VOP3__V_LOG_LEGACY_F32
5149
5150 // --- description from .arch file ---
5151 // D.f = log2(S0.f). Base 2 logarithm with legacy semantics.
5152 void
5154 {
5155 Wavefront *wf = gpuDynInst->wavefront();
5156 ConstVecOperandF32 src(gpuDynInst, extData.SRC0);
5157 VecOperandF32 vdst(gpuDynInst, instData.VDST);
5158
5159 src.readSrc();
5160
5161 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5162 if (wf->execMask(lane)) {
5163 vdst[lane] = std::log2(src[lane]);
5164 }
5165 }
5166
5167 vdst.write();
5168 } // execute
5169 // --- Inst_VOP3__V_MAD_LEGACY_F32 class methods ---
5170
5172 : Inst_VOP3A(iFmt, "v_mad_legacy_f32", false)
5173 {
5174 setFlag(ALU);
5175 setFlag(F32);
5176 setFlag(MAD);
5177 } // Inst_VOP3__V_MAD_LEGACY_F32
5178
5180 {
5181 } // ~Inst_VOP3__V_MAD_LEGACY_F32
5182
5183 // --- description from .arch file ---
5184 // D.f = S0.f * S1.f + S2.f (DX9 rules, 0.0 * x = 0.0).
5185 void
5187 {
5188 Wavefront *wf = gpuDynInst->wavefront();
5189 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
5190 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
5191 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
5192 VecOperandF32 vdst(gpuDynInst, instData.VDST);
5193
5194 src0.readSrc();
5195 src1.readSrc();
5196 src2.readSrc();
5197
5198 if (instData.ABS & 0x1) {
5199 src0.absModifier();
5200 }
5201
5202 if (instData.ABS & 0x2) {
5203 src1.absModifier();
5204 }
5205
5206 if (instData.ABS & 0x4) {
5207 src2.absModifier();
5208 }
5209
5210 if (extData.NEG & 0x1) {
5211 src0.negModifier();
5212 }
5213
5214 if (extData.NEG & 0x2) {
5215 src1.negModifier();
5216 }
5217
5218 if (extData.NEG & 0x4) {
5219 src2.negModifier();
5220 }
5221
5222 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5223 if (wf->execMask(lane)) {
5224 vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
5225 }
5226 }
5227
5228 vdst.write();
5229 } // execute
5230 // --- Inst_VOP3__V_MAD_F32 class methods ---
5231
5233 : Inst_VOP3A(iFmt, "v_mad_f32", false)
5234 {
5235 setFlag(ALU);
5236 setFlag(F32);
5237 setFlag(MAD);
5238 } // Inst_VOP3__V_MAD_F32
5239
5241 {
5242 } // ~Inst_VOP3__V_MAD_F32
5243
5244 // --- description from .arch file ---
5245 // D.f = S0.f * S1.f + S2.f.
5246 void
5248 {
5249 Wavefront *wf = gpuDynInst->wavefront();
5250 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
5251 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
5252 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
5253 VecOperandF32 vdst(gpuDynInst, instData.VDST);
5254
5255 src0.readSrc();
5256 src1.readSrc();
5257 src2.readSrc();
5258
5259 if (instData.ABS & 0x1) {
5260 src0.absModifier();
5261 }
5262
5263 if (instData.ABS & 0x2) {
5264 src1.absModifier();
5265 }
5266
5267 if (instData.ABS & 0x4) {
5268 src2.absModifier();
5269 }
5270
5271 if (extData.NEG & 0x1) {
5272 src0.negModifier();
5273 }
5274
5275 if (extData.NEG & 0x2) {
5276 src1.negModifier();
5277 }
5278
5279 if (extData.NEG & 0x4) {
5280 src2.negModifier();
5281 }
5282
5283 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5284 if (wf->execMask(lane)) {
5285 vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
5286 }
5287 }
5288
5289 vdst.write();
5290 } // execute
5291 // --- Inst_VOP3__V_MAD_I32_I24 class methods ---
5292
5294 : Inst_VOP3A(iFmt, "v_mad_i32_i24", false)
5295 {
5296 setFlag(ALU);
5297 setFlag(MAD);
5298 } // Inst_VOP3__V_MAD_I32_I24
5299
5301 {
5302 } // ~Inst_VOP3__V_MAD_I32_I24
5303
5304 // --- description from .arch file ---
5305 // D.i = S0.i[23:0] * S1.i[23:0] + S2.i.
5306 void
5308 {
5309 Wavefront *wf = gpuDynInst->wavefront();
5310 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
5311 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
5312 ConstVecOperandI32 src2(gpuDynInst, extData.SRC2);
5313 VecOperandI32 vdst(gpuDynInst, instData.VDST);
5314
5315 src0.readSrc();
5316 src1.readSrc();
5317 src2.readSrc();
5318
5322 assert(!(instData.ABS & 0x1));
5323 assert(!(instData.ABS & 0x2));
5324 assert(!(instData.ABS & 0x4));
5325 assert(!(extData.NEG & 0x1));
5326 assert(!(extData.NEG & 0x2));
5327 assert(!(extData.NEG & 0x4));
5328
5329 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5330 if (wf->execMask(lane)) {
5331 vdst[lane] = sext<24>(bits(src0[lane], 23, 0))
5332 * sext<24>(bits(src1[lane], 23, 0)) + src2[lane];
5333 }
5334 }
5335
5336 vdst.write();
5337 } // execute
5338 // --- Inst_VOP3__V_MAD_U32_U24 class methods ---
5339
5341 : Inst_VOP3A(iFmt, "v_mad_u32_u24", false)
5342 {
5343 setFlag(ALU);
5344 setFlag(MAD);
5345 } // Inst_VOP3__V_MAD_U32_U24
5346
5348 {
5349 } // ~Inst_VOP3__V_MAD_U32_U24
5350
5351 // --- description from .arch file ---
5352 // D.u = S0.u[23:0] * S1.u[23:0] + S2.u.
5353 void
5355 {
5356 Wavefront *wf = gpuDynInst->wavefront();
5357 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
5358 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
5359 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
5360 VecOperandU32 vdst(gpuDynInst, instData.VDST);
5361
5362 src0.readSrc();
5363 src1.readSrc();
5364 src2.readSrc();
5365
5369 assert(!(instData.ABS & 0x1));
5370 assert(!(instData.ABS & 0x2));
5371 assert(!(instData.ABS & 0x4));
5372 assert(!(extData.NEG & 0x1));
5373 assert(!(extData.NEG & 0x2));
5374 assert(!(extData.NEG & 0x4));
5375
5376 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5377 if (wf->execMask(lane)) {
5378 vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0)
5379 + src2[lane];
5380 }
5381 }
5382
5383 vdst.write();
5384 } // execute
5385 // --- Inst_VOP3__V_CUBEID_F32 class methods ---
5386
5388 : Inst_VOP3A(iFmt, "v_cubeid_f32", false)
5389 {
5390 setFlag(ALU);
5391 setFlag(F32);
5392 } // Inst_VOP3__V_CUBEID_F32
5393
5395 {
5396 } // ~Inst_VOP3__V_CUBEID_F32
5397
5398 // --- description from .arch file ---
5399 // D.f = cubemap face ID ({0.0, 1.0, ..., 5.0}). XYZ coordinate is given in
5400 // --- (S0.f, S1.f, S2.f).
5401 void
5403 {
5405 } // execute
5406 // --- Inst_VOP3__V_CUBESC_F32 class methods ---
5407
5409 : Inst_VOP3A(iFmt, "v_cubesc_f32", false)
5410 {
5411 setFlag(ALU);
5412 setFlag(F32);
5413 } // Inst_VOP3__V_CUBESC_F32
5414
5416 {
5417 } // ~Inst_VOP3__V_CUBESC_F32
5418
5419 // --- description from .arch file ---
5420 // D.f = cubemap S coordinate. XYZ coordinate is given in (S0.f, S1.f,
5421 // S2.f).
5422 void
5424 {
5426 } // execute
5427 // --- Inst_VOP3__V_CUBETC_F32 class methods ---
5428
5430 : Inst_VOP3A(iFmt, "v_cubetc_f32", false)
5431 {
5432 setFlag(ALU);
5433 setFlag(F32);
5434 } // Inst_VOP3__V_CUBETC_F32
5435
5437 {
5438 } // ~Inst_VOP3__V_CUBETC_F32
5439
5440 // --- description from .arch file ---
5441 // D.f = cubemap T coordinate. XYZ coordinate is given in (S0.f, S1.f,
5442 // S2.f).
5443 void
5445 {
5447 } // execute
5448 // --- Inst_VOP3__V_CUBEMA_F32 class methods ---
5449
5451 : Inst_VOP3A(iFmt, "v_cubema_f32", false)
5452 {
5453 setFlag(ALU);
5454 setFlag(F32);
5455 } // Inst_VOP3__V_CUBEMA_F32
5456
5458 {
5459 } // ~Inst_VOP3__V_CUBEMA_F32
5460
5461 // --- description from .arch file ---
5462 // D.f = 2.0 * cubemap major axis. XYZ coordinate is given in (S0.f, S1.f,
5463 // --- S2.f).
5464 void
5466 {
5468 } // execute
5469 // --- Inst_VOP3__V_BFE_U32 class methods ---
5470
5472 : Inst_VOP3A(iFmt, "v_bfe_u32", false)
5473 {
5474 setFlag(ALU);
5475 } // Inst_VOP3__V_BFE_U32
5476
5478 {
5479 } // ~Inst_VOP3__V_BFE_U32
5480
5481 // --- description from .arch file ---
5482 // D.u = (S0.u>>S1.u[4:0]) & ((1<<S2.u[4:0])-1).
5483 // Bitfield extract with S0 = data, S1 = field_offset, S2 = field_width.
5484 void
5486 {
5487 Wavefront *wf = gpuDynInst->wavefront();
5488 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
5489 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
5490 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
5491 VecOperandU32 vdst(gpuDynInst, instData.VDST);
5492
5493 src0.readSrc();
5494 src1.readSrc();
5495 src2.readSrc();
5496
5500 assert(!(instData.ABS & 0x1));
5501 assert(!(instData.ABS & 0x2));
5502 assert(!(instData.ABS & 0x4));
5503 assert(!(extData.NEG & 0x1));
5504 assert(!(extData.NEG & 0x2));
5505 assert(!(extData.NEG & 0x4));
5506
5507 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5508 if (wf->execMask(lane)) {
5509 vdst[lane] = (src0[lane] >> bits(src1[lane], 4, 0))
5510 & ((1 << bits(src2[lane], 4, 0)) - 1);
5511 }
5512 }
5513
5514 vdst.write();
5515 } // execute
5516 // --- Inst_VOP3__V_BFE_I32 class methods ---
5517
5519 : Inst_VOP3A(iFmt, "v_bfe_i32", false)
5520 {
5521 setFlag(ALU);
5522 } // Inst_VOP3__V_BFE_I32
5523
5525 {
5526 } // ~Inst_VOP3__V_BFE_I32
5527
5528 // --- description from .arch file ---
5529 // D.i = (S0.i>>S1.u[4:0]) & ((1<<S2.u[4:0])-1).
5530 // Bitfield extract with S0 = data, S1 = field_offset, S2 = field_width.
5531 void
5533 {
5534 Wavefront *wf = gpuDynInst->wavefront();
5535 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
5536 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
5537 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
5538 VecOperandI32 vdst(gpuDynInst, instData.VDST);
5539
5540 src0.readSrc();
5541 src1.readSrc();
5542 src2.readSrc();
5543
5547 assert(!(instData.ABS & 0x1));
5548 assert(!(instData.ABS & 0x2));
5549 assert(!(instData.ABS & 0x4));
5550 assert(!(extData.NEG & 0x1));
5551 assert(!(extData.NEG & 0x2));
5552 assert(!(extData.NEG & 0x4));
5553
5554 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5555 if (wf->execMask(lane)) {
5556 vdst[lane] = (src0[lane] >> bits(src1[lane], 4, 0))
5557 & ((1 << bits(src2[lane], 4, 0)) - 1);
5558
5559 // Above extracted a signed int of size src2 bits which needs
5560 // to be signed-extended. Check if the MSB of our src2-bit
5561 // integer is 1, and sign extend it is.
5562 if (vdst[lane] >> (bits(src2[lane], 4, 0) - 1)) {
5563 vdst[lane] |= 0xffffffff << bits(src2[lane], 4, 0);
5564 }
5565 }
5566 }
5567
5568 vdst.write();
5569 } // execute
5570 // --- Inst_VOP3__V_BFI_B32 class methods ---
5571
5573 : Inst_VOP3A(iFmt, "v_bfi_b32", false)
5574 {
5575 setFlag(ALU);
5576 } // Inst_VOP3__V_BFI_B32
5577
5579 {
5580 } // ~Inst_VOP3__V_BFI_B32
5581
5582 // --- description from .arch file ---
5583 // D.u = (S0.u & S1.u) | (~S0.u & S2.u); bitfield insert.
5584 void
5586 {
5587 Wavefront *wf = gpuDynInst->wavefront();
5588 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
5589 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
5590 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
5591 VecOperandU32 vdst(gpuDynInst, instData.VDST);
5592
5593 src0.readSrc();
5594 src1.readSrc();
5595 src2.readSrc();
5596
5600 assert(!(instData.ABS & 0x1));
5601 assert(!(instData.ABS & 0x2));
5602 assert(!(instData.ABS & 0x4));
5603 assert(!(extData.NEG & 0x1));
5604 assert(!(extData.NEG & 0x2));
5605 assert(!(extData.NEG & 0x4));
5606
5607 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5608 if (wf->execMask(lane)) {
5609 vdst[lane] = (src0[lane] & src1[lane]) | (~src0[lane]
5610 & src2[lane]);
5611 }
5612 }
5613
5614 vdst.write();
5615 } // execute
5616 // --- Inst_VOP3__V_FMA_F32 class methods ---
5617
5619 : Inst_VOP3A(iFmt, "v_fma_f32", false)
5620 {
5621 setFlag(ALU);
5622 setFlag(F32);
5623 setFlag(FMA);
5624 } // Inst_VOP3__V_FMA_F32
5625
5627 {
5628 } // ~Inst_VOP3__V_FMA_F32
5629
5630 // --- description from .arch file ---
5631 // D.f = S0.f * S1.f + S2.f.
5632 void
5634 {
5635 Wavefront *wf = gpuDynInst->wavefront();
5636 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
5637 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
5638 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
5639 VecOperandF32 vdst(gpuDynInst, instData.VDST);
5640
5641 src0.readSrc();
5642 src1.readSrc();
5643 src2.readSrc();
5644
5645 if (instData.ABS & 0x1) {
5646 src0.absModifier();
5647 }
5648
5649 if (instData.ABS & 0x2) {
5650 src1.absModifier();
5651 }
5652
5653 if (instData.ABS & 0x4) {
5654 src2.absModifier();
5655 }
5656
5657 if (extData.NEG & 0x1) {
5658 src0.negModifier();
5659 }
5660
5661 if (extData.NEG & 0x2) {
5662 src1.negModifier();
5663 }
5664
5665 if (extData.NEG & 0x4) {
5666 src2.negModifier();
5667 }
5668
5669 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5670 if (wf->execMask(lane)) {
5671 vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
5672 }
5673 }
5674
5675 vdst.write();
5676 } // execute
5677 // --- Inst_VOP3__V_FMA_F64 class methods ---
5678
5680 : Inst_VOP3A(iFmt, "v_fma_f64", false)
5681 {
5682 setFlag(ALU);
5683 setFlag(F64);
5684 setFlag(FMA);
5685 } // Inst_VOP3__V_FMA_F64
5686
5688 {
5689 } // ~Inst_VOP3__V_FMA_F64
5690
5691 // --- description from .arch file ---
5692 // D.d = S0.d * S1.d + S2.d.
5693 void
5695 {
5696 Wavefront *wf = gpuDynInst->wavefront();
5697 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
5698 ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
5699 ConstVecOperandF64 src2(gpuDynInst, extData.SRC2);
5700 VecOperandF64 vdst(gpuDynInst, instData.VDST);
5701
5702 src0.readSrc();
5703 src1.readSrc();
5704 src2.readSrc();
5705
5706 if (instData.ABS & 0x1) {
5707 src0.absModifier();
5708 }
5709
5710 if (instData.ABS & 0x2) {
5711 src1.absModifier();
5712 }
5713
5714 if (instData.ABS & 0x4) {
5715 src2.absModifier();
5716 }
5717
5718 if (extData.NEG & 0x1) {
5719 src0.negModifier();
5720 }
5721
5722 if (extData.NEG & 0x2) {
5723 src1.negModifier();
5724 }
5725
5726 if (extData.NEG & 0x4) {
5727 src2.negModifier();
5728 }
5729
5730 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5731 if (wf->execMask(lane)) {
5732 vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
5733 }
5734 }
5735
5736 vdst.write();
5737 } // execute
5738 // --- Inst_VOP3__V_LERP_U8 class methods ---
5739
5741 : Inst_VOP3A(iFmt, "v_lerp_u8", false)
5742 {
5743 setFlag(ALU);
5744 } // Inst_VOP3__V_LERP_U8
5745
5747 {
5748 } // ~Inst_VOP3__V_LERP_U8
5749
5750 // --- description from .arch file ---
5751 // D.u = ((S0.u[31:24] + S1.u[31:24] + S2.u[24]) >> 1) << 24
5752 // D.u += ((S0.u[23:16] + S1.u[23:16] + S2.u[16]) >> 1) << 16;
5753 // D.u += ((S0.u[15:8] + S1.u[15:8] + S2.u[8]) >> 1) << 8;
5754 // D.u += ((S0.u[7:0] + S1.u[7:0] + S2.u[0]) >> 1).
5755 // Unsigned 8-bit pixel average on packed unsigned bytes (linear
5756 // --- interpolation). S2 acts as a round mode; if set, 0.5 rounds up,
5757 // --- otherwise 0.5 truncates.
5758 void
5760 {
5761 Wavefront *wf = gpuDynInst->wavefront();
5762 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
5763 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
5764 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
5765 VecOperandU32 vdst(gpuDynInst, instData.VDST);
5766
5767 src0.readSrc();
5768 src1.readSrc();
5769 src2.readSrc();
5770
5774 assert(!(instData.ABS & 0x1));
5775 assert(!(instData.ABS & 0x2));
5776 assert(!(instData.ABS & 0x4));
5777 assert(!(extData.NEG & 0x1));
5778 assert(!(extData.NEG & 0x2));
5779 assert(!(extData.NEG & 0x4));
5780
5781 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5782 if (wf->execMask(lane)) {
5783 vdst[lane] = ((bits(src0[lane], 31, 24)
5784 + bits(src1[lane], 31, 24) + bits(src2[lane], 24)) >> 1)
5785 << 24;
5786 vdst[lane] += ((bits(src0[lane], 23, 16)
5787 + bits(src1[lane], 23, 16) + bits(src2[lane], 16)) >> 1)
5788 << 16;
5789 vdst[lane] += ((bits(src0[lane], 15, 8)
5790 + bits(src1[lane], 15, 8) + bits(src2[lane], 8)) >> 1)
5791 << 8;
5792 vdst[lane] += ((bits(src0[lane], 7, 0) + bits(src1[lane], 7, 0)
5793 + bits(src2[lane], 0)) >> 1);
5794 }
5795 }
5796
5797 vdst.write();
5798 } // execute
5799 // --- Inst_VOP3__V_ALIGNBIT_B32 class methods ---
5800
5802 : Inst_VOP3A(iFmt, "v_alignbit_b32", false)
5803 {
5804 setFlag(ALU);
5805 } // Inst_VOP3__V_ALIGNBIT_B32
5806
5808 {
5809 } // ~Inst_VOP3__V_ALIGNBIT_B32
5810
5811 // --- description from .arch file ---
5812 // D.u = ({S0,S1} >> S2.u[4:0]) & 0xffffffff.
5813 void
5815 {
5816 Wavefront *wf = gpuDynInst->wavefront();
5817 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
5818 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
5819 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
5820 VecOperandU32 vdst(gpuDynInst, instData.VDST);
5821
5822 src0.readSrc();
5823 src1.readSrc();
5824 src2.readSrc();
5825
5829 assert(!(instData.ABS & 0x1));
5830 assert(!(instData.ABS & 0x2));
5831 assert(!(instData.ABS & 0x4));
5832 assert(!(extData.NEG & 0x1));
5833 assert(!(extData.NEG & 0x2));
5834 assert(!(extData.NEG & 0x4));
5835
5836 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5837 if (wf->execMask(lane)) {
5838 VecElemU64 src_0_1 = (((VecElemU64)src0[lane] << 32)
5839 | (VecElemU64)src1[lane]);
5840 vdst[lane] = (VecElemU32)((src_0_1
5841 >> (VecElemU64)bits(src2[lane], 4, 0)) & 0xffffffff);
5842 }
5843 }
5844
5845 vdst.write();
5846 } // execute
5847 // --- Inst_VOP3__V_ALIGNBYTE_B32 class methods ---
5848
5850 : Inst_VOP3A(iFmt, "v_alignbyte_b32", false)
5851 {
5852 setFlag(ALU);
5853 } // Inst_VOP3__V_ALIGNBYTE_B32
5854
5856 {
5857 } // ~Inst_VOP3__V_ALIGNBYTE_B32
5858
5859 // --- description from .arch file ---
5860 // D.u = ({S0,S1} >> (8*S2.u[4:0])) & 0xffffffff.
5861 void
5863 {
5864 Wavefront *wf = gpuDynInst->wavefront();
5865 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
5866 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
5867 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
5868 VecOperandU32 vdst(gpuDynInst, instData.VDST);
5869
5870 src0.readSrc();
5871 src1.readSrc();
5872 src2.readSrc();
5873
5877 assert(!(instData.ABS & 0x1));
5878 assert(!(instData.ABS & 0x2));
5879 assert(!(instData.ABS & 0x4));
5880 assert(!(extData.NEG & 0x1));
5881 assert(!(extData.NEG & 0x2));
5882 assert(!(extData.NEG & 0x4));
5883
5884 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5885 if (wf->execMask(lane)) {
5886 VecElemU64 src_0_1 = (((VecElemU64)src0[lane] << 32)
5887 | (VecElemU64)src1[lane]);
5888 vdst[lane] = (VecElemU32)((src_0_1
5889 >> (8ULL * (VecElemU64)bits(src2[lane], 4, 0)))
5890 & 0xffffffff);
5891 }
5892 }
5893
5894 vdst.write();
5895 } // execute
5896 // --- Inst_VOP3__V_MIN3_F32 class methods ---
5897
5899 : Inst_VOP3A(iFmt, "v_min3_f32", false)
5900 {
5901 setFlag(ALU);
5902 setFlag(F32);
5903 } // Inst_VOP3__V_MIN3_F32
5904
5906 {
5907 } // ~Inst_VOP3__V_MIN3_F32
5908
5909 // --- description from .arch file ---
5910 // D.f = min(S0.f, S1.f, S2.f).
5911 void
5913 {
5914 Wavefront *wf = gpuDynInst->wavefront();
5915 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
5916 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
5917 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
5918 VecOperandF32 vdst(gpuDynInst, instData.VDST);
5919
5920 src0.readSrc();
5921 src1.readSrc();
5922 src2.readSrc();
5923
5924 if (instData.ABS & 0x1) {
5925 src0.absModifier();
5926 }
5927
5928 if (instData.ABS & 0x2) {
5929 src1.absModifier();
5930 }
5931
5932 if (instData.ABS & 0x4) {
5933 src2.absModifier();
5934 }
5935
5936 if (extData.NEG & 0x1) {
5937 src0.negModifier();
5938 }
5939
5940 if (extData.NEG & 0x2) {
5941 src1.negModifier();
5942 }
5943
5944 if (extData.NEG & 0x4) {
5945 src2.negModifier();
5946 }
5947
5948 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5949 if (wf->execMask(lane)) {
5950 VecElemF32 min_0_1 = std::fmin(src0[lane], src1[lane]);
5951 vdst[lane] = std::fmin(min_0_1, src2[lane]);
5952 }
5953 }
5954
5955 vdst.write();
5956 } // execute
5957 // --- Inst_VOP3__V_MIN3_I32 class methods ---
5958
5960 : Inst_VOP3A(iFmt, "v_min3_i32", false)
5961 {
5962 setFlag(ALU);
5963 } // Inst_VOP3__V_MIN3_I32
5964
5966 {
5967 } // ~Inst_VOP3__V_MIN3_I32
5968
5969 // --- description from .arch file ---
5970 // D.i = min(S0.i, S1.i, S2.i).
5971 void
5973 {
5974 Wavefront *wf = gpuDynInst->wavefront();
5975 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
5976 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
5977 ConstVecOperandI32 src2(gpuDynInst, extData.SRC2);
5978 VecOperandI32 vdst(gpuDynInst, instData.VDST);
5979
5980 src0.readSrc();
5981 src1.readSrc();
5982 src2.readSrc();
5983
5987 assert(!(instData.ABS & 0x1));
5988 assert(!(instData.ABS & 0x2));
5989 assert(!(instData.ABS & 0x4));
5990 assert(!(extData.NEG & 0x1));
5991 assert(!(extData.NEG & 0x2));
5992 assert(!(extData.NEG & 0x4));
5993
5994 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
5995 if (wf->execMask(lane)) {
5996 VecElemI32 min_0_1 = std::min(src0[lane], src1[lane]);
5997 vdst[lane] = std::min(min_0_1, src2[lane]);
5998 }
5999 }
6000
6001 vdst.write();
6002 } // execute
6003 // --- Inst_VOP3__V_MIN3_U32 class methods ---
6004
6006 : Inst_VOP3A(iFmt, "v_min3_u32", false)
6007 {
6008 setFlag(ALU);
6009 } // Inst_VOP3__V_MIN3_U32
6010
6012 {
6013 } // ~Inst_VOP3__V_MIN3_U32
6014
6015 // --- description from .arch file ---
6016 // D.u = min(S0.u, S1.u, S2.u).
6017 void
6019 {
6020 Wavefront *wf = gpuDynInst->wavefront();
6021 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
6022 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
6023 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
6024 VecOperandU32 vdst(gpuDynInst, instData.VDST);
6025
6026 src0.readSrc();
6027 src1.readSrc();
6028 src2.readSrc();
6029
6033 assert(!(instData.ABS & 0x1));
6034 assert(!(instData.ABS & 0x2));
6035 assert(!(instData.ABS & 0x4));
6036 assert(!(extData.NEG & 0x1));
6037 assert(!(extData.NEG & 0x2));
6038 assert(!(extData.NEG & 0x4));
6039
6040 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6041 if (wf->execMask(lane)) {
6042 VecElemU32 min_0_1 = std::min(src0[lane], src1[lane]);
6043 vdst[lane] = std::min(min_0_1, src2[lane]);
6044 }
6045 }
6046
6047 vdst.write();
6048 } // execute
6049 // --- Inst_VOP3__V_MAX3_F32 class methods ---
6050
6052 : Inst_VOP3A(iFmt, "v_max3_f32", false)
6053 {
6054 setFlag(ALU);
6055 setFlag(F32);
6056 } // Inst_VOP3__V_MAX3_F32
6057
6059 {
6060 } // ~Inst_VOP3__V_MAX3_F32
6061
6062 // --- description from .arch file ---
6063 // D.f = max(S0.f, S1.f, S2.f).
6064 void
6066 {
6067 Wavefront *wf = gpuDynInst->wavefront();
6068 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
6069 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
6070 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
6071 VecOperandF32 vdst(gpuDynInst, instData.VDST);
6072
6073 src0.readSrc();
6074 src1.readSrc();
6075 src2.readSrc();
6076
6077 if (instData.ABS & 0x1) {
6078 src0.absModifier();
6079 }
6080
6081 if (instData.ABS & 0x2) {
6082 src1.absModifier();
6083 }
6084
6085 if (instData.ABS & 0x4) {
6086 src2.absModifier();
6087 }
6088
6089 if (extData.NEG & 0x1) {
6090 src0.negModifier();
6091 }
6092
6093 if (extData.NEG & 0x2) {
6094 src1.negModifier();
6095 }
6096
6097 if (extData.NEG & 0x4) {
6098 src2.negModifier();
6099 }
6100
6101 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6102 if (wf->execMask(lane)) {
6103 VecElemF32 max_0_1 = std::fmax(src0[lane], src1[lane]);
6104 vdst[lane] = std::fmax(max_0_1, src2[lane]);
6105 }
6106 }
6107
6108 vdst.write();
6109 } // execute
6110 // --- Inst_VOP3__V_MAX3_I32 class methods ---
6111
6113 : Inst_VOP3A(iFmt, "v_max3_i32", false)
6114 {
6115 setFlag(ALU);
6116 } // Inst_VOP3__V_MAX3_I32
6117
6119 {
6120 } // ~Inst_VOP3__V_MAX3_I32
6121
6122 // --- description from .arch file ---
6123 // D.i = max(S0.i, S1.i, S2.i).
6124 void
6126 {
6127 Wavefront *wf = gpuDynInst->wavefront();
6128 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
6129 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
6130 ConstVecOperandI32 src2(gpuDynInst, extData.SRC2);
6131 VecOperandI32 vdst(gpuDynInst, instData.VDST);
6132
6133 src0.readSrc();
6134 src1.readSrc();
6135 src2.readSrc();
6136
6140 assert(!(instData.ABS & 0x1));
6141 assert(!(instData.ABS & 0x2));
6142 assert(!(instData.ABS & 0x4));
6143 assert(!(extData.NEG & 0x1));
6144 assert(!(extData.NEG & 0x2));
6145 assert(!(extData.NEG & 0x4));
6146
6147 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6148 if (wf->execMask(lane)) {
6149 VecElemI32 max_0_1 = std::max(src0[lane], src1[lane]);
6150 vdst[lane] = std::max(max_0_1, src2[lane]);
6151 }
6152 }
6153
6154 vdst.write();
6155 } // execute
6156 // --- Inst_VOP3__V_MAX3_U32 class methods ---
6157
6159 : Inst_VOP3A(iFmt, "v_max3_u32", false)
6160 {
6161 setFlag(ALU);
6162 } // Inst_VOP3__V_MAX3_U32
6163
6165 {
6166 } // ~Inst_VOP3__V_MAX3_U32
6167
6168 // --- description from .arch file ---
6169 // D.u = max(S0.u, S1.u, S2.u).
6170 void
6172 {
6173 Wavefront *wf = gpuDynInst->wavefront();
6174 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
6175 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
6176 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
6177 VecOperandU32 vdst(gpuDynInst, instData.VDST);
6178
6179 src0.readSrc();
6180 src1.readSrc();
6181 src2.readSrc();
6182
6186 assert(!(instData.ABS & 0x1));
6187 assert(!(instData.ABS & 0x2));
6188 assert(!(instData.ABS & 0x4));
6189 assert(!(extData.NEG & 0x1));
6190 assert(!(extData.NEG & 0x2));
6191 assert(!(extData.NEG & 0x4));
6192
6193 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6194 if (wf->execMask(lane)) {
6195 VecElemU32 max_0_1 = std::max(src0[lane], src1[lane]);
6196 vdst[lane] = std::max(max_0_1, src2[lane]);
6197 }
6198 }
6199
6200 vdst.write();
6201 } // execute
6202 // --- Inst_VOP3__V_MED3_F32 class methods ---
6203
6205 : Inst_VOP3A(iFmt, "v_med3_f32", false)
6206 {
6207 setFlag(ALU);
6208 setFlag(F32);
6209 } // Inst_VOP3__V_MED3_F32
6210
6212 {
6213 } // ~Inst_VOP3__V_MED3_F32
6214
6215 // --- description from .arch file ---
6216 // D.f = median(S0.f, S1.f, S2.f).
6217 void
6219 {
6220 Wavefront *wf = gpuDynInst->wavefront();
6221 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
6222 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
6223 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
6224 VecOperandF32 vdst(gpuDynInst, instData.VDST);
6225
6226 src0.readSrc();
6227 src1.readSrc();
6228 src2.readSrc();
6229
6230 if (instData.ABS & 0x1) {
6231 src0.absModifier();
6232 }
6233
6234 if (instData.ABS & 0x2) {
6235 src1.absModifier();
6236 }
6237
6238 if (instData.ABS & 0x4) {
6239 src2.absModifier();
6240 }
6241
6242 if (extData.NEG & 0x1) {
6243 src0.negModifier();
6244 }
6245
6246 if (extData.NEG & 0x2) {
6247 src1.negModifier();
6248 }
6249
6250 if (extData.NEG & 0x4) {
6251 src2.negModifier();
6252 }
6253
6254 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6255 if (wf->execMask(lane)) {
6256 vdst[lane] = median(src0[lane], src1[lane], src2[lane]);
6257 }
6258 }
6259
6260 vdst.write();
6261 } // execute
6262 // --- Inst_VOP3__V_MED3_I32 class methods ---
6263
6265 : Inst_VOP3A(iFmt, "v_med3_i32", false)
6266 {
6267 setFlag(ALU);
6268 } // Inst_VOP3__V_MED3_I32
6269
6271 {
6272 } // ~Inst_VOP3__V_MED3_I32
6273
6274 // --- description from .arch file ---
6275 // D.i = median(S0.i, S1.i, S2.i).
6276 void
6278 {
6279 Wavefront *wf = gpuDynInst->wavefront();
6280 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
6281 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
6282 ConstVecOperandI32 src2(gpuDynInst, extData.SRC2);
6283 VecOperandI32 vdst(gpuDynInst, instData.VDST);
6284
6285 src0.readSrc();
6286 src1.readSrc();
6287 src2.readSrc();
6288
6292 assert(!(instData.ABS & 0x1));
6293 assert(!(instData.ABS & 0x2));
6294 assert(!(instData.ABS & 0x4));
6295 assert(!(extData.NEG & 0x1));
6296 assert(!(extData.NEG & 0x2));
6297 assert(!(extData.NEG & 0x4));
6298
6299 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6300 if (wf->execMask(lane)) {
6301 vdst[lane] = median(src0[lane], src1[lane], src2[lane]);
6302 }
6303 }
6304
6305 vdst.write();
6306 } // execute
6307 // --- Inst_VOP3__V_MED3_U32 class methods ---
6308
6310 : Inst_VOP3A(iFmt, "v_med3_u32", false)
6311 {
6312 setFlag(ALU);
6313 } // Inst_VOP3__V_MED3_U32
6314
6316 {
6317 } // ~Inst_VOP3__V_MED3_U32
6318
6319 // --- description from .arch file ---
6320 // D.u = median(S0.u, S1.u, S2.u).
6321 void
6323 {
6324 Wavefront *wf = gpuDynInst->wavefront();
6325 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
6326 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
6327 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
6328 VecOperandU32 vdst(gpuDynInst, instData.VDST);
6329
6330 src0.readSrc();
6331 src1.readSrc();
6332 src2.readSrc();
6333
6337 assert(!(instData.ABS & 0x1));
6338 assert(!(instData.ABS & 0x2));
6339 assert(!(instData.ABS & 0x4));
6340 assert(!(extData.NEG & 0x1));
6341 assert(!(extData.NEG & 0x2));
6342 assert(!(extData.NEG & 0x4));
6343
6344 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6345 if (wf->execMask(lane)) {
6346 vdst[lane] = median(src0[lane], src1[lane], src2[lane]);
6347 }
6348 }
6349
6350 vdst.write();
6351 } // execute
6352 // --- Inst_VOP3__V_SAD_U8 class methods ---
6353
6355 : Inst_VOP3A(iFmt, "v_sad_u8", false)
6356 {
6357 setFlag(ALU);
6358 } // Inst_VOP3__V_SAD_U8
6359
6361 {
6362 } // ~Inst_VOP3__V_SAD_U8
6363
6364 // --- description from .arch file ---
6365 // D.u = abs(S0.i[31:24] - S1.i[31:24]) + abs(S0.i[23:16] - S1.i[23:16]) +
6366 // abs(S0.i[15:8] - S1.i[15:8]) + abs(S0.i[7:0] - S1.i[7:0]) + S2.u.
6367 // Sum of absolute differences with accumulation, overflow into upper bits
6368 // is allowed.
6369 void
6371 {
6372 Wavefront *wf = gpuDynInst->wavefront();
6373 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
6374 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
6375 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
6376 VecOperandU32 vdst(gpuDynInst, instData.VDST);
6377
6378 src0.readSrc();
6379 src1.readSrc();
6380 src2.readSrc();
6381
6385 assert(!(instData.ABS & 0x1));
6386 assert(!(instData.ABS & 0x2));
6387 assert(!(instData.ABS & 0x4));
6388 assert(!(extData.NEG & 0x1));
6389 assert(!(extData.NEG & 0x2));
6390 assert(!(extData.NEG & 0x4));
6391
6392 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6393 if (wf->execMask(lane)) {
6394 vdst[lane] = std::abs(bits(src0[lane], 31, 24)
6395 - bits(src1[lane], 31, 24))
6396 + std::abs(bits(src0[lane], 23, 16)
6397 - bits(src1[lane], 23, 16))
6398 + std::abs(bits(src0[lane], 15, 8)
6399 - bits(src1[lane], 15, 8))
6400 + std::abs(bits(src0[lane], 7, 0)
6401 - bits(src1[lane], 7, 0)) + src2[lane];
6402 }
6403 }
6404
6405 vdst.write();
6406 } // execute
6407 // --- Inst_VOP3__V_SAD_HI_U8 class methods ---
6408
6410 : Inst_VOP3A(iFmt, "v_sad_hi_u8", false)
6411 {
6412 setFlag(ALU);
6413 } // Inst_VOP3__V_SAD_HI_U8
6414
6416 {
6417 } // ~Inst_VOP3__V_SAD_HI_U8
6418
6419 // --- description from .arch file ---
6420 // D.u = (SAD_U8(S0, S1, 0) << 16) + S2.u.
6421 // Sum of absolute differences with accumulation, overflow is lost.
6422 void
6424 {
6425 Wavefront *wf = gpuDynInst->wavefront();
6426 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
6427 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
6428 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
6429 VecOperandU32 vdst(gpuDynInst, instData.VDST);
6430
6431 src0.readSrc();
6432 src1.readSrc();
6433 src2.readSrc();
6434
6438 assert(!(instData.ABS & 0x1));
6439 assert(!(instData.ABS & 0x2));
6440 assert(!(instData.ABS & 0x4));
6441 assert(!(extData.NEG & 0x1));
6442 assert(!(extData.NEG & 0x2));
6443 assert(!(extData.NEG & 0x4));
6444
6445 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6446 if (wf->execMask(lane)) {
6447 vdst[lane] = (((bits(src0[lane], 31, 24)
6448 - bits(src1[lane], 31, 24)) + (bits(src0[lane], 23, 16)
6449 - bits(src1[lane], 23, 16)) + (bits(src0[lane], 15, 8)
6450 - bits(src1[lane], 15, 8)) + (bits(src0[lane], 7, 0)
6451 - bits(src1[lane], 7, 0))) << 16) + src2[lane];
6452 }
6453 }
6454
6455 vdst.write();
6456 } // execute
6457 // --- Inst_VOP3__V_SAD_U16 class methods ---
6458
6460 : Inst_VOP3A(iFmt, "v_sad_u16", false)
6461 {
6462 setFlag(ALU);
6463 } // Inst_VOP3__V_SAD_U16
6464
6466 {
6467 } // ~Inst_VOP3__V_SAD_U16
6468
6469 // --- description from .arch file ---
6470 // D.u = abs(S0.i[31:16] - S1.i[31:16]) + abs(S0.i[15:0] - S1.i[15:0])
6471 // + S2.u.
6472 // Word SAD with accumulation.
6473 void
6475 {
6476 Wavefront *wf = gpuDynInst->wavefront();
6477 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
6478 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
6479 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
6480 VecOperandU32 vdst(gpuDynInst, instData.VDST);
6481
6482 src0.readSrc();
6483 src1.readSrc();
6484 src2.readSrc();
6485
6489 assert(!(instData.ABS & 0x1));
6490 assert(!(instData.ABS & 0x2));
6491 assert(!(instData.ABS & 0x4));
6492 assert(!(extData.NEG & 0x1));
6493 assert(!(extData.NEG & 0x2));
6494 assert(!(extData.NEG & 0x4));
6495
6496 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6497 if (wf->execMask(lane)) {
6498 vdst[lane] = std::abs(bits(src0[lane], 31, 16)
6499 - bits(src1[lane], 31, 16))
6500 + std::abs(bits(src0[lane], 15, 0)
6501 - bits(src1[lane], 15, 0)) + src2[lane];
6502 }
6503 }
6504
6505 vdst.write();
6506 } // execute
6507 // --- Inst_VOP3__V_SAD_U32 class methods ---
6508
6510 : Inst_VOP3A(iFmt, "v_sad_u32", false)
6511 {
6512 setFlag(ALU);
6513 } // Inst_VOP3__V_SAD_U32
6514
6516 {
6517 } // ~Inst_VOP3__V_SAD_U32
6518
6519 // --- description from .arch file ---
6520 // D.u = abs(S0.i - S1.i) + S2.u.
6521 // Dword SAD with accumulation.
6522 void
6524 {
6525 Wavefront *wf = gpuDynInst->wavefront();
6526 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
6527 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
6528 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
6529 VecOperandU32 vdst(gpuDynInst, instData.VDST);
6530
6531 src0.readSrc();
6532 src1.readSrc();
6533 src2.readSrc();
6534
6538 assert(!(instData.ABS & 0x1));
6539 assert(!(instData.ABS & 0x2));
6540 assert(!(instData.ABS & 0x4));
6541 assert(!(extData.NEG & 0x1));
6542 assert(!(extData.NEG & 0x2));
6543 assert(!(extData.NEG & 0x4));
6544
6545 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6546 if (wf->execMask(lane)) {
6547 vdst[lane] = std::abs(src0[lane] - src1[lane]) + src2[lane];
6548 } // if
6549 } // for
6550
6551 vdst.write();
6552 } // execute
6553 // --- Inst_VOP3__V_CVT_PK_U8_F32 class methods ---
6554
6556 : Inst_VOP3A(iFmt, "v_cvt_pk_u8_f32", false)
6557 {
6558 setFlag(ALU);
6559 setFlag(F32);
6560 } // Inst_VOP3__V_CVT_PK_U8_F32
6561
6563 {
6564 } // ~Inst_VOP3__V_CVT_PK_U8_F32
6565
6566 // --- description from .arch file ---
6567 // D.u = ((flt32_to_uint8(S0.f) & 0xff) << (8 * S1.u[1:0]))
6568 // | (S2.u & ~(0xff << (8 * S1.u[1:0]))).
6569 // Convert floating point value S0 to 8-bit unsigned integer and pack the
6570 // result into byte S1 of dword S2.
6571 void
6573 {
6574 Wavefront *wf = gpuDynInst->wavefront();
6575 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
6576 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
6577 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
6578 VecOperandU32 vdst(gpuDynInst, instData.VDST);
6579
6580 src0.readSrc();
6581 src1.readSrc();
6582 src2.readSrc();
6583
6584 if (instData.ABS & 0x1) {
6585 src0.absModifier();
6586 }
6587
6588
6589 if (extData.NEG & 0x1) {
6590 src0.negModifier();
6591 }
6592
6596 assert(!(instData.ABS & 0x2));
6597 assert(!(instData.ABS & 0x4));
6598 assert(!(extData.NEG & 0x2));
6599 assert(!(extData.NEG & 0x4));
6600
6601 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6602 if (wf->execMask(lane)) {
6603 vdst[lane] = (((VecElemU8)src0[lane] & 0xff)
6604 << (8 * bits(src1[lane], 1, 0)))
6605 | (src2[lane] & ~(0xff << (8 * bits(src1[lane], 1, 0))));
6606 }
6607 }
6608
6609 vdst.write();
6610 } // execute
6611 // --- Inst_VOP3__V_DIV_FIXUP_F32 class methods ---
6612
6614 : Inst_VOP3A(iFmt, "v_div_fixup_f32", false)
6615 {
6616 setFlag(ALU);
6617 setFlag(F32);
6618 } // Inst_VOP3__V_DIV_FIXUP_F32
6619
6621 {
6622 } // ~Inst_VOP3__V_DIV_FIXUP_F32
6623
6624 // --- description from .arch file ---
6625 // D.f = Divide fixup and flags -- s0.f = Quotient, s1.f = Denominator,
6626 // s2.f = Numerator. This opcode generates exceptions resulting from the
6627 // division operation.
6628 void
6630 {
6631 Wavefront *wf = gpuDynInst->wavefront();
6632 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
6633 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
6634 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
6635 VecOperandF32 vdst(gpuDynInst, instData.VDST);
6636
6637 src0.readSrc();
6638 src1.readSrc();
6639 src2.readSrc();
6640
6641 if (instData.ABS & 0x1) {
6642 src0.absModifier();
6643 }
6644
6645 if (instData.ABS & 0x2) {
6646 src1.absModifier();
6647 }
6648
6649 if (instData.ABS & 0x4) {
6650 src2.absModifier();
6651 }
6652
6653 if (extData.NEG & 0x1) {
6654 src0.negModifier();
6655 }
6656
6657 if (extData.NEG & 0x2) {
6658 src1.negModifier();
6659 }
6660
6661 if (extData.NEG & 0x4) {
6662 src2.negModifier();
6663 }
6664
6665 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6666 if (wf->execMask(lane)) {
6667 if (std::fpclassify(src1[lane]) == FP_ZERO) {
6668 if (std::signbit(src1[lane])) {
6669 vdst[lane] = -INFINITY;
6670 } else {
6671 vdst[lane] = +INFINITY;
6672 }
6673 } else if (std::isnan(src2[lane]) || std::isnan(src1[lane])) {
6674 vdst[lane] = NAN;
6675 } else if (std::isinf(src1[lane])) {
6676 if (std::signbit(src1[lane])) {
6677 vdst[lane] = -INFINITY;
6678 } else {
6679 vdst[lane] = +INFINITY;
6680 }
6681 } else {
6682 vdst[lane] = src2[lane] / src1[lane];
6683 }
6684 }
6685 }
6686
6687 vdst.write();
6688 } // execute
6689 // --- Inst_VOP3__V_DIV_FIXUP_F64 class methods ---
6690
6692 : Inst_VOP3A(iFmt, "v_div_fixup_f64", false)
6693 {
6694 setFlag(ALU);
6695 setFlag(F64);
6696 } // Inst_VOP3__V_DIV_FIXUP_F64
6697
6699 {
6700 } // ~Inst_VOP3__V_DIV_FIXUP_F64
6701
6702 // --- description from .arch file ---
6703 // D.d = Divide fixup and flags -- s0.d = Quotient, s1.d = Denominator,
6704 // s2.d = Numerator. This opcode generates exceptions resulting from the
6705 // division operation.
6706 void
6708 {
6709 Wavefront *wf = gpuDynInst->wavefront();
6710 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
6711 ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
6712 ConstVecOperandF64 src2(gpuDynInst, extData.SRC2);
6713 VecOperandF64 vdst(gpuDynInst, instData.VDST);
6714
6715 src0.readSrc();
6716 src1.readSrc();
6717 src2.readSrc();
6718
6719 if (instData.ABS & 0x1) {
6720 src0.absModifier();
6721 }
6722
6723 if (instData.ABS & 0x2) {
6724 src1.absModifier();
6725 }
6726
6727 if (instData.ABS & 0x4) {
6728 src2.absModifier();
6729 }
6730
6731 if (extData.NEG & 0x1) {
6732 src0.negModifier();
6733 }
6734
6735 if (extData.NEG & 0x2) {
6736 src1.negModifier();
6737 }
6738
6739 if (extData.NEG & 0x4) {
6740 src2.negModifier();
6741 }
6742
6743 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6744 if (wf->execMask(lane)) {
6745 int sign_out = std::signbit(src1[lane])
6746 ^ std::signbit(src2[lane]);
6747 int exp1(0);
6748 int exp2(0);
6749 std::frexp(src1[lane], &exp1);
6750 std::frexp(src2[lane], &exp2);
6751
6752 if (std::isnan(src1[lane]) || std::isnan(src2[lane])) {
6753 vdst[lane] = std::numeric_limits<VecElemF64>::quiet_NaN();
6754 } else if (std::fpclassify(src1[lane]) == FP_ZERO
6755 && std::fpclassify(src2[lane]) == FP_ZERO) {
6756 vdst[lane]
6757 = std::numeric_limits<VecElemF64>::signaling_NaN();
6758 } else if (std::isinf(src1[lane]) && std::isinf(src2[lane])) {
6759 vdst[lane]
6760 = std::numeric_limits<VecElemF64>::signaling_NaN();
6761 } else if (std::fpclassify(src1[lane]) == FP_ZERO
6762 || std::isinf(src2[lane])) {
6763 vdst[lane] = sign_out ? -INFINITY : +INFINITY;
6764 } else if (std::isinf(src1[lane])
6765 || std::fpclassify(src2[lane]) == FP_ZERO) {
6766 vdst[lane] = sign_out ? -0.0 : +0.0;
6767 } else if (exp2 - exp1 < -1075) {
6768 vdst[lane] = src0[lane];
6769 } else if (exp1 == 2047) {
6770 vdst[lane] = src0[lane];
6771 } else {
6772 vdst[lane] = sign_out ? -std::fabs(src0[lane])
6773 : std::fabs(src0[lane]);
6774 }
6775 }
6776 }
6777
6778 vdst.write();
6779 } // execute
6780 // --- Inst_VOP3__V_DIV_SCALE_F32 class methods ---
6781
6783 InFmt_VOP3B *iFmt)
6784 : Inst_VOP3B(iFmt, "v_div_scale_f32")
6785 {
6786 setFlag(ALU);
6787 setFlag(WritesVCC);
6788 setFlag(F32);
6789 } // Inst_VOP3__V_DIV_SCALE_F32
6790
6792 {
6793 } // ~Inst_VOP3__V_DIV_SCALE_F32
6794
6795 // --- description from .arch file ---
6796 // {vcc,D.f} = Divide preop and flags -- s0.f = Quotient, s1.f =
6797 // Denominator, s2.f = Numerator -- s0 must equal s1 or s2. Given a
6798 // numerator and denominator, this opcode will appropriately scale inputs
6799 // for division to avoid subnormal terms during Newton-Raphson correction
6800 // algorithm. This opcode producses a VCC flag for post-scale of quotient.
6801 void
6803 {
6804 Wavefront *wf = gpuDynInst->wavefront();
6805 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
6806 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
6807 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
6808 ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
6809 VecOperandF32 vdst(gpuDynInst, instData.VDST);
6810
6811 src0.readSrc();
6812 src1.readSrc();
6813 src2.readSrc();
6814
6815 if (extData.NEG & 0x1) {
6816 src0.negModifier();
6817 }
6818
6819 if (extData.NEG & 0x2) {
6820 src1.negModifier();
6821 }
6822
6823 if (extData.NEG & 0x4) {
6824 src2.negModifier();
6825 }
6826
6827 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6828 if (wf->execMask(lane)) {
6829 vdst[lane] = src0[lane];
6830 vcc.setBit(lane, 0);
6831 }
6832 }
6833
6834 vcc.write();
6835 vdst.write();
6836 } // execute
6837 // --- Inst_VOP3__V_DIV_SCALE_F64 class methods ---
6838
6840 InFmt_VOP3B *iFmt)
6841 : Inst_VOP3B(iFmt, "v_div_scale_f64")
6842 {
6843 setFlag(ALU);
6844 setFlag(WritesVCC);
6845 setFlag(F64);
6846 } // Inst_VOP3__V_DIV_SCALE_F64
6847
6849 {
6850 } // ~Inst_VOP3__V_DIV_SCALE_F64
6851
6852 // --- description from .arch file ---
6853 // {vcc,D.d} = Divide preop and flags -- s0.d = Quotient, s1.d =
6854 // Denominator, s2.d = Numerator -- s0 must equal s1 or s2. Given a
6855 // numerator and denominator, this opcode will appropriately scale inputs
6856 // for division to avoid subnormal terms during Newton-Raphson correction
6857 // algorithm. This opcode producses a VCC flag for post-scale of quotient.
6858 void
6860 {
6861 Wavefront *wf = gpuDynInst->wavefront();
6862 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
6863 ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
6864 ConstVecOperandF64 src2(gpuDynInst, extData.SRC2);
6865 ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
6866 VecOperandF64 vdst(gpuDynInst, instData.VDST);
6867
6868 src0.readSrc();
6869 src1.readSrc();
6870 src2.readSrc();
6871
6872 if (extData.NEG & 0x1) {
6873 src0.negModifier();
6874 }
6875
6876 if (extData.NEG & 0x2) {
6877 src1.negModifier();
6878 }
6879
6880 if (extData.NEG & 0x4) {
6881 src2.negModifier();
6882 }
6883
6884 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6885 if (wf->execMask(lane)) {
6886 int exp1(0);
6887 int exp2(0);
6888 std::frexp(src1[lane], &exp1);
6889 std::frexp(src2[lane], &exp2);
6890 vcc.setBit(lane, 0);
6891
6892 if (std::fpclassify(src1[lane]) == FP_ZERO
6893 || std::fpclassify(src2[lane]) == FP_ZERO) {
6894 vdst[lane] = NAN;
6895 } else if (exp2 - exp1 >= 768) {
6896 vcc.setBit(lane, 1);
6897 if (src0[lane] == src1[lane]) {
6898 vdst[lane] = std::ldexp(src0[lane], 128);
6899 }
6900 } else if (std::fpclassify(src1[lane]) == FP_SUBNORMAL) {
6901 vdst[lane] = std::ldexp(src0[lane], 128);
6902 } else if (std::fpclassify(1.0 / src1[lane]) == FP_SUBNORMAL
6903 && std::fpclassify(src2[lane] / src1[lane])
6904 == FP_SUBNORMAL) {
6905 vcc.setBit(lane, 1);
6906 if (src0[lane] == src1[lane]) {
6907 vdst[lane] = std::ldexp(src0[lane], 128);
6908 }
6909 } else if (std::fpclassify(1.0 / src1[lane]) == FP_SUBNORMAL) {
6910 vdst[lane] = std::ldexp(src0[lane], -128);
6911 } else if (std::fpclassify(src2[lane] / src1[lane])
6912 == FP_SUBNORMAL) {
6913 vcc.setBit(lane, 1);
6914 if (src0[lane] == src2[lane]) {
6915 vdst[lane] = std::ldexp(src0[lane], 128);
6916 }
6917 } else if (exp2 <= 53) {
6918 vdst[lane] = std::ldexp(src0[lane], 128);
6919 }
6920 }
6921 }
6922
6923 vcc.write();
6924 vdst.write();
6925 } // execute
6926 // --- Inst_VOP3__V_DIV_FMAS_F32 class methods ---
6927
6929 : Inst_VOP3A(iFmt, "v_div_fmas_f32", false)
6930 {
6931 setFlag(ALU);
6932 setFlag(ReadsVCC);
6933 setFlag(F32);
6934 setFlag(FMA);
6935 } // Inst_VOP3__V_DIV_FMAS_F32
6936
6938 {
6939 } // ~Inst_VOP3__V_DIV_FMAS_F32
6940
6941 // --- description from .arch file ---
6942 // D.f = Special case divide FMA with scale and flags(s0.f = Quotient,
6943 // s1.f = Denominator, s2.f = Numerator)
6944 void
6946 {
6947 Wavefront *wf = gpuDynInst->wavefront();
6948 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
6949 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
6950 ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
6951 VecOperandF64 vdst(gpuDynInst, instData.VDST);
6952
6953 src0.readSrc();
6954 src1.readSrc();
6955 src2.readSrc();
6956
6957 if (instData.ABS & 0x1) {
6958 src0.absModifier();
6959 }
6960
6961 if (instData.ABS & 0x2) {
6962 src1.absModifier();
6963 }
6964
6965 if (instData.ABS & 0x4) {
6966 src2.absModifier();
6967 }
6968
6969 if (extData.NEG & 0x1) {
6970 src0.negModifier();
6971 }
6972
6973 if (extData.NEG & 0x2) {
6974 src1.negModifier();
6975 }
6976
6977 if (extData.NEG & 0x4) {
6978 src2.negModifier();
6979 }
6980
6981 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
6982 if (wf->execMask(lane)) {
6983 vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
6984 }
6985 }
6986
6987 //vdst.write();
6988 } // execute
6989 // --- Inst_VOP3__V_DIV_FMAS_F64 class methods ---
6990
6992 : Inst_VOP3A(iFmt, "v_div_fmas_f64", false)
6993 {
6994 setFlag(ALU);
6995 setFlag(ReadsVCC);
6996 setFlag(F64);
6997 setFlag(FMA);
6998 } // Inst_VOP3__V_DIV_FMAS_F64
6999
7001 {
7002 } // ~Inst_VOP3__V_DIV_FMAS_F64
7003
7004 // --- description from .arch file ---
7005 // D.d = Special case divide FMA with scale and flags(s0.d = Quotient,
7006 // s1.d = Denominator, s2.d = Numerator)
7007 void
7009 {
7010 Wavefront *wf = gpuDynInst->wavefront();
7011 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
7012 ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
7013 ConstVecOperandF64 src2(gpuDynInst, extData.SRC2);
7014 VecOperandF64 vdst(gpuDynInst, instData.VDST);
7015 ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
7016
7017 src0.readSrc();
7018 src1.readSrc();
7019 src2.readSrc();
7020 vcc.read();
7021
7022 if (instData.ABS & 0x1) {
7023 src0.absModifier();
7024 }
7025
7026 if (instData.ABS & 0x2) {
7027 src1.absModifier();
7028 }
7029
7030 if (instData.ABS & 0x4) {
7031 src2.absModifier();
7032 }
7033
7034 if (extData.NEG & 0x1) {
7035 src0.negModifier();
7036 }
7037
7038 if (extData.NEG & 0x2) {
7039 src1.negModifier();
7040 }
7041
7042 if (extData.NEG & 0x4) {
7043 src2.negModifier();
7044 }
7045
7046 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7047 if (wf->execMask(lane)) {
7048 if (bits(vcc.rawData(), lane)) {
7049 vdst[lane] = std::pow(2, 64)
7050 * std::fma(src0[lane], src1[lane], src2[lane]);
7051 } else {
7052 vdst[lane] = std::fma(src0[lane], src1[lane], src2[lane]);
7053 }
7054 }
7055 }
7056
7057 vdst.write();
7058 } // execute
7059 // --- Inst_VOP3__V_MSAD_U8 class methods ---
7060
7062 : Inst_VOP3A(iFmt, "v_msad_u8", false)
7063 {
7064 setFlag(ALU);
7065 } // Inst_VOP3__V_MSAD_U8
7066
7068 {
7069 } // ~Inst_VOP3__V_MSAD_U8
7070
7071 // --- description from .arch file ---
7072 // D.u = Masked Byte SAD with accum_lo(S0.u, S1.u, S2.u).
7073 void
7075 {
7077 } // execute
7078 // --- Inst_VOP3__V_QSAD_PK_U16_U8 class methods ---
7079
7081 : Inst_VOP3A(iFmt, "v_qsad_pk_u16_u8", false)
7082 {
7083 setFlag(ALU);
7084 } // Inst_VOP3__V_QSAD_PK_U16_U8
7085
7087 {
7088 } // ~Inst_VOP3__V_QSAD_PK_U16_U8
7089
7090 // --- description from .arch file ---
7091 // D.u = Quad-Byte SAD with 16-bit packed accum_lo/hi(S0.u[63:0],
7092 // S1.u[31:0], S2.u[63:0])
7093 void
7095 {
7097 } // execute
7098 // --- Inst_VOP3__V_MQSAD_PK_U16_U8 class methods ---
7099
7101 InFmt_VOP3A *iFmt)
7102 : Inst_VOP3A(iFmt, "v_mqsad_pk_u16_u8", false)
7103 {
7104 setFlag(ALU);
7105 } // Inst_VOP3__V_MQSAD_PK_U16_U8
7106
7108 {
7109 } // ~Inst_VOP3__V_MQSAD_PK_U16_U8
7110
7111 // --- description from .arch file ---
7112 // D.u = Masked Quad-Byte SAD with 16-bit packed accum_lo/hi(S0.u[63:0],
7113 // --- S1.u[31:0], S2.u[63:0])
7114 void
7116 {
7118 } // execute
7119 // --- Inst_VOP3__V_MQSAD_U32_U8 class methods ---
7120
7122 : Inst_VOP3A(iFmt, "v_mqsad_u32_u8", false)
7123 {
7124 setFlag(ALU);
7125 } // Inst_VOP3__V_MQSAD_U32_U8
7126
7128 {
7129 } // ~Inst_VOP3__V_MQSAD_U32_U8
7130
7131 // --- description from .arch file ---
7132 // D.u128 = Masked Quad-Byte SAD with 32-bit accum_lo/hi(S0.u[63:0],
7133 // --- S1.u[31:0], S2.u[127:0])
7134 void
7136 {
7138 } // execute
7139 // --- Inst_VOP3__V_MAD_U64_U32 class methods ---
7140
7142 InFmt_VOP3B *iFmt)
7143 : Inst_VOP3B(iFmt, "v_mad_u64_u32")
7144 {
7145 setFlag(ALU);
7146 setFlag(WritesVCC);
7147 setFlag(MAD);
7148 } // Inst_VOP3__V_MAD_U64_U32
7149
7151 {
7152 } // ~Inst_VOP3__V_MAD_U64_U32
7153
7154 // --- description from .arch file ---
7155 // {vcc_out,D.u64} = S0.u32 * S1.u32 + S2.u64.
7156 void
7158 {
7159 Wavefront *wf = gpuDynInst->wavefront();
7160 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
7161 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
7162 ConstVecOperandU64 src2(gpuDynInst, extData.SRC2);
7163 ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
7164 VecOperandU64 vdst(gpuDynInst, instData.VDST);
7165
7166 src0.readSrc();
7167 src1.readSrc();
7168 src2.readSrc();
7169 vdst.read();
7170
7174 assert(!(extData.NEG & 0x1));
7175 assert(!(extData.NEG & 0x2));
7176 assert(!(extData.NEG & 0x4));
7177
7178 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7179 if (wf->execMask(lane)) {
7180 vcc.setBit(lane, muladd(vdst[lane], src0[lane], src1[lane],
7181 src2[lane]));
7182 }
7183 }
7184
7185 vcc.write();
7186 vdst.write();
7187 } // execute
7188 // --- Inst_VOP3__V_MAD_I64_I32 class methods ---
7189
7191 InFmt_VOP3B *iFmt)
7192 : Inst_VOP3B(iFmt, "v_mad_i64_i32")
7193 {
7194 setFlag(ALU);
7195 setFlag(WritesVCC);
7196 setFlag(MAD);
7197 } // Inst_VOP3__V_MAD_I64_I32
7198
7200 {
7201 } // ~Inst_VOP3__V_MAD_I64_I32
7202
7203 // --- description from .arch file ---
7204 // {vcc_out,D.i64} = S0.i32 * S1.i32 + S2.i64.
7205 void
7207 {
7208 Wavefront *wf = gpuDynInst->wavefront();
7209 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
7210 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
7211 ConstVecOperandI64 src2(gpuDynInst, extData.SRC2);
7212 ScalarOperandU64 vcc(gpuDynInst, instData.SDST);
7213 VecOperandI64 vdst(gpuDynInst, instData.VDST);
7214
7215 src0.readSrc();
7216 src1.readSrc();
7217 src2.readSrc();
7218
7222 assert(!(extData.NEG & 0x1));
7223 assert(!(extData.NEG & 0x2));
7224 assert(!(extData.NEG & 0x4));
7225
7226 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7227 if (wf->execMask(lane)) {
7228 vcc.setBit(lane, muladd(vdst[lane], src0[lane], src1[lane],
7229 src2[lane]));
7230 }
7231 }
7232
7233 vcc.write();
7234 vdst.write();
7235 } // execute
7236 // --- Inst_VOP3__V_XAD_U32 class methods ---
7237
7239 : Inst_VOP3A(iFmt, "v_xad_u32", false)
7240 {
7241 setFlag(ALU);
7242 } // Inst_VOP3__V_XAD_U32
7243
7245 {
7246 } // ~Inst_VOP3__V_XAD_U32
7247
7248 // --- description from .arch file ---
7249 // D.u32 = (S0.u32 ^ S1.u32) + S2.u32.
7250 void
7252 {
7253 Wavefront *wf = gpuDynInst->wavefront();
7254 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
7255 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
7256 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
7257 VecOperandU32 vdst(gpuDynInst, instData.VDST);
7258
7259 src0.readSrc();
7260 src1.readSrc();
7261 src2.readSrc();
7262
7266 assert(!(instData.ABS & 0x1));
7267 assert(!(instData.ABS & 0x2));
7268 assert(!(instData.ABS & 0x4));
7269 assert(!(extData.NEG & 0x1));
7270 assert(!(extData.NEG & 0x2));
7271 assert(!(extData.NEG & 0x4));
7272
7273 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7274 if (wf->execMask(lane)) {
7275 vdst[lane] = (src0[lane] ^ src1[lane]) + src2[lane];
7276 }
7277 }
7278
7279 vdst.write();
7280 } // execute
7281 // --- Inst_VOP3__V_LSHL_ADD_U32 class methods ---
7282
7284 : Inst_VOP3A(iFmt, "v_lshl_add_u32", false)
7285 {
7286 setFlag(ALU);
7287 } // Inst_VOP3__V_LSHL_ADD_U32
7288
7290 {
7291 } // ~Inst_VOP3__V_LSHL_ADD_U32
7292
7293 // --- description from .arch file ---
7294 // D.u = (S0.u << S1.u[4:0]) + S2.u.
7295 void
7297 {
7298 Wavefront *wf = gpuDynInst->wavefront();
7299 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
7300 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
7301 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
7302 VecOperandU32 vdst(gpuDynInst, instData.VDST);
7303
7304 src0.readSrc();
7305 src1.readSrc();
7306 src2.readSrc();
7307
7311 assert(!(instData.ABS & 0x1));
7312 assert(!(instData.ABS & 0x2));
7313 assert(!(instData.ABS & 0x4));
7314 assert(!(extData.NEG & 0x1));
7315 assert(!(extData.NEG & 0x2));
7316 assert(!(extData.NEG & 0x4));
7317
7318 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7319 if (wf->execMask(lane)) {
7320 vdst[lane] = (src0[lane] << bits(src1[lane], 4, 0))
7321 + src2[lane];
7322 }
7323 }
7324
7325 vdst.write();
7326 } // execute
7327 // --- Inst_VOP3__V_ADD_LSHL_U32 class methods ---
7328
7330 : Inst_VOP3A(iFmt, "v_add_lshl_u32", false)
7331 {
7332 setFlag(ALU);
7333 } // Inst_VOP3__V_ADD_LSHL_U32
7334
7336 {
7337 } // ~Inst_VOP3__V_ADD_LSHL_U32
7338
7339 // --- description from .arch file ---
7340 // D.u = (S0.u + S1.u) << S2.u[4:0].
7341 void
7343 {
7344 Wavefront *wf = gpuDynInst->wavefront();
7345 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
7346 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
7347 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
7348 VecOperandU32 vdst(gpuDynInst, instData.VDST);
7349
7350 src0.readSrc();
7351 src1.readSrc();
7352 src2.readSrc();
7353
7357 assert(!(instData.ABS & 0x1));
7358 assert(!(instData.ABS & 0x2));
7359 assert(!(instData.ABS & 0x4));
7360 assert(!(extData.NEG & 0x1));
7361 assert(!(extData.NEG & 0x2));
7362 assert(!(extData.NEG & 0x4));
7363
7364 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7365 if (wf->execMask(lane)) {
7366 vdst[lane] =
7367 (src0[lane] + src1[lane]) << bits(src2[lane], 4, 0);
7368 }
7369 }
7370
7371 vdst.write();
7372 } // execute
7373 // --- Inst_VOP3__V_ADD3_U32 class methods ---
7374
7376 : Inst_VOP3A(iFmt, "v_add3_u32", false)
7377 {
7378 setFlag(ALU);
7379 } // Inst_VOP3__V_ADD3_U32
7380
7382 {
7383 } // ~Inst_VOP3__V_ADD3_U32
7384
7385 // --- description from .arch file ---
7386 // D.u = S0.u + S1.u + S2.u.
7387 void
7389 {
7390 Wavefront *wf = gpuDynInst->wavefront();
7391 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
7392 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
7393 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
7394 VecOperandU32 vdst(gpuDynInst, instData.VDST);
7395
7396 src0.readSrc();
7397 src1.readSrc();
7398 src2.readSrc();
7399
7403 assert(!(instData.ABS & 0x1));
7404 assert(!(instData.ABS & 0x2));
7405 assert(!(instData.ABS & 0x4));
7406 assert(!(extData.NEG & 0x1));
7407 assert(!(extData.NEG & 0x2));
7408 assert(!(extData.NEG & 0x4));
7409
7410 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7411 if (wf->execMask(lane)) {
7412 vdst[lane] = src0[lane] + src1[lane] + src2[lane];
7413 }
7414 }
7415
7416 vdst.write();
7417 } // execute
7418 // --- Inst_VOP3__V_LSHL_OR_B32 class methods ---
7419
7421 : Inst_VOP3A(iFmt, "v_lshl_or_b32", false)
7422 {
7423 setFlag(ALU);
7424 } // Inst_VOP3__V_LSHL_OR_B32
7425
7427 {
7428 } // ~Inst_VOP3__V_LSHL_OR_B32
7429
7430 // --- description from .arch file ---
7431 // D.u = (S0.u << S1.u[4:0]) | S2.u.
7432 void
7434 {
7435 Wavefront *wf = gpuDynInst->wavefront();
7436 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
7437 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
7438 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
7439 VecOperandU32 vdst(gpuDynInst, instData.VDST);
7440
7441 src0.readSrc();
7442 src1.readSrc();
7443 src2.readSrc();
7444
7448 assert(!(instData.ABS & 0x1));
7449 assert(!(instData.ABS & 0x2));
7450 assert(!(instData.ABS & 0x4));
7451 assert(!(extData.NEG & 0x1));
7452 assert(!(extData.NEG & 0x2));
7453 assert(!(extData.NEG & 0x4));
7454
7455 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7456 if (wf->execMask(lane)) {
7457 vdst[lane] = (src0[lane] << bits(src1[lane], 4, 0))
7458 | src2[lane];
7459 }
7460 }
7461
7462 vdst.write();
7463 } // execute
7464 // --- Inst_VOP3__V_AND_OR_B32 class methods ---
7465
7467 : Inst_VOP3A(iFmt, "v_and_or_b32", false)
7468 {
7469 setFlag(ALU);
7470 } // Inst_VOP3__V_AND_OR_B32
7471
7473 {
7474 } // ~Inst_VOP3__V_AND_OR_B32
7475
7476 // --- description from .arch file ---
7477 // D.u = (S0.u & S1.u) | S2.u.
7478 // Input and output modifiers not supported.
7479 void
7481 {
7482 Wavefront *wf = gpuDynInst->wavefront();
7483 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
7484 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
7485 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
7486 VecOperandU32 vdst(gpuDynInst, instData.VDST);
7487
7488 src0.readSrc();
7489 src1.readSrc();
7490 src2.readSrc();
7491
7495 assert(!(instData.ABS & 0x1));
7496 assert(!(instData.ABS & 0x2));
7497 assert(!(instData.ABS & 0x4));
7498 assert(!(extData.NEG & 0x1));
7499 assert(!(extData.NEG & 0x2));
7500 assert(!(extData.NEG & 0x4));
7501
7502 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7503 if (wf->execMask(lane)) {
7504 vdst[lane] = (src0[lane] & src1[lane]) | src2[lane];
7505 }
7506 }
7507
7508 vdst.write();
7509 } // execute
7510 // --- Inst_VOP3__V_MAD_F16 class methods ---
7511
7513 : Inst_VOP3A(iFmt, "v_mad_f16", false)
7514 {
7515 setFlag(ALU);
7516 setFlag(F16);
7517 setFlag(MAD);
7518 } // Inst_VOP3__V_MAD_F16
7519
7521 {
7522 } // ~Inst_VOP3__V_MAD_F16
7523
7524 // --- description from .arch file ---
7525 // D.f16 = S0.f16 * S1.f16 + S2.f16.
7526 // Supports round mode, exception flags, saturation.
7527 void
7529 {
7531 } // execute
7532 // --- Inst_VOP3__V_MAD_U16 class methods ---
7533
7535 : Inst_VOP3A(iFmt, "v_mad_u16", false)
7536 {
7537 setFlag(ALU);
7538 setFlag(MAD);
7539 } // Inst_VOP3__V_MAD_U16
7540
7542 {
7543 } // ~Inst_VOP3__V_MAD_U16
7544
7545 // --- description from .arch file ---
7546 // D.u16 = S0.u16 * S1.u16 + S2.u16.
7547 // Supports saturation (unsigned 16-bit integer domain).
7548 void
7550 {
7551 Wavefront *wf = gpuDynInst->wavefront();
7552 ConstVecOperandU16 src0(gpuDynInst, extData.SRC0);
7553 ConstVecOperandU16 src1(gpuDynInst, extData.SRC1);
7554 ConstVecOperandU16 src2(gpuDynInst, extData.SRC2);
7555 VecOperandU16 vdst(gpuDynInst, instData.VDST);
7556
7557 src0.readSrc();
7558 src1.readSrc();
7559 src2.readSrc();
7560
7564 assert(!(instData.ABS & 0x1));
7565 assert(!(instData.ABS & 0x2));
7566 assert(!(instData.ABS & 0x4));
7567 assert(!(extData.NEG & 0x1));
7568 assert(!(extData.NEG & 0x2));
7569 assert(!(extData.NEG & 0x4));
7570
7571 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7572 if (wf->execMask(lane)) {
7573 vdst[lane] = src0[lane] * src1[lane] + src2[lane];
7574 }
7575 }
7576
7577 vdst.write();
7578 } // execute
7579 // --- Inst_VOP3__V_MAD_I16 class methods ---
7580
7582 : Inst_VOP3A(iFmt, "v_mad_i16", false)
7583 {
7584 setFlag(ALU);
7585 setFlag(MAD);
7586 } // Inst_VOP3__V_MAD_I16
7587
7589 {
7590 } // ~Inst_VOP3__V_MAD_I16
7591
7592 // --- description from .arch file ---
7593 // D.i16 = S0.i16 * S1.i16 + S2.i16.
7594 // Supports saturation (signed 16-bit integer domain).
7595 void
7597 {
7598 Wavefront *wf = gpuDynInst->wavefront();
7599 ConstVecOperandI16 src0(gpuDynInst, extData.SRC0);
7600 ConstVecOperandI16 src1(gpuDynInst, extData.SRC1);
7601 ConstVecOperandI16 src2(gpuDynInst, extData.SRC2);
7602 VecOperandI16 vdst(gpuDynInst, instData.VDST);
7603
7604 src0.readSrc();
7605 src1.readSrc();
7606 src2.readSrc();
7607
7611 assert(!(instData.ABS & 0x1));
7612 assert(!(instData.ABS & 0x2));
7613 assert(!(instData.ABS & 0x4));
7614 assert(!(extData.NEG & 0x1));
7615 assert(!(extData.NEG & 0x2));
7616 assert(!(extData.NEG & 0x4));
7617
7618 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7619 if (wf->execMask(lane)) {
7620 vdst[lane] = src0[lane] * src1[lane] + src2[lane];
7621 }
7622 }
7623
7624 vdst.write();
7625 } // execute
7626 // --- Inst_VOP3__V_PERM_B32 class methods ---
7627
7629 : Inst_VOP3A(iFmt, "v_perm_b32", false)
7630 {
7631 setFlag(ALU);
7632 } // Inst_VOP3__V_PERM_B32
7633
7635 {
7636 } // ~Inst_VOP3__V_PERM_B32
7637
7638 // --- description from .arch file ---
7639 // D.u[31:24] = permute({S0.u, S1.u}, S2.u[31:24]);
7640 // D.u[23:16] = permute({S0.u, S1.u}, S2.u[23:16]);
7641 // D.u[15:8] = permute({S0.u, S1.u}, S2.u[15:8]);
7642 // D.u[7:0] = permute({S0.u, S1.u}, S2.u[7:0]);
7643 // byte permute(byte in[8], byte sel) {
7644 // if (sel>=13) then return 0xff;
7645 // elsif(sel==12) then return 0x00;
7646 // elsif(sel==11) then return in[7][7] * 0xff;
7647 // elsif(sel==10) then return in[5][7] * 0xff;
7648 // elsif(sel==9) then return in[3][7] * 0xff;
7649 // elsif(sel==8) then return in[1][7] * 0xff;
7650 // else return in[sel];
7651 // }
7652 // Byte permute.
7653 void
7655 {
7656 Wavefront *wf = gpuDynInst->wavefront();
7657 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
7658 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
7659 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
7660 VecOperandU32 vdst(gpuDynInst, instData.VDST);
7661
7662 src0.readSrc();
7663 src1.readSrc();
7664 src2.readSrc();
7665
7666 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7667 if (wf->execMask(lane)) {
7668 VecElemU64 selector = (VecElemU64)src0[lane];
7669 selector = (selector << 32) | (VecElemU64)src1[lane];
7670 vdst[lane] = 0;
7671
7672 DPRINTF(VEGA, "Executing v_perm_b32 src_0 0x%08x, src_1 "
7673 "0x%08x, src_2 0x%08x, vdst 0x%08x\n", src0[lane],
7674 src1[lane], src2[lane], vdst[lane]);
7675 DPRINTF(VEGA, "Selector: 0x%08x \n", selector);
7676
7677 for (int i = 0; i < 4 ; ++i) {
7678 VecElemU32 permuted_val = permute(selector, 0xFF
7679 & ((VecElemU32)src2[lane] >> (8 * i)));
7680 vdst[lane] |= (permuted_val << (8 * i));
7681 }
7682
7683 DPRINTF(VEGA, "v_perm result: 0x%08x\n", vdst[lane]);
7684 }
7685 }
7686
7687 vdst.write();
7688 } // execute
7689 // --- Inst_VOP3__V_FMA_F16 class methods ---
7690
7692 : Inst_VOP3A(iFmt, "v_fma_f16", false)
7693 {
7694 setFlag(ALU);
7695 setFlag(F16);
7696 setFlag(FMA);
7697 } // Inst_VOP3__V_FMA_F16
7698
7700 {
7701 } // ~Inst_VOP3__V_FMA_F16
7702
7703 // --- description from .arch file ---
7704 // D.f16 = S0.f16 * S1.f16 + S2.f16.
7705 // Fused half precision multiply add.
7706 void
7708 {
7710 } // execute
7711 // --- Inst_VOP3__V_DIV_FIXUP_F16 class methods ---
7712
7714 : Inst_VOP3A(iFmt, "v_div_fixup_f16", false)
7715 {
7716 setFlag(ALU);
7717 setFlag(F16);
7718 } // Inst_VOP3__V_DIV_FIXUP_F16
7719
7721 {
7722 } // ~Inst_VOP3__V_DIV_FIXUP_F16
7723
7724 // --- description from .arch file ---
7725 // sign_out = sign(S1.f16)^sign(S2.f16);
7726 // if (S2.f16 == NAN)
7727 // D.f16 = Quiet(S2.f16);
7728 // else if (S1.f16 == NAN)
7729 // D.f16 = Quiet(S1.f16);
7730 // else if (S1.f16 == S2.f16 == 0)
7731 // # 0/0
7732 // D.f16 = pele_nan(0xfe00);
7733 // else if (abs(S1.f16) == abs(S2.f16) == +-INF)
7734 // # inf/inf
7735 // D.f16 = pele_nan(0xfe00);
7736 // else if (S1.f16 ==0 || abs(S2.f16) == +-INF)
7737 // # x/0, or inf/y
7738 // D.f16 = sign_out ? -INF : INF;
7739 // else if (abs(S1.f16) == +-INF || S2.f16 == 0)
7740 // # x/inf, 0/y
7741 // D.f16 = sign_out ? -0 : 0;
7742 // else if ((exp(S2.f16) - exp(S1.f16)) < -150)
7743 // D.f16 = sign_out ? -underflow : underflow;
7744 // else if (exp(S1.f16) == 255)
7745 // D.f16 = sign_out ? -overflow : overflow;
7746 // else
7747 // D.f16 = sign_out ? -abs(S0.f16) : abs(S0.f16).
7748 // Half precision division fixup.
7749 // S0 = Quotient, S1 = Denominator, S3 = Numerator.
7750 // Given a numerator, denominator, and quotient from a divide, this opcode
7751 // will detect and apply special case numerics, touching up the quotient if
7752 // necessary. This opcode also generates invalid, denorm and divide by
7753 // zero exceptions caused by the division.
7754 void
7756 {
7758 } // execute
7759 // --- Inst_VOP3__V_LSHL_ADD_U64 class methods ---
7760
7762 : Inst_VOP3A(iFmt, "v_lshl_add_u64", false)
7763 {
7764 setFlag(ALU);
7765 } // Inst_VOP3__V_LSHL_ADD_U64
7766
7768 {
7769 } // ~Inst_VOP3__V_LSHL_ADD_U64
7770
7771 // --- description from .arch file ---
7772 // D.u = (S0.u << S1.u[4:0]) + S2.u.
7773 void
7775 {
7776 Wavefront *wf = gpuDynInst->wavefront();
7777 ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
7778 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
7779 ConstVecOperandU64 src2(gpuDynInst, extData.SRC2);
7780 VecOperandU64 vdst(gpuDynInst, instData.VDST);
7781
7782 src0.readSrc();
7783 src1.readSrc();
7784 src2.readSrc();
7785
7789 assert(!(instData.ABS & 0x1));
7790 assert(!(instData.ABS & 0x2));
7791 assert(!(instData.ABS & 0x4));
7792 assert(!(extData.NEG & 0x1));
7793 assert(!(extData.NEG & 0x2));
7794 assert(!(extData.NEG & 0x4));
7795
7796 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
7797 if (wf->execMask(lane)) {
7798 int shift_amount = bits(src1[lane], 2, 0);
7799 shift_amount = shift_amount > 4 ? 0 : shift_amount;
7800 vdst[lane] = (src0[lane] << shift_amount)
7801 + src2[lane];
7802 }
7803 }
7804
7805 vdst.write();
7806 } // execute
7807 // --- Inst_VOP3__V_CVT_PKACCUM_U8_F32 class methods ---
7808
7810 InFmt_VOP3A *iFmt)
7811 : Inst_VOP3A(iFmt, "v_cvt_pkaccum_u8_f32", false)
7812 {
7813 setFlag(ALU);
7814 setFlag(F32);
7815 } // Inst_VOP3__V_CVT_PKACCUM_U8_F32
7816
7818 {
7819 } // ~Inst_VOP3__V_CVT_PKACCUM_U8_F32
7820
7821 // --- description from .arch file ---
7822 // byte = S1.u[1:0]; bit = byte * 8;
7823 // D.u[bit+7:bit] = flt32_to_uint8(S0.f);
7824 // Pack converted value of S0.f into byte S1 of the destination.
7825 // SQ translates to V_CVT_PK_U8_F32.
7826 // Note: this opcode uses src_c to pass destination in as a source.
7827 void
7832 // --- Inst_VOP3__V_INTERP_P1_F32 class methods ---
7833
7835 : Inst_VOP3A(iFmt, "v_interp_p1_f32", false)
7836 {
7837 setFlag(ALU);
7838 setFlag(F32);
7839 } // Inst_VOP3__V_INTERP_P1_F32
7840
7842 {
7843 } // ~Inst_VOP3__V_INTERP_P1_F32
7844
7845 // --- description from .arch file ---
7846 // D.f = P10 * S.f + P0; parameter interpolation (SQ translates to
7847 // V_MAD_F32 for SP).
7848 // CAUTION: when in HALF_LDS mode, D must not be the same GPR as S; if
7849 // D == S then data corruption will occur.
7850 // NOTE: In textual representations the I/J VGPR is the first source and
7851 // the attribute is the second source; however in the VOP3 encoding the
7852 // attribute is stored in the src0 field and the VGPR is stored in the
7853 // src1 field.
7854 void
7856 {
7858 } // execute
7859 // --- Inst_VOP3__V_INTERP_P2_F32 class methods ---
7860
7862 : Inst_VOP3A(iFmt, "v_interp_p2_f32", false)
7863 {
7864 setFlag(ALU);
7865 setFlag(F32);
7866 } // Inst_VOP3__V_INTERP_P2_F32
7867
7869 {
7870 } // ~Inst_VOP3__V_INTERP_P2_F32
7871
7872 // --- description from .arch file ---
7873 // D.f = P20 * S.f + D.f; parameter interpolation (SQ translates to
7874 // V_MAD_F32 for SP).
7875 // NOTE: In textual representations the I/J VGPR is the first source and
7876 // the attribute is the second source; however in the VOP3 encoding the
7877 // attribute is stored in the src0 field and the VGPR is stored in the
7878 // src1 field.
7879 void
7881 {
7883 } // execute
7884 // --- Inst_VOP3__V_INTERP_MOV_F32 class methods ---
7885
7887 : Inst_VOP3A(iFmt, "v_interp_mov_f32", false)
7888 {
7889 setFlag(ALU);
7890 setFlag(F32);
7891 } // Inst_VOP3__V_INTERP_MOV_F32
7892
7894 {
7895 } // ~Inst_VOP3__V_INTERP_MOV_F32
7896
7897 // --- description from .arch file ---
7898 // D.f = {P10,P20,P0}[S.u]; parameter load.
7899 void
7901 {
7903 } // execute
7904 // --- Inst_VOP3__V_INTERP_P1LL_F16 class methods ---
7905
7907 InFmt_VOP3A *iFmt)
7908 : Inst_VOP3A(iFmt, "v_interp_p1ll_f16", false)
7909 {
7910 setFlag(ALU);
7911 setFlag(F16);
7912 } // Inst_VOP3__V_INTERP_P1LL_F16
7913
7915 {
7916 } // ~Inst_VOP3__V_INTERP_P1LL_F16
7917
7918 // --- description from .arch file ---
7919 // D.f32 = P10.f16 * S0.f32 + P0.f16.
7920 // 'LL' stands for 'two LDS arguments'.
7921 // attr_word selects the high or low half 16 bits of each LDS dword
7922 // accessed.
7923 // This opcode is available for 32-bank LDS only.
7924 // NOTE: In textual representations the I/J VGPR is the first source and
7925 // the attribute is the second source; however in the VOP3 encoding the
7926 // attribute is stored in the src0 field and the VGPR is stored in the
7927 // src1 field.
7928 void
7930 {
7932 } // execute
7933 // --- Inst_VOP3__V_INTERP_P1LV_F16 class methods ---
7934
7936 InFmt_VOP3A *iFmt)
7937 : Inst_VOP3A(iFmt, "v_interp_p1lv_f16", false)
7938 {
7939 setFlag(ALU);
7940 setFlag(F16);
7941 } // Inst_VOP3__V_INTERP_P1LV_F16
7942
7944 {
7945 } // ~Inst_VOP3__V_INTERP_P1LV_F16
7946
7947 // --- description from .arch file ---
7948 // D.f32 = P10.f16 * S0.f32 + (S2.u32 >> (attr_word * 16)).f16.
7949 // 'LV' stands for 'One LDS and one VGPR argument'.
7950 // S2 holds two parameters, attr_word selects the high or low word of the
7951 // VGPR for this calculation, as well as the high or low half of the LDS
7952 // data.
7953 // Meant for use with 16-bank LDS.
7954 // NOTE: In textual representations the I/J VGPR is the first source and
7955 // the attribute is the second source; however in the VOP3 encoding the
7956 // attribute is stored in the src0 field and the VGPR is stored in the
7957 // src1 field.
7958 void
7960 {
7962 } // execute
7963 // --- Inst_VOP3__V_INTERP_P2_F16 class methods ---
7964
7966 : Inst_VOP3A(iFmt, "v_interp_p2_f16", false)
7967 {
7968 setFlag(ALU);
7969 setFlag(F16);
7970 } // Inst_VOP3__V_INTERP_P2_F16
7971
7973 {
7974 } // ~Inst_VOP3__V_INTERP_P2_F16
7975
7976 // --- description from .arch file ---
7977 // D.f16 = P20.f16 * S0.f32 + S2.f32.
7978 // Final computation. attr_word selects LDS high or low 16bits. Used for
7979 // both 16- and 32-bank LDS.
7980 // Result is always written to the 16 LSBs of the destination VGPR.
7981 // NOTE: In textual representations the I/J VGPR is the first source and
7982 // the attribute is the second source; however in the VOP3 encoding the
7983 // attribute is stored in the src0 field and the VGPR is stored in the
7984 // src1 field.
7985 void
7987 {
7989 } // execute
7990 // --- Inst_VOP3__V_ADD_F64 class methods ---
7991
7993 : Inst_VOP3A(iFmt, "v_add_f64", false)
7994 {
7995 setFlag(ALU);
7996 setFlag(F64);
7997 } // Inst_VOP3__V_ADD_F64
7998
8000 {
8001 } // ~Inst_VOP3__V_ADD_F64
8002
8003 // --- description from .arch file ---
8004 // D.d = S0.d + S1.d.
8005 void
8007 {
8008 Wavefront *wf = gpuDynInst->wavefront();
8009 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
8010 ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
8011 VecOperandF64 vdst(gpuDynInst, instData.VDST);
8012
8013 src0.readSrc();
8014 src1.readSrc();
8015
8016 if (instData.ABS & 0x1) {
8017 src0.absModifier();
8018 }
8019
8020 if (instData.ABS & 0x2) {
8021 src1.absModifier();
8022 }
8023
8024 if (extData.NEG & 0x1) {
8025 src0.negModifier();
8026 }
8027
8028 if (extData.NEG & 0x2) {
8029 src1.negModifier();
8030 }
8031
8035 assert(!(instData.ABS & 0x4));
8036 assert(!(extData.NEG & 0x4));
8037
8038 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8039 if (wf->execMask(lane)) {
8040 if (std::isnan(src0[lane]) ||
8041 std::isnan(src1[lane]) ) {
8042 vdst[lane] = NAN;
8043 } else if (std::isinf(src0[lane]) &&
8044 std::isinf(src1[lane])) {
8045 if (std::signbit(src0[lane]) !=
8046 std::signbit(src1[lane])) {
8047 vdst[lane] = NAN;
8048 } else {
8049 vdst[lane] = src0[lane];
8050 }
8051 } else if (std::isinf(src0[lane])) {
8052 vdst[lane] = src0[lane];
8053 } else if (std::isinf(src1[lane])) {
8054 vdst[lane] = src1[lane];
8055 } else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
8056 std::fpclassify(src0[lane]) == FP_ZERO) {
8057 if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
8058 std::fpclassify(src1[lane]) == FP_ZERO) {
8059 if (std::signbit(src0[lane]) &&
8060 std::signbit(src1[lane])) {
8061 vdst[lane] = -0.0;
8062 } else {
8063 vdst[lane] = 0.0;
8064 }
8065 } else {
8066 vdst[lane] = src1[lane];
8067 }
8068 } else if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
8069 std::fpclassify(src1[lane]) == FP_ZERO) {
8070 if (std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
8071 std::fpclassify(src0[lane]) == FP_ZERO) {
8072 if (std::signbit(src0[lane]) &&
8073 std::signbit(src1[lane])) {
8074 vdst[lane] = -0.0;
8075 } else {
8076 vdst[lane] = 0.0;
8077 }
8078 } else {
8079 vdst[lane] = src0[lane];
8080 }
8081 } else {
8082 vdst[lane] = src0[lane] + src1[lane];
8083 }
8084 }
8085 }
8086
8087 vdst.write();
8088 } // execute
8089 // --- Inst_VOP3__V_MUL_F64 class methods ---
8090
8092 : Inst_VOP3A(iFmt, "v_mul_f64", false)
8093 {
8094 setFlag(ALU);
8095 setFlag(F64);
8096 } // Inst_VOP3__V_MUL_F64
8097
8099 {
8100 } // ~Inst_VOP3__V_MUL_F64
8101
8102 // --- description from .arch file ---
8103 // D.d = S0.d * S1.d.
8104 void
8106 {
8107 Wavefront *wf = gpuDynInst->wavefront();
8108 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
8109 ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
8110 VecOperandF64 vdst(gpuDynInst, instData.VDST);
8111
8112 src0.readSrc();
8113 src1.readSrc();
8114
8115 if (instData.ABS & 0x1) {
8116 src0.absModifier();
8117 }
8118
8119 if (instData.ABS & 0x2) {
8120 src1.absModifier();
8121 }
8122
8123 if (extData.NEG & 0x1) {
8124 src0.negModifier();
8125 }
8126
8127 if (extData.NEG & 0x2) {
8128 src1.negModifier();
8129 }
8130
8134 assert(!(instData.ABS & 0x4));
8135 assert(!(extData.NEG & 0x4));
8136
8137 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8138 if (wf->execMask(lane)) {
8139 if (std::isnan(src0[lane]) ||
8140 std::isnan(src1[lane])) {
8141 vdst[lane] = NAN;
8142 } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
8143 std::fpclassify(src0[lane]) == FP_ZERO) &&
8144 !std::signbit(src0[lane])) {
8145 if (std::isinf(src1[lane])) {
8146 vdst[lane] = NAN;
8147 } else if (!std::signbit(src1[lane])) {
8148 vdst[lane] = +0.0;
8149 } else {
8150 vdst[lane] = -0.0;
8151 }
8152 } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
8153 std::fpclassify(src0[lane]) == FP_ZERO) &&
8154 std::signbit(src0[lane])) {
8155 if (std::isinf(src1[lane])) {
8156 vdst[lane] = NAN;
8157 } else if (std::signbit(src1[lane])) {
8158 vdst[lane] = +0.0;
8159 } else {
8160 vdst[lane] = -0.0;
8161 }
8162 } else if (std::isinf(src0[lane]) &&
8163 !std::signbit(src0[lane])) {
8164 if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
8165 std::fpclassify(src1[lane]) == FP_ZERO) {
8166 vdst[lane] = NAN;
8167 } else if (!std::signbit(src1[lane])) {
8168 vdst[lane] = +INFINITY;
8169 } else {
8170 vdst[lane] = -INFINITY;
8171 }
8172 } else if (std::isinf(src0[lane]) &&
8173 std::signbit(src0[lane])) {
8174 if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
8175 std::fpclassify(src1[lane]) == FP_ZERO) {
8176 vdst[lane] = NAN;
8177 } else if (std::signbit(src1[lane])) {
8178 vdst[lane] = +INFINITY;
8179 } else {
8180 vdst[lane] = -INFINITY;
8181 }
8182 } else {
8183 vdst[lane] = src0[lane] * src1[lane];
8184 }
8185 }
8186 }
8187
8188 vdst.write();
8189 } // execute
8190 // --- Inst_VOP3__V_MIN_F64 class methods ---
8191
8193 : Inst_VOP3A(iFmt, "v_min_f64", false)
8194 {
8195 setFlag(ALU);
8196 setFlag(F64);
8197 } // Inst_VOP3__V_MIN_F64
8198
8200 {
8201 } // ~Inst_VOP3__V_MIN_F64
8202
8203 // --- description from .arch file ---
8204 // D.d = min(S0.d, S1.d).
8205 void
8207 {
8208 Wavefront *wf = gpuDynInst->wavefront();
8209 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
8210 ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
8211 VecOperandF64 vdst(gpuDynInst, instData.VDST);
8212
8213 src0.readSrc();
8214 src1.readSrc();
8215
8216 if (instData.ABS & 0x1) {
8217 src0.absModifier();
8218 }
8219
8220 if (instData.ABS & 0x2) {
8221 src1.absModifier();
8222 }
8223
8224 if (extData.NEG & 0x1) {
8225 src0.negModifier();
8226 }
8227
8228 if (extData.NEG & 0x2) {
8229 src1.negModifier();
8230 }
8231
8235 assert(!(instData.ABS & 0x4));
8236 assert(!(extData.NEG & 0x4));
8237
8238 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8239 if (wf->execMask(lane)) {
8240 vdst[lane] = std::fmin(src0[lane], src1[lane]);
8241 }
8242 }
8243
8244 vdst.write();
8245 } // execute
8246 // --- Inst_VOP3__V_MAX_F64 class methods ---
8247
8249 : Inst_VOP3A(iFmt, "v_max_f64", false)
8250 {
8251 setFlag(ALU);
8252 setFlag(F64);
8253 } // Inst_VOP3__V_MAX_F64
8254
8256 {
8257 } // ~Inst_VOP3__V_MAX_F64
8258
8259 // --- description from .arch file ---
8260 // D.d = max(S0.d, S1.d).
8261 void
8263 {
8264 Wavefront *wf = gpuDynInst->wavefront();
8265 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
8266 ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
8267 VecOperandF64 vdst(gpuDynInst, instData.VDST);
8268
8269 src0.readSrc();
8270 src1.readSrc();
8271
8272 if (instData.ABS & 0x1) {
8273 src0.absModifier();
8274 }
8275
8276 if (instData.ABS & 0x2) {
8277 src1.absModifier();
8278 }
8279
8280 if (extData.NEG & 0x1) {
8281 src0.negModifier();
8282 }
8283
8284 if (extData.NEG & 0x2) {
8285 src1.negModifier();
8286 }
8287
8291 assert(!(instData.ABS & 0x4));
8292 assert(!(extData.NEG & 0x4));
8293
8294 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8295 if (wf->execMask(lane)) {
8296 vdst[lane] = std::fmax(src0[lane], src1[lane]);
8297 }
8298 }
8299
8300 vdst.write();
8301 } // execute
8302 // --- Inst_VOP3__V_LDEXP_F64 class methods ---
8303
8305 : Inst_VOP3A(iFmt, "v_ldexp_f64", false)
8306 {
8307 setFlag(ALU);
8308 setFlag(F64);
8309 } // Inst_VOP3__V_LDEXP_F64
8310
8312 {
8313 } // ~Inst_VOP3__V_LDEXP_F64
8314
8315 // --- description from .arch file ---
8316 // D.d = pow(S0.d, S1.i[31:0]).
8317 void
8319 {
8320 Wavefront *wf = gpuDynInst->wavefront();
8321 ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
8322 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
8323 VecOperandF64 vdst(gpuDynInst, instData.VDST);
8324
8325 src0.readSrc();
8326 src1.readSrc();
8327
8328 if (instData.ABS & 0x1) {
8329 src0.absModifier();
8330 }
8331
8332 if (extData.NEG & 0x1) {
8333 src0.negModifier();
8334 }
8335
8339 assert(!(instData.ABS & 0x2));
8340 assert(!(instData.ABS & 0x4));
8341 assert(!(extData.NEG & 0x2));
8342 assert(!(extData.NEG & 0x4));
8343
8344 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8345 if (wf->execMask(lane)) {
8346 if (std::isnan(src0[lane]) || std::isinf(src0[lane])) {
8347 vdst[lane] = src0[lane];
8348 } else if (std::fpclassify(src0[lane]) == FP_SUBNORMAL
8349 || std::fpclassify(src0[lane]) == FP_ZERO) {
8350 if (std::signbit(src0[lane])) {
8351 vdst[lane] = -0.0;
8352 } else {
8353 vdst[lane] = +0.0;
8354 }
8355 } else {
8356 vdst[lane] = std::ldexp(src0[lane], src1[lane]);
8357 }
8358 }
8359 }
8360
8361 vdst.write();
8362 } // execute
8363 // --- Inst_VOP3__V_MUL_LO_U32 class methods ---
8364
8366 : Inst_VOP3A(iFmt, "v_mul_lo_u32", false)
8367 {
8368 setFlag(ALU);
8369 } // Inst_VOP3__V_MUL_LO_U32
8370
8372 {
8373 } // ~Inst_VOP3__V_MUL_LO_U32
8374
8375 // --- description from .arch file ---
8376 // D.u = S0.u * S1.u.
8377 void
8379 {
8380 Wavefront *wf = gpuDynInst->wavefront();
8381 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
8382 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
8383 VecOperandU32 vdst(gpuDynInst, instData.VDST);
8384
8385 src0.readSrc();
8386 src1.readSrc();
8387
8391 assert(!(instData.ABS & 0x1));
8392 assert(!(instData.ABS & 0x2));
8393 assert(!(instData.ABS & 0x4));
8394 assert(!(extData.NEG & 0x1));
8395 assert(!(extData.NEG & 0x2));
8396 assert(!(extData.NEG & 0x4));
8397
8398 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8399 if (wf->execMask(lane)) {
8400 VecElemI64 s0 = (VecElemI64)src0[lane];
8401 VecElemI64 s1 = (VecElemI64)src1[lane];
8402 vdst[lane] = (VecElemU32)((s0 * s1) & 0xffffffffLL);
8403 }
8404 }
8405
8406 vdst.write();
8407 } // execute
8408 // --- Inst_VOP3__V_MUL_HI_U32 class methods ---
8409
8411 : Inst_VOP3A(iFmt, "v_mul_hi_u32", false)
8412 {
8413 setFlag(ALU);
8414 } // Inst_VOP3__V_MUL_HI_U32
8415
8417 {
8418 } // ~Inst_VOP3__V_MUL_HI_U32
8419
8420 // --- description from .arch file ---
8421 // D.u = (S0.u * S1.u) >> 32.
8422 void
8424 {
8425 Wavefront *wf = gpuDynInst->wavefront();
8426 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
8427 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
8428 VecOperandU32 vdst(gpuDynInst, instData.VDST);
8429
8430 src0.readSrc();
8431 src1.readSrc();
8432
8436 assert(!(instData.ABS & 0x1));
8437 assert(!(instData.ABS & 0x2));
8438 assert(!(instData.ABS & 0x4));
8439 assert(!(extData.NEG & 0x1));
8440 assert(!(extData.NEG & 0x2));
8441 assert(!(extData.NEG & 0x4));
8442
8443 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8444 if (wf->execMask(lane)) {
8445 VecElemI64 s0 = (VecElemI64)src0[lane];
8446 VecElemI64 s1 = (VecElemI64)src1[lane];
8447 vdst[lane]
8448 = (VecElemU32)(((s0 * s1) >> 32) & 0xffffffffLL);
8449 }
8450 }
8451
8452 vdst.write();
8453 } // execute
8454 // --- Inst_VOP3__V_MUL_HI_I32 class methods ---
8455
8457 : Inst_VOP3A(iFmt, "v_mul_hi_i32", false)
8458 {
8459 setFlag(ALU);
8460 } // Inst_VOP3__V_MUL_HI_I32
8461
8463 {
8464 } // ~Inst_VOP3__V_MUL_HI_I32
8465
8466 // --- description from .arch file ---
8467 // D.i = (S0.i * S1.i) >> 32.
8468 void
8470 {
8471 Wavefront *wf = gpuDynInst->wavefront();
8472 ConstVecOperandI32 src0(gpuDynInst, extData.SRC0);
8473 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
8474 VecOperandI32 vdst(gpuDynInst, instData.VDST);
8475
8476 src0.readSrc();
8477 src1.readSrc();
8478
8482 assert(!(instData.ABS & 0x1));
8483 assert(!(instData.ABS & 0x2));
8484 assert(!(instData.ABS & 0x4));
8485 assert(!(extData.NEG & 0x1));
8486 assert(!(extData.NEG & 0x2));
8487 assert(!(extData.NEG & 0x4));
8488
8489 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8490 if (wf->execMask(lane)) {
8491 VecElemI64 s0 = (VecElemI64)src0[lane];
8492 VecElemI64 s1 = (VecElemI64)src1[lane];
8493 vdst[lane]
8494 = (VecElemI32)(((s0 * s1) >> 32LL) & 0xffffffffLL);
8495 }
8496 }
8497
8498 vdst.write();
8499 } // execute
8500 // --- Inst_VOP3__V_LDEXP_F32 class methods ---
8501
8503 : Inst_VOP3A(iFmt, "v_ldexp_f32", false)
8504 {
8505 setFlag(ALU);
8506 setFlag(F32);
8507 } // Inst_VOP3__V_LDEXP_F32
8508
8510 {
8511 } // ~Inst_VOP3__V_LDEXP_F32
8512
8513 // --- description from .arch file ---
8514 // D.f = pow(S0.f, S1.i)
8515 void
8517 {
8518 Wavefront *wf = gpuDynInst->wavefront();
8519 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
8520 ConstVecOperandI32 src1(gpuDynInst, extData.SRC1);
8521 VecOperandF32 vdst(gpuDynInst, instData.VDST);
8522
8523 src0.readSrc();
8524 src1.readSrc();
8525
8529 assert(!(instData.ABS & 0x2));
8530 assert(!(instData.ABS & 0x4));
8531 assert(!(extData.NEG & 0x2));
8532 assert(!(extData.NEG & 0x4));
8533
8534 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8535 if (wf->execMask(lane)) {
8536 vdst[lane] = std::ldexp(src0[lane], src1[lane]);
8537 }
8538 }
8539
8540 vdst.write();
8541 } // execute
8542 // --- Inst_VOP3__V_READLANE_B32 class methods ---
8543
8545 : Inst_VOP3A(iFmt, "v_readlane_b32", true)
8546 {
8547 setFlag(ALU);
8548 setFlag(IgnoreExec);
8549 } // Inst_VOP3__V_READLANE_B32
8550
8552 {
8553 } // ~Inst_VOP3__V_READLANE_B32
8554
8555 // --- description from .arch file ---
8556 // Copy one VGPR value to one SGPR. D = SGPR-dest, S0 = Source Data (VGPR#
8557 // or M0(lds-direct)), S1 = Lane Select (SGPR or M0). Ignores exec mask.
8558 // Input and output modifiers not supported; this is an untyped operation.
8559 void
8561 {
8562 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
8563 ConstScalarOperandU32 src1(gpuDynInst, extData.SRC1);
8564 ScalarOperandU32 sdst(gpuDynInst, instData.VDST);
8565
8566 src0.readSrc();
8567 src1.read();
8568
8569 sdst = src0[src1.rawData() & 0x3f];
8570
8571 sdst.write();
8572 } // execute
8573 // --- Inst_VOP3__V_WRITELANE_B32 class methods ---
8574
8576 : Inst_VOP3A(iFmt, "v_writelane_b32", false)
8577 {
8578 setFlag(ALU);
8579 setFlag(IgnoreExec);
8580 } // Inst_VOP3__V_WRITELANE_B32
8581
8583 {
8584 } // ~Inst_VOP3__V_WRITELANE_B32
8585
8586 // --- description from .arch file ---
8587 // Write value into one VGPR in one lane. D = VGPR-dest, S0 = Source Data
8588 // (sgpr, m0, exec or constants), S1 = Lane Select (SGPR or M0). Ignores
8589 // exec mask.
8590 // Input and output modifiers not supported; this is an untyped operation.
8591 // SQ translates to V_MOV_B32.
8592 void
8594 {
8595 ConstScalarOperandU32 src0(gpuDynInst, extData.SRC0);
8596 ConstScalarOperandU32 src1(gpuDynInst, extData.SRC1);
8597 VecOperandU32 vdst(gpuDynInst, instData.VDST);
8598
8599 src0.read();
8600 src1.read();
8601 vdst.read();
8602
8603 vdst[src1.rawData() & 0x3f] = src0.rawData();
8604
8605 vdst.write();
8606 } // execute
8607 // --- Inst_VOP3__V_BCNT_U32_B32 class methods ---
8608
8610 : Inst_VOP3A(iFmt, "v_bcnt_u32_b32", false)
8611 {
8612 setFlag(ALU);
8613 } // Inst_VOP3__V_BCNT_U32_B32
8614
8616 {
8617 } // ~Inst_VOP3__V_BCNT_U32_B32
8618
8619 // --- description from .arch file ---
8620 // D.u = CountOneBits(S0.u) + S1.u. Bit count.
8621 void
8623 {
8624 Wavefront *wf = gpuDynInst->wavefront();
8625 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
8626 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
8627 VecOperandU32 vdst(gpuDynInst, instData.VDST);
8628
8629 src0.readSrc();
8630 src1.readSrc();
8631
8635 assert(!(instData.ABS & 0x1));
8636 assert(!(instData.ABS & 0x2));
8637 assert(!(instData.ABS & 0x4));
8638 assert(!(extData.NEG & 0x1));
8639 assert(!(extData.NEG & 0x2));
8640 assert(!(extData.NEG & 0x4));
8641
8642 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8643 if (wf->execMask(lane)) {
8644 vdst[lane] = popCount(src0[lane]) + src1[lane];
8645 }
8646 }
8647
8648 vdst.write();
8649 } // execute
8650 // --- Inst_VOP3__V_MBCNT_LO_U32_B32 class methods ---
8651
8653 InFmt_VOP3A *iFmt)
8654 : Inst_VOP3A(iFmt, "v_mbcnt_lo_u32_b32", false)
8655 {
8656 setFlag(ALU);
8657 } // Inst_VOP3__V_MBCNT_LO_U32_B32
8658
8660 {
8661 } // ~Inst_VOP3__V_MBCNT_LO_U32_B32
8662
8663 // --- description from .arch file ---
8664 // ThreadMask = (1 << ThreadPosition) - 1;
8665 // D.u = CountOneBits(S0.u & ThreadMask[31:0]) + S1.u.
8666 // Masked bit count, ThreadPosition is the position of this thread in the
8667 // --- wavefront (in 0..63).
8668 void
8670 {
8671 Wavefront *wf = gpuDynInst->wavefront();
8672 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
8673 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
8674 VecOperandU32 vdst(gpuDynInst, instData.VDST);
8675 uint64_t threadMask = 0;
8676
8677 src0.readSrc();
8678 src1.readSrc();
8679
8683 assert(!(instData.ABS & 0x1));
8684 assert(!(instData.ABS & 0x2));
8685 assert(!(instData.ABS & 0x4));
8686 assert(!(extData.NEG & 0x1));
8687 assert(!(extData.NEG & 0x2));
8688 assert(!(extData.NEG & 0x4));
8689
8690 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8691 if (wf->execMask(lane)) {
8692 threadMask = ((1ULL << lane) - 1ULL);
8693 vdst[lane] = popCount(src0[lane] & bits(threadMask, 31, 0)) +
8694 src1[lane];
8695 }
8696 }
8697
8698 vdst.write();
8699 } // execute
8700 // --- Inst_VOP3__V_MBCNT_HI_U32_B32 class methods ---
8701
8703 InFmt_VOP3A *iFmt)
8704 : Inst_VOP3A(iFmt, "v_mbcnt_hi_u32_b32", false)
8705 {
8706 setFlag(ALU);
8707 } // Inst_VOP3__V_MBCNT_HI_U32_B32
8708
8710 {
8711 } // ~Inst_VOP3__V_MBCNT_HI_U32_B32
8712
8713 // --- description from .arch file ---
8714 // ThreadMask = (1 << ThreadPosition) - 1;
8715 // D.u = CountOneBits(S0.u & ThreadMask[63:32]) + S1.u.
8716 // Masked bit count, ThreadPosition is the position of this thread in the
8717 // --- wavefront (in 0..63).
8718 void
8720 {
8721 Wavefront *wf = gpuDynInst->wavefront();
8722 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
8723 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
8724 VecOperandU32 vdst(gpuDynInst, instData.VDST);
8725 uint64_t threadMask = 0;
8726
8727 src0.readSrc();
8728 src1.readSrc();
8729
8733 assert(!(instData.ABS & 0x1));
8734 assert(!(instData.ABS & 0x2));
8735 assert(!(instData.ABS & 0x4));
8736 assert(!(extData.NEG & 0x1));
8737 assert(!(extData.NEG & 0x2));
8738 assert(!(extData.NEG & 0x4));
8739
8740 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8741 if (wf->execMask(lane)) {
8742 threadMask = ((1ULL << lane) - 1ULL);
8743 vdst[lane] = popCount(src0[lane] & bits(threadMask, 63, 32)) +
8744 src1[lane];
8745 }
8746 }
8747
8748 vdst.write();
8749 } // execute
8750 // --- Inst_VOP3__V_LSHLREV_B64 class methods ---
8751
8753 : Inst_VOP3A(iFmt, "v_lshlrev_b64", false)
8754 {
8755 setFlag(ALU);
8756 } // Inst_VOP3__V_LSHLREV_B64
8757
8759 {
8760 } // ~Inst_VOP3__V_LSHLREV_B64
8761
8762 // --- description from .arch file ---
8763 // D.u64 = S1.u64 << S0.u[5:0].
8764 // SQ translates this to an internal SP opcode.
8765 void
8767 {
8768 Wavefront *wf = gpuDynInst->wavefront();
8769 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
8770 ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
8771 VecOperandU64 vdst(gpuDynInst, instData.VDST);
8772
8773 src0.readSrc();
8774 src1.readSrc();
8775
8779 assert(!(instData.ABS & 0x1));
8780 assert(!(instData.ABS & 0x2));
8781 assert(!(instData.ABS & 0x4));
8782 assert(!(extData.NEG & 0x1));
8783 assert(!(extData.NEG & 0x2));
8784 assert(!(extData.NEG & 0x4));
8785
8786 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8787 if (wf->execMask(lane)) {
8788 vdst[lane] = src1[lane] << bits(src0[lane], 5, 0);
8789 }
8790 }
8791
8792 vdst.write();
8793 } // execute
8794 // --- Inst_VOP3__V_LSHRREV_B64 class methods ---
8795
8797 : Inst_VOP3A(iFmt, "v_lshrrev_b64", false)
8798 {
8799 setFlag(ALU);
8800 } // Inst_VOP3__V_LSHRREV_B64
8801
8803 {
8804 } // ~Inst_VOP3__V_LSHRREV_B64
8805
8806 // --- description from .arch file ---
8807 // D.u64 = S1.u64 >> S0.u[5:0].
8808 // The vacated bits are set to zero.
8809 // SQ translates this to an internal SP opcode.
8810 void
8812 {
8813 Wavefront *wf = gpuDynInst->wavefront();
8814 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
8815 ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
8816 VecOperandU64 vdst(gpuDynInst, instData.VDST);
8817
8818 src0.readSrc();
8819 src1.readSrc();
8820
8824 assert(!(instData.ABS & 0x1));
8825 assert(!(instData.ABS & 0x2));
8826 assert(!(instData.ABS & 0x4));
8827 assert(!(extData.NEG & 0x1));
8828 assert(!(extData.NEG & 0x2));
8829 assert(!(extData.NEG & 0x4));
8830
8831 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8832 if (wf->execMask(lane)) {
8833 vdst[lane] = src1[lane] >> bits(src0[lane], 5, 0);
8834 }
8835 }
8836
8837 vdst.write();
8838 } // execute
8839 // --- Inst_VOP3__V_ASHRREV_I64 class methods ---
8840
8842 : Inst_VOP3A(iFmt, "v_ashrrev_i64", false)
8843 {
8844 setFlag(ALU);
8845 } // Inst_VOP3__V_ASHRREV_I64
8846
8848 {
8849 } // ~Inst_VOP3__V_ASHRREV_I64
8850
8851 // --- description from .arch file ---
8852 // D.u64 = signext(S1.u64) >> S0.u[5:0].
8853 // The vacated bits are set to the sign bit of the input value.
8854 // SQ translates this to an internal SP opcode.
8855 void
8857 {
8858 Wavefront *wf = gpuDynInst->wavefront();
8859 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
8860 ConstVecOperandI64 src1(gpuDynInst, extData.SRC1);
8861 VecOperandU64 vdst(gpuDynInst, instData.VDST);
8862
8863 src0.readSrc();
8864 src1.readSrc();
8865
8869 assert(!(instData.ABS & 0x1));
8870 assert(!(instData.ABS & 0x2));
8871 assert(!(instData.ABS & 0x4));
8872 assert(!(extData.NEG & 0x1));
8873 assert(!(extData.NEG & 0x2));
8874 assert(!(extData.NEG & 0x4));
8875
8876 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8877 if (wf->execMask(lane)) {
8878 vdst[lane]
8879 = src1[lane] >> bits(src0[lane], 5, 0);
8880 }
8881 }
8882
8883 vdst.write();
8884 } // execute
8885 // --- Inst_VOP3__V_TRIG_PREOP_F64 class methods ---
8886
// NOTE(review): this capture is a Doxygen rendering — the leading numbers
// are rendered source-line numbers, and the hyperlinked signature lines
// (constructor, destructor, execute) were dropped by the extraction.
// Code is kept byte-identical below.
8888 : Inst_VOP3A(iFmt, "v_trig_preop_f64", false)
8889 {
8890 setFlag(ALU);
8891 setFlag(F64);
8892 } // Inst_VOP3__V_TRIG_PREOP_F64
8893
8895 {
8896 } // ~Inst_VOP3__V_TRIG_PREOP_F64
8897
8898 // --- description from .arch file ---
8899 // D.d = Look Up 2/PI (S0.d) with segment select S1.u[4:0]. This operation
8900 // returns an aligned, double precision segment of 2/PI needed to do range
8901 // reduction on S0.d (double-precision value). Multiple segments can be
8902 // specified through S1.u[4:0]. Rounding is always round-to-zero. Large
8903 // inputs (exp > 1968) are scaled to avoid loss of precision through
8904 // denormalization.
8905 void
8907 {
// NOTE(review): the execute() body (rendered line 8908) is missing from
// this capture — cannot tell from here whether the opcode is implemented
// or stubbed out; verify against the full source file.
8909 } // execute
8910 // --- Inst_VOP3__V_BFM_B32 class methods ---
8911
8913 : Inst_VOP3A(iFmt, "v_bfm_b32", false)
8914 {
8915 setFlag(ALU);
8916 } // Inst_VOP3__V_BFM_B32
8917
8919 {
8920 } // ~Inst_VOP3__V_BFM_B32
8921
8922 // --- description from .arch file ---
8923 // D.u = ((1<<S0.u[4:0])-1) << S1.u[4:0]; S0 is the bitfield width and S1
8924 // is the bitfield offset.
8925 void
8927 {
8928 Wavefront *wf = gpuDynInst->wavefront();
8929 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
8930 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
8931 VecOperandU32 vdst(gpuDynInst, instData.VDST);
8932
8933 src0.readSrc();
8934 src1.readSrc();
8935
8939 assert(!(instData.ABS & 0x1));
8940 assert(!(instData.ABS & 0x2));
8941 assert(!(instData.ABS & 0x4));
8942 assert(!(extData.NEG & 0x1));
8943 assert(!(extData.NEG & 0x2));
8944 assert(!(extData.NEG & 0x4));
8945
8946 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
8947 if (wf->execMask(lane)) {
8948 vdst[lane] = ((1 << bits(src0[lane], 4, 0)) - 1)
8949 << bits(src1[lane], 4, 0);
8950 }
8951 }
8952
8953 vdst.write();
8954 } // execute
8955 // --- Inst_VOP3__V_CVT_PKNORM_I16_F32 class methods ---
8956
8958 InFmt_VOP3A *iFmt)
8959 : Inst_VOP3A(iFmt, "v_cvt_pknorm_i16_f32", false)
8960 {
8961 setFlag(ALU);
8962 setFlag(F32);
8963 } // Inst_VOP3__V_CVT_PKNORM_I16_F32
8964
8966 {
8967 } // ~Inst_VOP3__V_CVT_PKNORM_I16_F32
8968
8969 // --- description from .arch file ---
8970 // D = {(snorm)S1.f, (snorm)S0.f}.
8971 void
8976 // --- Inst_VOP3__V_CVT_PKNORM_U16_F32 class methods ---
8977
8979 InFmt_VOP3A *iFmt)
8980 : Inst_VOP3A(iFmt, "v_cvt_pknorm_u16_f32", false)
8981 {
8982 setFlag(ALU);
8983 setFlag(F32);
8984 } // Inst_VOP3__V_CVT_PKNORM_U16_F32
8985
8987 {
8988 } // ~Inst_VOP3__V_CVT_PKNORM_U16_F32
8989
8990 // --- description from .arch file ---
8991 // D = {(unorm)S1.f, (unorm)S0.f}.
8992 void
8997 // --- Inst_VOP3__V_CVT_PKRTZ_F16_F32 class methods ---
8998
9000 InFmt_VOP3A *iFmt)
9001 : Inst_VOP3A(iFmt, "v_cvt_pkrtz_f16_f32", false)
9002 {
9003 setFlag(ALU);
9004 setFlag(F32);
9005 } // Inst_VOP3__V_CVT_PKRTZ_F16_F32
9006
9008 {
9009 } // ~Inst_VOP3__V_CVT_PKRTZ_F16_F32
9010
9011 // --- description from .arch file ---
9012 // D = {flt32_to_flt16(S1.f),flt32_to_flt16(S0.f)}, with round-toward-zero
9013 // --- regardless of current round mode setting in hardware.
9014 // This opcode is intended for use with 16-bit compressed exports.
9015 // See V_CVT_F16_F32 for a version that respects the current rounding mode.
9016 void
9021 // --- Inst_VOP3__V_CVT_PK_U16_U32 class methods ---
9022
// NOTE(review): Doxygen-rendered capture — leading numbers are rendered
// source-line numbers; the hyperlinked signature lines were dropped by
// the extraction. Code is kept byte-identical below.
9024 : Inst_VOP3A(iFmt, "v_cvt_pk_u16_u32", false)
9025 {
9026 setFlag(ALU);
9027 } // Inst_VOP3__V_CVT_PK_U16_U32
9028
9030 {
9031 } // ~Inst_VOP3__V_CVT_PK_U16_U32
9032
9033 // --- description from .arch file ---
9034 // D = {uint32_to_uint16(S1.u), uint32_to_uint16(S0.u)}.
9035 void
9037 {
// NOTE(review): the execute() body (rendered line 9038) is missing from
// this capture — cannot tell from here whether the opcode is implemented;
// verify against the full source file.
9039 } // execute
9040 // --- Inst_VOP3__V_CVT_PK_I16_I32 class methods ---
9041
// NOTE(review): Doxygen-rendered capture — leading numbers are rendered
// source-line numbers; the hyperlinked signature lines were dropped by
// the extraction. Code is kept byte-identical below.
9043 : Inst_VOP3A(iFmt, "v_cvt_pk_i16_i32", false)
9044 {
9045 setFlag(ALU);
9046 } // Inst_VOP3__V_CVT_PK_I16_I32
9047
9049 {
9050 } // ~Inst_VOP3__V_CVT_PK_I16_I32
9051
9052 // --- description from .arch file ---
9053 // D = {int32_to_int16(S1.i), int32_to_int16(S0.i)}.
9054 void
9056 {
// NOTE(review): the execute() body (rendered line 9057) is missing from
// this capture — cannot tell from here whether the opcode is implemented;
// verify against the full source file.
9058 } // execute
9059 // --- Inst_VOP3__V_CVT_PK_FP8_F32 class methods ---
9060
9062 : Inst_VOP3A(iFmt, "v_cvt_pk_fp8_f32", false)
9063 {
9064 setFlag(ALU);
9065 } // Inst_VOP3__V_CVT_PK_FP8_F32
9066
9068 {
9069 } // ~Inst_VOP3__V_CVT_PK_FP8_F32
9070
9071 void
9073 {
9074 Wavefront *wf = gpuDynInst->wavefront();
9075 ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
9076 ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
9077 VecOperandU32 vdst(gpuDynInst, instData.VDST);
9078
9079 src0.readSrc();
9080 src1.readSrc();
9081 vdst.read(); // Preserve bits
9082
9083 panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
9084 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
9085 panic_if(instData.CLAMP, "CLAMP not supported for %s", _opcode);
9086 panic_if(extData.OMOD, "OMOD not supported for %s", _opcode);
9087
9088 unsigned opsel = instData.OPSEL;
9089 unsigned abs = instData.ABS;
9090 unsigned neg = extData.NEG;
9091
9092 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
9093 if (wf->execMask(lane)) {
9094 AMDGPU::mxfloat8 tmp0(src0[lane]), tmp1(src1[lane]);
9095
9096 if ((abs & 1) && (tmp0 < 0)) tmp0 = -tmp0;
9097 if ((abs & 2) && (tmp1 < 0)) tmp1 = -tmp1;
9098 if (neg & 1) tmp0 = -tmp0;
9099 if (neg & 2) tmp1 = -tmp1;
9100
9101 uint16_t packed_data = (bits(tmp0.data, 31, 24) << 8)
9102 | bits(tmp1.data, 31, 24);
9103
9104 if (opsel & 8) {
9105 replaceBits(vdst[lane], 31, 16, packed_data);
9106 } else {
9107 replaceBits(vdst[lane], 15, 0, packed_data);
9108 }
9109 }
9110 }
9111
9112 vdst.write();
9113 } // execute
9114} // namespace VegaISA
9115} // namespace gem5
#define DPRINTF(x,...)
Definition trace.hh:209
uint32_t data
Definition mxfp.hh:112
void setFlag(Flags flag)
const std::string _opcode
Nop class.
Definition nop.hh:49
T omodModifier(T val, unsigned omod)
void execute(GPUDynInstPtr) override
Definition vop3.cc:7388
void execute(GPUDynInstPtr) override
Definition vop3.cc:1451
void execute(GPUDynInstPtr) override
Definition vop3.cc:1305
void execute(GPUDynInstPtr) override
Definition vop3.cc:1609
Inst_VOP3__V_ADD_F16(InFmt_VOP3A *)
Definition vop3.cc:1594
Inst_VOP3__V_ADD_F32(InFmt_VOP3A *)
Definition vop3.cc:81
void execute(GPUDynInstPtr) override
Definition vop3.cc:95
void execute(GPUDynInstPtr) override
Definition vop3.cc:8006
Inst_VOP3__V_ADD_F64(InFmt_VOP3A *)
Definition vop3.cc:7992
void execute(GPUDynInstPtr) override
Definition vop3.cc:7342
Inst_VOP3__V_ADD_U16(InFmt_VOP3A *)
Definition vop3.cc:1703
void execute(GPUDynInstPtr) override
Definition vop3.cc:1717
void execute(GPUDynInstPtr) override
Definition vop3.cc:2293
Inst_VOP3__V_ADD_U32(InFmt_VOP3A *)
Definition vop3.cc:2280
void execute(GPUDynInstPtr) override
Definition vop3.cc:5814
void execute(GPUDynInstPtr) override
Definition vop3.cc:5862
void execute(GPUDynInstPtr) override
Definition vop3.cc:1065
Inst_VOP3__V_AND_B32(InFmt_VOP3A *)
Definition vop3.cc:1051
void execute(GPUDynInstPtr) override
Definition vop3.cc:7480
void execute(GPUDynInstPtr) override
Definition vop3.cc:1990
void execute(GPUDynInstPtr) override
Definition vop3.cc:977
void execute(GPUDynInstPtr) override
Definition vop3.cc:8856
void execute(GPUDynInstPtr) override
Definition vop3.cc:8622
Inst_VOP3__V_BFE_I32(InFmt_VOP3A *)
Definition vop3.cc:5518
void execute(GPUDynInstPtr) override
Definition vop3.cc:5532
void execute(GPUDynInstPtr) override
Definition vop3.cc:5485
Inst_VOP3__V_BFE_U32(InFmt_VOP3A *)
Definition vop3.cc:5471
Inst_VOP3__V_BFI_B32(InFmt_VOP3A *)
Definition vop3.cc:5572
void execute(GPUDynInstPtr) override
Definition vop3.cc:5585
Inst_VOP3__V_BFM_B32(InFmt_VOP3A *)
Definition vop3.cc:8912
void execute(GPUDynInstPtr) override
Definition vop3.cc:8926
void execute(GPUDynInstPtr) override
Definition vop3.cc:4307
void execute(GPUDynInstPtr) override
Definition vop3.cc:4982
void execute(GPUDynInstPtr) override
Definition vop3.cc:3666
void execute(GPUDynInstPtr) override
Definition vop3.cc:3463
void execute(GPUDynInstPtr) override
Definition vop3.cc:4687
Inst_VOP3__V_CLREXCP(InFmt_VOP3A *)
Definition vop3.cc:4675
Inst_VOP3__V_CNDMASK_B32(InFmt_VOP3A *)
Definition vop3.cc:43
void execute(GPUDynInstPtr) override
Definition vop3.cc:58
void execute(GPUDynInstPtr) override
Definition vop3.cc:5085
Inst_VOP3__V_COS_F16(InFmt_VOP3A *)
Definition vop3.cc:5071
void execute(GPUDynInstPtr) override
Definition vop3.cc:4225
Inst_VOP3__V_COS_F32(InFmt_VOP3A *)
Definition vop3.cc:4209
void execute(GPUDynInstPtr) override
Definition vop3.cc:5402
void execute(GPUDynInstPtr) override
Definition vop3.cc:5465
void execute(GPUDynInstPtr) override
Definition vop3.cc:5423
void execute(GPUDynInstPtr) override
Definition vop3.cc:5444
void execute(GPUDynInstPtr) override
Definition vop3.cc:2860
void execute(GPUDynInstPtr) override
Definition vop3.cc:4729
void execute(GPUDynInstPtr) override
Definition vop3.cc:4708
void execute(GPUDynInstPtr) override
Definition vop3.cc:2918
void execute(GPUDynInstPtr) override
Definition vop3.cc:3070
void execute(GPUDynInstPtr) override
Definition vop3.cc:2637
void execute(GPUDynInstPtr) override
Definition vop3.cc:2679
void execute(GPUDynInstPtr) override
Definition vop3.cc:3166
void execute(GPUDynInstPtr) override
Definition vop3.cc:3206
void execute(GPUDynInstPtr) override
Definition vop3.cc:3246
void execute(GPUDynInstPtr) override
Definition vop3.cc:3286
void execute(GPUDynInstPtr) override
Definition vop3.cc:3118
void execute(GPUDynInstPtr) override
Definition vop3.cc:2597
void execute(GPUDynInstPtr) override
Definition vop3.cc:3382
void execute(GPUDynInstPtr) override
Definition vop3.cc:3009
void execute(GPUDynInstPtr) override
Definition vop3.cc:4771
void execute(GPUDynInstPtr) override
Definition vop3.cc:2777
void execute(GPUDynInstPtr) override
Definition vop3.cc:2545
void execute(GPUDynInstPtr) override
Definition vop3.cc:3049
void execute(GPUDynInstPtr) override
Definition vop3.cc:7828
void execute(GPUDynInstPtr) override
Definition vop3.cc:8972
void execute(GPUDynInstPtr) override
Definition vop3.cc:8993
void execute(GPUDynInstPtr) override
Definition vop3.cc:9017
void execute(GPUDynInstPtr) override
Definition vop3.cc:9072
void execute(GPUDynInstPtr) override
Definition vop3.cc:9055
void execute(GPUDynInstPtr) override
Definition vop3.cc:9036
void execute(GPUDynInstPtr) override
Definition vop3.cc:6572
void execute(GPUDynInstPtr) override
Definition vop3.cc:2968
void execute(GPUDynInstPtr) override
Definition vop3.cc:4750
void execute(GPUDynInstPtr) override
Definition vop3.cc:2721
void execute(GPUDynInstPtr) override
Definition vop3.cc:3328
void execute(GPUDynInstPtr) override
Definition vop3.cc:7755
void execute(GPUDynInstPtr) override
Definition vop3.cc:6629
void execute(GPUDynInstPtr) override
Definition vop3.cc:6707
void execute(GPUDynInstPtr) override
Definition vop3.cc:6945
void execute(GPUDynInstPtr) override
Definition vop3.cc:7008
void execute(GPUDynInstPtr) override
Definition vop3.cc:6802
void execute(GPUDynInstPtr) override
Definition vop3.cc:6859
Inst_VOP3__V_EXP_F16(InFmt_VOP3A *)
Definition vop3.cc:4869
void execute(GPUDynInstPtr) override
Definition vop3.cc:4886
void execute(GPUDynInstPtr) override
Definition vop3.cc:3787
Inst_VOP3__V_EXP_F32(InFmt_VOP3A *)
Definition vop3.cc:3773
void execute(GPUDynInstPtr) override
Definition vop3.cc:5105
void execute(GPUDynInstPtr) override
Definition vop3.cc:4419
void execute(GPUDynInstPtr) override
Definition vop3.cc:4339
void execute(GPUDynInstPtr) override
Definition vop3.cc:4379
void execute(GPUDynInstPtr) override
Definition vop3.cc:4961
void execute(GPUDynInstPtr) override
Definition vop3.cc:3747
void execute(GPUDynInstPtr) override
Definition vop3.cc:3544
void execute(GPUDynInstPtr) override
Definition vop3.cc:2424
Inst_VOP3__V_FMA_F16(InFmt_VOP3A *)
Definition vop3.cc:7691
void execute(GPUDynInstPtr) override
Definition vop3.cc:7707
void execute(GPUDynInstPtr) override
Definition vop3.cc:5633
Inst_VOP3__V_FMA_F32(InFmt_VOP3A *)
Definition vop3.cc:5618
Inst_VOP3__V_FMA_F64(InFmt_VOP3A *)
Definition vop3.cc:5679
void execute(GPUDynInstPtr) override
Definition vop3.cc:5694
void execute(GPUDynInstPtr) override
Definition vop3.cc:5045
void execute(GPUDynInstPtr) override
Definition vop3.cc:3584
void execute(GPUDynInstPtr) override
Definition vop3.cc:4547
void execute(GPUDynInstPtr) override
Definition vop3.cc:4940
void execute(GPUDynInstPtr) override
Definition vop3.cc:4593
void execute(GPUDynInstPtr) override
Definition vop3.cc:4460
void execute(GPUDynInstPtr) override
Definition vop3.cc:4913
void execute(GPUDynInstPtr) override
Definition vop3.cc:4644
void execute(GPUDynInstPtr) override
Definition vop3.cc:4506
void execute(GPUDynInstPtr) override
Definition vop3.cc:7900
void execute(GPUDynInstPtr) override
Definition vop3.cc:7929
void execute(GPUDynInstPtr) override
Definition vop3.cc:7959
void execute(GPUDynInstPtr) override
Definition vop3.cc:7855
void execute(GPUDynInstPtr) override
Definition vop3.cc:7986
void execute(GPUDynInstPtr) override
Definition vop3.cc:7880
void execute(GPUDynInstPtr) override
Definition vop3.cc:2274
void execute(GPUDynInstPtr) override
Definition vop3.cc:8516
void execute(GPUDynInstPtr) override
Definition vop3.cc:8318
void execute(GPUDynInstPtr) override
Definition vop3.cc:5759
Inst_VOP3__V_LERP_U8(InFmt_VOP3A *)
Definition vop3.cc:5740
void execute(GPUDynInstPtr) override
Definition vop3.cc:4863
Inst_VOP3__V_LOG_F16(InFmt_VOP3A *)
Definition vop3.cc:4846
void execute(GPUDynInstPtr) override
Definition vop3.cc:3827
Inst_VOP3__V_LOG_F32(InFmt_VOP3A *)
Definition vop3.cc:3813
void execute(GPUDynInstPtr) override
Definition vop3.cc:5153
void execute(GPUDynInstPtr) override
Definition vop3.cc:1894
void execute(GPUDynInstPtr) override
Definition vop3.cc:1021
void execute(GPUDynInstPtr) override
Definition vop3.cc:8766
void execute(GPUDynInstPtr) override
Definition vop3.cc:7296
void execute(GPUDynInstPtr) override
Definition vop3.cc:7774
void execute(GPUDynInstPtr) override
Definition vop3.cc:7433
void execute(GPUDynInstPtr) override
Definition vop3.cc:1939
void execute(GPUDynInstPtr) override
Definition vop3.cc:932
void execute(GPUDynInstPtr) override
Definition vop3.cc:8811
void execute(GPUDynInstPtr) override
Definition vop3.cc:1697
Inst_VOP3__V_MAC_F16(InFmt_VOP3A *)
Definition vop3.cc:1680
Inst_VOP3__V_MAC_F32(InFmt_VOP3A *)
Definition vop3.cc:1229
void execute(GPUDynInstPtr) override
Definition vop3.cc:1245
void execute(GPUDynInstPtr) override
Definition vop3.cc:7528
Inst_VOP3__V_MAD_F16(InFmt_VOP3A *)
Definition vop3.cc:7512
void execute(GPUDynInstPtr) override
Definition vop3.cc:5247
Inst_VOP3__V_MAD_F32(InFmt_VOP3A *)
Definition vop3.cc:5232
Inst_VOP3__V_MAD_I16(InFmt_VOP3A *)
Definition vop3.cc:7581
void execute(GPUDynInstPtr) override
Definition vop3.cc:7596
void execute(GPUDynInstPtr) override
Definition vop3.cc:5307
void execute(GPUDynInstPtr) override
Definition vop3.cc:7206
void execute(GPUDynInstPtr) override
Definition vop3.cc:5186
void execute(GPUDynInstPtr) override
Definition vop3.cc:7549
Inst_VOP3__V_MAD_U16(InFmt_VOP3A *)
Definition vop3.cc:7534
void execute(GPUDynInstPtr) override
Definition vop3.cc:5354
void execute(GPUDynInstPtr) override
Definition vop3.cc:7157
void execute(GPUDynInstPtr) override
Definition vop3.cc:6065
void execute(GPUDynInstPtr) override
Definition vop3.cc:6125
void execute(GPUDynInstPtr) override
Definition vop3.cc:6171
void execute(GPUDynInstPtr) override
Definition vop3.cc:2036
Inst_VOP3__V_MAX_F16(InFmt_VOP3A *)
Definition vop3.cc:2020
void execute(GPUDynInstPtr) override
Definition vop3.cc:703
Inst_VOP3__V_MAX_F32(InFmt_VOP3A *)
Definition vop3.cc:689
Inst_VOP3__V_MAX_F64(InFmt_VOP3A *)
Definition vop3.cc:8248
void execute(GPUDynInstPtr) override
Definition vop3.cc:8262
void execute(GPUDynInstPtr) override
Definition vop3.cc:2126
Inst_VOP3__V_MAX_I16(InFmt_VOP3A *)
Definition vop3.cc:2113
void execute(GPUDynInstPtr) override
Definition vop3.cc:801
Inst_VOP3__V_MAX_I32(InFmt_VOP3A *)
Definition vop3.cc:788
Inst_VOP3__V_MAX_U16(InFmt_VOP3A *)
Definition vop3.cc:2064
void execute(GPUDynInstPtr) override
Definition vop3.cc:2077
void execute(GPUDynInstPtr) override
Definition vop3.cc:887
Inst_VOP3__V_MAX_U32(InFmt_VOP3A *)
Definition vop3.cc:874
void execute(GPUDynInstPtr) override
Definition vop3.cc:8719
void execute(GPUDynInstPtr) override
Definition vop3.cc:8669
void execute(GPUDynInstPtr) override
Definition vop3.cc:6218
void execute(GPUDynInstPtr) override
Definition vop3.cc:6277
void execute(GPUDynInstPtr) override
Definition vop3.cc:6322
void execute(GPUDynInstPtr) override
Definition vop3.cc:5912
void execute(GPUDynInstPtr) override
Definition vop3.cc:5972
void execute(GPUDynInstPtr) override
Definition vop3.cc:6018
void execute(GPUDynInstPtr) override
Definition vop3.cc:2058
Inst_VOP3__V_MIN_F16(InFmt_VOP3A *)
Definition vop3.cc:2042
void execute(GPUDynInstPtr) override
Definition vop3.cc:647
Inst_VOP3__V_MIN_F32(InFmt_VOP3A *)
Definition vop3.cc:633
Inst_VOP3__V_MIN_F64(InFmt_VOP3A *)
Definition vop3.cc:8192
void execute(GPUDynInstPtr) override
Definition vop3.cc:8206
Inst_VOP3__V_MIN_I16(InFmt_VOP3A *)
Definition vop3.cc:2211
void execute(GPUDynInstPtr) override
Definition vop3.cc:2224
void execute(GPUDynInstPtr) override
Definition vop3.cc:758
Inst_VOP3__V_MIN_I32(InFmt_VOP3A *)
Definition vop3.cc:745
void execute(GPUDynInstPtr) override
Definition vop3.cc:2175
Inst_VOP3__V_MIN_U16(InFmt_VOP3A *)
Definition vop3.cc:2162
Inst_VOP3__V_MIN_U32(InFmt_VOP3A *)
Definition vop3.cc:831
void execute(GPUDynInstPtr) override
Definition vop3.cc:844
void execute(GPUDynInstPtr) override
Definition vop3.cc:2511
Inst_VOP3__V_MOV_B32(InFmt_VOP3A *)
Definition vop3.cc:2497
void execute(GPUDynInstPtr) override
Definition vop3.cc:2839
void execute(GPUDynInstPtr) override
Definition vop3.cc:7115
void execute(GPUDynInstPtr) override
Definition vop3.cc:7135
void execute(GPUDynInstPtr) override
Definition vop3.cc:7074
Inst_VOP3__V_MSAD_U8(InFmt_VOP3A *)
Definition vop3.cc:7061
Inst_VOP3__V_MUL_F16(InFmt_VOP3A *)
Definition vop3.cc:1659
void execute(GPUDynInstPtr) override
Definition vop3.cc:1674
void execute(GPUDynInstPtr) override
Definition vop3.cc:366
Inst_VOP3__V_MUL_F32(InFmt_VOP3A *)
Definition vop3.cc:352
Inst_VOP3__V_MUL_F64(InFmt_VOP3A *)
Definition vop3.cc:8091
void execute(GPUDynInstPtr) override
Definition vop3.cc:8105
void execute(GPUDynInstPtr) override
Definition vop3.cc:510
void execute(GPUDynInstPtr) override
Definition vop3.cc:8469
void execute(GPUDynInstPtr) override
Definition vop3.cc:601
void execute(GPUDynInstPtr) override
Definition vop3.cc:8423
void execute(GPUDynInstPtr) override
Definition vop3.cc:466
void execute(GPUDynInstPtr) override
Definition vop3.cc:265
void execute(GPUDynInstPtr) override
Definition vop3.cc:1850
void execute(GPUDynInstPtr) override
Definition vop3.cc:8378
void execute(GPUDynInstPtr) override
Definition vop3.cc:558
void execute(GPUDynInstPtr) override
Definition vop3.cc:2492
Inst_VOP3__V_NOP(InFmt_VOP3A *)
Definition vop3.cc:2478
void execute(GPUDynInstPtr) override
Definition vop3.cc:4267
Inst_VOP3__V_NOT_B32(InFmt_VOP3A *)
Definition vop3.cc:4253
Inst_VOP3__V_OR3_B32(InFmt_VOP3A *)
Definition vop3.cc:1139
void execute(GPUDynInstPtr) override
Definition vop3.cc:1153
Inst_VOP3__V_OR_B32(InFmt_VOP3A *)
Definition vop3.cc:1095
void execute(GPUDynInstPtr) override
Definition vop3.cc:1109
uint8_t permute(uint64_t in_dword2x, uint32_t sel)
void execute(GPUDynInstPtr) override
Definition vop3.cc:7654
void execute(GPUDynInstPtr) override
Definition vop3.cc:7094
Inst_VOP3__V_RCP_F16(InFmt_VOP3A *)
Definition vop3.cc:4777
void execute(GPUDynInstPtr) override
Definition vop3.cc:4794
Inst_VOP3__V_RCP_F32(InFmt_VOP3A *)
Definition vop3.cc:3861
void execute(GPUDynInstPtr) override
Definition vop3.cc:3875
void execute(GPUDynInstPtr) override
Definition vop3.cc:3997
Inst_VOP3__V_RCP_F64(InFmt_VOP3A *)
Definition vop3.cc:3983
void execute(GPUDynInstPtr) override
Definition vop3.cc:3917
void execute(GPUDynInstPtr) override
Definition vop3.cc:8560
void execute(GPUDynInstPtr) override
Definition vop3.cc:5025
void execute(GPUDynInstPtr) override
Definition vop3.cc:3706
void execute(GPUDynInstPtr) override
Definition vop3.cc:3503
void execute(GPUDynInstPtr) override
Definition vop3.cc:4840
Inst_VOP3__V_RSQ_F16(InFmt_VOP3A *)
Definition vop3.cc:4823
Inst_VOP3__V_RSQ_F32(InFmt_VOP3A *)
Definition vop3.cc:3943
void execute(GPUDynInstPtr) override
Definition vop3.cc:3957
void execute(GPUDynInstPtr) override
Definition vop3.cc:4049
Inst_VOP3__V_RSQ_F64(InFmt_VOP3A *)
Definition vop3.cc:4035
void execute(GPUDynInstPtr) override
Definition vop3.cc:6423
Inst_VOP3__V_SAD_U16(InFmt_VOP3A *)
Definition vop3.cc:6459
void execute(GPUDynInstPtr) override
Definition vop3.cc:6474
Inst_VOP3__V_SAD_U32(InFmt_VOP3A *)
Definition vop3.cc:6509
void execute(GPUDynInstPtr) override
Definition vop3.cc:6523
void execute(GPUDynInstPtr) override
Definition vop3.cc:6370
Inst_VOP3__V_SAD_U8(InFmt_VOP3A *)
Definition vop3.cc:6354
void execute(GPUDynInstPtr) override
Definition vop3.cc:5065
Inst_VOP3__V_SIN_F16(InFmt_VOP3A *)
Definition vop3.cc:5051
Inst_VOP3__V_SIN_F32(InFmt_VOP3A *)
Definition vop3.cc:4165
void execute(GPUDynInstPtr) override
Definition vop3.cc:4181
void execute(GPUDynInstPtr) override
Definition vop3.cc:4817
void execute(GPUDynInstPtr) override
Definition vop3.cc:4099
void execute(GPUDynInstPtr) override
Definition vop3.cc:4139
void execute(GPUDynInstPtr) override
Definition vop3.cc:1560
void execute(GPUDynInstPtr) override
Definition vop3.cc:1506
void execute(GPUDynInstPtr) override
Definition vop3.cc:1402
void execute(GPUDynInstPtr) override
Definition vop3.cc:1653
void execute(GPUDynInstPtr) override
Definition vop3.cc:209
void execute(GPUDynInstPtr) override
Definition vop3.cc:1806
void execute(GPUDynInstPtr) override
Definition vop3.cc:2379
void execute(GPUDynInstPtr) override
Definition vop3.cc:1353
Inst_VOP3__V_SUB_F16(InFmt_VOP3A *)
Definition vop3.cc:1615
void execute(GPUDynInstPtr) override
Definition vop3.cc:1631
void execute(GPUDynInstPtr) override
Definition vop3.cc:152
Inst_VOP3__V_SUB_F32(InFmt_VOP3A *)
Definition vop3.cc:137
Inst_VOP3__V_SUB_U16(InFmt_VOP3A *)
Definition vop3.cc:1747
void execute(GPUDynInstPtr) override
Definition vop3.cc:1761
Inst_VOP3__V_SUB_U32(InFmt_VOP3A *)
Definition vop3.cc:2323
void execute(GPUDynInstPtr) override
Definition vop3.cc:2336
void execute(GPUDynInstPtr) override
Definition vop3.cc:8906
void execute(GPUDynInstPtr) override
Definition vop3.cc:5003
void execute(GPUDynInstPtr) override
Definition vop3.cc:3625
void execute(GPUDynInstPtr) override
Definition vop3.cc:3422
void execute(GPUDynInstPtr) override
Definition vop3.cc:8593
void execute(GPUDynInstPtr) override
Definition vop3.cc:7251
Inst_VOP3__V_XAD_U32(InFmt_VOP3A *)
Definition vop3.cc:7238
Inst_VOP3__V_XOR_B32(InFmt_VOP3A *)
Definition vop3.cc:1185
void execute(GPUDynInstPtr) override
Definition vop3.cc:1199
void read() override
read from and write to the underlying register(s) that this operand is referring to.
Definition operand.hh:409
std::enable_if< Condition, DataType >::type rawData() const
we store scalar data in a std::array, however if we need the full operand data we use this method to ...
Definition operand.hh:392
std::enable_if< Condition, void >::type setBit(int bit, int bit_val)
bit access to scalar data.
Definition operand.hh:491
void read() override
read from the vrf.
Definition operand.hh:147
void readSrc()
certain vector operands can read from the vrf/srf or constants.
Definition operand.hh:131
void write() override
write to the vrf.
Definition operand.hh:199
VectorMask & execMask()
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
Definition bitfield.hh:79
constexpr int popCount(uint64_t val)
Returns the number of set ones in the provided value.
Definition bitfield.hh:415
constexpr void replaceBits(T &val, unsigned first, unsigned last, B bit_val)
A convenience function to replace bits first to last of val with bit_val in place.
Definition bitfield.hh:216
std::enable_if_t< std::is_integral_v< T >, T > reverseBits(T val, size_t size=sizeof(T))
Takes a value and returns the bit reversed version.
Definition bitfield.hh:255
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Definition logging.hh:214
Bitfield< 7 > i
Definition misc_types.hh:67
constexpr unsigned NumVecElemPerVecReg
Definition vec.hh:61
ScalarRegI32 firstOppositeSignBit(ScalarRegI32 val)
Definition inst_util.hh:174
ScalarRegI32 findFirstOne(T val)
Definition inst_util.hh:142
T median(T val_0, T val_1, T val_2)
Definition inst_util.hh:247
ScalarRegI32 findFirstOneMsb(T val)
Definition inst_util.hh:153
T roundNearestEven(T val)
Definition inst_util.hh:259
uint32_t VecElemU32
uint64_t VecElemU64
VecElemU32 muladd(VecElemU64 &dst, VecElemU32 val_0, VecElemU32 val_1, VecElemU64 val_2)
Definition inst_util.hh:272
Bitfield< 31, 16 > selector
Definition misc.hh:1038
Copyright (c) 2024 Arm Limited All rights reserved.
Definition binary32.hh:36
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition misc.hh:49
constexpr bool isinf(gem5::AMDGPU::fp16_e5m10_info a)
Definition fp16_e5m10.hh:78
constexpr bool isnan(gem5::AMDGPU::fp16_e5m10_info a)
Definition fp16_e5m10.hh:83

Generated on Mon Jan 13 2025 04:28:05 for gem5 by doxygen 1.9.8