gem5 v24.1.0.1
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
vop1.cc
Go to the documentation of this file.
1/*
2 * Copyright (c) 2024 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
34
35namespace gem5
36{
37
38namespace VegaISA
39{
40 // --- Inst_VOP1__V_NOP class methods ---
41
43 : Inst_VOP1(iFmt, "v_nop")
44 {
45 setFlag(Nop);
46 setFlag(ALU);
47 } // Inst_VOP1__V_NOP
48
50 {
51 } // ~Inst_VOP1__V_NOP
52
53 // --- description from .arch file ---
54 // Do nothing.
55 void
57 {
58 } // execute
59 // --- Inst_VOP1__V_MOV_B32 class methods ---
60
62 : Inst_VOP1(iFmt, "v_mov_b32")
63 {
64 setFlag(ALU);
65 } // Inst_VOP1__V_MOV_B32
66
68 {
69 } // ~Inst_VOP1__V_MOV_B32
70
71 // --- description from .arch file ---
72 // D.u = S0.u.
73 // Input and output modifiers not supported; this is an untyped operation.
74 void
76 {
77 Wavefront *wf = gpuDynInst->wavefront();
78 ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
79 VecOperandU32 vdst(gpuDynInst, instData.VDST);
80
81 src.readSrc();
82
83 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
84
85 if (isDPPInst()) {
86 VecOperandU32 src_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
87 src_dpp.read();
88
89 DPRINTF(VEGA, "Handling V_MOV_B32 SRC DPP. SRC0: register v[%d], "
90 "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, "
91 "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, "
92 "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0,
101
102 // NOTE: For VOP1, there is no SRC1, so make sure we're not trying
103 // to negate it or take the absolute value of it
106 processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src_dpp);
107
108 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
109 if (wf->execMask(lane)) {
110 vdst[lane] = src_dpp[lane];
111 }
112 }
113 } else {
114 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
115 if (wf->execMask(lane)) {
116 vdst[lane] = src[lane];
117 }
118 }
119 }
120
121 vdst.write();
122 } // execute
123 // --- Inst_VOP1__V_READFIRSTLANE_B32 class methods ---
124
126 InFmt_VOP1 *iFmt)
127 : Inst_VOP1(iFmt, "v_readfirstlane_b32")
128 {
129 setFlag(ALU);
130 } // Inst_VOP1__V_READFIRSTLANE_B32
131
133 {
134 } // ~Inst_VOP1__V_READFIRSTLANE_B32
135
136 // --- description from .arch file ---
137 // Copy one VGPR value to one SGPR. D = SGPR destination, S0 = source data
138 // (VGPR# or M0 for lds direct access), Lane# = FindFirst1fromLSB(exec)
139 // (Lane# = 0 if exec is zero). Ignores exec mask for the access. SQ
140 // translates to V_READLANE_B32.
141 // Input and output modifiers not supported; this is an untyped operation.
142 void
144 {
145 Wavefront *wf = gpuDynInst->wavefront();
146 ScalarRegI32 src_lane(0);
147 ScalarRegU64 exec_mask = wf->execMask().to_ullong();
148 ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
149 ScalarOperandU32 sdst(gpuDynInst, instData.VDST);
150
151 src.readSrc();
152
153 panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
154 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
155
156 if (exec_mask) {
157 src_lane = findLsbSet(exec_mask);
158 }
159
160 sdst = src[src_lane];
161
162 sdst.write();
163 } // execute
164 // --- Inst_VOP1__V_CVT_I32_F64 class methods ---
165
167 : Inst_VOP1(iFmt, "v_cvt_i32_f64")
168 {
169 setFlag(ALU);
170 setFlag(F64);
171 } // Inst_VOP1__V_CVT_I32_F64
172
174 {
175 } // ~Inst_VOP1__V_CVT_I32_F64
176
177 // --- description from .arch file ---
178 // D.i = (int)S0.d.
179 // Out-of-range floating point values (including infinity) saturate. NaN is
180 // --- converted to 0.
181 void
183 {
184 Wavefront *wf = gpuDynInst->wavefront();
185 ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
186 VecOperandI32 vdst(gpuDynInst, instData.VDST);
187
188 src.readSrc();
189
190 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
191 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
192
193 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
194 if (wf->execMask(lane)) {
195 int exp;
196 std::frexp(src[lane],&exp);
197 if (std::isnan(src[lane])) {
198 vdst[lane] = 0;
199 } else if (std::isinf(src[lane]) || exp > 30) {
200 if (std::signbit(src[lane])) {
201 vdst[lane] = INT_MIN;
202 } else {
203 vdst[lane] = INT_MAX;
204 }
205 } else {
206 vdst[lane] = (VecElemI32)src[lane];
207 }
208 }
209 }
210
211 vdst.write();
212 } // execute
213 // --- Inst_VOP1__V_CVT_F64_I32 class methods ---
214
216 : Inst_VOP1(iFmt, "v_cvt_f64_i32")
217 {
218 setFlag(ALU);
219 setFlag(F64);
220 } // Inst_VOP1__V_CVT_F64_I32
221
223 {
224 } // ~Inst_VOP1__V_CVT_F64_I32
225
226 // --- description from .arch file ---
227 // D.d = (double)S0.i.
228 void
230 {
231 Wavefront *wf = gpuDynInst->wavefront();
232 ConstVecOperandI32 src(gpuDynInst, instData.SRC0);
233 VecOperandF64 vdst(gpuDynInst, instData.VDST);
234
235 src.readSrc();
236
237 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
238 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
239
240 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
241 if (wf->execMask(lane)) {
242 vdst[lane] = (VecElemF64)src[lane];
243 }
244 }
245
246 vdst.write();
247 } // execute
248 // --- Inst_VOP1__V_CVT_F32_I32 class methods ---
249
251 : Inst_VOP1(iFmt, "v_cvt_f32_i32")
252 {
253 setFlag(ALU);
254 setFlag(F32);
255 } // Inst_VOP1__V_CVT_F32_I32
256
258 {
259 } // ~Inst_VOP1__V_CVT_F32_I32
260
261 // --- description from .arch file ---
262 // D.f = (float)S0.i.
263 void
265 {
266 Wavefront *wf = gpuDynInst->wavefront();
267 ConstVecOperandI32 src(gpuDynInst, instData.SRC0);
268 VecOperandF32 vdst(gpuDynInst, instData.VDST);
269
270 src.readSrc();
271
272 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
273 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
274
275 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
276 if (wf->execMask(lane)) {
277 vdst[lane] = (VecElemF32)src[lane];
278 }
279 }
280
281 vdst.write();
282 } // execute
283 // --- Inst_VOP1__V_CVT_F32_U32 class methods ---
284
286 : Inst_VOP1(iFmt, "v_cvt_f32_u32")
287 {
288 setFlag(ALU);
289 setFlag(F32);
290 } // Inst_VOP1__V_CVT_F32_U32
291
293 {
294 } // ~Inst_VOP1__V_CVT_F32_U32
295
296 // --- description from .arch file ---
297 // D.f = (float)S0.u.
298 void
300 {
301 Wavefront *wf = gpuDynInst->wavefront();
302 ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
303 VecOperandF32 vdst(gpuDynInst, instData.VDST);
304
305 src.readSrc();
306
307 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
308 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
309
310 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
311 if (wf->execMask(lane)) {
312 vdst[lane] = (VecElemF32)src[lane];
313 }
314 }
315
316 vdst.write();
317 } // execute
318 // --- Inst_VOP1__V_CVT_U32_F32 class methods ---
319
321 : Inst_VOP1(iFmt, "v_cvt_u32_f32")
322 {
323 setFlag(ALU);
324 setFlag(F32);
325 } // Inst_VOP1__V_CVT_U32_F32
326
328 {
329 } // ~Inst_VOP1__V_CVT_U32_F32
330
331 // --- description from .arch file ---
332 // D.u = (unsigned)S0.f.
333 // Out-of-range floating point values (including infinity) saturate. NaN is
334 // --- converted to 0.
335 void
337 {
338 Wavefront *wf = gpuDynInst->wavefront();
339 ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
340 VecOperandU32 vdst(gpuDynInst, instData.VDST);
341
342 src.readSrc();
343
344 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
345 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
346
347 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
348 if (wf->execMask(lane)) {
349 int exp;
350 std::frexp(src[lane],&exp);
351 if (std::isnan(src[lane])) {
352 vdst[lane] = 0;
353 } else if (std::isinf(src[lane])) {
354 if (std::signbit(src[lane])) {
355 vdst[lane] = 0;
356 } else {
357 vdst[lane] = UINT_MAX;
358 }
359 } else if (exp > 31) {
360 vdst[lane] = UINT_MAX;
361 } else {
362 vdst[lane] = (VecElemU32)src[lane];
363 }
364 }
365 }
366
367 vdst.write();
368 } // execute
369 // --- Inst_VOP1__V_CVT_I32_F32 class methods ---
370
372 : Inst_VOP1(iFmt, "v_cvt_i32_f32")
373 {
374 setFlag(ALU);
375 setFlag(F32);
376 } // Inst_VOP1__V_CVT_I32_F32
377
379 {
380 } // ~Inst_VOP1__V_CVT_I32_F32
381
382 // --- description from .arch file ---
383 // D.i = (int)S0.f.
384 // Out-of-range floating point values (including infinity) saturate. NaN is
385 // --- converted to 0.
386 void
388 {
389 Wavefront *wf = gpuDynInst->wavefront();
390 ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
391 VecOperandI32 vdst(gpuDynInst, instData.VDST);
392
393 src.readSrc();
394
395 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
396 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
397
398 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
399 if (wf->execMask(lane)) {
400 int exp;
401 std::frexp(src[lane],&exp);
402 if (std::isnan(src[lane])) {
403 vdst[lane] = 0;
404 } else if (std::isinf(src[lane]) || exp > 30) {
405 if (std::signbit(src[lane])) {
406 vdst[lane] = INT_MIN;
407 } else {
408 vdst[lane] = INT_MAX;
409 }
410 } else {
411 vdst[lane] = (VecElemI32)src[lane];
412 }
413 }
414 }
415
416 vdst.write();
417 } // execute
418 // --- Inst_VOP1__V_MOV_FED_B32 class methods ---
419
421 : Inst_VOP1(iFmt, "v_mov_fed_b32")
422 {
423 setFlag(ALU);
424 } // Inst_VOP1__V_MOV_FED_B32
425
427 {
428 } // ~Inst_VOP1__V_MOV_FED_B32
429
430 // --- description from .arch file ---
431 // D.u = S0.u;
432 // Introduce EDC double error upon write to dest vgpr without causing an
433 // --- exception.
434 // Input and output modifiers not supported; this is an untyped operation.
435 void
437 {
439 } // execute
440 // --- Inst_VOP1__V_CVT_F16_F32 class methods ---
441
443 : Inst_VOP1(iFmt, "v_cvt_f16_f32")
444 {
445 setFlag(ALU);
446 setFlag(F32);
447 } // Inst_VOP1__V_CVT_F16_F32
448
450 {
451 } // ~Inst_VOP1__V_CVT_F16_F32
452
453 // --- description from .arch file ---
454 // D.f16 = flt32_to_flt16(S0.f).
455 // Supports input modifiers and creates FP16 denormals when appropriate.
456 void
458 {
459 Wavefront *wf = gpuDynInst->wavefront();
460 ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
461 VecOperandU32 vdst(gpuDynInst, instData.VDST);
462
463 src.readSrc();
464
465 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
466 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
467
468 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
469 if (wf->execMask(lane)) {
470 float tmp = src[lane];
471 AMDGPU::mxfloat16 out(tmp);
472
473 vdst[lane] = (out.data >> 16);
474 }
475 }
476
477 vdst.write();
478 } // execute
479 // --- Inst_VOP1__V_CVT_F32_F16 class methods ---
480
482 : Inst_VOP1(iFmt, "v_cvt_f32_f16")
483 {
484 setFlag(ALU);
485 setFlag(F32);
486 } // Inst_VOP1__V_CVT_F32_F16
487
489 {
490 } // ~Inst_VOP1__V_CVT_F32_F16
491
492 // --- description from .arch file ---
493 // D.f = flt16_to_flt32(S0.f16).
494 // FP16 denormal inputs are always accepted.
495 void
497 {
498 Wavefront *wf = gpuDynInst->wavefront();
499 ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
500 VecOperandF32 vdst(gpuDynInst, instData.VDST);
501
502 src.readSrc();
503
504 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
505 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
506
507 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
508 if (wf->execMask(lane)) {
509 AMDGPU::mxfloat16 tmp(src[lane]);
510 vdst[lane] = float(tmp);
511 }
512 }
513
514 vdst.write();
515 } // execute
516 // --- Inst_VOP1__V_CVT_RPI_I32_F32 class methods ---
517
519 InFmt_VOP1 *iFmt)
520 : Inst_VOP1(iFmt, "v_cvt_rpi_i32_f32")
521 {
522 setFlag(ALU);
523 setFlag(F32);
524 } // Inst_VOP1__V_CVT_RPI_I32_F32
525
527 {
528 } // ~Inst_VOP1__V_CVT_RPI_I32_F32
529
530 // --- description from .arch file ---
531 // D.i = (int)floor(S0.f + 0.5).
532 void
534 {
535 Wavefront *wf = gpuDynInst->wavefront();
536 ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
537 VecOperandI32 vdst(gpuDynInst, instData.VDST);
538
539 src.readSrc();
540
541 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
542 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
543
544 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
545 if (wf->execMask(lane)) {
546 vdst[lane] = (VecElemI32)std::floor(src[lane] + 0.5);
547 }
548 }
549
550 vdst.write();
551 } // execute
552 // --- Inst_VOP1__V_CVT_FLR_I32_F32 class methods ---
553
555 InFmt_VOP1 *iFmt)
556 : Inst_VOP1(iFmt, "v_cvt_flr_i32_f32")
557 {
558 setFlag(ALU);
559 setFlag(F32);
560 } // Inst_VOP1__V_CVT_FLR_I32_F32
561
563 {
564 } // ~Inst_VOP1__V_CVT_FLR_I32_F32
565
566 // --- description from .arch file ---
567 // D.i = (int)floor(S0.f).
568 void
570 {
571 Wavefront *wf = gpuDynInst->wavefront();
572 ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
573 VecOperandI32 vdst(gpuDynInst, instData.VDST);
574
575 src.readSrc();
576
577 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
578 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
579
580 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
581 if (wf->execMask(lane)) {
582 vdst[lane] = (VecElemI32)std::floor(src[lane]);
583 }
584 }
585
586 vdst.write();
587 } // execute
588 // --- Inst_VOP1__V_CVT_OFF_F32_I4 class methods ---
589
591 : Inst_VOP1(iFmt, "v_cvt_off_f32_i4")
592 {
593 setFlag(ALU);
594 setFlag(F32);
595 } // Inst_VOP1__V_CVT_OFF_F32_I4
596
598 {
599 } // ~Inst_VOP1__V_CVT_OFF_F32_I4
600
601 // --- description from .arch file ---
602 // 4-bit signed int to 32-bit float. Used for interpolation in shader.
603 void
605 {
606 // Could not parse sq_uc.arch desc field
608 } // execute
609 // --- Inst_VOP1__V_CVT_F32_F64 class methods ---
610
612 : Inst_VOP1(iFmt, "v_cvt_f32_f64")
613 {
614 setFlag(ALU);
615 setFlag(F64);
616 } // Inst_VOP1__V_CVT_F32_F64
617
619 {
620 } // ~Inst_VOP1__V_CVT_F32_F64
621
622 // --- description from .arch file ---
623 // D.f = (float)S0.d.
624 void
626 {
627 Wavefront *wf = gpuDynInst->wavefront();
628 ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
629 VecOperandF32 vdst(gpuDynInst, instData.VDST);
630
631 src.readSrc();
632
633 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
634 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
635
636 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
637 if (wf->execMask(lane)) {
638 vdst[lane] = (VecElemF32)src[lane];
639 }
640 }
641
642 vdst.write();
643 } // execute
644 // --- Inst_VOP1__V_CVT_F64_F32 class methods ---
645
647 : Inst_VOP1(iFmt, "v_cvt_f64_f32")
648 {
649 setFlag(ALU);
650 setFlag(F64);
651 } // Inst_VOP1__V_CVT_F64_F32
652
654 {
655 } // ~Inst_VOP1__V_CVT_F64_F32
656
657 // --- description from .arch file ---
658 // D.d = (double)S0.f.
659 void
661 {
662 Wavefront *wf = gpuDynInst->wavefront();
663 ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
664 VecOperandF64 vdst(gpuDynInst, instData.VDST);
665
666 src.readSrc();
667
668 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
669 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
670
671 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
672 if (wf->execMask(lane)) {
673 vdst[lane] = (VecElemF64)src[lane];
674 }
675 }
676
677 vdst.write();
678 } // execute
679 // --- Inst_VOP1__V_CVT_F32_UBYTE0 class methods ---
680
682 : Inst_VOP1(iFmt, "v_cvt_f32_ubyte0")
683 {
684 setFlag(ALU);
685 setFlag(F32);
686 } // Inst_VOP1__V_CVT_F32_UBYTE0
687
689 {
690 } // ~Inst_VOP1__V_CVT_F32_UBYTE0
691
692 // --- description from .arch file ---
693 // D.f = (float)(S0.u[7:0]).
694 void
696 {
697 Wavefront *wf = gpuDynInst->wavefront();
698 ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
699 VecOperandF32 vdst(gpuDynInst, instData.VDST);
700
701 src.readSrc();
702
703 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
704 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
705
706 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
707 if (wf->execMask(lane)) {
708 vdst[lane] = (VecElemF32)(bits(src[lane], 7, 0));
709 }
710 }
711
712 vdst.write();
713 } // execute
714 // --- Inst_VOP1__V_CVT_F32_UBYTE1 class methods ---
715
717 : Inst_VOP1(iFmt, "v_cvt_f32_ubyte1")
718 {
719 setFlag(ALU);
720 setFlag(F32);
721 } // Inst_VOP1__V_CVT_F32_UBYTE1
722
724 {
725 } // ~Inst_VOP1__V_CVT_F32_UBYTE1
726
727 // --- description from .arch file ---
728 // D.f = (float)(S0.u[15:8]).
729 void
731 {
732 Wavefront *wf = gpuDynInst->wavefront();
733 ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
734 VecOperandF32 vdst(gpuDynInst, instData.VDST);
735
736 src.readSrc();
737
738 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
739 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
740
741 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
742 if (wf->execMask(lane)) {
743 vdst[lane] = (VecElemF32)(bits(src[lane], 15, 8));
744 }
745 }
746
747 vdst.write();
748 } // execute
749 // --- Inst_VOP1__V_CVT_F32_UBYTE2 class methods ---
750
752 : Inst_VOP1(iFmt, "v_cvt_f32_ubyte2")
753 {
754 setFlag(ALU);
755 setFlag(F32);
756 } // Inst_VOP1__V_CVT_F32_UBYTE2
757
759 {
760 } // ~Inst_VOP1__V_CVT_F32_UBYTE2
761
762 // --- description from .arch file ---
763 // D.f = (float)(S0.u[23:16]).
764 void
766 {
767 Wavefront *wf = gpuDynInst->wavefront();
768 ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
769 VecOperandF32 vdst(gpuDynInst, instData.VDST);
770
771 src.readSrc();
772
773 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
774 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
775
776 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
777 if (wf->execMask(lane)) {
778 vdst[lane] = (VecElemF32)(bits(src[lane], 23, 16));
779 }
780 }
781
782 vdst.write();
783 } // execute
784 // --- Inst_VOP1__V_CVT_F32_UBYTE3 class methods ---
785
787 : Inst_VOP1(iFmt, "v_cvt_f32_ubyte3")
788 {
789 setFlag(ALU);
790 setFlag(F32);
791 } // Inst_VOP1__V_CVT_F32_UBYTE3
792
794 {
795 } // ~Inst_VOP1__V_CVT_F32_UBYTE3
796
797 // --- description from .arch file ---
798 // D.f = (float)(S0.u[31:24]).
799 void
801 {
802 Wavefront *wf = gpuDynInst->wavefront();
803 ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
804 VecOperandF32 vdst(gpuDynInst, instData.VDST);
805
806 src.readSrc();
807
808 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
809 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
810
811 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
812 if (wf->execMask(lane)) {
813 vdst[lane] = (VecElemF32)(bits(src[lane], 31, 24));
814 }
815 }
816
817 vdst.write();
818 } // execute
819 // --- Inst_VOP1__V_CVT_U32_F64 class methods ---
820
822 : Inst_VOP1(iFmt, "v_cvt_u32_f64")
823 {
824 setFlag(ALU);
825 setFlag(F64);
826 } // Inst_VOP1__V_CVT_U32_F64
827
829 {
830 } // ~Inst_VOP1__V_CVT_U32_F64
831
832 // --- description from .arch file ---
833 // D.u = (unsigned)S0.d.
834 // Out-of-range floating point values (including infinity) saturate. NaN is
835 // --- converted to 0.
836 void
838 {
839 Wavefront *wf = gpuDynInst->wavefront();
840 ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
841 VecOperandU32 vdst(gpuDynInst, instData.VDST);
842
843 src.readSrc();
844
845 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
846 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
847
848 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
849 if (wf->execMask(lane)) {
850 int exp;
851 std::frexp(src[lane],&exp);
852 if (std::isnan(src[lane])) {
853 vdst[lane] = 0;
854 } else if (std::isinf(src[lane])) {
855 if (std::signbit(src[lane])) {
856 vdst[lane] = 0;
857 } else {
858 vdst[lane] = UINT_MAX;
859 }
860 } else if (exp > 31) {
861 vdst[lane] = UINT_MAX;
862 } else {
863 vdst[lane] = (VecElemU32)src[lane];
864 }
865 }
866 }
867
868 vdst.write();
869 } // execute
870 // --- Inst_VOP1__V_CVT_F64_U32 class methods ---
871
873 : Inst_VOP1(iFmt, "v_cvt_f64_u32")
874 {
875 setFlag(ALU);
876 setFlag(F64);
877 } // Inst_VOP1__V_CVT_F64_U32
878
880 {
881 } // ~Inst_VOP1__V_CVT_F64_U32
882
883 // --- description from .arch file ---
884 // D.d = (double)S0.u.
885 void
887 {
888 Wavefront *wf = gpuDynInst->wavefront();
889 ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
890 VecOperandF64 vdst(gpuDynInst, instData.VDST);
891
892 src.readSrc();
893
894 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
895 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
896
897 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
898 if (wf->execMask(lane)) {
899 vdst[lane] = (VecElemF64)src[lane];
900 }
901 }
902
903 vdst.write();
904 } // execute
905 // --- Inst_VOP1__V_TRUNC_F64 class methods ---
906
908 : Inst_VOP1(iFmt, "v_trunc_f64")
909 {
910 setFlag(ALU);
911 setFlag(F64);
912 } // Inst_VOP1__V_TRUNC_F64
913
915 {
916 } // ~Inst_VOP1__V_TRUNC_F64
917
918 // --- description from .arch file ---
919 // D.d = trunc(S0.d), return integer part of S0.d.
920 void
922 {
923 Wavefront *wf = gpuDynInst->wavefront();
924 ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
925 VecOperandF64 vdst(gpuDynInst, instData.VDST);
926
927 src.readSrc();
928
929 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
930 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
931
932 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
933 if (wf->execMask(lane)) {
934 vdst[lane] = std::trunc(src[lane]);
935 }
936 }
937
938 vdst.write();
939 } // execute
940 // --- Inst_VOP1__V_CEIL_F64 class methods ---
941
943 : Inst_VOP1(iFmt, "v_ceil_f64")
944 {
945 setFlag(ALU);
946 setFlag(F64);
947 } // Inst_VOP1__V_CEIL_F64
948
950 {
951 } // ~Inst_VOP1__V_CEIL_F64
952
953 // --- description from .arch file ---
954 // D.d = trunc(S0.d);
955 // if (S0.d > 0.0 && S0.d != D.d) then D.d += 1.0.
956 void
958 {
959 Wavefront *wf = gpuDynInst->wavefront();
960 ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
961 VecOperandF64 vdst(gpuDynInst, instData.VDST);
962
963 src.readSrc();
964
965 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
966 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
967
968 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
969 if (wf->execMask(lane)) {
970 vdst[lane] = std::ceil(src[lane]);
971 }
972 }
973
974 vdst.write();
975 } // execute
976 // --- Inst_VOP1__V_RNDNE_F64 class methods ---
977
979 : Inst_VOP1(iFmt, "v_rndne_f64")
980 {
981 setFlag(ALU);
982 setFlag(F64);
983 } // Inst_VOP1__V_RNDNE_F64
984
986 {
987 } // ~Inst_VOP1__V_RNDNE_F64
988
989 // --- description from .arch file ---
990 // D.d = round_nearest_even(S0.d).
991 void
993 {
994 Wavefront *wf = gpuDynInst->wavefront();
995 ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
996 VecOperandF64 vdst(gpuDynInst, instData.VDST);
997
998 src.readSrc();
999
1000 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1001 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
1002
1003 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1004 if (wf->execMask(lane)) {
1005 vdst[lane] = roundNearestEven(src[lane]);
1006 }
1007 }
1008
1009 vdst.write();
1010 } // execute
1011 // --- Inst_VOP1__V_FLOOR_F64 class methods ---
1012
1014 : Inst_VOP1(iFmt, "v_floor_f64")
1015 {
1016 setFlag(ALU);
1017 setFlag(F64);
1018 } // Inst_VOP1__V_FLOOR_F64
1019
1021 {
1022 } // ~Inst_VOP1__V_FLOOR_F64
1023
1024 // --- description from .arch file ---
1025 // D.d = trunc(S0.d);
1026 // if (S0.d < 0.0 && S0.d != D.d) then D.d += -1.0.
1027 void
1029 {
1030 Wavefront *wf = gpuDynInst->wavefront();
1031 ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
1032 VecOperandF64 vdst(gpuDynInst, instData.VDST);
1033
1034 src.readSrc();
1035
1036 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1037 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
1038
1039 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1040 if (wf->execMask(lane)) {
1041 vdst[lane] = std::floor(src[lane]);
1042 }
1043 }
1044
1045 vdst.write();
1046 } // execute
1047 // --- Inst_VOP1__V_FRACT_F32 class methods ---
1048
1050 : Inst_VOP1(iFmt, "v_fract_f32")
1051 {
1052 setFlag(ALU);
1053 setFlag(F32);
1054 } // Inst_VOP1__V_FRACT_F32
1055
1057 {
1058 } // ~Inst_VOP1__V_FRACT_F32
1059
1060 // --- description from .arch file ---
1061 // D.f = S0.f - floor(S0.f).
1062 void
1064 {
1065 Wavefront *wf = gpuDynInst->wavefront();
1066 ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
1067 VecOperandF32 vdst(gpuDynInst, instData.VDST);
1068
1069 src.readSrc();
1070
1071 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1072 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1073
1074 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1075 if (wf->execMask(lane)) {
1076 VecElemF32 int_part(0.0);
1077 vdst[lane] = std::modf(src[lane], &int_part);
1078 }
1079 }
1080
1081 vdst.write();
1082 } // execute
1083 // --- Inst_VOP1__V_TRUNC_F32 class methods ---
1084
1086 : Inst_VOP1(iFmt, "v_trunc_f32")
1087 {
1088 setFlag(ALU);
1089 setFlag(F32);
1090 } // Inst_VOP1__V_TRUNC_F32
1091
1093 {
1094 } // ~Inst_VOP1__V_TRUNC_F32
1095
1096 // --- description from .arch file ---
1097 // D.f = trunc(S0.f), return integer part of S0.f.
1098 void
1100 {
1101 Wavefront *wf = gpuDynInst->wavefront();
1102 ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
1103 VecOperandF32 vdst (gpuDynInst, instData.VDST);
1104
1105 src.readSrc();
1106
1107 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1108 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1109
1110 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1111 if (wf->execMask(lane)) {
1112 vdst[lane] = std::trunc(src[lane]);
1113 }
1114 }
1115
1116 vdst.write();
1117 } // execute
1118 // --- Inst_VOP1__V_CEIL_F32 class methods ---
1119
1121 : Inst_VOP1(iFmt, "v_ceil_f32")
1122 {
1123 setFlag(ALU);
1124 setFlag(F32);
1125 } // Inst_VOP1__V_CEIL_F32
1126
1128 {
1129 } // ~Inst_VOP1__V_CEIL_F32
1130
1131 // --- description from .arch file ---
1132 // D.f = trunc(S0.f);
1133 // if (S0.f > 0.0 && S0.f != D.f) then D.f += 1.0.
1134 void
1136 {
1137 Wavefront *wf = gpuDynInst->wavefront();
1138 ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
1139 VecOperandF32 vdst(gpuDynInst, instData.VDST);
1140
1141 src.readSrc();
1142
1143 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1144 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1145
1146 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1147 if (wf->execMask(lane)) {
1148 vdst[lane] = std::ceil(src[lane]);
1149 }
1150 }
1151
1152 vdst.write();
1153 } // execute
1154 // --- Inst_VOP1__V_RNDNE_F32 class methods ---
1155
1157 : Inst_VOP1(iFmt, "v_rndne_f32")
1158 {
1159 setFlag(ALU);
1160 setFlag(F32);
1161 } // Inst_VOP1__V_RNDNE_F32
1162
1164 {
1165 } // ~Inst_VOP1__V_RNDNE_F32
1166
1167 // --- description from .arch file ---
1168 // D.f = round_nearest_even(S0.f).
1169 void
1171 {
1172 Wavefront *wf = gpuDynInst->wavefront();
1173 ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
1174 VecOperandF32 vdst(gpuDynInst, instData.VDST);
1175
1176 src.readSrc();
1177
1178 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1179 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1180
1181 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1182 if (wf->execMask(lane)) {
1183 vdst[lane] = roundNearestEven(src[lane]);
1184 }
1185 }
1186
1187 vdst.write();
1188 } // execute
1189 // --- Inst_VOP1__V_FLOOR_F32 class methods ---
1190
1192 : Inst_VOP1(iFmt, "v_floor_f32")
1193 {
1194 setFlag(ALU);
1195 setFlag(F32);
1196 } // Inst_VOP1__V_FLOOR_F32
1197
1199 {
1200 } // ~Inst_VOP1__V_FLOOR_F32
1201
1202 // --- description from .arch file ---
1203 // D.f = trunc(S0.f);
1204 // if (S0.f < 0.0 && S0.f != D.f) then D.f += -1.0.
1205 void
1207 {
1208 Wavefront *wf = gpuDynInst->wavefront();
1209 ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
1210 VecOperandF32 vdst(gpuDynInst, instData.VDST);
1211
1212 src.readSrc();
1213
1214 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1215 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1216
1217 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1218 if (wf->execMask(lane)) {
1219 vdst[lane] = std::floor(src[lane]);
1220 }
1221 }
1222
1223 vdst.write();
1224 } // execute
1225 // --- Inst_VOP1__V_EXP_F32 class methods ---
1226
1228 : Inst_VOP1(iFmt, "v_exp_f32")
1229 {
1230 setFlag(ALU);
1231 setFlag(F32);
1232 } // Inst_VOP1__V_EXP_F32
1233
1235 {
1236 } // ~Inst_VOP1__V_EXP_F32
1237
1238 // --- description from .arch file ---
1239 // D.f = pow(2.0, S0.f).
1240 void
1242 {
1243 Wavefront *wf = gpuDynInst->wavefront();
1244 ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
1245 VecOperandF32 vdst(gpuDynInst, instData.VDST);
1246
1247 src.readSrc();
1248
1249 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1250 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1251
1252 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1253 if (wf->execMask(lane)) {
1254 vdst[lane] = std::pow(2.0, src[lane]);
1255 }
1256 }
1257
1258 vdst.write();
1259 } // execute
1260 // --- Inst_VOP1__V_LOG_F32 class methods ---
1261
1263 : Inst_VOP1(iFmt, "v_log_f32")
1264 {
1265 setFlag(ALU);
1266 setFlag(F32);
1267 } // Inst_VOP1__V_LOG_F32
1268
1270 {
1271 } // ~Inst_VOP1__V_LOG_F32
1272
1273 // --- description from .arch file ---
1274 // D.f = log2(S0.f). Base 2 logarithm.
1275 void
1277 {
1278 Wavefront *wf = gpuDynInst->wavefront();
1279 ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
1280 VecOperandF32 vdst(gpuDynInst, instData.VDST);
1281
1282 src.readSrc();
1283
1284 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1285 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1286
1287 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1288 if (wf->execMask(lane)) {
1289 vdst[lane] = std::log2(src[lane]);
1290 }
1291 }
1292
1293 vdst.write();
1294 } // execute
1295 // --- Inst_VOP1__V_RCP_F32 class methods ---
1296
1298 : Inst_VOP1(iFmt, "v_rcp_f32")
1299 {
1300 setFlag(ALU);
1301 setFlag(F32);
1302 } // Inst_VOP1__V_RCP_F32
1303
1305 {
1306 } // ~Inst_VOP1__V_RCP_F32
1307
1308 // --- description from .arch file ---
1309 // D.f = 1.0 / S0.f. Reciprocal with IEEE rules and < 1ulp error.
1310 void
1312 {
1313 Wavefront *wf = gpuDynInst->wavefront();
1314 ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
1315 VecOperandF32 vdst(gpuDynInst, instData.VDST);
1316
1317 src.readSrc();
1318
1319 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1320 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1321
1322 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1323 if (wf->execMask(lane)) {
1324 vdst[lane] = 1.0 / src[lane];
1325 }
1326 }
1327
1328 vdst.write();
1329 } // execute
1330 // --- Inst_VOP1__V_RCP_IFLAG_F32 class methods ---
1331
1333 : Inst_VOP1(iFmt, "v_rcp_iflag_f32")
1334 {
1335 setFlag(ALU);
1336 setFlag(F32);
1337 } // Inst_VOP1__V_RCP_IFLAG_F32
1338
1340 {
1341 } // ~Inst_VOP1__V_RCP_IFLAG_F32
1342
1343 // --- description from .arch file ---
1344 // D.f = 1.0 / S0.f. Reciprocal intended for integer division, can raise
1345 // --- integer DIV_BY_ZERO exception but cannot raise floating-point
1346 // --- exceptions.
1347 void
1349 {
1350 Wavefront *wf = gpuDynInst->wavefront();
1351 ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
1352 VecOperandF32 vdst(gpuDynInst, instData.VDST);
1353
1354 src.readSrc();
1355
1356 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1357 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1358
1359 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1360 if (wf->execMask(lane)) {
1361 vdst[lane] = 1.0 / src[lane];
1362 }
1363 }
1364
1365 vdst.write();
1366 } // execute
1367 // --- Inst_VOP1__V_RSQ_F32 class methods ---
1368
1370 : Inst_VOP1(iFmt, "v_rsq_f32")
1371 {
1372 setFlag(ALU);
1373 setFlag(F32);
1374 } // Inst_VOP1__V_RSQ_F32
1375
1377 {
1378 } // ~Inst_VOP1__V_RSQ_F32
1379
1380 // --- description from .arch file ---
1381 // D.f = 1.0 / sqrt(S0.f). Reciprocal square root with IEEE rules.
1382 void
1384 {
1385 Wavefront *wf = gpuDynInst->wavefront();
1386 ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
1387 VecOperandF32 vdst(gpuDynInst, instData.VDST);
1388
1389 src.readSrc();
1390
1391 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1392 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1393
1394 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1395 if (wf->execMask(lane)) {
1396 vdst[lane] = 1.0 / std::sqrt(src[lane]);
1397 }
1398 }
1399
1400 vdst.write();
1401 } // execute
1402 // --- Inst_VOP1__V_RCP_F64 class methods ---
1403
1405 : Inst_VOP1(iFmt, "v_rcp_f64")
1406 {
1407 setFlag(ALU);
1408 setFlag(F64);
1409 } // Inst_VOP1__V_RCP_F64
1410
1412 {
1413 } // ~Inst_VOP1__V_RCP_F64
1414
1415 // --- description from .arch file ---
1416 // D.d = 1.0 / S0.d.
1417 void
1419 {
1420 Wavefront *wf = gpuDynInst->wavefront();
1421 ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
1422 VecOperandF64 vdst(gpuDynInst, instData.VDST);
1423
1424 src.readSrc();
1425
1426 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1427 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
1428
1429 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1430 if (wf->execMask(lane)) {
1431 if (std::fpclassify(src[lane]) == FP_ZERO) {
1432 vdst[lane] = +INFINITY;
1433 } else if (std::isnan(src[lane])) {
1434 vdst[lane] = NAN;
1435 } else if (std::isinf(src[lane])) {
1436 if (std::signbit(src[lane])) {
1437 vdst[lane] = -0.0;
1438 } else {
1439 vdst[lane] = 0.0;
1440 }
1441 } else {
1442 vdst[lane] = 1.0 / src[lane];
1443 }
1444 }
1445 }
1446
1447 vdst.write();
1448 } // execute
1449 // --- Inst_VOP1__V_RSQ_F64 class methods ---
1450
1452 : Inst_VOP1(iFmt, "v_rsq_f64")
1453 {
1454 setFlag(ALU);
1455 setFlag(F64);
1456 } // Inst_VOP1__V_RSQ_F64
1457
1459 {
1460 } // ~Inst_VOP1__V_RSQ_F64
1461
1462 // --- description from .arch file ---
1463 // D.d = 1.0 / sqrt(S0.d). See V_RSQ_F32.
1464 void
1466 {
1467 Wavefront *wf = gpuDynInst->wavefront();
1468 ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
1469 VecOperandF64 vdst(gpuDynInst, instData.VDST);
1470
1471 src.readSrc();
1472
1473 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1474 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
1475
1476 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1477 if (wf->execMask(lane)) {
1478 if (std::fpclassify(src[lane]) == FP_ZERO) {
1479 vdst[lane] = +INFINITY;
1480 } else if (std::isnan(src[lane])) {
1481 vdst[lane] = NAN;
1482 } else if (std::isinf(src[lane])
1483 && !std::signbit(src[lane])) {
1484 vdst[lane] = 0.0;
1485 } else if (std::signbit(src[lane])) {
1486 vdst[lane] = NAN;
1487 } else {
1488 vdst[lane] = 1.0 / std::sqrt(src[lane]);
1489 }
1490 }
1491 }
1492
1493 vdst.write();
1494 } // execute
1495 // --- Inst_VOP1__V_SQRT_F32 class methods ---
1496
1498 : Inst_VOP1(iFmt, "v_sqrt_f32")
1499 {
1500 setFlag(ALU);
1501 setFlag(F32);
1502 } // Inst_VOP1__V_SQRT_F32
1503
1505 {
1506 } // ~Inst_VOP1__V_SQRT_F32
1507
1508 // --- description from .arch file ---
1509 // D.f = sqrt(S0.f).
1510 void
1512 {
1513 Wavefront *wf = gpuDynInst->wavefront();
1514 ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
1515 VecOperandF32 vdst(gpuDynInst, instData.VDST);
1516
1517 src.readSrc();
1518
1519 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1520 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1521
1522 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1523 if (wf->execMask(lane)) {
1524 vdst[lane] = std::sqrt(src[lane]);
1525 }
1526 }
1527
1528 vdst.write();
1529 } // execute
1530 // --- Inst_VOP1__V_SQRT_F64 class methods ---
1531
1533 : Inst_VOP1(iFmt, "v_sqrt_f64")
1534 {
1535 setFlag(ALU);
1536 setFlag(F64);
1537 } // Inst_VOP1__V_SQRT_F64
1538
1540 {
1541 } // ~Inst_VOP1__V_SQRT_F64
1542
1543 // --- description from .arch file ---
1544 // D.d = sqrt(S0.d).
1545 void
1547 {
1548 Wavefront *wf = gpuDynInst->wavefront();
1549 ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
1550 VecOperandF64 vdst(gpuDynInst, instData.VDST);
1551
1552 src.readSrc();
1553
1554 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1555 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
1556
1557 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1558 if (wf->execMask(lane)) {
1559 vdst[lane] = std::sqrt(src[lane]);
1560 }
1561 }
1562
1563 vdst.write();
1564 } // execute
1565 // --- Inst_VOP1__V_SIN_F32 class methods ---
1566
1568 : Inst_VOP1(iFmt, "v_sin_f32")
1569 {
1570 setFlag(ALU);
1571 setFlag(F32);
1572 } // Inst_VOP1__V_SIN_F32
1573
1575 {
1576 } // ~Inst_VOP1__V_SIN_F32
1577
1578 // --- description from .arch file ---
1579 // D.f = sin(S0.f * 2 * PI).
1580 // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in
1581 // float 0.0.
1582 void
1584 {
1585 Wavefront *wf = gpuDynInst->wavefront();
1586 ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
1587 ConstScalarOperandF32 pi(gpuDynInst, REG_PI);
1588 VecOperandF32 vdst(gpuDynInst, instData.VDST);
1589
1590 src.readSrc();
1591 pi.read();
1592
1593 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1594 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1595
1596 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1597 if (wf->execMask(lane)) {
1598 if (src[lane] < -256.0 || src[lane] > 256.0) {
1599 vdst[lane] = 0.0;
1600 } else {
1601 vdst[lane] = std::sin(src[lane] * 2.0 * pi.rawData());
1602 }
1603 }
1604 }
1605
1606 vdst.write();
1607 } // execute
1608 // --- Inst_VOP1__V_COS_F32 class methods ---
1609
1611 : Inst_VOP1(iFmt, "v_cos_f32")
1612 {
1613 setFlag(ALU);
1614 setFlag(F32);
1615 } // Inst_VOP1__V_COS_F32
1616
1618 {
1619 } // ~Inst_VOP1__V_COS_F32
1620
1621 // --- description from .arch file ---
1622 // D.f = cos(S0.f * 2 * PI).
1623 // Valid range of S0.f is [-256.0, +256.0]. Out of range input results in
1624 // float 1.0.
1625 void
1627 {
1628 Wavefront *wf = gpuDynInst->wavefront();
1629 ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
1630 ConstScalarOperandF32 pi(gpuDynInst, REG_PI);
1631 VecOperandF32 vdst(gpuDynInst, instData.VDST);
1632
1633 src.readSrc();
1634 pi.read();
1635
1636 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1637 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1638
1639 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1640 if (wf->execMask(lane)) {
1641 if (src[lane] < -256.0 || src[lane] > 256.0) {
1642 vdst[lane] = 0.0;
1643 } else {
1644 vdst[lane] = std::cos(src[lane] * 2.0 * pi.rawData());
1645 }
1646 }
1647 }
1648
1649 vdst.write();
1650 } // execute
1651 // --- Inst_VOP1__V_NOT_B32 class methods ---
1652
1654 : Inst_VOP1(iFmt, "v_not_b32")
1655 {
1656 setFlag(ALU);
1657 } // Inst_VOP1__V_NOT_B32
1658
1660 {
1661 } // ~Inst_VOP1__V_NOT_B32
1662
1663 // --- description from .arch file ---
1664 // D.u = ~S0.u.
1665 // Input and output modifiers not supported.
1666 void
1668 {
1669 Wavefront *wf = gpuDynInst->wavefront();
1670 ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
1671 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1672
1673 src.readSrc();
1674
1675 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1676 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1677
1678 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1679 if (wf->execMask(lane)) {
1680 vdst[lane] = ~src[lane];
1681 }
1682 }
1683
1684 vdst.write();
1685 } // execute
1686 // --- Inst_VOP1__V_BFREV_B32 class methods ---
1687
1689 : Inst_VOP1(iFmt, "v_bfrev_b32")
1690 {
1691 setFlag(ALU);
1692 } // Inst_VOP1__V_BFREV_B32
1693
1695 {
1696 } // ~Inst_VOP1__V_BFREV_B32
1697
1698 // --- description from .arch file ---
1699 // D.u[31:0] = S0.u[0:31], bitfield reverse.
1700 // Input and output modifiers not supported.
1701 void
1703 {
1704 Wavefront *wf = gpuDynInst->wavefront();
1705 ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
1706 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1707
1708 src.readSrc();
1709
1710 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1711 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1712
1713 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1714 if (wf->execMask(lane)) {
1715 vdst[lane] = reverseBits(src[lane]);
1716 }
1717 }
1718
1719 vdst.write();
1720 } // execute
1721 // --- Inst_VOP1__V_FFBH_U32 class methods ---
1722
1724 : Inst_VOP1(iFmt, "v_ffbh_u32")
1725 {
1726 setFlag(ALU);
1727 } // Inst_VOP1__V_FFBH_U32
1728
1730 {
1731 } // ~Inst_VOP1__V_FFBH_U32
1732
1733 // --- description from .arch file ---
1734 // D.u = position of first 1 in S0.u from MSB;
1735 // D.u = 0xffffffff if S0.u == 0.
1736 void
1738 {
1739 Wavefront *wf = gpuDynInst->wavefront();
1740 ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
1741 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1742
1743 src.readSrc();
1744
1745 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1746 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1747
1748 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1749 if (wf->execMask(lane)) {
1750 vdst[lane] = findFirstOneMsb(src[lane]);
1751 }
1752 }
1753
1754 vdst.write();
1755 } // execute
1756 // --- Inst_VOP1__V_FFBL_B32 class methods ---
1757
1759 : Inst_VOP1(iFmt, "v_ffbl_b32")
1760 {
1761 setFlag(ALU);
1762 } // Inst_VOP1__V_FFBL_B32
1763
1765 {
1766 } // ~Inst_VOP1__V_FFBL_B32
1767
1768 // --- description from .arch file ---
1769 // D.u = position of first 1 in S0.u from LSB;
1770 // D.u = 0xffffffff if S0.u == 0.
1771 void
1773 {
1774 Wavefront *wf = gpuDynInst->wavefront();
1775 ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
1776 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1777
1778 src.readSrc();
1779
1780 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1781 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1782
1783 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1784 if (wf->execMask(lane)) {
1785 vdst[lane] = findFirstOne(src[lane]);
1786 }
1787 }
1788
1789 vdst.write();
1790 } // execute
1791 // --- Inst_VOP1__V_FFBH_I32 class methods ---
1792
1794 : Inst_VOP1(iFmt, "v_ffbh_i32")
1795 {
1796 setFlag(ALU);
1797 } // Inst_VOP1__V_FFBH_I32
1798
1800 {
1801 } // ~Inst_VOP1__V_FFBH_I32
1802
1803 // --- description from .arch file ---
1804 // D.u = position of first bit different from sign bit in S0.i from MSB;
1805 // D.u = 0xffffffff if S0.i == 0 or S0.i == 0xffffffff.
1806 void
1808 {
1809 Wavefront *wf = gpuDynInst->wavefront();
1810 ConstVecOperandI32 src(gpuDynInst, instData.SRC0);
1811 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1812
1813 src.readSrc();
1814
1815 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1816 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1817
1818 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1819 if (wf->execMask(lane)) {
1820 vdst[lane] = firstOppositeSignBit(src[lane]);
1821 }
1822 }
1823
1824 vdst.write();
1825 } // execute
1826 // --- Inst_VOP1__V_FREXP_EXP_I32_F64 class methods ---
1827
1829 InFmt_VOP1 *iFmt)
1830 : Inst_VOP1(iFmt, "v_frexp_exp_i32_f64")
1831 {
1832 setFlag(ALU);
1833 setFlag(F64);
1834 } // Inst_VOP1__V_FREXP_EXP_I32_F64
1835
1837 {
1838 } // ~Inst_VOP1__V_FREXP_EXP_I32_F64
1839
1840 // --- description from .arch file ---
1841 // See V_FREXP_EXP_I32_F32.
1842 void
1844 {
1845 Wavefront *wf = gpuDynInst->wavefront();
1846 ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
1847 VecOperandI32 vdst(gpuDynInst, instData.VDST);
1848
1849 src.readSrc();
1850
1851 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1852 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
1853
1854 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1855 if (wf->execMask(lane)) {
1856 if (std::isinf(src[lane]) || std::isnan(src[lane])) {
1857 vdst[lane] = 0;
1858 } else {
1859 VecElemI32 exp = 0;
1860 std::frexp(src[lane], &exp);
1861 vdst[lane] = exp;
1862 }
1863 }
1864 }
1865
1866 vdst.write();
1867 } // execute
1868 // --- Inst_VOP1__V_FREXP_MANT_F64 class methods ---
1869
1871 : Inst_VOP1(iFmt, "v_frexp_mant_f64")
1872 {
1873 setFlag(ALU);
1874 setFlag(F64);
1875 } // Inst_VOP1__V_FREXP_MANT_F64
1876
1878 {
1879 } // ~Inst_VOP1__V_FREXP_MANT_F64
1880
1881 // --- description from .arch file ---
1882 // See V_FREXP_MANT_F32.
1883 void
1885 {
1886 Wavefront *wf = gpuDynInst->wavefront();
1887 ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
1888 VecOperandF64 vdst(gpuDynInst, instData.VDST);
1889
1890 src.readSrc();
1891
1892 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1893 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
1894
1895 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1896 if (wf->execMask(lane)) {
1897 if (std::isinf(src[lane]) || std::isnan(src[lane])) {
1898 vdst[lane] = src[lane];
1899 } else {
1900 VecElemI32 exp(0);
1901 vdst[lane] = std::frexp(src[lane], &exp);
1902 }
1903 }
1904 }
1905
1906 vdst.write();
1907 } // execute
1908 // --- Inst_VOP1__V_FRACT_F64 class methods ---
1909
1911 : Inst_VOP1(iFmt, "v_fract_f64")
1912 {
1913 setFlag(ALU);
1914 setFlag(F64);
1915 } // Inst_VOP1__V_FRACT_F64
1916
1918 {
1919 } // ~Inst_VOP1__V_FRACT_F64
1920
1921 // --- description from .arch file ---
1922 // See V_FRACT_F32.
1923 void
1925 {
1926 Wavefront *wf = gpuDynInst->wavefront();
1927 ConstVecOperandF64 src(gpuDynInst, instData.SRC0);
1928 VecOperandF64 vdst(gpuDynInst, instData.VDST);
1929
1930 src.readSrc();
1931
1932 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1933 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
1934
1935 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1936 if (wf->execMask(lane)) {
1937 VecElemF64 int_part(0.0);
1938 vdst[lane] = std::modf(src[lane], &int_part);
1939 }
1940 }
1941
1942 vdst.write();
1943 } // execute
1944 // --- Inst_VOP1__V_FREXP_EXP_I32_F32 class methods ---
1945
1947 InFmt_VOP1 *iFmt)
1948 : Inst_VOP1(iFmt, "v_frexp_exp_i32_f32")
1949 {
1950 setFlag(ALU);
1951 setFlag(F32);
1952 } // Inst_VOP1__V_FREXP_EXP_I32_F32
1953
1955 {
1956 } // ~Inst_VOP1__V_FREXP_EXP_I32_F32
1957
1958 // --- description from .arch file ---
1959 // if (S0.f == INF || S0.f == NAN) then D.i = 0;
1960 // else D.i = TwosComplement(Exponent(S0.f) - 127 + 1).
1961 // Returns exponent of single precision float input, such that S0.f =
1962 // significand * (2 ** exponent). See also FREXP_MANT_F32, which returns
1963 // the significand.
1964 void
1966 {
1967 Wavefront *wf = gpuDynInst->wavefront();
1968 ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
1969 VecOperandI32 vdst(gpuDynInst, instData.VDST);
1970
1971 src.readSrc();
1972
1973 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1974 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1975
1976 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1977 if (wf->execMask(lane)) {
1978 if (std::isinf(src[lane]) || std::isnan(src[lane])) {
1979 vdst[lane] = 0;
1980 } else {
1981 VecElemI32 exp(0);
1982 std::frexp(src[lane], &exp);
1983 vdst[lane] = exp;
1984 }
1985 }
1986 }
1987
1988 vdst.write();
1989 } // execute
1990 // --- Inst_VOP1__V_FREXP_MANT_F32 class methods ---
1991
1993 : Inst_VOP1(iFmt, "v_frexp_mant_f32")
1994 {
1995 setFlag(ALU);
1996 setFlag(F32);
1997 } // Inst_VOP1__V_FREXP_MANT_F32
1998
2000 {
2001 } // ~Inst_VOP1__V_FREXP_MANT_F32
2002
2003 // --- description from .arch file ---
2004 // if (S0.f == INF || S0.f == NAN) then D.f = S0.f;
2005 // else D.f = Mantissa(S0.f).
2006 // Result range is in (-1.0,-0.5][0.5,1.0) in normal cases. Returns binary
2007 // --- significand of single precision float input, such that S0.f =
2008 // --- significand * (2 ** exponent). See also FREXP_EXP_I32_F32, which
2009 // --- returns integer exponent.
2010 void
2012 {
2013 Wavefront *wf = gpuDynInst->wavefront();
2014 ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
2015 VecOperandF32 vdst(gpuDynInst, instData.VDST);
2016
2017 src.readSrc();
2018
2019 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
2020 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2021
2022 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2023 if (wf->execMask(lane)) {
2024 if (std::isinf(src[lane]) || std::isnan(src[lane])) {
2025 vdst[lane] = src[lane];
2026 } else {
2027 VecElemI32 exp(0);
2028 vdst[lane] = std::frexp(src[lane], &exp);
2029 }
2030 }
2031 }
2032
2033 vdst.write();
2034 } // execute
2035 // --- Inst_VOP1__V_CLREXCP class methods ---
2036
2038 : Inst_VOP1(iFmt, "v_clrexcp")
2039 {
2040 setFlag(ALU);
2041 } // Inst_VOP1__V_CLREXCP
2042
2044 {
2045 } // ~Inst_VOP1__V_CLREXCP
2046
2047 // --- description from .arch file ---
2048 // Clear wave's exception state in SIMD (SP).
2049 void
2051 {
2053 } // execute
2054 // --- Inst_VOP1__V_MOV_B64 class methods ---
2055
2057 : Inst_VOP1(iFmt, "v_mov_b64")
2058 {
2059 setFlag(ALU);
2060 } // Inst_VOP1__V_MOV_B64
2061
2063 {
2064 } // ~Inst_VOP1__V_MOV_B64
2065
2066 // --- description from .arch file ---
2067 // D.u = S0.u.
2068 // Input and output modifiers not supported; this is an untyped operation.
2069 void
2071 {
2072 Wavefront *wf = gpuDynInst->wavefront();
2073 ConstVecOperandU64 src(gpuDynInst, instData.SRC0);
2074 VecOperandU64 vdst(gpuDynInst, instData.VDST);
2075
2076 src.readSrc();
2077
2078 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
2079 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2080
2081 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2082 if (wf->execMask(lane)) {
2083 vdst[lane] = src[lane];
2084 }
2085 }
2086
2087 vdst.write();
2088 } // execute
2089 // --- Inst_VOP1__V_CVT_F16_U16 class methods ---
2090
2092 : Inst_VOP1(iFmt, "v_cvt_f16_u16")
2093 {
2094 setFlag(ALU);
2095 setFlag(F16);
2096 } // Inst_VOP1__V_CVT_F16_U16
2097
2099 {
2100 } // ~Inst_VOP1__V_CVT_F16_U16
2101
2102 // --- description from .arch file ---
2103 // D.f16 = uint16_to_flt16(S.u16).
2104 // Supports denormals, rounding, exception flags and saturation.
2105 void
2107 {
2109 } // execute
2110 // --- Inst_VOP1__V_CVT_F16_I16 class methods ---
2111
2113 : Inst_VOP1(iFmt, "v_cvt_f16_i16")
2114 {
2115 setFlag(ALU);
2116 setFlag(F16);
2117 } // Inst_VOP1__V_CVT_F16_I16
2118
2120 {
2121 } // ~Inst_VOP1__V_CVT_F16_I16
2122
2123 // --- description from .arch file ---
2124 // D.f16 = int16_to_flt16(S.i16).
2125 // Supports denormals, rounding, exception flags and saturation.
2126 void
2128 {
2130 } // execute
2131 // --- Inst_VOP1__V_CVT_U16_F16 class methods ---
2132
2134 : Inst_VOP1(iFmt, "v_cvt_u16_f16")
2135 {
2136 setFlag(ALU);
2137 setFlag(F16);
2138 } // Inst_VOP1__V_CVT_U16_F16
2139
2141 {
2142 } // ~Inst_VOP1__V_CVT_U16_F16
2143
2144 // --- description from .arch file ---
2145 // D.u16 = flt16_to_uint16(S.f16).
2146 // Supports rounding, exception flags and saturation.
2147 void
2149 {
2151 } // execute
2152 // --- Inst_VOP1__V_CVT_I16_F16 class methods ---
2153
2155 : Inst_VOP1(iFmt, "v_cvt_i16_f16")
2156 {
2157 setFlag(ALU);
2158 setFlag(F16);
2159 } // Inst_VOP1__V_CVT_I16_F16
2160
2162 {
2163 } // ~Inst_VOP1__V_CVT_I16_F16
2164
2165 // --- description from .arch file ---
2166 // D.i16 = flt16_to_int16(S.f16).
2167 // Supports rounding, exception flags and saturation.
2168 void
2170 {
2172 } // execute
2173 // --- Inst_VOP1__V_RCP_F16 class methods ---
2174
2176 : Inst_VOP1(iFmt, "v_rcp_f16")
2177 {
2178 setFlag(ALU);
2179 setFlag(F16);
2180 } // Inst_VOP1__V_RCP_F16
2181
2183 {
2184 } // ~Inst_VOP1__V_RCP_F16
2185
2186 // --- description from .arch file ---
2187 // if (S0.f16 == 1.0f)
2188 // D.f16 = 1.0f;
2189 // else
2190 // D.f16 = ApproximateRecip(S0.f16).
2191 void
2193 {
2195 } // execute
2196 // --- Inst_VOP1__V_SQRT_F16 class methods ---
2197
2199 : Inst_VOP1(iFmt, "v_sqrt_f16")
2200 {
2201 setFlag(ALU);
2202 setFlag(F16);
2203 } // Inst_VOP1__V_SQRT_F16
2204
2206 {
2207 } // ~Inst_VOP1__V_SQRT_F16
2208
2209 // --- description from .arch file ---
2210 // if (S0.f16 == 1.0f)
2211 // D.f16 = 1.0f;
2212 // else
2213 // D.f16 = ApproximateSqrt(S0.f16).
2214 void
2216 {
2218 } // execute
2219 // --- Inst_VOP1__V_RSQ_F16 class methods ---
2220
2222 : Inst_VOP1(iFmt, "v_rsq_f16")
2223 {
2224 setFlag(ALU);
2225 setFlag(F16);
2226 } // Inst_VOP1__V_RSQ_F16
2227
2229 {
2230 } // ~Inst_VOP1__V_RSQ_F16
2231
2232 // --- description from .arch file ---
2233 // if (S0.f16 == 1.0f)
2234 // D.f16 = 1.0f;
2235 // else
2236 // D.f16 = ApproximateRecipSqrt(S0.f16).
2237 void
2239 {
2241 } // execute
2242 // --- Inst_VOP1__V_LOG_F16 class methods ---
2243
2245 : Inst_VOP1(iFmt, "v_log_f16")
2246 {
2247 setFlag(ALU);
2248 setFlag(F16);
2249 } // Inst_VOP1__V_LOG_F16
2250
2252 {
2253 } // ~Inst_VOP1__V_LOG_F16
2254
2255 // --- description from .arch file ---
2256 // if (S0.f16 == 1.0f)
2257 // D.f16 = 0.0f;
2258 // else
2259 // D.f16 = ApproximateLog2(S0.f16).
2260 void
2262 {
2264 } // execute
2265 // --- Inst_VOP1__V_EXP_F16 class methods ---
2266
2268 : Inst_VOP1(iFmt, "v_exp_f16")
2269 {
2270 setFlag(ALU);
2271 setFlag(F16);
2272 } // Inst_VOP1__V_EXP_F16
2273
2275 {
2276 } // ~Inst_VOP1__V_EXP_F16
2277
2278 // --- description from .arch file ---
2279 // if (S0.f16 == 0.0f)
2280 // D.f16 = 1.0f;
2281 // else
2282 // D.f16 = Approximate2ToX(S0.f16).
2283 void
2285 {
2287 } // execute
2288 // --- Inst_VOP1__V_FREXP_MANT_F16 class methods ---
2289
2291 : Inst_VOP1(iFmt, "v_frexp_mant_f16")
2292 {
2293 setFlag(ALU);
2294 setFlag(F16);
2295 } // Inst_VOP1__V_FREXP_MANT_F16
2296
2298 {
2299 } // ~Inst_VOP1__V_FREXP_MANT_F16
2300
2301 // --- description from .arch file ---
2302 // if (S0.f16 == +-INF || S0.f16 == NAN)
2303 // D.f16 = S0.f16;
2304 // else
2305 // D.f16 = mantissa(S0.f16).
2306 // Result range is (-1.0,-0.5][0.5,1.0).
2307 // C math library frexp function.
2308 // Returns binary significand of half precision float input, such that the
2309 // original single float = significand * (2 ** exponent).
2310 void
2312 {
2314 } // execute
2315 // --- Inst_VOP1__V_FREXP_EXP_I16_F16 class methods ---
2316
2318 InFmt_VOP1 *iFmt)
2319 : Inst_VOP1(iFmt, "v_frexp_exp_i16_f16")
2320 {
2321 setFlag(ALU);
2322 setFlag(F16);
2323 } // Inst_VOP1__V_FREXP_EXP_I16_F16
2324
2326 {
2327 } // ~Inst_VOP1__V_FREXP_EXP_I16_F16
2328
2329 // --- description from .arch file ---
2330 // if (S0.f16 == +-INF || S0.f16 == NAN)
2331 // D.i16 = 0;
2332 // else
2333 // D.i16 = 2s_complement(exponent(S0.f16) - 15 + 1).
2334 // C math library frexp function.
2335 // Returns exponent of half precision float input, such that the
2336 // original single float = significand * (2 ** exponent).
2337 void
2342 // --- Inst_VOP1__V_FLOOR_F16 class methods ---
2343
2345 : Inst_VOP1(iFmt, "v_floor_f16")
2346 {
2347 setFlag(ALU);
2348 setFlag(F16);
2349 } // Inst_VOP1__V_FLOOR_F16
2350
2352 {
2353 } // ~Inst_VOP1__V_FLOOR_F16
2354
2355 // --- description from .arch file ---
2356 // D.f16 = trunc(S0.f16);
2357 // if (S0.f16 < 0.0f && S0.f16 != D.f16) then D.f16 -= 1.0f.
2358 void
2360 {
2362 } // execute
2363 // --- Inst_VOP1__V_CEIL_F16 class methods ---
2364
2366 : Inst_VOP1(iFmt, "v_ceil_f16")
2367 {
2368 setFlag(ALU);
2369 setFlag(F16);
2370 } // Inst_VOP1__V_CEIL_F16
2371
2373 {
2374 } // ~Inst_VOP1__V_CEIL_F16
2375
2376 // --- description from .arch file ---
2377 // D.f16 = trunc(S0.f16);
2378 // if (S0.f16 > 0.0f && S0.f16 != D.f16) then D.f16 += 1.0f.
2379 void
2381 {
2383 } // execute
2384 // --- Inst_VOP1__V_TRUNC_F16 class methods ---
2385
2387 : Inst_VOP1(iFmt, "v_trunc_f16")
2388 {
2389 setFlag(ALU);
2390 setFlag(F16);
2391 } // Inst_VOP1__V_TRUNC_F16
2392
2394 {
2395 } // ~Inst_VOP1__V_TRUNC_F16
2396
2397 // --- description from .arch file ---
2398 // D.f16 = trunc(S0.f16).
2399 // Round-to-zero semantics.
2400 void
2402 {
2404 } // execute
2405 // --- Inst_VOP1__V_RNDNE_F16 class methods ---
2406
2408 : Inst_VOP1(iFmt, "v_rndne_f16")
2409 {
2410 setFlag(ALU);
2411 setFlag(F16);
2412 } // Inst_VOP1__V_RNDNE_F16
2413
2415 {
2416 } // ~Inst_VOP1__V_RNDNE_F16
2417
2418 // --- description from .arch file ---
2419 // D.f16 = FLOOR(S0.f16 + 0.5f);
2420 // if (floor(S0.f16) is even && fract(S0.f16) == 0.5f) then D.f16 -= 1.0f.
2421 // Round-to-nearest-even semantics.
2422 void
2424 {
2426 } // execute
2427 // --- Inst_VOP1__V_FRACT_F16 class methods ---
2428
2430 : Inst_VOP1(iFmt, "v_fract_f16")
2431 {
2432 setFlag(ALU);
2433 setFlag(F16);
2434 } // Inst_VOP1__V_FRACT_F16
2435
2437 {
2438 } // ~Inst_VOP1__V_FRACT_F16
2439
2440 // --- description from .arch file ---
2441 // D.f16 = S0.f16 + -floor(S0.f16).
2442 void
2444 {
2446 } // execute
2447 // --- Inst_VOP1__V_SIN_F16 class methods ---
2448
2450 : Inst_VOP1(iFmt, "v_sin_f16")
2451 {
2452 setFlag(ALU);
2453 setFlag(F16);
2454 } // Inst_VOP1__V_SIN_F16
2455
2457 {
2458 } // ~Inst_VOP1__V_SIN_F16
2459
2460 // --- description from .arch file ---
2461 // D.f16 = sin(S0.f16 * 2 * PI).
2462 void
2464 {
2466 } // execute
2467 // --- Inst_VOP1__V_COS_F16 class methods ---
2468
2470 : Inst_VOP1(iFmt, "v_cos_f16")
2471 {
2472 setFlag(ALU);
2473 setFlag(F16);
2474 } // Inst_VOP1__V_COS_F16
2475
2477 {
2478 } // ~Inst_VOP1__V_COS_F16
2479
2480 // --- description from .arch file ---
2481 // D.f16 = cos(S0.f16 * 2 * PI).
2482 void
2484 {
2486 } // execute
2487 // --- Inst_VOP1__V_EXP_LEGACY_F32 class methods ---
2488
2490 : Inst_VOP1(iFmt, "v_exp_legacy_f32")
2491 {
2492 setFlag(ALU);
2493 setFlag(F32);
2494 } // Inst_VOP1__V_EXP_LEGACY_F32
2495
2497 {
2498 } // ~Inst_VOP1__V_EXP_LEGACY_F32
2499
2500 // --- description from .arch file ---
2501 // D.f = pow(2.0, S0.f) with legacy semantics.
2502 void
2504 {
2505 Wavefront *wf = gpuDynInst->wavefront();
2506 ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
2507 VecOperandF32 vdst(gpuDynInst, instData.VDST);
2508
2509 src.readSrc();
2510
2511 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
2512 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2513
2514 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2515 if (wf->execMask(lane)) {
2516 vdst[lane] = std::pow(2.0, src[lane]);
2517 }
2518 }
2519
2520 vdst.write();
2521 } // execute
2522 // --- Inst_VOP1__V_LOG_LEGACY_F32 class methods ---
2523
2525 : Inst_VOP1(iFmt, "v_log_legacy_f32")
2526 {
2527 setFlag(ALU);
2528 setFlag(F32);
2529 } // Inst_VOP1__V_LOG_LEGACY_F32
2530
2532 {
2533 } // ~Inst_VOP1__V_LOG_LEGACY_F32
2534
2535 // --- description from .arch file ---
2536 // D.f = log2(S0.f). Base 2 logarithm with legacy semantics.
2537 void
2539 {
2540 Wavefront *wf = gpuDynInst->wavefront();
2541 ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
2542 VecOperandF32 vdst(gpuDynInst, instData.VDST);
2543
2544 src.readSrc();
2545
2546 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
2547 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2548
2549 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2550 if (wf->execMask(lane)) {
2551 vdst[lane] = std::log2(src[lane]);
2552 }
2553 }
2554
2555 vdst.write();
2556 } // execute
2557 // --- Inst_VOP1__V_ACCVGPR_MOV_B32 class methods ---
2558
2561 : Inst_VOP1(iFmt, "v_accvgpr_mov_b32")
2562 {
2563 setFlag(ALU);
2564 } // Inst_VOP1__V_ACCVGPR_MOV_B32
2565
2567 {
2568 } // ~Inst_VOP1__V_ACCVGPR_MOV_B32
2569
2570 void
2572 {
2573 Wavefront *wf = gpuDynInst->wavefront();
2574 unsigned accum_offset = wf->accumOffset;
2575
2576 ConstVecOperandU32 src(gpuDynInst, instData.SRC0+accum_offset);
2577 VecOperandU32 vdst(gpuDynInst, instData.VDST+accum_offset);
2578
2579 src.readSrc();
2580
2581 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
2582 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2583
2584 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2585 if (wf->execMask(lane)) {
2586 vdst[lane] = src[lane];
2587 }
2588 }
2589
2590 vdst.write();
2591 } // execute
2592} // namespace VegaISA
2593} // namespace gem5
#define DPRINTF(x,...)
Definition trace.hh:209
uint32_t data
Definition mxfp.hh:112
void setFlag(Flags flag)
const std::string _opcode
Nop class.
Definition nop.hh:49
void execute(GPUDynInstPtr) override
Definition vop1.cc:2571
void execute(GPUDynInstPtr) override
Definition vop1.cc:1702
void execute(GPUDynInstPtr) override
Definition vop1.cc:2380
void execute(GPUDynInstPtr) override
Definition vop1.cc:1135
void execute(GPUDynInstPtr) override
Definition vop1.cc:957
void execute(GPUDynInstPtr) override
Definition vop1.cc:2050
void execute(GPUDynInstPtr) override
Definition vop1.cc:2483
void execute(GPUDynInstPtr) override
Definition vop1.cc:1626
void execute(GPUDynInstPtr) override
Definition vop1.cc:457
void execute(GPUDynInstPtr) override
Definition vop1.cc:2127
void execute(GPUDynInstPtr) override
Definition vop1.cc:2106
void execute(GPUDynInstPtr) override
Definition vop1.cc:496
void execute(GPUDynInstPtr) override
Definition vop1.cc:625
void execute(GPUDynInstPtr) override
Definition vop1.cc:264
void execute(GPUDynInstPtr) override
Definition vop1.cc:299
void execute(GPUDynInstPtr) override
Definition vop1.cc:695
void execute(GPUDynInstPtr) override
Definition vop1.cc:730
void execute(GPUDynInstPtr) override
Definition vop1.cc:765
void execute(GPUDynInstPtr) override
Definition vop1.cc:800
void execute(GPUDynInstPtr) override
Definition vop1.cc:660
void execute(GPUDynInstPtr) override
Definition vop1.cc:229
void execute(GPUDynInstPtr) override
Definition vop1.cc:886
void execute(GPUDynInstPtr) override
Definition vop1.cc:569
void execute(GPUDynInstPtr) override
Definition vop1.cc:2169
void execute(GPUDynInstPtr) override
Definition vop1.cc:387
void execute(GPUDynInstPtr) override
Definition vop1.cc:182
void execute(GPUDynInstPtr) override
Definition vop1.cc:604
void execute(GPUDynInstPtr) override
Definition vop1.cc:533
void execute(GPUDynInstPtr) override
Definition vop1.cc:2148
void execute(GPUDynInstPtr) override
Definition vop1.cc:336
void execute(GPUDynInstPtr) override
Definition vop1.cc:837
void execute(GPUDynInstPtr) override
Definition vop1.cc:2284
void execute(GPUDynInstPtr) override
Definition vop1.cc:1241
void execute(GPUDynInstPtr) override
Definition vop1.cc:2503
void execute(GPUDynInstPtr) override
Definition vop1.cc:1807
void execute(GPUDynInstPtr) override
Definition vop1.cc:1737
void execute(GPUDynInstPtr) override
Definition vop1.cc:1772
void execute(GPUDynInstPtr) override
Definition vop1.cc:2359
void execute(GPUDynInstPtr) override
Definition vop1.cc:1206
void execute(GPUDynInstPtr) override
Definition vop1.cc:1028
void execute(GPUDynInstPtr) override
Definition vop1.cc:2443
void execute(GPUDynInstPtr) override
Definition vop1.cc:1063
void execute(GPUDynInstPtr) override
Definition vop1.cc:1924
void execute(GPUDynInstPtr) override
Definition vop1.cc:2338
void execute(GPUDynInstPtr) override
Definition vop1.cc:1965
void execute(GPUDynInstPtr) override
Definition vop1.cc:1843
void execute(GPUDynInstPtr) override
Definition vop1.cc:2311
void execute(GPUDynInstPtr) override
Definition vop1.cc:2011
void execute(GPUDynInstPtr) override
Definition vop1.cc:1884
void execute(GPUDynInstPtr) override
Definition vop1.cc:2261
void execute(GPUDynInstPtr) override
Definition vop1.cc:1276
void execute(GPUDynInstPtr) override
Definition vop1.cc:2538
Inst_VOP1__V_MOV_B32(InFmt_VOP1 *)
Definition vop1.cc:61
void execute(GPUDynInstPtr) override
Definition vop1.cc:75
void execute(GPUDynInstPtr) override
Definition vop1.cc:2070
void execute(GPUDynInstPtr) override
Definition vop1.cc:436
Inst_VOP1__V_NOP(InFmt_VOP1 *)
Definition vop1.cc:42
void execute(GPUDynInstPtr) override
Definition vop1.cc:56
void execute(GPUDynInstPtr) override
Definition vop1.cc:1667
void execute(GPUDynInstPtr) override
Definition vop1.cc:2192
void execute(GPUDynInstPtr) override
Definition vop1.cc:1311
void execute(GPUDynInstPtr) override
Definition vop1.cc:1418
void execute(GPUDynInstPtr) override
Definition vop1.cc:1348
void execute(GPUDynInstPtr) override
Definition vop1.cc:143
void execute(GPUDynInstPtr) override
Definition vop1.cc:2423
void execute(GPUDynInstPtr) override
Definition vop1.cc:1170
void execute(GPUDynInstPtr) override
Definition vop1.cc:992
void execute(GPUDynInstPtr) override
Definition vop1.cc:2238
void execute(GPUDynInstPtr) override
Definition vop1.cc:1383
void execute(GPUDynInstPtr) override
Definition vop1.cc:1465
void execute(GPUDynInstPtr) override
Definition vop1.cc:2463
void execute(GPUDynInstPtr) override
Definition vop1.cc:1583
void execute(GPUDynInstPtr) override
Definition vop1.cc:2215
void execute(GPUDynInstPtr) override
Definition vop1.cc:1511
void execute(GPUDynInstPtr) override
Definition vop1.cc:1546
void execute(GPUDynInstPtr) override
Definition vop1.cc:2401
void execute(GPUDynInstPtr) override
Definition vop1.cc:1099
void execute(GPUDynInstPtr) override
Definition vop1.cc:921
void read() override
read from and write to the underlying register(s) that this operand is referring to.
Definition operand.hh:409
std::enable_if< Condition, DataType >::type rawData() const
we store scalar data in a std::array, however if we need the full operand data we use this method to ...
Definition operand.hh:392
void read() override
read from the vrf.
Definition operand.hh:147
void readSrc()
certain vector operands can read from the vrf/srf or constants.
Definition operand.hh:131
void write() override
write to the vrf.
Definition operand.hh:199
uint32_t accumOffset
Definition wavefront.hh:137
VectorMask & execMask()
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
Definition bitfield.hh:79
constexpr int findLsbSet(uint64_t val)
Returns the bit position of the LSB that is set in the input That function will either use a builtin ...
Definition bitfield.hh:369
std::enable_if_t< std::is_integral_v< T >, T > reverseBits(T val, size_t size=sizeof(T))
Takes a value and returns the bit reversed version.
Definition bitfield.hh:255
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Definition logging.hh:214
constexpr unsigned NumVecElemPerVecReg
Definition vec.hh:61
ScalarRegI32 firstOppositeSignBit(ScalarRegI32 val)
Definition inst_util.hh:174
uint64_t ScalarRegU64
ScalarRegI32 findFirstOne(T val)
Definition inst_util.hh:142
ScalarRegI32 findFirstOneMsb(T val)
Definition inst_util.hh:153
T roundNearestEven(T val)
Definition inst_util.hh:259
uint32_t VecElemU32
void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst, T &src0)
processDPP is a helper function for implementing Data Parallel Primitive instructions.
Definition inst_util.hh:424
Copyright (c) 2024 Arm Limited All rights reserved.
Definition binary32.hh:36
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition misc.hh:49
constexpr bool isinf(gem5::AMDGPU::fp16_e5m10_info a)
Definition fp16_e5m10.hh:78
constexpr bool isnan(gem5::AMDGPU::fp16_e5m10_info a)
Definition fp16_e5m10.hh:83

Generated on Mon Jan 13 2025 04:27:59 for gem5 by doxygen 1.9.8