/*
 * Copyright (c) 2025 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __ARCH_AMDGPU_VEGA_INSTS_VOP3_CVT_HH__
#define __ARCH_AMDGPU_VEGA_INSTS_VOP3_CVT_HH__

#include <vector>

#include "arch/amdgpu/vega/insts/inst_util.hh"
#include "arch/amdgpu/vega/insts/op_encodings.hh"

namespace gem5
{

namespace VegaISA
{

/**
 * Base class for all V_CVT_SCALEF32_PK* instructions in MI355X.
 */
template<typename dFMT, typename sFMT, const char **MNEM>
class Inst_VOP3__V_CVT_SCALE_PK : public Inst_VOP3A
{
  public:
    Inst_VOP3__V_CVT_SCALE_PK(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, *MNEM, false)
    {
        setFlag(ALU);
    }

    ~Inst_VOP3__V_CVT_SCALE_PK() = default;

    void
    execute(GPUDynInstPtr gpuDynInst) override
    {
        static_assert(dFMT::size() == 32 || dFMT::size() == 16 ||
                      dFMT::size() == 8 || dFMT::size() == 4);
        static_assert(sFMT::size() == 32 || sFMT::size() == 16 ||
                      sFMT::size() == 8 || sFMT::size() == 4);

        Wavefront *wf = gpuDynInst->wavefront();

        // For the operands, there might be an easier way to type these based
        // on dFMT/sFMT. Here we declare all of the possibilities and only
        // read/write the valid ones in an if constexpr conditional.
        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
        ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);

        VecOperandU64 vdst64(gpuDynInst, instData.VDST);
        VecOperandU32 vdst32(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        if constexpr (sFMT::size() == 32) {
            src2.readSrc();
        }

        // The destination is read first so that, when dFMT is smaller than
        // 32 bits, the register can be updated without clobbering the
        // unwritten bits.
        if constexpr (dFMT::size() == 32) {
            vdst64.read();
        } else {
            vdst32.read();
        }

        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
        panic_if(extData.OMOD, "OMOD not supported for %s", _opcode);
        panic_if(instData.ABS, "ABS not supported for %s", _opcode);
        panic_if(extData.NEG, "NEG not supported for %s", _opcode);

        // For 32- and 16-bit source formats this is unused. For 8 bit only
        // bit 0 is valid. For 4 bit only bits 0 and 1 are valid.
        [[maybe_unused]] int in_opsel = 0;
        if constexpr (sFMT::size() == 8) {
            in_opsel = instData.OPSEL & 1;
        } else if (sFMT::size() == 4) {
            in_opsel = instData.OPSEL & 3;
        } else {
            in_opsel = 0;
        }

        // If the destination size is 8 bits, select the word in vdst using
        // bit 3. If the size is 4 bits, select the byte using bits 3 and 2.
        int out_opsel = 0;
        if constexpr (dFMT::size() == 8) {
            out_opsel = (instData.OPSEL >> 3) & 1;
        } else if (dFMT::size() == 4) {
            out_opsel = (instData.OPSEL >> 2) & 3;
        } else {
            out_opsel = 0;
        }
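        // Worked example: for a 4-bit destination format with
        // OPSEL == 0b1100, out_opsel == 3, so the packed byte produced
        // below replaces bits [31:24] of the destination dword.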

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                dFMT cvt1, cvt2;

                // When downcasting, scale before conversion; otherwise scale
                // after conversion. Read the scale value first in either
                // case.
                float scale_val = 1.0f;
                if constexpr (sFMT::size() == 32) {
                    scale_val = src2[lane];
                } else {
                    VecElemU32 tmp = src1[lane];
                    scale_val = *reinterpret_cast<float*>(&tmp);
                }

                if constexpr (sFMT::size() == 32) {
                    sFMT tmp1(src0[lane]);
                    sFMT tmp2(src1[lane]);

                    static_assert(dFMT::size() < sFMT::size());
                    tmp1.scaleDiv(scale_val);
                    tmp2.scaleDiv(scale_val);

                    // Implicit conversion here
                    cvt1 = tmp1;
                    cvt2 = tmp2;
                } else {
                    auto in = unpackMXOperands<sFMT>(src0[lane], in_opsel);

                    if (dFMT::size() < sFMT::size()) {
                        in.first.scaleDiv(scale_val);
                        in.second.scaleDiv(scale_val);
                    }

                    cvt1 = in.first;
                    cvt2 = in.second;
                }

                // Upcasting. Scale after conversion from above.
                if (dFMT::size() >= sFMT::size()) {
                    cvt1.scaleMul(scale_val);
                    cvt2.scaleMul(scale_val);
                }
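
                // Net effect of this ordering: the divide by the scale
                // happens in the wider source format before a downcast, and
                // the multiply happens in the wider destination format after
                // an upcast, so the scaling arithmetic is always done in the
                // wider of the two formats.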

                if (instData.CLAMP) {
                    cvt1 = std::clamp(float(cvt1), 0.0f, 1.0f);
                    cvt2 = std::clamp(float(cvt2), 0.0f, 1.0f);
                }

                if constexpr (dFMT::size() == 32) {
                    vdst64[lane] = packMXOperands64(cvt2, cvt1);
                } else if (dFMT::size() == 16) {
                    vdst32[lane] = packMXOperands32(cvt2, cvt1);
                } else if (dFMT::size() == 8) {
                    uint16_t packed_data = packMXOperands32(cvt2, cvt1);
                    vdst32[lane] = insertBits(vdst32[lane],
                                              16 * out_opsel + 15,
                                              16 * out_opsel, packed_data);
                } else {
                    uint8_t packed_data = packMXOperands32(cvt2, cvt1);
                    vdst32[lane] = insertBits(vdst32[lane],
                                              8 * out_opsel + 7,
                                              8 * out_opsel, packed_data);
                }
            }
        }

        if constexpr (dFMT::size() == 32) {
            vdst64.write();
        } else {
            vdst32.write();
        }
    }

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    }

    int
    numDstRegOperands() override
    {
        return 1;
    }

    int
    numSrcRegOperands() override
    {
        if constexpr (sFMT::size() == 32) {
            return 3;
        }

        return 2;
    }

    int
    getOperandSize(int opIdx) override
    {
        if constexpr (sFMT::size() == 32) {
            switch (opIdx) {
              case 0: //src_0
                return 4;
              case 1: //src_1
                return 4;
              case 2: //src_2
                return 4;
              case 3: //vdst
                return 4;
              default:
                fatal("op idx %i out of bounds\n", opIdx);
                return -1;
            }
        } else {
            switch (opIdx) {
              case 0: //src_0
                return 4;
              case 1: //src_1
                return 4;
              case 2: //vdst
                return (dFMT::size() == 32) ? 8 : 4;
              default:
                fatal("op idx %i out of bounds\n", opIdx);
                return -1;
            }
        }

        fatal("op idx %i out of bounds\n", opIdx);
        return -1;
    }
};

static const char *MNEM__V_CVT_SCALEF32_PK_FP8_F32 =
    "v_cvt_scalef32_pk_fp8_f32";
using Inst_VOP3__V_CVT_SCALEF32_PK_FP8_F32 =
    Inst_VOP3__V_CVT_SCALE_PK<AMDGPU::mxfloat8, AMDGPU::mxfloat32,
                              &MNEM__V_CVT_SCALEF32_PK_FP8_F32>;

static const char *MNEM__V_CVT_SCALEF32_PK_BF8_F32 =
    "v_cvt_scalef32_pk_bf8_f32";
using Inst_VOP3__V_CVT_SCALEF32_PK_BF8_F32 =
    Inst_VOP3__V_CVT_SCALE_PK<AMDGPU::mxbfloat8, AMDGPU::mxfloat32,
                              &MNEM__V_CVT_SCALEF32_PK_BF8_F32>;

static const char *MNEM__V_CVT_SCALEF32_PK_F32_FP8 =
    "v_cvt_scalef32_pk_f32_fp8";
using Inst_VOP3__V_CVT_SCALEF32_PK_F32_FP8 =
    Inst_VOP3__V_CVT_SCALE_PK<AMDGPU::mxfloat32, AMDGPU::mxfloat8,
                              &MNEM__V_CVT_SCALEF32_PK_F32_FP8>;

static const char *MNEM__V_CVT_SCALEF32_PK_F32_BF8 =
    "v_cvt_scalef32_pk_f32_bf8";
using Inst_VOP3__V_CVT_SCALEF32_PK_F32_BF8 =
    Inst_VOP3__V_CVT_SCALE_PK<AMDGPU::mxfloat32, AMDGPU::mxbfloat8,
                              &MNEM__V_CVT_SCALEF32_PK_F32_BF8>;

static const char *MNEM__V_CVT_SCALEF32_PK_FP4_F32 =
    "v_cvt_scalef32_pk_fp4_f32";
using Inst_VOP3__V_CVT_SCALEF32_PK_FP4_F32 =
    Inst_VOP3__V_CVT_SCALE_PK<AMDGPU::mxfp4, AMDGPU::mxfloat32,
                              &MNEM__V_CVT_SCALEF32_PK_FP4_F32>;

static const char *MNEM__V_CVT_SCALEF32_PK_F32_FP4 =
    "v_cvt_scalef32_pk_f32_fp4";
using Inst_VOP3__V_CVT_SCALEF32_PK_F32_FP4 =
    Inst_VOP3__V_CVT_SCALE_PK<AMDGPU::mxfloat32, AMDGPU::mxfp4,
                              &MNEM__V_CVT_SCALEF32_PK_F32_FP4>;

static const char *MNEM__V_CVT_SCALEF32_PK_FP8_F16 =
    "v_cvt_scalef32_pk_fp8_f16";
using Inst_VOP3__V_CVT_SCALEF32_PK_FP8_F16 =
    Inst_VOP3__V_CVT_SCALE_PK<AMDGPU::mxfloat8, AMDGPU::mxfloat16,
                              &MNEM__V_CVT_SCALEF32_PK_FP8_F16>;

static const char *MNEM__V_CVT_SCALEF32_PK_BF8_F16 =
    "v_cvt_scalef32_pk_bf8_f16";
using Inst_VOP3__V_CVT_SCALEF32_PK_BF8_F16 =
    Inst_VOP3__V_CVT_SCALE_PK<AMDGPU::mxbfloat8, AMDGPU::mxfloat16,
                              &MNEM__V_CVT_SCALEF32_PK_BF8_F16>;

static const char *MNEM__V_CVT_SCALEF32_PK_FP8_BF16 =
    "v_cvt_scalef32_pk_fp8_bf16";
using Inst_VOP3__V_CVT_SCALEF32_PK_FP8_BF16 =
    Inst_VOP3__V_CVT_SCALE_PK<AMDGPU::mxfloat8, AMDGPU::mxbfloat16,
                              &MNEM__V_CVT_SCALEF32_PK_FP8_BF16>;

static const char *MNEM__V_CVT_SCALEF32_PK_BF8_BF16 =
    "v_cvt_scalef32_pk_bf8_bf16";
using Inst_VOP3__V_CVT_SCALEF32_PK_BF8_BF16 =
    Inst_VOP3__V_CVT_SCALE_PK<AMDGPU::mxbfloat8, AMDGPU::mxbfloat16,
                              &MNEM__V_CVT_SCALEF32_PK_BF8_BF16>;

static const char *MNEM__V_CVT_SCALEF32_PK_F16_FP8 =
    "v_cvt_scalef32_pk_f16_fp8";
using Inst_VOP3__V_CVT_SCALEF32_PK_F16_FP8 =
    Inst_VOP3__V_CVT_SCALE_PK<AMDGPU::mxfloat16, AMDGPU::mxfloat8,
                              &MNEM__V_CVT_SCALEF32_PK_F16_FP8>;

static const char *MNEM__V_CVT_SCALEF32_PK_F16_BF8 =
    "v_cvt_scalef32_pk_f16_bf8";
using Inst_VOP3__V_CVT_SCALEF32_PK_F16_BF8 =
    Inst_VOP3__V_CVT_SCALE_PK<AMDGPU::mxfloat16, AMDGPU::mxbfloat8,
                              &MNEM__V_CVT_SCALEF32_PK_F16_BF8>;

static const char *MNEM__V_CVT_SCALEF32_PK_FP4_F16 =
    "v_cvt_scalef32_pk_fp4_f16";
using Inst_VOP3__V_CVT_SCALEF32_PK_FP4_F16 =
    Inst_VOP3__V_CVT_SCALE_PK<AMDGPU::mxfp4, AMDGPU::mxfloat16,
                              &MNEM__V_CVT_SCALEF32_PK_FP4_F16>;

static const char *MNEM__V_CVT_SCALEF32_PK_FP4_BF16 =
    "v_cvt_scalef32_pk_fp4_bf16";
using Inst_VOP3__V_CVT_SCALEF32_PK_FP4_BF16 =
    Inst_VOP3__V_CVT_SCALE_PK<AMDGPU::mxfp4, AMDGPU::mxbfloat16,
                              &MNEM__V_CVT_SCALEF32_PK_FP4_BF16>;

static const char *MNEM__V_CVT_SCALEF32_PK_F16_FP4 =
    "v_cvt_scalef32_pk_f16_fp4";
using Inst_VOP3__V_CVT_SCALEF32_PK_F16_FP4 =
    Inst_VOP3__V_CVT_SCALE_PK<AMDGPU::mxfloat16, AMDGPU::mxfp4,
                              &MNEM__V_CVT_SCALEF32_PK_F16_FP4>;

static const char *MNEM__V_CVT_SCALEF32_PK_BF16_FP4 =
    "v_cvt_scalef32_pk_bf16_fp4";
using Inst_VOP3__V_CVT_SCALEF32_PK_BF16_FP4 =
    Inst_VOP3__V_CVT_SCALE_PK<AMDGPU::mxbfloat16, AMDGPU::mxfp4,
                              &MNEM__V_CVT_SCALEF32_PK_BF16_FP4>;

static const char *MNEM__V_CVT_SCALEF32_PK_BF16_FP8 =
    "v_cvt_scalef32_pk_bf16_fp8";
using Inst_VOP3__V_CVT_SCALEF32_PK_BF16_FP8 =
    Inst_VOP3__V_CVT_SCALE_PK<AMDGPU::mxbfloat16, AMDGPU::mxfloat8,
                              &MNEM__V_CVT_SCALEF32_PK_BF16_FP8>;

static const char *MNEM__V_CVT_SCALEF32_PK_BF16_BF8 =
    "v_cvt_scalef32_pk_bf16_bf8";
using Inst_VOP3__V_CVT_SCALEF32_PK_BF16_BF8 =
    Inst_VOP3__V_CVT_SCALE_PK<AMDGPU::mxbfloat16, AMDGPU::mxbfloat8,
                              &MNEM__V_CVT_SCALEF32_PK_BF16_BF8>;


/**
 * Base class for all V_CVT_SCALEF32* instructions in MI355X which are NOT
 * packed.
 */
template<typename dFMT, typename sFMT, const char **MNEM>
class Inst_VOP3__V_CVT_SCALE : public Inst_VOP3A
{
  public:
    Inst_VOP3__V_CVT_SCALE(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, *MNEM, false)
    {
        setFlag(ALU);
    }

    ~Inst_VOP3__V_CVT_SCALE() = default;

    dFMT
    omodModifier(dFMT val, unsigned omod)
    {
        // These implicitly convert to F32 first. However, F32 is always
        // larger than the largest source format, so there should be no
        // precision loss.
        assert(omod < 4);

        if (omod == 1) return val * 2.0f;
        if (omod == 2) return val * 4.0f;
        if (omod == 3) return val / 2.0f;

        return val;
    }
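
    // Example: with OMOD == 3 (divide by two), a converted value of 1.0
    // comes back from omodModifier() as 0.5; OMOD == 0 leaves it unchanged.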

    void
    execute(GPUDynInstPtr gpuDynInst) override
    {
        // Currently only 4 conversions
        static_assert(dFMT::size() == 32 || dFMT::size() == 16);
        static_assert(sFMT::size() == 8);

        Wavefront *wf = gpuDynInst->wavefront();

        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);

        // The instruction spec does not say that existing bits in the dest
        // are preserved, so we do not read it before modifying and simply
        // clobber it.
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();

        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
        panic_if(instData.ABS, "ABS not supported for %s", _opcode);
        panic_if(extData.NEG, "NEG not supported for %s", _opcode);

        // Two bits to select the byte in the dword. No output opsel bit is
        // mentioned in the spec.
        int in_opsel = instData.OPSEL & 3;

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                sFMT in(bits(src0[lane], 8 * in_opsel + 7, 8 * in_opsel));
                dFMT cvt;

                // Implicit conversion
                cvt = in;

                float scale_val = src1[lane];

                // Upcast only in this template. Apply scale after
                // converting.
                cvt.scaleMul(scale_val);

                // Not marked OPF_NOOMOD, so apply output modifiers before
                // clamp.
                cvt = omodModifier(cvt, extData.OMOD);

                if (instData.CLAMP) {
                    cvt = std::clamp(float(cvt), 0.0f, 1.0f);
                }

                // Write raw data back to the register
                vdst[lane] = cvt.data >> (32 - dFMT::size());
            }
        }

        vdst.write();
    }

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    }

    int
    numDstRegOperands() override
    {
        return 1;
    }

    int
    numSrcRegOperands() override
    {
        return 2;
    }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src_0
            return 4;
          case 1: //src_1
            return 4;
          case 2: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    }
};

static const char *MNEM__V_CVT_SCALEF32_F16_BF8 =
    "v_cvt_scalef32_f16_bf8";
using Inst_VOP3__V_CVT_SCALEF32_F16_BF8 =
    Inst_VOP3__V_CVT_SCALE<AMDGPU::mxfloat16, AMDGPU::mxbfloat8,
                           &MNEM__V_CVT_SCALEF32_F16_BF8>;

static const char *MNEM__V_CVT_SCALEF32_F16_FP8 =
    "v_cvt_scalef32_f16_fp8";
using Inst_VOP3__V_CVT_SCALEF32_F16_FP8 =
    Inst_VOP3__V_CVT_SCALE<AMDGPU::mxfloat16, AMDGPU::mxfloat8,
                           &MNEM__V_CVT_SCALEF32_F16_FP8>;

static const char *MNEM__V_CVT_SCALEF32_F32_BF8 =
    "v_cvt_scalef32_f32_bf8";
using Inst_VOP3__V_CVT_SCALEF32_F32_BF8 =
    Inst_VOP3__V_CVT_SCALE<AMDGPU::mxfloat32, AMDGPU::mxbfloat8,
                           &MNEM__V_CVT_SCALEF32_F32_BF8>;

static const char *MNEM__V_CVT_SCALEF32_F32_FP8 =
    "v_cvt_scalef32_f32_fp8";
using Inst_VOP3__V_CVT_SCALEF32_F32_FP8 =
    Inst_VOP3__V_CVT_SCALE<AMDGPU::mxfloat32, AMDGPU::mxfloat8,
                           &MNEM__V_CVT_SCALEF32_F32_FP8>;


/**
 * Base class for all V_CVT_SCALEF32_PK32* MI355X instructions (except with
 * F32 inputs).
 */
template<typename dFMT, typename sFMT, const char **MNEM>
class Inst_VOP3__V_CVT_SCALE_PK32 : public Inst_VOP3A
{
  public:
    Inst_VOP3__V_CVT_SCALE_PK32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, *MNEM, false)
    {
        setFlag(ALU);
    }

    ~Inst_VOP3__V_CVT_SCALE_PK32() = default;

    void
    execute(GPUDynInstPtr gpuDynInst) override
    {
        static_assert(dFMT::size() == 32 ||
                      dFMT::size() == 16 || dFMT::size() == 6);
        static_assert(sFMT::size() == 16 || sFMT::size() == 6);

        // There are 32 values packed into a huge operand. These are called
        // components in the spec.
        constexpr const int components = sFMT::size() == 32 ? 16 : 32;
        size_t input_regs = getOperandSize(0) / 4;
        size_t output_regs = getOperandSize(2) / 4;

        Wavefront *wf = gpuDynInst->wavefront();

        // The gem5 operand types are really only handy up to 64 bits. For
        // larger operand sizes such as in these instructions, just create
        // an array of 32-bit registers to use.
        std::vector<ConstVecOperandU32> src0;
        src0.reserve(input_regs);
        for (int reg = 0; reg < input_regs; ++reg) {
            src0.emplace_back(gpuDynInst, extData.SRC0 + reg);
            src0[reg].readSrc();
        }

        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
        src1.readSrc();

        std::vector<typename std::aligned_storage<sizeof(VecOperandU32),
                alignof(VecOperandU32)>::type>
            _vdst(output_regs);
        VecOperandU32* vdst =
            std::launder(reinterpret_cast<VecOperandU32*>(_vdst.data()));
        for (int reg = 0; reg < output_regs; ++reg) {
            new (&vdst[reg]) VecOperandU32(gpuDynInst, instData.VDST + reg);
        }
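
        // The raw-storage + placement-new pattern above is used because the
        // vector operand types are constructed from (gpuDynInst, register
        // index) and are presumably not default-constructible, so
        // std::vector cannot size-construct them directly; std::launder
        // makes accesses through the reinterpret_cast'ed pointer
        // well-defined.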

        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
        panic_if(instData.CLAMP, "CLAMP not supported for %s", _opcode);
        panic_if(extData.OMOD, "OMOD not supported for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                PackedReg<sFMT::size() * components, sFMT::size()> in_reg;
                PackedReg<dFMT::size() * components, dFMT::size()> out_reg;

                for (int reg = 0; reg < input_regs; ++reg) {
                    in_reg.setDword(reg, src0[reg][lane]);
                }

                for (int pass = 0; pass < components; ++pass) {
                    sFMT in;
                    dFMT out;

                    // When downcasting, scale before conversion; otherwise
                    // scale after conversion. Read the scale value first in
                    // either case.
                    float scale_val = src1[lane];
                    if (instData.ABS & 2) {
                        scale_val = std::fabs(scale_val);
                    }
                    if (extData.NEG & 2) {
                        scale_val = -scale_val;
                    }

                    // Note: Due to the union of a signed int and bitfield
                    // struct, the data is [31:(32 - sFMT::size())], so we
                    // must align this otherwise the conversions will result
                    // in a zero value.
                    in.data = in_reg.getElem(pass) << (32 - sFMT::size());
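
                    // e.g. a 6-bit element with raw value 0b101011 is
                    // stored as 0b101011 << 26, i.e. in data bits [31:26],
                    // with the low 26 bits zero.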

                    // Apply ABS, NEG
                    if (instData.ABS & 1 && float(in) < 0.0f) {
                        in = -in;
                    }
                    if (extData.NEG & 1) {
                        in = -in;
                    }

                    // Downcasting. Apply scale before converting.
                    if constexpr (dFMT::size() < sFMT::size()) {
                        in.scaleDiv(scale_val);
                    }

                    out = in; // Implicit conversion happens here.

                    // Upcasting. Apply scale after converting.
                    if constexpr (dFMT::size() >= sFMT::size()) {
                        out.scaleMul(scale_val);
                    }

                    // Apply ABS, NEG
                    if (instData.ABS & 8 && float(out) < 0.0f) {
                        out = -out;
                    }
                    if (extData.NEG & 8) {
                        out = -out;
                    }

                    out_reg.setElem(pass, out.data >> (32 - dFMT::size()));
                }

                for (int reg = 0; reg < output_regs; ++reg) {
                    vdst[reg][lane] = out_reg.getDword(reg);
                }
            }
        }

        for (int reg = 0; reg < output_regs; ++reg) {
            vdst[reg].write();
        }
    }

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    }

    int
    numDstRegOperands() override
    {
        return 1;
    }

    int
    numSrcRegOperands() override
    {
        return 2;
    }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src_0
            if constexpr (sFMT::size() == 32) {
                return sFMT::size() * 2;
            } else {
                return sFMT::size() * 4;
            }
          case 1: //src_1
            return 4;
          case 2: //vdst
            return dFMT::size() * 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    }
};

static const char *MNEM__V_CVT_SCALEF32_PK32_BF16_BF6 =
    "v_cvt_scalef32_pk32_bf16_bf6";
using Inst_VOP3__V_CVT_SCALEF32_PK32_BF16_BF6 =
    Inst_VOP3__V_CVT_SCALE_PK32<AMDGPU::mxbfloat16, AMDGPU::mxbf6,
                                &MNEM__V_CVT_SCALEF32_PK32_BF16_BF6>;

static const char *MNEM__V_CVT_SCALEF32_PK32_BF16_FP6 =
    "v_cvt_scalef32_pk32_bf16_fp6";
using Inst_VOP3__V_CVT_SCALEF32_PK32_BF16_FP6 =
    Inst_VOP3__V_CVT_SCALE_PK32<AMDGPU::mxbfloat16, AMDGPU::mxfp6,
                                &MNEM__V_CVT_SCALEF32_PK32_BF16_FP6>;

static const char *MNEM__V_CVT_SCALEF32_PK32_BF6_BF16 =
    "v_cvt_scalef32_pk32_bf6_bf16";
using Inst_VOP3__V_CVT_SCALEF32_PK32_BF6_BF16 =
    Inst_VOP3__V_CVT_SCALE_PK32<AMDGPU::mxbf6, AMDGPU::mxbfloat16,
                                &MNEM__V_CVT_SCALEF32_PK32_BF6_BF16>;

static const char *MNEM__V_CVT_SCALEF32_PK32_BF6_F16 =
    "v_cvt_scalef32_pk32_bf6_f16";
using Inst_VOP3__V_CVT_SCALEF32_PK32_BF6_F16 =
    Inst_VOP3__V_CVT_SCALE_PK32<AMDGPU::mxbf6, AMDGPU::mxfloat16,
                                &MNEM__V_CVT_SCALEF32_PK32_BF6_F16>;

static const char *MNEM__V_CVT_SCALEF32_PK32_F16_BF6 =
    "v_cvt_scalef32_pk32_f16_bf6";
using Inst_VOP3__V_CVT_SCALEF32_PK32_F16_BF6 =
    Inst_VOP3__V_CVT_SCALE_PK32<AMDGPU::mxfloat16, AMDGPU::mxbf6,
                                &MNEM__V_CVT_SCALEF32_PK32_F16_BF6>;

static const char *MNEM__V_CVT_SCALEF32_PK32_F16_FP6 =
    "v_cvt_scalef32_pk32_f16_fp6";
using Inst_VOP3__V_CVT_SCALEF32_PK32_F16_FP6 =
    Inst_VOP3__V_CVT_SCALE_PK32<AMDGPU::mxfloat16, AMDGPU::mxfp6,
                                &MNEM__V_CVT_SCALEF32_PK32_F16_FP6>;

static const char *MNEM__V_CVT_SCALEF32_PK32_F32_BF6 =
    "v_cvt_scalef32_pk32_f32_bf6";
using Inst_VOP3__V_CVT_SCALEF32_PK32_F32_BF6 =
    Inst_VOP3__V_CVT_SCALE_PK32<AMDGPU::mxfloat32, AMDGPU::mxbf6,
                                &MNEM__V_CVT_SCALEF32_PK32_F32_BF6>;

static const char *MNEM__V_CVT_SCALEF32_PK32_F32_FP6 =
    "v_cvt_scalef32_pk32_f32_fp6";
using Inst_VOP3__V_CVT_SCALEF32_PK32_F32_FP6 =
    Inst_VOP3__V_CVT_SCALE_PK32<AMDGPU::mxfloat32, AMDGPU::mxfp6,
                                &MNEM__V_CVT_SCALEF32_PK32_F32_FP6>;

static const char *MNEM__V_CVT_SCALEF32_PK32_FP6_BF16 =
    "v_cvt_scalef32_pk32_fp6_bf16";
using Inst_VOP3__V_CVT_SCALEF32_PK32_FP6_BF16 =
    Inst_VOP3__V_CVT_SCALE_PK32<AMDGPU::mxfp6, AMDGPU::mxbfloat16,
                                &MNEM__V_CVT_SCALEF32_PK32_FP6_BF16>;

static const char *MNEM__V_CVT_SCALEF32_PK32_FP6_F16 =
    "v_cvt_scalef32_pk32_fp6_f16";
using Inst_VOP3__V_CVT_SCALEF32_PK32_FP6_F16 =
    Inst_VOP3__V_CVT_SCALE_PK32<AMDGPU::mxfp6, AMDGPU::mxfloat16,
                                &MNEM__V_CVT_SCALEF32_PK32_FP6_F16>;


/**
 * Base class for all V_CVT_SCALEF32_PK32* MI355X instructions which have
 * F32 inputs.
 */
template<typename dFMT, typename sFMT, const char **MNEM>
class Inst_VOP3__V_CVT_SCALEF32_2XPK16_F32 : public Inst_VOP3A
{
  public:
    Inst_VOP3__V_CVT_SCALEF32_2XPK16_F32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, *MNEM, false)
    {
        setFlag(ALU);
    }

    ~Inst_VOP3__V_CVT_SCALEF32_2XPK16_F32() = default;

    void
    execute(GPUDynInstPtr gpuDynInst) override
    {
        static_assert(dFMT::size() == 6);
        static_assert(sFMT::size() == 32);

        // There are 32 values over two source operands which each hold 16
        // values. These are called components in the spec.
        constexpr const int components = 32;
        size_t input_regs = getOperandSize(0) / 4;
        size_t output_regs = getOperandSize(3) / 4;

        Wavefront *wf = gpuDynInst->wavefront();

        // The gem5 operand types are really only handy up to 64 bits. For
        // larger operand sizes such as in these instructions, just create
        // an array of 32-bit registers to use.
        std::vector<typename std::aligned_storage<
                sizeof(ConstVecOperandU32), alignof(ConstVecOperandU32)>::type>
            _src0(input_regs);
        ConstVecOperandU32* src0 =
            std::launder(reinterpret_cast<ConstVecOperandU32*>(_src0.data()));
        for (int reg = 0; reg < input_regs; ++reg) {
            new (&src0[reg]) ConstVecOperandU32(gpuDynInst,
                                                extData.SRC0 + reg);
            src0[reg].readSrc();
        }

        std::vector<typename std::aligned_storage<
                sizeof(ConstVecOperandU32), alignof(ConstVecOperandU32)>::type>
            _src1(input_regs);
        ConstVecOperandU32* src1 =
            std::launder(reinterpret_cast<ConstVecOperandU32*>(_src1.data()));
        for (int reg = 0; reg < input_regs; ++reg) {
            new (&src1[reg]) ConstVecOperandU32(gpuDynInst,
                                                extData.SRC1 + reg);
            src1[reg].readSrc();
        }

        ConstVecOperandF32 src2(gpuDynInst, extData.SRC2);
        src2.readSrc();

        std::vector<typename std::aligned_storage<sizeof(VecOperandU32),
                alignof(VecOperandU32)>::type>
            _vdst(output_regs);
        VecOperandU32* vdst =
            std::launder(reinterpret_cast<VecOperandU32*>(_vdst.data()));
        for (int reg = 0; reg < output_regs; ++reg) {
            new (&vdst[reg]) VecOperandU32(gpuDynInst, instData.VDST + reg);
        }

        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
        panic_if(instData.CLAMP, "CLAMP not supported for %s", _opcode);
        panic_if(extData.OMOD, "OMOD not supported for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                PackedReg<sFMT::size() * components, sFMT::size()> in_reg;
                PackedReg<dFMT::size() * components, dFMT::size()> out_reg;

                for (int reg = 0; reg < input_regs; ++reg) {
                    in_reg.setDword(reg * 2, src0[reg][lane]);
                }

                for (int reg = 0; reg < input_regs; ++reg) {
                    in_reg.setDword(reg * 2 + 1, src1[reg][lane]);
                }
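
                // The source dwords are interleaved (src0[0], src1[0],
                // src0[1], src1[1], ...), so even components come from src0
                // and odd components from src1 as getElem() walks the
                // packed register in order.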

                for (int pass = 0; pass < components; ++pass) {
                    sFMT in;
                    dFMT out;

                    // Note: Due to the union of a signed int and bitfield
                    // struct, the data is [31:(32 - sFMT::size())], so we
                    // must align this otherwise the conversions will result
                    // in a zero value.
                    in.data = in_reg.getElem(pass) << (32 - sFMT::size());

                    // Apply ABS, NEG
                    if (instData.ABS & 1 && float(in) < 0.0f) {
                        in = -in;
                    }
                    if (extData.NEG & 1) {
                        in = -in;
                    }

                    // Only downcasts in this template. Scale before
                    // converting.
                    float scale_val = src2[lane];
                    if (instData.ABS & 2) {
                        scale_val = std::fabs(scale_val);
                    }
                    if (extData.NEG & 2) {
                        scale_val = -scale_val;
                    }

                    in.scaleDiv(scale_val);

                    out = in; // Implicit conversion happens here.

                    // Apply ABS, NEG
                    if (instData.ABS & 8 && float(out) < 0.0f) {
                        out = -out;
                    }
                    if (extData.NEG & 8) {
                        out = -out;
                    }

                    out_reg.setElem(pass, out.data >> (32 - dFMT::size()));
                }

                for (int reg = 0; reg < output_regs; ++reg) {
                    vdst[reg][lane] = out_reg.getDword(reg);
                }
            }
        }

        for (int reg = 0; reg < output_regs; ++reg) {
            vdst[reg].write();
        }
    }

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    }

    int
    numDstRegOperands() override
    {
        return 1;
    }

    int
    numSrcRegOperands() override
    {
        return 3;
    }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src_0
            return sFMT::size() * 2;
          case 1: //src_1
            return sFMT::size() * 2;
          case 2: //src_2
            return 4;
          case 3: //vdst
            return dFMT::size() * 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    }
};

static const char *MNEM__V_CVT_SCALEF32_PK32_BF6_F32 =
    "v_cvt_scalef32_2xpk16_bf6_f32";
using Inst_VOP3__V_CVT_SCALEF32_2XPK16_BF6_F32 =
    Inst_VOP3__V_CVT_SCALEF32_2XPK16_F32<AMDGPU::mxbf6, AMDGPU::mxfloat32,
                                         &MNEM__V_CVT_SCALEF32_PK32_BF6_F32>;

static const char *MNEM__V_CVT_SCALEF32_PK32_FP6_F32 =
    "v_cvt_scalef32_2xpk16_fp6_f32";
using Inst_VOP3__V_CVT_SCALEF32_2XPK16_FP6_F32 =
    Inst_VOP3__V_CVT_SCALEF32_2XPK16_F32<AMDGPU::mxfp6, AMDGPU::mxfloat32,
                                         &MNEM__V_CVT_SCALEF32_PK32_FP6_F32>;


/**
 * Base class for all V_CVT_SCALEF32_SR_* instructions in MI355X which are
 * NOT packed.
 */
template<typename dFMT, typename sFMT, const char **MNEM>
class Inst_VOP3__V_CVT_SCALEF32_SR : public Inst_VOP3A
{
  public:
    Inst_VOP3__V_CVT_SCALEF32_SR(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, *MNEM, false)
    {
        setFlag(ALU);
    }

    ~Inst_VOP3__V_CVT_SCALEF32_SR() = default;

    void
    execute(GPUDynInstPtr gpuDynInst) override
    {
        // Currently only 6 conversions
        static_assert(dFMT::size() == 8);
        static_assert(sFMT::size() == 32 || sFMT::size() == 16);

        Wavefront *wf = gpuDynInst->wavefront();

        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); // input
        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); // seed
        ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); // scale
        VecOperandU64 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.readSrc();
        src2.readSrc();
        vdst.read();

        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
        panic_if(extData.OMOD, "OMOD not supported for %s", _opcode);
        panic_if(instData.CLAMP, "CLAMP not supported for %s", _opcode);

        // Two bits [3:2] select the byte in the output dword. No input
        // opsel bit is mentioned in the spec.
        int out_byte = bits(instData.OPSEL, 3, 2);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                sFMT in(src0[lane]);
                dFMT cvt;

                if (instData.ABS & 1 && float(in) < 0.0f) {
                    in = -in;
                }
                if (extData.NEG & 1) {
                    in = -in;
                }

                VecElemI32 seed_val = src1[lane];

                if (instData.ABS & 2) {
                    seed_val = std::abs(seed_val);
                }
                if (extData.NEG & 2) {
                    seed_val = -seed_val;
                }

                float scale_val = src2[lane];
                if (instData.ABS & 4) {
                    scale_val = std::fabs(scale_val);
                }
                if (extData.NEG & 4) {
                    scale_val = -scale_val;
                }

                // Only downcasts in this template. Apply scale before
                // converting.
                in.scaleDiv(scale_val);

                using sInfo = decltype(in.getFmt());
                using dInfo = decltype(cvt.getFmt());
                dInfo cvt_info = AMDGPU::convertMXFP<dInfo, sInfo>(
                    in.getFmt(), AMDGPU::roundStochastic, seed_val
                );
                cvt.setFmt(cvt_info);
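
                // The seed operand is presumably what drives the stochastic
                // round-up/round-down decision inside convertMXFP, in place
                // of the default round-to-nearest-even mode.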

                if (instData.ABS & 8 && float(cvt) < 0.0f) {
                    cvt = -cvt;
                }
                if (extData.NEG & 8) {
                    cvt = -cvt;
                }

                // Write raw data back to the register
                vdst[lane] = insertBits(vdst[lane], out_byte * 8 + 7,
                                        out_byte * 8,
                                        bits(cvt.data, 31,
                                             32 - dFMT::size()));
            }
        }

        vdst.write();
    }

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    }

    int
    numDstRegOperands() override
    {
        return 1;
    }

    int
    numSrcRegOperands() override
    {
        return 3;
    }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src_0
            return 4;
          case 1: //src_1
            return 4;
          case 2: //src_2
            return 4;
          case 3: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    }
};

static const char *MNEM__V_CVT_SCALEF32_SR_BF8_F16 =
    "v_cvt_scale_sr_bf8_f16";
using Inst_VOP3__V_CVT_SCALEF32_SR_BF8_F16 =
    Inst_VOP3__V_CVT_SCALEF32_SR<AMDGPU::mxbfloat8, AMDGPU::mxfloat16,
                                 &MNEM__V_CVT_SCALEF32_SR_BF8_F16>;

static const char *MNEM__V_CVT_SCALEF32_SR_BF8_F32 =
    "v_cvt_scale_sr_bf8_f32";
using Inst_VOP3__V_CVT_SCALEF32_SR_BF8_F32 =
    Inst_VOP3__V_CVT_SCALEF32_SR<AMDGPU::mxbfloat8, AMDGPU::mxfloat32,
                                 &MNEM__V_CVT_SCALEF32_SR_BF8_F32>;

static const char *MNEM__V_CVT_SCALEF32_SR_FP8_BF16 =
    "v_cvt_scale_sr_fp8_bf16";
using Inst_VOP3__V_CVT_SCALEF32_SR_FP8_BF16 =
    Inst_VOP3__V_CVT_SCALEF32_SR<AMDGPU::mxfloat8, AMDGPU::mxbfloat16,
                                 &MNEM__V_CVT_SCALEF32_SR_FP8_BF16>;

static const char *MNEM__V_CVT_SCALEF32_SR_BF8_BF16 =
    "v_cvt_scale_sr_bf8_bf16";
using Inst_VOP3__V_CVT_SCALEF32_SR_BF8_BF16 =
    Inst_VOP3__V_CVT_SCALEF32_SR<AMDGPU::mxbfloat8, AMDGPU::mxbfloat16,
                                 &MNEM__V_CVT_SCALEF32_SR_BF8_BF16>;

static const char *MNEM__V_CVT_SCALEF32_SR_FP8_F16 =
    "v_cvt_scale_sr_fp8_f16";
using Inst_VOP3__V_CVT_SCALEF32_SR_FP8_F16 =
    Inst_VOP3__V_CVT_SCALEF32_SR<AMDGPU::mxfloat8, AMDGPU::mxfloat16,
                                 &MNEM__V_CVT_SCALEF32_SR_FP8_F16>;

static const char *MNEM__V_CVT_SCALEF32_SR_FP8_F32 =
    "v_cvt_scale_sr_fp8_f32";
using Inst_VOP3__V_CVT_SCALEF32_SR_FP8_F32 =
    Inst_VOP3__V_CVT_SCALEF32_SR<AMDGPU::mxfloat8, AMDGPU::mxfloat32,
                                 &MNEM__V_CVT_SCALEF32_SR_FP8_F32>;


/**
 * Base class for all V_CVT_SCALEF32_SR_PK32* MI355X instructions.
 */
template<typename dFMT, typename sFMT, const char **MNEM>
class Inst_VOP3__V_CVT_SCALE_SR_PK32 : public Inst_VOP3A
{
  public:
    Inst_VOP3__V_CVT_SCALE_SR_PK32(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, *MNEM, false)
    {
        setFlag(ALU);
    }

    ~Inst_VOP3__V_CVT_SCALE_SR_PK32() = default;

    void
    execute(GPUDynInstPtr gpuDynInst) override
    {
        static_assert(dFMT::size() == 6);
        static_assert(sFMT::size() == 32 || sFMT::size() == 16);

        // There are 32 values in all cases.
        constexpr const int components = 32;
        size_t input_regs = getOperandSize(0) / 4;
        size_t output_regs = getOperandSize(3) / 4;

        Wavefront *wf = gpuDynInst->wavefront();

        // The gem5 operand types are really only handy up to 64 bits. For
        // larger operand sizes such as in these instructions, just create
        // an array of 32-bit registers to use.
        std::vector<typename std::aligned_storage<
                sizeof(ConstVecOperandU32), alignof(ConstVecOperandU32)>::type>
            _src0(input_regs);
        ConstVecOperandU32* src0 =
            std::launder(reinterpret_cast<ConstVecOperandU32*>(_src0.data()));
        for (int reg = 0; reg < input_regs; ++reg) {
            new (&src0[reg]) ConstVecOperandU32(gpuDynInst,
                                                extData.SRC0 + reg);
            src0[reg].readSrc();
        }

        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); // seed
        ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); // scale
        src1.readSrc();
        src2.readSrc();

        std::vector<typename std::aligned_storage<sizeof(VecOperandU32),
                alignof(VecOperandU32)>::type>
            _vdst(output_regs);
        VecOperandU32* vdst =
            std::launder(reinterpret_cast<VecOperandU32*>(_vdst.data()));
        for (int reg = 0; reg < output_regs; ++reg) {
            new (&vdst[reg]) VecOperandU32(gpuDynInst, instData.VDST + reg);
        }

        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
        panic_if(instData.CLAMP, "CLAMP not supported for %s", _opcode);
        panic_if(extData.OMOD, "OMOD not supported for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                PackedReg<sFMT::size() * components, sFMT::size()> in_reg;
                PackedReg<dFMT::size() * components, dFMT::size()> out_reg;

                for (int reg = 0; reg < input_regs; ++reg) {
                    in_reg.setDword(reg, src0[reg][lane]);
                }

                for (int pass = 0; pass < components; ++pass) {
                    sFMT in;
                    dFMT out;

                    // Note: Due to the union of a signed int and bitfield
                    // struct, the data is [31:(32 - sFMT::size())], so we
                    // must align this otherwise the conversions will result
                    // in a zero value.
                    in.data = in_reg.getElem(pass) << (32 - sFMT::size());

                    // Apply ABS and NEG to the input
                    if (instData.ABS & 1 && float(in) < 0.0f) {
                        in = -in;
                    }
                    if (extData.NEG & 1) {
                        in = -in;
                    }

                    VecElemI32 seed_val = src1[lane];

                    if (instData.ABS & 2) {
                        seed_val = std::abs(seed_val);
                    }
                    if (extData.NEG & 2) {
                        seed_val = -seed_val;
                    }

                    float scale_val = src2[lane];
                    if (instData.ABS & 4) {
                        scale_val = std::fabs(scale_val);
                    }
                    if (extData.NEG & 4) {
                        scale_val = -scale_val;
                    }

                    // Only downcasts in this template. Scale before
                    // converting.
                    in.scaleDiv(scale_val);

                    using sInfo = decltype(in.getFmt());
                    using dInfo = decltype(out.getFmt());
                    dInfo cvt_info = AMDGPU::convertMXFP<dInfo, sInfo>(
                        in.getFmt(), AMDGPU::roundStochastic, seed_val
                    );
                    out.setFmt(cvt_info);

                    if (instData.ABS & 8 && float(out) < 0.0f) {
                        out = -out;
                    }
                    if (extData.NEG & 8) {
                        out = -out;
                    }

                    out_reg.setElem(pass, out.data >> (32 - dFMT::size()));
                }

                for (int reg = 0; reg < output_regs; ++reg) {
                    vdst[reg][lane] = out_reg.getDword(reg);
                }
            }
        }

        for (int reg = 0; reg < output_regs; ++reg) {
            vdst[reg].write();
        }
    }

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    }

    int
    numDstRegOperands() override
    {
        return 1;
    }

    int
    numSrcRegOperands() override
    {
        return 3;
    }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src_0
            return sFMT::size() * 4;
          case 1: //src_1
            return 4;
          case 2: //src_2
            return 4;
          case 3: //vdst
            // Always 6 dwords
            return 6 * 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    }
};

static const char *MNEM__V_CVT_SCALE_SR_PK_BF6_BF16 =
    "v_cvt_scale_sr_pk_bf6_bf16";
using Inst_VOP3__V_CVT_SCALE_SR_PK_BF6_BF16 =
    Inst_VOP3__V_CVT_SCALE_SR_PK32<AMDGPU::mxbf6, AMDGPU::mxbfloat16,
                                   &MNEM__V_CVT_SCALE_SR_PK_BF6_BF16>;

static const char *MNEM__V_CVT_SCALE_SR_PK_BF6_F16 =
    "v_cvt_scale_sr_pk_bf6_f16";
using Inst_VOP3__V_CVT_SCALE_SR_PK_BF6_F16 =
    Inst_VOP3__V_CVT_SCALE_SR_PK32<AMDGPU::mxbf6, AMDGPU::mxfloat16,
                                   &MNEM__V_CVT_SCALE_SR_PK_BF6_F16>;

static const char *MNEM__V_CVT_SCALE_SR_PK_BF6_F32 =
    "v_cvt_scale_sr_pk_bf6_f32";
using Inst_VOP3__V_CVT_SCALE_SR_PK_BF6_F32 =
    Inst_VOP3__V_CVT_SCALE_SR_PK32<AMDGPU::mxbf6, AMDGPU::mxfloat32,
                                   &MNEM__V_CVT_SCALE_SR_PK_BF6_F32>;

static const char *MNEM__V_CVT_SCALE_SR_PK_FP6_BF16 =
    "v_cvt_scale_sr_pk_fp6_bf16";
using Inst_VOP3__V_CVT_SCALE_SR_PK_FP6_BF16 =
    Inst_VOP3__V_CVT_SCALE_SR_PK32<AMDGPU::mxfp6, AMDGPU::mxbfloat16,
                                   &MNEM__V_CVT_SCALE_SR_PK_FP6_BF16>;

static const char *MNEM__V_CVT_SCALE_SR_PK_FP6_F16 =
    "v_cvt_scale_sr_pk_fp6_f16";
using Inst_VOP3__V_CVT_SCALE_SR_PK_FP6_F16 =
    Inst_VOP3__V_CVT_SCALE_SR_PK32<AMDGPU::mxfp6, AMDGPU::mxfloat16,
                                   &MNEM__V_CVT_SCALE_SR_PK_FP6_F16>;

static const char *MNEM__V_CVT_SCALE_SR_PK_FP6_F32 =
    "v_cvt_scale_sr_pk_fp6_f32";
using Inst_VOP3__V_CVT_SCALE_SR_PK_FP6_F32 =
    Inst_VOP3__V_CVT_SCALE_SR_PK32<AMDGPU::mxfp6, AMDGPU::mxfloat32,
                                   &MNEM__V_CVT_SCALE_SR_PK_FP6_F32>;


/**
 * Base class for all V_CVT_SCALEF32_SR_PK_FP4* MI355X instructions.
 */
template<typename dFMT, typename sFMT, const char **MNEM>
class Inst_VOP3__V_CVT_SCALE_SR_PK_FP4 : public Inst_VOP3A
{
  public:
    Inst_VOP3__V_CVT_SCALE_SR_PK_FP4(InFmt_VOP3A *iFmt)
        : Inst_VOP3A(iFmt, *MNEM, false)
    {
        setFlag(ALU);
    }

    ~Inst_VOP3__V_CVT_SCALE_SR_PK_FP4() = default;

    void
    execute(GPUDynInstPtr gpuDynInst) override
    {
        static_assert(dFMT::size() == 4);
        static_assert(sFMT::size() == 32 || sFMT::size() == 16);

        Wavefront *wf = gpuDynInst->wavefront();

        // There are either one or two dwords read depending on input type.
        // To simplify things, just declare two here and don't read the
        // second dword if it is not used.
        ConstVecOperandU32 src0[2] = { // input
            ConstVecOperandU32(gpuDynInst, extData.SRC0 + 0),
            ConstVecOperandU32(gpuDynInst, extData.SRC0 + 1)
        };
        ConstVecOperandI32 src1(gpuDynInst, extData.SRC1); // seed
        ConstVecOperandF32 src2(gpuDynInst, extData.SRC2); // scale
        VecOperandU32 vdst(gpuDynInst, instData.VDST); // output

        src0[0].readSrc();
        if constexpr (sFMT::size() == 32) {
            src0[1].readSrc();
        }
        src1.readSrc();
        src2.readSrc();

        // We want to replace the bits at the OPSEL location and not clobber
        // the rest of the register, therefore we need to read, modify, and
        // write.
        vdst.read();

        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
        panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
        panic_if(instData.CLAMP, "CLAMP not supported for %s", _opcode);
        panic_if(extData.OMOD, "OMOD not supported for %s", _opcode);

        // Output byte. Input is always either 2x 16-bit values or 2x 32-bit
        // values. Therefore there is no input opsel.
        int out_opsel = bits(instData.OPSEL, 3, 2);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                sFMT in[2];
                dFMT out[2]; // Always FP4 but may as well keep it templated.

                if constexpr (sFMT::size() == 32) {
                    in[0].data = src0[0][lane];
                    in[1].data = src0[1][lane];
                } else {
                    assert(sFMT::size() == 16);
                    in[0].data = bits(src0[0][lane], 15, 0) << 16;
                    in[1].data = bits(src0[0][lane], 31, 16) << 16;
                }

                // Apply ABS, NEG, and scale - Assume these apply to both
                // packed values.
                if (instData.ABS & 1) {
                    if (float(in[0]) < 0.0f) {
                        in[0] = -in[0];
                    }
                    if (float(in[1]) < 0.0f) {
                        in[1] = -in[1];
                    }
                }
                if (extData.NEG & 1) {
                    in[0] = -in[0];
                    in[1] = -in[1];
                }

                VecElemI32 seed_val = src1[lane];

                if (instData.ABS & 2) {
                    seed_val = std::abs(seed_val);
                }
                if (extData.NEG & 2) {
                    seed_val = -seed_val;
                }

                // Only downcasts in this template. Apply scale before
                // converting.
                float scale_val = src2[lane];
                if (instData.ABS & 4) {
                    scale_val = std::fabs(scale_val);
                }
                if (extData.NEG & 4) {
                    scale_val = -scale_val;
                }

                in[0].scaleDiv(scale_val);
                in[1].scaleDiv(scale_val);

                using sInfo = decltype(in[0].getFmt());
                using dInfo = decltype(out[0].getFmt());
                dInfo cvt_info = AMDGPU::convertMXFP<dInfo, sInfo>(
                    in[0].getFmt(), AMDGPU::roundStochastic, seed_val
                );
                out[0].setFmt(cvt_info);
                cvt_info = AMDGPU::convertMXFP<dInfo, sInfo>(
                    in[1].getFmt(), AMDGPU::roundStochastic, seed_val
                );
                out[1].setFmt(cvt_info);

                if (instData.ABS & 8) {
                    if (float(out[0]) < 0.0f) {
                        out[0] = -out[0];
                    }
                    if (float(out[1]) < 0.0f) {
                        out[1] = -out[1];
                    }
                }

                // The bits of the mxfp type are aligned to the left of the
                // dword, so bits [31:28] are the relevant bits.
                uint8_t packed_output = (bits(out[1].data, 31, 28) << 4)
                                      | bits(out[0].data, 31, 28);
                vdst[lane] = insertBits(vdst[lane], 8 * out_opsel + 7,
                                        8 * out_opsel, packed_output);
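
                // e.g. with out_opsel == 2, the packed byte (out[1]'s
                // nibble in bits [7:4], out[0]'s in bits [3:0]) replaces
                // bits [23:16] of the destination dword.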
            }
        }

        vdst.write();
    }

    int
    getNumOperands() override
    {
        return numDstRegOperands() + numSrcRegOperands();
    }

    int
    numDstRegOperands() override
    {
        return 1;
    }

    int
    numSrcRegOperands() override
    {
        return 3;
    }

    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 0: //src_0
            return sFMT::size() / 4;
          case 1: //src_1
            return 4;
          case 2: //src_2
            return 4;
          case 3: //vdst
            return 4;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    }
};

static const char *MNEM__V_CVT_SCALE_SR_PK_FP4_BF16 =
    "v_cvt_scale_sr_pk_fp4_bf16";
using Inst_VOP3__V_CVT_SCALE_SR_PK_FP4_BF16 =
    Inst_VOP3__V_CVT_SCALE_SR_PK_FP4<AMDGPU::mxfp4, AMDGPU::mxbfloat16,
                                     &MNEM__V_CVT_SCALE_SR_PK_FP4_BF16>;

static const char *MNEM__V_CVT_SCALE_SR_PK_FP4_F16 =
    "v_cvt_scale_sr_pk_fp4_f16";
using Inst_VOP3__V_CVT_SCALE_SR_PK_FP4_F16 =
    Inst_VOP3__V_CVT_SCALE_SR_PK_FP4<AMDGPU::mxfp4, AMDGPU::mxfloat16,
                                     &MNEM__V_CVT_SCALE_SR_PK_FP4_F16>;

static const char *MNEM__V_CVT_SCALE_SR_PK_FP4_F32 =
    "v_cvt_scale_sr_pk_fp4_f32";
using Inst_VOP3__V_CVT_SCALE_SR_PK_FP4_F32 =
    Inst_VOP3__V_CVT_SCALE_SR_PK_FP4<AMDGPU::mxfp4, AMDGPU::mxfloat32,
                                     &MNEM__V_CVT_SCALE_SR_PK_FP4_F32>;

} // namespace VegaISA
} // namespace gem5

#endif // __ARCH_AMDGPU_VEGA_INSTS_VOP3_CVT_HH__