#ifndef __ARCH_AMDGPU_VEGA_INSTS_VOP3_CVT_HH__
#define __ARCH_AMDGPU_VEGA_INSTS_VOP3_CVT_HH__
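// Templated implementations of the MI355X V_CVT_SCALEF32* scaled
// conversion instructions. Each class template below is parameterized on
// the destination and source microscaling (MX) formats and on a pointer
// to the instruction mnemonic, so a single definition covers every
// format-pair variant instantiated at the end of the file.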
template<typename dFMT, typename sFMT, const char **MNEM>
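// dFMT/sFMT are AMDGPU::mxfp<> wrapper types (e.g., AMDGPU::mxfloat32,
// AMDGPU::mxfp4) whose size() returns the element width in bits; MNEM
// points at the mnemonic string used for disassembly.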
static_assert(dFMT::size() == 32 || dFMT::size() == 16 ||
              dFMT::size() == 8 || dFMT::size() == 4);
static_assert(sFMT::size() == 32 || sFMT::size() == 16 ||
              sFMT::size() == 8 || sFMT::size() == 4);
if constexpr (sFMT::size() == 32) {
if constexpr (dFMT::size() == 32) {
[[maybe_unused]] int in_opsel = 0;
if constexpr (sFMT::size() == 8) {
} else if (sFMT::size() == 4) {
if constexpr (dFMT::size() == 8) {
    out_opsel = (instData.OPSEL >> 3) & 1;
} else if (dFMT::size() == 4) {
    out_opsel = (instData.OPSEL >> 2) & 3;
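// For an 8-bit destination the packed pair fills 16 bits, so OPSEL
// bit 3 selects which half-dword of the destination is written; for a
// 4-bit destination the pair fills 8 bits, so OPSEL[3:2] selects one of
// the four byte positions. E.g., OPSEL = 0b1100 with a 4-bit
// destination yields out_opsel = 3, i.e. byte 3 of the dword.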
float scale_val = 1.0f;
if constexpr (sFMT::size() == 32) {
    scale_val = src2[lane];
} else {
    scale_val = *reinterpret_cast<float*>(&tmp);
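// src2 carries the f32 scale operand. For sub-dword sources the scale
// presumably arrives as raw register bits (tmp) and is reinterpreted as
// a float before use, rather than being value-converted.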
if constexpr (sFMT::size() == 32) {
    sFMT tmp1(src0[lane]);
    sFMT tmp2(src1[lane]);
    static_assert(dFMT::size() < sFMT::size());
    tmp1.scaleDiv(scale_val);
    tmp2.scaleDiv(scale_val);
if (dFMT::size() < sFMT::size()) {
    in.first.scaleDiv(scale_val);
    in.second.scaleDiv(scale_val);
if (dFMT::size() >= sFMT::size()) {
    cvt1.scaleMul(scale_val);
    cvt2.scaleMul(scale_val);
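// Scaling is always applied on the wider side of the conversion: when
// narrowing (dFMT smaller than sFMT) the inputs are divided by the
// scale before converting, and when widening the converted results are
// multiplied by it afterwards.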
cvt1 = std::clamp(float(cvt1), 0.0f, 1.0f);
cvt2 = std::clamp(float(cvt2), 0.0f, 1.0f);
if constexpr (dFMT::size() == 32) {
} else if (dFMT::size() == 16) {
} else if (dFMT::size() == 8) {
    vdst32[lane] = insertBits(vdst32[lane], 16 * out_opsel + 15,
                              16 * out_opsel, packed_data);
} else {
    vdst32[lane] = insertBits(vdst32[lane], 8 * out_opsel + 7,
                              8 * out_opsel, packed_data);
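// insertBits(val, first, last, bits) returns val with bits [first:last]
// replaced by the LSBs of `bits`. With two 8-bit results packed into
// packed_data and out_opsel == 1, this writes bits [31:16] of the
// destination dword and leaves the low half intact.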
if constexpr (dFMT::size() == 32) {
if constexpr (dFMT::size() == 32) {
if constexpr (dFMT::size() == 32) {
fatal("op idx %i out of bounds\n", opIdx);
fatal("op idx %i out of bounds\n", opIdx);
fatal("op idx %i out of bounds\n", opIdx);
256 "v_cvt_scalef32_pk_fp8_f32";
262 "v_cvt_scalef32_pk_bf8_f32";
268 "v_cvt_scalef32_pk_f32_fp8";
274 "v_cvt_scalef32_pk_f32_bf8";
280 "v_cvt_scalef32_pk_fp4_f32";
286 "v_cvt_scalef32_pk_f32_fp4";
292 "v_cvt_scalef32_pk_fp8_f16";
298 "v_cvt_scalef32_pk_bf8_f16";
304 "v_cvt_scalef32_pk_fp8_bf16";
310 "v_cvt_scalef32_pk_bf8_bf16";
316 "v_cvt_scalef32_pk_f16_fp8";
322 "v_cvt_scalef32_pk_f16_bf8";
328 "v_cvt_scalef32_pk_fp4_f16";
334 "v_cvt_scalef32_pk_fp4_bf16";
340 "v_cvt_scalef32_pk_f16_fp4";
346 "v_cvt_scalef32_pk_bf16_fp4";
352 "v_cvt_scalef32_pk_bf16_fp8";
358 "v_cvt_scalef32_pk_bf16_bf8";
template<typename dFMT, typename sFMT, const char **MNEM>
if (omod == 1)
    return val * 2.0f;
if (omod == 2)
    return val * 4.0f;
if (omod == 3)
    return val / 2.0f;
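// Standard VOP3 output-modifier encoding: OMOD values 1, 2, and 3 scale
// the result by 2x, 4x, and 0.5x respectively; OMOD 0 leaves it
// unchanged. E.g., omodModifier(val, 3) returns val / 2.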
static_assert(dFMT::size() == 32 || dFMT::size() == 16);
static_assert(sFMT::size() == 8);
sFMT in(bits(src0[lane], 8 * in_opsel + 7, 8 * in_opsel));
float scale_val = src1[lane];
cvt.scaleMul(scale_val);
cvt = std::clamp(float(cvt), 0.0f, 1.0f);
vdst[lane] = cvt.data >> (32 - dFMT::size());
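// MX values keep their bit pattern left-justified in the 32-bit data
// member: element reads shift left by (32 - size()) and register
// writebacks shift right by the same amount to right-justify the
// result, as here.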
fatal("op idx %i out of bounds\n", opIdx);
489 "v_cvt_scalef32_f16_bf8";
495 "v_cvt_scalef32_f16_fp8";
501 "v_cvt_scalef32_f32_bf8";
507 "v_cvt_scalef32_f32_fp8";
template<typename dFMT, typename sFMT, const char **MNEM>
static_assert(dFMT::size() == 32 ||
              dFMT::size() == 16 || dFMT::size() == 6);
static_assert(sFMT::size() == 16 || sFMT::size() == 6);
constexpr int components = sFMT::size() == 32 ? 16 : 32;
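// 32 components of a 6-bit format occupy 32 * 6 = 192 bits (six
// dwords), while 32 components of a 16-bit format occupy 512 bits
// (sixteen dwords); input_regs and output_regs are presumably derived
// from these widths.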
src0.reserve(input_regs);
for (int reg = 0; reg < input_regs; ++reg) {
    src0.emplace_back(gpuDynInst, extData.SRC0 + reg);
std::launder(reinterpret_cast<VecOperandU32*>(_vdst.data()));
for (int reg = 0; reg < output_regs; ++reg) {
PackedReg<sFMT::size() * components, sFMT::size()> in_reg;
PackedReg<dFMT::size() * components, dFMT::size()> out_reg;
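// PackedReg<TotalBits, ElemBits> is a staging buffer exposing
// getElem()/setElem() over the packed sub-dword elements and
// getDword()/setDword() for moving whole dwords to and from the vector
// registers.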
for (int reg = 0; reg < input_regs; ++reg) {
for (int pass = 0; pass < components; ++pass) {
    float scale_val = src1[lane];
    scale_val = std::fabs(scale_val);
    scale_val = -scale_val;
in.data = in_reg.getElem(pass) << (32 - sFMT::size());
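// Judging by the guards below, instData.ABS bit 0 applies an absolute
// value to the source and bit 3 to the result: each branch negates the
// value only when it is currently negative, which is equivalent to |x|.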
if (instData.ABS & 1 && float(in) < 0.0f) {
if constexpr (dFMT::size() < sFMT::size()) {
    out.scaleDiv(scale_val);
if constexpr (dFMT::size() >= sFMT::size()) {
    out.scaleMul(scale_val);
if (instData.ABS & 8 && float(out) < 0.0f) {
out_reg.setElem(pass, out.data >> (32 - dFMT::size()));
for (int reg = 0; reg < output_regs; ++reg) {
for (int reg = 0; reg < output_regs; ++reg) {
if constexpr (sFMT::size() == 32) {
    return sFMT::size() * 2;
return sFMT::size() * 4;
return dFMT::size() * 4;
fatal("op idx %i out of bounds\n", opIdx);
687 "v_cvt_scalef32_pk32_bf16_bf6";
694 "v_cvt_scalef32_pk32_bf16_fp6";
701 "v_cvt_scalef32_pk32_bf6_bf16";
708 "v_cvt_scalef32_pk32_bf6_f16";
715 "v_cvt_scalef32_pk32_f16_bf6";
722 "v_cvt_scalef32_pk32_f16_fp6";
729 "v_cvt_scalef32_pk32_f32_bf6";
736 "v_cvt_scalef32_pk32_f32_fp6";
743 "v_cvt_scalef32_pk32_fp6_bf16";
750 "v_cvt_scalef32_pk32_fp6_f16";
template<typename dFMT, typename sFMT, const char **MNEM>
static_assert(dFMT::size() == 6);
static_assert(sFMT::size() == 32);
constexpr int components = 32;
for (int reg = 0; reg < input_regs; ++reg) {
for (int reg = 0; reg < input_regs; ++reg) {
std::launder(reinterpret_cast<VecOperandU32*>(_vdst.data()));
for (int reg = 0; reg < output_regs; ++reg) {
PackedReg<sFMT::size() * components, sFMT::size()> in_reg;
PackedReg<dFMT::size() * components, dFMT::size()> out_reg;
for (int reg = 0; reg < input_regs; ++reg) {
for (int reg = 0; reg < input_regs; ++reg) {
for (int pass = 0; pass < components; ++pass) {
    in.data = in_reg.getElem(pass) << (32 - sFMT::size());
if (instData.ABS & 1 && float(in) < 0.0f) {
float scale_val = src2[lane];
scale_val = std::fabs(scale_val);
scale_val = -scale_val;
in.scaleDiv(scale_val);
if (instData.ABS & 8 && float(out) < 0.0f) {
out_reg.setElem(pass, out.data >> (32 - dFMT::size()));
for (int reg = 0; reg < output_regs; ++reg) {
for (int reg = 0; reg < output_regs; ++reg) {
return sFMT::size() * 2;
return sFMT::size() * 2;
return dFMT::size() * 4;
fatal("op idx %i out of bounds\n", opIdx);
932 "v_cvt_scalef32_2xpk16_bf6_f32";
939 "v_cvt_scalef32_2xpk16_fp6_f32";
template<typename dFMT, typename sFMT, const char **MNEM>
static_assert(dFMT::size() == 8);
static_assert(sFMT::size() == 32 || sFMT::size() == 16);
if (instData.ABS & 1 && float(in) < 0.0f) {
seed_val = std::abs(seed_val);
seed_val = -seed_val;
float scale_val = src2[lane];
scale_val = std::fabs(scale_val);
scale_val = -scale_val;
in.scaleDiv(scale_val);
using sInfo = decltype(in.getFmt());
using dInfo = decltype(cvt.getFmt());
cvt.setFmt(cvt_info);
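// The conversion step elided here presumably runs through
// AMDGPU::convertMXFP(), which takes an mxfpRoundingMode and a 32-bit
// seed; the SR ("stochastic rounding") variants pass seed_val so the
// rounding decision is randomized per lane.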
if (instData.ABS & 8 && float(cvt) < 0.0f) {
vdst[lane] = insertBits(vdst[lane], out_byte * 8 + 7, out_byte * 8,
                        bits(cvt.data, 31, 32 - dFMT::size()));
fatal("op idx %i out of bounds\n", opIdx);
1085 "v_cvt_scale_sr_bf8_f16";
1092 "v_cvt_scale_sr_bf8_f32";
1099 "v_cvt_scale_sr_fp8_bf16";
1106 "v_cvt_scale_sr_bf8_bf16";
1113 "v_cvt_scale_sr_fp8_f16";
1120 "v_cvt_scale_sr_fp8_f32";
template<typename dFMT, typename sFMT, const char **MNEM>
static_assert(dFMT::size() == 6);
static_assert(sFMT::size() == 32 || sFMT::size() == 16);
constexpr int components = 32;
Wavefront *wf = gpuDynInst->wavefront();
for (int reg = 0; reg < input_regs; ++reg) {
std::launder(reinterpret_cast<VecOperandU32*>(_vdst.data()));
for (int reg = 0; reg < output_regs; ++reg) {
PackedReg<sFMT::size() * components, sFMT::size()> in_reg;
PackedReg<dFMT::size() * components, dFMT::size()> out_reg;
for (int reg = 0; reg < input_regs; ++reg) {
for (int pass = 0; pass < components; ++pass) {
    in.data = in_reg.getElem(pass) << (32 - sFMT::size());
if (instData.ABS & 1 && float(in) < 0.0f) {
seed_val = std::fabs(seed_val);
seed_val = -seed_val;
float scale_val = src2[lane];
scale_val = std::fabs(scale_val);
scale_val = -scale_val;
in.scaleDiv(scale_val);
using sInfo = decltype(in.getFmt());
using dInfo = decltype(out.getFmt());
out.setFmt(cvt_info);
if (instData.ABS & 8 && float(out) < 0.0f) {
out_reg.setElem(pass, out.data >> (32 - dFMT::size()));
for (int reg = 0; reg < output_regs; ++reg) {
for (int reg = 0; reg < output_regs; ++reg) {
return sFMT::size() * 4;
fatal("op idx %i out of bounds\n", opIdx);
1303 "v_cvt_scale_sr_pk_bf6_bf16";
1310 "v_cvt_scale_sr_pk_bf6_f16";
1317 "v_cvt_scale_sr_pk_bf6_f32";
1324 "v_cvt_scale_sr_pk_fp6_bf16";
1331 "v_cvt_scale_sr_pk_fp6_f16";
1338 "v_cvt_scale_sr_pk_fp6_f32";
template<typename dFMT, typename sFMT, const char **MNEM>
static_assert(dFMT::size() == 4);
static_assert(sFMT::size() == 32 || sFMT::size() == 16);
Wavefront *wf = gpuDynInst->wavefront();
if constexpr (sFMT::size() == 32) {
if constexpr (sFMT::size() == 32) {
    in[0].data = src0[0][lane];
    in[1].data = src0[1][lane];
} else {
    assert(sFMT::size() == 16);
    // The two f16 halves are left-justified into the 32-bit container;
    // the upper half is bits [31:16] of the source dword.
    in[0].data = bits(src0[0][lane], 15, 0) << 16;
    in[1].data = bits(src0[0][lane], 31, 16) << 16;
if (float(in[0]) < 0.0f) {
if (float(in[1]) < 0.0f) {
seed_val = std::fabs(seed_val);
seed_val = -seed_val;
float scale_val = src2[lane];
scale_val = std::fabs(scale_val);
scale_val = -scale_val;
in[0].scaleDiv(scale_val);
in[1].scaleDiv(scale_val);
using sInfo = decltype(in[0].getFmt());
using dInfo = decltype(out[0].getFmt());
out[0].setFmt(cvt_info);
out[1].setFmt(cvt_info);
if (float(out[0]) < 0.0f) {
if (float(out[1]) < 0.0f) {
uint8_t packed_output = (bits(out[1].data, 31, 28) << 4)
                      | bits(out[0].data, 31, 28);
vdst[lane] = insertBits(vdst[lane], 8 * out_opsel + 7,
                        8 * out_opsel, packed_output);
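// Each FP4 result sits in the top four bits of its 32-bit container, so
// bits [31:28] of the two results form one byte (out[1] in the high
// nibble) written to byte out_opsel of the destination.
// NOTE: the low-nibble OR term above is inferred from this pattern.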
return sFMT::size() / 4;
fatal("op idx %i out of bounds\n", opIdx);
1522 "v_cvt_scale_sr_pk_fp4_bf16";
1529 "v_cvt_scale_sr_pk_fp4_f16";
1536 "v_cvt_scale_sr_pk_fp4_f32";