51 static_assert(N < 32);
54 return static_cast<int32_t
>(value);
57 int32_t min = -(1 << (N - 1));
58 int32_t max = (1 << (N - 1)) - 1;
59 return std::clamp<int32_t>(value, min, max);
67 static_assert(N < 32);
70 return static_cast<int32_t
>(value);
74 uint32_t max = (1 << N) - 1;
75 return std::clamp<int32_t>(value, min, max);
82 return static_cast<int16_t
>(value);
85 return std::clamp(value,
86 static_cast<int32_t
>(std::numeric_limits<int16_t>::min()),
87 static_cast<int32_t
>(std::numeric_limits<int16_t>::max()));
94 return static_cast<uint16_t
>(value);
97 return std::clamp(value,
98 static_cast<uint32_t
>(std::numeric_limits<uint16_t>::min()),
99 static_cast<uint32_t
>(std::numeric_limits<uint16_t>::max()));
110 constexpr uint16_t one = 0x3c00;
111 constexpr uint16_t zero = 0x0;
112 ArmISA::FPSCR fpscr1, fpscr2;
126 return std::clamp(value, 0.0f, 1.0f);
136 [](int16_t S0, int16_t S1, int16_t S2,
bool clamp) -> int16_t
138 return clampI16(S0 * S1 + S2, clamp);
147 auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t
150 uint32_t D = S0 * S1;
151 uint16_t Dh = D & 0xFFFF;
160 auto opImpl = [](int16_t S0, int16_t S1,
bool clamp) -> int16_t
170 auto opImpl = [](int16_t S0, int16_t S1,
bool clamp) -> int16_t
180 auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t
182 unsigned shift_val =
bits(S0, 3, 0);
185 return S1 << shift_val;
193 auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t
195 unsigned shift_val =
bits(S0, 3, 0);
197 return S1 >> shift_val;
205 auto opImpl = [](int16_t S0, int16_t S1,
bool clamp) -> int16_t
210 unsigned shift_val =
bits(S0, 3, 0);
212 return S1e >> shift_val;
220 auto opImpl = [](int16_t S0, int16_t S1,
bool clamp) -> int16_t
222 return clampI16((S0 >= S1) ? S0 : S1, clamp);
230 auto opImpl = [](int16_t S0, int16_t S1,
bool clamp) -> int16_t
232 return clampI16((S0 < S1) ? S0 : S1, clamp);
241 [](uint16_t S0, uint16_t S1, uint16_t S2,
bool clamp) -> uint16_t
243 return clampU16(S0 * S1 + S2, clamp);
251 auto opImpl = [](uint16_t S0, uint16_t S1,
bool clamp) -> uint16_t
261 auto opImpl = [](uint16_t S0, uint16_t S1,
bool clamp) -> uint16_t
271 auto opImpl = [](uint16_t S0, uint16_t S1,
bool clamp) -> uint16_t
273 return clampU16((S0 >= S1) ? S0 : S1, clamp);
281 auto opImpl = [](uint16_t S0, uint16_t S1,
bool clamp) -> uint16_t
283 return clampU16((S0 < S1) ? S0 : S1, clamp);
347 [](uint32_t S0r, uint32_t S1r, uint32_t S2r,
bool clamp) -> uint32_t
349 constexpr unsigned INBITS = 16;
351 constexpr unsigned elems = 32 / INBITS;
355 for (
int i = 0;
i < elems; ++
i) {
356 S0[
i] =
bits(S0r,
i*INBITS+INBITS-1,
i*INBITS);
357 S1[
i] =
bits(S1r,
i*INBITS+INBITS-1,
i*INBITS);
360 float S2 = *
reinterpret_cast<float*
>(&S2r);
366 for (
int i = 0;
i < elems; ++
i) {
372 Csum +=
clampF32(*
reinterpret_cast<float*
>(&conv), clamp);
376 uint32_t rv = *
reinterpret_cast<uint32_t*
>(&Csum);
401 a1.data = uint16_t(
bits(src0[lane], 15, 0));
402 a2.
data = uint16_t(
bits(src0[lane], 31, 16));
403 b1.data = uint16_t(
bits(src1[lane], 15, 0));
404 b2.data = uint16_t(
bits(src1[lane], 31, 16));
406 if (
instData.NEG_HI & 0x1) a2 = -a2;
411 vdst[lane] += float(
a1) * float(
b1);
412 vdst[lane] += float(a2) * float(
b2);
413 vdst[lane] += src2[lane];
425 [](uint32_t S0r, uint32_t S1r, uint32_t S2r,
bool clamp) -> uint32_t
427 constexpr unsigned INBITS = 16;
429 constexpr unsigned elems = 32 / INBITS;
433 for (
int i = 0;
i < elems; ++
i) {
434 S0[
i] =
bits(S0r,
i*INBITS+INBITS-1,
i*INBITS);
435 S1[
i] =
bits(S1r,
i*INBITS+INBITS-1,
i*INBITS);
438 int32_t S2 = *
reinterpret_cast<int32_t*
>(&S2r);
444 for (
int i = 0;
i < elems; ++
i) {
451 uint32_t rv = *
reinterpret_cast<uint32_t*
>(&Csum);
462 [](uint32_t S0r, uint32_t S1r, uint32_t S2,
bool clamp) -> uint32_t
464 constexpr unsigned INBITS = 16;
466 constexpr unsigned elems = 32 / INBITS;
470 for (
int i = 0;
i < elems; ++
i) {
471 S0[
i] =
bits(S0r,
i*INBITS+INBITS-1,
i*INBITS);
472 S1[
i] =
bits(S1r,
i*INBITS+INBITS-1,
i*INBITS);
479 for (
int i = 0;
i < elems; ++
i) {
480 C[
i] = S0[
i] * S1[
i];
496 [](uint32_t S0r, uint32_t S1r, uint32_t S2r,
bool clamp) -> uint32_t
498 constexpr unsigned INBITS = 8;
500 constexpr unsigned elems = 32 / INBITS;
504 for (
int i = 0;
i < elems; ++
i) {
505 S0[
i] =
bits(S0r,
i*INBITS+INBITS-1,
i*INBITS);
506 S1[
i] =
bits(S1r,
i*INBITS+INBITS-1,
i*INBITS);
509 int32_t S2 = *
reinterpret_cast<int32_t*
>(&S2r);
515 for (
int i = 0;
i < elems; ++
i) {
522 uint32_t rv = *
reinterpret_cast<uint32_t*
>(&Csum);
533 [](uint32_t S0r, uint32_t S1r, uint32_t S2,
bool clamp) -> uint32_t
535 constexpr unsigned INBITS = 8;
537 constexpr unsigned elems = 32 / INBITS;
541 for (
int i = 0;
i < elems; ++
i) {
542 S0[
i] =
bits(S0r,
i*INBITS+INBITS-1,
i*INBITS);
543 S1[
i] =
bits(S1r,
i*INBITS+INBITS-1,
i*INBITS);
550 for (
int i = 0;
i < elems; ++
i) {
551 C[
i] = S0[
i] * S1[
i];
567 [](uint32_t S0r, uint32_t S1r, uint32_t S2r,
bool clamp) -> uint32_t
569 constexpr unsigned INBITS = 4;
571 constexpr unsigned elems = 32 / INBITS;
575 for (
int i = 0;
i < elems; ++
i) {
576 S0[
i] =
bits(S0r,
i*INBITS+INBITS-1,
i*INBITS);
577 S1[
i] =
bits(S1r,
i*INBITS+INBITS-1,
i*INBITS);
580 int32_t S2 = *
reinterpret_cast<int32_t*
>(&S2r);
586 for (
int i = 0;
i < elems; ++
i) {
593 uint32_t rv = *
reinterpret_cast<uint32_t*
>(&Csum);
604 [](uint32_t S0r, uint32_t S1r, uint32_t S2,
bool clamp) -> uint32_t
606 constexpr unsigned INBITS = 4;
608 constexpr unsigned elems = 32 / INBITS;
612 for (
int i = 0;
i < elems; ++
i) {
613 S0[
i] =
bits(S0r,
i*INBITS+INBITS-1,
i*INBITS);
614 S1[
i] =
bits(S1r,
i*INBITS+INBITS-1,
i*INBITS);
621 for (
int i = 0;
i < elems; ++
i) {
622 C[
i] = S0[
i] * S1[
i];
647 vdst[lane] = src[lane];
666 vdst[lane] = src[lane];
712 uint32_t s0l = (opsel & 1) ?
bits(src0[lane], 63, 32)
713 :
bits(src0[lane], 31, 0);
714 uint32_t s1l = (opsel & 2) ?
bits(src1[lane], 63, 32)
715 :
bits(src1[lane], 31, 0);
716 uint32_t s2l = (opsel & 4) ?
bits(src2[lane], 63, 32)
717 :
bits(src2[lane], 31, 0);
719 float s0lf = *
reinterpret_cast<float*
>(&s0l);
720 float s1lf = *
reinterpret_cast<float*
>(&s1l);
721 float s2lf = *
reinterpret_cast<float*
>(&s2l);
723 if (neg & 1) s0lf = -s0lf;
724 if (neg & 1) s1lf = -s1lf;
725 if (neg & 1) s2lf = -s2lf;
727 float dword1 = std::fma(s0lf, s1lf, s2lf);
729 uint32_t s0h = (opsel_hi & 1) ?
bits(src0[lane], 63, 32)
730 :
bits(src0[lane], 31, 0);
731 uint32_t s1h = (opsel_hi & 2) ?
bits(src1[lane], 63, 32)
732 :
bits(src1[lane], 31, 0);
733 uint32_t s2h = (opsel_hi & 4) ?
bits(src2[lane], 63, 32)
734 :
bits(src2[lane], 31, 0);
736 float s0hf = *
reinterpret_cast<float*
>(&s0h);
737 float s1hf = *
reinterpret_cast<float*
>(&s1h);
738 float s2hf = *
reinterpret_cast<float*
>(&s2h);
740 if (neg_hi & 1) s0hf = -s0hf;
741 if (neg_hi & 1) s1hf = -s1hf;
742 if (neg_hi & 1) s2hf = -s2hf;
744 float dword2 = std::fma(s0hf, s1hf, s2hf);
746 uint32_t result1 = *
reinterpret_cast<uint32_t*
>(&dword1);
747 uint32_t result2 = *
reinterpret_cast<uint32_t*
>(&dword2);
749 vdst[lane] = (
static_cast<uint64_t
>(result2) << 32) | result1;
785 int opsel_hi =
extData.OPSEL_HI;
792 uint32_t lower_dword = (opsel & 1) ?
bits(src0[lane], 63, 32)
793 :
bits(src0[lane], 31, 0);
794 uint32_t upper_dword = (opsel & 2) ?
bits(src1[lane], 63, 32)
795 :
bits(src1[lane], 31, 0);
797 float ldwordf = *
reinterpret_cast<float*
>(&lower_dword);
798 float udwordf = *
reinterpret_cast<float*
>(&upper_dword);
800 if (neg & 1) ldwordf = -ldwordf;
801 if (neg & 2) udwordf = -udwordf;
803 float dword1 = ldwordf * udwordf;
805 lower_dword = (opsel_hi & 1) ?
bits(src0[lane], 63, 32)
806 :
bits(src0[lane], 31, 0);
807 upper_dword = (opsel_hi & 2) ?
bits(src1[lane], 63, 32)
808 :
bits(src1[lane], 31, 0);
810 ldwordf = *
reinterpret_cast<float*
>(&lower_dword);
811 udwordf = *
reinterpret_cast<float*
>(&upper_dword);
813 if (neg_hi & 1) ldwordf = -ldwordf;
814 if (neg_hi & 2) udwordf = -udwordf;
816 float dword2 = ldwordf * udwordf;
818 uint32_t result1 = *
reinterpret_cast<uint32_t*
>(&dword1);
819 uint32_t result2 = *
reinterpret_cast<uint32_t*
>(&dword2);
821 vdst[lane] = (
static_cast<uint64_t
>(result2) << 32) | result1;
860 int opsel_hi =
extData.OPSEL_HI;
867 uint32_t lower_dword = (opsel & 1) ?
bits(src0[lane], 63, 32)
868 :
bits(src0[lane], 31, 0);
869 uint32_t upper_dword = (opsel & 2) ?
bits(src1[lane], 63, 32)
870 :
bits(src1[lane], 31, 0);
872 float ldwordf = *
reinterpret_cast<float*
>(&lower_dword);
873 float udwordf = *
reinterpret_cast<float*
>(&upper_dword);
875 if (neg & 1) ldwordf = -ldwordf;
876 if (neg & 2) udwordf = -udwordf;
878 float dword1 = ldwordf + udwordf;
880 lower_dword = (opsel_hi & 1) ?
bits(src0[lane], 63, 32)
881 :
bits(src0[lane], 31, 0);
882 upper_dword = (opsel_hi & 2) ?
bits(src1[lane], 63, 32)
883 :
bits(src1[lane], 31, 0);
885 ldwordf = *
reinterpret_cast<float*
>(&lower_dword);
886 udwordf = *
reinterpret_cast<float*
>(&upper_dword);
888 if (neg_hi & 1) ldwordf = -ldwordf;
889 if (neg_hi & 2) udwordf = -udwordf;
891 float dword2 = ldwordf + udwordf;
893 uint32_t result1 = *
reinterpret_cast<uint32_t*
>(&dword1);
894 uint32_t result2 = *
reinterpret_cast<uint32_t*
>(&dword2);
896 vdst[lane] = (
static_cast<uint64_t
>(result2) << 32) | result1;
933 "Negative modifier undefined for %s",
_opcode);
938 uint64_t lower_dword = (opsel & 1) ?
bits(src0[lane], 63, 32)
939 :
bits(src0[lane], 31, 0);
940 uint64_t upper_dword = (opsel & 2) ?
bits(src1[lane], 63, 32)
941 :
bits(src1[lane], 31, 0);
943 vdst[lane] = upper_dword << 32 | lower_dword;
984 bits(src0[lane], opsel * 8 + 7, opsel * 8));
997 bits(src1[lane], opsel * 8 + 7, opsel * 8));
const std::string _opcode
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
Inst_VOP3P__V_MFMA_LOAD_SCALE(InFmt_VOP3P *)
void execute(GPUDynInstPtr) override
~Inst_VOP3P__V_MFMA_LOAD_SCALE()
void execute(GPUDynInstPtr gpuDynInst) override
~Inst_VOP3P__V_PK_ADD_F32()
Inst_VOP3P__V_PK_ADD_F32(InFmt_VOP3P *)
void execute(GPUDynInstPtr) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
Inst_VOP3P__V_PK_FMA_F32(InFmt_VOP3P *)
~Inst_VOP3P__V_PK_FMA_F32()
void execute(GPUDynInstPtr) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr) override
Inst_VOP3P__V_PK_MOV_B32(InFmt_VOP3P *)
~Inst_VOP3P__V_PK_MOV_B32()
void execute(GPUDynInstPtr gpuDynInst) override
~Inst_VOP3P__V_PK_MUL_F32()
Inst_VOP3P__V_PK_MUL_F32(InFmt_VOP3P *)
void execute(GPUDynInstPtr) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void dotHelper(GPUDynInstPtr gpuDynInst, uint32_t(*fOpImpl)(uint32_t, uint32_t, uint32_t, bool))
void vop3pHelper(GPUDynInstPtr gpuDynInst, T(*fOpImpl)(T, T, bool))
Inst_VOP3P(InFmt_VOP3P *, const std::string &opcode)
void read() override
read from the vrf.
void readSrc()
certain vector operands can read from the vrf/srf or constants.
void write() override
write to the vrf.
void setMfmaBScale(int idx, uint8_t value)
void setMfmaAScale(int idx, uint8_t value)
Floating-point library code, which will gradually replace vfp.hh.
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
constexpr uint64_t sext(uint64_t val)
Sign-extend an N-bit value to 64 bits.
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
#define warn_if(cond,...)
Conditional warning macro that checks the supplied condition and only prints a warning if the conditi...
mxfp< fp16_e8m7_info > mxbfloat16
uint16_t fplibConvert(uint32_t op, FPRounding rounding, FPSCR &fpscr, FPCR fpcr)
uint16_t fplibMul(uint16_t op1, uint16_t op2, FPSCR &fpscr, FPCR fpcr)
uint16_t fplibMulAdd(uint16_t addend, uint16_t op1, uint16_t op2, FPSCR &fpscr, FPCR fpcr)
uint16_t fplibMax(uint16_t op1, uint16_t op2, FPSCR &fpscr, FPCR fpcr)
uint16_t fplibMin(uint16_t op1, uint16_t op2, FPSCR &fpscr, FPCR fpcr)
uint16_t fplibAdd(uint16_t op1, uint16_t op2, FPSCR &fpscr, FPCR fpcr)
classes that represent vector/scalar operands in VEGA ISA.
int16_t clampI16(int32_t value, bool clamp)
bool isVectorReg(int opIdx)
int32_t dotClampI(int32_t value, bool clamp)
float clampF32(float value, bool clamp)
VecOperand< VecElemU32, false > VecOperandU32
VecOperand< VecElemU32, true > ConstVecOperandU32
uint16_t clampF16(uint16_t value, bool clamp)
const int NumVecElemPerVecReg(64)
VecOperand< VecElemU64, false > VecOperandU64
uint32_t dotClampU(uint32_t value, bool clamp)
VecOperand< VecElemU64, true > ConstVecOperandU64
uint16_t clampU16(uint32_t value, bool clamp)
Copyright (c) 2024 Arm Limited All rights reserved.
std::shared_ptr< GPUDynInst > GPUDynInstPtr