51 static_assert(N < 32);
54 return static_cast<int32_t
>(value);
57 int32_t min = -(1 << (N - 1));
58 int32_t max = (1 << (N - 1)) - 1;
59 return std::clamp<int32_t>(value, min, max);
67 static_assert(N < 32);
70 return static_cast<int32_t
>(value);
74 uint32_t max = (1 << N) - 1;
75 return std::clamp<int32_t>(value, min, max);
82 return static_cast<int16_t
>(value);
85 return std::clamp(value,
86 static_cast<int32_t
>(std::numeric_limits<int16_t>::min()),
87 static_cast<int32_t
>(std::numeric_limits<int16_t>::max()));
94 return static_cast<uint16_t
>(value);
97 return std::clamp(value,
98 static_cast<uint32_t
>(std::numeric_limits<uint16_t>::min()),
99 static_cast<uint32_t
>(std::numeric_limits<uint16_t>::max()));
110 constexpr uint16_t one = 0x3c00;
111 constexpr uint16_t zero = 0x0;
112 ArmISA::FPSCR fpscr1, fpscr2;
126 return std::clamp(value, 0.0f, 1.0f);
136 [](int16_t S0, int16_t S1, int16_t S2,
bool clamp) -> int16_t
138 return clampI16(S0 * S1 + S2, clamp);
147 auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t
150 uint32_t D = static_cast<uint32_t>(S0) * S1;  // widen first: uint16_t operands promote to int, and 0xFFFF * 0xFFFF overflows it
151 uint16_t Dh = D & 0xFFFF;
160 auto opImpl = [](int16_t S0, int16_t S1,
bool clamp) -> int16_t
170 auto opImpl = [](int16_t S0, int16_t S1,
bool clamp) -> int16_t
180 auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t
182 unsigned shift_val =
bits(S0, 3, 0);
185 return S1 << shift_val;
193 auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t
195 unsigned shift_val =
bits(S0, 3, 0);
197 return S1 >> shift_val;
205 auto opImpl = [](int16_t S0, int16_t S1,
bool clamp) -> int16_t
210 unsigned shift_val =
bits(S0, 3, 0);
212 return S1e >> shift_val;
220 auto opImpl = [](int16_t S0, int16_t S1,
bool clamp) -> int16_t
222 return clampI16((S0 >= S1) ? S0 : S1, clamp);
230 auto opImpl = [](int16_t S0, int16_t S1,
bool clamp) -> int16_t
232 return clampI16((S0 < S1) ? S0 : S1, clamp);
241 [](uint16_t S0, uint16_t S1, uint16_t S2,
bool clamp) -> uint16_t
243 return clampU16(static_cast<uint32_t>(S0) * S1 + S2, clamp);  // multiply-add in uint32_t; the promoted-int product could overflow
251 auto opImpl = [](uint16_t S0, uint16_t S1,
bool clamp) -> uint16_t
261 auto opImpl = [](uint16_t S0, uint16_t S1,
bool clamp) -> uint16_t
271 auto opImpl = [](uint16_t S0, uint16_t S1,
bool clamp) -> uint16_t
273 return clampU16((S0 >= S1) ? S0 : S1, clamp);
281 auto opImpl = [](uint16_t S0, uint16_t S1,
bool clamp) -> uint16_t
283 return clampU16((S0 < S1) ? S0 : S1, clamp);
347 [](uint32_t S0r, uint32_t S1r, uint32_t S2r,
bool clamp) -> uint32_t
349 constexpr unsigned INBITS = 16;
351 constexpr unsigned elems = 32 / INBITS;
355 for (
int i = 0;
i < elems; ++
i) {
356 S0[
i] =
bits(S0r,
i*INBITS+INBITS-1,
i*INBITS);
357 S1[
i] =
bits(S1r,
i*INBITS+INBITS-1,
i*INBITS);
360 float S2 = *
reinterpret_cast<float*
>(&S2r);
366 for (
int i = 0;
i < elems; ++
i) {
372 Csum +=
clampF32(*
reinterpret_cast<float*
>(&conv), clamp);
376 uint32_t rv = *
reinterpret_cast<uint32_t*
>(&Csum);
387 [](uint32_t S0r, uint32_t S1r, uint32_t S2r,
bool clamp) -> uint32_t
389 constexpr unsigned INBITS = 16;
391 constexpr unsigned elems = 32 / INBITS;
395 for (
int i = 0;
i < elems; ++
i) {
396 S0[
i] =
bits(S0r,
i*INBITS+INBITS-1,
i*INBITS);
397 S1[
i] =
bits(S1r,
i*INBITS+INBITS-1,
i*INBITS);
400 int32_t S2 = *
reinterpret_cast<int32_t*
>(&S2r);
406 for (
int i = 0;
i < elems; ++
i) {
413 uint32_t rv = *
reinterpret_cast<uint32_t*
>(&Csum);
424 [](uint32_t S0r, uint32_t S1r, uint32_t S2,
bool clamp) -> uint32_t
426 constexpr unsigned INBITS = 16;
428 constexpr unsigned elems = 32 / INBITS;
432 for (
int i = 0;
i < elems; ++
i) {
433 S0[
i] =
bits(S0r,
i*INBITS+INBITS-1,
i*INBITS);
434 S1[
i] =
bits(S1r,
i*INBITS+INBITS-1,
i*INBITS);
441 for (
int i = 0;
i < elems; ++
i) {
442 C[
i] = S0[
i] * S1[
i];
458 [](uint32_t S0r, uint32_t S1r, uint32_t S2r,
bool clamp) -> uint32_t
460 constexpr unsigned INBITS = 8;
462 constexpr unsigned elems = 32 / INBITS;
466 for (
int i = 0;
i < elems; ++
i) {
467 S0[
i] =
bits(S0r,
i*INBITS+INBITS-1,
i*INBITS);
468 S1[
i] =
bits(S1r,
i*INBITS+INBITS-1,
i*INBITS);
471 int32_t S2 = *
reinterpret_cast<int32_t*
>(&S2r);
477 for (
int i = 0;
i < elems; ++
i) {
484 uint32_t rv = *
reinterpret_cast<uint32_t*
>(&Csum);
495 [](uint32_t S0r, uint32_t S1r, uint32_t S2,
bool clamp) -> uint32_t
497 constexpr unsigned INBITS = 8;
499 constexpr unsigned elems = 32 / INBITS;
503 for (
int i = 0;
i < elems; ++
i) {
504 S0[
i] =
bits(S0r,
i*INBITS+INBITS-1,
i*INBITS);
505 S1[
i] =
bits(S1r,
i*INBITS+INBITS-1,
i*INBITS);
512 for (
int i = 0;
i < elems; ++
i) {
513 C[
i] = S0[
i] * S1[
i];
529 [](uint32_t S0r, uint32_t S1r, uint32_t S2r,
bool clamp) -> uint32_t
531 constexpr unsigned INBITS = 4;
533 constexpr unsigned elems = 32 / INBITS;
537 for (
int i = 0;
i < elems; ++
i) {
538 S0[
i] =
bits(S0r,
i*INBITS+INBITS-1,
i*INBITS);
539 S1[
i] =
bits(S1r,
i*INBITS+INBITS-1,
i*INBITS);
542 int32_t S2 = *
reinterpret_cast<int32_t*
>(&S2r);
548 for (
int i = 0;
i < elems; ++
i) {
555 uint32_t rv = *
reinterpret_cast<uint32_t*
>(&Csum);
566 [](uint32_t S0r, uint32_t S1r, uint32_t S2,
bool clamp) -> uint32_t
568 constexpr unsigned INBITS = 4;
570 constexpr unsigned elems = 32 / INBITS;
574 for (
int i = 0;
i < elems; ++
i) {
575 S0[
i] =
bits(S0r,
i*INBITS+INBITS-1,
i*INBITS);
576 S1[
i] =
bits(S1r,
i*INBITS+INBITS-1,
i*INBITS);
583 for (
int i = 0;
i < elems; ++
i) {
584 C[
i] = S0[
i] * S1[
i];
609 vdst[lane] = src[lane];
628 vdst[lane] = src[lane];
674 uint32_t s0l = (opsel & 1) ?
bits(src0[lane], 63, 32)
675 :
bits(src0[lane], 31, 0);
676 uint32_t s1l = (opsel & 2) ?
bits(src1[lane], 63, 32)
677 :
bits(src1[lane], 31, 0);
678 uint32_t s2l = (opsel & 4) ?
bits(src2[lane], 63, 32)
679 :
bits(src2[lane], 31, 0);
681 float s0lf = *
reinterpret_cast<float*
>(&s0l);
682 float s1lf = *
reinterpret_cast<float*
>(&s1l);
683 float s2lf = *
reinterpret_cast<float*
>(&s2l);
685 if (neg & 1) s0lf = -s0lf;  // one modifier bit per operand, same bit layout as the opsel masks: bit 0 -> src0
686 if (neg & 2) s1lf = -s1lf;  // bit 1 -> src1
687 if (neg & 4) s2lf = -s2lf;  // bit 2 -> src2
689 float dword1 = std::fma(s0lf, s1lf, s2lf);
691 uint32_t s0h = (opsel_hi & 1) ?
bits(src0[lane], 63, 32)
692 :
bits(src0[lane], 31, 0);
693 uint32_t s1h = (opsel_hi & 2) ?
bits(src1[lane], 63, 32)
694 :
bits(src1[lane], 31, 0);
695 uint32_t s2h = (opsel_hi & 4) ?
bits(src2[lane], 63, 32)
696 :
bits(src2[lane], 31, 0);
698 float s0hf = *
reinterpret_cast<float*
>(&s0h);
699 float s1hf = *
reinterpret_cast<float*
>(&s1h);
700 float s2hf = *
reinterpret_cast<float*
>(&s2h);
702 if (neg_hi & 1) s0hf = -s0hf;  // one modifier bit per operand, same bit layout as the opsel_hi masks: bit 0 -> src0
703 if (neg_hi & 2) s1hf = -s1hf;  // bit 1 -> src1
704 if (neg_hi & 4) s2hf = -s2hf;  // bit 2 -> src2
706 float dword2 = std::fma(s0hf, s1hf, s2hf);
708 uint32_t result1 = *
reinterpret_cast<uint32_t*
>(&dword1);
709 uint32_t result2 = *
reinterpret_cast<uint32_t*
>(&dword2);
711 vdst[lane] = (
static_cast<uint64_t
>(result2) << 32) | result1;
754 uint32_t lower_dword = (opsel & 1) ?
bits(src0[lane], 63, 32)
755 :
bits(src0[lane], 31, 0);
756 uint32_t upper_dword = (opsel & 2) ?
bits(src1[lane], 63, 32)
757 :
bits(src1[lane], 31, 0);
759 float ldwordf = *
reinterpret_cast<float*
>(&lower_dword);
760 float udwordf = *
reinterpret_cast<float*
>(&upper_dword);
762 if (neg & 1) ldwordf = -ldwordf;
763 if (neg & 2) udwordf = -udwordf;
765 float dword1 = ldwordf * udwordf;
767 lower_dword = (opsel_hi & 1) ?
bits(src0[lane], 63, 32)
768 :
bits(src0[lane], 31, 0);
769 upper_dword = (opsel_hi & 2) ?
bits(src1[lane], 63, 32)
770 :
bits(src1[lane], 31, 0);
772 ldwordf = *
reinterpret_cast<float*
>(&lower_dword);
773 udwordf = *
reinterpret_cast<float*
>(&upper_dword);
775 if (neg_hi & 1) ldwordf = -ldwordf;
776 if (neg_hi & 2) udwordf = -udwordf;
778 float dword2 = ldwordf * udwordf;
780 uint32_t result1 = *
reinterpret_cast<uint32_t*
>(&dword1);
781 uint32_t result2 = *
reinterpret_cast<uint32_t*
>(&dword2);
783 vdst[lane] = (
static_cast<uint64_t
>(result2) << 32) | result1;
829 uint32_t lower_dword = (opsel & 1) ?
bits(src0[lane], 63, 32)
830 :
bits(src0[lane], 31, 0);
831 uint32_t upper_dword = (opsel & 2) ?
bits(src1[lane], 63, 32)
832 :
bits(src1[lane], 31, 0);
834 float ldwordf = *
reinterpret_cast<float*
>(&lower_dword);
835 float udwordf = *
reinterpret_cast<float*
>(&upper_dword);
837 if (neg & 1) ldwordf = -ldwordf;
838 if (neg & 2) udwordf = -udwordf;
840 float dword1 = ldwordf + udwordf;
842 lower_dword = (opsel_hi & 1) ?
bits(src0[lane], 63, 32)
843 :
bits(src0[lane], 31, 0);
844 upper_dword = (opsel_hi & 2) ?
bits(src1[lane], 63, 32)
845 :
bits(src1[lane], 31, 0);
847 ldwordf = *
reinterpret_cast<float*
>(&lower_dword);
848 udwordf = *
reinterpret_cast<float*
>(&upper_dword);
850 if (neg_hi & 1) ldwordf = -ldwordf;
851 if (neg_hi & 2) udwordf = -udwordf;
853 float dword2 = ldwordf + udwordf;
855 uint32_t result1 = *
reinterpret_cast<uint32_t*
>(&dword1);
856 uint32_t result2 = *
reinterpret_cast<uint32_t*
>(&dword2);
858 vdst[lane] = (
static_cast<uint64_t
>(result2) << 32) | result1;
895 "Negative modifier undefined for %s",
_opcode);
900 uint64_t lower_dword = (opsel & 1) ?
bits(src0[lane], 63, 32)
901 :
bits(src0[lane], 31, 0);
902 uint64_t upper_dword = (opsel & 2) ?
bits(src1[lane], 63, 32)
903 :
bits(src1[lane], 31, 0);
905 vdst[lane] = upper_dword << 32 | lower_dword;
const std::string _opcode
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
~Inst_VOP3P__V_PK_ADD_F32()
Inst_VOP3P__V_PK_ADD_F32(InFmt_VOP3P *)
void execute(GPUDynInstPtr) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
Inst_VOP3P__V_PK_FMA_F32(InFmt_VOP3P *)
~Inst_VOP3P__V_PK_FMA_F32()
void execute(GPUDynInstPtr) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr) override
Inst_VOP3P__V_PK_MOV_B32(InFmt_VOP3P *)
~Inst_VOP3P__V_PK_MOV_B32()
void execute(GPUDynInstPtr gpuDynInst) override
~Inst_VOP3P__V_PK_MUL_F32()
Inst_VOP3P__V_PK_MUL_F32(InFmt_VOP3P *)
void execute(GPUDynInstPtr) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void execute(GPUDynInstPtr gpuDynInst) override
void dotHelper(GPUDynInstPtr gpuDynInst, uint32_t(*fOpImpl)(uint32_t, uint32_t, uint32_t, bool))
void vop3pHelper(GPUDynInstPtr gpuDynInst, T(*fOpImpl)(T, T, bool))
void readSrc()
Certain vector operands can read from the VRF/SRF or constants.
void write() override
Write to the VRF.
Floating-point library code, which will gradually replace vfp.hh.
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
constexpr uint64_t sext(uint64_t val)
Sign-extend an N-bit value to 64 bits.
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
#define warn_if(cond,...)
Conditional warning macro that checks the supplied condition and only prints a warning if the conditi...
uint16_t fplibMax(uint16_t op1, uint16_t op2, FPSCR &fpscr)
uint16_t fplibConvert(uint32_t op, FPRounding rounding, FPSCR &fpscr)
constexpr unsigned NumVecElemPerVecReg
uint16_t fplibAdd(uint16_t op1, uint16_t op2, FPSCR &fpscr)
uint16_t fplibMul(uint16_t op1, uint16_t op2, FPSCR &fpscr)
uint16_t fplibMulAdd(uint16_t addend, uint16_t op1, uint16_t op2, FPSCR &fpscr)
uint16_t fplibMin(uint16_t op1, uint16_t op2, FPSCR &fpscr)
int16_t clampI16(int32_t value, bool clamp)
int32_t dotClampI(int32_t value, bool clamp)
float clampF32(float value, bool clamp)
uint16_t clampF16(uint16_t value, bool clamp)
uint32_t dotClampU(uint32_t value, bool clamp)
uint16_t clampU16(uint32_t value, bool clamp)
Copyright (c) 2024 - Pranith Kumar Copyright (c) 2020 Inria All rights reserved.
std::shared_ptr< GPUDynInst > GPUDynInstPtr