32 #ifndef __ARCH_VEGA_INSTS_INST_UTIL_HH__
33 #define __ARCH_VEGA_INSTS_INST_UTIL_HH__
176 bool sign_bit = (
val & 0x80000000) != 0;
184 for (
int i = 0; i < std::numeric_limits<ScalarRegU32>::digits; ++
i) {
185 tmp_val =
val & (0x80000000 >>
i);
212 bool sign_bit = (
val & 0x8000000000000000ULL) != 0;
220 for (
int i = 0; i < std::numeric_limits<ScalarRegU64>::digits; ++
i) {
221 tmp_val =
val & (0x8000000000000000ULL >>
i);
248 if (std::is_floating_point_v<T>) {
249 return std::fmax(std::fmin(val_0, val_1),
250 std::fmin(std::fmax(val_0, val_1), val_2));
252 return std::max(std::min(val_0, val_1),
253 std::min(std::max(val_0, val_1), val_2));
257 template <
typename T>
261 T nearest_round = std::floor(
val + 0.5);
262 if ((
int)std::floor(
val) % 2 == 0
263 && std::modf(std::abs(
val), &int_part) == 0.5) {
264 nearest_round = nearest_round - 1;
267 return nearest_round;
274 __uint128_t u0 = (__uint128_t)val_0;
275 __uint128_t u1 = (__uint128_t)val_1;
276 __uint128_t u2 = (__uint128_t)val_2;
277 __uint128_t result = u0 * u1 + u2;
288 __int128_t u0 = (__int128_t)val_0;
289 __int128_t u1 = (__int128_t)val_1;
290 __int128_t u2 = (__int128_t)val_2;
291 __int128_t result = u0 * u1 + u2;
319 int rowOffset,
bool & outOfBounds)
323 int newLane = currLane;
327 int localRowOffset = rowOffset;
328 int localRowNum = rowNum;
331 int quadBase = (currLane & ~(3));
332 int quadPix = (currLane & 3);
333 quadPix = ((dppCtrl >> (2 * quadPix)) & 3);
334 newLane = (quadBase | quadPix);
336 panic(
"ERROR: instruction using reserved DPP_CTRL value\n");
340 if ((localRowOffset +
count >= 0) &&
342 localRowOffset +=
count;
343 newLane = ((rowNum *
ROW_SIZE) | localRowOffset);
350 if ((localRowOffset +
count >= 0) &&
352 localRowOffset +=
count;
353 newLane = ((rowNum *
ROW_SIZE) | localRowOffset);
361 newLane = ((rowNum *
ROW_SIZE) | localRowOffset);
372 int currVal = (currLane - 1);
382 localRowOffset = (15 - localRowOffset);
383 newLane = (rowNum | localRowOffset);
385 localRowNum = (currLane & -0x7);
386 localRowOffset = (currLane & 0x7);
387 localRowOffset = (7 - localRowNum);
388 newLane = (localRowNum | localRowOffset);
391 if (currLane >
count) {
395 newLane = (currLane & 0x30) - 1;
401 if (currLane >
count) {
405 newLane = (currLane & 0x20) - 1;
410 panic(
"Unimplemented DPP control operation: %d\n", dppCtrl);
427 int boundCtrl = dppInst.
BC;
431 int rowNum = 0, bankNum = 0, rowOffset = 0;
434 bool laneDisabled =
false;
436 bool outOfBounds =
false, zeroSrc =
false;
437 long long threadValid = 0;
458 threadValid = (0x1LL << lane);
468 if (((rowMask & (0x1 << rowNum)) == 0) ||
469 ((bankMask & (0x1 << bankNum)) == 0) ) {
490 outLane =
dppInstImpl(dppCtrl, lane, rowNum, rowOffset,
501 }
else if (outOfBounds) {
502 if (boundCtrl == 1) {
507 }
else if (!gpuDynInst->wavefront()->execMask(lane)) {
508 if (boundCtrl == 1) {
515 if (threadValid != 0 && !outOfBounds && !zeroSrc) {
516 assert(!laneDisabled);
517 src0[lane] = src0_copy[outLane];
518 }
else if (zeroSrc) {
523 laneDisabled =
false;
569 int low_bit = 0, high_bit = 0;
570 bool signExt_local = signExt;
586 retVal =
bits(currOperVal, high_bit, low_bit);
590 bits(origOperVal, high_bit),
591 "ERROR: SDWA byte update not propagated: retVal: %d, "
593 bits(origOperVal, high_bit));
595 signExt_local = (signExt &&
601 retVal = (uint32_t)sext<VegaISA::MSB_PER_BYTE>(retVal);
611 retVal =
bits(currOperVal, high_bit, low_bit);
615 bits(origOperVal, high_bit),
616 "ERROR: SDWA word update not propagated: retVal: %d, "
619 bits(origOperVal, high_bit));
621 signExt_local = (signExt &&
628 retVal = (uint32_t)sext<VegaISA::MSB_PER_WORD>(retVal);
632 panic(
"Unimplemented SDWA select operation: %d\n",
sel);
664 origCurrOper[lane],
sel,
682 int low_bit = 0, high_bit = 0;
686 T retVal = 0, origBits_thisByte = 0, currBits_thisByte = 0,
687 origBits_thisWord = 0, currBits_thisWord = 0, newBits = 0;
701 signExt = (signExt &&
704 for (
int byte = 0;
byte < 4; ++byte) {
715 origBits_thisByte =
bits(origDstVal, high_bit, low_bit);
716 currBits_thisByte =
bits(currDstVal, high_bit, low_bit);
717 newBits = ((
byte ==
sel) ? origBits_thisByte :
718 ((preserve) ? currBits_thisByte :
719 (((
byte >
sel) && signExt) ? 0xff : 0)));
720 retVal =
insertBits(retVal, high_bit, low_bit, newBits);
726 signExt = (signExt &&
727 (
bits(currDstVal, high_bit, low_bit) & 0x8000));
740 origBits_thisWord =
bits(origDstVal, high_bit, low_bit);
741 currBits_thisWord =
bits(currDstVal, high_bit, low_bit);
742 newBits = ((
word == (
sel & 0x1)) ? origBits_thisWord :
743 ((preserve) ? currBits_thisWord :
744 (((
word > (
sel & 0x1)) && signExt) ? 0xffff : 0)));
745 retVal =
insertBits(retVal, high_bit, low_bit, newBits);
749 panic(
"Unimplemented SDWA select operation: %d\n",
sel);
785 origDstOper[lane], clamp,
786 sel, unusedBits_format);
801 const bool src_signExt,
const bool src_abs,
812 currSrc.negModifier();
816 currSrc.absModifier();
838 const bool src0_signExt = sdwaInst.
SRC0_SEXT;
839 const bool src0_neg = sdwaInst.
SRC0_NEG;
840 const bool src0_abs = sdwaInst.
SRC0_ABS;
863 T & src1, T & origSrc1)
867 const bool src0_signExt = sdwaInst.
SRC0_SEXT;
868 const bool src0_neg = sdwaInst.
SRC0_NEG;
869 const bool src0_abs = sdwaInst.
SRC0_ABS;
871 const bool src1_signExt = sdwaInst.
SRC1_SEXT;
872 const bool src1_neg = sdwaInst.
SRC1_NEG;
873 const bool src1_abs = sdwaInst.
SRC1_ABS;
896 const bool clamp = sdwaInst.
CLMP;
907 #endif // __ARCH_VEGA_INSTS_INST_UTIL_HH__