32 #ifndef __ARCH_GCN3_INSTS_INST_UTIL_HH__
33 #define __ARCH_GCN3_INSTS_INST_UTIL_HH__
176 bool sign_bit = (
val & 0x80000000) != 0;
184 for (
int i = 0; i < std::numeric_limits<ScalarRegU32>::digits; ++
i) {
185 tmp_val =
val & (0x80000000 >>
i);
212 bool sign_bit = (
val & 0x8000000000000000ULL) != 0;
220 for (
int i = 0; i < std::numeric_limits<ScalarRegU64>::digits; ++
i) {
221 tmp_val =
val & (0x8000000000000000ULL >>
i);
248 if (std::is_floating_point_v<T>) {
249 return std::fmax(std::fmin(val_0, val_1),
250 std::fmin(std::fmax(val_0, val_1), val_2));
252 return std::max(std::min(val_0, val_1),
253 std::min(std::max(val_0, val_1), val_2));
257 template <
typename T>
261 T nearest_round = std::floor(
val + 0.5);
262 if ((
int)std::floor(
val) % 2 == 0
263 && std::modf(std::abs(
val), &int_part) == 0.5) {
264 nearest_round = nearest_round - 1;
267 return nearest_round;
274 __uint128_t u0 = (__uint128_t)val_0;
275 __uint128_t u1 = (__uint128_t)val_1;
276 __uint128_t u2 = (__uint128_t)val_2;
277 __uint128_t result = u0 * u1 + u2;
288 __int128_t u0 = (__int128_t)val_0;
289 __int128_t u1 = (__int128_t)val_1;
290 __int128_t u2 = (__int128_t)val_2;
291 __int128_t result = u0 * u1 + u2;
319 int rowOffset,
bool & outOfBounds)
323 int newLane = currLane;
326 int localRowOffset = rowOffset;
327 int localRowNum = rowNum;
330 int quadBase = (currLane & ~(3));
331 int quadPix = (currLane & 3);
332 quadPix = ((dppCtrl >> (2 * quadPix)) & 3);
333 newLane = (quadBase | quadPix);
335 panic(
"ERROR: instruction using reserved DPP_CTRL value\n");
339 if ((localRowOffset +
count >= 0) &&
341 localRowOffset +=
count;
342 newLane = (rowNum | localRowOffset);
349 if ((localRowOffset +
count >= 0) &&
351 localRowOffset +=
count;
352 newLane = (rowNum | localRowOffset);
360 newLane = (rowNum | localRowOffset);
374 int currVal = (currLane +
count);
385 localRowOffset = (15 - localRowOffset);
386 newLane = (rowNum | localRowOffset);
388 localRowNum = (currLane & -0x7);
389 localRowOffset = (currLane & 0x7);
390 localRowOffset = (7 - localRowNum);
391 newLane = (localRowNum | localRowOffset);
394 if (currLane >
count) {
395 newLane = (currLane & ~
count) - 1;
399 if (currLane >
count) {
400 newLane = (currLane & ~
count) - 1;
403 panic(
"Unimplemented DPP control operation: %d\n", dppCtrl);
424 int rowNum = 0, bankNum = 0, rowOffset = 0;
427 bool laneDisabled =
false;
429 bool outOfBounds =
false, zeroSrc =
false;
430 long long threadValid = 0;
448 threadValid = (0x1LL << lane);
458 if (((rowMask & (0x1 << rowNum)) == 0) ||
459 ((bankMask & (0x1 << bankNum)) == 0) ) {
481 outLane =
dppInstImpl(dppCtrl, lane, rowNum, rowOffset,
492 }
else if (outOfBounds) {
493 if (boundCtrl == 1) {
498 }
else if (!gpuDynInst->exec_mask[lane]) {
499 if (boundCtrl == 1) {
506 if (threadValid != 0 && !outOfBounds && !zeroSrc) {
507 assert(!laneDisabled);
508 src0[outLane] = src0[lane];
509 }
else if (zeroSrc) {
514 laneDisabled =
false;
558 int low_bit = 0, high_bit = 0;
559 bool signExt_local = signExt;
575 retVal =
bits(currOperVal, high_bit, low_bit);
579 bits(origOperVal, high_bit),
580 "ERROR: SDWA byte update not propagated: retVal: %d, "
582 bits(origOperVal, high_bit));
584 signExt_local = (signExt &&
590 retVal = (uint32_t)sext<Gcn3ISA::MSB_PER_BYTE>(retVal);
600 retVal =
bits(currOperVal, high_bit, low_bit);
604 bits(origOperVal, high_bit),
605 "ERROR: SDWA word update not propagated: retVal: %d, "
608 bits(origOperVal, high_bit));
610 signExt_local = (signExt &&
617 retVal = (uint32_t)sext<Gcn3ISA::MSB_PER_WORD>(retVal);
621 panic(
"Unimplemented SDWA select operation: %d\n",
sel);
653 origCurrOper[lane],
sel,
671 int low_bit = 0, high_bit = 0;
675 T retVal = 0, origBits_thisByte = 0, currBits_thisByte = 0,
676 origBits_thisWord = 0, currBits_thisWord = 0, newBits = 0;
690 signExt = (signExt &&
693 for (
int byte = 0;
byte < 4; ++byte) {
704 origBits_thisByte =
bits(origDstVal, high_bit, low_bit);
705 currBits_thisByte =
bits(currDstVal, high_bit, low_bit);
706 newBits = ((
byte ==
sel) ? origBits_thisByte :
707 ((preserve) ? currBits_thisByte :
708 (((
byte >
sel) && signExt) ? 0xff : 0)));
709 retVal =
insertBits(retVal, high_bit, low_bit, newBits);
715 signExt = (signExt &&
716 (
bits(currDstVal, high_bit, low_bit) & 0x8000));
729 origBits_thisWord =
bits(origDstVal, high_bit, low_bit);
730 currBits_thisWord =
bits(currDstVal, high_bit, low_bit);
731 newBits = ((
word == (
sel & 0x1)) ? origBits_thisWord :
732 ((preserve) ? currBits_thisWord :
733 (((
word > (
sel & 0x1)) && signExt) ? 0xffff : 0)));
734 retVal =
insertBits(retVal, high_bit, low_bit, newBits);
738 panic(
"Unimplemented SDWA select operation: %d\n",
sel);
774 origDstOper[lane], clamp,
775 sel, unusedBits_format);
790 const bool src_signExt,
const bool src_abs,
801 currSrc.negModifier();
805 currSrc.absModifier();
827 const bool src0_signExt = sdwaInst.
SRC0_SEXT;
828 const bool src0_neg = sdwaInst.
SRC0_NEG;
829 const bool src0_abs = sdwaInst.
SRC0_ABS;
852 T & src1, T & origSrc1)
856 const bool src0_signExt = sdwaInst.
SRC0_SEXT;
857 const bool src0_neg = sdwaInst.
SRC0_NEG;
858 const bool src0_abs = sdwaInst.
SRC0_ABS;
860 const bool src1_signExt = sdwaInst.
SRC1_SEXT;
861 const bool src1_neg = sdwaInst.
SRC1_NEG;
862 const bool src1_abs = sdwaInst.
SRC1_ABS;
885 const bool clamp = sdwaInst.
CLAMP;
896 #endif // __ARCH_GCN3_INSTS_INST_UTIL_HH__