36 #ifndef __ARCH_GCN3_INSTS_INST_UTIL_HH__
37 #define __ARCH_GCN3_INSTS_INST_UTIL_HH__
177 bool sign_bit = (
val & 0x80000000) != 0;
185 for (
int i = 0; i < std::numeric_limits<ScalarRegU32>::digits; ++
i) {
186 tmp_val =
val & (0x80000000 >>
i);
213 bool sign_bit = (
val & 0x8000000000000000
ULL) != 0;
221 for (
int i = 0; i < std::numeric_limits<ScalarRegU64>::digits; ++
i) {
222 tmp_val =
val & (0x8000000000000000
ULL >>
i);
249 if (std::is_floating_point<T>::value) {
250 return std::fmax(std::fmin(val_0, val_1),
251 std::fmin(std::fmax(val_0, val_1), val_2));
253 return std::max(std::min(val_0, val_1),
254 std::min(std::max(val_0, val_1), val_2));
258 template <
typename T>
262 T nearest_round = std::floor(
val + 0.5);
263 if ((
int)std::floor(
val) % 2 == 0
264 && std::modf(std::abs(
val), &int_part) == 0.5) {
265 nearest_round = nearest_round - 1;
268 return nearest_round;
275 __uint128_t u0 = (__uint128_t)val_0;
276 __uint128_t u1 = (__uint128_t)val_1;
277 __uint128_t u2 = (__uint128_t)val_2;
278 __uint128_t result = u0 * u1 + u2;
289 __int128_t u0 = (__int128_t)val_0;
290 __int128_t u1 = (__int128_t)val_1;
291 __int128_t u2 = (__int128_t)val_2;
292 __int128_t result = u0 * u1 + u2;
320 int rowOffset,
bool & outOfBounds)
324 int newLane = currLane;
327 int localRowOffset = rowOffset;
328 int localRowNum = rowNum;
331 int quadBase = (currLane & ~(3));
332 int quadPix = (currLane & 3);
333 quadPix = ((dppCtrl >> (2 * quadPix)) & 3);
334 newLane = (quadBase | quadPix);
336 panic(
"ERROR: instruction using reserved DPP_CTRL value\n");
340 if ((localRowOffset +
count >= 0) &&
342 localRowOffset +=
count;
343 newLane = (rowNum | localRowOffset);
350 if ((localRowOffset +
count >= 0) &&
352 localRowOffset +=
count;
353 newLane = (rowNum | localRowOffset);
361 newLane = (rowNum | localRowOffset);
375 int currVal = (currLane +
count);
386 localRowOffset = (15 - localRowOffset);
387 newLane = (rowNum | localRowOffset);
389 localRowNum = (currLane & -0x7);
390 localRowOffset = (currLane & 0x7);
391 localRowOffset = (7 - localRowNum);
392 newLane = (localRowNum | localRowOffset);
395 if (currLane >
count) {
396 newLane = (currLane & ~
count) - 1;
400 if (currLane >
count) {
401 newLane = (currLane & ~
count) - 1;
404 panic(
"Unimplemented DPP control operation: %d\n", dppCtrl);
425 int rowNum = 0, bankNum = 0, rowOffset = 0;
428 bool laneDisabled =
false;
430 bool outOfBounds =
false, zeroSrc =
false;
431 long long threadValid = 0;
449 threadValid = (0x1
LL << lane);
459 if (((rowMask & (0x1 << rowNum)) == 0) ||
460 ((bankMask & (0x1 << bankNum)) == 0) ) {
482 outLane =
dppInstImpl(dppCtrl, lane, rowNum, rowOffset,
493 }
else if (outOfBounds) {
494 if (boundCtrl == 1) {
499 }
else if (!gpuDynInst->exec_mask[lane]) {
500 if (boundCtrl == 1) {
507 if (threadValid != 0 && !outOfBounds && !zeroSrc) {
508 assert(!laneDisabled);
509 src0[outLane] = src0[lane];
510 }
else if (zeroSrc) {
515 laneDisabled =
false;
559 int low_bit = 0, high_bit = 0;
560 bool signExt_local = signExt;
576 retVal =
bits(currOperVal, high_bit, low_bit);
580 bits(origOperVal, high_bit),
581 "ERROR: SDWA byte update not propagated: retVal: %d, "
583 bits(origOperVal, high_bit));
585 signExt_local = (signExt &&
591 retVal = (uint32_t)sext<Gcn3ISA::MSB_PER_BYTE>(retVal);
601 retVal =
bits(currOperVal, high_bit, low_bit);
605 bits(origOperVal, high_bit),
606 "ERROR: SDWA word update not propagated: retVal: %d, "
609 bits(origOperVal, high_bit));
611 signExt_local = (signExt &&
618 retVal = (uint32_t)sext<Gcn3ISA::MSB_PER_WORD>(retVal);
622 panic(
"Unimplemented SDWA select operation: %d\n",
sel);
654 origCurrOper[lane],
sel,
672 int low_bit = 0, high_bit = 0;
676 T retVal = 0, origBits_thisByte = 0, currBits_thisByte = 0,
677 origBits_thisWord = 0, currBits_thisWord = 0, newBits = 0;
691 signExt = (signExt &&
694 for (
int byte = 0;
byte < 4; ++byte) {
705 origBits_thisByte =
bits(origDstVal, high_bit, low_bit);
706 currBits_thisByte =
bits(currDstVal, high_bit, low_bit);
707 newBits = ((
byte ==
sel) ? origBits_thisByte :
708 ((preserve) ? currBits_thisByte :
709 (((
byte >
sel) && signExt) ? 0xff : 0)));
710 retVal =
insertBits(retVal, high_bit, low_bit, newBits);
716 signExt = (signExt &&
717 (
bits(currDstVal, high_bit, low_bit) & 0x8000));
730 origBits_thisWord =
bits(origDstVal, high_bit, low_bit);
731 currBits_thisWord =
bits(currDstVal, high_bit, low_bit);
732 newBits = ((
word == (
sel & 0x1)) ? origBits_thisWord :
733 ((preserve) ? currBits_thisWord :
734 (((
word > (
sel & 0x1)) && signExt) ? 0xffff : 0)));
735 retVal =
insertBits(retVal, high_bit, low_bit, newBits);
739 panic(
"Unimplemented SDWA select operation: %d\n",
sel);
775 origDstOper[lane], clamp,
776 sel, unusedBits_format);
791 const bool src_signExt,
const bool src_abs,
802 currSrc.negModifier();
806 currSrc.absModifier();
828 const bool src0_signExt = sdwaInst.
SRC0_SEXT;
829 const bool src0_neg = sdwaInst.
SRC0_NEG;
830 const bool src0_abs = sdwaInst.
SRC0_ABS;
853 T & src1, T & origSrc1)
857 const bool src0_signExt = sdwaInst.
SRC0_SEXT;
858 const bool src0_neg = sdwaInst.
SRC0_NEG;
859 const bool src0_abs = sdwaInst.
SRC0_ABS;
861 const bool src1_signExt = sdwaInst.
SRC1_SEXT;
862 const bool src1_neg = sdwaInst.
SRC1_NEG;
863 const bool src1_abs = sdwaInst.
SRC1_ABS;
886 const bool clamp = sdwaInst.
CLAMP;
896 #endif // __ARCH_GCN3_INSTS_INST_UTIL_HH__