34 #ifndef __ARCH_GCN3_INSTS_INST_UTIL_HH__
35 #define __ARCH_GCN3_INSTS_INST_UTIL_HH__
178 bool sign_bit = (
val & 0x80000000) != 0;
186 for (
int i = 0; i < std::numeric_limits<ScalarRegU32>::digits; ++
i) {
187 tmp_val =
val & (0x80000000 >>
i);
214 bool sign_bit = (
val & 0x8000000000000000ULL) != 0;
222 for (
int i = 0; i < std::numeric_limits<ScalarRegU64>::digits; ++
i) {
223 tmp_val =
val & (0x8000000000000000ULL >>
i);
250 if (std::is_floating_point<T>::value) {
251 return std::fmax(std::fmin(val_0, val_1),
252 std::fmin(std::fmax(val_0, val_1), val_2));
254 return std::max(std::min(val_0, val_1),
255 std::min(std::max(val_0, val_1), val_2));
259 template <
typename T>
263 T nearest_round = std::floor(
val + 0.5);
264 if ((
int)std::floor(
val) % 2 == 0
265 && std::modf(std::abs(
val), &int_part) == 0.5) {
266 nearest_round = nearest_round - 1;
269 return nearest_round;
276 __uint128_t u0 = (__uint128_t)val_0;
277 __uint128_t u1 = (__uint128_t)val_1;
278 __uint128_t u2 = (__uint128_t)val_2;
279 __uint128_t result = u0 * u1 + u2;
290 __int128_t u0 = (__int128_t)val_0;
291 __int128_t u1 = (__int128_t)val_1;
292 __int128_t u2 = (__int128_t)val_2;
293 __int128_t result = u0 * u1 + u2;
321 int rowOffset,
bool & outOfBounds)
325 int newLane = currLane;
328 int localRowOffset = rowOffset;
329 int localRowNum = rowNum;
332 int quadBase = (currLane & ~(3));
333 int quadPix = (currLane & 3);
334 quadPix = ((dppCtrl >> (2 * quadPix)) & 3);
335 newLane = (quadBase | quadPix);
337 panic(
"ERROR: instruction using reserved DPP_CTRL value\n");
341 if ((localRowOffset +
count >= 0) &&
343 localRowOffset +=
count;
344 newLane = (rowNum | localRowOffset);
351 if ((localRowOffset +
count >= 0) &&
353 localRowOffset +=
count;
354 newLane = (rowNum | localRowOffset);
362 newLane = (rowNum | localRowOffset);
376 int currVal = (currLane +
count);
387 localRowOffset = (15 - localRowOffset);
388 newLane = (rowNum | localRowOffset);
390 localRowNum = (currLane & -0x7);
391 localRowOffset = (currLane & 0x7);
392 localRowOffset = (7 - localRowNum);
393 newLane = (localRowNum | localRowOffset);
396 if (currLane >
count) {
397 newLane = (currLane & ~
count) - 1;
401 if (currLane >
count) {
402 newLane = (currLane & ~
count) - 1;
405 panic(
"Unimplemented DPP control operation: %d\n", dppCtrl);
426 int rowNum = 0, bankNum = 0, rowOffset = 0;
429 bool laneDisabled =
false;
431 bool outOfBounds =
false, zeroSrc =
false;
432 long long threadValid = 0;
450 threadValid = (0x1LL << lane);
460 if (((rowMask & (0x1 << rowNum)) == 0) ||
461 ((bankMask & (0x1 << bankNum)) == 0) ) {
483 outLane =
dppInstImpl(dppCtrl, lane, rowNum, rowOffset,
494 }
else if (outOfBounds) {
495 if (boundCtrl == 1) {
500 }
else if (!gpuDynInst->exec_mask[lane]) {
501 if (boundCtrl == 1) {
508 if (threadValid != 0 && !outOfBounds && !zeroSrc) {
509 assert(!laneDisabled);
510 src0[outLane] = src0[lane];
511 }
else if (zeroSrc) {
516 laneDisabled =
false;
560 int low_bit = 0, high_bit = 0;
561 bool signExt_local = signExt;
577 retVal =
bits(currOperVal, high_bit, low_bit);
581 bits(origOperVal, high_bit),
582 "ERROR: SDWA byte update not propagated: retVal: %d, "
584 bits(origOperVal, high_bit));
586 signExt_local = (signExt &&
592 retVal = (uint32_t)sext<Gcn3ISA::MSB_PER_BYTE>(retVal);
602 retVal =
bits(currOperVal, high_bit, low_bit);
606 bits(origOperVal, high_bit),
607 "ERROR: SDWA word update not propagated: retVal: %d, "
610 bits(origOperVal, high_bit));
612 signExt_local = (signExt &&
619 retVal = (uint32_t)sext<Gcn3ISA::MSB_PER_WORD>(retVal);
623 panic(
"Unimplemented SDWA select operation: %d\n",
sel);
655 origCurrOper[lane],
sel,
673 int low_bit = 0, high_bit = 0;
677 T retVal = 0, origBits_thisByte = 0, currBits_thisByte = 0,
678 origBits_thisWord = 0, currBits_thisWord = 0, newBits = 0;
692 signExt = (signExt &&
695 for (
int byte = 0;
byte < 4; ++byte) {
706 origBits_thisByte =
bits(origDstVal, high_bit, low_bit);
707 currBits_thisByte =
bits(currDstVal, high_bit, low_bit);
708 newBits = ((
byte ==
sel) ? origBits_thisByte :
709 ((preserve) ? currBits_thisByte :
710 (((
byte >
sel) && signExt) ? 0xff : 0)));
711 retVal =
insertBits(retVal, high_bit, low_bit, newBits);
717 signExt = (signExt &&
718 (
bits(currDstVal, high_bit, low_bit) & 0x8000));
731 origBits_thisWord =
bits(origDstVal, high_bit, low_bit);
732 currBits_thisWord =
bits(currDstVal, high_bit, low_bit);
733 newBits = ((
word == (
sel & 0x1)) ? origBits_thisWord :
734 ((preserve) ? currBits_thisWord :
735 (((
word > (
sel & 0x1)) && signExt) ? 0xffff : 0)));
736 retVal =
insertBits(retVal, high_bit, low_bit, newBits);
740 panic(
"Unimplemented SDWA select operation: %d\n",
sel);
776 origDstOper[lane], clamp,
777 sel, unusedBits_format);
792 const bool src_signExt,
const bool src_abs,
803 currSrc.negModifier();
807 currSrc.absModifier();
829 const bool src0_signExt = sdwaInst.
SRC0_SEXT;
830 const bool src0_neg = sdwaInst.
SRC0_NEG;
831 const bool src0_abs = sdwaInst.
SRC0_ABS;
854 T & src1, T & origSrc1)
858 const bool src0_signExt = sdwaInst.
SRC0_SEXT;
859 const bool src0_neg = sdwaInst.
SRC0_NEG;
860 const bool src0_abs = sdwaInst.
SRC0_ABS;
862 const bool src1_signExt = sdwaInst.
SRC1_SEXT;
863 const bool src1_neg = sdwaInst.
SRC1_NEG;
864 const bool src1_abs = sdwaInst.
SRC1_ABS;
887 const bool clamp = sdwaInst.
CLAMP;
898 #endif // __ARCH_GCN3_INSTS_INST_UTIL_HH__