34 #ifndef __ARCH_GCN3_INSTS_INST_UTIL_HH__
35 #define __ARCH_GCN3_INSTS_INST_UTIL_HH__
175 bool sign_bit = (
val & 0x80000000) != 0;
183 for (
int i = 0; i < std::numeric_limits<ScalarRegU32>::digits; ++
i) {
184 tmp_val =
val & (0x80000000 >>
i);
211 bool sign_bit = (
val & 0x8000000000000000
ULL) != 0;
219 for (
int i = 0; i < std::numeric_limits<ScalarRegU64>::digits; ++
i) {
220 tmp_val =
val & (0x8000000000000000
ULL >>
i);
247 if (std::is_floating_point<T>::value) {
248 return std::fmax(std::fmin(val_0, val_1),
249 std::fmin(std::fmax(val_0, val_1), val_2));
251 return std::max(std::min(val_0, val_1),
252 std::min(std::max(val_0, val_1), val_2));
256 template <
typename T>
260 T nearest_round = std::floor(
val + 0.5);
261 if ((
int)std::floor(
val) % 2 == 0
262 && std::modf(std::abs(
val), &int_part) == 0.5) {
263 nearest_round = nearest_round - 1;
266 return nearest_round;
273 __uint128_t u0 = (__uint128_t)val_0;
274 __uint128_t u1 = (__uint128_t)val_1;
275 __uint128_t u2 = (__uint128_t)val_2;
276 __uint128_t result = u0 * u1 + u2;
287 __int128_t u0 = (__int128_t)val_0;
288 __int128_t u1 = (__int128_t)val_1;
289 __int128_t u2 = (__int128_t)val_2;
290 __int128_t result = u0 * u1 + u2;
318 int rowOffset,
bool & outOfBounds)
322 int newLane = currLane;
325 int localRowOffset = rowOffset;
326 int localRowNum = rowNum;
329 int quadBase = (currLane & ~(3));
330 int quadPix = (currLane & 3);
331 quadPix = ((dppCtrl >> (2 * quadPix)) & 3);
332 newLane = (quadBase | quadPix);
334 panic(
"ERROR: instruction using reserved DPP_CTRL value\n");
338 if ((localRowOffset +
count >= 0) &&
340 localRowOffset +=
count;
341 newLane = (rowNum | localRowOffset);
348 if ((localRowOffset +
count >= 0) &&
350 localRowOffset +=
count;
351 newLane = (rowNum | localRowOffset);
359 newLane = (rowNum | localRowOffset);
373 int currVal = (currLane +
count);
384 localRowOffset = (15 - localRowOffset);
385 newLane = (rowNum | localRowOffset);
387 localRowNum = (currLane & -0x7);
388 localRowOffset = (currLane & 0x7);
389 localRowOffset = (7 - localRowNum);
390 newLane = (localRowNum | localRowOffset);
393 if (currLane >
count) {
394 newLane = (currLane & ~
count) - 1;
398 if (currLane >
count) {
399 newLane = (currLane & ~
count) - 1;
402 panic(
"Unimplemented DPP control operation: %d\n", dppCtrl);
423 int rowNum = 0, bankNum = 0, rowOffset = 0;
426 bool laneDisabled =
false;
428 bool outOfBounds =
false, zeroSrc =
false;
429 long long threadValid = 0;
447 threadValid = (0x1
LL << lane);
457 if (((rowMask & (0x1 << rowNum)) == 0) ||
458 ((bankMask & (0x1 << bankNum)) == 0) ) {
480 outLane =
dppInstImpl(dppCtrl, lane, rowNum, rowOffset,
491 }
else if (outOfBounds) {
492 if (boundCtrl == 1) {
497 }
else if (!gpuDynInst->exec_mask[lane]) {
498 if (boundCtrl == 1) {
505 if (threadValid != 0 && !outOfBounds && !zeroSrc) {
506 assert(!laneDisabled);
507 src0[outLane] = src0[lane];
508 }
else if (zeroSrc) {
513 laneDisabled =
false;
557 int low_bit = 0, high_bit = 0;
558 bool signExt_local = signExt;
574 retVal =
bits(currOperVal, high_bit, low_bit);
578 bits(origOperVal, high_bit),
579 "ERROR: SDWA byte update not propagated: retVal: %d, "
581 bits(origOperVal, high_bit));
583 signExt_local = (signExt &&
589 retVal = (uint32_t)sext<Gcn3ISA::MSB_PER_BYTE>(retVal);
599 retVal =
bits(currOperVal, high_bit, low_bit);
603 bits(origOperVal, high_bit),
604 "ERROR: SDWA word update not propagated: retVal: %d, "
607 bits(origOperVal, high_bit));
609 signExt_local = (signExt &&
616 retVal = (uint32_t)sext<Gcn3ISA::MSB_PER_WORD>(retVal);
620 panic(
"Unimplemented SDWA select operation: %d\n",
sel);
652 origCurrOper[lane],
sel,
670 int low_bit = 0, high_bit = 0;
674 T retVal = 0, origBits_thisByte = 0, currBits_thisByte = 0,
675 origBits_thisWord = 0, currBits_thisWord = 0, newBits = 0;
689 signExt = (signExt &&
692 for (
int byte = 0;
byte < 4; ++byte) {
703 origBits_thisByte =
bits(origDstVal, high_bit, low_bit);
704 currBits_thisByte =
bits(currDstVal, high_bit, low_bit);
705 newBits = ((
byte ==
sel) ? origBits_thisByte :
706 ((preserve) ? currBits_thisByte :
707 (((
byte >
sel) && signExt) ? 0xff : 0)));
708 retVal =
insertBits(retVal, high_bit, low_bit, newBits);
714 signExt = (signExt &&
715 (
bits(currDstVal, high_bit, low_bit) & 0x8000));
728 origBits_thisWord =
bits(origDstVal, high_bit, low_bit);
729 currBits_thisWord =
bits(currDstVal, high_bit, low_bit);
730 newBits = ((
word == (
sel & 0x1)) ? origBits_thisWord :
731 ((preserve) ? currBits_thisWord :
732 (((
word > (
sel & 0x1)) && signExt) ? 0xffff : 0)));
733 retVal =
insertBits(retVal, high_bit, low_bit, newBits);
737 panic(
"Unimplemented SDWA select operation: %d\n",
sel);
773 origDstOper[lane], clamp,
774 sel, unusedBits_format);
789 const bool src_signExt,
const bool src_abs,
800 currSrc.negModifier();
804 currSrc.absModifier();
826 const bool src0_signExt = sdwaInst.
SRC0_SEXT;
827 const bool src0_neg = sdwaInst.
SRC0_NEG;
828 const bool src0_abs = sdwaInst.
SRC0_ABS;
851 T & src1, T & origSrc1)
855 const bool src0_signExt = sdwaInst.
SRC0_SEXT;
856 const bool src0_neg = sdwaInst.
SRC0_NEG;
857 const bool src0_abs = sdwaInst.
SRC0_ABS;
859 const bool src1_signExt = sdwaInst.
SRC1_SEXT;
860 const bool src1_neg = sdwaInst.
SRC1_NEG;
861 const bool src1_abs = sdwaInst.
SRC1_ABS;
884 const bool clamp = sdwaInst.
CLAMP;
894 #endif // __ARCH_GCN3_INSTS_INST_UTIL_HH__