36 #ifndef __ARCH_GCN3_INSTS_INST_UTIL_HH__ 37 #define __ARCH_GCN3_INSTS_INST_UTIL_HH__ 95 for (T
bits = val; mask != 0; mask <<= 4)
96 if ((
bits & mask) != 0)
110 for (T
bits = val; mask != 0; mask <<= 4, qbit <<= 1) {
124 = std::numeric_limits<T>::digits -
popCount(val);
170 return std::numeric_limits<T>::digits - 1 -
findMsbSet(val);
177 bool sign_bit = (val & 0x80000000) != 0;
181 if (!val || val == -1) {
185 for (
int i = 0; i < std::numeric_limits<ScalarRegU32>::digits; ++
i) {
186 tmp_val = val & (0x80000000 >>
i);
213 bool sign_bit = (val & 0x8000000000000000
ULL) != 0;
217 if (!val || val == -1) {
221 for (
int i = 0; i < std::numeric_limits<ScalarRegU64>::digits; ++
i) {
222 tmp_val = val & (0x8000000000000000
ULL >>
i);
249 if (std::is_floating_point<T>::value) {
250 return std::fmax(std::fmin(val_0, val_1),
251 std::fmin(std::fmax(val_0, val_1), val_2));
253 return std::max(std::min(val_0, val_1),
254 std::min(std::max(val_0, val_1), val_2));
258 template <
typename T>
261 T nearest_round = std::round(val * 0.5) * 2.0;
262 return nearest_round;
269 __uint128_t u0 = (__uint128_t)val_0;
270 __uint128_t u1 = (__uint128_t)val_1;
271 __uint128_t u2 = (__uint128_t)val_2;
272 __uint128_t result = u0 * u1 + u2;
283 __int128_t u0 = (__int128_t)val_0;
284 __int128_t u1 = (__int128_t)val_1;
285 __int128_t u2 = (__int128_t)val_2;
286 __int128_t result = u0 * u1 + u2;
314 int rowOffset,
bool & outOfBounds)
318 int newLane = currLane;
321 int localRowOffset = rowOffset;
322 int localRowNum = rowNum;
325 int quadBase = (currLane & ~(3));
326 int quadPix = (currLane & 3);
327 quadPix = ((dppCtrl >> (2 * quadPix)) & 3);
328 newLane = (quadBase | quadPix);
330 panic(
"ERROR: instruction using reserved DPP_CTRL value\n");
334 if ((localRowOffset + count >= 0) &&
335 (localRowOffset + count <
ROW_SIZE)) {
336 localRowOffset +=
count;
337 newLane = (rowNum | localRowOffset);
344 if ((localRowOffset + count >= 0) &&
345 (localRowOffset + count <
ROW_SIZE)) {
346 localRowOffset +=
count;
347 newLane = (rowNum | localRowOffset);
355 newLane = (rowNum | localRowOffset);
369 int currVal = (currLane +
count);
380 localRowOffset = (15 - localRowOffset);
381 newLane = (rowNum | localRowOffset);
383 localRowNum = (currLane & -0x7);
384 localRowOffset = (currLane & 0x7);
385 localRowOffset = (7 - localRowNum);
386 newLane = (localRowNum | localRowOffset);
389 if (currLane > count) {
390 newLane = (currLane & ~count) - 1;
394 if (currLane > count) {
395 newLane = (currLane & ~count) - 1;
398 panic(
"Unimplemented DPP control operation: %d\n", dppCtrl);
419 int rowNum = 0, bankNum = 0, rowOffset = 0;
422 bool laneDisabled =
false;
424 bool outOfBounds =
false, zeroSrc =
false;
425 long long threadValid = 0;
443 threadValid = (0x1
LL << lane);
453 if (((rowMask & (0x1 << rowNum)) == 0) ||
454 ((bankMask & (0x1 << bankNum)) == 0) ) {
476 outLane =
dppInstImpl(dppCtrl, lane, rowNum, rowOffset,
487 }
else if (outOfBounds) {
488 if (boundCtrl == 1) {
493 }
else if (!gpuDynInst->exec_mask[lane]) {
494 if (boundCtrl == 1) {
501 if (threadValid != 0 && !outOfBounds && !zeroSrc) {
502 assert(!laneDisabled);
503 src0[outLane] = src0[lane];
504 }
else if (zeroSrc) {
510 laneDisabled =
false;
554 int first_bit = 0, last_bit = 0;
555 bool signExt_local = signExt;
571 retVal =
bits(currOperVal, first_bit, last_bit);
574 assert(
bits(retVal, Gcn3ISA::MSB_PER_BYTE) ==
576 Gcn3ISA::MSB_PER_BYTE));
578 signExt_local = (signExt &&
579 (
bits(retVal, 0, Gcn3ISA::MSB_PER_BYTE) & 0x80));
584 retVal = (uint32_t)sext<Gcn3ISA::MSB_PER_BYTE>(retVal);
594 retVal =
bits(currOperVal, first_bit, last_bit);
597 assert(
bits(retVal, Gcn3ISA::MSB_PER_WORD) ==
599 Gcn3ISA::MSB_PER_WORD));
601 signExt_local = (signExt &&
602 (
bits(retVal, 0, Gcn3ISA::MSB_PER_WORD) &
608 retVal = (uint32_t)sext<Gcn3ISA::MSB_PER_WORD>(retVal);
612 panic(
"Unimplemented SDWA select operation: %d\n", sel);
646 origCurrOper[lane], sel,
663 int first_bit = 0, last_bit = 0;
667 T retVal = 0, origBits_thisByte = 0, currBits_thisByte = 0,
668 origBits_thisWord = 0, currBits_thisWord = 0, newBits = 0;
682 signExt = (signExt &&
685 for (
int byte = 0; byte < 4; ++byte) {
696 origBits_thisByte =
bits(origDstVal, first_bit, last_bit);
697 currBits_thisByte =
bits(currDstVal, first_bit, last_bit);
698 newBits = ((byte ==
sel) ? origBits_thisByte :
699 ((preserve) ? currBits_thisByte :
700 (((byte >
sel) && signExt) ? 0xff : 0)));
701 retVal =
insertBits(retVal, first_bit, last_bit, newBits);
707 signExt = (signExt &&
708 (
bits(currDstVal, first_bit, last_bit) & 0x8000));
721 origBits_thisWord =
bits(origDstVal, first_bit, last_bit);
722 currBits_thisWord =
bits(currDstVal, first_bit, last_bit);
723 newBits = ((
word == (sel & 0x1)) ? origBits_thisWord :
724 ((preserve) ? currBits_thisWord :
725 (((
word > (sel & 0x1)) && signExt) ? 0xffff : 0)));
726 retVal =
insertBits(retVal, first_bit, last_bit, newBits);
730 panic(
"Unimplemented SDWA select operation: %d\n", sel);
767 origDstOper[lane], clamp,
768 sel, unusedBits_format);
783 bool src_abs,
bool src_neg)
793 currSrc.negModifier();
797 currSrc.absModifier();
816 T & src0, T & origSrc0)
845 T & src0, T & origSrc0, T & src1, T & origSrc1)
873 T & dst, T & origDst)
878 bool clamp = sdwaInst.
CLAMP;
888 #endif // __ARCH_GCN3_INSTS_INST_UTIL_HH__
#define panic(...)
This implements a cprintf based panic() function.
int findLsbSet(uint64_t val)
Returns the bit position of the LSB that is set in the input.
T sdwaInstDstImpl_helper(T currDstVal, T origDstVal, bool clamp, SDWASelVals sel, SDWADstVals unusedBits_format)
sdwaInstDstImpl_helper contains the per-lane code for selecting the appropriate bytes/words of the la...
#define LL(N)
int64_t constant
T median(T val_0, T val_1, T val_2)
static const int NUM_BANKS
ScalarRegI32 countZeroBits(T val)
ScalarRegI32 findFirstZero(T val)
ScalarRegI32 firstOppositeSignBit(ScalarRegI32 val)
int popCount(uint64_t val)
Returns the number of set ones in the provided value.
void sdwaInstSrcImpl(T &currOper, T &origCurrOper, SDWASelVals sel, bool signExt)
sdwaInstSrcImpl is a helper function that selects the appropriate bits/bytes for each lane of the inp...
void sdwaInstDstImpl(T &dstOper, T &origDstOper, bool clamp, SDWASelVals sel, SDWADstVals unusedBits_format)
sdwaInstDstImpl is a helper function that selects the appropriate bits/bytes for the inputted dest o...
void processSDWA_src_helper(T &currSrc, T &origCurrSrc, SDWASelVals src_sel, bool src_signExt, bool src_abs, bool src_neg)
processSDWA_src_helper is a helper function for implementing sub d-word addressing instructions for th...
T sdwaInstSrcImpl_helper(T currOperVal, T origOperVal, SDWASelVals sel, bool signExt)
sdwaInstSrcImpl_helper contains the per-lane code for selecting the appropriate bytes/words of the la...
void processSDWA_dst(GPUDynInstPtr gpuDynInst, InFmt_VOP_SDWA sdwaInst, T &dst, T &origDst)
processSDWA_dst is a helper function for implementing sub d-word addressing instructions for the dst ...
int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum, int rowOffset, bool &outOfBounds)
dppInstImpl is a helper function that performs the inputted operation on the inputted vector register...
T roundNearestEven(T val)
ScalarRegI32 findFirstOne(T val)
std::shared_ptr< GPUDynInst > GPUDynInstPtr
ScalarRegI32 findFirstOneMsb(T val)
classes that represent vector/scalar operands in GCN3 ISA.
T insertBits(T val, int first, int last, B bit_val)
Returns val with bits first to last set to the LSBs of bit_val.
void processSDWA_src(GPUDynInstPtr gpuDynInst, InFmt_VOP_SDWA sdwaInst, T &src0, T &origSrc0)
processSDWA_src is a helper function for implementing sub d-word addressing instructions for the src ...
ScalarRegI32 countZeroBitsMsb(T val)
#define ULL(N)
uint64_t constant
VecElemU32 muladd(VecElemU64 &dst, VecElemU32 val_0, VecElemU32 val_1, VecElemU64 val_2)
static const int ROW_SIZE
int findMsbSet(uint64_t val)
Returns the bit position of the MSB that is set in the input.
T bits(T val, int first, int last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it...
void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst, T &src0)
processDPP is a helper function for implementing Data Parallel Primitive instructions.
const int NumVecElemPerVecReg(64)