32#ifndef __ARCH_VEGA_INSTS_INST_UTIL_HH__
33#define __ARCH_VEGA_INSTS_INST_UTIL_HH__
177 bool sign_bit = (
val & 0x80000000) != 0;
185 for (
int i = 0; i < std::numeric_limits<ScalarRegU32>::digits; ++
i) {
186 tmp_val =
val & (0x80000000 >>
i);
213 bool sign_bit = (
val & 0x8000000000000000ULL) != 0;
221 for (
int i = 0; i < std::numeric_limits<ScalarRegU64>::digits; ++
i) {
222 tmp_val =
val & (0x8000000000000000ULL >>
i);
249 if (std::is_floating_point_v<T>) {
250 return std::fmax(std::fmin(val_0, val_1),
251 std::fmin(std::fmax(val_0, val_1), val_2));
253 return std::max(std::min(val_0, val_1),
254 std::min(std::max(val_0, val_1), val_2));
258 template <
typename T>
262 T nearest_round = std::floor(
val + 0.5);
263 if ((
int)std::floor(
val) % 2 == 0
264 && std::modf(std::abs(
val), &int_part) == 0.5) {
265 nearest_round = nearest_round - 1;
268 return nearest_round;
275 __uint128_t u0 = (__uint128_t)val_0;
276 __uint128_t u1 = (__uint128_t)val_1;
277 __uint128_t u2 = (__uint128_t)val_2;
278 __uint128_t result = u0 * u1 + u2;
289 __int128_t u0 = (__int128_t)val_0;
290 __int128_t u1 = (__int128_t)val_1;
291 __int128_t u2 = (__int128_t)val_2;
292 __int128_t result = u0 * u1 + u2;
321 int rowOffset,
bool & outOfBounds)
325 int newLane = currLane;
329 int localRowOffset = rowOffset;
330 int localRowNum = rowNum;
333 int quadBase = (currLane & ~(3));
334 int quadPix = (currLane & 3);
335 quadPix = ((dppCtrl >> (2 * quadPix)) & 3);
336 newLane = (quadBase | quadPix);
338 panic(
"ERROR: instruction using reserved DPP_CTRL value\n");
342 if ((localRowOffset +
count >= 0) &&
344 localRowOffset +=
count;
345 newLane = ((rowNum *
ROW_SIZE) | localRowOffset);
352 if ((localRowOffset +
count >= 0) &&
354 localRowOffset +=
count;
355 newLane = ((rowNum *
ROW_SIZE) | localRowOffset);
363 newLane = ((rowNum *
ROW_SIZE) | localRowOffset);
374 int currVal = (currLane - 1);
384 localRowOffset = (15 - localRowOffset);
385 newLane = (rowNum | localRowOffset);
387 localRowNum = (currLane & -0x7);
388 localRowOffset = (currLane & 0x7);
389 localRowOffset = (7 - localRowNum);
390 newLane = (localRowNum | localRowOffset);
393 if (currLane >
count) {
397 newLane = (currLane & 0x30) - 1;
403 if (currLane >
count) {
407 newLane = (currLane & 0x20) - 1;
412 panic(
"Unimplemented DPP control operation: %d\n", dppCtrl);
429 int boundCtrl = dppInst.
BC;
433 int rowNum = 0, bankNum = 0, rowOffset = 0;
436 bool laneDisabled =
false;
438 bool outOfBounds =
false, zeroSrc =
false;
439 long long threadValid = 0;
460 threadValid = (0x1LL << lane);
470 if (((rowMask & (0x1 << rowNum)) == 0) ||
471 ((bankMask & (0x1 << bankNum)) == 0) ) {
492 outLane =
dppInstImpl(dppCtrl, lane, rowNum, rowOffset,
503 }
else if (outOfBounds) {
504 if (boundCtrl == 1) {
509 }
else if (!gpuDynInst->wavefront()->execMask(lane)) {
510 if (boundCtrl == 1) {
517 if (threadValid != 0 && !outOfBounds && !zeroSrc) {
518 assert(!laneDisabled);
519 src0[lane] = src0_copy[outLane];
520 }
else if (zeroSrc) {
525 laneDisabled =
false;
571 int low_bit = 0, high_bit = 0;
572 bool signExt_local = signExt;
588 retVal =
bits(currOperVal, high_bit, low_bit);
592 bits(origOperVal, high_bit),
593 "ERROR: SDWA byte update not propagated: retVal: %d, "
595 bits(origOperVal, high_bit));
597 signExt_local = (signExt &&
613 retVal =
bits(currOperVal, high_bit, low_bit);
617 bits(origOperVal, high_bit),
618 "ERROR: SDWA word update not propagated: retVal: %d, "
621 bits(origOperVal, high_bit));
623 signExt_local = (signExt &&
634 panic(
"Unimplemented SDWA select operation: %d\n",
sel);
666 origCurrOper[lane],
sel,
684 int low_bit = 0, high_bit = 0;
688 T retVal = 0, origBits_thisByte = 0, currBits_thisByte = 0,
689 origBits_thisWord = 0, currBits_thisWord = 0, newBits = 0;
703 signExt = (signExt &&
706 for (
int byte = 0;
byte < 4; ++byte) {
718 currBits_thisByte =
bits(currDstVal, high_bit, low_bit);
719 newBits = ((
byte ==
sel) ? origBits_thisByte :
720 ((preserve) ? currBits_thisByte :
721 (((
byte >
sel) && signExt) ? 0xff : 0)));
722 retVal =
insertBits(retVal, high_bit, low_bit, newBits);
728 signExt = (signExt &&
729 (
bits(currDstVal, high_bit, low_bit) & 0x8000));
731 for (
int word = 0; word < 2; ++word) {
743 currBits_thisWord =
bits(currDstVal, high_bit, low_bit);
744 newBits = ((word == (
sel & 0x1)) ? origBits_thisWord :
745 ((preserve) ? currBits_thisWord :
746 (((word > (
sel & 0x1)) && signExt) ? 0xffff : 0)));
747 retVal =
insertBits(retVal, high_bit, low_bit, newBits);
751 panic(
"Unimplemented SDWA select operation: %d\n",
sel);
787 origDstOper[lane], clamp,
788 sel, unusedBits_format);
803 const bool src_signExt,
const bool src_abs,
814 currSrc.negModifier();
818 currSrc.absModifier();
840 const bool src0_signExt = sdwaInst.
SRC0_SEXT;
841 const bool src0_neg = sdwaInst.
SRC0_NEG;
842 const bool src0_abs = sdwaInst.
SRC0_ABS;
865 T & src1, T & origSrc1)
869 const bool src0_signExt = sdwaInst.
SRC0_SEXT;
870 const bool src0_neg = sdwaInst.
SRC0_NEG;
871 const bool src0_abs = sdwaInst.
SRC0_ABS;
873 const bool src1_signExt = sdwaInst.
SRC1_SEXT;
874 const bool src1_neg = sdwaInst.
SRC1_NEG;
875 const bool src1_abs = sdwaInst.
SRC1_ABS;
898 const bool clamp = sdwaInst.
CLMP;
constexpr int findMsbSet(uint64_t val)
Returns the bit position of the MSB that is set in the input.
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
constexpr int popCount(uint64_t val)
Returns the number of set ones in the provided value.
constexpr T insertBits(T val, unsigned first, unsigned last, B bit_val)
Returns val with bits first to last set to the LSBs of bit_val.
constexpr uint64_t sext(uint64_t val)
Sign-extend an N-bit value to 64 bits.
constexpr int findLsbSet(uint64_t val)
Returns the bit position of the LSB that is set in the input That function will either use a builtin ...
#define panic(...)
This implements a cprintf based panic() function.
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
constexpr unsigned NumVecElemPerVecReg
ScalarRegI32 countZeroBitsMsb(T val)
T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal, const bool clamp, const SDWASelVals sel, const SDWADstVals unusedBits_format)
sdwaInstDstImpl_helper contains the per-lane code for selecting the appropriate bytes/words of the la...
ScalarRegI32 firstOppositeSignBit(ScalarRegI32 val)
ScalarRegI32 findFirstZero(T val)
ScalarRegI32 findFirstOne(T val)
T median(T val_0, T val_1, T val_2)
ScalarRegI32 findFirstOneMsb(T val)
T roundNearestEven(T val)
void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T &src0, T &origSrc0)
processSDWA_src is a helper function for implementing sub d-word addressing instructions for the src ...
int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum, int rowOffset, bool &outOfBounds)
dppInstImpl is a helper function that performs the inputted operation on the inputted vector register...
void sdwaInstSrcImpl(T &currOper, T &origCurrOper, const SDWASelVals sel, const bool signExt)
sdwaInstSrcImpl is a helper function that selects the appropriate bits/bytes for each lane of the inp...
void processSDWA_dst(InFmt_VOP_SDWA sdwaInst, T &dst, T &origDst)
processSDWA_dst is a helper function for implementing sub d-word addressing instructions for the dst ...
VecElemU32 muladd(VecElemU64 &dst, VecElemU32 val_0, VecElemU32 val_1, VecElemU64 val_2)
ScalarRegI32 countZeroBits(T val)
T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal, const SDWASelVals sel, const bool signExt)
sdwaInstSrcImpl_helper contains the per-lane code for selecting the appropriate bytes/words of the la...
void processSDWA_src_helper(T &currSrc, T &origCurrSrc, const SDWASelVals src_sel, const bool src_signExt, const bool src_abs, const bool src_neg)
processSDWA_srcHelper is a helper function for implementing sub d-word addressing instructions for th...
void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst, T &src0)
processDPP is a helper function for implementing Data Parallel Primitive instructions.
void sdwaInstDstImpl(T &dstOper, T &origDstOper, const bool clamp, const SDWASelVals sel, const SDWADstVals unusedBits_format)
sdwaInstDestImpl is a helper function that selects the appropriate bits/bytes for the inputted dest o...
Copyright (c) 2024 - Pranith Kumar Copyright (c) 2020 Inria All rights reserved.
static const int NUM_BANKS
std::shared_ptr< GPUDynInst > GPUDynInstPtr
static const int ROW_SIZE