35#include "debug/GPUExec.hh"
36#include "debug/GPUInitAbi.hh"
37#include "debug/GPUTrace.hh"
38#include "debug/WavefrontStack.hh"
93 for (
int i = 0;
i < 3; ++
i) {
135 uint32_t firstWave = 0;
136 int orderedAppendTerm = 0;
138 uint32_t finalValue = 0;
141 Addr hidden_priv_base(0);
146 computeUnit->registerManager->mapSgpr(
this, regInitIdx);
150 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
151 "Setting PrivateSegBuffer: s[%d] = %x\n",
157 computeUnit->registerManager->mapSgpr(
this, regInitIdx);
161 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
162 "Setting PrivateSegBuffer: s[%d] = %x\n",
168 computeUnit->registerManager->mapSgpr(
this, regInitIdx);
172 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
173 "Setting PrivateSegBuffer: s[%d] = %x\n",
179 computeUnit->registerManager->mapSgpr(
this, regInitIdx);
184 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
185 "Setting PrivateSegBuffer: s[%d] = %x\n",
192 computeUnit->registerManager->mapSgpr(
this, regInitIdx);
194 bits(host_disp_pkt_addr, 31, 0));
196 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
197 "Setting DispatchPtr: s[%d] = %x\n",
200 bits(host_disp_pkt_addr, 31, 0));
203 computeUnit->registerManager->mapSgpr(
this, regInitIdx);
205 bits(host_disp_pkt_addr, 63, 32));
206 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
207 "Setting DispatchPtr: s[%d] = %x\n",
210 bits(host_disp_pkt_addr, 63, 32));
216 computeUnit->registerManager->mapSgpr(
this, regInitIdx);
220 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
221 "Setting QueuePtr: s[%d] = %x\n",
227 computeUnit->registerManager->mapSgpr(
this, regInitIdx);
230 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
231 "Setting QueuePtr: s[%d] = %x\n",
240 computeUnit->registerManager->mapSgpr(
this, regInitIdx);
242 bits(kernarg_addr, 31, 0));
244 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
245 "Setting KernargSegPtr: s[%d] = %x\n",
248 bits(kernarg_addr, 31, 0));
251 computeUnit->registerManager->mapSgpr(
this, regInitIdx);
253 bits(kernarg_addr, 63, 32));
254 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
255 "Setting KernargSegPtr: s[%d] = %x\n",
258 bits(kernarg_addr, 63, 32));
264 =
computeUnit->registerManager->mapSgpr(
this, regInitIdx);
268 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
269 "Setting DispatchId: s[%d] = %x\n",
276 =
computeUnit->registerManager->mapSgpr(
this, regInitIdx);
282 =
computeUnit->registerManager->mapSgpr(
this, regInitIdx);
284 (TheGpuISA::ScalarRegU32)(task->
amdQueue
287 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
288 "Setting FlatScratch Addr: s[%d] = %x\n",
291 (TheGpuISA::ScalarRegU32)(task->
amdQueue
295 computeUnit->registerManager->mapSgpr(
this, regInitIdx);
301 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
302 "Setting FlatScratch size: s[%d] = %x\n",
332 & 0x000000000000ffff) << 32);
339 =
computeUnit->registerManager->mapSgpr(
this, regInitIdx);
343 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
344 "Setting private segment size: s[%d] = %x\n",
350 DPRINTF(GPUInitAbi,
"Preload %d user SGPRs starting at virtual"
359 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] Setting "
369 computeUnit->registerManager->mapSgpr(
this, regInitIdx);
374 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
375 "Setting WG ID X: s[%d] = %x\n",
381 computeUnit->registerManager->mapSgpr(
this, regInitIdx);
386 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
387 "Setting WG ID Y: s[%d] = %x\n",
393 computeUnit->registerManager->mapSgpr(
this, regInitIdx);
398 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
399 "Setting WG ID Z: s[%d] = %x\n",
408 if (task->
gfxVersion() == GfxVersion::gfx942 ||
410 uint32_t scratchPerWI =
415 + (scratchPerWI * 64 *
wfId);
417 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
418 "Setting architected flat scratch = %x\n",
429 computeUnit->registerManager->mapSgpr(
this, regInitIdx);
449 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
450 "Setting Private Seg Offset: s[%d] = %x\n",
457 firstWave = (
wfId == 0) ? 1 : 0;
458 numWfsInWg =
divCeil(wgSizeInWorkItems,
460 finalValue = firstWave << ((
sizeof(uint32_t) * 8) - 1);
461 finalValue |= (orderedAppendTerm << 6);
462 finalValue |= numWfsInWg;
464 computeUnit->registerManager->mapSgpr(
this, regInitIdx);
466 write(physSgprIdx, finalValue);
469 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
470 "Setting WG Info: s[%d] = %x\n",
475 fatal(
"SGPR enable bit %i not supported\n", en_bit);
491 bool packed_work_item_id =
false;
493 if (task->
gfxVersion() == GfxVersion::gfx90a ||
496 packed_work_item_id =
true;
502 if (packed_work_item_id) {
503 TheGpuISA::VecRegContainerU32 raw_vgpr;
504 TheGpuISA::VecElemU32 *packed_vgpr
505 = raw_vgpr.as<TheGpuISA::VecElemU32>();
507 uint32_t physVgprIdx =
computeUnit->registerManager
508 ->mapVgpr(
this, regInitIdx);
509 for (
int lane = 0; lane <
workItemId[0].size(); ++lane) {
510 packed_vgpr[lane] =
workItemId[0][lane] & 0x3ff;
513 for (
int lane = 0; lane <
workItemId[1].size(); ++lane) {
514 packed_vgpr[lane] |= ((
workItemId[1][lane] & 0x3ff) << 10);
518 for (
int lane = 0; lane <
workItemId[2].size(); ++lane) {
519 packed_vgpr[lane] |= ((
workItemId[2][lane] & 0x3ff) << 20);
532 uint32_t physVgprIdx = 0;
533 TheGpuISA::VecRegContainerU32 raw_vgpr;
539 ->mapVgpr(
this, regInitIdx);
540 TheGpuISA::VecElemU32 *vgpr_x
541 = raw_vgpr.as<TheGpuISA::VecElemU32>();
543 for (
int lane = 0; lane <
workItemId[0].size(); ++lane) {
555 ->mapVgpr(
this, regInitIdx);
556 TheGpuISA::VecElemU32 *vgpr_y
557 = raw_vgpr.as<TheGpuISA::VecElemU32>();
559 for (
int lane = 0; lane <
workItemId[1].size(); ++lane) {
571 mapVgpr(
this, regInitIdx);
572 TheGpuISA::VecElemU32 *vgpr_z
573 = raw_vgpr.as<TheGpuISA::VecElemU32>();
575 for (
int lane = 0; lane <
workItemId[2].size(); ++lane) {
626 "CU%d has been idle for %d ticks at tick %d",
651 if (ii->isGlobalMem() ||
652 (ii->isFlat() && ii->executedAs() == enums::SC_GLOBAL)) {
662 if (ii->isLocalMem() ||
663 (ii->isFlat() && ii->executedAs() == enums::SC_GROUP)) {
692 if (ii->isWaitcnt()) {
694 assert(ii->isScalar());
707 if (
status !=
S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn()
708 || ii->isEndOfKernel() || ii->isBranch() || ii->isALU() ||
709 (ii->isKernArgSeg() && ii->isLoad()))) {
723 ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel()
724 || (ii->isKernArgSeg() && ii->isLoad()))) {
814 if (ii->isReturn() || ii->isBranch() ||
815 ii->isEndOfKernel()) {
834 "Negative requests in pipe for WF%d for slot%d"
835 " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d,"
836 " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d,"
837 " Outstanding Reqs=%d\n",
845 if (!ii->isScalar()) {
848 }
else if (ii->isStore()) {
850 }
else if (ii->isAtomic() || ii->isMemSync()) {
854 panic(
"Invalid memory operation!\n");
860 }
else if (ii->isStore()) {
862 }
else if (ii->isAtomic() || ii->isMemSync()) {
866 panic(
"Invalid memory operation!\n");
876 "Scalar instructions can not access Shared memory!!!");
879 }
else if (ii->isStore()) {
881 }
else if (ii->isAtomic() || ii->isMemSync()) {
885 panic(
"Invalid memory operation!\n");
902 if (ii->isALU() || ii->isSpecialOp() ||
903 ii->isBranch() || ii->isNop() ||
904 (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
905 ii->isReturn() || ii->isEndOfKernel()) {
906 if (!ii->isScalar()) {
912 }
else if (ii->isBarrier()) {
914 }
else if (ii->isFlat()) {
915 assert(!ii->isScalar());
925 }
else if (ii->isGlobalMem()) {
927 }
else if (ii->isLocalMem()) {
929 }
else if (ii->isPrivateSeg()) {
931 "Scalar instructions can not access Private memory!!!");
934 panic(
"reserveResources -> Couldn't process op!\n");
940 assert(execUnitIds.size());
974 DPRINTF(GPUExec,
"CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
976 wfDynId, ii->disassemble(), old_pc, ii->seqNum());
977 DPRINTF(GPUTrace,
"CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
979 wfDynId, ii->disassemble(), old_pc, ii->seqNum());
989 if (!ii->isScalar()) {
995 computeUnit->shader->incVectorInstSrcOperand(ii->numSrcVecRegOperands());
996 computeUnit->shader->incVectorInstDstOperand(ii->numDstVecRegOperands());
998 stats.numInstrExecuted++;
1016 for (
const auto& srcVecOp : ii->srcVecRegOperands()) {
1017 for (
const auto& virtIdx : srcVecOp.virtIndices()) {
1020 stats.vecRawDistance.sample(
stats.numInstrExecuted.value() -
1028 for (
const auto& dstVecOp : ii->dstVecRegOperands()) {
1029 for (
const auto& virtIdx : dstVecOp.virtIndices()) {
1043 if (
pc() == old_pc) {
1048 DPRINTF(GPUExec,
"CU%d: WF[%d][%d]: wave%d %s taken branch\n",
1053 DPRINTF(GPUExec,
"CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n",
1057 const int num_active_lanes =
execMask().count();
1058 computeUnit->stats.controlFlowDivergenceDist.sample(num_active_lanes);
1059 computeUnit->stats.numVecOpsExecuted += num_active_lanes;
1062 computeUnit->stats.numVecOpsExecutedMFMA += num_active_lanes;
1065 += num_active_lanes;
1069 if (ii->isF16() && ii->isALU()) {
1070 if (ii->isF32() || ii->isF64()) {
1071 fatal(
"Instruction is tagged as both (1) F16, and (2)"
1072 "either F32 or F64.");
1074 computeUnit->stats.numVecOpsExecutedF16 += num_active_lanes;
1076 computeUnit->stats.numVecOpsExecutedFMA16 += num_active_lanes;
1078 += num_active_lanes;
1080 else if (ii->isMAC()) {
1081 computeUnit->stats.numVecOpsExecutedMAC16 += num_active_lanes;
1083 += num_active_lanes;
1085 else if (ii->isMAD()) {
1086 computeUnit->stats.numVecOpsExecutedMAD16 += num_active_lanes;
1088 += num_active_lanes;
1090 else if (ii->isMFMA()) {
1092 += num_active_lanes;
1095 if (ii->isF32() && ii->isALU()) {
1096 if (ii->isF16() || ii->isF64()) {
1097 fatal(
"Instruction is tagged as both (1) F32, and (2)"
1098 "either F16 or F64.");
1100 computeUnit->stats.numVecOpsExecutedF32 += num_active_lanes;
1102 computeUnit->stats.numVecOpsExecutedFMA32 += num_active_lanes;
1104 += num_active_lanes;
1106 else if (ii->isMAC()) {
1107 computeUnit->stats.numVecOpsExecutedMAC32 += num_active_lanes;
1109 += num_active_lanes;
1111 else if (ii->isMAD()) {
1112 computeUnit->stats.numVecOpsExecutedMAD32 += num_active_lanes;
1114 += num_active_lanes;
1116 else if (ii->isMFMA()) {
1118 += num_active_lanes;
1121 if (ii->isF64() && ii->isALU()) {
1122 if (ii->isF16() || ii->isF32()) {
1123 fatal(
"Instruction is tagged as both (1) F64, and (2)"
1124 "either F16 or F32.");
1126 computeUnit->stats.numVecOpsExecutedF64 += num_active_lanes;
1128 computeUnit->stats.numVecOpsExecutedFMA64 += num_active_lanes;
1130 += num_active_lanes;
1132 else if (ii->isMAC()) {
1133 computeUnit->stats.numVecOpsExecutedMAC64 += num_active_lanes;
1135 += num_active_lanes;
1137 else if (ii->isMAD()) {
1138 computeUnit->stats.numVecOpsExecutedMAD64 += num_active_lanes;
1140 += num_active_lanes;
1142 else if (ii->isMFMA()) {
1144 += num_active_lanes;
1148 computeUnit->stats.activeLanesPerGMemInstrDist.sample(
1151 computeUnit->stats.activeLanesPerLMemInstrDist.sample(
1166 bool flat_as_gm =
false;
1167 bool flat_as_lm =
false;
1169 flat_as_gm = (ii->executedAs() == enums::SC_GLOBAL) ||
1170 (ii->executedAs() == enums::SC_PRIVATE);
1171 flat_as_lm = (ii->executedAs() == enums::SC_GROUP);
1176 if (ii->isALU() || ii->isSpecialOp() ||
1177 ii->isBranch() || ii->isNop() ||
1178 (ii->isKernArgSeg() && ii->isLoad()) ||
1179 ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) {
1181 if (!ii->isScalar()) {
1189 }
else if (ii->isBarrier()) {
1193 }
else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) {
1194 if (!ii->isScalar()) {
1210 }
else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) {
1211 if (!ii->isScalar()) {
1226 }
else if ((ii->isAtomic() || ii->isMemSync()) &&
1227 (ii->isGlobalMem() || flat_as_gm)) {
1228 if (!ii->isScalar()) {
1244 }
else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) {
1252 }
else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
1260 }
else if ((ii->isAtomic() || ii->isMemSync()) &&
1261 (ii->isLocalMem() || flat_as_lm)) {
1269 panic(
"Bad instruction type!\n");
1384 assert(vm_wait_cnt >= 0);
1385 assert(exp_wait_cnt >= 0);
1386 assert(lgkm_wait_cnt >= 0);
1389 assert(vm_wait_cnt <= 0xf);
1390 assert(exp_wait_cnt <= 0x7);
1391 assert(lgkm_wait_cnt <= 0x1f);
1409 if (vm_wait_cnt != 0xf)
1412 if (exp_wait_cnt != 0x7)
1415 if (lgkm_wait_cnt != 0x1f)
1472 if (!
computeUnit->shader->getProgressInterval()) {
1476 assert(!
vmemIssued.count(gpu_dyn_inst->seqNum()));
1484 if (!
computeUnit->shader->getProgressInterval()) {
1488 assert(!
lgkmIssued.count(gpu_dyn_inst->seqNum()));
1496 if (!
computeUnit->shader->getProgressInterval()) {
1500 assert(!
expIssued.count(gpu_dyn_inst->seqNum()));
1501 expIssued.insert(gpu_dyn_inst->seqNum());
1508 if (!
computeUnit->shader->getProgressInterval()) {
1512 cntInsts.insert({gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble()});
1518 if (!
computeUnit->shader->getProgressInterval()) {
1523 "%d not in VMEM issued!\n", gpu_dyn_inst->seqNum());
1531 if (!
computeUnit->shader->getProgressInterval()) {
1536 "%d not in LGKM issued!\n", gpu_dyn_inst->seqNum());
1544 if (!
computeUnit->shader->getProgressInterval()) {
1549 "%d not in EXP issued!\n", gpu_dyn_inst->seqNum());
1550 expIssued.erase(gpu_dyn_inst->seqNum());
1557 if (!
computeUnit->shader->getProgressInterval()) {
1597 int vgprIdx =
computeUnit->registerManager->mapVgpr(
this,
i);
1623 assert(bar_id < computeUnit->numBarrierSlots());
1665 std::cout <<
"wave[" <<
wfDynId <<
"] status: "
1670 std::cout << elem <<
' ';
1675 std::cout << elem <<
' ';
1681 std::cout << elem <<
' ';
1685 <<
" wait insts:\n";
1688 std::cout <<
"\t" <<
cntInsts[elem] <<
"\n";
1691 std::cout <<
"\t" <<
cntInsts[elem] <<
"\n";
1694 std::cout <<
"\t" <<
cntInsts[elem] <<
"\n";
1735 "number of instructions executed by this WF slot"),
1739 "RF denied adding instruction"),
1743 "RF reads to complete"),
1745 "number of cycles wave stalled due to LDS-VRF arbitration"),
1748 "instructions are blocked due to WAW or WAR dependencies"),
1751 "instructions are blocked due to RAW dependencies"),
1753 "Count of RAW distance in dynamic instructions for this WF"),
Cycles is a wrapper class for representing cycle counts, i.e.
_amd_queue_t amdQueue
Keep a copy of the AMD HSA queue because we need info from some of its fields to initialize register ...
bool sgprBitEnabled(int bit) const
const GfxVersion & gfxVersion() const
void preloadLength(unsigned val)
Addr hostDispPktAddr() const
Addr hostAMDQueueAddr
Host-side addr of the amd_queue_t on which this task was queued.
bool vgprBitEnabled(int bit) const
int privMemPerItem() const
unsigned accumOffset() const
static const int InvalidID
bool isOldestInstWaitcnt()
InstSeqNum lastInstSeqNum
void reserveGmResource(GPUDynInstPtr ii)
std::vector< Addr > lastAddr
std::set< InstSeqNum > expIssued
void setStatus(status_e newStatus)
void untrackInst(InstSeqNum seqNum)
std::array< uint8_t, VegaISA::NumVecElemPerVecReg > mfmaAScale
void validateRequestCounters()
uint8_t getMfmaAScale(int idx)
void trackInst(GPUDynInstPtr gpu_dyn_inst)
void trackVMemInst(GPUDynInstPtr gpu_dyn_inst)
bool isOldestInstPrivMem()
bool isOldestInstScalarMem()
Wavefront(const Params &p)
uint8_t getMfmaBScale(int idx)
bool isOldestInstBarrier()
void resizeRegFiles(int num_vregs, int num_sregs)
int scalarOutstandingReqsWrGm
std::array< uint8_t, VegaISA::NumVecElemPerVecReg > mfmaBScale
std::set< InstSeqNum > lgkmIssued
std::vector< uint32_t > oldVgpr
void initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
void setSleepTime(int sleep_time)
ComputeUnit * computeUnit
std::vector< uint32_t > workItemFlatId
int vmWaitCnt
the following are used for waitcnt instructions vmWaitCnt: once set, we wait for the outstanding numbe...
std::vector< int > vecReads
std::deque< GPUDynInstPtr > instructionBuffer
bool isLmInstruction(GPUDynInstPtr ii)
GPUDynInstPtr nextInstr()
std::vector< uint32_t > workItemId[3]
std::vector< uint64_t > oldDgpr
bool isOldestInstScalarALU()
void untrackExpInst(GPUDynInstPtr gpu_dyn_inst)
bool isOldestInstFlatMem()
void decVMemInstsIssued()
void computeActualWgSz(HSAQueueEntry *task)
std::string lastInstDisasm
void setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt)
std::unordered_map< int, uint64_t > rawDist
void untrackLGKMInst(GPUDynInstPtr gpu_dyn_inst)
std::vector< int > reserveResources()
void decLGKMInstsIssued()
void incLGKMInstsIssued()
virtual void init()
init() is called after all C++ SimObjects have been created and all ports are connected.
void untrackVMemInst(GPUDynInstPtr gpu_dyn_inst)
void trackExpInst(GPUDynInstPtr gpu_dyn_inst)
bool isOldestInstVectorALU()
std::unordered_map< InstSeqNum, std::string > cntInsts
std::set< InstSeqNum > vmemIssued
void setMfmaBScale(int idx, uint8_t value)
void trackLGKMInst(GPUDynInstPtr gpu_dyn_inst)
int scalarOutstandingReqsRdGm
void incVMemInstsIssued()
std::string statusToString(status_e status)
void reserveLmResource(GPUDynInstPtr ii)
std::string lastInstRdyStatus
@ S_BARRIER
WF is stalled at a barrier.
@ S_WAITCNT
wavefront has unsatisfied wait counts
gem5::Wavefront::WavefrontStats stats
void setMfmaAScale(int idx, uint8_t value)
void freeRegisterFile()
Freeing VRF space.
bool isGmInstruction(GPUDynInstPtr ii)
void start(uint64_t _wfDynId, uint64_t _base_ptr)
TheGpuISA::GPUISA _gpuISA
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
static constexpr T divCeil(const T &a, const U &b)
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
#define panic(...)
This implements a cprintf based panic() function.
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
#define fatal(...)
This implements a cprintf based fatal() function.
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
SimObject(const Params &p)
#define warn_if(cond,...)
Conditional warning macro that checks the supplied condition and only prints a warning if the conditi...
const int NumVecElemPerVecReg(64)
const FlagsType none
Nothing extra to print.
Copyright (c) 2024 Arm Limited All rights reserved.
static void init_pc(py::module_ &m_native)
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Tick curTick()
The universal simulation clock.
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
std::bitset< std::numeric_limits< unsigned long long >::digits > VectorMask
statistics::Scalar numTimesBlockedDueRAWDependencies
statistics::Scalar schResourceStalls
WavefrontStats(statistics::Group *parent)
statistics::Distribution vecRawDistance
statistics::Distribution readsPerWrite
statistics::Scalar schCycles
statistics::Scalar numTimesBlockedDueWAXDependencies
statistics::Scalar schRfAccessStalls
statistics::Scalar schOpdNrdyStalls
statistics::Scalar numInstrExecuted
statistics::Scalar schStalls
statistics::Scalar schLdsArbStalls
uint32_t scratch_workitem_byte_size
uint32_t compute_tmpring_size_wavesize
uint64_t scratch_backing_memory_location
uint32_t scratch_resource_descriptor[4]