#include "debug/GPUExec.hh"
#include "debug/GPUInitAbi.hh"
#include "debug/GPUTrace.hh"
#include "debug/WavefrontStack.hh"
for (int i = 0; i < 3; ++i) {
    /* ... */
}
uint32_t firstWave = 0;
int orderedAppendTerm = 0;
uint32_t finalValue = 0;
Addr hidden_priv_base(0);
physSgprIdx =
    computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
        task->amdQueue.scratch_resource_descriptor[0]);
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
        "Setting PrivateSegBuffer: s[%d] = %x\n",
        computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
        task->amdQueue.scratch_resource_descriptor[0]);

physSgprIdx =
    computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
        task->amdQueue.scratch_resource_descriptor[1]);
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
        "Setting PrivateSegBuffer: s[%d] = %x\n",
        computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
        task->amdQueue.scratch_resource_descriptor[1]);

physSgprIdx =
    computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
        task->amdQueue.scratch_resource_descriptor[2]);
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
        "Setting PrivateSegBuffer: s[%d] = %x\n",
        computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
        task->amdQueue.scratch_resource_descriptor[2]);

physSgprIdx =
    computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
        task->amdQueue.scratch_resource_descriptor[3]);
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
        "Setting PrivateSegBuffer: s[%d] = %x\n",
        computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
        task->amdQueue.scratch_resource_descriptor[3]);
physSgprIdx =
    computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
        bits(host_disp_pkt_addr, 31, 0));
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
        "Setting DispatchPtr: s[%d] = %x\n",
        computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
        bits(host_disp_pkt_addr, 31, 0));
physSgprIdx =
    computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
        bits(host_disp_pkt_addr, 63, 32));
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
        "Setting DispatchPtr: s[%d] = %x\n",
        computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
        bits(host_disp_pkt_addr, 63, 32));
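// The dispatch packet pointer is 64 bits but SGPRs are 32 bits wide, so
// it is split across an adjacent lo/hi register pair with bits():
//
//   write(sgpr_lo, bits(host_disp_pkt_addr, 31, 0));   // bits [31:0]
//   write(sgpr_hi, bits(host_disp_pkt_addr, 63, 32));  // bits [63:32]
//
// The same lo/hi pattern repeats below for QueuePtr and KernargSegPtr.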
physSgprIdx =
    computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
        bits(task->hostAMDQueueAddr, 31, 0));
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
        "Setting QueuePtr: s[%d] = %x\n",
        computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
        bits(task->hostAMDQueueAddr, 31, 0));
physSgprIdx =
    computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
        bits(task->hostAMDQueueAddr, 63, 32));
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
        "Setting QueuePtr: s[%d] = %x\n",
        computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
        bits(task->hostAMDQueueAddr, 63, 32));
physSgprIdx =
    computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
        bits(kernarg_addr, 31, 0));
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
        "Setting KernargSegPtr: s[%d] = %x\n",
        computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
        bits(kernarg_addr, 31, 0));
physSgprIdx =
    computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
        bits(kernarg_addr, 63, 32));
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
        "Setting KernargSegPtr: s[%d] = %x\n",
        computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
        bits(kernarg_addr, 63, 32));
physSgprIdx
    = computeUnit->registerManager->mapSgpr(this, regInitIdx);
/* ... */
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
        "Setting DispatchId: s[%d] = %x\n",
        computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
        /* ... */);
physSgprIdx
    = computeUnit->registerManager->mapSgpr(this, regInitIdx);
/* ... */

physSgprIdx
    = computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
        (TheGpuISA::ScalarRegU32)(task->amdQueue
            .scratch_backing_memory_location /* ... */));
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
        "Setting FlatScratch Addr: s[%d] = %x\n",
        computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
        (TheGpuISA::ScalarRegU32)(task->amdQueue
            .scratch_backing_memory_location /* ... */));
physSgprIdx =
    computeUnit->registerManager->mapSgpr(this, regInitIdx);
/* ... */
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
        "Setting FlatScratch size: s[%d] = %x\n",
        computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
        /* ... */);
332 & 0x000000000000ffff) << 32);
physSgprIdx
    = computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx, task->privMemPerItem());
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
        "Setting private segment size: s[%d] = %x\n",
        computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
        task->privMemPerItem());
DPRINTF(GPUInitAbi, "Preload %d user SGPRs starting at virtual"
        /* ... */);
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] Setting "
        /* ... */);
physSgprIdx =
    computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx, workGroupId[0]);
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
        "Setting WG ID X: s[%d] = %x\n",
        computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
        workGroupId[0]);

physSgprIdx =
    computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx, workGroupId[1]);
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
        "Setting WG ID Y: s[%d] = %x\n",
        computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
        workGroupId[1]);

physSgprIdx =
    computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx, workGroupId[2]);
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
        "Setting WG ID Z: s[%d] = %x\n",
        computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
        workGroupId[2]);
if (task->gfxVersion() == GfxVersion::gfx942) {
    uint32_t scratchPerWI = /* ... */;
    /* ... */ + (scratchPerWI * 64 * wfId);
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting architected flat scratch = %x\n",
            /* ... */);
}
physSgprIdx =
    computeUnit->registerManager->mapSgpr(this, regInitIdx);
/* ... */
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
        "Setting Private Seg Offset: s[%d] = %x\n",
        computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
        /* ... */);
firstWave = (wfId == 0) ? 1 : 0;
numWfsInWg = divCeil(wgSizeInWorkItems, computeUnit->wfSize());
finalValue = firstWave << ((sizeof(uint32_t) * 8) - 1);
finalValue |= (orderedAppendTerm << 6);
finalValue |= numWfsInWg;
physSgprIdx =
    computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx, finalValue);
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
        "Setting WG Info: s[%d] = %x\n",
        computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
        finalValue);
fatal("SGPR enable bit %i not supported\n", en_bit);
bool packed_work_item_id = false;

if (task->gfxVersion() == GfxVersion::gfx90a ||
    task->gfxVersion() == GfxVersion::gfx942) {
    packed_work_item_id = true;
}
if (packed_work_item_id) {
    TheGpuISA::VecRegContainerU32 raw_vgpr;
    TheGpuISA::VecElemU32 *packed_vgpr
        = raw_vgpr.as<TheGpuISA::VecElemU32>();

    uint32_t physVgprIdx = computeUnit->registerManager
        ->mapVgpr(this, regInitIdx);
    for (int lane = 0; lane < workItemId[0].size(); ++lane) {
        packed_vgpr[lane] = workItemId[0][lane] & 0x3ff;
    }
    for (int lane = 0; lane < workItemId[1].size(); ++lane) {
        packed_vgpr[lane] |= ((workItemId[1][lane] & 0x3ff) << 10);
    }
    for (int lane = 0; lane < workItemId[2].size(); ++lane) {
        packed_vgpr[lane] |= ((workItemId[2][lane] & 0x3ff) << 20);
    }
    /* ... */
}
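// On gfx90a/gfx942 all three work-item IDs share one VGPR, 10 bits per
// dimension: X in bits [9:0], Y in [19:10], Z in [29:20]. A lane's IDs
// can be recovered with the inverse shifts and masks:
//
//   uint32_t x =  packed_vgpr[lane]        & 0x3ff;
//   uint32_t y = (packed_vgpr[lane] >> 10) & 0x3ff;
//   uint32_t z = (packed_vgpr[lane] >> 20) & 0x3ff;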
uint32_t physVgprIdx = 0;
TheGpuISA::VecRegContainerU32 raw_vgpr;

physVgprIdx = computeUnit->registerManager
    ->mapVgpr(this, regInitIdx);
TheGpuISA::VecElemU32 *vgpr_x
    = raw_vgpr.as<TheGpuISA::VecElemU32>();
for (int lane = 0; lane < workItemId[0].size(); ++lane) {
    /* ... */
}

physVgprIdx = computeUnit->registerManager
    ->mapVgpr(this, regInitIdx);
TheGpuISA::VecElemU32 *vgpr_y
    = raw_vgpr.as<TheGpuISA::VecElemU32>();
for (int lane = 0; lane < workItemId[1].size(); ++lane) {
    /* ... */
}

physVgprIdx = computeUnit->registerManager
    ->mapVgpr(this, regInitIdx);
TheGpuISA::VecElemU32 *vgpr_z
    = raw_vgpr.as<TheGpuISA::VecElemU32>();
for (int lane = 0; lane < workItemId[2].size(); ++lane) {
    /* ... */
}
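// Pre-gfx90a ABIs instead use up to three VGPRs, one full 32-bit register
// per dimension (vgpr_x, vgpr_y, vgpr_z above), with regInitIdx advancing
// only for the dimensions the kernel enables via vgprBitEnabled().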
624 "CU%d has been idle for %d ticks at tick %d",
// isGmInstruction(ii)
if (ii->isGlobalMem() ||
    (ii->isFlat() && ii->executedAs() == enums::SC_GLOBAL)) {
    return true;
}

// isLmInstruction(ii)
if (ii->isLocalMem() ||
    (ii->isFlat() && ii->executedAs() == enums::SC_GROUP)) {
    return true;
}
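// Flat instructions can only be classified after address resolution:
// executedAs() reports the segment the instruction actually targeted, so
// a flat op counts as global memory (SC_GLOBAL) or as LDS (SC_GROUP)
// per execution rather than statically.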
if (ii->isWaitcnt()) {
    assert(ii->isScalar());
    /* ... */
}
if (status != S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn()
    || ii->isEndOfKernel() || ii->isBranch() || ii->isALU() ||
    (ii->isKernArgSeg() && ii->isLoad()))) {
    /* ... */
}

/* ... */ (ii->isNop() ||
    ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel()
    || (ii->isKernArgSeg() && ii->isLoad()))) {
    /* ... */
}
if (ii->isReturn() || ii->isBranch() ||
    ii->isEndOfKernel()) {
    /* ... */
}
832 "Negative requests in pipe for WF%d for slot%d"
833 " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d,"
834 " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d,"
835 " Outstanding Reqs=%d\n",
// reserveGmResource(ii)
if (!ii->isScalar()) {
    if (ii->isLoad()) {
        /* ... */
    } else if (ii->isStore()) {
        /* ... */
    } else if (ii->isAtomic() || ii->isMemSync()) {
        /* ... */
    } else {
        panic("Invalid memory operation!\n");
    }
} else {
    if (ii->isLoad()) {
        /* ... */
    } else if (ii->isStore()) {
        /* ... */
    } else if (ii->isAtomic() || ii->isMemSync()) {
        /* ... */
    } else {
        panic("Invalid memory operation!\n");
    }
}

// reserveLmResource(ii)
fatal_if(ii->isScalar(),
         "Scalar instructions can not access Shared memory!!!");
if (ii->isLoad()) {
    /* ... */
} else if (ii->isStore()) {
    /* ... */
} else if (ii->isAtomic() || ii->isMemSync()) {
    /* ... */
} else {
    panic("Invalid memory operation!\n");
}
if (ii->isALU() || ii->isSpecialOp() ||
    ii->isBranch() || ii->isNop() ||
    (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
    ii->isReturn() || ii->isEndOfKernel()) {
    if (!ii->isScalar()) {
        /* ... */
    }
    /* ... */
} else if (ii->isBarrier()) {
    /* ... */
} else if (ii->isFlat()) {
    assert(!ii->isScalar());
    /* ... */
} else if (ii->isGlobalMem()) {
    /* ... */
} else if (ii->isLocalMem()) {
    /* ... */
} else if (ii->isPrivateSeg()) {
    fatal_if(ii->isScalar(),
             "Scalar instructions can not access Private memory!!!");
    /* ... */
} else {
    panic("reserveResources -> Couldn't process op!\n");
}
assert(execUnitIds.size());
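// reserveResources() returns the IDs of every execution unit the oldest
// instruction will occupy (a flat op, for instance, reserves both the
// global memory and LDS paths); the assert above catches any op that
// fell through the classification ladder without reserving anything.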
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
        "(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId,
        wfDynId, ii->disassemble(), old_pc, ii->seqNum());
DPRINTF(GPUTrace, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
        "(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId,
        wfDynId, ii->disassemble(), old_pc, ii->seqNum());
if (!ii->isScalar()) {
    /* ... */
}

computeUnit->shader->incVectorInstSrcOperand(ii->numSrcVecRegOperands());
computeUnit->shader->incVectorInstDstOperand(ii->numDstVecRegOperands());
stats.numInstrExecuted++;
for (const auto& srcVecOp : ii->srcVecRegOperands()) {
    for (const auto& virtIdx : srcVecOp.virtIndices()) {
        /* ... */
        stats.vecRawDistance.sample(stats.numInstrExecuted.value() -
                                    rawDist[virtIdx]);
    }
}

for (const auto& dstVecOp : ii->dstVecRegOperands()) {
    for (const auto& virtIdx : dstVecOp.virtIndices()) {
        // record the def point for future RAW-distance samples
        rawDist[virtIdx] = stats.numInstrExecuted.value();
    }
}
if (pc() == old_pc) {
    /* ... */
} else {
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave%d %s taken branch\n",
            /* ... */);
    /* ... */
}
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n",
        /* ... */);
const int num_active_lanes = execMask().count();
computeUnit->stats.controlFlowDivergenceDist.sample(num_active_lanes);
computeUnit->stats.numVecOpsExecuted += num_active_lanes;

if (ii->isMFMA()) {
    computeUnit->stats.numVecOpsExecutedMFMA += num_active_lanes;
    /* ... */ += num_active_lanes;
}
1067 if (ii->isF16() && ii->isALU()) {
1068 if (ii->isF32() || ii->isF64()) {
1069 fatal(
"Instruction is tagged as both (1) F16, and (2)"
1070 "either F32 or F64.");
1072 computeUnit->stats.numVecOpsExecutedF16 += num_active_lanes;
1074 computeUnit->stats.numVecOpsExecutedFMA16 += num_active_lanes;
1076 += num_active_lanes;
1078 else if (ii->isMAC()) {
1079 computeUnit->stats.numVecOpsExecutedMAC16 += num_active_lanes;
1081 += num_active_lanes;
1083 else if (ii->isMAD()) {
1084 computeUnit->stats.numVecOpsExecutedMAD16 += num_active_lanes;
1086 += num_active_lanes;
1088 else if (ii->isMFMA()) {
1090 += num_active_lanes;
1093 if (ii->isF32() && ii->isALU()) {
1094 if (ii->isF16() || ii->isF64()) {
1095 fatal(
"Instruction is tagged as both (1) F32, and (2)"
1096 "either F16 or F64.");
1098 computeUnit->stats.numVecOpsExecutedF32 += num_active_lanes;
1100 computeUnit->stats.numVecOpsExecutedFMA32 += num_active_lanes;
1102 += num_active_lanes;
1104 else if (ii->isMAC()) {
1105 computeUnit->stats.numVecOpsExecutedMAC32 += num_active_lanes;
1107 += num_active_lanes;
1109 else if (ii->isMAD()) {
1110 computeUnit->stats.numVecOpsExecutedMAD32 += num_active_lanes;
1112 += num_active_lanes;
1114 else if (ii->isMFMA()) {
1116 += num_active_lanes;
1119 if (ii->isF64() && ii->isALU()) {
1120 if (ii->isF16() || ii->isF32()) {
1121 fatal(
"Instruction is tagged as both (1) F64, and (2)"
1122 "either F16 or F32.");
1124 computeUnit->stats.numVecOpsExecutedF64 += num_active_lanes;
1126 computeUnit->stats.numVecOpsExecutedFMA64 += num_active_lanes;
1128 += num_active_lanes;
1130 else if (ii->isMAC()) {
1131 computeUnit->stats.numVecOpsExecutedMAC64 += num_active_lanes;
1133 += num_active_lanes;
1135 else if (ii->isMAD()) {
1136 computeUnit->stats.numVecOpsExecutedMAD64 += num_active_lanes;
1138 += num_active_lanes;
1140 else if (ii->isMFMA()) {
1142 += num_active_lanes;
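// The same classification ladder runs once per precision. The fatal()
// guards make the F16/F32/F64 tags mutually exclusive, so every active
// lane of a vector ALU op lands in exactly one numVecOpsExecutedF*
// bucket, further split by FMA/MAC/MAD/MFMA flavor.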
if (isGmInstruction(ii)) {
    computeUnit->stats.activeLanesPerGMemInstrDist.sample(
        num_active_lanes);
} else if (isLmInstruction(ii)) {
    computeUnit->stats.activeLanesPerLMemInstrDist.sample(
        num_active_lanes);
}
bool flat_as_gm = false;
bool flat_as_lm = false;
if (ii->isFlat()) {
    flat_as_gm = (ii->executedAs() == enums::SC_GLOBAL) ||
                 (ii->executedAs() == enums::SC_PRIVATE);
    flat_as_lm = (ii->executedAs() == enums::SC_GROUP);
}
if (ii->isALU() || ii->isSpecialOp() ||
    ii->isBranch() || ii->isNop() ||
    (ii->isKernArgSeg() && ii->isLoad()) ||
    ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) {
    if (!ii->isScalar()) {
        /* ... */
    }
    /* ... */
} else if (ii->isBarrier()) {
    /* ... */
} else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) {
    if (!ii->isScalar()) {
        /* ... */
    }
    /* ... */
} else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) {
    if (!ii->isScalar()) {
        /* ... */
    }
    /* ... */
} else if ((ii->isAtomic() || ii->isMemSync()) &&
           (ii->isGlobalMem() || flat_as_gm)) {
    if (!ii->isScalar()) {
        /* ... */
    }
    /* ... */
} else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) {
    /* ... */
} else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
    /* ... */
} else if ((ii->isAtomic() || ii->isMemSync()) &&
           (ii->isLocalMem() || flat_as_lm)) {
    /* ... */
} else {
    panic("Bad instruction type!\n");
}
assert(vm_wait_cnt >= 0);
assert(exp_wait_cnt >= 0);
assert(lgkm_wait_cnt >= 0);

assert(vm_wait_cnt <= 0xf);
assert(exp_wait_cnt <= 0x7);
assert(lgkm_wait_cnt <= 0x1f);

if (vm_wait_cnt != 0xf)
    /* ... */

if (exp_wait_cnt != 0x7)
    /* ... */

if (lgkm_wait_cnt != 0x1f)
    /* ... */
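// s_waitcnt fields are narrow unsigned counters whose all-ones encodings
// mean "don't wait on this counter" rather than a real threshold:
//   vm_cnt   : 4 bits, sentinel 0xf
//   exp_cnt  : 3 bits, sentinel 0x7
//   lgkm_cnt : 5 bits, sentinel 0x1f
// Hence the range asserts above, and the != sentinel tests that arm a
// wait only for the counters the instruction actually names.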
if (!computeUnit->shader->getProgressInterval()) {
    return;
}
assert(!vmemIssued.count(gpu_dyn_inst->seqNum()));
vmemIssued.insert(gpu_dyn_inst->seqNum());

if (!computeUnit->shader->getProgressInterval()) {
    return;
}
assert(!lgkmIssued.count(gpu_dyn_inst->seqNum()));
lgkmIssued.insert(gpu_dyn_inst->seqNum());

if (!computeUnit->shader->getProgressInterval()) {
    return;
}
assert(!expIssued.count(gpu_dyn_inst->seqNum()));
expIssued.insert(gpu_dyn_inst->seqNum());
if (!computeUnit->shader->getProgressInterval()) {
    return;
}
/* ... */
cntInsts.insert({gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble()});
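// The track*/untrack* helpers are debugging aids that only run when the
// shader's progress interval is configured: they remember the seqNum of
// every in-flight vmem/lgkm/exp instruction (plus its disassembly in
// cntInsts) so a wave stuck at an s_waitcnt can name the exact
// instruction it is waiting on (see the status dump below).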
if (!computeUnit->shader->getProgressInterval()) {
    return;
}
panic_if(!vmemIssued.count(gpu_dyn_inst->seqNum()),
         "%d not in VMEM issued!\n", gpu_dyn_inst->seqNum());
vmemIssued.erase(gpu_dyn_inst->seqNum());

if (!computeUnit->shader->getProgressInterval()) {
    return;
}
panic_if(!lgkmIssued.count(gpu_dyn_inst->seqNum()),
         "%d not in LGKM issued!\n", gpu_dyn_inst->seqNum());
lgkmIssued.erase(gpu_dyn_inst->seqNum());

if (!computeUnit->shader->getProgressInterval()) {
    return;
}
panic_if(!expIssued.count(gpu_dyn_inst->seqNum()),
         "%d not in EXP issued!\n", gpu_dyn_inst->seqNum());
expIssued.erase(gpu_dyn_inst->seqNum());

if (!computeUnit->shader->getProgressInterval()) {
    return;
}
/* ... */
int vgprIdx = computeUnit->registerManager->mapVgpr(this, i);
/* ... */

assert(bar_id < computeUnit->numBarrierSlots());
std::cout << "wave[" << wfDynId << "] status: " /* ... */;
/* ... */
for (auto &elem : vmemIssued) {
    std::cout << elem << ' ';
}
/* ... */
for (auto &elem : lgkmIssued) {
    std::cout << elem << ' ';
}
/* ... */
for (auto &elem : expIssued) {
    std::cout << elem << ' ';
}
/* ... */ << " wait insts:\n";
for (auto &elem : vmemIssued) {
    std::cout << "\t" << cntInsts[elem] << "\n";
}
for (auto &elem : lgkmIssued) {
    std::cout << "\t" << cntInsts[elem] << "\n";
}
for (auto &elem : expIssued) {
    std::cout << "\t" << cntInsts[elem] << "\n";
}
1699 "number of instructions executed by this WF slot"),
1703 "RF denied adding instruction"),
1707 "RF reads to complete"),
1709 "number of cycles wave stalled due to LDS-VRF arbitration"),
1712 "instructions are blocked due to WAW or WAR dependencies"),
1715 "instructions are blocked due to RAW dependencies"),
1717 "Count of RAW distance in dynamic instructions for this WF"),