#include "debug/GPUExec.hh"
#include "debug/GPUInitAbi.hh"
#include "debug/WavefrontStack.hh"
Wavefront::Wavefront(const Params &p)
    : SimObject(p), wfSlotId(p.wf_slot_id), simdId(p.simdId),
      maxIbSize(p.max_ib_size), _gpuISA(*this),
      vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
      vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
      sleepCnt(0), barId(WFBarrier::InvalidID), stats(this)
{
    // ...
    for (int i = 0; i < 3; ++i) {
    // ...
}

void
Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
{
    // ...
    uint32_t firstWave = 0;
    int orderedAppendTerm = 0;
    uint32_t finalValue = 0;
    Addr hidden_priv_base(0);
        // Each ABI-enabled SGPR is mapped to a physical register and
        // written in turn (cases of the sgprBitEnabled() switch). The
        // scratch resource descriptor is four dwords; the block below
        // repeats for scratch_resource_descriptor[1..3].
        physSgprIdx =
            computeUnit->registerManager->mapSgpr(this, regInitIdx);
        computeUnit->srf[simdId]->write(physSgprIdx,
                task->amdQueue.scratch_resource_descriptor[0]);
        ++regInitIdx;
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting PrivateSegBuffer: s[%d] = %x\n",
                computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                physSgprIdx,
                task->amdQueue.scratch_resource_descriptor[0]);
        // ...
        // The 64-bit dispatch packet address is written as two
        // consecutive 32-bit SGPRs: low dword first, high dword second.
        computeUnit->srf[simdId]->write(physSgprIdx,
                bits(host_disp_pkt_addr, 31, 0));
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting DispatchPtr: s[%d] = %x\n",
                computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                physSgprIdx, bits(host_disp_pkt_addr, 31, 0));
        // ...
        computeUnit->srf[simdId]->write(physSgprIdx,
                bits(host_disp_pkt_addr, 63, 32));
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting DispatchPtr: s[%d] = %x\n",
                computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                physSgprIdx, bits(host_disp_pkt_addr, 63, 32));
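        // Aside (sketch, hypothetical address value): bits(val, first,
        // last) extracts the inclusive bitfield [first:last] and right
        // justifies it, so each 64-bit host pointer lands in an SGPR
        // pair, low dword first:
        //
        //   uint64_t addr = 0x00007f1234567890ULL;
        //   uint32_t lo = bits(addr, 31, 0);    // 0x34567890 -> s[n]
        //   uint32_t hi = bits(addr, 63, 32);   // 0x00007f12 -> s[n+1]
        //   assert(((uint64_t)hi << 32 | lo) == addr);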
        // ...
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting QueuePtr: s[%d] = %x\n",
                computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                physSgprIdx, bits(task->hostAMDQueueAddr, 31, 0));
        // ...
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting QueuePtr: s[%d] = %x\n",
                computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                physSgprIdx, bits(task->hostAMDQueueAddr, 63, 32));
        // ...
        computeUnit->srf[simdId]->write(physSgprIdx,
                bits(kernarg_addr, 31, 0));
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting KernargSegPtr: s[%d] = %x\n",
                computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                physSgprIdx, bits(kernarg_addr, 31, 0));
        // ...
        computeUnit->srf[simdId]->write(physSgprIdx,
                bits(kernarg_addr, 63, 32));
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting KernargSegPtr: s[%d] = %x\n",
                computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                physSgprIdx, bits(kernarg_addr, 63, 32));
        // ...
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting DispatchId: s[%d] = %x\n", ...);
        // ...
        computeUnit->srf[simdId]->write(physSgprIdx,
                (TheGpuISA::ScalarRegU32)(task->amdQueue
                .scratch_backing_memory_location));
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting FlatScratch Addr: s[%d] = %x\n",
                computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                physSgprIdx,
                (TheGpuISA::ScalarRegU32)(task->amdQueue
                .scratch_backing_memory_location));
        // ...
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting FlatScratch size: s[%d] = %x\n", ...);
        // ...
        // Recover the scratch base address from the first two dwords of
        // the scratch resource descriptor (low 16 bits of dword 1 hold
        // the upper address bits).
        hidden_priv_base =
            (uint64_t)task->amdQueue.scratch_resource_descriptor[0] |
            (((uint64_t)task->amdQueue.scratch_resource_descriptor[1]
            & 0x000000000000ffff) << 32);
        // ...
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting private segment size: s[%d] = %x\n", ...);
        // ...
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting WG ID X: s[%d] = %x\n", ...);
        // ...
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting WG ID Y: s[%d] = %x\n", ...);
        // ...
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting WG ID Z: s[%d] = %x\n", ...);
        // ...
        // gfx942 uses the architected flat scratch register.
        if (task->gfxVersion() == GfxVersion::gfx942) {
            // ...
            DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                    "Setting architected flat scratch = %x\n", ...);
        }
        // ...
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting Private Seg Offset: s[%d] = %x\n", ...);
        // ...
        firstWave = (wfId == 0) ? 1 : 0;
        numWfsInWg = divCeil(wgSizeInWorkItems,
                             computeUnit->wfSize());
        finalValue = firstWave << ((sizeof(uint32_t) * 8) - 1);
        finalValue |= (orderedAppendTerm << 6);
        finalValue |= numWfsInWg;
        // ...
        computeUnit->srf[simdId]->write(physSgprIdx, finalValue);

        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting WG Info: s[%d] = %x\n",
                computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                physSgprIdx, finalValue);
        default:
            fatal("SGPR enable bit %i not supported\n", en_bit);
    // ...
    bool packed_work_item_id = false;

    if (task->gfxVersion() == GfxVersion::gfx90a ||
        task->gfxVersion() == GfxVersion::gfx942) {
        packed_work_item_id = true;
    }
    // In packed mode all three work-item IDs share one VGPR per lane.
    if (packed_work_item_id) {
        TheGpuISA::VecRegContainerU32 raw_vgpr;
        TheGpuISA::VecElemU32 *packed_vgpr
            = raw_vgpr.as<TheGpuISA::VecElemU32>();

        for (int lane = 0; lane < workItemId[0].size(); ++lane) {
            packed_vgpr[lane] = workItemId[0][lane] & 0x3ff;
        }
        for (int lane = 0; lane < workItemId[1].size(); ++lane) {
            packed_vgpr[lane] |= ((workItemId[1][lane] & 0x3ff) << 10);
        }
        for (int lane = 0; lane < workItemId[2].size(); ++lane) {
            packed_vgpr[lane] |= ((workItemId[2][lane] & 0x3ff) << 20);
        }
        // ...
    }
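    // Aside (sketch): each lane's packed VGPR holds x in bits [9:0],
    // y in [19:10], z in [29:20]; unpacking a hypothetical lane value:
    //
    //   uint32_t packed = (5 & 0x3ff) | ((7 & 0x3ff) << 10)
    //                   | ((2 & 0x3ff) << 20);
    //   uint32_t x =  packed        & 0x3ff;    // 5
    //   uint32_t y = (packed >> 10) & 0x3ff;    // 7
    //   uint32_t z = (packed >> 20) & 0x3ff;    // 2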
    // In unpacked mode, X, Y, and Z each get their own mapped VGPR.
    uint32_t physVgprIdx = 0;
    TheGpuISA::VecRegContainerU32 raw_vgpr;
    // ...
    TheGpuISA::VecElemU32 *vgpr_x
        = raw_vgpr.as<TheGpuISA::VecElemU32>();
    for (int lane = 0; lane < workItemId[0].size(); ++lane) {
        vgpr_x[lane] = workItemId[0][lane];
    }
    // ...
    TheGpuISA::VecElemU32 *vgpr_y
        = raw_vgpr.as<TheGpuISA::VecElemU32>();
    for (int lane = 0; lane < workItemId[1].size(); ++lane) {
        vgpr_y[lane] = workItemId[1][lane];
    }
    // ...
    physVgprIdx = computeUnit->registerManager->mapVgpr(this, regInitIdx);
    TheGpuISA::VecElemU32 *vgpr_z
        = raw_vgpr.as<TheGpuISA::VecElemU32>();
    for (int lane = 0; lane < workItemId[2].size(); ++lane) {
        vgpr_z[lane] = workItemId[2][lane];
    }
598 "CU%d has been idle for %d ticks at tick %d",
// ...

// isGmInstruction(): flat ops count as global memory once they have
// resolved to the global segment.
    if (ii->isGlobalMem() ||
        (ii->isFlat() && ii->executedAs() == enums::SC_GLOBAL)) {
        return true;
    }
// ...

// isLmInstruction(): flat ops count as local (LDS) memory once they
// have resolved to the group segment.
    if (ii->isLocalMem() ||
        (ii->isFlat() && ii->executedAs() == enums::SC_GROUP)) {
        return true;
    }
// ...

// isOldestInstWaitcnt():
    if (ii->isWaitcnt()) {
        // s_waitcnt is a scalar instruction.
        assert(ii->isScalar());
        // ...
    }
// ...

// isOldestInstScalarALU():
    if (status != S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn()
        || ii->isEndOfKernel() || ii->isBranch() || ii->isALU() ||
        (ii->isKernArgSeg() && ii->isLoad()))) {
        // ...
    }
// ...

// isOldestInstVectorALU():
    if (status != S_STOPPED && !ii->isScalar() && (ii->isNop() ||
        ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel()
        || (ii->isKernArgSeg() && ii->isLoad()))) {
        // ...
    }
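// Aside (sketch): every isOldestInst*() predicate inspects the oldest
// entry of the in-order instruction buffer before deciding whether it
// may issue:
//
//   GPUDynInstPtr ii = instructionBuffer.front();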
// ...
    if (ii->isReturn() || ii->isBranch() ||
        ii->isEndOfKernel()) {
        // ...
    }
806 "Negative requests in pipe for WF%d for slot%d"
807 " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d,"
808 " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d,"
809 " Outstanding Reqs=%d\n",
// ...

// reserveGmResource(): account the request in the right global-memory
// counters, vector vs. scalar.
    if (!ii->isScalar()) {
        if (ii->isLoad()) {
            // ...
        } else if (ii->isStore()) {
            // ...
        } else if (ii->isAtomic() || ii->isMemSync()) {
            // ...
        } else {
            panic("Invalid memory operation!\n");
        }
        // ...
    } else {
        if (ii->isLoad()) {
            // ...
        } else if (ii->isStore()) {
            // ...
        } else if (ii->isAtomic() || ii->isMemSync()) {
            // ...
        } else {
            panic("Invalid memory operation!\n");
        }
        // ...
    }
// ...

// reserveLmResource(): LDS traffic is vector-only.
    fatal_if(ii->isScalar(),
             "Scalar instructions can not access Shared memory!!!");
    if (ii->isLoad()) {
        // ...
    } else if (ii->isStore()) {
        // ...
    } else if (ii->isAtomic() || ii->isMemSync()) {
        // ...
    } else {
        panic("Invalid memory operation!\n");
    }
// ...

// reserveResources(): pick the execution unit(s) the oldest instruction
// needs and return their IDs.
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() || ii->isNop() ||
        (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
        ii->isReturn() || ii->isEndOfKernel()) {
        if (!ii->isScalar()) {
            // ...
        }
        // ...
    } else if (ii->isBarrier()) {
        // ...
    } else if (ii->isFlat()) {
        assert(!ii->isScalar());
        // ...
    } else if (ii->isGlobalMem()) {
        // ...
    } else if (ii->isLocalMem()) {
        // ...
    } else if (ii->isPrivateSeg()) {
        fatal_if(ii->isScalar(),
                 "Scalar instructions can not access Private memory!!!");
        // ...
    } else {
        panic("reserveResources -> Couldn't process op!\n");
    }

    assert(execUnitIds.size());
// ...
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
            "(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId,
            wfDynId, ii->disassemble(), old_pc, ii->seqNum());
    // ...
    if (!ii->isScalar()) {
    // ...
    // Track RAW distances: sample on source reads, record on
    // destination writes.
    for (const auto& srcVecOp : ii->srcVecRegOperands()) {
        for (const auto& virtIdx : srcVecOp.virtIndices()) {
            // ...
        }
    }
    // ...
    for (const auto& dstVecOp : ii->dstVecRegOperands()) {
        for (const auto& virtIdx : dstVecOp.virtIndices()) {
            // ...
        }
    }
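    // Aside (sketch of what the elided loop bodies do, per the rawDist
    // map and vecRawDistance stat): destination writes record the
    // current dynamic instruction count; source reads sample the
    // distance since the last write of that virtual register:
    //
    //   // on a source read of virtIdx:
    //   if (rawDist.find(virtIdx) != rawDist.end()) {
    //       stats.vecRawDistance.sample(stats.numInstrExecuted.value() -
    //                                   rawDist[virtIdx]);
    //   }
    //   // on a destination write of virtIdx:
    //   rawDist[virtIdx] = stats.numInstrExecuted.value();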
    if (pc() == old_pc) {
        // PC unchanged by the instruction: advance to the next buffered
        // instruction.
        // ...
    } else {
        DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave%d %s taken branch\n",
                computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                ii->disassemble());
        // ...
    }
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId, pc());

    const int num_active_lanes = execMask().count();
    // ...
    stats.numVecOpsExecuted += num_active_lanes;
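    // Aside: execMask() is the per-lane VectorMask (a std::bitset), so
    // count() gives the number of active lanes that all the per-op
    // stats below scale by. E.g.:
    //
    //   VectorMask m;                    // hypothetical mask
    //   m.set(0); m.set(5); m.set(63);
    //   assert(m.count() == 3);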
    if (ii->isF16() && ii->isALU()) {
        if (ii->isF32() || ii->isF64()) {
            fatal("Instruction is tagged as both (1) F16, and (2) "
                  "either F32 or F64.");
        }
        stats.numVecOpsExecutedF16 += num_active_lanes;
        if (ii->isFMA()) {
            stats.numVecOpsExecutedFMA16 += num_active_lanes;
        } else if (ii->isMAC()) {
            stats.numVecOpsExecutedMAC16 += num_active_lanes;
        } else if (ii->isMAD()) {
            stats.numVecOpsExecutedMAD16 += num_active_lanes;
        } else if (ii->isMFMA()) {
            stats.numVecOpsExecutedMFMAF16 += num_active_lanes;
        }
    }
    if (ii->isF32() && ii->isALU()) {
        if (ii->isF16() || ii->isF64()) {
            fatal("Instruction is tagged as both (1) F32, and (2) "
                  "either F16 or F64.");
        }
        stats.numVecOpsExecutedF32 += num_active_lanes;
        if (ii->isFMA()) {
            stats.numVecOpsExecutedFMA32 += num_active_lanes;
        } else if (ii->isMAC()) {
            stats.numVecOpsExecutedMAC32 += num_active_lanes;
        } else if (ii->isMAD()) {
            stats.numVecOpsExecutedMAD32 += num_active_lanes;
        } else if (ii->isMFMA()) {
            stats.numVecOpsExecutedMFMAF32 += num_active_lanes;
        }
    }
    if (ii->isF64() && ii->isALU()) {
        if (ii->isF16() || ii->isF32()) {
            fatal("Instruction is tagged as both (1) F64, and (2) "
                  "either F16 or F32.");
        }
        stats.numVecOpsExecutedF64 += num_active_lanes;
        if (ii->isFMA()) {
            stats.numVecOpsExecutedFMA64 += num_active_lanes;
        } else if (ii->isMAC()) {
            stats.numVecOpsExecutedMAC64 += num_active_lanes;
        } else if (ii->isMAD()) {
            stats.numVecOpsExecutedMAD64 += num_active_lanes;
        } else if (ii->isMFMA()) {
            stats.numVecOpsExecutedMFMAF64 += num_active_lanes;
        }
    }
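    // Aside (sketch): the three fatal() checks above enforce that an
    // ALU op carries exactly one FP-width tag; the same invariant in a
    // single hypothetical check:
    //
    //   if ((int)ii->isF16() + (int)ii->isF32() + (int)ii->isF64() > 1)
    //       fatal("Instruction tagged with more than one FP width.");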
// ...

// updateInstStats(): flat ops are classified only after their segment
// is resolved; SC_PRIVATE is accounted with global memory.
    bool flat_as_gm = false;
    bool flat_as_lm = false;
    if (ii->isFlat()) {
        flat_as_gm = (ii->executedAs() == enums::SC_GLOBAL) ||
                     (ii->executedAs() == enums::SC_PRIVATE);
        flat_as_lm = (ii->executedAs() == enums::SC_GROUP);
    }
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() || ii->isNop() ||
        (ii->isKernArgSeg() && ii->isLoad()) ||
        ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) {
        if (!ii->isScalar()) {
            // ...
        }
        // ...
    } else if (ii->isBarrier()) {
        // ...
    } else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) {
        if (!ii->isScalar()) {
            // ...
        }
        // ...
    } else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) {
        if (!ii->isScalar()) {
            // ...
        }
        // ...
    } else if ((ii->isAtomic() || ii->isMemSync()) &&
               (ii->isGlobalMem() || flat_as_gm)) {
        if (!ii->isScalar()) {
            // ...
        }
        // ...
    } else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) {
        // ...
    } else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
        // ...
    } else if ((ii->isAtomic() || ii->isMemSync()) &&
               (ii->isLocalMem() || flat_as_lm)) {
        // ...
    } else {
        panic("Bad instruction type!\n");
    }
// ...

// setWaitCnts(): install the decoded s_waitcnt fields.
    assert(vm_wait_cnt >= 0);
    assert(exp_wait_cnt >= 0);
    assert(lgkm_wait_cnt >= 0);
    // ...
    assert(vm_wait_cnt <= 0xf);
    assert(exp_wait_cnt <= 0x7);
    assert(lgkm_wait_cnt <= 0x1f);
    // ...
    // A counter at its all-ones value means "don't wait on this class".
    if (vm_wait_cnt != 0xf)
        vmWaitCnt = vm_wait_cnt;
    if (exp_wait_cnt != 0x7)
        expWaitCnt = exp_wait_cnt;
    if (lgkm_wait_cnt != 0x1f)
        lgkmWaitCnt = lgkm_wait_cnt;
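// Aside (sketch; the exact simm16 layout differs across gfx
// generations): the three counters have different widths -- vm 4 bits,
// exp 3 bits, lgkm 5 bits -- which is why 0xf, 0x7, and 0x1f are the
// "don't wait" sentinels filtered out above. Decoding under an assumed
// lgkm[12:8] | exp[6:4] | vm[3:0] layout:
//
//   int vm_wait_cnt   =  simm16        & 0xf;
//   int exp_wait_cnt  = (simm16 >> 4)  & 0x7;
//   int lgkm_wait_cnt = (simm16 >> 8)  & 0x1f;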
// ...
    assert(bar_id < computeUnit->numBarrierSlots());
// ...

Wavefront::WavefrontStats::WavefrontStats(statistics::Group *parent)
    : statistics::Group(parent),
      ADD_STAT(numInstrExecuted,
               "number of instructions executed by this WF slot"),
      ADD_STAT(schCycles, "number of cycles spent in schedule stage"),
      ADD_STAT(schStalls, "number of cycles WF is stalled in SCH stage"),
      ADD_STAT(schRfAccessStalls, "number of cycles wave selected in SCH but "
               "RF denied adding instruction"),
      ADD_STAT(schResourceStalls, "number of cycles stalled in sch by resource"
               " not available"),
      ADD_STAT(schOpdNrdyStalls, "number of cycles stalled in sch waiting for "
               "RF reads to complete"),
      ADD_STAT(schLdsArbStalls,
               "number of cycles wave stalled due to LDS-VRF arbitration"),
      ADD_STAT(numTimesBlockedDueWAXDependencies, "number of times the wf's "
               "instructions are blocked due to WAW or WAR dependencies"),
      ADD_STAT(numTimesBlockedDueRAWDependencies, "number of times the wf's "
               "instructions are blocked due to RAW dependencies"),
      ADD_STAT(vecRawDistance,
               "Count of RAW distance in dynamic instructions for this WF"),
      ADD_STAT(readsPerWrite, "Count of Vector reads per write for this WF")
{
    // ...
}