#include "debug/GPUExec.hh"
#include "debug/GPUInitAbi.hh"
#include "debug/WavefrontStack.hh"
Wavefront::Wavefront(const Params &p)
    : SimObject(p), wfSlotId(p.wf_slot_id), simdId(p.simdId),
      maxIbSize(p.max_ib_size), _gpuISA(*this),
      vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
      vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
      sleepCnt(0), barId(WFBarrier::InvalidID), stats(this)
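{
    // The -1 sentinels for vmWaitCnt/expWaitCnt/lgkmWaitCnt mean "no
    // s_waitcnt pending"; real values are installed by setWaitCnts().
    // WFBarrier::InvalidID likewise marks a wave with no barrier slot.
    // ...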
    for (int i = 0; i < 3; ++i) {
        // ... (loop body elided in this excerpt)
    }
    // ...
}

// In Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems),
// while seeding the wave's initial SGPR values:
    uint32_t firstWave = 0;
    int orderedAppendTerm = 0;
    int numWfsInWg = 0;
    uint32_t finalValue = 0;
    Addr hidden_priv_base(0);
    // The private segment buffer is a four-dword buffer resource
    // descriptor; the source logs this message once per dword written
    // (four times in all).
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting PrivateSegBuffer: s[%d] = %x\n",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
            task->amdQueue.scratch_resource_descriptor[0]);
    // DispatchPtr: the 64-bit packet address is split across two SGPRs,
    // low word first.
    computeUnit->srf[simdId]->write(physSgprIdx,
            bits(host_disp_pkt_addr, 31, 0));
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting DispatchPtr: s[%d] = %x\n",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
            bits(host_disp_pkt_addr, 31, 0));
    // ... (map the next SGPR, then write the high word)
    computeUnit->srf[simdId]->write(physSgprIdx,
            bits(host_disp_pkt_addr, 63, 32));
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting DispatchPtr: s[%d] = %x\n",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
            bits(host_disp_pkt_addr, 63, 32));
    // QueuePtr: the same low/high split of task->hostAMDQueueAddr.
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting QueuePtr: s[%d] = %x\n",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
            bits(task->hostAMDQueueAddr, 31, 0));
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting QueuePtr: s[%d] = %x\n",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
            bits(task->hostAMDQueueAddr, 63, 32));
    // KernargSegPtr: low/high split of kernarg_addr.
    computeUnit->srf[simdId]->write(physSgprIdx,
            bits(kernarg_addr, 31, 0));
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting KernargSegPtr: s[%d] = %x\n",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
            bits(kernarg_addr, 31, 0));
    // ...
    computeUnit->srf[simdId]->write(physSgprIdx,
            bits(kernarg_addr, 63, 32));
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting KernargSegPtr: s[%d] = %x\n",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
            bits(kernarg_addr, 63, 32));
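    // Illustrative sketch (not in the source): bits(val, first, last)
    // extracts the inclusive bitfield [first:last] and right-justifies
    // it, so splitting a 64-bit pointer works like this (the address is
    // a made-up example value):
    //
    //   uint64_t addr = 0x00007fffdeadbeefULL;
    //   uint32_t lo = bits(addr, 31, 0);   // 0xdeadbeef -> s[n]
    //   uint32_t hi = bits(addr, 63, 32);  // 0x00007fff -> s[n+1]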
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting DispatchId: s[%d] = %x\n",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
            task->dispatchId());
    // FlatScratchInit: low 32 bits of the queue's scratch backing store.
    computeUnit->srf[simdId]->write(physSgprIdx,
            (TheGpuISA::ScalarRegU32)(task->amdQueue
                .scratch_backing_memory_location & 0xffffffff));
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting FlatScratch Addr: s[%d] = %x\n",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
            (TheGpuISA::ScalarRegU32)(task->amdQueue
                .scratch_backing_memory_location & 0xffffffff));
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting FlatScratch size: s[%d] = %x\n",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx
            /* , scratch size value (elided in this excerpt) */);
    // ...
    // Hidden private base: the 32-bit base from the first descriptor
    // dword plus the 16 valid high-address bits from the second dword.
    hidden_priv_base =
        (uint64_t)task->amdQueue.scratch_resource_descriptor[0] |
        (((uint64_t)task->amdQueue.scratch_resource_descriptor[1]
            & 0x000000000000ffff) << 32);
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting private segment size: s[%d] = %x\n",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
            task->privMemPerItem());
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting WG ID X: s[%d] = %x\n",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
            workGroupId[0]);
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting WG ID Y: s[%d] = %x\n",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
            workGroupId[1]);
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting WG ID Z: s[%d] = %x\n",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
            workGroupId[2]);
    if (task->gfxVersion() == GfxVersion::gfx942) {
        // gfx942 uses architected flat scratch rather than passing the
        // scratch base through user SGPRs.
        // ...
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting architected flat scratch = %x\n",
                computeUnit->cu_id, simdId, wfSlotId, wfDynId
                /* , flat scratch base (elided in this excerpt) */);
    }
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting Private Seg Offset: s[%d] = %x\n",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx
            /* , per-wave scratch offset (elided in this excerpt) */);
    // Compose the work-group info SGPR.
    firstWave = (wfId == 0) ? 1 : 0;
    numWfsInWg = divCeil(wgSizeInWorkItems, computeUnit->wfSize());
    finalValue = firstWave << ((sizeof(uint32_t) * 8) - 1);
    finalValue |= (orderedAppendTerm << 6);
    finalValue |= numWfsInWg;
    // ...
    computeUnit->srf[simdId]->write(physSgprIdx, finalValue);
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting WG Info: s[%d] = %x\n",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId, physSgprIdx,
            finalValue);
    fatal("SGPR enable bit %i not supported\n", en_bit);

    // VGPR initialization: gfx90a and gfx942 pack the X/Y/Z work-item
    // IDs into one VGPR; other targets use up to three.
    bool packed_work_item_id = false;

    if (task->gfxVersion() == GfxVersion::gfx90a ||
        task->gfxVersion() == GfxVersion::gfx942) {
        packed_work_item_id = true;
    }
    if (packed_work_item_id) {
        TheGpuISA::VecRegContainerU32 raw_vgpr;
        TheGpuISA::VecElemU32 *packed_vgpr
            = raw_vgpr.as<TheGpuISA::VecElemU32>();

        for (int lane = 0; lane < workItemId[0].size(); ++lane) {
            packed_vgpr[lane] = workItemId[0][lane] & 0x3ff;
        }
        for (int lane = 0; lane < workItemId[1].size(); ++lane) {
            packed_vgpr[lane] |= ((workItemId[1][lane] & 0x3ff) << 10);
        }
        for (int lane = 0; lane < workItemId[2].size(); ++lane) {
            packed_vgpr[lane] |= ((workItemId[2][lane] & 0x3ff) << 20);
        }
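        // Field layout that follows from the shifts above: each lane's
        // packed VGPR holds X in [9:0], Y in [19:10], Z in [29:20].
        // Unpacking (illustrative):
        //   uint32_t x = packed & 0x3ff;
        //   uint32_t y = (packed >> 10) & 0x3ff;
        //   uint32_t z = (packed >> 20) & 0x3ff;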
        // ... (the packed VGPR is then mapped and written)
    }

    // Unpacked path: one VGPR per enabled dimension.
    uint32_t physVgprIdx = 0;
    TheGpuISA::VecRegContainerU32 raw_vgpr;
    // ...
    TheGpuISA::VecElemU32 *vgpr_x
        = raw_vgpr.as<TheGpuISA::VecElemU32>();
    for (int lane = 0; lane < workItemId[0].size(); ++lane) {
        vgpr_x[lane] = workItemId[0][lane];
    }
    // ...
    TheGpuISA::VecElemU32 *vgpr_y
        = raw_vgpr.as<TheGpuISA::VecElemU32>();
    for (int lane = 0; lane < workItemId[1].size(); ++lane) {
        vgpr_y[lane] = workItemId[1][lane];
    }
    // ...
    physVgprIdx = computeUnit->registerManager->mapVgpr(this, regInitIdx);
    TheGpuISA::VecElemU32 *vgpr_z
        = raw_vgpr.as<TheGpuISA::VecElemU32>();
    for (int lane = 0; lane < workItemId[2].size(); ++lane) {
        vgpr_z[lane] = workItemId[2][lane];
    }
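    // Note: each unpacked dimension consumes its own VGPR (mapped via
    // registerManager->mapVgpr), while the packed path above needs just
    // one VGPR for all three IDs.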
598 "CU%d has been idle for %d ticks at tick %d",

// In isGmInstruction()/isLmInstruction(): flat ops are classified by the
// segment they actually resolved to at execution time.
    if (ii->isGlobalMem() ||
        (ii->isFlat() && ii->executedAs() == enums::SC_GLOBAL)) {
        return true;
    }
    // ...
    if (ii->isLocalMem() ||
        (ii->isFlat() && ii->executedAs() == enums::SC_GROUP)) {
        return true;
    }
    // ...
    if (ii->isWaitcnt()) {
        assert(ii->isScalar());  // waitcnt is a scalar instruction

// isOldestInstScalarALU():
    if (status != S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn()
        || ii->isEndOfKernel() || ii->isBranch() || ii->isALU() ||
        (ii->isKernArgSeg() && ii->isLoad()))) {
        return true;
    }
// isOldestInstVectorALU(): the same test, minus the isScalar() requirement.
    if (status != S_STOPPED && !ii->isScalar() && (ii->isNop() ||
        ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel()
        || (ii->isKernArgSeg() && ii->isLoad()))) {
        return true;
    }
    // ...
    if (ii->isReturn() || ii->isBranch() ||
        ii->isEndOfKernel()) {
        // ...
    }
806 "Negative requests in pipe for WF%d for slot%d"
807 " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d,"
808 " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d,"
809 " Outstanding Reqs=%d\n",

// In reserveGmResource(): tally the global-memory requests this
// instruction will put into the pipeline.
    if (!ii->isScalar()) {
        if (ii->isLoad()) {
            rdGmReqsInPipe++;
        } else if (ii->isStore()) {
            wrGmReqsInPipe++;
        } else if (ii->isAtomic() || ii->isMemSync()) {
            rdGmReqsInPipe++;
            wrGmReqsInPipe++;
        } else {
            panic("Invalid memory operation!\n");
        }
    } else {
        if (ii->isLoad()) {
            scalarRdGmReqsInPipe++;
        } else if (ii->isStore()) {
            scalarWrGmReqsInPipe++;
        } else if (ii->isAtomic() || ii->isMemSync()) {
            scalarWrGmReqsInPipe++;
            scalarRdGmReqsInPipe++;
        } else {
            panic("Invalid memory operation!\n");
        }
    }

// In reserveLmResource(): LDS (shared-memory) accesses are vector-only.
    fatal_if(ii->isScalar(),
             "Scalar instructions can not access Shared memory!!!");
    if (ii->isLoad()) {
        rdLmReqsInPipe++;
    } else if (ii->isStore()) {
        wrLmReqsInPipe++;
    } else if (ii->isAtomic() || ii->isMemSync()) {
        wrLmReqsInPipe++;
        rdLmReqsInPipe++;
    } else {
        panic("Invalid memory operation!\n");
    }

// In reserveResources(): choose and reserve the execution unit(s) the
// oldest instruction needs.
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() || ii->isNop() ||
        (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
        ii->isReturn() || ii->isEndOfKernel()) {
        if (!ii->isScalar()) {
            // ... (reserve this wave's vector ALU)
        } else {
            // ... (reserve a scalar ALU)
        }
    } else if (ii->isBarrier()) {
        // ...
    } else if (ii->isFlat()) {
        assert(!ii->isScalar());
        // ... (a flat op may need both the LDS and global-memory pipes)
    } else if (ii->isGlobalMem()) {
        reserveGmResource(ii);
        // ...
    } else if (ii->isLocalMem()) {
        reserveLmResource(ii);
        // ...
    } else if (ii->isPrivateSeg()) {
        fatal_if(ii->isScalar(),
                 "Scalar instructions can not access Private memory!!!");
        reserveGmResource(ii);
    } else {
        panic("reserveResources -> Couldn't process op!\n");
    }

    assert(execUnitIds.size());
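    // reserveResources() hands the collected unit IDs back to the
    // schedule stage as a std::vector<int> (see its declaration); the
    // assert above guarantees every decodable instruction claimed at
    // least one execution unit.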

// In exec():
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
            "(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId,
            wfDynId, ii->disassemble(), old_pc, ii->seqNum());
    // ...
    if (!ii->isScalar()) {
        // ... (per-SIMD vector-instruction bookkeeping)
    }
    // ...
    // Track read-after-write distance over the instruction's source and
    // destination virtual vector registers (rawDist/vecReads bookkeeping).
    for (const auto& srcVecOp : ii->srcVecRegOperands()) {
        for (const auto& virtIdx : srcVecOp.virtIndices()) {
            // ...
        }
    }
    // ...
    for (const auto& dstVecOp : ii->dstVecRegOperands()) {
        for (const auto& virtIdx : dstVecOp.virtIndices()) {
            // ...
        }
    }
    if (pc() == old_pc) {
        // The instruction did not redirect the PC: advance to the next
        // instruction in the buffer.
        // ...
    } else {
        DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave%d %s taken branch\n",
                computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                ii->disassemble());
        // ... (discard the now-stale fetched instructions)
    }
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId, pc());
    const int num_active_lanes = execMask().count();
    // ...
    stats.numVecOpsExecuted += num_active_lanes;
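    // The per-precision counters below also scale by num_active_lanes:
    // they count per-lane vector operations, whereas numInstrExecuted
    // counts whole instructions.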
    if (ii->isF16() && ii->isALU()) {
        if (ii->isF32() || ii->isF64()) {
            fatal("Instruction is tagged as both (1) F16, and (2)"
                  " either F32 or F64.");
        }
        if (ii->isFMA()) {
            stats.numVecOpsExecutedFMA16 += num_active_lanes;
            stats.numVecOpsExecutedTwoOpFP += num_active_lanes;
        } else if (ii->isMAC()) {
            stats.numVecOpsExecutedMAC16 += num_active_lanes;
            stats.numVecOpsExecutedTwoOpFP += num_active_lanes;
        } else if (ii->isMAD()) {
            stats.numVecOpsExecutedMAD16 += num_active_lanes;
            stats.numVecOpsExecutedTwoOpFP += num_active_lanes;
        } else if (ii->isMFMA()) {
            stats.numVecOpsExecutedMFMAF16 += num_active_lanes;
        }
    }
    if (ii->isF32() && ii->isALU()) {
        if (ii->isF16() || ii->isF64()) {
            fatal("Instruction is tagged as both (1) F32, and (2)"
                  " either F16 or F64.");
        }
        if (ii->isFMA()) {
            stats.numVecOpsExecutedFMA32 += num_active_lanes;
            stats.numVecOpsExecutedTwoOpFP += num_active_lanes;
        } else if (ii->isMAC()) {
            stats.numVecOpsExecutedMAC32 += num_active_lanes;
            stats.numVecOpsExecutedTwoOpFP += num_active_lanes;
        } else if (ii->isMAD()) {
            stats.numVecOpsExecutedMAD32 += num_active_lanes;
            stats.numVecOpsExecutedTwoOpFP += num_active_lanes;
        } else if (ii->isMFMA()) {
            stats.numVecOpsExecutedMFMAF32 += num_active_lanes;
        }
    }
    if (ii->isF64() && ii->isALU()) {
        if (ii->isF16() || ii->isF32()) {
            fatal("Instruction is tagged as both (1) F64, and (2)"
                  " either F16 or F32.");
        }
        if (ii->isFMA()) {
            stats.numVecOpsExecutedFMA64 += num_active_lanes;
            stats.numVecOpsExecutedTwoOpFP += num_active_lanes;
        } else if (ii->isMAC()) {
            stats.numVecOpsExecutedMAC64 += num_active_lanes;
            stats.numVecOpsExecutedTwoOpFP += num_active_lanes;
        } else if (ii->isMAD()) {
            stats.numVecOpsExecutedMAD64 += num_active_lanes;
            stats.numVecOpsExecutedTwoOpFP += num_active_lanes;
        } else if (ii->isMFMA()) {
            stats.numVecOpsExecutedMFMAF64 += num_active_lanes;
        }
    }

// In updateInstStats(): decide whether a flat access behaves as global
// or LDS traffic.
    bool flat_as_gm = false;
    bool flat_as_lm = false;
    if (ii->isFlat()) {
        flat_as_gm = (ii->executedAs() == enums::SC_GLOBAL) ||
                     (ii->executedAs() == enums::SC_PRIVATE);
        flat_as_lm = (ii->executedAs() == enums::SC_GROUP);
    }
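    // Note: SC_PRIVATE lands in the "global" bucket, consistent with the
    // model routing private/scratch traffic through the global-memory
    // pipeline rather than the LDS.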
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() || ii->isNop() ||
        (ii->isKernArgSeg() && ii->isLoad()) ||
        ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) {
        if (!ii->isScalar()) {
            // ... (vector ALU cycle accounting)
        } else {
            // ... (scalar ALU cycle accounting)
        }
    } else if (ii->isBarrier()) {
        // ...
    } else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) {
        if (!ii->isScalar()) {
            // ... (vector global-memory read accounting)
        } else {
            // ... (scalar memory read accounting)
        }
    } else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) {
        if (!ii->isScalar()) {
            // ...
        } else {
            // ...
        }
    } else if ((ii->isAtomic() || ii->isMemSync()) &&
               (ii->isGlobalMem() || flat_as_gm)) {
        if (!ii->isScalar()) {
            // ...
        } else {
            // ...
        }
    } else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) {
        // ... (LDS read accounting)
    } else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
        // ...
    } else if ((ii->isAtomic() || ii->isMemSync()) &&
               (ii->isLocalMem() || flat_as_lm)) {
        // ...
    } else {
        panic("Bad instruction type!\n");
    }

// In setWaitCnts(vm_wait_cnt, exp_wait_cnt, lgkm_wait_cnt):
    assert(vm_wait_cnt >= 0);
    assert(exp_wait_cnt >= 0);
    assert(lgkm_wait_cnt >= 0);
    // Each field saturates at its maximum encodable value, which means
    // "do not wait on this counter".
    assert(vm_wait_cnt <= 0xf);
    assert(exp_wait_cnt <= 0x7);
    assert(lgkm_wait_cnt <= 0x1f);
    // ...
    if (vm_wait_cnt != 0xf)
        vmWaitCnt = vm_wait_cnt;

    if (exp_wait_cnt != 0x7)
        expWaitCnt = exp_wait_cnt;

    if (lgkm_wait_cnt != 0x1f)
        lgkmWaitCnt = lgkm_wait_cnt;
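    // Sketch (assumption: the standard GCN/Vega s_waitcnt immediate
    // layout, consistent with the 4/3/5-bit maxima asserted above; not
    // code from this file): extracting the three counters from simm16.
    //
    //   int vm_cnt   = bits(simm16, 3, 0);   // vector memory ops
    //   int exp_cnt  = bits(simm16, 6, 4);   // exports
    //   int lgkm_cnt = bits(simm16, 12, 8);  // LDS/GDS/constant/message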

// ...
    assert(bar_id < computeUnit->numBarrierSlots());

Wavefront::WavefrontStats::WavefrontStats(statistics::Group *parent)
    : statistics::Group(parent),
      ADD_STAT(numInstrExecuted,
               "number of instructions executed by this WF slot"),
      ADD_STAT(schCycles, "number of cycles spent in schedule stage"),
      ADD_STAT(schStalls, "number of cycles WF is stalled in SCH stage"),
      ADD_STAT(schRfAccessStalls, "number of cycles wave selected in SCH but "
               "RF denied adding instruction"),
      ADD_STAT(schResourceStalls, "number of cycles stalled in sch by resource"
               " not available"),
      ADD_STAT(schOpdNrdyStalls, "number of cycles stalled in sch waiting for "
               "RF reads to complete"),
      ADD_STAT(schLdsArbStalls,
               "number of cycles wave stalled due to LDS-VRF arbitration"),
      ADD_STAT(numTimesBlockedDueWAXDependencies, "number of times the wf's "
               "instructions are blocked due to WAW or WAR dependencies"),
      ADD_STAT(numTimesBlockedDueRAWDependencies, "number of times the wf's "
               "instructions are blocked due to RAW dependencies"),
      ADD_STAT(vecRawDistance,
               "Count of RAW distance in dynamic instructions for this WF"),
      ADD_STAT(readsPerWrite, "Count of Vector reads per write for this WF")