35 #include "debug/GPUExec.hh"
36 #include "debug/GPUInitAbi.hh"
37 #include "debug/WavefrontStack.hh"
49 :
SimObject(
p), wfSlotId(
p.wf_slot_id), simdId(
p.simdId),
50 maxIbSize(
p.max_ib_size), _gpuISA(*this),
51 vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
52 vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
53 sleepCnt(0), barId(
WFBarrier::InvalidID), stats(this)
91 for (
int i = 0;
i < 3; ++
i) {
127 uint32_t wiCount = 0;
128 uint32_t firstWave = 0;
129 int orderedAppendTerm = 0;
131 uint32_t finalValue = 0;
134 Addr hidden_priv_base(0);
143 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
144 "Setting PrivateSegBuffer: s[%d] = %x\n",
154 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
155 "Setting PrivateSegBuffer: s[%d] = %x\n",
165 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
166 "Setting PrivateSegBuffer: s[%d] = %x\n",
177 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
178 "Setting PrivateSegBuffer: s[%d] = %x\n",
187 bits(host_disp_pkt_addr, 31, 0));
189 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
190 "Setting DispatchPtr: s[%d] = %x\n",
193 bits(host_disp_pkt_addr, 31, 0));
198 bits(host_disp_pkt_addr, 63, 32));
199 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
200 "Setting DispatchPtr: s[%d] = %x\n",
203 bits(host_disp_pkt_addr, 63, 32));
213 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
214 "Setting QueuePtr: s[%d] = %x\n",
223 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
224 "Setting QueuePtr: s[%d] = %x\n",
235 bits(kernarg_addr, 31, 0));
237 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
238 "Setting KernargSegPtr: s[%d] = %x\n",
241 bits(kernarg_addr, 31, 0));
246 bits(kernarg_addr, 63, 32));
247 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
248 "Setting KernargSegPtr: s[%d] = %x\n",
251 bits(kernarg_addr, 63, 32));
261 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
262 "Setting DispatchId: s[%d] = %x\n",
274 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
275 "Setting FlatScratch Addr: s[%d] = %x\n",
288 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
289 "Setting FlatScratch size: s[%d] = %x\n",
319 & 0x000000000000ffff) << 32);
330 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
331 "Setting private segment size: s[%d] = %x\n",
345 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
346 "Setting num WG X: s[%d] = %x\n",
359 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
360 "Setting num WG Y: s[%d] = %x\n",
373 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
374 "Setting num WG Z: s[%d] = %x\n",
385 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
386 "Setting WG ID X: s[%d] = %x\n",
397 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
398 "Setting WG ID Y: s[%d] = %x\n",
409 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
410 "Setting WG ID Z: s[%d] = %x\n",
435 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
436 "Setting Private Seg Offset: s[%d] = %x\n",
443 firstWave = (
wfId == 0) ? 1 : 0;
444 numWfsInWg =
divCeil(wgSizeInWorkItems,
446 finalValue = firstWave << ((
sizeof(uint32_t) * 8) - 1);
447 finalValue |= (orderedAppendTerm << 6);
448 finalValue |= numWfsInWg;
452 write(physSgprIdx, finalValue);
455 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
456 "Setting WG Info: s[%d] = %x\n",
461 fatal(
"SGPR enable bit %i not supported\n", en_bit);
473 uint32_t physVgprIdx = 0;
484 for (
int lane = 0; lane <
workItemId[0].size(); ++lane) {
500 for (
int lane = 0; lane <
workItemId[1].size(); ++lane) {
512 mapVgpr(
this, regInitIdx);
516 for (
int lane = 0; lane <
workItemId[2].size(); ++lane) {
567 "CU%d has been idle for %d ticks at tick %d",
592 if (ii->isGlobalMem() ||
593 (ii->isFlat() && ii->executedAs() == enums::SC_GLOBAL)) {
603 if (ii->isLocalMem() ||
604 (ii->isFlat() && ii->executedAs() == enums::SC_GROUP)) {
633 if (ii->isWaitcnt()) {
635 assert(ii->isScalar());
648 if (
status !=
S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn()
649 || ii->isEndOfKernel() || ii->isBranch() || ii->isALU() ||
650 (ii->isKernArgSeg() && ii->isLoad()))) {
664 ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel()
665 || (ii->isKernArgSeg() && ii->isLoad()))) {
755 if (ii->isReturn() || ii->isBranch() ||
756 ii->isEndOfKernel()) {
775 "Negative requests in pipe for WF%d for slot%d"
776 " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d,"
777 " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d,"
778 " Outstanding Reqs=%d\n",
786 if (!ii->isScalar()) {
789 }
else if (ii->isStore()) {
791 }
else if (ii->isAtomic() || ii->isMemSync()) {
795 panic(
"Invalid memory operation!\n");
801 }
else if (ii->isStore()) {
803 }
else if (ii->isAtomic() || ii->isMemSync()) {
807 panic(
"Invalid memory operation!\n");
817 "Scalar instructions can not access Shared memory!!!");
820 }
else if (ii->isStore()) {
822 }
else if (ii->isAtomic() || ii->isMemSync()) {
826 panic(
"Invalid memory operation!\n");
843 if (ii->isALU() || ii->isSpecialOp() ||
844 ii->isBranch() || ii->isNop() ||
845 (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
846 ii->isReturn() || ii->isEndOfKernel()) {
847 if (!ii->isScalar()) {
853 }
else if (ii->isBarrier()) {
855 }
else if (ii->isFlat()) {
856 assert(!ii->isScalar());
866 }
else if (ii->isGlobalMem()) {
868 }
else if (ii->isLocalMem()) {
870 }
else if (ii->isPrivateSeg()) {
872 "Scalar instructions can not access Private memory!!!");
875 panic(
"reserveResources -> Couldn't process op!\n");
881 assert(execUnitIds.size());
915 DPRINTF(GPUExec,
"CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
917 wfDynId, ii->disassemble(), old_pc, ii->seqNum());
927 if (!ii->isScalar()) {
953 for (
const auto& srcVecOp : ii->srcVecRegOperands()) {
954 for (
const auto& virtIdx : srcVecOp.virtIndices()) {
965 for (
const auto& dstVecOp : ii->dstVecRegOperands()) {
966 for (
const auto& virtIdx : dstVecOp.virtIndices()) {
980 if (
pc() == old_pc) {
985 DPRINTF(GPUExec,
"CU%d: WF[%d][%d]: wave%d %s taken branch\n",
990 DPRINTF(GPUExec,
"CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n",
994 const int num_active_lanes =
execMask().count();
998 if (ii->isF16() && ii->isALU()) {
999 if (ii->isF32() || ii->isF64()) {
1000 fatal(
"Instruction is tagged as both (1) F16, and (2)"
1001 "either F32 or F64.");
1007 += num_active_lanes;
1009 else if (ii->isMAC()) {
1012 += num_active_lanes;
1014 else if (ii->isMAD()) {
1017 += num_active_lanes;
1020 if (ii->isF32() && ii->isALU()) {
1021 if (ii->isF16() || ii->isF64()) {
1022 fatal(
"Instruction is tagged as both (1) F32, and (2)"
1023 "either F16 or F64.");
1029 += num_active_lanes;
1031 else if (ii->isMAC()) {
1034 += num_active_lanes;
1036 else if (ii->isMAD()) {
1039 += num_active_lanes;
1042 if (ii->isF64() && ii->isALU()) {
1043 if (ii->isF16() || ii->isF32()) {
1044 fatal(
"Instruction is tagged as both (1) F64, and (2)"
1045 "either F16 or F32.");
1051 += num_active_lanes;
1053 else if (ii->isMAC()) {
1056 += num_active_lanes;
1058 else if (ii->isMAD()) {
1061 += num_active_lanes;
1083 bool flat_as_gm =
false;
1084 bool flat_as_lm =
false;
1086 flat_as_gm = (ii->executedAs() == enums::SC_GLOBAL) ||
1087 (ii->executedAs() == enums::SC_PRIVATE);
1088 flat_as_lm = (ii->executedAs() == enums::SC_GROUP);
1093 if (ii->isALU() || ii->isSpecialOp() ||
1094 ii->isBranch() || ii->isNop() ||
1095 (ii->isKernArgSeg() && ii->isLoad()) ||
1096 ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) {
1098 if (!ii->isScalar()) {
1106 }
else if (ii->isBarrier()) {
1110 }
else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) {
1111 if (!ii->isScalar()) {
1127 }
else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) {
1128 if (!ii->isScalar()) {
1143 }
else if ((ii->isAtomic() || ii->isMemSync()) &&
1144 (ii->isGlobalMem() || flat_as_gm)) {
1145 if (!ii->isScalar()) {
1161 }
else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) {
1169 }
else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
1177 }
else if ((ii->isAtomic() || ii->isMemSync()) &&
1178 (ii->isLocalMem() || flat_as_lm)) {
1186 panic(
"Bad instruction type!\n");
1301 assert(vm_wait_cnt >= 0);
1302 assert(exp_wait_cnt >= 0);
1303 assert(lgkm_wait_cnt >= 0);
1306 assert(vm_wait_cnt <= 0xf);
1307 assert(exp_wait_cnt <= 0x7);
1308 assert(lgkm_wait_cnt <= 0x1f);
1326 if (vm_wait_cnt != 0xf)
1329 if (exp_wait_cnt != 0x7)
1332 if (lgkm_wait_cnt != 0x1f)
1441 assert(bar_id < computeUnit->numBarrierSlots());
1464 : statistics::
Group(parent),
1466 "number of instructions executed by this WF slot"),
1467 ADD_STAT(schCycles,
"number of cycles spent in schedule stage"),
1468 ADD_STAT(schStalls,
"number of cycles WF is stalled in SCH stage"),
1469 ADD_STAT(schRfAccessStalls,
"number of cycles wave selected in SCH but "
1470 "RF denied adding instruction"),
1471 ADD_STAT(schResourceStalls,
"number of cycles stalled in sch by resource"
1473 ADD_STAT(schOpdNrdyStalls,
"number of cycles stalled in sch waiting for "
1474 "RF reads to complete"),
1476 "number of cycles wave stalled due to LDS-VRF arbitration"),
1478 ADD_STAT(numTimesBlockedDueWAXDependencies,
"number of times the wf's "
1479 "instructions are blocked due to WAW or WAR dependencies"),
1481 ADD_STAT(numTimesBlockedDueRAWDependencies,
"number of times the wf's "
1482 "instructions are blocked due to RAW dependencies"),
1484 "Count of RAW distance in dynamic instructions for this WF"),
1485 ADD_STAT(readsPerWrite,
"Count of Vector reads per write for this WF")
Tick cyclesToTicks(Cycles c) const
int mapWaveToScalarAlu(Wavefront *w) const
std::vector< WaitClass > scalarALUs
Cycles srf_scm_bus_latency
std::vector< uint64_t > instExecPerSimd
std::unordered_set< uint64_t > pipeMap
void updateInstStats(GPUDynInstPtr gpuDynInst)
WaitClass vectorGlobalMemUnit
Cycles vrf_gm_bus_latency
int mapWaveToGlobalMem(Wavefront *w) const
int mapWaveToLocalMem(Wavefront *w) const
WaitClass vrfToLocalMemPipeBus
Cycles vrf_lm_bus_latency
WaitClass srfToScalarMemPipeBus
std::vector< uint64_t > lastExecCycle
std::vector< ScalarRegisterFile * > srf
std::vector< WaitClass > vectorALUs
int mapWaveToScalarMem(Wavefront *w) const
RegisterManager * registerManager
WaitClass vectorSharedMemUnit
int mapWaveToScalarAluGlobalIdx(Wavefront *w) const
TokenManager * getTokenManager()
std::vector< VectorRegisterFile * > vrf
WaitClass vrfToGlobalMemPipeBus
void deleteFromPipeMap(Wavefront *w)
gem5::ComputeUnit::ComputeUnitStats stats
Cycles is a wrapper class for representing cycle counts, i.e.
FetchUnit & fetchUnit(int simdId)
void flushBuf(int wfSlotId)
_amd_queue_t amdQueue
Keep a copy of the AMD HSA queue because we need info from some of its fields to initialize register ...
bool sgprBitEnabled(int bit) const
Addr hostDispPktAddr() const
int wgSize(int dim) const
Addr hostAMDQueueAddr
Host-side addr of the amd_queue_t on which this task was queued.
bool vgprBitEnabled(int bit) const
int privMemPerItem() const
int gridSize(int dim) const
int mapVgpr(Wavefront *w, int vgprIndex)
std::vector< PoolManager * > vrfPoolMgrs
int mapSgpr(Wavefront *w, int sgprIndex)
void initShHiddenPrivateBase(Addr queueBase, uint32_t offset)
void incVectorInstDstOperand(int num_operands)
void incVectorInstSrcOperand(int num_operands)
Abstract superclass for simulation objects.
void recvTokens(int num_tokens)
Increment the number of available tokens by num_tokens.
static const int InvalidID
bool isOldestInstWaitcnt()
void reserveGmResource(GPUDynInstPtr ii)
std::vector< Addr > lastAddr
void setStatus(status_e newStatus)
void validateRequestCounters()
bool isOldestInstPrivMem()
bool isOldestInstScalarMem()
Wavefront(const Params &p)
bool isOldestInstBarrier()
void resizeRegFiles(int num_vregs, int num_sregs)
int scalarOutstandingReqsWrGm
std::vector< uint32_t > oldVgpr
void initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
void setSleepTime(int sleep_time)
ComputeUnit * computeUnit
std::vector< uint32_t > workItemFlatId
int vmWaitCnt
The following are used for waitcnt instructions. vmWaitCnt: once set, we wait for the outstanding numbe...
std::vector< int > vecReads
std::deque< GPUDynInstPtr > instructionBuffer
bool isLmInstruction(GPUDynInstPtr ii)
GPUDynInstPtr nextInstr()
std::vector< uint32_t > workItemId[3]
std::vector< uint64_t > oldDgpr
bool isOldestInstScalarALU()
bool isOldestInstFlatMem()
void decVMemInstsIssued()
void computeActualWgSz(HSAQueueEntry *task)
void setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt)
std::unordered_map< int, uint64_t > rawDist
std::vector< int > reserveResources()
void decLGKMInstsIssued()
void incLGKMInstsIssued()
virtual void init()
init() is called after all C++ SimObjects have been created and all ports are connected.
bool isOldestInstVectorALU()
int scalarOutstandingReqsRdGm
void incVMemInstsIssued()
void reserveLmResource(GPUDynInstPtr ii)
@ S_BARRIER
WF is stalled at a barrier.
@ S_WAITCNT
wavefront has unsatisfied wait counts
gem5::Wavefront::WavefrontStats stats
void freeRegisterFile()
Freeing VRF space.
bool isGmInstruction(GPUDynInstPtr ii)
void start(uint64_t _wfDynId, uint64_t _base_ptr)
TheGpuISA::GPUISA _gpuISA
void sample(const U &v, int n=1)
Add a value to the distribution n times.
Distribution & init(Counter min, Counter max, Counter bkt)
Set the parameters of this distribution.
Counter value() const
Return the current value of this stat as its base type.
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
static constexpr T divCeil(const T &a, const U &b)
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
#define panic(...)
This implements a cprintf based panic() function.
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
#define fatal(...)
This implements a cprintf based fatal() function.
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
VecRegContainer< sizeof(VecElemU32) *NumVecElemPerVecReg > VecRegContainerU32
const FlagsType none
Nothing extra to print.
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Tick curTick()
The universal simulation clock.
uint64_t Addr
Address type. This will probably be moved somewhere else in the near future.
std::bitset< std::numeric_limits< unsigned long long >::digits > VectorMask
statistics::Scalar numVecOpsExecutedF64
statistics::Scalar numVecOpsExecuted
statistics::Distribution activeLanesPerLMemInstrDist
statistics::VectorDistribution instInterleave
statistics::Scalar numVecOpsExecutedMAC64
statistics::Vector instCyclesVMemPerSimd
statistics::Scalar numVecOpsExecutedTwoOpFP
statistics::Scalar numVecOpsExecutedF32
statistics::Scalar numVecOpsExecutedFMA64
statistics::Distribution controlFlowDivergenceDist
statistics::Scalar numVecOpsExecutedMAC16
statistics::Scalar numInstrExecuted
statistics::Scalar numVecOpsExecutedF16
statistics::Scalar numVecOpsExecutedMAD64
statistics::Scalar numVecOpsExecutedFMA32
statistics::Scalar numVecOpsExecutedMAD16
statistics::Scalar numVecOpsExecutedMAC32
statistics::Scalar numVecOpsExecutedFMA16
statistics::Scalar numVecOpsExecutedMAD32
statistics::Vector instCyclesLdsPerSimd
statistics::Vector instCyclesScMemPerSimd
statistics::Distribution activeLanesPerGMemInstrDist
statistics::Scalar totalCycles
statistics::Distribution execRateDist
WavefrontStats(statistics::Group *parent)
statistics::Distribution vecRawDistance
statistics::Distribution readsPerWrite
statistics::Scalar numInstrExecuted
uint32_t scratch_workitem_byte_size
uint32_t compute_tmpring_size_wavesize
uint64_t scratch_backing_memory_location
uint32_t scratch_resource_descriptor[4]