Go to the documentation of this file.
36 #include "debug/GPUExec.hh"
37 #include "debug/GPUInitAbi.hh"
38 #include "debug/WavefrontStack.hh"
47 WavefrontParams::create()
53 :
SimObject(
p), wfSlotId(
p->wf_slot_id), simdId(
p->simdId),
54 maxIbSize(
p->max_ib_size), _gpuISA(*this),
55 vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
56 vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
95 for (
int i = 0;
i < 3; ++
i) {
112 .
name(
name() +
".timesBlockedDueWAXDependencies")
113 .
desc(
"number of times the wf's instructions are blocked due to WAW "
114 "or WAR dependencies")
119 .
name(
name() +
".timesBlockedDueRAWDependencies")
120 .
desc(
"number of times the wf's instructions are blocked due to RAW "
125 .
name(
name() +
".num_instr_executed")
126 .
desc(
"number of instructions executed by this WF slot")
131 .
desc(
"number of cycles spent in schedule stage")
136 .
desc(
"number of cycles WF is stalled in SCH stage")
140 .
name(
name() +
".sch_rf_access_stalls")
141 .
desc(
"number of cycles wave selected in SCH but RF denied adding "
146 .
name(
name() +
".sch_resource_stalls")
147 .
desc(
"number of cycles stalled in sch by resource not available")
151 .
name(
name() +
".sch_opd_nrdy_stalls")
152 .
desc(
"number of cycles stalled in sch waiting for RF reads to "
157 .
name(
name() +
".sch_lds_arb_stalls")
158 .
desc(
"number of cycles wave stalled due to LDS-VRF arbitration")
164 .
desc(
"Count of RAW distance in dynamic instructions for this WF")
169 .
name(
name() +
".vec_reads_per_write")
170 .
desc(
"Count of Vector reads per write for this WF")
200 uint32_t wiCount = 0;
201 uint32_t firstWave = 0;
202 int orderedAppendTerm = 0;
204 uint32_t finalValue = 0;
207 Addr hidden_priv_base(0);
216 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
217 "Setting PrivateSegBuffer: s[%d] = %x\n",
227 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
228 "Setting PrivateSegBuffer: s[%d] = %x\n",
238 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
239 "Setting PrivateSegBuffer: s[%d] = %x\n",
250 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
251 "Setting PrivateSegBuffer: s[%d] = %x\n",
260 ((uint32_t*)&host_disp_pkt_addr)[0]);
262 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
263 "Setting DispatchPtr: s[%d] = %x\n",
266 ((uint32_t*)&host_disp_pkt_addr)[0]);
271 ((uint32_t*)&host_disp_pkt_addr)[1]);
272 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
273 "Setting DispatchPtr: s[%d] = %x\n",
276 ((uint32_t*)&host_disp_pkt_addr)[1]);
286 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
287 "Setting QueuePtr: s[%d] = %x\n",
296 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
297 "Setting QueuePtr: s[%d] = %x\n",
308 ((uint32_t*)&kernarg_addr)[0]);
310 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
311 "Setting KernargSegPtr: s[%d] = %x\n",
314 ((uint32_t*)kernarg_addr)[0]);
319 ((uint32_t*)&kernarg_addr)[1]);
320 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
321 "Setting KernargSegPtr: s[%d] = %x\n",
324 ((uint32_t*)kernarg_addr)[1]);
335 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
336 "Setting FlatScratch Addr: s[%d] = %x\n",
349 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
350 "Setting FlatScratch size: s[%d] = %x\n",
380 & 0x000000000000ffff) << 32);
394 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
395 "Setting num WG X: s[%d] = %x\n",
408 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
409 "Setting num WG Y: s[%d] = %x\n",
422 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
423 "Setting num WG Z: s[%d] = %x\n",
434 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
435 "Setting WG ID X: s[%d] = %x\n",
446 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
447 "Setting WG ID Y: s[%d] = %x\n",
458 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
459 "Setting WG ID Z: s[%d] = %x\n",
484 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
485 "Setting Private Seg Offset: s[%d] = %x\n",
492 firstWave = (
wfId == 0) ? 1 : 0;
493 numWfsInWg =
divCeil(wgSizeInWorkItems,
495 finalValue = firstWave << ((
sizeof(uint32_t) * 8) - 1);
496 finalValue |= (orderedAppendTerm << 6);
497 finalValue |= numWfsInWg;
501 write(physSgprIdx, finalValue);
504 DPRINTF(GPUInitAbi,
"CU%d: WF[%d][%d]: wave[%d] "
505 "Setting WG Info: s[%d] = %x\n",
510 fatal(
"SGPR enable bit %i not supported\n", en_bit);
522 uint32_t physVgprIdx = 0;
533 for (
int lane = 0; lane <
workItemId[0].size(); ++lane) {
549 for (
int lane = 0; lane <
workItemId[1].size(); ++lane) {
561 mapVgpr(
this, regInitIdx);
565 for (
int lane = 0; lane <
workItemId[2].size(); ++lane) {
616 "CU%d has been idle for %d ticks at tick %d",
641 if (ii->isGlobalMem() ||
642 (ii->isFlat() && ii->executedAs() == Enums::SC_GLOBAL)) {
652 if (ii->isLocalMem() ||
653 (ii->isFlat() && ii->executedAs() == Enums::SC_GROUP)) {
668 if (ii->isWaitcnt()) {
670 assert(ii->isScalar());
683 if (
status !=
S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn()
684 || ii->isEndOfKernel() || ii->isBranch() || ii->isALU() ||
685 (ii->isKernArgSeg() && ii->isLoad()))) {
699 ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel()
700 || (ii->isKernArgSeg() && ii->isLoad()))) {
790 if (ii->isReturn() || ii->isBranch() ||
791 ii->isEndOfKernel()) {
810 "Negative requests in pipe for WF%d for slot%d"
811 " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d,"
812 " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d,"
813 " Outstanding Reqs=%d\n",
821 if (!ii->isScalar()) {
824 }
else if (ii->isStore()) {
826 }
else if (ii->isAtomic() || ii->isMemSync()) {
830 panic(
"Invalid memory operation!\n");
836 }
else if (ii->isStore()) {
838 }
else if (ii->isAtomic() || ii->isMemSync()) {
842 panic(
"Invalid memory operation!\n");
852 "Scalar instructions can not access Shared memory!!!");
855 }
else if (ii->isStore()) {
857 }
else if (ii->isAtomic() || ii->isMemSync()) {
861 panic(
"Invalid memory operation!\n");
878 if (ii->isALU() || ii->isSpecialOp() ||
879 ii->isBranch() || ii->isNop() ||
880 (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
881 ii->isReturn() || ii->isEndOfKernel()) {
882 if (!ii->isScalar()) {
888 }
else if (ii->isBarrier()) {
890 }
else if (ii->isFlat()) {
891 assert(!ii->isScalar());
901 }
else if (ii->isGlobalMem()) {
903 }
else if (ii->isLocalMem()) {
905 }
else if (ii->isPrivateSeg()) {
907 "Scalar instructions can not access Private memory!!!");
910 panic(
"reserveResources -> Couldn't process op!\n");
916 assert(execUnitIds.size());
950 DPRINTF(GPUExec,
"CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
952 wfDynId, ii->disassemble(), old_pc, ii->seqNum());
962 if (!ii->isScalar()) {
986 for (
int i = 0;
i < ii->getNumOperands();
i++) {
987 if (ii->isVectorRegister(
i)) {
988 int vgpr = ii->getRegisterIndex(
i, ii);
989 int nReg = ii->getOperandSize(
i) <= 4 ? 1 :
990 ii->getOperandSize(
i) / 4;
991 for (
int n = 0;
n < nReg;
n++) {
992 if (ii->isSrcOperand(
i)) {
1000 }
else if (ii->isDstOperand(
i)) {
1016 if (
pc() == old_pc) {
1021 DPRINTF(GPUExec,
"CU%d: WF[%d][%d]: wave%d %s taken branch\n",
1026 DPRINTF(GPUExec,
"CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n",
1030 const int num_active_lanes =
execMask().count();
1034 if (ii->isF16() && ii->isALU()) {
1035 if (ii->isF32() || ii->isF64()) {
1036 fatal(
"Instruction is tagged as both (1) F16, and (2)"
1037 "either F32 or F64.");
1044 else if (ii->isMAC()) {
1048 else if (ii->isMAD()) {
1053 if (ii->isF32() && ii->isALU()) {
1054 if (ii->isF16() || ii->isF64()) {
1055 fatal(
"Instruction is tagged as both (1) F32, and (2)"
1056 "either F16 or F64.");
1063 else if (ii->isMAC()) {
1067 else if (ii->isMAD()) {
1072 if (ii->isF64() && ii->isALU()) {
1073 if (ii->isF16() || ii->isF32()) {
1074 fatal(
"Instruction is tagged as both (1) F64, and (2)"
1075 "either F16 or F32.");
1082 else if (ii->isMAC()) {
1086 else if (ii->isMAD()) {
1108 bool flat_as_gm =
false;
1109 bool flat_as_lm =
false;
1111 flat_as_gm = (ii->executedAs() == Enums::SC_GLOBAL) ||
1112 (ii->executedAs() == Enums::SC_PRIVATE);
1113 flat_as_lm = (ii->executedAs() == Enums::SC_GROUP);
1118 if (ii->isALU() || ii->isSpecialOp() ||
1119 ii->isBranch() || ii->isNop() ||
1120 (ii->isKernArgSeg() && ii->isLoad()) ||
1121 ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) {
1123 if (!ii->isScalar()) {
1131 }
else if (ii->isBarrier()) {
1135 }
else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) {
1136 if (!ii->isScalar()) {
1152 }
else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) {
1153 if (!ii->isScalar()) {
1168 }
else if ((ii->isAtomic() || ii->isMemSync()) &&
1169 (ii->isGlobalMem() || flat_as_gm)) {
1170 if (!ii->isScalar()) {
1186 }
else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) {
1194 }
else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
1202 }
else if ((ii->isAtomic() || ii->isMemSync()) &&
1203 (ii->isLocalMem() || flat_as_lm)) {
1211 panic(
"Bad instruction type!\n");
1300 assert(vm_wait_cnt >= 0);
1301 assert(exp_wait_cnt >= 0);
1302 assert(lgkm_wait_cnt >= 0);
1305 assert(vm_wait_cnt <= 0xf);
1306 assert(exp_wait_cnt <= 0x7);
1307 assert(lgkm_wait_cnt <= 0x1f);
1325 if (vm_wait_cnt != 0xf)
1328 if (exp_wait_cnt != 0x7)
1331 if (lgkm_wait_cnt != 0x1f)
1440 assert(bar_id < computeUnit->numBarrierSlots());
Stats::Distribution controlFlowDivergenceDist
std::vector< WaitClass > vectorALUs
bool isOldestInstScalarALU()
uint32_t scratch_resource_descriptor[4]
#define fatal(...)
This implements a cprintf based fatal() function.
WaitClass vectorSharedMemUnit
bool isOldestInstFlatMem()
virtual void regStats()
Callback to set stat parameters.
Stats::Vector vectorInstDstOperand
void recvTokens(int num_tokens)
Increment the number of available tokens by num_tokens.
void deleteFromPipeMap(Wavefront *w)
Stats::Scalar numInstrExecuted
WaitClass vrfToGlobalMemPipeBus
int mapWaveToScalarAluGlobalIdx(Wavefront *w) const
uint32_t scratch_workitem_byte_size
Addr hostDispPktAddr() const
Stats::Scalar numVecOpsExecutedF16
void resizeRegFiles(int num_vregs, int num_sregs)
std::vector< int > vecReads
bool vgprBitEnabled(int bit) const
int mapWaveToGlobalMem(Wavefront *w) const
::VecRegT< VecElemU32, NumVecElemPerVecReg, false > VecRegU32
Cycles srf_scm_bus_latency
Stats::Vector instCyclesLdsPerSimd
void initShHiddenPrivateBase(Addr queueBase, uint32_t offset)
Cycles vrf_gm_bus_latency
std::vector< uint32_t > oldVgpr
int mapWaveToScalarMem(Wavefront *w) const
int mapWaveToLocalMem(Wavefront *w) const
Stats::Scalar schOpdNrdyStalls
Stats::Scalar totalCycles
Stats::Scalar numVecOpsExecutedMAD16
void start(uint64_t _wfDynId, uint64_t _base_ptr)
int wgSize(int dim) const
std::vector< Addr > lastAddr
void decVMemInstsIssued()
bool isOldestInstPrivMem()
int mapSgpr(Wavefront *w, int sgprIndex)
Stats::Scalar numVecOpsExecutedFMA32
bool isOldestInstScalarMem()
Stats::Distribution activeLanesPerLMemInstrDist
WaitClass vectorGlobalMemUnit
RegisterManager * registerManager
void reserveGmResource(GPUDynInstPtr ii)
Cycles vrf_lm_bus_latency
Stats::Scalar numVecOpsExecutedF64
@ S_BARRIER
WF is stalled at a barrier.
std::vector< uint32_t > workItemId[3]
bool sgprBitEnabled(int bit) const
Stats::Scalar numTimesBlockedDueWAXDependencies
const FlagsType none
Nothing extra to print.
Stats::Distribution vecRawDistance
int scalarOutstandingReqsWrGm
Stats::Scalar numVecOpsExecutedMAD64
std::vector< WaitClass > scalarALUs
int vmWaitCnt
the following are used for waitcnt instructions vmWaitCnt: once set, we wait for the outstanding numbe...
_amd_queue_t amdQueue
Keep a copy of the AMD HSA queue because we need info from some of its fields to initialize register ...
void setStatus(status_e newStatus)
Stats::Scalar numTimesBlockedDueRAWDependencies
Stats::Vector instCyclesScMemPerSimd
void flushBuf(int wfSlotId)
Stats::Scalar numVecOpsExecutedFMA64
ComputeUnit * computeUnit
T divCeil(const T &a, const U &b)
std::vector< ScalarRegisterFile * > srf
Tick cyclesToTicks(Cycles c) const
Stats::Scalar numVecOpsExecutedF32
Stats::Scalar numInstrExecuted
int scalarOutstandingReqsRdGm
Stats::Vector vectorInstSrcOperand
Counter value() const
Return the current value of this stat as its base type.
bool isOldestInstVectorALU()
Addr hostAMDQueueAddr
Host-side addr of the amd_queue_t on which this task was queued.
bool isOldestInstWaitcnt()
void setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt)
std::vector< PoolManager * > vrfPoolMgrs
std::vector< VectorRegisterFile * > vrf
std::vector< uint64_t > instExecPerSimd
@ S_WAITCNT
wavefront has unsatisfied wait counts
void regStats()
Callback to set stat parameters.
void freeRegisterFile()
Freeing VRF space.
Stats::VectorDistribution instInterleave
int gridSize(int dim) const
Stats::Scalar numVecOpsExecutedMAC32
FetchUnit & fetchUnit(int simdId)
Stats::Distribution execRateDist
void validateRequestCounters()
WaitClass vrfToLocalMemPipeBus
bool isLmInstruction(GPUDynInstPtr ii)
Stats::Scalar numVecOpsExecuted
std::vector< uint64_t > lastExecCycle
WaitClass srfToScalarMemPipeBus
Stats::Scalar numVecOpsExecutedMAD32
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Derived & name(const std::string &name)
Set the name and marks this stat to print at the end of simulation.
void computeActualWgSz(HSAQueueEntry *task)
GPUDynInstPtr nextInstr()
std::unordered_map< int, uint64_t > rawDist
void decLGKMInstsIssued()
TokenManager * getTokenManager()
virtual const std::string name() const
Stats::Scalar numVecOpsExecutedMAC64
Stats::Distribution readsPerWrite
Stats::Scalar schResourceStalls
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
bool isGmInstruction(GPUDynInstPtr ii)
void reserveLmResource(GPUDynInstPtr ii)
uint64_t scratch_backing_memory_location
Wavefront(const Params *p)
bool isOldestInstBarrier()
Stats::Scalar schLdsArbStalls
void updateInstStats(GPUDynInstPtr gpuDynInst)
Stats::Scalar numVecOpsExecutedTwoOpFP
Distribution & init(Counter min, Counter max, Counter bkt)
Set the parameters of this distribution.
Stats::Scalar numVecOpsExecutedMAC16
void initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
std::shared_ptr< GPUDynInst > GPUDynInstPtr
void sample(const U &v, int n=1)
Add a value to the distribution n times.
Cycles is a wrapper class for representing cycle counts, i.e.
uint32_t compute_tmpring_size_wavesize
Stats::Scalar schRfAccessStalls
virtual void init()
init() is called after all C++ SimObjects have been created and all ports are connected.
std::deque< GPUDynInstPtr > instructionBuffer
int mapVgpr(Wavefront *w, int vgprIndex)
void incVMemInstsIssued()
TheGpuISA::GPUISA _gpuISA
VecRegU32::Container VecRegContainerU32
std::vector< uint32_t > workItemFlatId
std::vector< uint64_t > oldDgpr
std::bitset< std::numeric_limits< unsigned long long >::digits > VectorMask
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
std::unordered_set< uint64_t > pipeMap
Derived & desc(const std::string &_desc)
Set the description and marks this stat to print at the end of simulation.
Stats::Vector instCyclesVMemPerSimd
int mapWaveToScalarAlu(Wavefront *w) const
static const int InvalidID
Stats::Distribution activeLanesPerGMemInstrDist
Stats::Scalar numVecOpsExecutedFMA16
std::vector< int > reserveResources()
#define panic(...)
This implements a cprintf based panic() function.
void incLGKMInstsIssued()
Tick curTick()
The current simulated tick.
Abstract superclass for simulation objects.
Generated on Wed Sep 30 2020 14:02:12 for gem5 by doxygen 1.8.17