#include "debug/GPUExec.hh"
#include "debug/GPUInitAbi.hh"
#include "debug/WavefrontStack.hh"
// ...
Wavefront::Wavefront(const Params &p)
    : SimObject(p), wfSlotId(p.wf_slot_id), simdId(p.simdId),
      maxIbSize(p.max_ib_size), _gpuISA(*this),
      vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
      vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
      sleepCnt(0), barId(WFBarrier::InvalidID), stats(this)
{
    // ...
    for (int i = 0; i < 3; ++i) {
        // ...
    }
    // ...
}

void
Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
{
    // ...
    uint32_t wiCount = 0;
    uint32_t firstWave = 0;
    int orderedAppendTerm = 0;
    // ...
    uint32_t finalValue = 0;
    // ...
    Addr hidden_priv_base(0);
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting PrivateSegBuffer: s[%d] = %x\n", /* ... */);
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting PrivateSegBuffer: s[%d] = %x\n", /* ... */);
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting PrivateSegBuffer: s[%d] = %x\n", /* ... */);
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting PrivateSegBuffer: s[%d] = %x\n", /* ... */);
    // ...
    computeUnit->srf[simdId]->write(physSgprIdx,
            bits(host_disp_pkt_addr, 31, 0));
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting DispatchPtr: s[%d] = %x\n",
            /* ..., */ bits(host_disp_pkt_addr, 31, 0));
    // ...
    computeUnit->srf[simdId]->write(physSgprIdx,
            bits(host_disp_pkt_addr, 63, 32));
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting DispatchPtr: s[%d] = %x\n",
            /* ..., */ bits(host_disp_pkt_addr, 63, 32));
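The DispatchPtr case above shows a pattern this file repeats for QueuePtr and KernargSegPtr: a 64-bit host pointer is split with bits() into two 32-bit halves and written into consecutive scalar registers. A minimal, self-contained sketch of that split (example address; plain shifts and masks stand in for gem5's bits() helper):

#include <cstdint>
#include <cstdio>

int main()
{
    // A 64-bit host dispatch-packet address (example value).
    const uint64_t host_disp_pkt_addr = 0x00007fffdeadbeefULL;

    // bits(addr, 31, 0) and bits(addr, 63, 32) right-justify the
    // low and high dwords; written here as plain shifts/masks.
    const uint32_t lo = (uint32_t)(host_disp_pkt_addr & 0xffffffffULL);
    const uint32_t hi = (uint32_t)(host_disp_pkt_addr >> 32);

    // The wave then sees the pointer in two consecutive SGPRs.
    std::printf("s[n]   = %#x\n", lo);   // 0xdeadbeef
    std::printf("s[n+1] = %#x\n", hi);   // 0x7fff
}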
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting QueuePtr: s[%d] = %x\n", /* ... */);
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting QueuePtr: s[%d] = %x\n", /* ... */);
    // ...
    computeUnit->srf[simdId]->write(physSgprIdx,
            bits(kernarg_addr, 31, 0));
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting KernargSegPtr: s[%d] = %x\n",
            /* ..., */ bits(kernarg_addr, 31, 0));
    // ...
    computeUnit->srf[simdId]->write(physSgprIdx,
            bits(kernarg_addr, 63, 32));
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting KernargSegPtr: s[%d] = %x\n",
            /* ..., */ bits(kernarg_addr, 63, 32));
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting DispatchId: s[%d] = %x\n", /* ... */);
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting FlatScratch Addr: s[%d] = %x\n", /* ... */);
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting FlatScratch size: s[%d] = %x\n", /* ... */);
    // Hidden private base for flat scratch: SRD word 0 holds the low
    // 32 bits of the queue's scratch base address; the low 16 bits of
    // SRD word 1 hold address bits 47:32.
    hidden_priv_base =
        (uint64_t)task->amdQueue.scratch_resource_descriptor[0] |
        (((uint64_t)task->amdQueue.scratch_resource_descriptor[1]
            & 0x000000000000ffff) << 32);
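A worked instance of that address assembly; the SRD word values below are made up for illustration:

#include <cstdint>
#include <cstdio>

int main()
{
    // Illustrative scratch-resource-descriptor words: word 0 holds
    // base[31:0]; the low 16 bits of word 1 hold base[47:32].
    const uint32_t srd0 = 0x12340000;    // base[31:0]
    const uint32_t srd1 = 0x800000ffu;   // flags | base[47:32]

    const uint64_t hidden_priv_base =
        (uint64_t)srd0 |
        (((uint64_t)srd1 & 0x000000000000ffffULL) << 32);

    // Only the low 16 bits of srd1 survive the mask: 0xff12340000.
    std::printf("hidden_priv_base = %#llx\n",
                (unsigned long long)hidden_priv_base);
}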
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting private segment size: s[%d] = %x\n", /* ... */);
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting num WG X: s[%d] = %x\n", /* ... */);
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting num WG Y: s[%d] = %x\n", /* ... */);
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting num WG Z: s[%d] = %x\n", /* ... */);
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting WG ID X: s[%d] = %x\n", /* ... */);
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting WG ID Y: s[%d] = %x\n", /* ... */);
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting WG ID Z: s[%d] = %x\n", /* ... */);
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting Private Seg Offset: s[%d] = %x\n", /* ... */);
    firstWave = (wfId == 0) ? 1 : 0;
    numWfsInWg = divCeil(wgSizeInWorkItems, wfSize());
    // ...
    finalValue = firstWave << ((sizeof(uint32_t) * 8) - 1);
    finalValue |= (orderedAppendTerm << 6);
    finalValue |= numWfsInWg;
    // ...
    computeUnit->srf[simdId]->write(physSgprIdx, finalValue);
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting WG Info: s[%d] = %x\n", /* ... */);
    // ...
    fatal("SGPR enable bit %i not supported\n", en_bit);
    uint32_t physVgprIdx = 0;
    // ...
    for (int lane = 0; lane < workItemId[0].size(); ++lane) {
        // ...
    }
    // ...
    for (int lane = 0; lane < workItemId[1].size(); ++lane) {
        // ...
    }
    // ...
    physVgprIdx = computeUnit->registerManager->mapVgpr(this, regInitIdx);
    // ...
    for (int lane = 0; lane < workItemId[2].size(); ++lane) {
        // ...
    }
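These loops seed the first vector registers with each lane's X, Y, and Z work-item IDs when the corresponding VGPR enable bits are set. A sketch of the per-lane ID layout for a small workgroup; the row-major lane-to-work-item mapping (X fastest) is the usual HSA convention, assumed here for illustration:

#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    // Hypothetical 4x2x1 workgroup mapped onto one 8-lane wave.
    const int szX = 4, szY = 2;
    const int waveSize = 8;

    std::vector<uint32_t> workItemId[3];
    for (auto &v : workItemId)
        v.resize(waveSize);

    // Lane l holds work-item l of the WG (row-major: X fastest).
    for (int lane = 0; lane < waveSize; ++lane) {
        workItemId[0][lane] = lane % szX;
        workItemId[1][lane] = (lane / szX) % szY;
        workItemId[2][lane] = lane / (szX * szY);
    }

    for (int lane = 0; lane < waveSize; ++lane)
        std::printf("lane %d: (%u, %u, %u)\n", lane,
                    (unsigned)workItemId[0][lane],
                    (unsigned)workItemId[1][lane],
                    (unsigned)workItemId[2][lane]);
}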
575 "CU%d has been idle for %d ticks at tick %d",
// isGmInstruction(): FLAT ops executed in the global segment count
// as global memory.
if (ii->isGlobalMem() ||
    (ii->isFlat() && ii->executedAs() == enums::SC_GLOBAL)) {
// ...
// isLmInstruction(): FLAT ops executed in the group segment count
// as local (LDS) memory.
if (ii->isLocalMem() ||
    (ii->isFlat() && ii->executedAs() == enums::SC_GROUP)) {
// isOldestInstWaitcnt(): waitcnt is a scalar operation.
if (ii->isWaitcnt()) {
    assert(ii->isScalar());
// isOldestInstScalarALU()
if (status != S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn()
    || ii->isEndOfKernel() || ii->isBranch() || ii->isALU() ||
    (ii->isKernArgSeg() && ii->isLoad()))) {
// ...
// isOldestInstVectorALU()
if (status != S_STOPPED && !ii->isScalar() && (ii->isNop() ||
    ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel()
    || (ii->isKernArgSeg() && ii->isLoad()))) {
// ...
if (ii->isReturn() || ii->isBranch() ||
    ii->isEndOfKernel()) {
783 "Negative requests in pipe for WF%d for slot%d"
784 " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d,"
785 " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d,"
786 " Outstanding Reqs=%d\n",
// reserveGmResource(): vector vs. scalar global-memory ops reserve
// different request counters.
if (!ii->isScalar()) {
    if (ii->isLoad()) {
        // ...
    } else if (ii->isStore()) {
        // ...
    } else if (ii->isAtomic() || ii->isMemSync()) {
        // ...
    } else {
        panic("Invalid memory operation!\n");
    }
} else {
    if (ii->isLoad()) {
        // ...
    } else if (ii->isStore()) {
        // ...
    } else if (ii->isAtomic() || ii->isMemSync()) {
        // ...
    } else {
        panic("Invalid memory operation!\n");
    }
}
825 "Scalar instructions can not access Shared memory!!!");
828 }
else if (ii->isStore()) {
830 }
else if (ii->isAtomic() || ii->isMemSync()) {
834 panic(
"Invalid memory operation!\n");
// reserveResources(): pick and reserve the execution unit(s) for the
// oldest instruction.
if (ii->isALU() || ii->isSpecialOp() ||
    ii->isBranch() || ii->isNop() ||
    (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
    ii->isReturn() || ii->isEndOfKernel()) {
    if (!ii->isScalar()) {
        // ...
    } else {
        // ...
    }
} else if (ii->isBarrier()) {
    // ...
} else if (ii->isFlat()) {
    assert(!ii->isScalar());
    // ...
} else if (ii->isGlobalMem()) {
    // ...
} else if (ii->isLocalMem()) {
    // ...
} else if (ii->isPrivateSeg()) {
    fatal_if(ii->isScalar(),
             "Scalar instructions can not access Private memory!!!");
    // ...
} else {
    panic("reserveResources -> Couldn't process op!\n");
}
// ...
assert(execUnitIds.size());
// Wavefront::exec()
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
        "(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId,
        wfDynId, ii->disassemble(), old_pc, ii->seqNum());
if (!ii->isScalar()) {
for (const auto& srcVecOp : ii->srcVecRegOperands()) {
    for (const auto& virtIdx : srcVecOp.virtIndices()) {
        // ...
    }
}
// ...
for (const auto& dstVecOp : ii->dstVecRegOperands()) {
    for (const auto& virtIdx : dstVecOp.virtIndices()) {
        // ...
    }
}
if (pc() == old_pc) {
    // ...
} else {
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave%d %s taken branch\n",
            /* ... */);
}
// ...
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n", /* ... */);
const int num_active_lanes = execMask().count();
if (ii->isF16() && ii->isALU()) {
    if (ii->isF32() || ii->isF64()) {
        fatal("Instruction is tagged as both (1) F16, and (2) "
              "either F32 or F64.");
    }
    if (ii->isFMA()) {
        stats.numVecOpsExecutedFMA16 += num_active_lanes;
        // ...
    } else if (ii->isMAC()) {
        stats.numVecOpsExecutedMAC16 += num_active_lanes;
        // ...
    } else if (ii->isMAD()) {
        stats.numVecOpsExecutedMAD16 += num_active_lanes;
        // ...
    }
}
if (ii->isF32() && ii->isALU()) {
    if (ii->isF16() || ii->isF64()) {
        fatal("Instruction is tagged as both (1) F32, and (2) "
              "either F16 or F64.");
    }
    if (ii->isFMA()) {
        stats.numVecOpsExecutedFMA32 += num_active_lanes;
        // ...
    } else if (ii->isMAC()) {
        stats.numVecOpsExecutedMAC32 += num_active_lanes;
        // ...
    } else if (ii->isMAD()) {
        stats.numVecOpsExecutedMAD32 += num_active_lanes;
        // ...
    }
}
if (ii->isF64() && ii->isALU()) {
    if (ii->isF16() || ii->isF32()) {
        fatal("Instruction is tagged as both (1) F64, and (2) "
              "either F16 or F32.");
    }
    if (ii->isFMA()) {
        stats.numVecOpsExecutedFMA64 += num_active_lanes;
        // ...
    } else if (ii->isMAC()) {
        stats.numVecOpsExecutedMAC64 += num_active_lanes;
        // ...
    } else if (ii->isMAD()) {
        stats.numVecOpsExecutedMAD64 += num_active_lanes;
        // ...
    }
}
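num_active_lanes comes from popcounting the execution mask, so diverged lanes do not inflate the per-lane op counts. A self-contained sketch of that counting, with std::bitset standing in for the wave's VectorMask:

#include <bitset>
#include <cstdint>
#include <cstdio>

int main()
{
    // Stand-in for execMask(): one bit per lane of a 64-wide wave.
    std::bitset<64> execMask;
    execMask.set();      // all 64 lanes active...
    execMask.reset(3);   // ...except lane 3 (branch divergence)

    const int num_active_lanes = execMask.count();   // 63

    // Per-lane op accounting as in the F16/F32/F64 blocks above:
    // each executed vector FP instruction adds one op per active lane.
    uint64_t numVecOpsExecutedF32 = 0;
    numVecOpsExecutedF32 += num_active_lanes;

    std::printf("active lanes = %d, F32 vec ops = %llu\n",
                num_active_lanes,
                (unsigned long long)numVecOpsExecutedF32);
}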
bool flat_as_gm = false;
bool flat_as_lm = false;
if (ii->isFlat()) {
    flat_as_gm = (ii->executedAs() == enums::SC_GLOBAL) ||
                 (ii->executedAs() == enums::SC_PRIVATE);
    flat_as_lm = (ii->executedAs() == enums::SC_GROUP);
}
if (ii->isALU() || ii->isSpecialOp() ||
    ii->isBranch() || ii->isNop() ||
    (ii->isKernArgSeg() && ii->isLoad()) ||
    ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) {
    if (!ii->isScalar()) {
        // ...
    }
} else if (ii->isBarrier()) {
    // ...
} else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) {
    if (!ii->isScalar()) {
        // ...
    }
} else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) {
    if (!ii->isScalar()) {
        // ...
    }
} else if ((ii->isAtomic() || ii->isMemSync()) &&
           (ii->isGlobalMem() || flat_as_gm)) {
    if (!ii->isScalar()) {
        // ...
    }
} else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) {
    // ...
} else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
    // ...
} else if ((ii->isAtomic() || ii->isMemSync()) &&
           (ii->isLocalMem() || flat_as_lm)) {
    // ...
} else {
    panic("Bad instruction type!\n");
}
void
Wavefront::setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt)
{
    assert(vm_wait_cnt >= 0);
    assert(exp_wait_cnt >= 0);
    assert(lgkm_wait_cnt >= 0);
    // ...
    assert(vm_wait_cnt <= 0xf);
    assert(exp_wait_cnt <= 0x7);
    assert(lgkm_wait_cnt <= 0x1f);
    // ...
    // A field at its maximum encodable value (0xf, 0x7, 0x1f) means
    // "do not wait" on that counter.
    if (vm_wait_cnt != 0xf)
        vmWaitCnt = vm_wait_cnt;
    if (exp_wait_cnt != 0x7)
        expWaitCnt = exp_wait_cnt;
    if (lgkm_wait_cnt != 0x1f)
        lgkmWaitCnt = lgkm_wait_cnt;
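The three ranges correspond to the field widths of the s_waitcnt immediate. A decoding sketch, assuming the GCN3-style layout (vmcnt in bits 3:0, expcnt in bits 6:4, lgkmcnt in bits 12:8); other GPU ISAs place these fields differently:

#include <cstdint>
#include <cstdio>

struct WaitCnts { unsigned vm, exp, lgkm; };

static WaitCnts decodeWaitcnt(uint16_t simm16)
{
    return {
        (unsigned)(simm16 & 0xf),           // vmcnt:   bits 3:0
        (unsigned)((simm16 >> 4) & 0x7),    // expcnt:  bits 6:4
        (unsigned)((simm16 >> 8) & 0x1f),   // lgkmcnt: bits 12:8
    };
}

int main()
{
    // "Wait until no vector-memory or LDS/GDS/scalar ops remain":
    // vm = 0, lgkm = 0, exp left at 0x7 (its "don't wait" value).
    const uint16_t imm = (0x7 << 4);
    const WaitCnts w = decodeWaitcnt(imm);
    std::printf("vm=%u exp=%u lgkm=%u\n", w.vm, w.exp, w.lgkm);
    // setWaitCnts() would then skip expWaitCnt (0x7 == don't wait)
    // and arm vmWaitCnt/lgkmWaitCnt at 0.
}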
// ...
assert(bar_id < computeUnit->numBarrierSlots());
Wavefront::WavefrontStats::WavefrontStats(statistics::Group *parent)
    : statistics::Group(parent),
      ADD_STAT(numInstrExecuted,
               "number of instructions executed by this WF slot"),
      ADD_STAT(schCycles, "number of cycles spent in schedule stage"),
      ADD_STAT(schStalls, "number of cycles WF is stalled in SCH stage"),
      ADD_STAT(schRfAccessStalls, "number of cycles wave selected in SCH but "
               "RF denied adding instruction"),
      ADD_STAT(schResourceStalls, "number of cycles stalled in sch by resource"
               " not available"),
      ADD_STAT(schOpdNrdyStalls, "number of cycles stalled in sch waiting for "
               "RF reads to complete"),
      ADD_STAT(schLdsArbStalls,
               "number of cycles wave stalled due to LDS-VRF arbitration"),
      ADD_STAT(numTimesBlockedDueWAXDependencies, "number of times the wf's "
               "instructions are blocked due to WAW or WAR dependencies"),
      ADD_STAT(numTimesBlockedDueRAWDependencies, "number of times the wf's "
               "instructions are blocked due to RAW dependencies"),
      ADD_STAT(vecRawDistance,
               "Count of RAW distance in dynamic instructions for this WF"),
      ADD_STAT(readsPerWrite, "Count of Vector reads per write for this WF")
{
    // ...
}
std::vector< uint32_t > workItemFlatId
Tick curTick()
The universal simulation clock.
#define fatal(...)
This implements a cprintf based fatal() function.
std::vector< int > vecReads
std::vector< Addr > lastAddr
bool isOldestInstFlatMem()
void computeActualWgSz(HSAQueueEntry *task)
statistics::VectorDistribution instInterleave
statistics::Scalar numVecOpsExecutedTwoOpFP
std::vector< uint64_t > lastExecCycle
void flushBuf(int wfSlotId)
void setSleepTime(int sleep_time)
int privMemPerItem() const
bool isOldestInstBarrier()
std::vector< ScalarRegisterFile * > srf
void incVectorInstDstOperand(int num_operands)
std::vector< uint32_t > oldVgpr
std::vector< uint64_t > oldDgpr
statistics::Scalar numVecOpsExecutedF64
std::bitset< std::numeric_limits< unsigned long long >::digits > VectorMask
gem5::ComputeUnit::ComputeUnitStats stats
bool isOldestInstVectorALU()
void setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt)
WaitClass srfToScalarMemPipeBus
statistics::Vector instCyclesScMemPerSimd
void initShHiddenPrivateBase(Addr queueBase, uint32_t offset)
WavefrontStats(statistics::Group *parent)
void initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
statistics::Scalar numVecOpsExecutedFMA32
WaitClass vrfToGlobalMemPipeBus
std::vector< PoolManager * > vrfPoolMgrs
statistics::Distribution activeLanesPerLMemInstrDist
TheGpuISA::GPUISA _gpuISA
statistics::Scalar numVecOpsExecutedMAD64
const FlagsType none
Nothing extra to print.
std::unordered_map< int, uint64_t > rawDist
statistics::Scalar numVecOpsExecutedMAC32
statistics::Scalar numVecOpsExecutedMAD32
void sample(const U &v, int n=1)
Add a value to the distribution n times.
std::vector< VectorRegisterFile * > vrf
statistics::Vector instCyclesVMemPerSimd
_amd_queue_t amdQueue
Keep a copy of the AMD HSA queue because we need info from some of its fields to initialize register ...
std::vector< uint32_t > workItemId[3]
VecRegContainer< sizeof(VecElemU32) * NumVecElemPerVecReg > VecRegContainerU32
Cycles is a wrapper class for representing cycle counts, i.e.
bool isGmInstruction(GPUDynInstPtr ii)
void setStatus(status_e newStatus)
int mapSgpr(Wavefront *w, int sgprIndex)
void start(uint64_t _wfDynId, uint64_t _base_ptr)
void freeRegisterFile()
Freeing VRF space.
void validateRequestCounters()
Distribution & init(Counter min, Counter max, Counter bkt)
Set the parameters of this distribution.
statistics::Scalar numVecOpsExecuted
static const int InvalidID
Tick cyclesToTicks(Cycles c) const
WaitClass vectorSharedMemUnit
statistics::Distribution activeLanesPerGMemInstrDist
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
void resizeRegFiles(int num_vregs, int num_sregs)
void incLGKMInstsIssued()
bool vgprBitEnabled(int bit) const
statistics::Scalar numVecOpsExecutedMAC64
int scalarOutstandingReqsRdGm
RegisterManager * registerManager
statistics::Scalar numInstrExecuted
@ S_BARRIER
WF is stalled at a barrier.
Addr hostDispPktAddr() const
int scalarOutstandingReqsWrGm
Cycles vrf_lm_bus_latency
Wavefront(const Params &p)
int wgSize(int dim) const
uint32_t scratch_workitem_byte_size
uint32_t scratch_resource_descriptor[4]
bool isOldestInstWaitcnt()
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it. A usage sketch appears after this list.
int mapWaveToScalarMem(Wavefront *w) const
int mapWaveToGlobalMem(Wavefront *w) const
void deleteFromPipeMap(Wavefront *w)
Abstract superclass for simulation objects.
void reserveLmResource(GPUDynInstPtr ii)
int gridSize(int dim) const
std::vector< WaitClass > scalarALUs
GPUDynInstPtr nextInstr()
std::vector< uint64_t > instExecPerSimd
Addr hostAMDQueueAddr
Host-side addr of the amd_queue_t on which this task was queued.
ComputeUnit * computeUnit
uint64_t Addr
Address type. This will probably be moved somewhere else in the near future.
TokenManager * getTokenManager()
statistics::Scalar numVecOpsExecutedF16
statistics::Scalar numInstrExecuted
FetchUnit & fetchUnit(int simdId)
int mapVgpr(Wavefront *w, int vgprIndex)
int mapWaveToScalarAlu(Wavefront *w) const
std::shared_ptr< GPUDynInst > GPUDynInstPtr
void decVMemInstsIssued()
std::unordered_set< uint64_t > pipeMap
static constexpr T divCeil(const T &a, const U &b)
Ceiling of integer division; a usage sketch appears after this list.
void updateInstStats(GPUDynInstPtr gpuDynInst)
statistics::Vector instCyclesLdsPerSimd
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
void decLGKMInstsIssued()
int vmWaitCnt
The following are used for waitcnt instructions. vmWaitCnt: once set, we wait for the outstanding numbe...
statistics::Scalar numVecOpsExecutedMAD16
std::vector< int > reserveResources()
statistics::Scalar numVecOpsExecutedMAC16
virtual void init()
init() is called after all C++ SimObjects have been created and all ports are connected.
Cycles vrf_gm_bus_latency
statistics::Distribution controlFlowDivergenceDist
int mapWaveToLocalMem(Wavefront *w) const
statistics::Scalar numVecOpsExecutedFMA64
@ S_WAITCNT
wavefront has unsatisfied wait counts
bool sgprBitEnabled(int bit) const
void recvTokens(int num_tokens)
Increment the number of available tokens by num_tokens.
WaitClass vrfToLocalMemPipeBus
statistics::Distribution execRateDist
statistics::Scalar numVecOpsExecutedF32
std::vector< WaitClass > vectorALUs
bool isOldestInstScalarALU()
statistics::Distribution readsPerWrite
std::deque< GPUDynInstPtr > instructionBuffer
bool isLmInstruction(GPUDynInstPtr ii)
gem5::Wavefront::WavefrontStats stats
bool isOldestInstPrivMem()
void incVMemInstsIssued()
uint64_t scratch_backing_memory_location
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
uint32_t compute_tmpring_size_wavesize
WaitClass vectorGlobalMemUnit
statistics::Scalar totalCycles
int mapWaveToScalarAluGlobalIdx(Wavefront *w) const
statistics::Distribution vecRawDistance
void incVectorInstSrcOperand(int num_operands)
void reserveGmResource(GPUDynInstPtr ii)
Cycles srf_scm_bus_latency
bool isOldestInstScalarMem()
statistics::Scalar numVecOpsExecutedFMA16
#define panic(...)
This implements a cprintf based panic() function.
Counter value() const
Return the current value of this stat as its base type.
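Two of the helpers referenced above are small enough to pin down: bits() (src/base/bitfield.hh) and divCeil() (src/base/intmath.hh). A sketch matching their documented semantics, checked against the values initRegState() relies on:

#include <cassert>
#include <cstdint>

// bits(val, first, last): extract bits first..last (both inclusive,
// first >= last == MSB..LSB) and right-justify the result.
template <typename T>
constexpr T bits(T val, unsigned first, unsigned last)
{
    const unsigned nbits = first - last + 1;
    return (val >> last) & (T(~T(0)) >> (sizeof(T) * 8 - nbits));
}

// divCeil(a, b): ceiling of integer division.
template <typename T, typename U>
constexpr T divCeil(const T &a, const U &b)
{
    return (a + b - 1) / b;
}

int main()
{
    const uint64_t addr = 0xdeadbeefcafef00dULL;
    assert(bits(addr, 31, 0) == 0xcafef00dULL);    // low dword
    assert(bits(addr, 63, 32) == 0xdeadbeefULL);   // high dword
    assert(divCeil(256, 64) == 4);   // exact multiple
    assert(divCeil(257, 64) == 5);   // rounds up
    return 0;
}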
Generated on Sun Jul 30 2023 01:56:57 for gem5 by doxygen 1.8.17