#include "debug/GPUExec.hh"
#include "debug/GPUInitAbi.hh"
#include "debug/WavefrontStack.hh"
// ...

Wavefront::Wavefront(const Params &p)
    : SimObject(p), wfSlotId(p.wf_slot_id), simdId(p.simdId),
      maxIbSize(p.max_ib_size), _gpuISA(*this),
      vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
      vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
      sleepCnt(0), barId(WFBarrier::InvalidID), stats(this)
{
    // ...
    // per-dimension work-item id containers (x, y, z)
    for (int i = 0; i < 3; ++i) {
        // ...
    }
    // ...
}
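// Note: vmWaitCnt, expWaitCnt, and lgkmWaitCnt start at -1, which this
// model uses as a "no s_waitcnt pending" sentinel; setWaitCnts() further
// below only arms a counter whose encoded field is below that field's
// maximum value.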
void
Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
{
    // ...
    // scratch locals for the per-field switch below
    uint32_t wiCount = 0;
    uint32_t firstWave = 0;
    int orderedAppendTerm = 0;
    // ...
    uint32_t finalValue = 0;
    // ...
    Addr hidden_priv_base(0);
    // ...

    // PrivateSegBuf: the private segment buffer is a four-word resource
    // descriptor, written one word per SGPR; the same DPRINTF is emitted
    // for each of the four words:
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting PrivateSegBuffer: s[%d] = %x\n",
            /* ... cu id, simd id, slot id, wave id, sgpr idx, value ... */);
    // ... (repeated for descriptor words 1, 2, and 3)
    // DispatchPtr: 64-bit host address of the dispatch packet, written
    // low word then high word into consecutive SGPRs:
    computeUnit->srf[simdId]->write(physSgprIdx,
            bits(host_disp_pkt_addr, 31, 0));
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting DispatchPtr: s[%d] = %x\n",
            /* ... */, bits(host_disp_pkt_addr, 31, 0));
    // ...
    computeUnit->srf[simdId]->write(physSgprIdx,
            bits(host_disp_pkt_addr, 63, 32));
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting DispatchPtr: s[%d] = %x\n",
            /* ... */, bits(host_disp_pkt_addr, 63, 32));
    // QueuePtr: 64-bit address of the HSA queue object, low word then
    // high word:
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting QueuePtr: s[%d] = %x\n",
            /* ... low 32 bits ... */);
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting QueuePtr: s[%d] = %x\n",
            /* ... high 32 bits ... */);

    // KernargSegPtr: 64-bit pointer to the kernel argument segment:
    computeUnit->srf[simdId]->write(physSgprIdx,
            bits(kernarg_addr, 31, 0));
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting KernargSegPtr: s[%d] = %x\n",
            /* ... */, bits(kernarg_addr, 31, 0));
    // ...
    computeUnit->srf[simdId]->write(physSgprIdx,
            bits(kernarg_addr, 63, 32));
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting KernargSegPtr: s[%d] = %x\n",
            /* ... */, bits(kernarg_addr, 63, 32));

    // DispatchId: the unique id of this kernel dispatch:
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting DispatchId: s[%d] = %x\n",
            /* ... */);
    // FlatScratchInit: base address and per-work-item size of the scratch
    // backing store, read from the AMD HSA queue:
    computeUnit->srf[simdId]->write(physSgprIdx,
            (TheGpuISA::ScalarRegU32)(task->amdQueue
                .scratch_backing_memory_location & 0xffffffff));
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting FlatScratch Addr: s[%d] = %x\n",
            /* ... */);
    computeUnit->srf[simdId]->write(physSgprIdx,
            (TheGpuISA::ScalarRegU32)(task->amdQueue
                .scratch_workitem_byte_size));
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting FlatScratch size: s[%d] = %x\n",
            /* ... */);
    // ...
    // The hidden private base is a 48-bit address whose low 32 bits live
    // in word 0 of the scratch resource descriptor and whose high 16 bits
    // live in the low half of word 1:
    hidden_priv_base =
        (uint64_t)task->amdQueue.scratch_resource_descriptor[0] |
        (((uint64_t)task->amdQueue.scratch_resource_descriptor[1]
        & 0x000000000000ffff) << 32);
    // ...
    // PrivateSegSize: per-work-item private (scratch) segment size:
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting private segment size: s[%d] = %x\n",
            /* ... */);
    // Grid-dimension workgroup counts, divCeil(gridSize(dim), wgSize(dim)):
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting num WG X: s[%d] = %x\n", /* ... */);
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting num WG Y: s[%d] = %x\n", /* ... */);
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting num WG Z: s[%d] = %x\n", /* ... */);

    // This workgroup's id in each grid dimension:
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting WG ID X: s[%d] = %x\n", /* ... */);
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting WG ID Y: s[%d] = %x\n", /* ... */);
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting WG ID Z: s[%d] = %x\n", /* ... */);
    // ...

    // This wave's byte offset into the private segment:
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting Private Seg Offset: s[%d] = %x\n",
            /* ... */);
    // WorkgroupInfo packs three fields into one SGPR: bit 31 flags the
    // first wave of the workgroup, the ordered-append term sits at bit 6,
    // and the low bits hold the number of waves in the workgroup:
    firstWave = (wfId == 0) ? 1 : 0;
    numWfsInWg = divCeil(wgSizeInWorkItems, computeUnit->wfSize());
    finalValue = firstWave << ((sizeof(uint32_t) * 8) - 1);
    finalValue |= (orderedAppendTerm << 6);
    finalValue |= numWfsInWg;
    physSgprIdx =
        computeUnit->registerManager->mapSgpr(this, regInitIdx);
    computeUnit->srf[simdId]->write(physSgprIdx, finalValue);
    // ...
    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
            "Setting WG Info: s[%d] = %x\n",
            /* ... */, finalValue);
    // Any other SGPR enable bit is unimplemented in the model:
    fatal("SGPR enable bit %i not supported\n", en_bit);
    // Vector (per-lane) register initialization: one VGPR per enabled
    // work-item id dimension, holding each lane's id in that dimension.
    uint32_t physVgprIdx = 0;
    TheGpuISA::VecRegContainerU32 raw_vgpr;
    // ...

    // work-item id X
    TheGpuISA::VecElemU32 *vgpr_x
        = raw_vgpr.as<TheGpuISA::VecElemU32>();
    for (int lane = 0; lane < workItemId[0].size(); ++lane) {
        vgpr_x[lane] = workItemId[0][lane];
    }
    // ...

    // work-item id Y
    TheGpuISA::VecElemU32 *vgpr_y
        = raw_vgpr.as<TheGpuISA::VecElemU32>();
    for (int lane = 0; lane < workItemId[1].size(); ++lane) {
        vgpr_y[lane] = workItemId[1][lane];
    }
    // ...

    // work-item id Z
    physVgprIdx = computeUnit->registerManager->mapVgpr(this, regInitIdx);
    TheGpuISA::VecElemU32 *vgpr_z
        = raw_vgpr.as<TheGpuISA::VecElemU32>();
    for (int lane = 0; lane < workItemId[2].size(); ++lane) {
        vgpr_z[lane] = workItemId[2][lane];
    }
    // ...
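    // The per-lane flat work-item id (workItemFlatId) relates to the
    // three ids written above by the usual row-major flattening; a sketch
    // assuming workgroup dimensions wgSizeX and wgSizeY:
    //
    //     flatId = x + y * wgSizeX + z * wgSizeX * wgSizeY;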
575 "CU%d has been idle for %d ticks at tick %d",
600 if (ii->isGlobalMem() ||
601 (ii->isFlat() && ii->executedAs() == enums::SC_GLOBAL)) {
611 if (ii->isLocalMem() ||
612 (ii->isFlat() && ii->executedAs() == enums::SC_GROUP)) {
641 if (ii->isWaitcnt()) {
643 assert(ii->isScalar());
656 if (
status !=
S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn()
657 || ii->isEndOfKernel() || ii->isBranch() || ii->isALU() ||
658 (ii->isKernArgSeg() && ii->isLoad()))) {
672 ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel()
673 || (ii->isKernArgSeg() && ii->isLoad()))) {
763 if (ii->isReturn() || ii->isBranch() ||
764 ii->isEndOfKernel()) {
783 "Negative requests in pipe for WF%d for slot%d"
784 " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d,"
785 " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d,"
786 " Outstanding Reqs=%d\n",
794 if (!ii->isScalar()) {
797 }
else if (ii->isStore()) {
799 }
else if (ii->isAtomic() || ii->isMemSync()) {
803 panic(
"Invalid memory operation!\n");
809 }
else if (ii->isStore()) {
811 }
else if (ii->isAtomic() || ii->isMemSync()) {
815 panic(
"Invalid memory operation!\n");
825 "Scalar instructions can not access Shared memory!!!");
828 }
else if (ii->isStore()) {
830 }
else if (ii->isAtomic() || ii->isMemSync()) {
834 panic(
"Invalid memory operation!\n");
851 if (ii->isALU() || ii->isSpecialOp() ||
852 ii->isBranch() || ii->isNop() ||
853 (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
854 ii->isReturn() || ii->isEndOfKernel()) {
855 if (!ii->isScalar()) {
861 }
else if (ii->isBarrier()) {
863 }
else if (ii->isFlat()) {
864 assert(!ii->isScalar());
874 }
else if (ii->isGlobalMem()) {
876 }
else if (ii->isLocalMem()) {
878 }
else if (ii->isPrivateSeg()) {
880 "Scalar instructions can not access Private memory!!!");
883 panic(
"reserveResources -> Couldn't process op!\n");
889 assert(execUnitIds.size());
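// Sketch of the caller's side (hypothetical, for illustration only): the
// schedule stage consumes the returned unit ids, and a FLAT instruction
// yields two entries, the LDS pipe first and the global-memory pipe
// second.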
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
            "(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId,
            wfDynId, ii->disassemble(), old_pc, ii->seqNum());
    // ...

    // notify the vector RF of the execution so write-back and scoreboard
    // updates can be scheduled
    if (!ii->isScalar()) {
        // ...
    }
    // ...

    // RAW-distance bookkeeping: for each vector source register, sample
    // how many dynamic instructions have elapsed since its last write
    for (const auto& srcVecOp : ii->srcVecRegOperands()) {
        for (const auto& virtIdx : srcVecOp.virtIndices()) {
            // ...
        }
    }

    // record this instruction as the most recent writer of each vector
    // destination register
    for (const auto& dstVecOp : ii->dstVecRegOperands()) {
        for (const auto& virtIdx : dstVecOp.virtIndices()) {
            // ...
        }
    }
    // ...

    if (pc() == old_pc) {
        // the instruction did not redirect control flow: advance the PC
        // and retire it from the instruction buffer
        // ...
    } else {
        DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave%d %s taken branch\n",
                /* ... */);
        // ...
    }
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n",
            /* ... */);
    // ...

    const int num_active_lanes = execMask().count();
1006 if (ii->isF16() && ii->isALU()) {
1007 if (ii->isF32() || ii->isF64()) {
1008 fatal(
"Instruction is tagged as both (1) F16, and (2)"
1009 "either F32 or F64.");
1015 += num_active_lanes;
1017 else if (ii->isMAC()) {
1020 += num_active_lanes;
1022 else if (ii->isMAD()) {
1025 += num_active_lanes;
1028 if (ii->isF32() && ii->isALU()) {
1029 if (ii->isF16() || ii->isF64()) {
1030 fatal(
"Instruction is tagged as both (1) F32, and (2)"
1031 "either F16 or F64.");
1037 += num_active_lanes;
1039 else if (ii->isMAC()) {
1042 += num_active_lanes;
1044 else if (ii->isMAD()) {
1047 += num_active_lanes;
1050 if (ii->isF64() && ii->isALU()) {
1051 if (ii->isF16() || ii->isF32()) {
1052 fatal(
"Instruction is tagged as both (1) F64, and (2)"
1053 "either F16 or F32.");
1059 += num_active_lanes;
1061 else if (ii->isMAC()) {
1064 += num_active_lanes;
1066 else if (ii->isMAD()) {
1069 += num_active_lanes;
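    // A self-contained sketch (not gem5 code; FpOpCounters and countFpOp
    // are hypothetical) of the same exclusivity rule the fatal() calls
    // above enforce: an ALU op may carry at most one FP precision tag.
    //
    //     #include <cstdint>
    //     #include <stdexcept>
    //
    //     struct FpOpCounters { uint64_t f16 = 0, f32 = 0, f64 = 0; };
    //
    //     inline void
    //     countFpOp(FpOpCounters &c, bool isF16, bool isF32, bool isF64,
    //               int lanes)
    //     {
    //         if (int(isF16) + int(isF32) + int(isF64) > 1)
    //             throw std::logic_error("multiple precision tags");
    //         if (isF16) c.f16 += lanes;
    //         if (isF32) c.f32 += lanes;
    //         if (isF64) c.f64 += lanes;
    //     }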
    // FLAT instructions are charged to the pipe they actually resolved
    // to: global and private count as global memory, group as LDS.
    bool flat_as_gm = false;
    bool flat_as_lm = false;
    if (ii->isFlat()) {
        flat_as_gm = (ii->executedAs() == enums::SC_GLOBAL) ||
                     (ii->executedAs() == enums::SC_PRIVATE);
        flat_as_lm = (ii->executedAs() == enums::SC_GROUP);
    }
1101 if (ii->isALU() || ii->isSpecialOp() ||
1102 ii->isBranch() || ii->isNop() ||
1103 (ii->isKernArgSeg() && ii->isLoad()) ||
1104 ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) {
1106 if (!ii->isScalar()) {
1114 }
else if (ii->isBarrier()) {
1118 }
else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) {
1119 if (!ii->isScalar()) {
1135 }
else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) {
1136 if (!ii->isScalar()) {
1151 }
else if ((ii->isAtomic() || ii->isMemSync()) &&
1152 (ii->isGlobalMem() || flat_as_gm)) {
1153 if (!ii->isScalar()) {
1169 }
else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) {
1177 }
else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
1185 }
else if ((ii->isAtomic() || ii->isMemSync()) &&
1186 (ii->isLocalMem() || flat_as_lm)) {
1194 panic(
"Bad instruction type!\n");
void
Wavefront::setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt)
{
    // all three counts arrive decoded from a single s_waitcnt instruction
    assert(vm_wait_cnt >= 0);
    assert(exp_wait_cnt >= 0);
    assert(lgkm_wait_cnt >= 0);

    // upper bounds come from the field widths in the instruction encoding
    assert(vm_wait_cnt <= 0xf);
    assert(exp_wait_cnt <= 0x7);
    assert(lgkm_wait_cnt <= 0x1f);
    // ...

    // a field left at its maximum value means "do not wait on this
    // counter", so only smaller values arm a wait:
    if (vm_wait_cnt != 0xf)
        vmWaitCnt = vm_wait_cnt;

    if (exp_wait_cnt != 0x7)
        expWaitCnt = exp_wait_cnt;

    if (lgkm_wait_cnt != 0x1f)
        lgkmWaitCnt = lgkm_wait_cnt;
    // ...
}
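// A self-contained sketch of where the three operands come from, assuming
// the GCN3-style s_waitcnt immediate layout (vmcnt in bits [3:0], expcnt
// in bits [6:4], lgkmcnt in bits [12:8]); decodeWaitcnt is hypothetical,
// not a gem5 API:
//
//     #include <cstdint>
//
//     struct WaitCnts { int vm, exp, lgkm; };
//
//     inline WaitCnts
//     decodeWaitcnt(uint16_t simm16)
//     {
//         WaitCnts w;
//         w.vm   = simm16 & 0xf;          // 0xf  == don't wait on vmcnt
//         w.exp  = (simm16 >> 4) & 0x7;   // 0x7  == don't wait on expcnt
//         w.lgkm = (simm16 >> 8) & 0x1f;  // 0x1f == don't wait on lgkmcnt
//         return w;
//     }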
    // barrier ids must index one of the CU's barrier slots
    assert(bar_id < computeUnit->numBarrierSlots());
Wavefront::WavefrontStats::WavefrontStats(statistics::Group *parent)
    : statistics::Group(parent),
      ADD_STAT(numInstrExecuted,
               "number of instructions executed by this WF slot"),
      ADD_STAT(schCycles, "number of cycles spent in schedule stage"),
      ADD_STAT(schStalls, "number of cycles WF is stalled in SCH stage"),
      ADD_STAT(schRfAccessStalls, "number of cycles wave selected in SCH but "
               "RF denied adding instruction"),
      ADD_STAT(schResourceStalls, "number of cycles stalled in sch by resource"
               " not available"),
      ADD_STAT(schOpdNrdyStalls, "number of cycles stalled in sch waiting for "
               "RF reads to complete"),
      ADD_STAT(schLdsArbStalls,
               "number of cycles wave stalled due to LDS-VRF arbitration"),
      ADD_STAT(numTimesBlockedDueWAXDependencies, "number of times the wf's "
               "instructions are blocked due to WAW or WAR dependencies"),
      ADD_STAT(numTimesBlockedDueRAWDependencies, "number of times the wf's "
               "instructions are blocked due to RAW dependencies"),
      ADD_STAT(vecRawDistance,
               "Count of RAW distance in dynamic instructions for this WF"),
      ADD_STAT(readsPerWrite, "Count of Vector reads per write for this WF")
{
    // ...
}