36 #include "debug/GPUExec.hh" 37 #include "debug/WavefrontStack.hh" 44 WavefrontParams::create()
50 :
SimObject(p), callArgMem(nullptr), _gpuISA()
87 for (
int i = 0;
i < 3; ++
i) {
99 .
name(
name() +
".src_reg_operand_dist")
100 .
desc(
"number of executed instructions with N source register operands")
105 .
name(
name() +
".dst_reg_operand_dist")
106 .
desc(
"number of executed instructions with N destination register " 112 .
name(
name() +
".timesBlockedDueWAXDependencies")
113 .
desc(
"number of times the wf's instructions are blocked due to WAW " 114 "or WAR dependencies")
119 .
name(
name() +
".timesBlockedDueRAWDependencies")
120 .
desc(
"number of times the wf's instructions are blocked due to RAW " 126 .
name(
name() +
".timesBlockedDueVrfPortAvail")
127 .
desc(
"number of times instructions are blocked due to VRF port " 165 if (ii->isGlobalMem() || ii->isFlat())
174 if (ii->isLocalMem()) {
188 ii->isReturn() || ii->isBranch() ||
189 ii->isALU() || (ii->isKernArgSeg() && ii->isLoad()))) {
269 if (ii->isReturn() || ii->isBranch()) {
289 if (mode == 1 && size > 4) {
325 bool ready_inst M5_VAR_USED =
false;
326 bool glbMemBusRdy =
false;
327 bool glbMemIssueRdy =
false;
333 glbMemIssueRdy =
true;
336 bool locMemBusRdy =
false;
337 bool locMemIssueRdy =
false;
343 locMemIssueRdy =
true;
351 if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
352 ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
353 ii->isMemFence() || ii->isFlat())) {
354 panic(
"next instruction: %s is of unknown type\n", ii->disassemble());
357 DPRINTF(GPUExec,
"CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
360 if (type ==
I_ALU && ii->isBarrier()) {
373 }
else if (type ==
I_ALU && ii->isNop()) {
381 }
else if (type ==
I_ALU && ii->isReturn()) {
394 }
else if (type ==
I_ALU && (ii->isBranch() ||
396 (ii->isKernArgSeg() && ii->isLoad()) ||
412 }
else if (type ==
I_GLOBAL && ii->isGlobalMem()) {
414 if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
421 if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
427 if (!glbMemIssueRdy) {
451 }
else if (type ==
I_SHARED && ii->isLocalMem()) {
453 if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
459 if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
469 if (!locMemIssueRdy) {
488 }
else if (type ==
I_FLAT && ii->isFlat()) {
499 if (!glbMemIssueRdy) {
504 if (!locMemIssueRdy) {
547 if (ii->isALU() || ii->isSpecialOp() ||
552 (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
559 }
else if (ii->isBarrier()) {
562 }
else if (ii->isLoad() && ii->isFlat()) {
566 if ( Enums::SC_SHARED == ii->executedAs() ) {
577 }
else if (ii->isStore() && ii->isFlat()) {
581 if (Enums::SC_SHARED == ii->executedAs()) {
592 }
else if (ii->isLoad() && ii->isGlobalMem()) {
599 }
else if (ii->isStore() && ii->isGlobalMem()) {
606 }
else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
614 }
else if (ii->isLoad() && ii->isLocalMem()) {
621 }
else if (ii->isStore() && ii->isLocalMem()) {
628 }
else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
653 const uint32_t old_pc =
pc();
654 DPRINTF(GPUExec,
"CU%d: WF[%d][%d]: wave[%d] Executing inst: %s " 656 ii->disassemble(), old_pc);
670 if (
pc() == old_pc) {
671 uint32_t new_pc =
_gpuISA.advancePC(old_pc, ii);
674 if (new_pc ==
rpc()) {
685 const int num_active_lanes =
execMask().count();
697 if (ii->isALU() || ii->isSpecialOp() ||
702 (ii->isKernArgSeg() && ii->isLoad()) ||
711 }
else if (ii->isBarrier()) {
714 }
else if (ii->isLoad() && ii->isFlat()) {
717 if (Enums::SC_SHARED == ii->executedAs()) {
728 }
else if (ii->isStore() && ii->isFlat()) {
730 if (Enums::SC_SHARED == ii->executedAs()) {
741 }
else if (ii->isLoad() && ii->isGlobalMem()) {
746 }
else if (ii->isStore() && ii->isGlobalMem()) {
751 }
else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
756 }
else if (ii->isLoad() && ii->isLocalMem()) {
761 }
else if (ii->isStore() && ii->isLocalMem()) {
766 }
else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
784 assert(mask.count());
793 DPRINTF(WavefrontStack,
"[%2d, %2d, %2d, %2d] %s %3i => ",
796 std::string::allocator_type>().c_str(),
pc());
800 DPRINTF(WavefrontStack,
"%3i %s\n",
pc(),
802 std::string::allocator_type>().c_str());
857 uint8_t *iter = (uint8_t *)out;
858 for (
int i = 0;
i <
barCnt.size();
i++) {
861 *(
int *)iter =
wfId; iter +=
sizeof(
wfId);
866 *(uint32_t *)iter =
wgId; iter +=
sizeof(
wgId);
868 *(uint64_t *)iter =
initMask.to_ullong(); iter +=
sizeof(
initMask.to_ullong());
874 std::numeric_limits<uint32_t>::max(),
875 std::numeric_limits<uint64_t>::max()};
889 uint32_t vgprIdx =
remap(
i,
sizeof(uint32_t), 1);
890 for (
int lane = 0; lane < wf_size; lane++) {
892 read<uint32_t>(vgprIdx,lane);
893 *(uint32_t *)iter = regVal; iter +=
sizeof(regVal);
898 uint32_t vgprIdx =
remap(
i,
sizeof(uint64_t), 1);
899 for (
int lane = 0; lane < wf_size; lane++) {
901 read<uint64_t>(vgprIdx,lane);
902 *(uint64_t *)iter = regVal; iter +=
sizeof(regVal);
907 for (
int lane = 0; lane < wf_size; lane++) {
909 *(uint64_t *)iter = regVal; iter +=
sizeof(regVal);
917 *(
char *) iter = val; iter +=
sizeof(
val);
924 uint8_t *iter = (uint8_t *)in;
925 for (
int i = 0;
i <
barCnt.size();
i++) {
928 wfId = *(
int *)iter; iter +=
sizeof(
wfId);
933 wgId = *(uint32_t *)iter; iter +=
sizeof(
wgId);
942 if (newEntry.
pc != std::numeric_limits<uint32_t>::max()) {
950 uint32_t vgprIdx =
remap(
i,
sizeof(uint32_t), 1);
951 for (
int lane = 0; lane < wf_size; lane++) {
952 uint32_t regVal = *(uint32_t *)iter; iter +=
sizeof(regVal);
958 uint32_t vgprIdx =
remap(
i,
sizeof(uint64_t), 1);
959 for (
int lane = 0; lane < wf_size; lane++) {
960 uint64_t regVal = *(uint64_t *)iter; iter +=
sizeof(regVal);
966 for (
int lane = 0; lane < wf_size; lane++) {
967 uint64_t regVal = *(uint64_t *)iter; iter +=
sizeof(regVal);
974 char val = *(
char *) iter; iter +=
sizeof(
val);
983 for (
int d = 0;
d < 3; ++
d) {
#define panic(...)
This implements a cprintf based panic() function.
std::vector< uint32_t > oldVgpr
std::vector< uint8_t >::size_type size() const
get the size of this chunk
Stats::Scalar numTimesBlockedDueRAWDependencies
void setContext(const void *in)
Sets the hardware context from a stream of bytes. This method is designed for HSAIL execution...
void write(int regIdx, int threadId, T value)
Stats::Scalar numTimesBlockedDueVrfPortAvail
std::deque< std::unique_ptr< ReconvergenceStackEntry > > reconvergenceStack
Stack containing Control Flow Graph nodes (i.e., kernel instructions) to be visited by the wavefront...
Stats::Distribution controlFlowDivergenceDist
uint32_t getStaticContextSize() const
Returns the size of the static hardware context of a particular wavefront This should be updated ever...
std::bitset< std::numeric_limits< unsigned long long >::digits > VectorMask
void pushToReconvergenceStack(uint32_t pc, uint32_t rpc, const VectorMask &exec_mask)
void init(uint32_t _size)
T read(int regIdx, int threadId)
class ConditionRegisterState * condRegState
bool isOldestInstFlatMem()
bool isOldestInstPrivMem()
Stats::Scalar numTimesBlockedDueWAXDependencies
Stats::Scalar numInstrExecuted
std::vector< WaitClass > vrfToLocalMemPipeBus
bool instructionBufferHasBranch()
std::vector< WaitClass > aluPipe
GlobalMemPipeline globalMemoryPipe
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Stats::Distribution activeLanesPerLMemInstrDist
Stats::Distribution srcRegOpDist
std::vector< uint32_t > workItemId[3]
uint32_t pc
PC of current instruction.
std::deque< GPUDynInstPtr > instructionBuffer
Stats::Distribution execRateDist
void regStats()
Callback to set stat parameters.
std::vector< uint32_t > workItemFlatId
Wavefront(const Params *p)
std::vector< WaitClass > vrfToGlobalMemPipeBus
void updateInstStats(GPUDynInstPtr gpuDynInst)
std::vector< int > barCnt
ComputeUnit * computeUnit
Stats::Distribution dstRegOpDist
bool isLmInstruction(GPUDynInstPtr ii)
virtual void init()
init() is called after all C++ SimObjects have been created and all ports are connected.
uint32_t outstandingReqsWrLm
void getContext(const void *out)
Returns the hardware context as a stream of bytes. This method is designed for HSAIL execution...
uint32_t outstandingReqsRdGm
int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots)
void computeActualWgSz(NDRange *ndr)
Distribution & init(Counter min, Counter max, Counter bkt)
Set the parameters of this distribution.
T read(const uint32_t index)
a read operation
uint64_t Addr
Address type. This will probably be moved somewhere else in the near future.
uint32_t outstandingReqsRdLm
bool isOldestInstBarrier()
uint32_t outstandingReqsWrGm
bool isGmInstruction(GPUDynInstPtr ii)
Derived & name(const std::string &name)
Set the name and mark this stat to print at the end of simulation.
std::vector< Addr > lastAddr
virtual const std::string name() const
TheGpuISA::GPUISA _gpuISA
int32_t getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
Stats::Distribution activeLanesPerGMemInstrDist
A reconvergence stack entry conveys the necessary state to implement control flow divergence...
Stats::Scalar numVecOpsExecuted
std::vector< VectorRegisterFile * > vrf
void start(uint64_t _wfDynId, uint64_t _base_ptr)
Tick ticks(int numCycles) const
std::vector< uint64_t > oldDgpr
VectorMask execMask() const
void popFromReconvergenceStack()
void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
void write(const uint32_t index, const T value)
a write operation
Derived & desc(const std::string &_desc)
Set the description and mark this stat to print at the end of simulation.
std::vector< uint64_t > lastExecCycle
std::vector< WaitClass > wfWait
LocalMemPipeline localMemoryPipe
virtual void regStats()
Callback to set stat parameters.
uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0)
Stats::Scalar totalCycles
Counter value() const
Return the current value of this stat as its base type.
Abstract superclass for simulation objects.
void sample(const U &v, int n=1)
Add a value to the distribution n times.
const std::string to_string(sc_enc enc)
uint32_t rpc
PC of the immediate post-dominator instruction, i.e., the value of pc for the first instruction that ...
bool waitingAtBarrier(int lane)
VectorMask execMask
Execution mask.