#include <unordered_set>

#include "debug/GPUSched.hh"
#include "debug/GPUVRF.hh"
81 "Scheduler should have same number of entries as CU's readyList");
82 for (
int j = 0; j <
computeUnit.numExeUnits(); ++j) {
// In ScheduleStage::exec(): first prune each ready list of waves that
// already have an instruction in SCH waiting on register-file reads,
// which would otherwise allow out-of-order execution within a wave.
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
    fromScoreboardCheck.updateReadyList(j);
    // ...
}

// Memory execution units are visited first, giving VMEM/SMEM waves
// priority over ALU waves for scheduling operand reads from the RFs.
for (int j = firstMemUnit; j <= lastMemUnit; j++) {
    int readyListSize = fromScoreboardCheck.readyWFs(j).size();
    // Skip this execution resource if no wave is ready.
    if (!readyListSize) {
        stats.rdyListEmpty[j]++;
        continue;
    }
    stats.rdyListNotEmpty[j]++;

    // Pick a wave and attempt to add its oldest instruction to schList.
    Wavefront *wf = scheduler[j].chooseWave();
    GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
    assert(gpu_dyn_inst);
    if (!addToSchList(j, gpu_dyn_inst)) {
        // ...
        stats.addToSchListStalls[j]++;
    } else {
        // Update the wave's outstanding-instruction (waitcnt) counters.
        if (gpu_dyn_inst->isScalar() || gpu_dyn_inst->isGroupSeg()) {
            wf->incLGKMInstsIssued();
        } else {
            wf->incVMemInstsIssued();
            if (gpu_dyn_inst->isFlat()) {
                wf->incLGKMInstsIssued();
            }
        }
        if (gpu_dyn_inst->isStore() && gpu_dyn_inst->isGlobalSeg()) {
            wf->incExpInstsIssued();
        }
    }
}
// Second pass: all remaining execution units, skipping the memory units
// handled above.
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
    if (j >= firstMemUnit && j <= lastMemUnit) {
        continue;
    }
    int readyListSize = fromScoreboardCheck.readyWFs(j).size();
    if (!readyListSize) {
        stats.rdyListEmpty[j]++;
        continue;
    }
    stats.rdyListNotEmpty[j]++;

    Wavefront *wf = scheduler[j].chooseWave();
    GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
    assert(gpu_dyn_inst);
    if (!addToSchList(j, gpu_dyn_inst)) {
        // ...
        stats.addToSchListStalls[j]++;
    }
}
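The two passes above give the memory pipelines first claim on register-file
read slots each cycle. A minimal, self-contained sketch of that visit order,
using made-up unit indices rather than gem5's configuration:

#include <cstdio>
#include <vector>

int main()
{
    const int numExeUnits = 6;
    const int firstMemUnit = 2, lastMemUnit = 3; // hypothetical layout

    std::vector<int> visitOrder;

    // Pass 1: memory units get scheduling priority.
    for (int j = firstMemUnit; j <= lastMemUnit; j++)
        visitOrder.push_back(j);

    // Pass 2: everything else, skipping the memory range.
    for (int j = 0; j < numExeUnits; ++j) {
        if (j >= firstMemUnit && j <= lastMemUnit)
            continue;
        visitOrder.push_back(j);
    }

    for (int j : visitOrder)
        std::printf("schedule unit %d\n", j); // prints 2 3 0 1 4 5
    return 0;
}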
// In ScheduleStage::doDispatchListTransition():
toExecute.dispatchTransition(gpu_dyn_inst, unitId, s);
// In ScheduleStage::schedRfWrites(): both register files must be able to
// accept the instruction's destination-operand writes.
assert(gpu_dyn_inst);
Wavefront *wf = gpu_dyn_inst->wavefront();
bool accessVrfWr = true;
if (!gpu_dyn_inst->isScalar()) {
    accessVrfWr = computeUnit.vrf[wf->simdId]
        ->canScheduleWriteOperands(wf, gpu_dyn_inst);
}
bool accessSrfWr = computeUnit.srf[wf->simdId]
    ->canScheduleWriteOperands(wf, gpu_dyn_inst);
bool accessRf = accessVrfWr && accessSrfWr;
if (accessRf) {
    if (!gpu_dyn_inst->isScalar()) {
        // ...
    }
    // ...
}
// In ScheduleStage::scheduleRfDestOperands(): for each execution unit's
// dispatch candidate, schedule its register-file destination writes.
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
    // ...
    assert(gpu_dyn_inst);
    Wavefront *wf = gpu_dyn_inst->wavefront();
    // ...
}
// In ScheduleStage::addToSchList(): admit the wave only if both register
// files can accept its operand-read requests.
assert(gpu_dyn_inst);
Wavefront *wf = gpu_dyn_inst->wavefront();
bool accessVrf = true;
if (!gpu_dyn_inst->isScalar()) {
    accessVrf = computeUnit.vrf[wf->simdId]
        ->canScheduleReadOperands(wf, gpu_dyn_inst);
}
bool accessSrf = computeUnit.srf[wf->simdId]
    ->canScheduleReadOperands(wf, gpu_dyn_inst);
bool accessRf = accessVrf && accessSrf;
if (accessRf) {
    DPRINTF(GPUSched, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n",
            exeType, wf->simdId, wf->wfDynId,
            gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
    // ...
    // Entries start in the RFBUSY state until their operand reads finish.
    schList.at(exeType).push_back(std::make_pair(gpu_dyn_inst, RFBUSY));
    // ...
    if (!gpu_dyn_inst->isScalar()) {
        computeUnit.vrf[wf->simdId]
            ->scheduleReadOperands(wf, gpu_dyn_inst);
    }
    // ...
    DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
            exeType, wf->simdId, wf->wfDynId,
            gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
    return true;
} else {
    // ...
    DPRINTF(GPUSched, "schList[%d]: Could not add: "
            "SIMD[%d] WV[%d]: %d: %s\n",
            exeType, wf->simdId, wf->wfDynId,
            gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
    return false;
}
// In ScheduleStage::reinsertToSchList(): waves re-enter the schList in
// age order, so the oldest wave (smallest wfDynId) stays at the front.
assert(gpu_dyn_inst);
auto schIter = schList.at(exeType).begin();
while (schIter != schList.at(exeType).end()
       && schIter->first->wfDynId < gpu_dyn_inst->wfDynId) {
    schIter++;
}
schList.at(exeType).insert(schIter,
                           std::make_pair(gpu_dyn_inst, RFREADY));
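The walk above keeps schList sorted by wave age. A minimal, self-contained
sketch of the same idea, with toy types standing in for GPUDynInstPtr and
the SCH_STATUS enum (names and values here are illustrative, not gem5's):

#include <cstdint>
#include <deque>
#include <utility>

enum SchStatus { RFBUSY, RFREADY };

struct ToyInst { uint64_t wfDynId; };

using SchList = std::deque<std::pair<ToyInst, SchStatus>>;

void reinsert(SchList &list, const ToyInst &inst)
{
    // Advance past every entry from an older wave (smaller wfDynId).
    auto it = list.begin();
    while (it != list.end() && it->first.wfDynId < inst.wfDynId)
        ++it;
    // Re-entering waves already hold their operands, hence RFREADY.
    list.insert(it, std::make_pair(inst, RFREADY));
}

int main()
{
    SchList list;
    reinsert(list, {3});
    reinsert(list, {1});
    reinsert(list, {2});
    // list now holds wfDynId 1, 2, 3: oldest wave first.
    return list.front().first.wfDynId == 1 ? 0 : 1;
}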
// In ScheduleStage::dispatchReady(): classify the instruction and check
// that every resource it needs in the next cycle is available, recording
// a distinct stall reason for each resource that is not.
assert(gpu_dyn_inst);
Wavefront *wf = gpu_dyn_inst->wavefront();
// ... (vectorAluRdy / scalarAluRdy reflect ALU availability next cycle)
if (gpu_dyn_inst->isNop()) {
    // S_NOP requires the scalar ALU; V_NOP requires the vector ALU.
    if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
        // ...
    } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
        // ...
    }
} else if (gpu_dyn_inst->isEndOfKernel()) {
    // ...
} else if (gpu_dyn_inst->isBarrier() || gpu_dyn_inst->isBranch()
           || gpu_dyn_inst->isALU()) {
    if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
        // ...
    } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
        // ...
    }
} else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
    // Vector global memory: the issue slot, memory bus, coalescer, and
    // outstanding-request budget must all have room.
    // ...
    if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
        // ...
    }
    if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
        // ...
    }
} else if (gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
    // Scalar global memory.
    // ...
} else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isLocalMem()) {
    // Vector local memory (LDS).
    // ...
} else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isFlat()) {
    // Flat access: needs the global memory path checks as well.
    if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
        // ...
    }
    if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
        // ...
    }
} else {
    panic("%s: unknown instr checked for readiness",
          gpu_dyn_inst->disassemble());
}
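Each branch above accumulates every blocking reason before rejecting the
instruction, so the per-cause NRDY statistics stay meaningful even when
several resources are busy at once. A minimal sketch of that pattern with
made-up conditions and counters (not gem5's API):

#include <array>
#include <cstdio>

enum StallReason { ISSUE_NRDY, BUS_BUSY_NRDY, COALESCER_NRDY, NUM_REASONS };

std::array<long, NUM_REASONS> stalls{};

bool dispatchReady(bool issueRdy, bool busRdy, bool coalescerRdy)
{
    // Check every condition and tally each failure; only then return,
    // instead of bailing out on the first busy resource.
    bool rdy = true;
    if (!issueRdy)     { rdy = false; stalls[ISSUE_NRDY]++; }
    if (!busRdy)       { rdy = false; stalls[BUS_BUSY_NRDY]++; }
    if (!coalescerRdy) { rdy = false; stalls[COALESCER_NRDY]++; }
    return rdy;
}

int main()
{
    dispatchReady(true, false, false);
    std::printf("bus stalls: %ld, coalescer stalls: %ld\n",
                stalls[BUS_BUSY_NRDY], stalls[COALESCER_NRDY]);
    return 0;
}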
// In ScheduleStage::fillDispatchList(): for each execution unit, choose
// the oldest RFREADY instruction in the schList and promote it to the
// dispatch list for the EX stage.
for (int j = 0; j < computeUnit.numExeUnits(); j++) {
    // ...
    auto schIter = schList.at(j).begin();
    auto selected_iter = schList.at(j).end();

    // Selection pass: the smallest sequence number (oldest instruction)
    // wins, regardless of its position in the deque.
    for (auto iter = schList.at(j).begin();
         iter != schList.at(j).end(); iter++) {
        // ... (only ready entries are considered)
        if (selected_iter == schList.at(j).end()) {
            selected_iter = iter;
        } else if
            (selected_iter->first->seqNum() > iter->first->seqNum()) {
            selected_iter = iter;
        }
    }

    // Dispatch pass: promote the selected entry and count stall reasons
    // for the ready entries that stay behind.
    while (schIter != schList.at(j).end()) {
        if (schIter->second == RFREADY) {
            if (schIter == selected_iter) {
                // Acquire a coalescer token for global memory operations.
                GPUDynInstPtr mp = schIter->first;
                if (!mp->isMemSync() && !mp->isScalar() &&
                    (mp->isGlobalMem() || mp->isFlat())) {
                    // ...
                }
                // Set the instruction's exec mask if it is a memory op.
                if (mp->isMemRef()) {
                    mp->exec_mask = mp->wavefront()->execMask();
                }
                // ...
                DPRINTF(GPUSched, "dispatchList[%d]: fillDispatchList: "
                        "EMPTY->EXREADY\n", j);
                schIter->first = nullptr;
                // ...
            } else {
                // Ready but not selected, or not dispatch-ready: stalled.
                schIter->first->wavefront()->stats.schStalls++;
                // ...
                schIter->first->wavefront()->stats.schResourceStalls++;
                // ...
            }
        }
        // ...
    }

    if (selected_iter == schList.at(j).end()) {
        stats.schListToDispListStalls[j]++;
    } else {
        stats.schListToDispList[j]++;
        schList.at(j).erase(selected_iter);
    }
}
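The selection pass reduces to "find the smallest sequence number among
ready entries". A minimal sketch with a toy entry type (illustrative, not
gem5's types):

#include <cstdint>
#include <deque>

struct Entry { uint64_t seqNum; bool ready; };

using Queue = std::deque<Entry>;

Queue::iterator pickOldestReady(Queue &q)
{
    auto selected = q.end();
    for (auto it = q.begin(); it != q.end(); ++it) {
        if (!it->ready)
            continue;
        // First ready entry, or an older one than the current pick.
        if (selected == q.end() || selected->seqNum > it->seqNum)
            selected = it;
    }
    return selected; // q.end() means nothing can be dispatched
}

int main()
{
    Queue q{{7, true}, {5, false}, {6, true}};
    auto sel = pickOldestReady(q);
    // seqNum 6 is picked: oldest *ready* entry, despite 5 coming first.
    return (sel != q.end() && sel->seqNum == 6) ? 0 : 1;
}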
// In ScheduleStage::arbitrateVrfToLdsBus(): a flat instruction reserves
// both the global-memory (GM) and local-memory (LM) pipes; when its GM
// pipe wins arbitration, the wave's LM pipe must yield this cycle.
if (gpu_dyn_inst && toExecute.dispatchStatus(gm_exe_unit)
    == EXREADY && gpu_dyn_inst->isFlat()) {
    Wavefront *wf = gpu_dyn_inst->wavefront();
    // If the wave's LM pipe is also ready, it lost arbitration.
    if (toExecute.dispatchStatus(wf->localMem) == EXREADY) {
        // ...
        stats.ldsBusArbStalls++;
        toExecute.readyInst(wf->localMem)
            ->wavefront()->stats.schLdsArbStalls++;
    }
    // Transition the LM pipe to SKIP so the EX stage knows the flat
    // instruction executes on the GM pipe.
    // ...
    DPRINTF(GPUSched, "dispatchList[%d]: arbVrfLds: "
            "EXREADY->SKIP\n", wf->localMem);
}
// In ScheduleStage::checkRfOperandReadComplete(): poll every schList
// entry and promote it to RFREADY once both register files report that
// its operand reads have completed.
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
    // ...
    assert(gpu_dyn_inst);
    Wavefront *wf = gpu_dyn_inst->wavefront();
    // ...
    bool vrfRdy = true;
    if (!gpu_dyn_inst->isScalar()) {
        vrfRdy = computeUnit.vrf[wf->simdId]
            ->operandReadComplete(wf, gpu_dyn_inst);
    }
    bool srfRdy = computeUnit.srf[wf->simdId]
        ->operandReadComplete(wf, gpu_dyn_inst);
    bool operandsReady = vrfRdy && srfRdy;
    if (operandsReady) {
        DPRINTF(GPUSched, "schList[%d]: WV[%d] operands ready for: "
                "%d: %s\n", j, wf->wfDynId, gpu_dyn_inst->seqNum(),
                gpu_dyn_inst->disassemble());
        DPRINTF(GPUSched, "schList[%d]: WV[%d] RFBUSY->RFREADY\n",
                j, wf->wfDynId);
        // ...
    } else {
        DPRINTF(GPUSched, "schList[%d]: WV[%d] operands not ready "
                "for: %d: %s\n", j, wf->wfDynId,
                gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
    }
}
// In ScheduleStage::reserveResources(): mark execution resources as
// reserved for the instructions chosen this cycle, checking that no
// unit is ever double-booked.
std::vector<bool> exeUnitReservations;
exeUnitReservations.resize(computeUnit.numExeUnits(), false);

for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
    // ...
        Wavefront *wf = gpu_dyn_inst->wavefront();
        // ...
        std::vector<int> execUnitIds = wf->reserveResources();
        if (!gpu_dyn_inst->isScalar()) {
            computeUnit.vrf[wf->simdId]
                ->dispatchInstruction(gpu_dyn_inst);
        }
        // ...
        std::stringstream ss;
        for (auto id : execUnitIds) {
            ss << id << " ";
        }
        DPRINTF(GPUSched, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s"
                " Reserving ExeRes[ %s]\n",
                j, wf->simdId, wf->wfDynId, gpu_dyn_inst->seqNum(),
                gpu_dyn_inst->disassemble(), ss.str());
        // Mark the resources as reserved for this cycle.
        for (auto execUnitId : execUnitIds) {
            panic_if(exeUnitReservations.at(execUnitId),
                     "Execution unit %d is reserved!!!\n"
                     "SIMD[%d] WV[%d]: %d: %s",
                     execUnitId, wf->simdId, wf->wfDynId,
                     gpu_dyn_inst->seqNum(),
                     gpu_dyn_inst->disassemble());
            exeUnitReservations.at(execUnitId) = true;
        }

        // A flat instruction reserves both a global and a local memory
        // unit; the local one must have been transitioned to SKIP.
        if (execUnitIds.size() > 1) {
            [[maybe_unused]] int lm_exec_unit = wf->localMem;
            assert(toExecute.dispatchStatus(lm_exec_unit)
                   == SKIP);
        }
    } else if (s == SKIP) {
        // The LM side of a flat instruction: verify the GM pipe holds
        // the same wave's instruction in EXREADY.
        [[maybe_unused]] int gm_exec_unit = wf->globalMem;
        assert(wf->wfDynId == toExecute
               .readyInst(gm_exec_unit)->wfDynId);
        assert(toExecute.dispatchStatus(gm_exec_unit)
               == EXREADY);
    }
    // ...
}
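The reservation bookkeeping above is a per-cycle claim table: each
dispatched instruction claims one or more execution units, and claiming
an already-reserved unit is a simulator bug that aborts loudly (panic_if
in gem5; a plain assert in this minimal, self-contained sketch):

#include <cassert>
#include <vector>

void reserve(std::vector<bool> &reservations,
             const std::vector<int> &unitIds)
{
    for (int id : unitIds) {
        // Claiming an already-reserved unit is a scheduler bug.
        assert(!reservations.at(id) && "execution unit double-booked");
        reservations.at(id) = true;
    }
}

int main()
{
    std::vector<bool> reservations(4, false);
    reserve(reservations, {0, 2}); // e.g. a flat op claiming GM and LM
    assert(reservations[0] && reservations[2]);
    return 0;
}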
809 "execution resource"),
811 "list per execution resource"),
813 "schList per execution resource when ready list is not empty"),
815 "dispatchList per execution resource"),
817 " dispatchList per execution resource"),