34#include <unordered_set>
37#include "debug/GPUSched.hh"
38#include "debug/GPUVRF.hh"
52 : computeUnit(cu), fromScoreboardCheck(from_scoreboard_check),
53 toExecute(to_execute),
54 _name(cu.
name() +
".ScheduleStage"),
55 vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
56 scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
57 locMemBusRdy(false), locMemIssueRdy(false), stats(&cu, cu.numExeUnits())
81 "Scheduler should have same number of entries as CU's readyList");
122 for (
int j = firstMemUnit; j <= lastMemUnit; j++) {
126 if (!readyListSize) {
135 assert(gpu_dyn_inst);
142 if (gpu_dyn_inst->isScalar() || gpu_dyn_inst->isGroupSeg()) {
146 if (gpu_dyn_inst->isFlat()) {
150 if (gpu_dyn_inst->isStore() && gpu_dyn_inst->isGlobalSeg()) {
159 if (j >= firstMemUnit && j <= lastMemUnit) {
165 if (!readyListSize) {
174 assert(gpu_dyn_inst);
229 assert(gpu_dyn_inst);
230 Wavefront *wf = gpu_dyn_inst->wavefront();
231 bool accessVrfWr =
true;
232 if (!gpu_dyn_inst->isScalar()) {
234 ->canScheduleWriteOperands(wf, gpu_dyn_inst);
237 ->canScheduleWriteOperands(wf, gpu_dyn_inst);
238 bool accessRf = accessVrfWr && accessSrfWr;
240 if (!gpu_dyn_inst->isScalar()) {
274 assert(gpu_dyn_inst);
275 Wavefront *wf = gpu_dyn_inst->wavefront();
299 assert(gpu_dyn_inst);
300 Wavefront *wf = gpu_dyn_inst->wavefront();
301 bool accessVrf =
true;
302 if (!gpu_dyn_inst->isScalar()) {
304 ->canScheduleReadOperands(wf, gpu_dyn_inst);
307 ->canScheduleReadOperands(wf, gpu_dyn_inst);
311 bool accessRf = accessVrf && accessSrf;
313 DPRINTF(GPUSched,
"schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n",
315 gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
319 schList.at(exeType).push_back(std::make_pair(gpu_dyn_inst,
RFBUSY));
329 if (!gpu_dyn_inst->isScalar()) {
331 ->scheduleReadOperands(wf, gpu_dyn_inst);
335 DPRINTF(GPUSched,
"schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
337 gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
354 DPRINTF(GPUSched,
"schList[%d]: Could not add: "
355 "SIMD[%d] WV[%d]: %d: %s\n",
357 gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
369 assert(gpu_dyn_inst);
370 auto schIter =
schList.at(exeType).begin();
371 while (schIter !=
schList.at(exeType).end()
372 && schIter->first->wfDynId < gpu_dyn_inst->wfDynId) {
375 schList.at(exeType).insert(schIter, std::make_pair(gpu_dyn_inst,
RFREADY));
419 assert(gpu_dyn_inst);
420 Wavefront *wf = gpu_dyn_inst->wavefront();
431 if (gpu_dyn_inst->isNop()) {
438 }
else if (!gpu_dyn_inst->isScalar() && !
vectorAluRdy) {
442 }
else if (gpu_dyn_inst->isEndOfKernel()) {
448 }
else if (gpu_dyn_inst->isBarrier() || gpu_dyn_inst->isBranch()
449 || gpu_dyn_inst->isALU()) {
454 }
else if (!gpu_dyn_inst->isScalar() && !
vectorAluRdy) {
458 }
else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
480 }
else if (gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
501 }
else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isLocalMem()) {
520 }
else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isFlat()) {
548 panic(
"%s: unknown instr checked for readiness",
549 gpu_dyn_inst->disassemble());
566 auto schIter =
schList.at(j).begin();
567 bool dispatched =
false;
568 while (schIter !=
schList.at(j).end()) {
570 if (schIter->second ==
RFREADY) {
573 if (!dispatched && dispRdy) {
582 if (!
mp->isMemSync() && !
mp->isScalar() &&
588 if (
mp->isMemRef()) {
589 mp->exec_mask =
mp->wavefront()->execMask();
593 DPRINTF(GPUSched,
"dispatchList[%d]: fillDispatchList: "
594 "EMPTY->EXREADY\n", j);
595 schIter->first =
nullptr;
596 schIter =
schList.at(j).erase(schIter);
601 schIter->first->wavefront()->stats.schStalls++;
604 schIter->first->wavefront()->stats.schResourceStalls++;
640 ==
EXREADY && gpu_dyn_inst->isFlat()) {
641 Wavefront *wf = gpu_dyn_inst->wavefront();
652 ->wavefront()->stats.schLdsArbStalls++;
658 DPRINTF(GPUSched,
"dispatchList[%d]: arbVrfLds: "
673 assert(gpu_dyn_inst);
674 Wavefront *wf = gpu_dyn_inst->wavefront();
681 if (!gpu_dyn_inst->isScalar()) {
683 ->operandReadComplete(wf, gpu_dyn_inst);
686 ->operandReadComplete(wf, gpu_dyn_inst);
687 bool operandsReady = vrfRdy && srfRdy;
689 DPRINTF(GPUSched,
"schList[%d]: WV[%d] operands ready for: "
690 "%d: %s\n", j, wf->
wfDynId, gpu_dyn_inst->seqNum(),
691 gpu_dyn_inst->disassemble());
692 DPRINTF(GPUSched,
"schList[%d]: WV[%d] RFBUSY->RFREADY\n",
696 DPRINTF(GPUSched,
"schList[%d]: WV[%d] operands not ready "
697 "for: %d: %s\n", j, wf->
wfDynId,
698 gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
730 Wavefront *wf = gpu_dyn_inst->wavefront();
737 if (!gpu_dyn_inst->isScalar()) {
739 ->dispatchInstruction(gpu_dyn_inst);
743 std::stringstream
ss;
744 for (
auto id : execUnitIds) {
747 DPRINTF(GPUSched,
"dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s"
748 " Reserving ExeRes[ %s]\n",
750 gpu_dyn_inst->disassemble(),
ss.str());
752 for (
auto execUnitId : execUnitIds) {
753 panic_if(exeUnitReservations.at(execUnitId),
754 "Execution unit %d is reserved!!!\n"
755 "SIMD[%d] WV[%d]: %d: %s",
757 gpu_dyn_inst->seqNum(),
758 gpu_dyn_inst->disassemble());
759 exeUnitReservations.at(execUnitId) =
true;
766 if (execUnitIds.size() > 1) {
767 [[maybe_unused]]
int lm_exec_unit = wf->
localMem;
771 }
else if (
s ==
SKIP) {
776 [[maybe_unused]]
int gm_exec_unit = wf->
globalMem;
794 : statistics::
Group(parent,
"ScheduleStage"),
795 ADD_STAT(rdyListEmpty ,
"number of cycles no wave on ready list per "
796 "execution resource"),
797 ADD_STAT(rdyListNotEmpty,
"number of cycles one or more wave on ready "
798 "list per execution resource"),
799 ADD_STAT(addToSchListStalls,
"number of cycles a wave is not added to "
800 "schList per execution resource when ready list is not empty"),
801 ADD_STAT(schListToDispList,
"number of cycles a wave is added to "
802 "dispatchList per execution resource"),
803 ADD_STAT(schListToDispListStalls,
"number of cycles no wave is added to"
804 " dispatchList per execution resource"),
805 ADD_STAT(rfAccessStalls,
"number of stalls due to RF access denied"),
806 ADD_STAT(ldsBusArbStalls,
"number of stalls due to VRF->LDS bus "
808 ADD_STAT(opdNrdyStalls,
"number of stalls in SCH due to operands not "
810 ADD_STAT(dispNrdyStalls,
"number of stalls in SCH due to resource not "
std::vector< WaitClass > scalarALUs
int numVectorGlobalMemUnits
WaitClass vectorGlobalMemUnit
LocalMemPipeline localMemoryPipe
WaitClass vrfToLocalMemPipeBus
int numVectorSharedMemUnits
WaitClass srfToScalarMemPipeBus
ScalarMemPipeline scalarMemoryPipe
GlobalMemPipeline globalMemoryPipe
void insertInPipeMap(Wavefront *w)
std::vector< ScalarRegisterFile * > srf
std::vector< WaitClass > vectorALUs
WaitClass vectorSharedMemUnit
std::vector< VectorRegisterFile * > vrf
WaitClass vrfToGlobalMemPipeBus
Cycles is a wrapper class for representing cycle counts, i.e.
bool outstandingReqsCheck(GPUDynInstPtr mp) const
void acqCoalescerToken(GPUDynInstPtr mp)
bool coalescerReady(GPUDynInstPtr mp) const
bool isGMReqFIFOWrRdy(uint32_t pendReqs=0) const
void reinsertToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
void doDispatchListTransition(int unitId, DISPATCH_STATUS s, const GPUDynInstPtr &gpu_dyn_inst)
@ SCH_RF_OPD_NRDY_CONDITIONS
gem5::ScheduleStage::ScheduleStageStats stats
void checkRfOperandReadComplete()
ScheduleToExecute & toExecute
ScheduleStage(const ComputeUnitParams &p, ComputeUnit &cu, ScoreboardCheckToSchedule &from_scoreboard_check, ScheduleToExecute &to_execute)
bool dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
@ SCH_RF_ACCESS_NRDY_CONDITIONS
std::vector< Scheduler > scheduler
std::vector< std::deque< std::pair< GPUDynInstPtr, SCH_STATUS > > > schList
void scheduleRfDestOperands()
ScoreboardCheckToSchedule & fromScoreboardCheck
void arbitrateVrfToLdsBus()
std::unordered_set< uint64_t > wavesInSch
ComputeUnit & computeUnit
bool schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
void deleteFromSch(Wavefront *w)
bool addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
@ SCH_FLAT_MEM_BUS_BUSY_NRDY
@ SCH_LOCAL_MEM_ISSUE_NRDY
@ SCH_VECTOR_MEM_COALESCER_NRDY
@ SCH_FLAT_MEM_COALESCER_NRDY
@ SCH_SCALAR_MEM_BUS_BUSY_NRDY
@ SCH_VECTOR_MEM_REQS_NRDY
@ SCH_SCALAR_MEM_ISSUE_NRDY
@ SCH_VECTOR_MEM_BUS_BUSY_NRDY
@ SCH_LOCAL_MEM_FIFO_NRDY
@ SCH_FLAT_MEM_ISSUE_NRDY
@ SCH_LOCAL_MEM_BUS_BUSY_NRDY
@ SCH_SCALAR_MEM_FIFO_NRDY
@ SCH_VECTOR_MEM_ISSUE_NRDY
Communication interface between Schedule and Execute stages.
DISPATCH_STATUS dispatchStatus(int func_unit_id) const
void reset() override
Reset the pipe stage interface.
GPUDynInstPtr & readyInst(int func_unit_id)
void dispatchTransition(const GPUDynInstPtr &gpu_dyn_inst, int func_unit_id, DISPATCH_STATUS disp_status)
Once the scheduler has chosen a winning WF for execution, and after the WF's oldest instruction's ope...
Communication interface between ScoreboardCheck and Schedule stages.
int numReadyLists() const
Returns the number of ready lists (i.e., the number of functional units).
std::vector< Wavefront * > & readyWFs(int func_unit_id)
TODO: These methods expose this class' implementation too much by returning references to its interna...
void updateReadyList(int func_unit_id)
Delete all wavefronts that have been marked as ready at scoreboard stage but are found to have empty ...
bool rdy(Cycles cycles=Cycles(0)) const
bool isOldestInstWaitcnt()
void setStatus(status_e newStatus)
bool isOldestInstBarrier()
std::deque< GPUDynInstPtr > instructionBuffer
std::vector< int > reserveResources()
void incLGKMInstsIssued()
void incVMemInstsIssued()
@ S_BARRIER
WF is stalled at a barrier.
@ S_WAITCNT
wavefront has unsatisfied wait counts
gem5::Wavefront::WavefrontStats stats
Derived & subname(off_type index, const std::string &name)
Set the subfield name for the given index, and marks this stat to print at the end of simulation.
Derived & init(size_type size)
Set this vector to have the given size.
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
#define panic(...)
This implements a cprintf based panic() function.
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Copyright (c) 2024 - Pranith Kumar Copyright (c) 2020 Inria All rights reserved.
std::shared_ptr< GPUDynInst > GPUDynInstPtr
std::string csprintf(const char *format, const Args &...args)
statistics::Vector rdyListNotEmpty
statistics::Vector addToSchListStalls
statistics::Vector schListToDispList
ScheduleStageStats(statistics::Group *parent, int num_exec_units)
statistics::Vector rfAccessStalls
statistics::Vector schListToDispListStalls
statistics::Vector opdNrdyStalls
statistics::Vector rdyListEmpty
statistics::Vector dispNrdyStalls
statistics::Scalar ldsBusArbStalls
statistics::Scalar schCycles
statistics::Scalar schRfAccessStalls
statistics::Scalar schOpdNrdyStalls
statistics::Scalar schStalls
const std::string & name()