#include <unordered_set>
// ...
#include "debug/GPUSched.hh"
#include "debug/GPUVRF.hh"
// ScheduleStage constructor: wire up the neighboring pipeline-stage
// interfaces and initialize the per-resource readiness flags.
ScheduleStage::ScheduleStage(const ComputeUnitParams &p, ComputeUnit &cu,
                             ScoreboardCheckToSchedule &from_scoreboard_check,
                             ScheduleToExecute &to_execute)
    : computeUnit(cu), fromScoreboardCheck(from_scoreboard_check),
      toExecute(to_execute),
      _name(cu.name() + ".ScheduleStage"),
      vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
      scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
      locMemBusRdy(false), locMemIssueRdy(false), stats(&cu, cu.numExeUnits())
80 "Scheduler should have same number of entries as CU's readyList");
// ScheduleStage::exec(): try to add one ready wave per execution resource
// to the schList queues, visiting the memory units first.
    for (int j = firstMemUnit; j <= lastMemUnit; j++) {
        if (!readyListSize) {
            // nothing ready on this execution resource ...
        }
        // ...
        assert(gpu_dyn_inst);
        // ... on success, bump the wave's issue counters:
        if (gpu_dyn_inst->isScalar() || gpu_dyn_inst->isGroupSeg()) {
            // ...
        }
        if (gpu_dyn_inst->isFlat()) {
            // ...
        }
        if (gpu_dyn_inst->isStore() && gpu_dyn_inst->isGlobalSeg()) {
            // ...
        }
    }

    // Then the remaining execution resources, skipping the memory units:
        if (j >= firstMemUnit && j <= lastMemUnit) {
            // ...
        }
        if (!readyListSize) {
            // ...
        }
        // ...
        assert(gpu_dyn_inst);
// ScheduleStage::schedRfWrites(): check whether the register files can
// schedule the instruction's destination (write) operands.
    assert(gpu_dyn_inst);
    Wavefront *wf = gpu_dyn_inst->wavefront();
    bool accessVrfWr = true;
    if (!gpu_dyn_inst->isScalar()) {
        accessVrfWr = computeUnit.vrf[wf->simdId]
            ->canScheduleWriteOperands(wf, gpu_dyn_inst);
    }
    bool accessSrfWr = computeUnit.srf[wf->simdId]
        ->canScheduleWriteOperands(wf, gpu_dyn_inst);
    bool accessRf = accessVrfWr && accessSrfWr;
    if (accessRf) {
        if (!gpu_dyn_inst->isScalar()) {
            // ...
        }
    }
// ScheduleStage::scheduleRfDestOperands():
    assert(gpu_dyn_inst);
    Wavefront *wf = gpu_dyn_inst->wavefront();
    // ...
// ScheduleStage::addToSchList(): add a wave to the schList of the given
// execution resource if the register files can schedule its operand reads.
    assert(gpu_dyn_inst);
    Wavefront *wf = gpu_dyn_inst->wavefront();
    bool accessVrf = true;
    if (!gpu_dyn_inst->isScalar()) {
        accessVrf = computeUnit.vrf[wf->simdId]
            ->canScheduleReadOperands(wf, gpu_dyn_inst);
    }
    bool accessSrf = computeUnit.srf[wf->simdId]
        ->canScheduleReadOperands(wf, gpu_dyn_inst);
    bool accessRf = accessVrf && accessSrf;
    if (accessRf) {
        DPRINTF(GPUSched, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n",
                exeType, wf->simdId, wf->wfDynId,
                gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
        // ...
        schList.at(exeType).push_back(std::make_pair(gpu_dyn_inst, RFBUSY));
        // ...
        if (!gpu_dyn_inst->isScalar()) {
            computeUnit.vrf[wf->simdId]
                ->scheduleReadOperands(wf, gpu_dyn_inst);
        }
        // ...
        DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
                exeType, wf->simdId, wf->wfDynId,
                gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
    } else {
        // RF access denied: count the stall.
        // ...
        DPRINTF(GPUSched, "schList[%d]: Could not add: "
                "SIMD[%d] WV[%d]: %d: %s\n",
                exeType, wf->simdId, wf->wfDynId,
                gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
    }
// ScheduleStage::reinsertToSchList(): re-insert a wave into the schList in
// age order (oldest wave first), already marked ready for dispatch.
    assert(gpu_dyn_inst);
    auto schIter = schList.at(exeType).begin();
    while (schIter != schList.at(exeType).end()
           && schIter->first->wfDynId < gpu_dyn_inst->wfDynId) {
        schIter++;
    }
    schList.at(exeType).insert(schIter, std::make_pair(gpu_dyn_inst,
                                                       RFREADY));
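The age-ordered insertion above is easy to reproduce in isolation. Below is a
minimal, self-contained sketch of the same pattern; the simplified Inst type,
its wfDynId field, and insertByAge are invented stand-ins for illustration,
not gem5 code.

#include <cstdint>
#include <deque>
#include <iostream>
#include <memory>
#include <utility>

enum SchStatus { RFBUSY, RFREADY };

struct Inst { uint64_t wfDynId; };                 // stand-in for GPUDynInst
using InstPtr = std::shared_ptr<Inst>;

// Keep the deque sorted by wavefront dynamic ID (oldest, i.e. smallest ID,
// first): advance past older entries, then insert before the first younger
// one, as reinsertToSchList() does.
void
insertByAge(std::deque<std::pair<InstPtr, SchStatus>> &list,
            const InstPtr &inst)
{
    auto it = list.begin();
    while (it != list.end() && it->first->wfDynId < inst->wfDynId) {
        ++it;
    }
    list.insert(it, std::make_pair(inst, RFREADY));
}

int
main()
{
    std::deque<std::pair<InstPtr, SchStatus>> schList;
    for (uint64_t id : {4, 1, 3, 2}) {
        insertByAge(schList, std::make_shared<Inst>(Inst{id}));
    }
    for (const auto &p : schList) {
        std::cout << p.first->wfDynId << " ";      // prints: 1 2 3 4
    }
    std::cout << "\n";
    return 0;
}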
// ScheduleStage::dispatchReady(): check whether the execution resources the
// instruction needs can accept it next cycle.
    assert(gpu_dyn_inst);
    Wavefront *wf = gpu_dyn_inst->wavefront();
    // ...
    if (gpu_dyn_inst->isNop()) {
        // ...
        } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
            // a vector nop needs the vector ALU
            // ...
        }
    } else if (gpu_dyn_inst->isEndOfKernel()) {
        // ...
    } else if (gpu_dyn_inst->isBarrier() || gpu_dyn_inst->isBranch()
               || gpu_dyn_inst->isALU()) {
        // ...
        } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
            // ...
        }
    } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
        // vector global memory: also checks the bus, pipe, and coalescer
        // ...
    } else if (gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
        // ...
    } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isLocalMem()) {
        // ...
    } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isFlat()) {
        // ...
    } else {
        panic("%s: unknown instr checked for readiness",
              gpu_dyn_inst->disassemble());
    }
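The readiness flags consulted above (vectorAluRdy, scalarAluRdy, and the
memory bus/issue flags) boil down to asking each resource whether it can
accept work one cycle from now. The sketch below is a self-contained
illustration of that idea only; BusyUnit, InstKind, and canDispatchNextCycle
are invented stand-ins loosely modeled on the WaitClass::rdy(Cycles) pattern,
not gem5's pipeline classes.

#include <cstdint>
#include <iostream>

// A resource that is busy until some cycle; rdy() asks whether it can
// accept new work 'lookahead' cycles from 'now'.
struct BusyUnit
{
    uint64_t busyUntil;   // first cycle at which the unit is free

    bool rdy(uint64_t now, uint64_t lookahead = 0) const
    {
        return now + lookahead >= busyUntil;
    }
};

enum class InstKind { VectorAlu, ScalarAlu, GlobalMem };

// Consult only the resources the instruction kind needs, in the spirit of
// dispatchReady(). The resource set here is deliberately simplified.
bool
canDispatchNextCycle(InstKind kind, uint64_t now,
                     const BusyUnit &vectorAlu, const BusyUnit &scalarAlu,
                     const BusyUnit &globalMemPipe)
{
    switch (kind) {
      case InstKind::VectorAlu:
        return vectorAlu.rdy(now, 1);
      case InstKind::ScalarAlu:
        return scalarAlu.rdy(now, 1);
      case InstKind::GlobalMem:
        return globalMemPipe.rdy(now, 1);
    }
    return false;
}

int
main()
{
    BusyUnit valu{5}, salu{0}, gmem{2};
    uint64_t now = 3;
    // prints "01": the vector ALU is still busy, the GM pipe is free
    std::cout << canDispatchNextCycle(InstKind::VectorAlu, now,
                                      valu, salu, gmem)
              << canDispatchNextCycle(InstKind::GlobalMem, now,
                                      valu, salu, gmem)
              << "\n";
    return 0;
}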
// ScheduleStage::fillDispatchList(): for each execution resource, scan its
// schList queue and move the first wave that is both RFREADY and
// dispatch-ready onto the dispatch list for execution next cycle.
        auto schIter = schList.at(j).begin();
        bool dispatched = false;
        while (schIter != schList.at(j).end()) {
            // only consider waves whose operand reads have completed
            if (schIter->second == RFREADY) {
                bool dispRdy = dispatchReady(schIter->first);
                if (!dispatched && dispRdy) {
                    GPUDynInstPtr mp = schIter->first;
                    if (!mp->isMemSync() && !mp->isScalar() &&
                        (mp->isGlobalMem() || mp->isFlat())) {
                        // ... acquire a coalescer token for this access
                    }
                    // memory operations capture the wave's exec mask now
                    if (mp->isMemRef()) {
                        mp->exec_mask = mp->wavefront()->execMask();
                    }
                    // ...
                    DPRINTF(GPUSched, "dispatchList[%d]: fillDispatchList: "
                            "EMPTY->EXREADY\n", j);
                    schIter->first = nullptr;
                    schIter = schList.at(j).erase(schIter);
                    dispatched = true;
                } else {
                    // the wave stalls in SCH this cycle
                    schIter->first->wavefront()->stats.schStalls++;
                    // ... and, if the resources were not ready:
                    schIter->first->wavefront()->stats.schResourceStalls++;
                    // ...
                }
            }
            // ...
        }
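Stripped of the statistics and the coalescer/exec-mask handling, the loop
above is a "find the first ready entry and move it into the per-unit dispatch
slot" scan. The following self-contained sketch shows only that pattern;
Wave, fillOne, and the odd-id readiness lambda are invented for illustration
and are not gem5 code.

#include <deque>
#include <functional>
#include <iostream>
#include <optional>
#include <utility>

enum SchStatus { RFBUSY, RFREADY };
enum DispatchStatus { EMPTY, EXREADY };

struct Wave { int id; };

// Move the first RFREADY wave that passes the readiness check into the
// dispatch slot; everything else stays queued for a later cycle.
DispatchStatus
fillOne(std::deque<std::pair<Wave, SchStatus>> &queue,
        const std::function<bool(const Wave &)> &dispatchReady,
        std::optional<Wave> &dispatchSlot)
{
    for (auto it = queue.begin(); it != queue.end(); ++it) {
        if (it->second == RFREADY && dispatchReady(it->first)) {
            dispatchSlot = it->first;   // EMPTY -> EXREADY
            queue.erase(it);
            return EXREADY;
        }
    }
    return EMPTY;
}

int
main()
{
    std::deque<std::pair<Wave, SchStatus>> queue = {
        {{7}, RFBUSY}, {{3}, RFREADY}, {{5}, RFREADY}};
    std::optional<Wave> slot;
    // pretend only waves with odd ids are dispatch-ready this cycle
    auto rdy = [](const Wave &w) { return w.id % 2 == 1; };
    if (fillOne(queue, rdy, slot) == EXREADY) {
        std::cout << "dispatched wave " << slot->id << "\n";   // wave 3
    }
    return 0;
}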
// ScheduleStage::arbitrateVrfToLdsBus(): a flat memory instruction needs
// both the VRF->global bus and the VRF->LDS bus; the wave stalls if its
// local-memory pipe is already occupied.
        if (gpu_dyn_inst && toExecute.dispatchStatus(gm_exe_unit)
            == EXREADY && gpu_dyn_inst->isFlat()) {
            Wavefront *wf = gpu_dyn_inst->wavefront();
            // ... on a conflict, the stall is charged to the wave:
            toExecute.readyInst(gm_exe_unit)
                ->wavefront()->stats.schLdsArbStalls++;
            // ... otherwise the LM pipe is reserved and the transition logged:
            DPRINTF(GPUSched, "dispatchList[%d]: arbVrfLds: "
                    // ...
// ScheduleStage::checkRfOperandReadComplete(): walk every schList entry and
// promote waves whose RF operand reads have finished from RFBUSY to RFREADY.
            assert(gpu_dyn_inst);
            Wavefront *wf = gpu_dyn_inst->wavefront();
            // ...
            bool vrfRdy = true;
            if (!gpu_dyn_inst->isScalar()) {
                vrfRdy = computeUnit.vrf[wf->simdId]
                    ->operandReadComplete(wf, gpu_dyn_inst);
            }
            bool srfRdy = computeUnit.srf[wf->simdId]
                ->operandReadComplete(wf, gpu_dyn_inst);
            bool operandsReady = vrfRdy && srfRdy;
            if (operandsReady) {
                DPRINTF(GPUSched, "schList[%d]: WV[%d] operands ready for: "
                        "%d: %s\n", j, wf->wfDynId, gpu_dyn_inst->seqNum(),
                        gpu_dyn_inst->disassemble());
                DPRINTF(GPUSched, "schList[%d]: WV[%d] RFBUSY->RFREADY\n",
                        j, wf->wfDynId);
                // ...
            } else {
                DPRINTF(GPUSched, "schList[%d]: WV[%d] operands not ready "
                        "for: %d: %s\n", j, wf->wfDynId,
                        gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
            }
// Reserving execution resources for EXREADY waves on the dispatch list: the
// register files are notified of the dispatch, and every unit the wave needs
// is marked reserved, with a check that no unit is claimed twice in a cycle.
            Wavefront *wf = gpu_dyn_inst->wavefront();
            // ...
            if (!gpu_dyn_inst->isScalar()) {
                computeUnit.vrf[wf->simdId]
                    ->dispatchInstruction(gpu_dyn_inst);
            }
            // ...
            std::stringstream ss;
            for (auto id : execUnitIds) {
                ss << id << " ";
            }
            DPRINTF(GPUSched, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s"
                    " Reserving ExeRes[ %s]\n",
                    j, wf->simdId, wf->wfDynId, gpu_dyn_inst->seqNum(),
                    gpu_dyn_inst->disassemble(), ss.str());
            for (auto execUnitId : execUnitIds) {
                panic_if(exeUnitReservations.at(execUnitId),
                         "Execution unit %d is reserved!!!\n"
                         "SIMD[%d] WV[%d]: %d: %s",
                         execUnitId, wf->simdId, wf->wfDynId,
                         gpu_dyn_inst->seqNum(),
                         gpu_dyn_inst->disassemble());
                exeUnitReservations.at(execUnitId) = true;
            }
            // ...
            // a flat instruction also claims the wave's local-memory unit
            if (execUnitIds.size() > 1) {
                [[maybe_unused]] int lm_exec_unit = wf->localMem;
                // ...
            }
        } else if (s == SKIP) {
            // The LM pipe was reserved (SKIP) for a flat instruction that
            // dispatches from the corresponding GM pipe.
            [[maybe_unused]] int gm_exec_unit = wf->globalMem;
            // ...
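The double-reservation check is a small, self-contained pattern worth
isolating. The sketch below reproduces it with invented names
(ReservationTable, PANIC_IF); it illustrates the bookkeeping only and is not
gem5's panic_if or its scheduling code.

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <vector>

// Stand-in for gem5's panic_if(): abort with a message when cond is true.
#define PANIC_IF(cond, ...)                      \
    do {                                         \
        if (cond) {                              \
            std::fprintf(stderr, __VA_ARGS__);   \
            std::abort();                        \
        }                                        \
    } while (0)

// Per-cycle execution-unit reservations: each unit may be claimed at most
// once per cycle, mirroring the exeUnitReservations check above.
class ReservationTable
{
  public:
    explicit ReservationTable(int num_units) : reserved(num_units, false) {}

    void
    reserve(int unit_id)
    {
        PANIC_IF(reserved.at(unit_id),
                 "Execution unit %d is reserved!\n", unit_id);
        reserved.at(unit_id) = true;
    }

    // Called at the start of every cycle.
    void reset() { std::fill(reserved.begin(), reserved.end(), false); }

  private:
    std::vector<bool> reserved;
};

int
main()
{
    ReservationTable table(4);
    table.reserve(2);   // ok
    table.reserve(3);   // ok
    table.reset();      // new cycle
    table.reserve(2);   // ok again; claiming 2 twice in one cycle would abort
    return 0;
}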
// ScheduleStage statistics.
ScheduleStage::ScheduleStageStats::ScheduleStageStats(
        statistics::Group *parent, int num_exec_units)
    : statistics::Group(parent, "ScheduleStage"),
      ADD_STAT(rdyListEmpty, "number of cycles no wave on ready list per "
               "execution resource"),
      ADD_STAT(rdyListNotEmpty, "number of cycles one or more wave on ready "
               "list per execution resource"),
      ADD_STAT(addToSchListStalls, "number of cycles a wave is not added to "
               "schList per execution resource when ready list is not empty"),
      ADD_STAT(schListToDispList, "number of cycles a wave is added to "
               "dispatchList per execution resource"),
      ADD_STAT(schListToDispListStalls, "number of cycles no wave is added to"
               " dispatchList per execution resource"),
      ADD_STAT(rfAccessStalls, "number of stalls due to RF access denied"),
      ADD_STAT(ldsBusArbStalls, "number of stalls due to VRF->LDS bus "
               "conflicts"),
      ADD_STAT(opdNrdyStalls, "number of stalls in SCH due to operands not "
               "ready"),
      ADD_STAT(dispNrdyStalls, "number of stalls in SCH due to resource not "
               "ready")