// gem5: src/gpu-compute/schedule_stage.cc (annotated excerpts; elided
// regions are marked with "// ...")

// ...
#include <unordered_set>

// ...
#include "debug/GPUSched.hh"
#include "debug/GPUVRF.hh"
ScheduleStage::ScheduleStage(const ComputeUnitParams &p, ComputeUnit &cu,
                             ScoreboardCheckToSchedule &from_scoreboard_check,
                             ScheduleToExecute &to_execute)
    : computeUnit(cu),
      // interface from the ScoreboardCheck stage
      fromScoreboardCheck(from_scoreboard_check),
      // interface to the Execute stage
      toExecute(to_execute),
      _name(cu.name() + ".ScheduleStage"),
      vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
      scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
      locMemBusRdy(false), locMemIssueRdy(false), stats(&cu, cu.numExeUnits())
{
    // ... (one Scheduler and one schList queue per execution resource)
}

void
ScheduleStage::init()
{
    fatal_if(scheduler.size() != fromScoreboardCheck.numReadyLists(),
             "Scheduler should have same number of entries as CU's readyList");
    // ...
}
 
void
ScheduleStage::exec()
{
    toExecute.reset();
    // Update the per-resource ready lists; this drops waves that were
    // marked ready at scoreboard check but whose instruction buffers
    // have since emptied.
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        fromScoreboardCheck.updateReadyList(j);
    }
    // ...

    // Iterate the memory (VMEM/SMEM) execution resources first, giving
    // memory instructions priority when scheduling RF operand reads.
    int firstMemUnit = computeUnit.firstMemUnit();
    int lastMemUnit = computeUnit.lastMemUnit();
    for (int j = firstMemUnit; j <= lastMemUnit; j++) {
        int readyListSize = fromScoreboardCheck.readyWFs(j).size();
        // Nothing ready on this execution resource; skip it.
        if (!readyListSize) {
            stats.rdyListEmpty[j]++;
            continue;
        }
        stats.rdyListNotEmpty[j]++;

        // Pick a wave via this resource's scheduling policy.
        Wavefront *wf = scheduler[j].chooseWave();
        GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
        assert(gpu_dyn_inst);
        if (!addToSchList(j, gpu_dyn_inst)) {
            // Wave could not enter SCH; count the stall.
            wf->stats.schCycles++;
            stats.addToSchListStalls[j]++;
        } else {
            // Update the issue counters that s_waitcnt later drains.
            if (gpu_dyn_inst->isScalar() || gpu_dyn_inst->isGroupSeg()) {
                wf->incLGKMInstsIssued();
            } else {
                wf->incVMemInstsIssued();
                if (gpu_dyn_inst->isFlat()) {
                    // Flat may resolve to LDS or global: count both.
                    wf->incLGKMInstsIssued();
                }
            }
            if (gpu_dyn_inst->isStore() && gpu_dyn_inst->isGlobalSeg()) {
                wf->incExpInstsIssued();
            }
        }
    }

    // Iterate the remaining execution resources, skipping the memory
    // units handled above.
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        if (j >= firstMemUnit && j <= lastMemUnit) {
            continue;
        }
        int readyListSize = fromScoreboardCheck.readyWFs(j).size();
        if (!readyListSize) {
            stats.rdyListEmpty[j]++;
            continue;
        }
        // ... (same pattern: choose a wave, attempt addToSchList)
        assert(gpu_dyn_inst);
        // ...
    }
    // ... (then: scheduleRfDestOperands, fillDispatchList,
    //      arbitrateVrfToLdsBus, checkRfOperandReadComplete,
    //      reserveResources)
}
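
The three issue counters incremented above feed the GCN s_waitcnt mechanism: a wave blocks until its outstanding-operation counts drain to the thresholds encoded in the s_waitcnt instruction. A minimal sketch of that semantics (toy type and fields; this follows the GCN ISA convention, not gem5's Wavefront implementation):

// --- Illustrative sketch (not part of schedule_stage.cc) ---
struct ToyWaitCnt
{
    int vmCnt = 0;   // outstanding vector memory ops (incVMemInstsIssued)
    int lgkmCnt = 0; // outstanding LDS/GDS/scalar-mem ops (incLGKMInstsIssued)
    int expCnt = 0;  // outstanding export/global-store ops (incExpInstsIssued)

    // s_waitcnt blocks the wave until every counter has drained to the
    // threshold encoded in the instruction.
    bool satisfied(int vm, int lgkm, int exp) const
    {
        return vmCnt <= vm && lgkmCnt <= lgkm && expCnt <= exp;
    }
};
// --- end sketch ---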
 
// schedRfWrites: ask both register files whether the instruction's
// destination (write) operands can be scheduled this cycle; commit the
// writes only if both agree.
bool
ScheduleStage::schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
{
    assert(gpu_dyn_inst);
    Wavefront *wf = gpu_dyn_inst->wavefront();
    bool accessVrfWr = true;
    if (!gpu_dyn_inst->isScalar()) {
        accessVrfWr = computeUnit.vrf[wf->simdId]
            ->canScheduleWriteOperands(wf, gpu_dyn_inst);
    }
    bool accessSrfWr = computeUnit.srf[wf->simdId]
        ->canScheduleWriteOperands(wf, gpu_dyn_inst);
    bool accessRf = accessVrfWr && accessSrfWr;
    if (accessRf) {
        if (!gpu_dyn_inst->isScalar()) {
            // ... (schedule the VRF write operands)
        }
        // ... (schedule the SRF write operands)
        return true;
    }
    // ... (count RF access-denied stalls)
    return false;
}
 
// scheduleRfDestOperands: walk the schList entries and try to reserve RF
// write ports for each wave's destination operands via schedRfWrites().
void
ScheduleStage::scheduleRfDestOperands()
{
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        // ... (for each entry on schList.at(j))
        assert(gpu_dyn_inst);
        Wavefront *wf = gpu_dyn_inst->wavefront();
        // ...
    }
}
 
// addToSchList: enter a wave into the schList for this execution
// resource if both register files can schedule reads of its source
// operands this cycle.
bool
ScheduleStage::addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
{
    assert(gpu_dyn_inst);
    Wavefront *wf = gpu_dyn_inst->wavefront();
    bool accessVrf = true;
    if (!gpu_dyn_inst->isScalar()) {
        accessVrf = computeUnit.vrf[wf->simdId]
            ->canScheduleReadOperands(wf, gpu_dyn_inst);
    }
    bool accessSrf = computeUnit.srf[wf->simdId]
        ->canScheduleReadOperands(wf, gpu_dyn_inst);
    bool accessRf = accessVrf && accessSrf;
    if (accessRf) {
        DPRINTF(GPUSched, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n",
                exeType, wf->simdId, wf->wfDynId,
                gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());

        computeUnit.insertInPipeMap(wf);
        wavesInSch.emplace(wf->wfDynId);
        // New entries wait in RFBUSY until their operand reads complete.
        schList.at(exeType).push_back(std::make_pair(gpu_dyn_inst, RFBUSY));
        // ... (mark the wave S_WAITCNT / S_BARRIER if its oldest
        //      instruction blocks on a waitcnt or barrier)
        if (!gpu_dyn_inst->isScalar()) {
            computeUnit.vrf[wf->simdId]
                ->scheduleReadOperands(wf, gpu_dyn_inst);
        }
        computeUnit.srf[wf->simdId]->scheduleReadOperands(wf, gpu_dyn_inst);

        DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
                exeType, wf->simdId, wf->wfDynId,
                gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
        // ...
        return true;
    } else {
        // ... (count RF access-denied stalls, including
        //      wf->stats.schRfAccessStalls)
        DPRINTF(GPUSched, "schList[%d]: Could not add: "
                "SIMD[%d] WV[%d]: %d: %s\n",
                exeType, wf->simdId, wf->wfDynId,
                gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
        return false;
    }
}
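
Both register files are driven through the same four-call protocol seen above: canScheduleReadOperands() asks whether the reads can be scheduled, scheduleReadOperands() commits them, operandReadComplete() polls for completion (see checkRfOperandReadComplete() below), and dispatchInstruction() releases the file when the wave leaves for execution. A toy single-instruction register file capturing that contract (a sketch with an invented latency; the real interface also takes the Wavefront and GPUDynInstPtr):

// --- Illustrative sketch (not part of schedule_stage.cc) ---
class ToyRegisterFile
{
    bool inFlight = false; // an operand read is outstanding
    int cyclesLeft = 0;    // invented fixed read latency

  public:
    bool canScheduleReadOperands() const { return !inFlight; }
    void scheduleReadOperands() { inFlight = true; cyclesLeft = 2; }
    bool operandReadComplete() const { return inFlight && cyclesLeft == 0; }
    void dispatchInstruction() { inFlight = false; } // wave leaves SCH
    void tick() { if (cyclesLeft > 0) { --cyclesLeft; } } // one cycle passes
};
// --- end sketch ---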
 
// reinsertToSchList: return a wave to the schList for this execution
// resource with its operands already read (RFREADY), keeping the list in
// age order (oldest wfDynId first).
void
ScheduleStage::reinsertToSchList(int exeType,
                                 const GPUDynInstPtr &gpu_dyn_inst)
{
    assert(gpu_dyn_inst);
    auto schIter = schList.at(exeType).begin();
    while (schIter != schList.at(exeType).end()
           && schIter->first->wfDynId < gpu_dyn_inst->wfDynId) {
        schIter++;
    }
    schList.at(exeType).insert(schIter,
                               std::make_pair(gpu_dyn_inst, RFREADY));
}
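
The loop above maintains the schList invariant that entries are sorted by ascending wfDynId, so the oldest wave is always considered first for dispatch. The same idea in self-contained form (toy types, assuming a plain deque of (wfDynId, status) pairs):

// --- Illustrative sketch (not part of schedule_stage.cc) ---
#include <cstdint>
#include <deque>
#include <utility>

enum ToySchStatus { TOY_RFBUSY, TOY_RFREADY };

void
insertInAgeOrder(std::deque<std::pair<uint64_t, ToySchStatus>> &q,
                 uint64_t wfDynId)
{
    auto it = q.begin();
    // Walk past all strictly older (smaller wfDynId) entries.
    while (it != q.end() && it->first < wfDynId) {
        ++it;
    }
    q.insert(it, std::make_pair(wfDynId, TOY_RFREADY));
}
// --- end sketch ---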
 
// dispatchReady: check whether the execution resource this instruction
// targets can accept it this cycle. Each instruction class consults a
// different subset of the *Rdy flags computed earlier in exec().
bool
ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
{
    assert(gpu_dyn_inst);
    Wavefront *wf = gpu_dyn_inst->wavefront();
    // ...
    if (gpu_dyn_inst->isNop()) {
        // Nop executes on the scalar or vector ALU.
        if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
            // ... (count dispNrdyStalls)
            return false;
        } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
            // ... (count dispNrdyStalls)
            return false;
        }
    } else if (gpu_dyn_inst->isEndOfKernel()) {
        // EndPgm needs the scalar ALU.
        // ...
    } else if (gpu_dyn_inst->isBarrier() || gpu_dyn_inst->isBranch()
               || gpu_dyn_inst->isALU()) {
        if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
            // ... (count dispNrdyStalls)
            return false;
        } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
            // ... (count dispNrdyStalls)
            return false;
        }
    } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
        // Vector global memory: needs the issue slot, the VRF->GM bus,
        // a coalescer slot, and room for outstanding requests.
        bool rdy = true;
        if (!glbMemIssueRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
        }
        if (!glbMemBusRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
            rdy = false;
            stats.dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
            rdy = false;
            stats.dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
        // Scalar memory: checks scalarMemIssueRdy, scalarMemBusRdy, and
        // the GM request FIFO (isGMReqFIFOWrRdy).
        // ...
    } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isLocalMem()) {
        // LDS: checks locMemIssueRdy, locMemBusRdy, and the LM request
        // FIFO.
        // ...
    } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isFlat()) {
        // Flat: needs both the global and the local memory resources.
        // ...
    } else {
        panic("%s: unknown instr checked for readiness",
              gpu_dyn_inst->disassemble());
    }
    // ...
    return true;
}
 
// fillDispatchList: for each execution resource, scan its schList in age
// order and promote at most one RFREADY wave to the dispatchList
// (EMPTY -> EXREADY).
void
ScheduleStage::fillDispatchList()
{
    // ...
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        auto schIter = schList.at(j).begin();
        bool dispatched = false;
        while (schIter != schList.at(j).end()) {
            // Only entries whose operand reads have completed are
            // candidates for dispatch.
            if (schIter->second == RFREADY) {
                bool dispRdy = dispatchReady(schIter->first);
                if (!dispatched && dispRdy) {
                    GPUDynInstPtr mp = schIter->first;
                    // Acquire a coalescer token for vector global/flat
                    // memory operations.
                    if (!mp->isMemSync() && !mp->isScalar() &&
                        (mp->isGlobalMem() || mp->isFlat())) {
                        computeUnit.globalMemoryPipe.acqCoalescerToken(mp);
                    }
                    // Memory references latch the wave's exec mask now.
                    if (mp->isMemRef()) {
                        mp->exec_mask = mp->wavefront()->execMask();
                    }
                    doDispatchListTransition(j, EXREADY, mp);
                    DPRINTF(GPUSched, "dispatchList[%d]: fillDispatchList: "
                            "EMPTY->EXREADY\n", j);
                    schIter->first = nullptr;
                    schIter = schList.at(j).erase(schIter);
                    dispatched = true;
                } else {
                    // Another wave already won this resource, or this one
                    // is not ready: it stalls in SCH this cycle.
                    schIter->first->wavefront()->stats.schStalls++;
                    if (!dispRdy) {
                        schIter->first->wavefront()
                            ->stats.schResourceStalls++;
                    }
                    // ...
                    schIter++;
                }
            } else {
                schIter++;
            }
        }
        // ... (update schListToDispList / schListToDispListStalls stats)
    }
}
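
Per the header, schList is declared as std::vector<std::deque<std::pair<GPUDynInstPtr, SCH_STATUS>>>, one deque per execution resource. The scan above promotes at most one RFREADY entry per resource per cycle; a simplified model of that selection (a toy version that drops the stall accounting):

// --- Illustrative sketch (not part of schedule_stage.cc) ---
#include <deque>
#include <functional>
#include <utility>

enum ToyStatus { TOY2_RFBUSY, TOY2_RFREADY };

// Returns the chosen entry's id, or -1 if nothing could be dispatched.
int
fillOne(std::deque<std::pair<int, ToyStatus>> &schQ,
        const std::function<bool(int)> &dispatchReady)
{
    for (auto it = schQ.begin(); it != schQ.end(); ++it) {
        // RFBUSY entries are still reading operands; stalled RFREADY
        // entries stay queued and retry next cycle.
        if (it->second == TOY2_RFREADY && dispatchReady(it->first)) {
            int chosen = it->first; // schList -> dispatchList (EXREADY)
            schQ.erase(it);
            return chosen;
        }
    }
    return -1;
}
// --- end sketch ---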
 
// arbitrateVrfToLdsBus: a flat instruction that is EXREADY on its global
// memory pipe also needs the VRF->LDS bus. If the wave's local memory
// pipe has a different wave selected, that wave loses arbitration and is
// returned to its schList.
void
ScheduleStage::arbitrateVrfToLdsBus()
{
    // ... (for each global memory pipe i with a valid wave selected)
        if (toExecute.dispatchStatus(i) == EXREADY
            && gpu_dyn_inst->isFlat()) {
            Wavefront *wf = gpu_dyn_inst->wavefront();
            if (toExecute.dispatchStatus(wf->localMem) == EXREADY) {
                // The LM pipe's wave loses arbitration: count the stall
                // and push the wave back onto its schList.
                stats.ldsBusArbStalls++;
                toExecute.readyInst(wf->localMem)
                    ->wavefront()->stats.schLdsArbStalls++;
                // ...
                DPRINTF(GPUSched, "dispatchList[%d]: arbVrfLds: "
                        "EXREADY->EMPTY\n", wf->localMem);
            }
        }
    // ...
}
// checkRfOperandReadComplete: poll the register files for each schList
// entry; once every in-flight operand read is done, the entry moves from
// RFBUSY to RFREADY.
void
ScheduleStage::checkRfOperandReadComplete()
{
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        for (auto &p : schList.at(j)) {
            GPUDynInstPtr &gpu_dyn_inst = p.first;
            assert(gpu_dyn_inst);
            Wavefront *wf = gpu_dyn_inst->wavefront();
            // ...
            bool vrfRdy = true;
            if (!gpu_dyn_inst->isScalar()) {
                vrfRdy = computeUnit.vrf[wf->simdId]
                    ->operandReadComplete(wf, gpu_dyn_inst);
            }
            bool srfRdy = computeUnit.srf[wf->simdId]
                ->operandReadComplete(wf, gpu_dyn_inst);
            bool operandsReady = vrfRdy && srfRdy;
            if (operandsReady) {
                DPRINTF(GPUSched, "schList[%d]: WV[%d] operands ready for: "
                        "%d: %s\n", j, wf->wfDynId, gpu_dyn_inst->seqNum(),
                        gpu_dyn_inst->disassemble());
                DPRINTF(GPUSched, "schList[%d]: WV[%d] RFBUSY->RFREADY\n",
                        j, wf->wfDynId);
                p.second = RFREADY;
            } else {
                DPRINTF(GPUSched, "schList[%d]: WV[%d] operands not ready "
                        "for: %d: %s\n", j, wf->wfDynId,
                        gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
                // ... (count opdNrdyStalls)
            }
        }
    }
}
 
// reserveResources: claim this cycle's execution resources for every
// wave that is ready to execute, and sanity-check that no resource is
// claimed twice.
void
ScheduleStage::reserveResources()
{
    std::vector<bool> exeUnitReservations;
    exeUnitReservations.resize(computeUnit.numExeUnits(), false);

    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j);
        if (gpu_dyn_inst) {
            DISPATCH_STATUS s = toExecute.dispatchStatus(j);
            Wavefront *wf = gpu_dyn_inst->wavefront();
            if (s == EXREADY) {
                // The wave will execute: reserve its execution resources
                // and notify the register files.
                std::vector<int> execUnitIds = wf->reserveResources();
                if (!gpu_dyn_inst->isScalar()) {
                    computeUnit.vrf[wf->simdId]
                        ->dispatchInstruction(gpu_dyn_inst);
                }
                computeUnit.srf[wf->simdId]
                    ->dispatchInstruction(gpu_dyn_inst);

                std::stringstream ss;
                for (auto id : execUnitIds) {
                    ss << id << " ";
                }
                DPRINTF(GPUSched, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s"
                        "    Reserving ExeRes[ %s]\n",
                        j, wf->simdId, wf->wfDynId, gpu_dyn_inst->seqNum(),
                        gpu_dyn_inst->disassemble(), ss.str());
                // Mark the resources as reserved for this cycle.
                for (auto execUnitId : execUnitIds) {
                    panic_if(exeUnitReservations.at(execUnitId),
                             "Execution unit %d is reserved!!!\n"
                             "SIMD[%d] WV[%d]: %d: %s",
                             execUnitId, wf->simdId, wf->wfDynId,
                             gpu_dyn_inst->seqNum(),
                             gpu_dyn_inst->disassemble());
                    exeUnitReservations.at(execUnitId) = true;
                }

                // A flat instruction reserves multiple resources: both a
                // global and a local memory unit. The local unit was
                // marked SKIP when the wave was dispatched.
                if (execUnitIds.size() > 1) {
                    GEM5_VAR_USED int lm_exec_unit = wf->localMem;
                    assert(toExecute.dispatchStatus(lm_exec_unit) == SKIP);
                }
            } else if (s == SKIP) {
                // This local memory pipe was skipped because its wave is
                // the flat instruction executing on the global pipe.
                GEM5_VAR_USED int gm_exec_unit = wf->globalMem;
                assert(toExecute.readyInst(gm_exec_unit)->wfDynId
                       == wf->wfDynId);
            }
        }
    }
}
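
The two assertions above encode the flat-instruction pairing: a dispatched flat access occupies its wave's global memory pipe as EXREADY while the wave's local memory pipe is held in SKIP, so both units stay reserved for the one instruction. Schematically (status names as used above):

// --- Illustrative sketch (not part of schedule_stage.cc) ---
// For a flat instruction belonging to wavefront wf:
//   dispatch slot wf->globalMem : (inst, EXREADY)  // executes here
//   dispatch slot wf->localMem  : (inst, SKIP)     // reserved, no execute
// --- end sketch ---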
 
ScheduleStage::ScheduleStageStats::ScheduleStageStats(
        statistics::Group *parent, int num_exec_units)
    : statistics::Group(parent, "ScheduleStage"),
      ADD_STAT(rdyListEmpty, "number of cycles no wave on ready list per "
               "execution resource"),
      ADD_STAT(rdyListNotEmpty, "number of cycles one or more wave on ready "
               "list per execution resource"),
      ADD_STAT(addToSchListStalls, "number of cycles a wave is not added to "
               "schList per execution resource when ready list is not empty"),
      ADD_STAT(schListToDispList, "number of cycles a wave is added to "
               "dispatchList per execution resource"),
      ADD_STAT(schListToDispListStalls, "number of cycles no wave is added to"
               " dispatchList per execution resource"),
      ADD_STAT(rfAccessStalls, "number of stalls due to RF access denied"),
      ADD_STAT(ldsBusArbStalls, "number of stalls due to VRF->LDS bus "
               "conflicts"),
      ADD_STAT(opdNrdyStalls, "number of stalls in SCH due to operands not "
               "ready"),
      ADD_STAT(dispNrdyStalls, "number of stalls in SCH due to resource not "
               "ready")
{
    // ... (size each per-resource vector stat and set subnames)
}
  