34 #include <unordered_set> 
   37 #include "debug/GPUSched.hh" 
   38 #include "debug/GPUVRF.hh" 
   51     : computeUnit(cu), fromScoreboardCheck(from_scoreboard_check),
 
   52       toExecute(to_execute),
 
   53       _name(cu.
name() + 
".ScheduleStage"),
 
   54       vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
 
   55       scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
 
   56       locMemBusRdy(false), locMemIssueRdy(false), stats(&cu, cu.numExeUnits())
 
   80              "Scheduler should have same number of entries as CU's readyList");
 
  121     for (
int j = firstMemUnit; 
j <= lastMemUnit; 
j++) {
 
  125         if (!readyListSize) {
 
  134         assert(gpu_dyn_inst);
 
  141             if (gpu_dyn_inst->isScalar() || gpu_dyn_inst->isGroupSeg()) {
 
  145                 if (gpu_dyn_inst->isFlat()) {
 
  149             if (gpu_dyn_inst->isStore() && gpu_dyn_inst->isGlobalSeg()) {
 
  158         if (
j >= firstMemUnit && 
j <= lastMemUnit) {
 
  164         if (!readyListSize) {
 
  173         assert(gpu_dyn_inst);
 
  228     assert(gpu_dyn_inst);
 
  229     Wavefront *wf = gpu_dyn_inst->wavefront();
 
  230     bool accessVrfWr = 
true;
 
  231     if (!gpu_dyn_inst->isScalar()) {
 
  233             ->canScheduleWriteOperands(wf, gpu_dyn_inst);
 
  236         ->canScheduleWriteOperands(wf, gpu_dyn_inst);
 
  237     bool accessRf = accessVrfWr && accessSrfWr;
 
  239         if (!gpu_dyn_inst->isScalar()) {
 
  273         assert(gpu_dyn_inst);
 
  274         Wavefront *wf = gpu_dyn_inst->wavefront();
 
  298     assert(gpu_dyn_inst);
 
  299     Wavefront *wf = gpu_dyn_inst->wavefront();
 
  300     bool accessVrf = 
true;
 
  301     if (!gpu_dyn_inst->isScalar()) {
 
  303             ->canScheduleReadOperands(wf, gpu_dyn_inst);
 
  306         ->canScheduleReadOperands(wf, gpu_dyn_inst);
 
  310     bool accessRf = accessVrf && accessSrf;
 
  312         DPRINTF(GPUSched, 
"schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n",
 
  314                 gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
 
  318         schList.at(exeType).push_back(std::make_pair(gpu_dyn_inst, 
RFBUSY));
 
  328         if (!gpu_dyn_inst->isScalar()) {
 
  330                 ->scheduleReadOperands(wf, gpu_dyn_inst);
 
  334         DPRINTF(GPUSched, 
"schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
 
  336                 gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
 
  353         DPRINTF(GPUSched, 
"schList[%d]: Could not add: " 
  354                 "SIMD[%d] WV[%d]: %d: %s\n",
 
  356                 gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
 
  368     assert(gpu_dyn_inst);
 
  369     auto schIter = 
schList.at(exeType).begin();
 
  370     while (schIter != 
schList.at(exeType).end()
 
  371            && schIter->first->wfDynId < gpu_dyn_inst->wfDynId) {
 
  374     schList.at(exeType).insert(schIter, std::make_pair(gpu_dyn_inst, 
RFREADY));
 
  418     assert(gpu_dyn_inst);
 
  419     Wavefront *wf = gpu_dyn_inst->wavefront();
 
  430     if (gpu_dyn_inst->isNop()) {
 
  437         } 
else if (!gpu_dyn_inst->isScalar() && !
vectorAluRdy) {
 
  441     } 
else if (gpu_dyn_inst->isEndOfKernel()) {
 
  447     } 
else if (gpu_dyn_inst->isBarrier() || gpu_dyn_inst->isBranch()
 
  448                || gpu_dyn_inst->isALU()) {
 
  453         } 
else if (!gpu_dyn_inst->isScalar() && !
vectorAluRdy) {
 
  457     } 
else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
 
  479     } 
else if (gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
 
  500     } 
else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isLocalMem()) {
 
  519     } 
else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isFlat()) {
 
  547         panic(
"%s: unknown instr checked for readiness",
 
  548               gpu_dyn_inst->disassemble());
 
  565         auto schIter = 
schList.at(
j).begin();
 
  566         bool dispatched = 
false;
 
  567         while (schIter != 
schList.at(
j).end()) {
 
  569             if (schIter->second == 
RFREADY) {
 
  572                 if (!dispatched && dispRdy) {
 
  581                     if (!
mp->isMemSync() && !
mp->isScalar() &&
 
  582                         (
mp->isGlobalMem() || 
mp->isFlat())) {
 
  587                     if (
mp->isMemRef()) {
 
  588                         mp->exec_mask = 
mp->wavefront()->execMask();
 
  592                     DPRINTF(GPUSched, 
"dispatchList[%d]: fillDispatchList: " 
  593                             "EMPTY->EXREADY\n", 
j);
 
  594                     schIter->first = 
nullptr;
 
  595                     schIter = 
schList.at(
j).erase(schIter);
 
  600                     schIter->first->wavefront()->stats.schStalls++;
 
  603                         schIter->first->wavefront()->stats.schResourceStalls++;
 
  641             == 
EXREADY && gpu_dyn_inst->isFlat()) {
 
  642             Wavefront *wf = gpu_dyn_inst->wavefront();
 
  653                     ->wavefront()->stats.schLdsArbStalls++;
 
  659             DPRINTF(GPUSched, 
"dispatchList[%d]: arbVrfLds: " 
  674             assert(gpu_dyn_inst);
 
  675             Wavefront *wf = gpu_dyn_inst->wavefront();
 
  682             if (!gpu_dyn_inst->isScalar()) {
 
  684                     ->operandReadComplete(wf, gpu_dyn_inst);
 
  687                 ->operandReadComplete(wf, gpu_dyn_inst);
 
  688             bool operandsReady = vrfRdy && srfRdy;
 
  690                 DPRINTF(GPUSched, 
"schList[%d]: WV[%d] operands ready for: " 
  691                         "%d: %s\n", 
j, wf->
wfDynId, gpu_dyn_inst->seqNum(),
 
  692                         gpu_dyn_inst->disassemble());
 
  693                 DPRINTF(GPUSched, 
"schList[%d]: WV[%d] RFBUSY->RFREADY\n",
 
  697                 DPRINTF(GPUSched, 
"schList[%d]: WV[%d] operands not ready " 
  699                         gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
 
  731             Wavefront *wf = gpu_dyn_inst->wavefront();
 
  738                 if (!gpu_dyn_inst->isScalar()) {
 
  740                         ->dispatchInstruction(gpu_dyn_inst);
 
  744                 std::stringstream 
ss;
 
  745                 for (
auto id : execUnitIds) {
 
  748                 DPRINTF(GPUSched, 
"dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s" 
  749                         "    Reserving ExeRes[ %s]\n",
 
  751                         gpu_dyn_inst->disassemble(), 
ss.str());
 
  753                 for (
auto execUnitId : execUnitIds) {
 
  754                     panic_if(exeUnitReservations.at(execUnitId),
 
  755                              "Execution unit %d is reserved!!!\n" 
  756                              "SIMD[%d] WV[%d]: %d: %s",
 
  758                              gpu_dyn_inst->seqNum(),
 
  759                              gpu_dyn_inst->disassemble());
 
  760                     exeUnitReservations.at(execUnitId) = 
true;
 
  767                 if (execUnitIds.size() > 1) {
 
  768                     [[maybe_unused]] 
int lm_exec_unit = wf->
localMem;
 
  772             } 
else if (
s == 
SKIP) {
 
  777                 [[maybe_unused]] 
int gm_exec_unit = wf->
globalMem;
 
  795     : statistics::
Group(parent, 
"ScheduleStage"),
 
  796       ADD_STAT(rdyListEmpty ,
"number of cycles no wave on ready list per " 
  797                "execution resource"),
 
  798       ADD_STAT(rdyListNotEmpty, 
"number of cycles one or more wave on ready " 
  799                "list per execution resource"),
 
  800       ADD_STAT(addToSchListStalls, 
"number of cycles a wave is not added to " 
  801                "schList per execution resource when ready list is not empty"),
 
  802       ADD_STAT(schListToDispList, 
"number of cycles a wave is added to " 
  803                "dispatchList per execution resource"),
 
  804       ADD_STAT(schListToDispListStalls, 
"number of cycles no wave is added to" 
  805                " dispatchList per execution resource"),
 
  806       ADD_STAT(rfAccessStalls, 
"number of stalls due to RF access denied"),
 
  807       ADD_STAT(ldsBusArbStalls, 
"number of stalls due to VRF->LDS bus " 
  809       ADD_STAT(opdNrdyStalls, 
"number of stalls in SCH due to operands not " 
  811       ADD_STAT(dispNrdyStalls, 
"number of stalls in SCH due to resource not " 
std::vector< WaitClass > scalarALUs
int numVectorGlobalMemUnits
WaitClass vectorGlobalMemUnit
LocalMemPipeline localMemoryPipe
WaitClass vrfToLocalMemPipeBus
int numVectorSharedMemUnits
WaitClass srfToScalarMemPipeBus
ScalarMemPipeline scalarMemoryPipe
GlobalMemPipeline globalMemoryPipe
void insertInPipeMap(Wavefront *w)
std::vector< ScalarRegisterFile * > srf
std::vector< WaitClass > vectorALUs
WaitClass vectorSharedMemUnit
std::vector< VectorRegisterFile * > vrf
WaitClass vrfToGlobalMemPipeBus
Cycles is a wrapper class for representing cycle counts, i.e.
bool outstandingReqsCheck(GPUDynInstPtr mp) const
void acqCoalescerToken(GPUDynInstPtr mp)
bool coalescerReady(GPUDynInstPtr mp) const
bool isGMReqFIFOWrRdy(uint32_t pendReqs=0) const
void reinsertToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
void doDispatchListTransition(int unitId, DISPATCH_STATUS s, const GPUDynInstPtr &gpu_dyn_inst)
@ SCH_RF_OPD_NRDY_CONDITIONS
gem5::ScheduleStage::ScheduleStageStats stats
void checkRfOperandReadComplete()
ScheduleToExecute & toExecute
ScheduleStage(const ComputeUnitParams &p, ComputeUnit &cu, ScoreboardCheckToSchedule &from_scoreboard_check, ScheduleToExecute &to_execute)
bool dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
@ SCH_RF_ACCESS_NRDY_CONDITIONS
std::vector< Scheduler > scheduler
std::vector< std::deque< std::pair< GPUDynInstPtr, SCH_STATUS > > > schList
void scheduleRfDestOperands()
ScoreboardCheckToSchedule & fromScoreboardCheck
void arbitrateVrfToLdsBus()
std::unordered_set< uint64_t > wavesInSch
ComputeUnit & computeUnit
bool schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
void deleteFromSch(Wavefront *w)
bool addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
@ SCH_FLAT_MEM_BUS_BUSY_NRDY
@ SCH_LOCAL_MEM_ISSUE_NRDY
@ SCH_VECTOR_MEM_COALESCER_NRDY
@ SCH_FLAT_MEM_COALESCER_NRDY
@ SCH_SCALAR_MEM_BUS_BUSY_NRDY
@ SCH_VECTOR_MEM_REQS_NRDY
@ SCH_SCALAR_MEM_ISSUE_NRDY
@ SCH_VECTOR_MEM_BUS_BUSY_NRDY
@ SCH_LOCAL_MEM_FIFO_NRDY
@ SCH_FLAT_MEM_ISSUE_NRDY
@ SCH_LOCAL_MEM_BUS_BUSY_NRDY
@ SCH_SCALAR_MEM_FIFO_NRDY
@ SCH_VECTOR_MEM_ISSUE_NRDY
Communication interface between Schedule and Execute stages.
DISPATCH_STATUS dispatchStatus(int func_unit_id) const
void reset() override
Reset the pipe stage interface.
GPUDynInstPtr & readyInst(int func_unit_id)
void dispatchTransition(const GPUDynInstPtr &gpu_dyn_inst, int func_unit_id, DISPATCH_STATUS disp_status)
Once the scheduler has chosen a winning WF for execution, and after the WF's oldest instruction's ope...
Communication interface between ScoreboardCheck and Schedule stages.
int numReadyLists() const
Returns the number of ready lists (i.e., the number of functional units).
std::vector< Wavefront * > & readyWFs(int func_unit_id)
TODO: These methods expose this class' implementation too much by returning references to its interna...
void updateReadyList(int func_unit_id)
Delete all wavefronts that have been marked as ready at scoreboard stage but are found to have empty ...
bool rdy(Cycles cycles=Cycles(0)) const
bool isOldestInstWaitcnt()
void setStatus(status_e newStatus)
bool isOldestInstBarrier()
std::deque< GPUDynInstPtr > instructionBuffer
std::vector< int > reserveResources()
void incLGKMInstsIssued()
void incVMemInstsIssued()
@ S_BARRIER
WF is stalled at a barrier.
@ S_WAITCNT
wavefront has unsatisfied wait counts
gem5::Wavefront::WavefrontStats stats
Derived & subname(off_type index, const std::string &name)
Set the subfield name for the given index, and marks this stat to print at the end of simulation.
Derived & init(size_type size)
Set this vector to have the given size.
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
#define panic(...)
This implements a cprintf based panic() function.
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
std::shared_ptr< GPUDynInst > GPUDynInstPtr
std::string csprintf(const char *format, const Args &...args)
statistics::Vector rdyListNotEmpty
statistics::Vector addToSchListStalls
statistics::Vector schListToDispList
ScheduleStageStats(statistics::Group *parent, int num_exec_units)
statistics::Vector rfAccessStalls
statistics::Vector schListToDispListStalls
statistics::Vector opdNrdyStalls
statistics::Vector rdyListEmpty
statistics::Vector dispNrdyStalls
statistics::Scalar ldsBusArbStalls
statistics::Scalar schCycles
statistics::Scalar schRfAccessStalls
statistics::Scalar schOpdNrdyStalls
statistics::Scalar schStalls
const std::string & name()