gem5 v22.1.0.0
schedule_stage.cc
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/schedule_stage.hh"

#include <unordered_set>

#include "base/compiler.hh"
#include "debug/GPUSched.hh"
#include "debug/GPUVRF.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"

namespace gem5
{

ScheduleStage::ScheduleStage(const ComputeUnitParams &p, ComputeUnit &cu,
                             ScoreboardCheckToSchedule &from_scoreboard_check,
                             ScheduleToExecute &to_execute)
    : computeUnit(cu), fromScoreboardCheck(from_scoreboard_check),
      toExecute(to_execute),
      _name(cu.name() + ".ScheduleStage"),
      vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
      scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
      locMemBusRdy(false), locMemIssueRdy(false), stats(&cu, cu.numExeUnits())
{
    for (int j = 0; j < cu.numExeUnits(); ++j) {
        scheduler.emplace_back(p);
    }
    wavesInSch.clear();
    schList.resize(cu.numExeUnits());
    for (auto &dq : schList) {
        dq.clear();
    }
}

ScheduleStage::~ScheduleStage()
{
    scheduler.clear();
    wavesInSch.clear();
    schList.clear();
}

void
ScheduleStage::init()
{

    fatal_if(scheduler.size() != fromScoreboardCheck.numReadyLists(),
             "Scheduler should have same number of entries as CU's readyList");
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        scheduler[j].bindList(&fromScoreboardCheck.readyWFs(j));
    }

    assert(computeUnit.numVectorGlobalMemUnits == 1);
    assert(computeUnit.numVectorSharedMemUnits == 1);
}

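// exec() is called every cycle and drives the SCH stage:
//  1) prune the ready lists delivered by scoreboard-check and try to add
//     one wave per execution resource to schList, scheduling its RF
//     operand reads;
//  2) mark waves whose operand reads have completed as RFREADY
//     (checkRfOperandReadComplete);
//  3) pick the oldest dispatch-ready wave per resource for the dispatch
//     list (fillDispatchList);
//  4) arbitrate the VRF->LDS bus between FLAT and LM waves
//     (arbitrateVrfToLdsBus);
//  5) schedule RF writes for destination operands (scheduleRfDestOperands);
//  6) reserve execution resources for the waves that execute next cycle
//     (reserveResources).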
void
ScheduleStage::exec()
{
    toExecute.reset();

    // Update readyList
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        // Remove waves that were marked ready at scoreboard-check but no
        // longer have an instruction available to schedule, then drop any
        // wave that already has an instruction in flight in SCH.
        fromScoreboardCheck.updateReadyList(j);
        for (auto wIt = fromScoreboardCheck.readyWFs(j).begin();
             wIt != fromScoreboardCheck.readyWFs(j).end();) {
            if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) {
                *wIt = nullptr;
                wIt = fromScoreboardCheck.readyWFs(j).erase(wIt);
            } else {
                wIt++;
            }
        }
    }

    // Attempt to add another wave for each EXE type to schList queues
    // VMEM resources are iterated first, effectively giving priority
    // to VMEM over VALU for scheduling read of operands to the RFs.
    // Scalar memory units are iterated after VMEM

    // Iterate VMEM and SMEM
    int firstMemUnit = computeUnit.firstMemUnit();
    int lastMemUnit = computeUnit.lastMemUnit();
    for (int j = firstMemUnit; j <= lastMemUnit; j++) {
        int readyListSize = fromScoreboardCheck.readyWFs(j).size();
        // If no wave is ready to be scheduled on the execution resource
        // then skip scheduling for this execution resource
        if (!readyListSize) {
            stats.rdyListEmpty[j]++;
            continue;
        }
        stats.rdyListNotEmpty[j]++;

        // Pick a wave and attempt to add it to schList
        Wavefront *wf = scheduler[j].chooseWave();
        GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
        assert(gpu_dyn_inst);
        if (!addToSchList(j, gpu_dyn_inst)) {
            // For waves not added to schList, increment count of cycles
            // this wave spends in SCH stage.
            wf->stats.schCycles++;
            stats.addToSchListStalls[j]++;
        } else {
            if (gpu_dyn_inst->isScalar() || gpu_dyn_inst->isGroupSeg()) {
                wf->incLGKMInstsIssued();
            } else {
                wf->incVMemInstsIssued();
                if (gpu_dyn_inst->isFlat()) {
                    wf->incLGKMInstsIssued();
                }
            }
            if (gpu_dyn_inst->isStore() && gpu_dyn_inst->isGlobalSeg()) {
                wf->incExpInstsIssued();
            }
        }
    }

154 
155  // Iterate everything else
156  for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
157  // skip the VMEM resources
158  if (j >= firstMemUnit && j <= lastMemUnit) {
159  continue;
160  }
161  int readyListSize = fromScoreboardCheck.readyWFs(j).size();
162  // If no wave is ready to be scheduled on the execution resource
163  // then skip scheduling for this execution resource
164  if (!readyListSize) {
165  stats.rdyListEmpty[j]++;
166  continue;
167  }
169 
170  // Pick a wave and attempt to add it to schList
171  Wavefront *wf = scheduler[j].chooseWave();
172  GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
173  assert(gpu_dyn_inst);
174  if (!addToSchList(j, gpu_dyn_inst)) {
175  // For waves not added to schList, increment count of cycles
176  // this wave spends in SCH stage.
177  wf->stats.schCycles++;
179  }
180  }
181 
182  // At this point, the schList queue per EXE type may contain
183  // multiple waves, in order of age (oldest to youngest).
184  // Wave may be in RFBUSY, indicating they are waiting for registers
185  // to be read, or in RFREADY, indicating they are candidates for
186  // the dispatchList and execution
187 
188  // Iterate schList queues and check if any of the waves have finished
189  // reading their operands, moving those waves to RFREADY status
191 
192  // Fill the dispatch list with the oldest wave of each EXE type that
193  // is ready to execute
194  // Wave is picked if status in schList is RFREADY and it passes resource
195  // ready checks similar to those currently in SCB
197 
198  // Resource arbitration on waves in dispatchList
199  // Losing waves are re-inserted to the schList at a location determined
200  // by wave age
201 
202  // Arbitrate access to the VRF->LDS bus
204 
205  // Schedule write operations to the register files
207 
208  // Lastly, reserve resources for waves that are ready to execute.
210 }
211 
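// Helpers that update the Schedule->Execute interface (toExecute). Each
// execution resource's dispatch slot is either EMPTY (nothing to issue),
// EXREADY (a wave is ready to execute next cycle), or SKIP (the slot is
// consumed by the LM half of a FLAT instruction issuing on the GM pipe).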
void
ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s,
                                        const GPUDynInstPtr &gpu_dyn_inst)
{
    toExecute.dispatchTransition(gpu_dyn_inst, unitId, s);
}

void
ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s)
{
    toExecute.dispatchTransition(unitId, s);
}

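// Attempt to reserve write ports in the VRF/SRF for the destination
// operands of a wave that is about to issue. Returns false (and records
// stall statistics) if either register file cannot accept the writes.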
bool
ScheduleStage::schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
{
    assert(gpu_dyn_inst);
    Wavefront *wf = gpu_dyn_inst->wavefront();
    bool accessVrfWr = true;
    if (!gpu_dyn_inst->isScalar()) {
        accessVrfWr = computeUnit.vrf[wf->simdId]
            ->canScheduleWriteOperands(wf, gpu_dyn_inst);
    }
    bool accessSrfWr = computeUnit.srf[wf->simdId]
        ->canScheduleWriteOperands(wf, gpu_dyn_inst);
    bool accessRf = accessVrfWr && accessSrfWr;
    if (accessRf) {
        if (!gpu_dyn_inst->isScalar()) {
            computeUnit.vrf[wf->simdId]->scheduleWriteOperands(wf,
                                                               gpu_dyn_inst);
        }
        computeUnit.srf[wf->simdId]->scheduleWriteOperands(wf, gpu_dyn_inst);
        return true;
    } else {
        stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
        if (!accessSrfWr) {
            stats.rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++;
        }
        if (!accessVrfWr) {
            stats.rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++;
        }

        // Increment stall counts for WF
        wf->stats.schStalls++;
        wf->stats.schRfAccessStalls++;
    }
    return false;
}

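// For every wave placed on the dispatch list this cycle, try to schedule
// its register-file destination writes. A wave that cannot reserve its
// write resources loses its dispatch slot and is re-inserted into schList
// (in age order) to try again in a later cycle.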
void
ScheduleStage::scheduleRfDestOperands()
{
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        if (toExecute.dispatchStatus(j) == EMPTY ||
            toExecute.dispatchStatus(j) == SKIP) {
            continue;
        }

        // get the wave on dispatch list and attempt to allocate write
        // resources in the RFs
        const GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j);
        assert(gpu_dyn_inst);
        Wavefront *wf = gpu_dyn_inst->wavefront();
        if (!schedRfWrites(j, gpu_dyn_inst)) {
            reinsertToSchList(j, gpu_dyn_inst);
            doDispatchListTransition(j, EMPTY);
            // if this is a flat inst, also transition the LM pipe to empty
            // Note: since FLAT/LM arbitration occurs before scheduling
            // destination operands to the RFs, it is possible that a LM
            // instruction lost arbitration, but would have been able to
            // pass the RF destination operand check here, and execute
            // instead of the FLAT.
            if (wf->instructionBuffer.front()->isFlat()) {
                assert(toExecute.dispatchStatus(wf->localMem)
                       == SKIP);
                doDispatchListTransition(wf->localMem, EMPTY);
            }
        }
    }
}

bool
ScheduleStage::addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
{
    // Attempt to add the wave to the schList if the VRF can support the
    // wave's next instruction
    assert(gpu_dyn_inst);
    Wavefront *wf = gpu_dyn_inst->wavefront();
    bool accessVrf = true;
    if (!gpu_dyn_inst->isScalar()) {
        accessVrf = computeUnit.vrf[wf->simdId]
            ->canScheduleReadOperands(wf, gpu_dyn_inst);
    }
    bool accessSrf = computeUnit.srf[wf->simdId]
        ->canScheduleReadOperands(wf, gpu_dyn_inst);
    // If RFs can support instruction, add to schList in RFBUSY state,
    // place wave in wavesInSch and pipeMap, and schedule Rd/Wr operands
    // to the VRF
    bool accessRf = accessVrf && accessSrf;
    if (accessRf) {
        DPRINTF(GPUSched, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n",
                exeType, wf->simdId, wf->wfDynId,
                gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());

        computeUnit.insertInPipeMap(wf);
        wavesInSch.emplace(wf->wfDynId);
        schList.at(exeType).push_back(std::make_pair(gpu_dyn_inst, RFBUSY));
        if (wf->isOldestInstBarrier() && wf->hasBarrier()) {
            wf->setStatus(Wavefront::S_BARRIER);
        }
        if (wf->isOldestInstWaitcnt()) {
            wf->setStatus(Wavefront::S_WAITCNT);
        }
        if (wf->isOldestInstSleep()) {
            wf->setStatus(Wavefront::S_STALLED_SLEEP);
        }
        if (!gpu_dyn_inst->isScalar()) {
            computeUnit.vrf[wf->simdId]
                ->scheduleReadOperands(wf, gpu_dyn_inst);
        }
        computeUnit.srf[wf->simdId]->scheduleReadOperands(wf, gpu_dyn_inst);

        DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
                exeType, wf->simdId, wf->wfDynId,
                gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
        return true;
    } else {
        // Number of stall cycles due to RF access denied
        stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
        // Count number of denials due to each reason
        // Multiple items may contribute to the denied request
        if (!accessVrf) {
            stats.rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++;
        }
        if (!accessSrf) {
            stats.rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++;
        }

        // Increment stall counts for WF
        wf->stats.schStalls++;
        wf->stats.schRfAccessStalls++;
        DPRINTF(GPUSched, "schList[%d]: Could not add: "
                "SIMD[%d] WV[%d]: %d: %s\n",
                exeType, wf->simdId, wf->wfDynId,
                gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
    }
    return false;
}

void
ScheduleStage::reinsertToSchList(int exeType,
                                 const GPUDynInstPtr &gpu_dyn_inst)
{
    // Insert wave w into schList for specified exeType.
    // Wave is inserted in age order, with oldest wave being at the
    // front of the schList
    assert(gpu_dyn_inst);
    auto schIter = schList.at(exeType).begin();
    while (schIter != schList.at(exeType).end()
           && schIter->first->wfDynId < gpu_dyn_inst->wfDynId) {
        schIter++;
    }
    schList.at(exeType).insert(schIter, std::make_pair(gpu_dyn_inst, RFREADY));
}

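// Sample the availability of the memory pipelines and their register-file
// buses one cycle ahead, so that dispatchReady() can tell whether a memory
// instruction could actually issue next cycle.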
void
ScheduleStage::checkMemResources()
{
    // Check for resource availability in the next cycle
    scalarMemBusRdy = false;
    scalarMemIssueRdy = false;
    // check if there is a SRF->Global Memory bus available
    if (computeUnit.srfToScalarMemPipeBus.rdy(Cycles(1))) {
        scalarMemBusRdy = true;
    }
    // check if we can issue a scalar memory instruction
    if (computeUnit.scalarMemUnit.rdy(Cycles(1))) {
        scalarMemIssueRdy = true;
    }

    glbMemBusRdy = false;
    glbMemIssueRdy = false;
    // check if there is a VRF->Global Memory bus available
    if (computeUnit.vrfToGlobalMemPipeBus.rdy(Cycles(1))) {
        glbMemBusRdy = true;
    }
    // check if we can issue a Global memory instruction
    if (computeUnit.vectorGlobalMemUnit.rdy(Cycles(1))) {
        glbMemIssueRdy = true;
    }

    locMemBusRdy = false;
    locMemIssueRdy = false;
    // check if there is a VRF->LDS bus available
    if (computeUnit.vrfToLocalMemPipeBus.rdy(Cycles(1))) {
        locMemBusRdy = true;
    }
    // check if we can issue a LDS instruction
    if (computeUnit.vectorSharedMemUnit.rdy(Cycles(1))) {
        locMemIssueRdy = true;
    }
}

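// Resource-readiness check for the oldest instruction of a candidate wave.
// The checks depend on the instruction class: ALU/branch/barrier/NOP
// instructions need a free scalar or vector ALU; memory instructions
// additionally need the issue unit, the RF->memory bus, and (for
// global/flat accesses) coalescer and request-FIFO capacity.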
bool
ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
{
    assert(gpu_dyn_inst);
    Wavefront *wf = gpu_dyn_inst->wavefront();
    vectorAluRdy = false;
    scalarAluRdy = false;
    // check for available vector/scalar ALUs in the next cycle
    if (computeUnit.vectorALUs[wf->simdId].rdy(Cycles(1))) {
        vectorAluRdy = true;
    }
    if (computeUnit.scalarALUs[wf->scalarAlu].rdy(Cycles(1))) {
        scalarAluRdy = true;
    }

    if (gpu_dyn_inst->isNop()) {
        // S_NOP requires SALU. V_NOP requires VALU.
        // TODO: Scalar NOP does not require SALU in hardware,
        // and is executed out of IB directly.
        if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
            stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
            stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
            return false;
        }
    } else if (gpu_dyn_inst->isEndOfKernel()) {
        // EndPgm instruction
        if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
            stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        }
    } else if (gpu_dyn_inst->isBarrier() || gpu_dyn_inst->isBranch()
               || gpu_dyn_inst->isALU()) {
        // Barrier, Branch, or ALU instruction
        if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
            stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
            stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
            return false;
        }
    } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
        // Vector Global Memory instruction
        bool rdy = true;
        if (!glbMemIssueRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
        }
        if (!glbMemBusRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
            rdy = false;
            stats.dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
            rdy = false;
            stats.dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
        // Scalar Global Memory instruction
        bool rdy = true;
        if (!scalarMemIssueRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++;
        }
        if (!scalarMemBusRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.scalarMemoryPipe
            .isGMReqFIFOWrRdy(wf->scalarRdGmReqsInPipe
                              + wf->scalarWrGmReqsInPipe))
        {
            rdy = false;
            stats.dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isLocalMem()) {
        // Vector Local Memory instruction
        bool rdy = true;
        if (!locMemIssueRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++;
        }
        if (!locMemBusRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.localMemoryPipe.
                isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
            rdy = false;
            stats.dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isFlat()) {
        // Vector Flat memory instruction
        bool rdy = true;
        if (!glbMemIssueRdy || !locMemIssueRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++;
        }
        if (!glbMemBusRdy || !locMemBusRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
            rdy = false;
            stats.dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
            rdy = false;
            stats.dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
        }
        if (!computeUnit.localMemoryPipe.
                isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
            rdy = false;
            stats.dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else {
        panic("%s: unknown instr checked for readiness",
              gpu_dyn_inst->disassemble());
        return false;
    }
    stats.dispNrdyStalls[SCH_RDY]++;
    return true;
}

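// For each execution resource, walk its schList queue (oldest first) and
// move the first RFREADY wave that passes dispatchReady() onto the
// dispatch list for execution next cycle. Global memory operations acquire
// a coalescer token at this point; waves that are passed over accumulate
// stall statistics.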
void
ScheduleStage::fillDispatchList()
{
    // update execution resource status
    checkMemResources();
    // iterate execution resources
    for (int j = 0; j < computeUnit.numExeUnits(); j++) {
        assert(toExecute.dispatchStatus(j) == EMPTY);

        // iterate waves in schList to pick one for dispatch
        auto schIter = schList.at(j).begin();
        bool dispatched = false;
        while (schIter != schList.at(j).end()) {
            // only attempt to dispatch if status is RFREADY
            if (schIter->second == RFREADY) {
                // Check if this wave is ready for dispatch
                bool dispRdy = dispatchReady(schIter->first);
                if (!dispatched && dispRdy) {
                    // No other wave has been dispatched for this exe
                    // resource, and this wave is ready. Place this wave
                    // on dispatchList and make it ready for execution
                    // next cycle.

                    // Acquire a coalescer token if it is a global mem
                    // operation.
                    GPUDynInstPtr mp = schIter->first;
                    if (!mp->isMemSync() && !mp->isScalar() &&
                        (mp->isGlobalMem() || mp->isFlat())) {
                        computeUnit.globalMemoryPipe.acqCoalescerToken(mp);
                    }

                    // Set instruction's exec_mask if it's a mem operation
                    if (mp->isMemRef()) {
                        mp->exec_mask = mp->wavefront()->execMask();
                    }

                    doDispatchListTransition(j, EXREADY, schIter->first);
                    DPRINTF(GPUSched, "dispatchList[%d]: fillDispatchList: "
                            "EMPTY->EXREADY\n", j);
                    schIter->first = nullptr;
                    schIter = schList.at(j).erase(schIter);
                    dispatched = true;
                } else {
                    // Either another wave has been dispatched, or this wave
                    // was not ready, so it is stalled this cycle
                    schIter->first->wavefront()->stats.schStalls++;
                    if (!dispRdy) {
                        // not ready for dispatch, increment stall stat
                        schIter->first->wavefront()->stats.schResourceStalls++;
                    }
                    // Examine next wave for this resource
                    schIter++;
                }
            } else {
                // Wave not in RFREADY, try next wave
                schIter++;
            }
        }

        // Increment stall count if no wave sent to dispatchList for
        // current execution resource
        if (!dispatched) {
            stats.schListToDispListStalls[j]++;
        } else {
            stats.schListToDispList[j]++;
        }
    }
}

void
ScheduleStage::arbitrateVrfToLdsBus()
{
    // Arbitrate the VRF->GM and VRF->LDS buses for Flat memory ops
    // Note: a Flat instruction in GFx8 reserves both VRF->Glb memory bus
    // and a VRF->LDS bus. In GFx9, this is not the case.

    // iterate the GM pipelines
    for (int i = 0; i < computeUnit.numVectorGlobalMemUnits; i++) {
        // get the GM pipe index in the dispatchList
        int gm_exe_unit = computeUnit.firstMemUnit() + i;
        // get the wave in the dispatchList
        GPUDynInstPtr &gpu_dyn_inst
            = toExecute.readyInst(gm_exe_unit);
        // If the WF is valid, ready to execute, and the instruction
        // is a flat access, arbitrate with the WF's assigned LM pipe
        if (gpu_dyn_inst && toExecute.dispatchStatus(gm_exe_unit)
            == EXREADY && gpu_dyn_inst->isFlat()) {
            Wavefront *wf = gpu_dyn_inst->wavefront();
            // If the associated LM pipe also has a wave selected, block
            // that wave and let the Flat instruction issue. The WF in the
            // LM pipe is added back to the schList for consideration next
            // cycle.
            if (toExecute.dispatchStatus(wf->localMem) == EXREADY) {
                reinsertToSchList(wf->localMem, toExecute
                                  .readyInst(wf->localMem));
                // Increment stall stats for LDS-VRF arbitration
                stats.ldsBusArbStalls++;
                toExecute.readyInst(wf->localMem)
                    ->wavefront()->stats.schLdsArbStalls++;
            }
            // With arbitration of LM pipe complete, transition the
            // LM pipe to SKIP state in the dispatchList to inform EX stage
            // that a Flat instruction is executing next cycle
            doDispatchListTransition(wf->localMem, SKIP, gpu_dyn_inst);
            DPRINTF(GPUSched, "dispatchList[%d]: arbVrfLds: "
                    "EXREADY->SKIP\n", wf->localMem);
        }
    }
}

void
ScheduleStage::checkRfOperandReadComplete()
{
    // Iterate the schList queues and check if operand reads
    // have completed in the RFs. If so, mark the wave as ready for
    // selection for dispatchList
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        for (auto &p : schList.at(j)) {
            const GPUDynInstPtr &gpu_dyn_inst = p.first;
            assert(gpu_dyn_inst);
            Wavefront *wf = gpu_dyn_inst->wavefront();

            // Increment the number of cycles the wave spends in the
            // SCH stage, since this loop visits every wave in SCH.
            wf->stats.schCycles++;

            bool vrfRdy = true;
            if (!gpu_dyn_inst->isScalar()) {
                vrfRdy = computeUnit.vrf[wf->simdId]
                    ->operandReadComplete(wf, gpu_dyn_inst);
            }
            bool srfRdy = computeUnit.srf[wf->simdId]
                ->operandReadComplete(wf, gpu_dyn_inst);
            bool operandsReady = vrfRdy && srfRdy;
            if (operandsReady) {
                DPRINTF(GPUSched, "schList[%d]: WV[%d] operands ready for: "
                        "%d: %s\n", j, wf->wfDynId, gpu_dyn_inst->seqNum(),
                        gpu_dyn_inst->disassemble());
                DPRINTF(GPUSched, "schList[%d]: WV[%d] RFBUSY->RFREADY\n",
                        j, wf->wfDynId);
                p.second = RFREADY;
            } else {
                DPRINTF(GPUSched, "schList[%d]: WV[%d] operands not ready "
                        "for: %d: %s\n", j, wf->wfDynId,
                        gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());

                // operands not ready yet, increment SCH stage stats
                // aggregate to all wavefronts on the CU
                p.second = RFBUSY;

                // Increment stall stats
                wf->stats.schStalls++;
                wf->stats.schOpdNrdyStalls++;

                stats.opdNrdyStalls[SCH_RF_OPD_NRDY]++;
                if (!vrfRdy) {
                    stats.opdNrdyStalls[SCH_VRF_OPD_NRDY]++;
                }
                if (!srfRdy) {
                    stats.opdNrdyStalls[SCH_SRF_OPD_NRDY]++;
                }
            }
        }
    }
}

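// Final step of the SCH stage: for every wave left on the dispatch list,
// reserve its execution resources for next cycle and dispatch the
// instruction to the register files. FLAT instructions reserve both a
// global and a local memory unit; the local unit must already be in the
// SKIP state from VRF->LDS arbitration.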
void
ScheduleStage::reserveResources()
{
    std::vector<bool> exeUnitReservations;
    exeUnitReservations.resize(computeUnit.numExeUnits(), false);

    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j);
        if (gpu_dyn_inst) {
            DISPATCH_STATUS s = toExecute.dispatchStatus(j);
            Wavefront *wf = gpu_dyn_inst->wavefront();
            if (s == EMPTY) {
                continue;
            } else if (s == EXREADY) {
                // Wave is ready for execution
                std::vector<int> execUnitIds = wf->reserveResources();

                if (!gpu_dyn_inst->isScalar()) {
                    computeUnit.vrf[wf->simdId]
                        ->dispatchInstruction(gpu_dyn_inst);
                }
                computeUnit.srf[wf->simdId]->dispatchInstruction(gpu_dyn_inst);

                std::stringstream ss;
                for (auto id : execUnitIds) {
                    ss << id << " ";
                }
                DPRINTF(GPUSched, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s"
                        " Reserving ExeRes[ %s]\n",
                        j, wf->simdId, wf->wfDynId, gpu_dyn_inst->seqNum(),
                        gpu_dyn_inst->disassemble(), ss.str());
                // mark the resources as reserved for this cycle
                for (auto execUnitId : execUnitIds) {
                    panic_if(exeUnitReservations.at(execUnitId),
                             "Execution unit %d is reserved!!!\n"
                             "SIMD[%d] WV[%d]: %d: %s",
                             execUnitId, wf->simdId, wf->wfDynId,
                             gpu_dyn_inst->seqNum(),
                             gpu_dyn_inst->disassemble());
                    exeUnitReservations.at(execUnitId) = true;
                }

                // If wavefront::reserveResources reserved multiple resources,
                // then we're executing a flat memory instruction. This means
                // that we've reserved a global and local memory unit. Thus,
                // we need to mark the latter execution unit as not available.
                if (execUnitIds.size() > 1) {
                    [[maybe_unused]] int lm_exec_unit = wf->localMem;
                    assert(toExecute.dispatchStatus(lm_exec_unit)
                           == SKIP);
                }
            } else if (s == SKIP) {
                // Shared Memory pipe reserved for FLAT instruction.
                // Verify the GM pipe for this wave is ready to execute
                // and the wave in the GM pipe is the same as the wave
                // in the LM pipe
                [[maybe_unused]] int gm_exec_unit = wf->globalMem;
                assert(wf->wfDynId == toExecute
                       .readyInst(gm_exec_unit)->wfDynId);
                assert(toExecute.dispatchStatus(gm_exec_unit)
                       == EXREADY);
            }
        }
    }
}

void
ScheduleStage::deleteFromSch(Wavefront *w)
{
    wavesInSch.erase(w->wfDynId);
}

ScheduleStage::ScheduleStageStats::ScheduleStageStats(
    statistics::Group *parent, int num_exec_units)
    : statistics::Group(parent, "ScheduleStage"),
      ADD_STAT(rdyListEmpty, "number of cycles no wave on ready list per "
               "execution resource"),
      ADD_STAT(rdyListNotEmpty, "number of cycles one or more wave on ready "
               "list per execution resource"),
      ADD_STAT(addToSchListStalls, "number of cycles a wave is not added to "
               "schList per execution resource when ready list is not empty"),
      ADD_STAT(schListToDispList, "number of cycles a wave is added to "
               "dispatchList per execution resource"),
      ADD_STAT(schListToDispListStalls, "number of cycles no wave is added to"
               " dispatchList per execution resource"),
      ADD_STAT(rfAccessStalls, "number of stalls due to RF access denied"),
      ADD_STAT(ldsBusArbStalls, "number of stalls due to VRF->LDS bus "
               "conflicts"),
      ADD_STAT(opdNrdyStalls, "number of stalls in SCH due to operands not "
               "ready"),
      ADD_STAT(dispNrdyStalls, "number of stalls in SCH due to resource not "
               "ready")
{
    rdyListNotEmpty.init(num_exec_units);
    rdyListEmpty.init(num_exec_units);
    addToSchListStalls.init(num_exec_units);
    schListToDispList.init(num_exec_units);
    schListToDispListStalls.init(num_exec_units);
    opdNrdyStalls.init(SCH_RF_OPD_NRDY_CONDITIONS);
    dispNrdyStalls.init(SCH_NRDY_CONDITIONS);
    rfAccessStalls.init(SCH_RF_ACCESS_NRDY_CONDITIONS);

    opdNrdyStalls.subname(SCH_VRF_OPD_NRDY, csprintf("VRF"));
    opdNrdyStalls.subname(SCH_SRF_OPD_NRDY, csprintf("SRF"));
    opdNrdyStalls.subname(SCH_RF_OPD_NRDY, csprintf("RF"));

    dispNrdyStalls.subname(SCH_SCALAR_ALU_NRDY, csprintf("ScalarAlu"));
    dispNrdyStalls.subname(SCH_VECTOR_ALU_NRDY, csprintf("VectorAlu"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_ISSUE_NRDY,
                           csprintf("VectorMemIssue"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_BUS_BUSY_NRDY,
                           csprintf("VectorMemBusBusy"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_COALESCER_NRDY,
                           csprintf("VectorMemCoalescer"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_REQS_NRDY,
                           csprintf("VectorMemReqs"));
    dispNrdyStalls.subname(SCH_SCALAR_MEM_ISSUE_NRDY,
                           csprintf("ScalarMemIssue"));
    dispNrdyStalls.subname(SCH_SCALAR_MEM_BUS_BUSY_NRDY,
                           csprintf("ScalarMemBusBusy"));
    dispNrdyStalls.subname(SCH_SCALAR_MEM_FIFO_NRDY,
                           csprintf("ScalarMemFIFO"));
    dispNrdyStalls.subname(SCH_LOCAL_MEM_ISSUE_NRDY,
                           csprintf("LocalMemIssue"));
    dispNrdyStalls.subname(SCH_LOCAL_MEM_BUS_BUSY_NRDY,
                           csprintf("LocalMemBusBusy"));
    dispNrdyStalls.subname(SCH_LOCAL_MEM_FIFO_NRDY,
                           csprintf("LocalMemFIFO"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_ISSUE_NRDY,
                           csprintf("FlatMemIssue"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_BUS_BUSY_NRDY,
                           csprintf("FlatMemBusBusy"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_COALESCER_NRDY,
                           csprintf("FlatMemCoalescer"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_FIFO_NRDY,
                           csprintf("FlatMemFIFO"));
    dispNrdyStalls.subname(SCH_RDY, csprintf("Ready"));

    rfAccessStalls.subname(SCH_VRF_RD_ACCESS_NRDY, csprintf("VrfRd"));
    rfAccessStalls.subname(SCH_VRF_WR_ACCESS_NRDY, csprintf("VrfWr"));
    rfAccessStalls.subname(SCH_SRF_RD_ACCESS_NRDY, csprintf("SrfRd"));
    rfAccessStalls.subname(SCH_SRF_WR_ACCESS_NRDY, csprintf("SrfWr"));
    rfAccessStalls.subname(SCH_RF_ACCESS_NRDY, csprintf("Any"));
}

} // namespace gem5