gem5  v20.1.0.0
schedule_stage.cc (src/gpu-compute/schedule_stage.cc)
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/schedule_stage.hh"

#include <unordered_set>

#include "debug/GPUSched.hh"
#include "debug/GPUVRF.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"

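// The schedule stage (SCH) sits between the scoreboard-check stage and the
// execute stage of the compute unit pipeline: fromScoreboardCheck supplies
// per-execution-resource lists of ready wavefronts, and toExecute carries
// the dispatch list consumed by the execute stage. Each cycle, SCH picks
// waves, schedules their register file operand reads, arbitrates shared
// buses, and reserves execution resources.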
ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu,
                             ScoreboardCheckToSchedule &from_scoreboard_check,
                             ScheduleToExecute &to_execute)
    : computeUnit(cu), fromScoreboardCheck(from_scoreboard_check),
      toExecute(to_execute),
      _name(cu.name() + ".ScheduleStage"),
      vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
      scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
      locMemBusRdy(false), locMemIssueRdy(false)
{
    for (int j = 0; j < cu.numExeUnits(); ++j) {
        scheduler.emplace_back(p);
    }
    wavesInSch.clear();
    schList.resize(cu.numExeUnits());
    for (auto &dq : schList) {
        dq.clear();
    }
}

ScheduleStage::~ScheduleStage()
{
    scheduler.clear();
    wavesInSch.clear();
    schList.clear();
}

void
ScheduleStage::init()
{

    fatal_if(scheduler.size() != fromScoreboardCheck.numReadyLists(),
             "Scheduler should have same number of entries as CU's readyList");
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        scheduler[j].bindList(&fromScoreboardCheck.readyWFs(j));
    }

    assert(computeUnit.numVectorGlobalMemUnits == 1);
    assert(computeUnit.numVectorSharedMemUnits == 1);
}

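// exec() runs one SCH cycle: prune the ready lists of waves already in
// SCH, pick one wave per execution resource and start its RF reads
// (addToSchList), promote waves whose reads completed
// (checkRfOperandReadComplete), fill the dispatch list (fillDispatchList),
// arbitrate the VRF->LDS bus for FLAT ops (arbitrateVrfToLdsBus), schedule
// RF writes (scheduleRfDestOperands), and finally reserve execution
// resources (reserveResources).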
void
ScheduleStage::exec()
{
    toExecute.reset();

    // Update readyList
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        /**
         * Remove any wave that already has an instruction present in SCH
         * waiting for RF reads to complete. This prevents out of order
         * execution within a wave.
         */
        fromScoreboardCheck.updateReadyList(j);
        for (auto wIt = fromScoreboardCheck.readyWFs(j).begin();
             wIt != fromScoreboardCheck.readyWFs(j).end();) {
            if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) {
                *wIt = nullptr;
                wIt = fromScoreboardCheck.readyWFs(j).erase(wIt);
            } else {
                wIt++;
            }
        }
    }

    // Attempt to add another wave for each EXE type to schList queues.
    // VMEM resources are iterated first, effectively giving priority
    // to VMEM over VALU for scheduling read of operands to the RFs.
    // Scalar memory is iterated after VMEM.

    // Iterate VMEM and SMEM
    int firstMemUnit = computeUnit.firstMemUnit();
    int lastMemUnit = computeUnit.lastMemUnit();
    for (int j = firstMemUnit; j <= lastMemUnit; j++) {
        int readyListSize = fromScoreboardCheck.readyWFs(j).size();
        // If no wave is ready to be scheduled on the execution resource
        // then skip scheduling for this execution resource
        if (!readyListSize) {
            rdyListEmpty[j]++;
            continue;
        }
        rdyListNotEmpty[j]++;

        // Pick a wave and attempt to add it to schList
        Wavefront *wf = scheduler[j].chooseWave();
        GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
        assert(gpu_dyn_inst);
        if (!addToSchList(j, gpu_dyn_inst)) {
            // For waves not added to schList, increment count of cycles
            // this wave spends in SCH stage.
            wf->schCycles++;
            addToSchListStalls[j]++;
        } else {
            if (gpu_dyn_inst->isScalar() || gpu_dyn_inst->isGroupSeg()) {
                wf->incLGKMInstsIssued();
            } else {
                wf->incVMemInstsIssued();
                if (gpu_dyn_inst->isFlat()) {
                    wf->incLGKMInstsIssued();
                }
            }
        }
    }

    // Iterate everything else
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        // skip the VMEM resources
        if (j >= firstMemUnit && j <= lastMemUnit) {
            continue;
        }
        int readyListSize = fromScoreboardCheck.readyWFs(j).size();
        // If no wave is ready to be scheduled on the execution resource
        // then skip scheduling for this execution resource
        if (!readyListSize) {
            rdyListEmpty[j]++;
            continue;
        }
        rdyListNotEmpty[j]++;

        // Pick a wave and attempt to add it to schList
        Wavefront *wf = scheduler[j].chooseWave();
        GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
        assert(gpu_dyn_inst);
        if (!addToSchList(j, gpu_dyn_inst)) {
            // For waves not added to schList, increment count of cycles
            // this wave spends in SCH stage.
            wf->schCycles++;
            addToSchListStalls[j]++;
        }
    }

    // At this point, the schList queue per EXE type may contain
    // multiple waves, in order of age (oldest to youngest).
    // Waves may be in RFBUSY, indicating they are waiting for registers
    // to be read, or in RFREADY, indicating they are candidates for
    // the dispatchList and execution.

    // Iterate schList queues and check if any of the waves have finished
    // reading their operands, moving those waves to RFREADY status
    checkRfOperandReadComplete();

    // Fill the dispatch list with the oldest wave of each EXE type that
    // is ready to execute.
    // Wave is picked if status in schList is RFREADY and it passes resource
    // ready checks similar to those currently in SCB
    fillDispatchList();

    // Resource arbitration on waves in dispatchList.
    // Losing waves are re-inserted to the schList at a location determined
    // by wave age

    // Arbitrate access to the VRF->LDS bus
    arbitrateVrfToLdsBus();

    // Schedule write operations to the register files
    scheduleRfDestOperands();

    // Lastly, reserve resources for waves that are ready to execute.
    reserveResources();
}

void
ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s,
                                        const GPUDynInstPtr &gpu_dyn_inst)
{
    toExecute.dispatchTransition(gpu_dyn_inst, unitId, s);
}

void
ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s)
{
    toExecute.dispatchTransition(unitId, s);
}

bool
ScheduleStage::schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
{
    assert(gpu_dyn_inst);
    Wavefront *wf = gpu_dyn_inst->wavefront();
    bool accessVrfWr = true;
    if (!gpu_dyn_inst->isScalar()) {
        accessVrfWr = computeUnit.vrf[wf->simdId]
            ->canScheduleWriteOperands(wf, gpu_dyn_inst);
    }
    bool accessSrfWr = computeUnit.srf[wf->simdId]
        ->canScheduleWriteOperands(wf, gpu_dyn_inst);
    bool accessRf = accessVrfWr && accessSrfWr;
    if (accessRf) {
        if (!gpu_dyn_inst->isScalar()) {
            computeUnit.vrf[wf->simdId]->scheduleWriteOperands(wf,
                gpu_dyn_inst);
        }
        computeUnit.srf[wf->simdId]->scheduleWriteOperands(wf, gpu_dyn_inst);
        return true;
    } else {
        rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
        if (!accessSrfWr) {
            rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++;
        }
        if (!accessVrfWr) {
            rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++;
        }

        // Increment stall counts for WF
        wf->schStalls++;
        wf->schRfAccessStalls++;
    }
    return false;
}

void
ScheduleStage::scheduleRfDestOperands()
{
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        if (toExecute.dispatchStatus(j) == EMPTY ||
            toExecute.dispatchStatus(j) == SKIP) {
            continue;
        }

        // get the wave on dispatch list and attempt to allocate write
        // resources in the RFs
        const GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j);
        assert(gpu_dyn_inst);
        Wavefront *wf = gpu_dyn_inst->wavefront();
        if (!schedRfWrites(j, gpu_dyn_inst)) {
            reinsertToSchList(j, gpu_dyn_inst);
            doDispatchListTransition(j, EMPTY);
            // if this is a flat inst, also transition the LM pipe to empty
            // Note: since FLAT/LM arbitration occurs before scheduling
            // destination operands to the RFs, it is possible that a LM
            // instruction lost arbitration, but would have been able to
            // pass the RF destination operand check here, and execute
            // instead of the FLAT.
            if (wf->instructionBuffer.front()->isFlat()) {
                assert(toExecute.dispatchStatus(wf->localMem)
                       == SKIP);
                doDispatchListTransition(wf->localMem, EMPTY);
            }
        }
    }
}

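// addToSchList is the admission check for SCH: a wave enters schList only
// if both the VRF and SRF can schedule reads of its source operands this
// cycle. On success the reads are started and the wave waits in RFBUSY;
// on failure the per-reason RF-access-denied stats are incremented.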
bool
ScheduleStage::addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
{
    // Attempt to add the wave to the schList if the VRF can support the
    // wave's next instruction
    assert(gpu_dyn_inst);
    Wavefront *wf = gpu_dyn_inst->wavefront();
    bool accessVrf = true;
    if (!gpu_dyn_inst->isScalar()) {
        accessVrf = computeUnit.vrf[wf->simdId]
            ->canScheduleReadOperands(wf, gpu_dyn_inst);
    }
    bool accessSrf = computeUnit.srf[wf->simdId]
        ->canScheduleReadOperands(wf, gpu_dyn_inst);
    // If RFs can support instruction, add to schList in RFBUSY state,
    // place wave in wavesInSch and pipeMap, and schedule Rd/Wr operands
    // to the VRF
    bool accessRf = accessVrf && accessSrf;
    if (accessRf) {
        DPRINTF(GPUSched, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n",
                exeType, wf->simdId, wf->wfDynId,
                gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());

        computeUnit.insertInPipeMap(wf);
        wavesInSch.emplace(wf->wfDynId);
        schList.at(exeType).push_back(std::make_pair(gpu_dyn_inst, RFBUSY));
        if (wf->isOldestInstWaitcnt()) {
            wf->setStatus(Wavefront::S_WAITCNT);
        }
        if (!gpu_dyn_inst->isScalar()) {
            computeUnit.vrf[wf->simdId]
                ->scheduleReadOperands(wf, gpu_dyn_inst);
        }
        computeUnit.srf[wf->simdId]->scheduleReadOperands(wf, gpu_dyn_inst);

        DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
                exeType, wf->simdId, wf->wfDynId,
                gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
        return true;
    } else {
        // Number of stall cycles due to RF access denied
        rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
        // Count number of denials due to each reason
        // Multiple items may contribute to the denied request
        if (!accessVrf) {
            rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++;
        }
        if (!accessSrf) {
            rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++;
        }

        // Increment stall counts for WF
        wf->schStalls++;
        wf->schRfAccessStalls++;
        DPRINTF(GPUSched, "schList[%d]: Could not add: "
                "SIMD[%d] WV[%d]: %d: %s\n",
                exeType, wf->simdId, wf->wfDynId,
                gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
    }
    return false;
}

void
ScheduleStage::reinsertToSchList(int exeType,
                                 const GPUDynInstPtr &gpu_dyn_inst)
{
    // Insert the wave into schList for the specified exeType.
    // Wave is inserted in age order, with oldest wave being at the
    // front of the schList
    assert(gpu_dyn_inst);
    auto schIter = schList.at(exeType).begin();
    while (schIter != schList.at(exeType).end()
           && schIter->first->wfDynId < gpu_dyn_inst->wfDynId) {
        schIter++;
    }
    schList.at(exeType).insert(schIter, std::make_pair(gpu_dyn_inst, RFREADY));
}

void
ScheduleStage::checkMemResources()
{
    // Check for resource availability in the next cycle
    scalarMemBusRdy = false;
    scalarMemIssueRdy = false;
    // check if there is an SRF->Global Memory bus available
    if (computeUnit.srfToScalarMemPipeBus.rdy(Cycles(1))) {
        scalarMemBusRdy = true;
    }
    // check if we can issue a scalar memory instruction
    if (computeUnit.scalarMemUnit.rdy(Cycles(1))) {
        scalarMemIssueRdy = true;
    }

    glbMemBusRdy = false;
    glbMemIssueRdy = false;
    // check if there is a VRF->Global Memory bus available
    if (computeUnit.vrfToGlobalMemPipeBus.rdy(Cycles(1))) {
        glbMemBusRdy = true;
    }
    // check if we can issue a Global memory instruction
    if (computeUnit.vectorGlobalMemUnit.rdy(Cycles(1))) {
        glbMemIssueRdy = true;
    }

    locMemBusRdy = false;
    locMemIssueRdy = false;
    // check if there is a VRF->LDS bus available
    if (computeUnit.vrfToLocalMemPipeBus.rdy(Cycles(1))) {
        locMemBusRdy = true;
    }
    // check if we can issue a LDS instruction
    if (computeUnit.vectorSharedMemUnit.rdy(Cycles(1))) {
        locMemIssueRdy = true;
    }
}

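// dispatchReady performs the per-instruction-class resource checks: an
// instruction may leave schList only if the ALU, issue, bus, coalescer,
// and request-FIFO resources its class needs will be free next cycle.
// Each failed check increments the matching dispNrdyStalls bucket.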
bool
ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
{
    assert(gpu_dyn_inst);
    Wavefront *wf = gpu_dyn_inst->wavefront();
    vectorAluRdy = false;
    scalarAluRdy = false;
    // check for available vector/scalar ALUs in the next cycle
    if (computeUnit.vectorALUs[wf->simdId].rdy(Cycles(1))) {
        vectorAluRdy = true;
    }
    if (computeUnit.scalarALUs[wf->scalarAlu].rdy(Cycles(1))) {
        scalarAluRdy = true;
    }

    if (gpu_dyn_inst->isNop()) {
        // S_NOP requires SALU. V_NOP requires VALU.
        // TODO: Scalar NOP does not require SALU in hardware,
        // and is executed out of IB directly.
        if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
            dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
            dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
            return false;
        }
    } else if (gpu_dyn_inst->isEndOfKernel()) {
        // EndPgm instruction
        if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
            dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        }
    } else if (gpu_dyn_inst->isBarrier() || gpu_dyn_inst->isBranch()
               || gpu_dyn_inst->isALU()) {
        // Barrier, Branch, or ALU instruction
        if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
            dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
            dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
            return false;
        }
    } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
        // Vector Global Memory instruction
        bool rdy = true;
        if (!glbMemIssueRdy) {
            rdy = false;
            dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
        }
        if (!glbMemBusRdy) {
            rdy = false;
            dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
            rdy = false;
            dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
            rdy = false;
            dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
        // Scalar Global Memory instruction
        bool rdy = true;
        if (!scalarMemIssueRdy) {
            rdy = false;
            dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++;
        }
        if (!scalarMemBusRdy) {
            rdy = false;
            dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.scalarMemoryPipe
            .isGMReqFIFOWrRdy(wf->scalarRdGmReqsInPipe
                              + wf->scalarWrGmReqsInPipe))
        {
            rdy = false;
            dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isLocalMem()) {
        // Vector Local Memory instruction
        bool rdy = true;
        if (!locMemIssueRdy) {
            rdy = false;
            dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++;
        }
        if (!locMemBusRdy) {
            rdy = false;
            dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.localMemoryPipe.
            isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
            rdy = false;
            dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isFlat()) {
        // Vector Flat memory instruction
        bool rdy = true;
        if (!glbMemIssueRdy || !locMemIssueRdy) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++;
        }
        if (!glbMemBusRdy || !locMemBusRdy) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
        }
        if (!computeUnit.localMemoryPipe.
            isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else {
        panic("%s: unknown instr checked for readiness",
              gpu_dyn_inst->disassemble());
        return false;
    }
    dispNrdyStalls[SCH_RDY]++;
    return true;
}

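// fillDispatchList scans each schList queue from oldest to youngest and
// promotes the first RFREADY wave that passes dispatchReady into the
// dispatch list (EMPTY->EXREADY); waves passed over accumulate stall
// statistics.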
void
ScheduleStage::fillDispatchList()
{
    // update execution resource status
    checkMemResources();
    // iterate execution resources
    for (int j = 0; j < computeUnit.numExeUnits(); j++) {
        assert(toExecute.dispatchStatus(j) == EMPTY);

        // iterate waves in schList to pick one for dispatch
        auto schIter = schList.at(j).begin();
        bool dispatched = false;
        while (schIter != schList.at(j).end()) {
            // only attempt to dispatch if status is RFREADY
            if (schIter->second == RFREADY) {
                // Check if this wave is ready for dispatch
                bool dispRdy = dispatchReady(schIter->first);
                if (!dispatched && dispRdy) {
                    // No other wave has been dispatched for this exe
                    // resource, and this wave is ready. Place this wave
                    // on dispatchList and make it ready for execution
                    // next cycle.

                    // Acquire a coalescer token if it is a global mem
                    // operation.
                    GPUDynInstPtr mp = schIter->first;
                    if (!mp->isMemSync() && !mp->isScalar() &&
                        (mp->isGlobalMem() || mp->isFlat())) {
                        computeUnit.globalMemoryPipe.acqCoalescerToken(mp);
                    }

                    doDispatchListTransition(j, EXREADY, schIter->first);
                    DPRINTF(GPUSched, "dispatchList[%d]: fillDispatchList: "
                            "EMPTY->EXREADY\n", j);
                    schIter->first = nullptr;
                    schIter = schList.at(j).erase(schIter);
                    dispatched = true;
                } else {
                    // Either another wave has been dispatched, or this wave
                    // was not ready, so it is stalled this cycle
                    schIter->first->wavefront()->schStalls++;
                    if (!dispRdy) {
                        // not ready for dispatch, increment stall stat
                        schIter->first->wavefront()->schResourceStalls++;
                    }
                    // Examine next wave for this resource
                    schIter++;
                }
            } else {
                // Wave not in RFREADY, try next wave
                schIter++;
            }
        }

        // Increment stall count if no wave sent to dispatchList for
        // current execution resource
        if (!dispatched) {
            schListToDispListStalls[j]++;
        } else {
            schListToDispList[j]++;
        }
    }
}

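// A FLAT instruction needs both the global memory pipe and its wave's
// LM (LDS) pipe. If the LM pipe also selected a wave this cycle, that
// wave loses arbitration and is reinserted into schList, and the LM pipe
// is marked SKIP so the execute stage knows a FLAT op owns it.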
void
ScheduleStage::arbitrateVrfToLdsBus()
{
    // Arbitrate the VRF->GM and VRF->LDS buses for Flat memory ops
    // Note: a Flat instruction in GFx8 reserves both the VRF->Glb memory
    // bus and a VRF->LDS bus. In GFx9, this is not the case.

    // iterate the GM pipelines
    for (int i = 0; i < computeUnit.numVectorGlobalMemUnits; i++) {
        // get the GM pipe index in the dispatchList
        int gm_exe_unit = computeUnit.firstMemUnit() + i;
        // get the wave in the dispatchList
        GPUDynInstPtr &gpu_dyn_inst
            = toExecute.readyInst(gm_exe_unit);
        // If the WF is valid, ready to execute, and the instruction
        // is a flat access, arbitrate with the WF's assigned LM pipe
        if (gpu_dyn_inst && toExecute.dispatchStatus(gm_exe_unit)
            == EXREADY && gpu_dyn_inst->isFlat()) {
            Wavefront *wf = gpu_dyn_inst->wavefront();
            // If the associated LM pipe also has a wave selected, block
            // that wave and let the Flat instruction issue. The WF in the
            // LM pipe is added back to the schList for consideration next
            // cycle.
            if (toExecute.dispatchStatus(wf->localMem) == EXREADY) {
                reinsertToSchList(wf->localMem, toExecute
                                  .readyInst(wf->localMem));
                // Increment stall stats for LDS-VRF arbitration
                ldsBusArbStalls++;
                toExecute.readyInst(wf->localMem)
                    ->wavefront()->schLdsArbStalls++;
            }
            // With arbitration of LM pipe complete, transition the
            // LM pipe to SKIP state in the dispatchList to inform EX stage
            // that a Flat instruction is executing next cycle
            doDispatchListTransition(wf->localMem, SKIP, gpu_dyn_inst);
            DPRINTF(GPUSched, "dispatchList[%d]: arbVrfLds: "
                    "EXREADY->SKIP\n", wf->localMem);
        }
    }
}

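// checkRfOperandReadComplete polls the VRF and SRF for completion of the
// operand reads started by addToSchList: waves whose reads are done move
// from RFBUSY to RFREADY; the rest remain RFBUSY and accrue stall counts.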
void
ScheduleStage::checkRfOperandReadComplete()
{
    // Iterate the schList queues and check if operand reads
    // have completed in the RFs. If so, mark the wave as ready for
    // selection for dispatchList
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        for (auto &p : schList.at(j)) {
            const GPUDynInstPtr &gpu_dyn_inst = p.first;
            assert(gpu_dyn_inst);
            Wavefront *wf = gpu_dyn_inst->wavefront();

            // Increment the number of cycles the wave spends in the
            // SCH stage, since this loop visits every wave in SCH.
            wf->schCycles++;

            bool vrfRdy = true;
            if (!gpu_dyn_inst->isScalar()) {
                vrfRdy = computeUnit.vrf[wf->simdId]
                    ->operandReadComplete(wf, gpu_dyn_inst);
            }
            bool srfRdy = computeUnit.srf[wf->simdId]
                ->operandReadComplete(wf, gpu_dyn_inst);
            bool operandsReady = vrfRdy && srfRdy;
            if (operandsReady) {
                DPRINTF(GPUSched, "schList[%d]: WV[%d] operands ready for: "
                        "%d: %s\n", j, wf->wfDynId, gpu_dyn_inst->seqNum(),
                        gpu_dyn_inst->disassemble());
                DPRINTF(GPUSched, "schList[%d]: WV[%d] RFBUSY->RFREADY\n",
                        j, wf->wfDynId);
                p.second = RFREADY;
            } else {
                DPRINTF(GPUSched, "schList[%d]: WV[%d] operands not ready "
                        "for: %d: %s\n", j, wf->wfDynId,
                        gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());

                // operands not ready yet, increment SCH stage stats
                // aggregate to all wavefronts on the CU
                p.second = RFBUSY;

                // Increment stall stats
                wf->schStalls++;
                wf->schOpdNrdyStalls++;

                opdNrdyStalls[SCH_RF_OPD_NRDY]++;
                if (!vrfRdy) {
                    opdNrdyStalls[SCH_VRF_OPD_NRDY]++;
                }
                if (!srfRdy) {
                    opdNrdyStalls[SCH_SRF_OPD_NRDY]++;
                }
            }
        }
    }
}

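// reserveResources is the final SCH step: for each wave leaving for the
// execute stage, dispatch its instruction to the register files and mark
// every execution unit it uses as reserved for the cycle, panicking if a
// unit would be reserved twice.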
void
ScheduleStage::reserveResources()
{
    std::vector<bool> exeUnitReservations;
    exeUnitReservations.resize(computeUnit.numExeUnits(), false);

    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j);
        if (gpu_dyn_inst) {
            DISPATCH_STATUS s = toExecute.dispatchStatus(j);
            Wavefront *wf = gpu_dyn_inst->wavefront();
            if (s == EMPTY) {
                continue;
            } else if (s == EXREADY) {
                // Wave is ready for execution
                std::vector<int> execUnitIds = wf->reserveResources();

                if (!gpu_dyn_inst->isScalar()) {
                    computeUnit.vrf[wf->simdId]
                        ->dispatchInstruction(gpu_dyn_inst);
                }
                computeUnit.srf[wf->simdId]->dispatchInstruction(gpu_dyn_inst);

                std::stringstream ss;
                for (auto id : execUnitIds) {
                    ss << id << " ";
                }
                DPRINTF(GPUSched, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s"
                        " Reserving ExeRes[ %s]\n",
                        j, wf->simdId, wf->wfDynId, gpu_dyn_inst->seqNum(),
                        gpu_dyn_inst->disassemble(), ss.str());
                // mark the resources as reserved for this cycle
                for (auto execUnitId : execUnitIds) {
                    panic_if(exeUnitReservations.at(execUnitId),
                             "Execution unit %d is reserved!!!\n"
                             "SIMD[%d] WV[%d]: %d: %s",
                             execUnitId, wf->simdId, wf->wfDynId,
                             gpu_dyn_inst->seqNum(),
                             gpu_dyn_inst->disassemble());
                    exeUnitReservations.at(execUnitId) = true;
                }

                // If wavefront::reserveResources reserved multiple resources,
                // then we're executing a flat memory instruction. This means
                // that we've reserved a global and local memory unit. Thus,
                // we need to mark the latter execution unit as not available.
                if (execUnitIds.size() > 1) {
                    int lm_exec_unit M5_VAR_USED = wf->localMem;
                    assert(toExecute.dispatchStatus(lm_exec_unit)
                           == SKIP);
                }
            } else if (s == SKIP) {
                // Shared Memory pipe reserved for FLAT instruction.
                // Verify the GM pipe for this wave is ready to execute
                // and the wave in the GM pipe is the same as the wave
                // in the LM pipe
                int gm_exec_unit M5_VAR_USED = wf->globalMem;
                assert(wf->wfDynId == toExecute
                       .readyInst(gm_exec_unit)->wfDynId);
                assert(toExecute.dispatchStatus(gm_exec_unit)
                       == EXREADY);
            }
        }
    }
}

void
ScheduleStage::deleteFromSch(Wavefront *w)
{
    wavesInSch.erase(w->wfDynId);
}

void
ScheduleStage::regStats()
{
    rdyListNotEmpty
        .init(computeUnit.numExeUnits())
        .name(name() + ".rdy_list_not_empty")
        .desc("number of cycles one or more wave on ready list per "
              "execution resource")
        ;

    rdyListEmpty
        .init(computeUnit.numExeUnits())
        .name(name() + ".rdy_list_empty")
        .desc("number of cycles no wave on ready list per "
              "execution resource")
        ;

    addToSchListStalls
        .init(computeUnit.numExeUnits())
        .name(name() + ".sch_list_add_stalls")
        .desc("number of cycles a wave is not added to schList per "
              "execution resource when ready list is not empty")
        ;

    schListToDispList
        .init(computeUnit.numExeUnits())
        .name(name() + ".sch_list_to_disp_list")
        .desc("number of cycles a wave is added to dispatchList per "
              "execution resource")
        ;

    schListToDispListStalls
        .init(computeUnit.numExeUnits())
        .name(name() + ".sch_list_to_disp_list_stalls")
        .desc("number of cycles no wave is added to dispatchList per "
              "execution resource")
        ;

    // Operand Readiness Stall Cycles
    opdNrdyStalls
        .init(SCH_RF_OPD_NRDY_CONDITIONS)
        .name(name() + ".opd_nrdy_stalls")
        .desc("number of stalls in SCH due to operands not ready")
        ;
    opdNrdyStalls.subname(SCH_VRF_OPD_NRDY, csprintf("VRF"));
    opdNrdyStalls.subname(SCH_SRF_OPD_NRDY, csprintf("SRF"));
    opdNrdyStalls.subname(SCH_RF_OPD_NRDY, csprintf("RF"));

    // dispatchReady Stall Cycles
    dispNrdyStalls
        .init(SCH_NRDY_CONDITIONS)
        .name(name() + ".disp_nrdy_stalls")
        .desc("number of stalls in SCH due to resource not ready")
        ;
    dispNrdyStalls.subname(SCH_SCALAR_ALU_NRDY, csprintf("ScalarAlu"));
    dispNrdyStalls.subname(SCH_VECTOR_ALU_NRDY, csprintf("VectorAlu"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_ISSUE_NRDY,
                           csprintf("VectorMemIssue"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_BUS_BUSY_NRDY,
                           csprintf("VectorMemBusBusy"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_COALESCER_NRDY,
                           csprintf("VectorMemCoalescer"));
    dispNrdyStalls.subname(SCH_CEDE_SIMD_NRDY, csprintf("CedeSimd"));
    dispNrdyStalls.subname(SCH_SCALAR_MEM_ISSUE_NRDY,
                           csprintf("ScalarMemIssue"));
    dispNrdyStalls.subname(SCH_SCALAR_MEM_BUS_BUSY_NRDY,
                           csprintf("ScalarMemBusBusy"));
    dispNrdyStalls.subname(SCH_SCALAR_MEM_FIFO_NRDY,
                           csprintf("ScalarMemFIFO"));
    dispNrdyStalls.subname(SCH_LOCAL_MEM_ISSUE_NRDY,
                           csprintf("LocalMemIssue"));
    dispNrdyStalls.subname(SCH_LOCAL_MEM_BUS_BUSY_NRDY,
                           csprintf("LocalMemBusBusy"));
    dispNrdyStalls.subname(SCH_LOCAL_MEM_FIFO_NRDY,
                           csprintf("LocalMemFIFO"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_ISSUE_NRDY,
                           csprintf("FlatMemIssue"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_BUS_BUSY_NRDY,
                           csprintf("FlatMemBusBusy"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_COALESCER_NRDY,
                           csprintf("FlatMemCoalescer"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_FIFO_NRDY,
                           csprintf("FlatMemFIFO"));
    dispNrdyStalls.subname(SCH_RDY, csprintf("Ready"));

    // RF Access Stall Cycles
    rfAccessStalls
        .init(SCH_RF_ACCESS_NRDY_CONDITIONS)
        .name(name() + ".rf_access_stalls")
        .desc("number of stalls due to RF access denied")
        ;
    rfAccessStalls.subname(SCH_VRF_RD_ACCESS_NRDY, csprintf("VrfRd"));
    rfAccessStalls.subname(SCH_VRF_WR_ACCESS_NRDY, csprintf("VrfWr"));
    rfAccessStalls.subname(SCH_SRF_RD_ACCESS_NRDY, csprintf("SrfRd"));
    rfAccessStalls.subname(SCH_SRF_WR_ACCESS_NRDY, csprintf("SrfWr"));
    rfAccessStalls.subname(SCH_RF_ACCESS_NRDY, csprintf("Any"));

    // Stall cycles due to wave losing LDS bus arbitration
    ldsBusArbStalls
        .name(name() + ".lds_bus_arb_stalls")
        .desc("number of stalls due to VRF->LDS bus conflicts")
        ;
}