gem5 v23.0.0.0
schedule_stage.cc
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/schedule_stage.hh"

#include <unordered_set>

#include "base/compiler.hh"
#include "debug/GPUSched.hh"
#include "debug/GPUVRF.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"

namespace gem5
{

ScheduleStage::ScheduleStage(const ComputeUnitParams &p, ComputeUnit &cu,
                             ScoreboardCheckToSchedule &from_scoreboard_check,
                             ScheduleToExecute &to_execute)
    : computeUnit(cu), fromScoreboardCheck(from_scoreboard_check),
      toExecute(to_execute),
      _name(cu.name() + ".ScheduleStage"),
      vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
      scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
      locMemBusRdy(false), locMemIssueRdy(false), stats(&cu, cu.numExeUnits())
{
    for (int j = 0; j < cu.numExeUnits(); ++j) {
        scheduler.emplace_back(p);
    }
    wavesInSch.clear();
    schList.resize(cu.numExeUnits());
    for (auto &dq : schList) {
        dq.clear();
    }
}
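
// A rough picture of the structures built above (the unit index and wave
// IDs below are illustrative, not from the source): scheduler[j] picks
// among the wavefronts that ScoreboardCheck marked ready for execution
// unit j, while schList[j] is an age-ordered deque of
// (GPUDynInstPtr, SCH_STATUS) pairs for that unit, e.g.:
//
//     schList[j]: [(wfDynId 3, RFREADY), (wfDynId 8, RFBUSY)]
//                  ^ oldest wave, dispatched first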

ScheduleStage::~ScheduleStage()
{
    scheduler.clear();
    wavesInSch.clear();
    schList.clear();
}

void
ScheduleStage::init()
{

    fatal_if(scheduler.size() != fromScoreboardCheck.numReadyLists(),
             "Scheduler should have same number of entries as CU's readyList");
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        scheduler[j].bindList(&fromScoreboardCheck.readyWFs(j));
    }

    assert(computeUnit.numVectorGlobalMemUnits == 1);
    assert(computeUnit.numVectorSharedMemUnits == 1);
}

void
ScheduleStage::exec()
{
    toExecute.reset();

    // Update readyList
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        /**
         * Remove any wave that already has an instruction present in SCH
         * waiting for RF reads to complete. This prevents out of order
         * execution within a wave.
         */
        fromScoreboardCheck.updateReadyList(j);
        for (auto wIt = fromScoreboardCheck.readyWFs(j).begin();
             wIt != fromScoreboardCheck.readyWFs(j).end();) {
            if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) {
                *wIt = nullptr;
                wIt = fromScoreboardCheck.readyWFs(j).erase(wIt);
            } else {
                wIt++;
            }
        }
    }

    // Attempt to add another wave for each EXE type to schList queues
    // VMEM resources are iterated first, effectively giving priority
    // to VMEM over VALU for scheduling read of operands to the RFs.
    // Scalar memory is iterated after VMEM

    // Iterate VMEM and SMEM
    int firstMemUnit = computeUnit.firstMemUnit();
    int lastMemUnit = computeUnit.lastMemUnit();
    for (int j = firstMemUnit; j <= lastMemUnit; j++) {
        int readyListSize = fromScoreboardCheck.readyWFs(j).size();
        // If no wave is ready to be scheduled on the execution resource
        // then skip scheduling for this execution resource
        if (!readyListSize) {
            stats.rdyListEmpty[j]++;
            continue;
        }
        stats.rdyListNotEmpty[j]++;

        // Pick a wave and attempt to add it to schList
        Wavefront *wf = scheduler[j].chooseWave();
        GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
        assert(gpu_dyn_inst);
        if (!addToSchList(j, gpu_dyn_inst)) {
            // For waves not added to schList, increment count of cycles
            // this wave spends in SCH stage.
            wf->stats.schCycles++;
            stats.addToSchListStalls[j]++;
        } else {
            if (gpu_dyn_inst->isScalar() || gpu_dyn_inst->isGroupSeg()) {
                wf->incLGKMInstsIssued();
            } else {
                wf->incVMemInstsIssued();
                if (gpu_dyn_inst->isFlat()) {
                    wf->incLGKMInstsIssued();
                }
            }
            if (gpu_dyn_inst->isStore() && gpu_dyn_inst->isGlobalSeg()) {
                wf->incExpInstsIssued();
            }
        }
    }

    // Iterate everything else
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        // skip the VMEM resources
        if (j >= firstMemUnit && j <= lastMemUnit) {
            continue;
        }
        int readyListSize = fromScoreboardCheck.readyWFs(j).size();
        // If no wave is ready to be scheduled on the execution resource
        // then skip scheduling for this execution resource
        if (!readyListSize) {
            stats.rdyListEmpty[j]++;
            continue;
        }
        stats.rdyListNotEmpty[j]++;

        // Pick a wave and attempt to add it to schList
        Wavefront *wf = scheduler[j].chooseWave();
        GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
        assert(gpu_dyn_inst);
        if (!addToSchList(j, gpu_dyn_inst)) {
            // For waves not added to schList, increment count of cycles
            // this wave spends in SCH stage.
            wf->stats.schCycles++;
            stats.addToSchListStalls[j]++;
        }
    }

    // At this point, the schList queue per EXE type may contain
    // multiple waves, in order of age (oldest to youngest).
    // Waves may be in RFBUSY, indicating they are waiting for registers
    // to be read, or in RFREADY, indicating they are candidates for
    // the dispatchList and execution

    // Iterate schList queues and check if any of the waves have finished
    // reading their operands, moving those waves to RFREADY status
    checkRfOperandReadComplete();

    // Fill the dispatch list with the oldest wave of each EXE type that
    // is ready to execute
    // Wave is picked if status in schList is RFREADY and it passes resource
    // ready checks similar to those currently in SCB
    fillDispatchList();

    // Resource arbitration on waves in dispatchList
    // Losing waves are re-inserted to the schList at a location determined
    // by wave age

    // Arbitrate access to the VRF->LDS bus
    arbitrateVrfToLdsBus();

    // Schedule write operations to the register files
    scheduleRfDestOperands();

    // Lastly, reserve resources for waves that are ready to execute.
    reserveResources();
}
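
// Summarizing the per-cycle flow of exec() above, using only functions
// defined in this file: prune readyWFs of waves already in SCH ->
// addToSchList (RF reads scheduled, wave enters RFBUSY) ->
// checkRfOperandReadComplete (RFBUSY -> RFREADY) -> fillDispatchList
// (RFREADY -> EXREADY on the dispatch list) -> arbitrateVrfToLdsBus
// (FLAT vs. LM pipe) -> scheduleRfDestOperands (RF write ports) ->
// reserveResources.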

void
ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s,
                                        const GPUDynInstPtr &gpu_dyn_inst)
{
    toExecute.dispatchTransition(gpu_dyn_inst, unitId, s);
}

void
ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s)
{
    toExecute.dispatchTransition(unitId, s);
}

bool
ScheduleStage::schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
{
    assert(gpu_dyn_inst);
    Wavefront *wf = gpu_dyn_inst->wavefront();
    bool accessVrfWr = true;
    if (!gpu_dyn_inst->isScalar()) {
        accessVrfWr = computeUnit.vrf[wf->simdId]
            ->canScheduleWriteOperands(wf, gpu_dyn_inst);
    }
    bool accessSrfWr = computeUnit.srf[wf->simdId]
        ->canScheduleWriteOperands(wf, gpu_dyn_inst);
    bool accessRf = accessVrfWr && accessSrfWr;
    if (accessRf) {
        if (!gpu_dyn_inst->isScalar()) {
            computeUnit.vrf[wf->simdId]->scheduleWriteOperands(wf,
                gpu_dyn_inst);
        }
        computeUnit.srf[wf->simdId]->scheduleWriteOperands(wf, gpu_dyn_inst);
        return true;
    } else {
        stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
        if (!accessSrfWr) {
            stats.rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++;
        }
        if (!accessVrfWr) {
            stats.rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++;
        }

        // Increment stall counts for WF
        wf->stats.schStalls++;
        wf->stats.schRfAccessStalls++;
    }
    return false;
}

void
ScheduleStage::scheduleRfDestOperands()
{
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        if (toExecute.dispatchStatus(j) == EMPTY ||
            toExecute.dispatchStatus(j) == SKIP) {
            continue;
        }

        // get the wave on dispatch list and attempt to allocate write
        // resources in the RFs
        const GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j);
        assert(gpu_dyn_inst);
        Wavefront *wf = gpu_dyn_inst->wavefront();
        if (!schedRfWrites(j, gpu_dyn_inst)) {
            reinsertToSchList(j, gpu_dyn_inst);
            doDispatchListTransition(j, EMPTY);
            // if this is a flat inst, also transition the LM pipe to empty
            // Note: since FLAT/LM arbitration occurs before scheduling
            // destination operands to the RFs, it is possible that a LM
            // instruction lost arbitration, but would have been able to
            // pass the RF destination operand check here, and execute
            // instead of the FLAT.
            if (wf->instructionBuffer.front()->isFlat()) {
                assert(toExecute.dispatchStatus(wf->localMem)
                       == SKIP);
                doDispatchListTransition(wf->localMem, EMPTY);
            }
        }
    }
}

bool
ScheduleStage::addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
{
    // Attempt to add the wave to the schList if the VRF can support the
    // wave's next instruction
    assert(gpu_dyn_inst);
    Wavefront *wf = gpu_dyn_inst->wavefront();
    bool accessVrf = true;
    if (!gpu_dyn_inst->isScalar()) {
        accessVrf = computeUnit.vrf[wf->simdId]
            ->canScheduleReadOperands(wf, gpu_dyn_inst);
    }
    bool accessSrf = computeUnit.srf[wf->simdId]
        ->canScheduleReadOperands(wf, gpu_dyn_inst);
    // If RFs can support instruction, add to schList in RFBUSY state,
    // place wave in wavesInSch and pipeMap, and schedule Rd/Wr operands
    // to the VRF
    bool accessRf = accessVrf && accessSrf;
    if (accessRf) {
        DPRINTF(GPUSched, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n",
                exeType, wf->simdId, wf->wfDynId,
                gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());

        computeUnit.insertInPipeMap(wf);
        wavesInSch.emplace(wf->wfDynId);
        schList.at(exeType).push_back(std::make_pair(gpu_dyn_inst, RFBUSY));
        if (wf->isOldestInstBarrier() && wf->hasBarrier()) {
            wf->setStatus(Wavefront::S_BARRIER);
        }
        if (wf->isOldestInstWaitcnt()) {
            wf->setStatus(Wavefront::S_WAITCNT);
        }
        if (wf->isOldestInstSleep()) {
            wf->setStatus(Wavefront::S_STALLED_SLEEP);
        }
        if (!gpu_dyn_inst->isScalar()) {
            computeUnit.vrf[wf->simdId]
                ->scheduleReadOperands(wf, gpu_dyn_inst);
        }
        computeUnit.srf[wf->simdId]->scheduleReadOperands(wf, gpu_dyn_inst);

        DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
                exeType, wf->simdId, wf->wfDynId,
                gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
        return true;
    } else {
        // Number of stall cycles due to RF access denied
        stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
        // Count number of denials due to each reason
        // Multiple items may contribute to the denied request
        if (!accessVrf) {
            stats.rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++;
        }
        if (!accessSrf) {
            stats.rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++;
        }

        // Increment stall counts for WF
        wf->stats.schStalls++;
        wf->stats.schRfAccessStalls++;
        DPRINTF(GPUSched, "schList[%d]: Could not add: "
                "SIMD[%d] WV[%d]: %d: %s\n",
                exeType, wf->simdId, wf->wfDynId,
                gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
    }
    return false;
}

void
ScheduleStage::reinsertToSchList(int exeType,
                                 const GPUDynInstPtr &gpu_dyn_inst)
{
    // Insert wave w into schList for specified exeType.
    // Wave is inserted in age order, with the oldest wave being at the
    // front of the schList
    assert(gpu_dyn_inst);
    auto schIter = schList.at(exeType).begin();
    while (schIter != schList.at(exeType).end()
           && schIter->first->wfDynId < gpu_dyn_inst->wfDynId) {
        schIter++;
    }
    schList.at(exeType).insert(schIter, std::make_pair(gpu_dyn_inst, RFREADY));
}
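
// Worked example (wave IDs are illustrative): if schList[exeType] holds
// waves with wfDynId {4, 9, 12} and a wave with wfDynId 7 loses
// arbitration, the loop above stops at 9 and the deque becomes
// {4, 7, 9, 12}, so the re-inserted wave re-enters in age order with
// RFREADY status rather than at the back of the queue.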

void
ScheduleStage::checkMemResources()
{
    // Check for resource availability in the next cycle
    scalarMemBusRdy = false;
    scalarMemIssueRdy = false;
    // check if there is a SRF->Global Memory bus available
    if (computeUnit.srfToScalarMemPipeBus.rdy(Cycles(1))) {
        scalarMemBusRdy = true;
    }
    // check if we can issue a scalar memory instruction
    if (computeUnit.scalarMemUnit.rdy(Cycles(1))) {
        scalarMemIssueRdy = true;
    }

    glbMemBusRdy = false;
    glbMemIssueRdy = false;
    // check if there is a VRF->Global Memory bus available
    if (computeUnit.vrfToGlobalMemPipeBus.rdy(Cycles(1))) {
        glbMemBusRdy = true;
    }
    // check if we can issue a Global memory instruction
    if (computeUnit.vectorGlobalMemUnit.rdy(Cycles(1))) {
        glbMemIssueRdy = true;
    }

    locMemBusRdy = false;
    locMemIssueRdy = false;
    // check if there is a VRF->LDS bus available
    if (computeUnit.vrfToLocalMemPipeBus.rdy(Cycles(1))) {
        locMemBusRdy = true;
    }
    // check if we can issue a LDS instruction
    if (computeUnit.vectorSharedMemUnit.rdy(Cycles(1))) {
        locMemIssueRdy = true;
    }
}
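
// Note: each check above asks rdy(Cycles(1)), i.e. whether the resource
// will be free in the next cycle, when a wave dispatched this cycle
// would actually execute. The three bus/issue pairs mirror the three
// memory paths out of the CU: SRF->scalar memory, VRF->global memory,
// and VRF->LDS.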

bool
ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
{
    assert(gpu_dyn_inst);
    Wavefront *wf = gpu_dyn_inst->wavefront();
    vectorAluRdy = false;
    scalarAluRdy = false;
    // check for available vector/scalar ALUs in the next cycle
    if (computeUnit.vectorALUs[wf->simdId].rdy(Cycles(1))) {
        vectorAluRdy = true;
    }
    if (computeUnit.scalarALUs[wf->scalarAlu].rdy(Cycles(1))) {
        scalarAluRdy = true;
    }

    if (gpu_dyn_inst->isNop()) {
        // S_NOP requires SALU. V_NOP requires VALU.
        // TODO: Scalar NOP does not require SALU in hardware,
        // and is executed out of IB directly.
        if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
            stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
            stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
            return false;
        }
    } else if (gpu_dyn_inst->isEndOfKernel()) {
        // EndPgm instruction
        if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
            stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        }
    } else if (gpu_dyn_inst->isBarrier() || gpu_dyn_inst->isBranch()
               || gpu_dyn_inst->isALU()) {
        // Barrier, Branch, or ALU instruction
        if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
            stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
            stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
            return false;
        }
    } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
        // Vector Global Memory instruction
        bool rdy = true;
        if (!glbMemIssueRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
        }
        if (!glbMemBusRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
            rdy = false;
            stats.dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
            rdy = false;
            stats.dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
        // Scalar Global Memory instruction
        bool rdy = true;
        if (!scalarMemIssueRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++;
        }
        if (!scalarMemBusRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.scalarMemoryPipe
            .isGMReqFIFOWrRdy(wf->scalarRdGmReqsInPipe
                              + wf->scalarWrGmReqsInPipe))
        {
            rdy = false;
            stats.dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isLocalMem()) {
        // Vector Local Memory instruction
        bool rdy = true;
        if (!locMemIssueRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++;
        }
        if (!locMemBusRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.localMemoryPipe.
            isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
            rdy = false;
            stats.dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isFlat()) {
        // Vector Flat memory instruction
        bool rdy = true;
        if (!glbMemIssueRdy || !locMemIssueRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++;
        }
        if (!glbMemBusRdy || !locMemBusRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
            rdy = false;
            stats.dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
            rdy = false;
            stats.dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
        }
        if (!computeUnit.localMemoryPipe.
            isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
            rdy = false;
            stats.dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else {
        panic("%s: unknown instr checked for readiness",
              gpu_dyn_inst->disassemble());
        return false;
    }
    stats.dispNrdyStalls[SCH_RDY]++;
    return true;
}
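
// The readiness matrix implemented above, roughly:
//
//   instruction class     resources that must be ready next cycle
//   -------------------   ----------------------------------------
//   NOP/EndPgm/Barrier/
//   Branch/ALU            SALU (scalar) or VALU (vector)
//   vector global mem     GM issue, VRF->GM bus, coalescer, req slots
//   scalar global mem     SM issue, SRF->SM bus, GM request FIFO
//   vector local mem      LM issue, VRF->LDS bus, LM request FIFO
//   FLAT                  the global and local requirements combined
//
// Each miss increments the corresponding dispNrdyStalls bucket before
// the check fails.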

void
ScheduleStage::fillDispatchList()
{
    // update execution resource status
    checkMemResources();
    // iterate execution resources
    for (int j = 0; j < computeUnit.numExeUnits(); j++) {
        assert(toExecute.dispatchStatus(j) == EMPTY);

        // iterate waves in schList to pick one for dispatch
        auto schIter = schList.at(j).begin();
        bool dispatched = false;
        while (schIter != schList.at(j).end()) {
            // only attempt to dispatch if status is RFREADY
            if (schIter->second == RFREADY) {
                // Check if this wave is ready for dispatch
                bool dispRdy = dispatchReady(schIter->first);
                if (!dispatched && dispRdy) {
                    // No other wave has been dispatched for this exe
                    // resource, and this wave is ready. Place this wave
                    // on dispatchList and make it ready for execution
                    // next cycle.

                    // Acquire a coalescer token if it is a global mem
                    // operation.
                    GPUDynInstPtr mp = schIter->first;
                    if (!mp->isMemSync() && !mp->isScalar() &&
                        (mp->isGlobalMem() || mp->isFlat())) {
                        computeUnit.globalMemoryPipe.acqCoalescerToken(mp);
                    }

                    // Set instruction's exec_mask if it's a mem operation
                    if (mp->isMemRef()) {
                        mp->exec_mask = mp->wavefront()->execMask();
                    }

                    doDispatchListTransition(j, EXREADY, schIter->first);
                    DPRINTF(GPUSched, "dispatchList[%d]: fillDispatchList: "
                            "EMPTY->EXREADY\n", j);
                    schIter->first = nullptr;
                    schIter = schList.at(j).erase(schIter);
                    dispatched = true;
                } else {
                    // Either another wave has been dispatched, or this wave
                    // was not ready, so it is stalled this cycle
                    schIter->first->wavefront()->stats.schStalls++;
                    if (!dispRdy) {
                        // not ready for dispatch, increment stall stat
                        schIter->first->wavefront()->stats.schResourceStalls++;
                    }
                    // Examine next wave for this resource
                    schIter++;
                }
            } else {
                // Wave not in RFREADY, try next wave
                schIter++;
            }
        }

        // Increment stall count if no wave sent to dispatchList for
        // current execution resource
        if (!dispatched) {
            stats.schListToDispListStalls[j]++;
        } else {
            stats.schListToDispList[j]++;
        }
    }
}

void
ScheduleStage::arbitrateVrfToLdsBus()
{
    // Arbitrate the VRF->GM and VRF->LDS buses for Flat memory ops
    // Note: a Flat instruction in GFx8 reserves both VRF->Glb memory bus
    // and a VRF->LDS bus. In GFx9, this is not the case.

    // iterate the GM pipelines
    for (int i = 0; i < computeUnit.numVectorGlobalMemUnits; i++) {
        // get the GM pipe index in the dispatchList
        int gm_exe_unit = computeUnit.firstMemUnit() + i;
        // get the wave in the dispatchList
        GPUDynInstPtr &gpu_dyn_inst
            = toExecute.readyInst(gm_exe_unit);
        // If the WF is valid, ready to execute, and the instruction
        // is a flat access, arbitrate with the WF's assigned LM pipe
        if (gpu_dyn_inst && toExecute.dispatchStatus(gm_exe_unit)
            == EXREADY && gpu_dyn_inst->isFlat()) {
            Wavefront *wf = gpu_dyn_inst->wavefront();
            // If the associated LM pipe also has a wave selected, block
            // that wave and let the Flat instruction issue. The WF in the
            // LM pipe is added back to the schList for consideration next
            // cycle.
            if (toExecute.dispatchStatus(wf->localMem) == EXREADY) {
                reinsertToSchList(wf->localMem, toExecute
                                  .readyInst(wf->localMem));
                // Increment stall stats for LDS-VRF arbitration
                stats.ldsBusArbStalls++;
                toExecute.readyInst(wf->localMem)
                    ->wavefront()->stats.schLdsArbStalls++;
            }
            // With arbitration of LM pipe complete, transition the
            // LM pipe to SKIP state in the dispatchList to inform EX stage
            // that a Flat instruction is executing next cycle
            doDispatchListTransition(wf->localMem, SKIP, gpu_dyn_inst);
            DPRINTF(GPUSched, "dispatchList[%d]: arbVrfLds: "
                    "EXREADY->SKIP\n", wf->localMem);
        }
    }
}

void
ScheduleStage::checkRfOperandReadComplete()
{
    // Iterate the schList queues and check if operand reads
    // have completed in the RFs. If so, mark the wave as ready for
    // selection for dispatchList
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        for (auto &p : schList.at(j)) {
            const GPUDynInstPtr &gpu_dyn_inst = p.first;
            assert(gpu_dyn_inst);
            Wavefront *wf = gpu_dyn_inst->wavefront();

            // Increment the number of cycles the wave spends in the
            // SCH stage, since this loop visits every wave in SCH.
            wf->stats.schCycles++;

            bool vrfRdy = true;
            if (!gpu_dyn_inst->isScalar()) {
                vrfRdy = computeUnit.vrf[wf->simdId]
                    ->operandReadComplete(wf, gpu_dyn_inst);
            }
            bool srfRdy = computeUnit.srf[wf->simdId]
                ->operandReadComplete(wf, gpu_dyn_inst);
            bool operandsReady = vrfRdy && srfRdy;
            if (operandsReady) {
                DPRINTF(GPUSched, "schList[%d]: WV[%d] operands ready for: "
                        "%d: %s\n", j, wf->wfDynId, gpu_dyn_inst->seqNum(),
                        gpu_dyn_inst->disassemble());
                DPRINTF(GPUSched, "schList[%d]: WV[%d] RFBUSY->RFREADY\n",
                        j, wf->wfDynId);
                p.second = RFREADY;
            } else {
                DPRINTF(GPUSched, "schList[%d]: WV[%d] operands not ready "
                        "for: %d: %s\n", j, wf->wfDynId,
                        gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());

                // operands not ready yet, increment SCH stage stats
                // aggregate to all wavefronts on the CU
                p.second = RFBUSY;

                // Increment stall stats
                wf->stats.schStalls++;
                wf->stats.schOpdNrdyStalls++;

                stats.opdNrdyStalls[SCH_RF_OPD_NRDY]++;
                if (!vrfRdy) {
                    stats.opdNrdyStalls[SCH_VRF_OPD_NRDY]++;
                }
                if (!srfRdy) {
                    stats.opdNrdyStalls[SCH_SRF_OPD_NRDY]++;
                }
            }
        }
    }
}

void
ScheduleStage::reserveResources()
{
    std::vector<bool> exeUnitReservations;
    exeUnitReservations.resize(computeUnit.numExeUnits(), false);

    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j);
        if (gpu_dyn_inst) {
            DISPATCH_STATUS s = toExecute.dispatchStatus(j);
            Wavefront *wf = gpu_dyn_inst->wavefront();
            if (s == EMPTY) {
                continue;
            } else if (s == EXREADY) {
                // Wave is ready for execution
                std::vector<int> execUnitIds = wf->reserveResources();

                if (!gpu_dyn_inst->isScalar()) {
                    computeUnit.vrf[wf->simdId]
                        ->dispatchInstruction(gpu_dyn_inst);
                }
                computeUnit.srf[wf->simdId]->dispatchInstruction(gpu_dyn_inst);

                std::stringstream ss;
                for (auto id : execUnitIds) {
                    ss << id << " ";
                }
                DPRINTF(GPUSched, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s"
                        " Reserving ExeRes[ %s]\n",
                        j, wf->simdId, wf->wfDynId, gpu_dyn_inst->seqNum(),
                        gpu_dyn_inst->disassemble(), ss.str());
                // mark the resources as reserved for this cycle
                for (auto execUnitId : execUnitIds) {
                    panic_if(exeUnitReservations.at(execUnitId),
                             "Execution unit %d is reserved!!!\n"
                             "SIMD[%d] WV[%d]: %d: %s",
                             execUnitId, wf->simdId, wf->wfDynId,
                             gpu_dyn_inst->seqNum(),
                             gpu_dyn_inst->disassemble());
                    exeUnitReservations.at(execUnitId) = true;
                }

                // If wavefront::reserveResources reserved multiple resources,
                // then we're executing a flat memory instruction. This means
                // that we've reserved a global and local memory unit. Thus,
                // we need to mark the latter execution unit as not available.
                if (execUnitIds.size() > 1) {
                    [[maybe_unused]] int lm_exec_unit = wf->localMem;
                    assert(toExecute.dispatchStatus(lm_exec_unit)
                           == SKIP);
                }
            } else if (s == SKIP) {
                // Shared Memory pipe reserved for FLAT instruction.
                // Verify the GM pipe for this wave is ready to execute
                // and the wave in the GM pipe is the same as the wave
                // in the LM pipe
                [[maybe_unused]] int gm_exec_unit = wf->globalMem;
                assert(wf->wfDynId == toExecute
                       .readyInst(gm_exec_unit)->wfDynId);
                assert(toExecute.dispatchStatus(gm_exec_unit)
                       == EXREADY);
            }
        }
    }
}

void
ScheduleStage::deleteFromSch(Wavefront *w)
{
    wavesInSch.erase(w->wfDynId);
}
792
794 statistics::Group *parent, int num_exec_units)
795 : statistics::Group(parent, "ScheduleStage"),
796 ADD_STAT(rdyListEmpty ,"number of cycles no wave on ready list per "
797 "execution resource"),
798 ADD_STAT(rdyListNotEmpty, "number of cycles one or more wave on ready "
799 "list per execution resource"),
800 ADD_STAT(addToSchListStalls, "number of cycles a wave is not added to "
801 "schList per execution resource when ready list is not empty"),
802 ADD_STAT(schListToDispList, "number of cycles a wave is added to "
803 "dispatchList per execution resource"),
804 ADD_STAT(schListToDispListStalls, "number of cycles no wave is added to"
805 " dispatchList per execution resource"),
806 ADD_STAT(rfAccessStalls, "number of stalls due to RF access denied"),
807 ADD_STAT(ldsBusArbStalls, "number of stalls due to VRF->LDS bus "
808 "conflicts"),
809 ADD_STAT(opdNrdyStalls, "number of stalls in SCH due to operands not "
810 "ready"),
811 ADD_STAT(dispNrdyStalls, "number of stalls in SCH due to resource not "
812 "ready")
813{
814 rdyListNotEmpty.init(num_exec_units);
815 rdyListEmpty.init(num_exec_units);
816 addToSchListStalls.init(num_exec_units);
817 schListToDispList.init(num_exec_units);
818 schListToDispListStalls.init(num_exec_units);
822
826
830 csprintf("VectorMemIssue"));
832 csprintf("VectorMemBusBusy"));
834 csprintf("VectorMemCoalescer"));
837 csprintf("ScalarMemIssue"));
839 csprintf("ScalarMemBusBusy"));
841 csprintf("ScalarMemFIFO"));
843 csprintf("LocalMemIssue"));
845 csprintf("LocalMemBusBusy"));
847 csprintf("LocalMemFIFO"));
849 csprintf("FlatMemIssue"));
851 csprintf("FlatMemBusBusy"));
853 csprintf("FlatMemCoalescer"));
855 csprintf("FlatMemFIFO"));
857
863}
864
865} // namespace gem5