gem5 v24.0.0.0
schedule_stage.cc
1/*
2 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include "gpu-compute/schedule_stage.hh"
33
34#include <unordered_set>
35
36#include "base/compiler.hh"
37#include "debug/GPUSched.hh"
38#include "debug/GPUVRF.hh"
39#include "gpu-compute/compute_unit.hh"
40#include "gpu-compute/gpu_static_inst.hh"
41#include "gpu-compute/scalar_register_file.hh"
42#include "gpu-compute/vector_register_file.hh"
43#include "gpu-compute/wavefront.hh"
45
46namespace gem5
47{
48
49ScheduleStage::ScheduleStage(const ComputeUnitParams &p, ComputeUnit &cu,
50 ScoreboardCheckToSchedule &from_scoreboard_check,
51 ScheduleToExecute &to_execute)
52 : computeUnit(cu), fromScoreboardCheck(from_scoreboard_check),
53 toExecute(to_execute),
54 _name(cu.name() + ".ScheduleStage"),
55 vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
56 scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
57 locMemBusRdy(false), locMemIssueRdy(false), stats(&cu, cu.numExeUnits())
58{
59 for (int j = 0; j < cu.numExeUnits(); ++j) {
60 scheduler.emplace_back(p);
61 }
62 wavesInSch.clear();
63 schList.resize(cu.numExeUnits());
64 for (auto &dq : schList) {
65 dq.clear();
66 }
67}
68
69ScheduleStage::~ScheduleStage()
70{
71 scheduler.clear();
72 wavesInSch.clear();
73 schList.clear();
74}
75
76void
77ScheduleStage::init()
78{
79
80 fatal_if(scheduler.size() != fromScoreboardCheck.numReadyLists(),
81 "Scheduler should have same number of entries as CU's readyList");
82 for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
83 scheduler[j].bindList(&fromScoreboardCheck.readyWFs(j));
84 }
85
86 assert(computeUnit.numVectorGlobalMemUnits == 1);
87 assert(computeUnit.numVectorSharedMemUnits == 1);
88}
89
90void
91ScheduleStage::exec()
92{
93 toExecute.reset();
94
95 // Update readyList
96 for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
97 /**
98 * Remove any wave that already has an instruction present in SCH
99 * waiting for RF reads to complete. This prevents out of order
100 * execution within a wave.
101 */
102 fromScoreboardCheck.updateReadyList(j);
103 for (auto wIt = fromScoreboardCheck.readyWFs(j).begin();
104 wIt != fromScoreboardCheck.readyWFs(j).end();) {
105 if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) {
106 *wIt = nullptr;
107 wIt = fromScoreboardCheck.readyWFs(j).erase(wIt);
108 } else {
109 wIt++;
110 }
111 }
112 }
113
114 // Attempt to add another wave for each EXE type to schList queues
115 // VMEM resources are iterated first, effectively giving priority
116 // to VMEM over VALU for scheduling read of operands to the RFs.
117 // Scalar Memory is iterated after VMEM
118
119 // Iterate VMEM and SMEM
120 int firstMemUnit = computeUnit.firstMemUnit();
121 int lastMemUnit = computeUnit.lastMemUnit();
122 for (int j = firstMemUnit; j <= lastMemUnit; j++) {
123 int readyListSize = fromScoreboardCheck.readyWFs(j).size();
124 // If no wave is ready to be scheduled on the execution resource
125 // then skip scheduling for this execution resource
126 if (!readyListSize) {
127 stats.rdyListEmpty[j]++;
128 continue;
129 }
130 stats.rdyListNotEmpty[j]++;
131
132 // Pick a wave and attempt to add it to schList
133 Wavefront *wf = scheduler[j].chooseWave();
134 GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
135 assert(gpu_dyn_inst);
136 if (!addToSchList(j, gpu_dyn_inst)) {
137 // For waves not added to schList, increment count of cycles
138 // this wave spends in SCH stage.
139 wf->stats.schCycles++;
140 stats.addToSchListStalls[j]++;
141 } else {
142 if (gpu_dyn_inst->isScalar() || gpu_dyn_inst->isGroupSeg()) {
143 wf->incLGKMInstsIssued();
144 } else {
145 wf->incVMemInstsIssued();
146 if (gpu_dyn_inst->isFlat()) {
147 wf->incLGKMInstsIssued();
148 }
149 }
150 if (gpu_dyn_inst->isStore() && gpu_dyn_inst->isGlobalSeg()) {
151 wf->incExpInstsIssued();
152 }
153 }
154 }
155
156 // Iterate everything else
157 for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
158 // skip the VMEM resources
159 if (j >= firstMemUnit && j <= lastMemUnit) {
160 continue;
161 }
162 int readyListSize = fromScoreboardCheck.readyWFs(j).size();
163 // If no wave is ready to be scheduled on the execution resource
164 // then skip scheduling for this execution resource
165 if (!readyListSize) {
166 stats.rdyListEmpty[j]++;
167 continue;
168 }
169 stats.rdyListNotEmpty[j]++;
170
171 // Pick a wave and attempt to add it to schList
172 Wavefront *wf = scheduler[j].chooseWave();
173 GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
174 assert(gpu_dyn_inst);
175 if (!addToSchList(j, gpu_dyn_inst)) {
176 // For waves not added to schList, increment count of cycles
177 // this wave spends in SCH stage.
178 wf->stats.schCycles++;
179 stats.addToSchListStalls[j]++;
180 }
181 }
182
183 // At this point, the schList queue per EXE type may contain
184 // multiple waves, in order of age (oldest to youngest).
185 // Waves may be in RFBUSY, indicating they are waiting for registers
186 // to be read, or in RFREADY, indicating they are candidates for
187 // the dispatchList and execution
188
189 // Iterate schList queues and check if any of the waves have finished
190 // reading their operands, moving those waves to RFREADY status
191 checkRfOperandReadComplete();
192
193 // Fill the dispatch list with the oldest wave of each EXE type that
194 // is ready to execute
195 // Wave is picked if status in schList is RFREADY and it passes resource
196 // ready checks similar to those currently in SCB
197 fillDispatchList();
198
199 // Resource arbitration on waves in dispatchList
200 // Losing waves are re-inserted to the schList at a location determined
201 // by wave age
202
203 // Arbitrate access to the VRF->LDS bus
204 arbitrateVrfToLdsBus();
205
206 // Schedule write operations to the register files
207 scheduleRfDestOperands();
208
209 // Lastly, reserve resources for waves that are ready to execute.
210 reserveResources();
211}
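A minimal standalone sketch of the ready-list filtering done at the top of exec() above: waves whose wfDynId is already present in the SCH stage (the wavesInSch set) are dropped from the incoming ready list so a wave never has two instructions in flight in SCH. This is not gem5 code; the container names and wave IDs are illustrative.

#include <cstdint>
#include <iostream>
#include <unordered_set>
#include <vector>

int main()
{
    // Waves the scoreboard-check stage reports as ready this cycle.
    std::vector<uint64_t> readyList = {3, 7, 9, 12};
    // Waves that already have an instruction being processed in SCH.
    std::unordered_set<uint64_t> wavesInSch = {7, 12};

    // Same erase-while-iterating idiom as exec(): keep only waves that
    // are not already in SCH.
    for (auto it = readyList.begin(); it != readyList.end();) {
        if (wavesInSch.count(*it)) {
            it = readyList.erase(it);
        } else {
            ++it;
        }
    }

    for (uint64_t id : readyList) {
        std::cout << "wave " << id << " eligible for scheduling\n";
    }
    return 0;
}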
212
213void
214ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s,
215 const GPUDynInstPtr &gpu_dyn_inst)
216{
217 toExecute.dispatchTransition(gpu_dyn_inst, unitId, s);
218}
219
220void
221ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s)
222{
223 toExecute.dispatchTransition(unitId, s);
224}
225
226bool
227ScheduleStage::schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
228{
229 assert(gpu_dyn_inst);
230 Wavefront *wf = gpu_dyn_inst->wavefront();
231 bool accessVrfWr = true;
232 if (!gpu_dyn_inst->isScalar()) {
233 accessVrfWr = computeUnit.vrf[wf->simdId]
234 ->canScheduleWriteOperands(wf, gpu_dyn_inst);
235 }
236 bool accessSrfWr = computeUnit.srf[wf->simdId]
237 ->canScheduleWriteOperands(wf, gpu_dyn_inst);
238 bool accessRf = accessVrfWr && accessSrfWr;
239 if (accessRf) {
240 if (!gpu_dyn_inst->isScalar()) {
241 computeUnit.vrf[wf->simdId]->scheduleWriteOperands(wf,
242 gpu_dyn_inst);
243 }
244 computeUnit.srf[wf->simdId]->scheduleWriteOperands(wf, gpu_dyn_inst);
245 return true;
246 } else {
247 stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
248 if (!accessSrfWr) {
249 stats.rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++;
250 }
251 if (!accessVrfWr) {
252 stats.rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++;
253 }
254
255 // Increment stall counts for WF
256 wf->stats.schStalls++;
257 wf->stats.schRfAccessStalls++;
258 }
259 return false;
260}
261
262void
263ScheduleStage::scheduleRfDestOperands()
264{
265 for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
266 if (toExecute.dispatchStatus(j) == EMPTY ||
267 toExecute.dispatchStatus(j) == SKIP) {
268 continue;
269 }
270
271 // get the wave on dispatch list and attempt to allocate write
272 // resources in the RFs
273 const GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j);
274 assert(gpu_dyn_inst);
275 Wavefront *wf = gpu_dyn_inst->wavefront();
276 if (!schedRfWrites(j, gpu_dyn_inst)) {
277 reinsertToSchList(j, gpu_dyn_inst);
278 doDispatchListTransition(j, EMPTY);
279 // if this is a flat inst, also transition the LM pipe to empty
280 // Note: since FLAT/LM arbitration occurs before scheduling
281 // destination operands to the RFs, it is possible that a LM
282 // instruction lost arbitration, but would have been able to
283 // pass the RF destination operand check here, and execute
284 // instead of the FLAT.
285 if (wf->instructionBuffer.front()->isFlat()) {
286 assert(toExecute.dispatchStatus(wf->localMem)
287 == SKIP);
288 doDispatchListTransition(wf->localMem, EMPTY);
289 }
290 }
291 }
292}
293
294bool
295ScheduleStage::addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
296{
297 // Attempt to add the wave to the schList if the VRF can support the
298 // wave's next instruction
299 assert(gpu_dyn_inst);
300 Wavefront *wf = gpu_dyn_inst->wavefront();
301 bool accessVrf = true;
302 if (!gpu_dyn_inst->isScalar()) {
303 accessVrf = computeUnit.vrf[wf->simdId]
304 ->canScheduleReadOperands(wf, gpu_dyn_inst);
305 }
306 bool accessSrf = computeUnit.srf[wf->simdId]
307 ->canScheduleReadOperands(wf, gpu_dyn_inst);
308 // If RFs can support instruction, add to schList in RFBUSY state,
309 // place wave in wavesInSch and pipeMap, and schedule Rd/Wr operands
310 // to the VRF
311 bool accessRf = accessVrf && accessSrf;
312 if (accessRf) {
313 DPRINTF(GPUSched, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n",
314 exeType, wf->simdId, wf->wfDynId,
315 gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
316
317 computeUnit.insertInPipeMap(wf);
318 wavesInSch.emplace(wf->wfDynId);
319 schList.at(exeType).push_back(std::make_pair(gpu_dyn_inst, RFBUSY));
320 if (wf->isOldestInstBarrier() && wf->hasBarrier()) {
321 wf->setStatus(Wavefront::S_BARRIER);
322 }
323 if (wf->isOldestInstWaitcnt()) {
324 wf->setStatus(Wavefront::S_WAITCNT);
325 }
326 if (wf->isOldestInstSleep()) {
327 wf->setStatus(Wavefront::S_STALLED_SLEEP);
328 }
329 if (!gpu_dyn_inst->isScalar()) {
330 computeUnit.vrf[wf->simdId]
331 ->scheduleReadOperands(wf, gpu_dyn_inst);
332 }
333 computeUnit.srf[wf->simdId]->scheduleReadOperands(wf, gpu_dyn_inst);
334
335 DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
336 exeType, wf->simdId, wf->wfDynId,
337 gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
338 return true;
339 } else {
340 // Number of stall cycles due to RF access denied
341 stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
342 // Count number of denials due to each reason
343 // Multiple items may contribute to the denied request
344 if (!accessVrf) {
345 stats.rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++;
346 }
347 if (!accessSrf) {
348 stats.rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++;
349 }
350
351 // Increment stall counts for WF
352 wf->stats.schStalls++;
353 wf->stats.schRfAccessStalls++;
354 DPRINTF(GPUSched, "schList[%d]: Could not add: "
355 "SIMD[%d] WV[%d]: %d: %s\n",
356 exeType, wf->simdId, wf->wfDynId,
357 gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
358 }
359 return false;
360}
361
362void
363ScheduleStage::reinsertToSchList(int exeType,
364 const GPUDynInstPtr &gpu_dyn_inst)
365{
366 // Insert wave w into schList for specified exeType.
367 // Wave is inserted in age order, with oldest wave being at the
368 // front of the schList
369 assert(gpu_dyn_inst);
370 auto schIter = schList.at(exeType).begin();
371 while (schIter != schList.at(exeType).end()
372 && schIter->first->wfDynId < gpu_dyn_inst->wfDynId) {
373 schIter++;
374 }
375 schList.at(exeType).insert(schIter, std::make_pair(gpu_dyn_inst, RFREADY));
376}
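A small sketch of the age-ordered insertion policy used by reinsertToSchList() above, where a lower wfDynId means an older wave: a wave that loses arbitration re-enters the queue ahead of every younger wave. The Entry type and function names below are simplified stand-ins, not gem5's types.

#include <cstdint>
#include <deque>
#include <iostream>

struct Entry { uint64_t wfDynId; bool rfReady; };

// Insert so the deque stays sorted oldest (smallest wfDynId) first.
void reinsertByAge(std::deque<Entry> &schList, Entry e)
{
    auto it = schList.begin();
    while (it != schList.end() && it->wfDynId < e.wfDynId) {
        ++it;
    }
    schList.insert(it, e);
}

int main()
{
    std::deque<Entry> schList = {{2, false}, {8, true}};
    // Wave 5 lost resource arbitration and returns, already RFREADY.
    reinsertByAge(schList, {5, true});
    for (const auto &e : schList) {
        std::cout << "wfDynId " << e.wfDynId
                  << (e.rfReady ? " (RFREADY)" : " (RFBUSY)") << "\n";
    }
    return 0;   // prints 2, 5, 8: age order preserved
}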
377
378void
379ScheduleStage::checkMemResources()
380{
381 // Check for resource availability in the next cycle
382 scalarMemBusRdy = false;
383 scalarMemIssueRdy = false;
384 // check if there is a SRF->Global Memory bus available and
385 if (computeUnit.srfToScalarMemPipeBus.rdy(Cycles(1))) {
386 scalarMemBusRdy = true;
387 }
388 // check if we can issue a scalar memory instruction
389 if (computeUnit.scalarMemUnit.rdy(Cycles(1))) {
390 scalarMemIssueRdy = true;
391 }
392
393 glbMemBusRdy = false;
394 glbMemIssueRdy = false;
395 // check if there is a VRF->Global Memory bus available
396 if (computeUnit.vrfToGlobalMemPipeBus.rdy(Cycles(1))) {
397 glbMemBusRdy = true;
398 }
399 // check if we can issue a Global memory instruction
400 if (computeUnit.vectorGlobalMemUnit.rdy(Cycles(1))) {
401 glbMemIssueRdy = true;
402 }
403
404 locMemBusRdy = false;
405 locMemIssueRdy = false;
406 // check if there is a VRF->LDS bus available
407 if (computeUnit.vrfToLocalMemPipeBus.rdy(Cycles(1))) {
408 locMemBusRdy = true;
409 }
410 // check if we can issue a LDS instruction
411 if (computeUnit.vectorSharedMemUnit.rdy(Cycles(1))) {
412 locMemIssueRdy = true;
413 }
414}
415
416bool
417ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
418{
419 assert(gpu_dyn_inst);
420 Wavefront *wf = gpu_dyn_inst->wavefront();
421 vectorAluRdy = false;
422 scalarAluRdy = false;
423 // check for available vector/scalar ALUs in the next cycle
424 if (computeUnit.vectorALUs[wf->simdId].rdy(Cycles(1))) {
425 vectorAluRdy = true;
426 }
427 if (computeUnit.scalarALUs[wf->scalarAlu].rdy(Cycles(1))) {
428 scalarAluRdy = true;
429 }
430
431 if (gpu_dyn_inst->isNop()) {
432 // S_NOP requires SALU. V_NOP requires VALU.
433 // TODO: Scalar NOP does not require SALU in hardware,
434 // and is executed out of IB directly.
435 if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
436 stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
437 return false;
438 } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
439 stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
440 return false;
441 }
442 } else if (gpu_dyn_inst->isEndOfKernel()) {
443 // EndPgm instruction
444 if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
445 stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
446 return false;
447 }
448 } else if (gpu_dyn_inst->isBarrier() || gpu_dyn_inst->isBranch()
449 || gpu_dyn_inst->isALU()) {
450 // Barrier, Branch, or ALU instruction
451 if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
452 stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
453 return false;
454 } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
455 stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
456 return false;
457 }
458 } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
459 // Vector Global Memory instruction
460 bool rdy = true;
461 if (!glbMemIssueRdy) {
462 rdy = false;
463 stats.dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
464 }
465 if (!glbMemBusRdy) {
466 rdy = false;
467 stats.dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
468 }
469 if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
470 rdy = false;
471 stats.dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
472 }
473 if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
474 rdy = false;
475 stats.dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
476 }
477 if (!rdy) {
478 return false;
479 }
480 } else if (gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
481 // Scalar Global Memory instruction
482 bool rdy = true;
483 if (!scalarMemIssueRdy) {
484 rdy = false;
485 stats.dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++;
486 }
487 if (!scalarMemBusRdy) {
488 rdy = false;
489 stats.dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
490 }
491 if (!computeUnit.scalarMemoryPipe
492 .isGMReqFIFOWrRdy(wf->scalarRdGmReqsInPipe +
493 wf->scalarWrGmReqsInPipe))
494 {
495 rdy = false;
496 stats.dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
497 }
498 if (!rdy) {
499 return false;
500 }
501 } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isLocalMem()) {
502 // Vector Local Memory instruction
503 bool rdy = true;
504 if (!locMemIssueRdy) {
505 rdy = false;
506 stats.dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++;
507 }
508 if (!locMemBusRdy) {
509 rdy = false;
510 stats.dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
511 }
512 if (!computeUnit.localMemoryPipe.
513 isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
514 rdy = false;
515 stats.dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
516 }
517 if (!rdy) {
518 return false;
519 }
520 } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isFlat()) {
521 // Vector Flat memory instruction
522 bool rdy = true;
523 if (!glbMemIssueRdy || !locMemIssueRdy) {
524 rdy = false;
525 stats.dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++;
526 }
527 if (!glbMemBusRdy || !locMemBusRdy) {
528 rdy = false;
529 stats.dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
530 }
531 if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
532 rdy = false;
533 stats.dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
534 }
535 if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
536 rdy = false;
537 stats.dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
538 }
539 if (!computeUnit.localMemoryPipe.
540 isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
541 rdy = false;
542 stats.dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
543 }
544 if (!rdy) {
545 return false;
546 }
547 } else {
548 panic("%s: unknown instr checked for readiness",
549 gpu_dyn_inst->disassemble());
550 return false;
551 }
552 stats.dispNrdyStalls[SCH_RDY]++;
553 return true;
554}
555
556void
557ScheduleStage::fillDispatchList()
558{
559 // update execution resource status
560 checkMemResources();
561 // iterate execution resources
562 for (int j = 0; j < computeUnit.numExeUnits(); j++) {
563 assert(toExecute.dispatchStatus(j) == EMPTY);
564
565 // iterate waves in schList to pick one for dispatch
566 auto schIter = schList.at(j).begin();
567 bool dispatched = false;
568 while (schIter != schList.at(j).end()) {
569 // only attempt to dispatch if status is RFREADY
570 if (schIter->second == RFREADY) {
571 // Check if this wave is ready for dispatch
572 bool dispRdy = dispatchReady(schIter->first);
573 if (!dispatched && dispRdy) {
574 // No other wave has been dispatched for this exe
575 // resource, and this wave is ready. Place this wave
576 // on dispatchList and make it ready for execution
577 // next cycle.
578
579 // Acquire a coalescer token if it is a global mem
580 // operation.
581 GPUDynInstPtr mp = schIter->first;
582 if (!mp->isMemSync() && !mp->isScalar() &&
583 mp->needsToken()) {
584 computeUnit.globalMemoryPipe.acqCoalescerToken(mp);
585 }
586
587 // Set instruction's exec_mask if it's a mem operation
588 if (mp->isMemRef()) {
589 mp->exec_mask = mp->wavefront()->execMask();
590 }
591
592 doDispatchListTransition(j, EXREADY, schIter->first);
593 DPRINTF(GPUSched, "dispatchList[%d]: fillDispatchList: "
594 "EMPTY->EXREADY\n", j);
595 schIter->first = nullptr;
596 schIter = schList.at(j).erase(schIter);
597 dispatched = true;
598 } else {
599 // Either another wave has been dispatched, or this wave
600 // was not ready, so it is stalled this cycle
601 schIter->first->wavefront()->stats.schStalls++;
602 if (!dispRdy) {
603 // not ready for dispatch, increment stall stat
604 schIter->first->wavefront()->stats.schResourceStalls++;
605 }
606 // Examine next wave for this resource
607 schIter++;
608 }
609 } else {
610 // Wave not in RFREADY, try next wave
611 schIter++;
612 }
613 }
614
615 // Increment stall count if no wave sent to dispatchList for
616 // current execution resource
617 if (!dispatched) {
618 stats.schListToDispListStalls[j]++;
619 } else {
620 stats.schListToDispList[j]++;
621 }
622 }
623}
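The selection loop in fillDispatchList() above takes, for each execution unit, the oldest schList entry that is both RFREADY and passes dispatchReady(). A compact model of that pick follows; the Entry type is illustrative and the resource checks are reduced to a single boolean, so this is a sketch of the policy rather than gem5 code.

#include <cstdint>
#include <deque>
#include <iostream>
#include <optional>

enum class Status { RFBUSY, RFREADY };
struct Entry { uint64_t wfDynId; Status status; bool resourcesRdy; };

// Return the oldest wave that has finished reading operands and whose
// execution resources are free; erase it from the queue the way the real
// stage moves the chosen wave onto the dispatch list.
std::optional<Entry> pickForDispatch(std::deque<Entry> &schList)
{
    for (auto it = schList.begin(); it != schList.end(); ++it) {
        if (it->status == Status::RFREADY && it->resourcesRdy) {
            Entry chosen = *it;
            schList.erase(it);
            return chosen;
        }
    }
    return std::nullopt;
}

int main()
{
    std::deque<Entry> schList = {
        {4, Status::RFBUSY, true},    // oldest, but operands not read yet
        {6, Status::RFREADY, false},  // ready, but resources busy
        {9, Status::RFREADY, true}};  // youngest, fully ready

    if (auto e = pickForDispatch(schList)) {
        std::cout << "dispatching wave " << e->wfDynId << "\n";  // wave 9
    }
    return 0;
}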
624
625void
626ScheduleStage::arbitrateVrfToLdsBus()
627{
628 // Arbitrate the VRF->GM and VRF->LDS buses for Flat memory ops
629
630 // iterate the GM pipelines
631 for (int i = 0; i < computeUnit.numVectorGlobalMemUnits; i++) {
632 // get the GM pipe index in the dispatchList
633 int gm_exe_unit = computeUnit.firstMemUnit() + i;
634 // get the wave in the dispatchList
635 GPUDynInstPtr &gpu_dyn_inst
636 = toExecute.readyInst(gm_exe_unit);
637 // If the WF is valid, ready to execute, and the instruction
638 // is a flat access, arbitrate with the WF's assigned LM pipe
639 if (gpu_dyn_inst && toExecute.dispatchStatus(gm_exe_unit)
640 == EXREADY && gpu_dyn_inst->isFlat()) {
641 Wavefront *wf = gpu_dyn_inst->wavefront();
642 // If the associated LM pipe also has a wave selected, block
643 // that wave and let the Flat instruction issue. The WF in the
644 // LM pipe is added back to the schList for consideration next
645 // cycle.
646 if (toExecute.dispatchStatus(wf->localMem) == EXREADY) {
647 reinsertToSchList(wf->localMem, toExecute
648 .readyInst(wf->localMem));
649 // Increment stall stats for LDS-VRF arbitration
650 stats.ldsBusArbStalls++;
651 toExecute.readyInst(wf->localMem)
652 ->wavefront()->stats.schLdsArbStalls++;
653 }
654 // With arbitration of LM pipe complete, transition the
655 // LM pipe to SKIP state in the dispatchList to inform EX stage
656 // that a Flat instruction is executing next cycle
657 doDispatchListTransition(wf->localMem, SKIP, gpu_dyn_inst);
658 DPRINTF(GPUSched, "dispatchList[%d]: arbVrfLds: "
659 "EXREADY->SKIP\n", wf->localMem);
660 }
661 }
662}
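A toy model of the FLAT arbitration above: a FLAT instruction needs both the global-memory pipe and its wavefront's local-memory pipe, so any wave already selected on that LM pipe is pushed back to the schList and the LM slot is marked SKIP for the execute stage. The two-pipe layout and names below are illustrative only.

#include <iostream>
#include <optional>
#include <string>

enum class DispStatus { EMPTY, EXREADY, SKIP };

struct Slot { std::optional<std::string> inst; DispStatus status; };

int main()
{
    Slot gmPipe = {std::string("FLAT_LOAD w3"), DispStatus::EXREADY};
    Slot lmPipe = {std::string("DS_READ w5"), DispStatus::EXREADY};

    // The FLAT instruction on the GM pipe wins; whatever was on the LM
    // pipe loses arbitration and would be reinserted into schList.
    if (lmPipe.status == DispStatus::EXREADY) {
        std::cout << "reinserting " << *lmPipe.inst << " into schList\n";
        lmPipe.inst.reset();
    }
    // Mark the LM pipe SKIP so the execute stage knows the FLAT
    // instruction occupies it this cycle.
    lmPipe.status = DispStatus::SKIP;

    std::cout << "GM pipe executes " << *gmPipe.inst
              << ", LM pipe status = SKIP\n";
    return 0;
}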
663
664void
665ScheduleStage::checkRfOperandReadComplete()
666{
667 // Iterate the schList queues and check if operand reads
668 // have completed in the RFs. If so, mark the wave as ready for
669 // selection for dispatchList
670 for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
671 for (auto &p : schList.at(j)) {
672 const GPUDynInstPtr &gpu_dyn_inst = p.first;
673 assert(gpu_dyn_inst);
674 Wavefront *wf = gpu_dyn_inst->wavefront();
675
676 // Increment the number of cycles the wave spends in the
677 // SCH stage, since this loop visits every wave in SCH.
678 wf->stats.schCycles++;
679
680 bool vrfRdy = true;
681 if (!gpu_dyn_inst->isScalar()) {
682 vrfRdy = computeUnit.vrf[wf->simdId]
683 ->operandReadComplete(wf, gpu_dyn_inst);
684 }
685 bool srfRdy = computeUnit.srf[wf->simdId]
686 ->operandReadComplete(wf, gpu_dyn_inst);
687 bool operandsReady = vrfRdy && srfRdy;
688 if (operandsReady) {
689 DPRINTF(GPUSched, "schList[%d]: WV[%d] operands ready for: "
690 "%d: %s\n", j, wf->wfDynId, gpu_dyn_inst->seqNum(),
691 gpu_dyn_inst->disassemble());
692 DPRINTF(GPUSched, "schList[%d]: WV[%d] RFBUSY->RFREADY\n",
693 j, wf->wfDynId);
694 p.second = RFREADY;
695 } else {
696 DPRINTF(GPUSched, "schList[%d]: WV[%d] operands not ready "
697 "for: %d: %s\n", j, wf->wfDynId,
698 gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
699
700 // operands not ready yet, increment SCH stage stats
701 // aggregate to all wavefronts on the CU
702 p.second = RFBUSY;
703
704 // Increment stall stats
705 wf->stats.schStalls++;
706 wf->stats.schOpdNrdyStalls++;
707
708 stats.opdNrdyStalls[SCH_RF_OPD_NRDY]++;
709 if (!vrfRdy) {
710 stats.opdNrdyStalls[SCH_VRF_OPD_NRDY]++;
711 }
712 if (!srfRdy) {
713 stats.opdNrdyStalls[SCH_SRF_OPD_NRDY]++;
714 }
715 }
716 }
717 }
718}
719
720void
721ScheduleStage::reserveResources()
722{
723 std::vector<bool> exeUnitReservations;
724 exeUnitReservations.resize(computeUnit.numExeUnits(), false);
725
726 for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
727 GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j);
728 if (gpu_dyn_inst) {
729 DISPATCH_STATUS s = toExecute.dispatchStatus(j);
730 Wavefront *wf = gpu_dyn_inst->wavefront();
731 if (s == EMPTY) {
732 continue;
733 } else if (s == EXREADY) {
734 // Wave is ready for execution
735 std::vector<int> execUnitIds = wf->reserveResources();
736
737 if (!gpu_dyn_inst->isScalar()) {
738 computeUnit.vrf[wf->simdId]
739 ->dispatchInstruction(gpu_dyn_inst);
740 }
741 computeUnit.srf[wf->simdId]->dispatchInstruction(gpu_dyn_inst);
742
743 std::stringstream ss;
744 for (auto id : execUnitIds) {
745 ss << id << " ";
746 }
747 DPRINTF(GPUSched, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s"
748 " Reserving ExeRes[ %s]\n",
749 j, wf->simdId, wf->wfDynId, gpu_dyn_inst->seqNum(),
750 gpu_dyn_inst->disassemble(), ss.str());
751 // mark the resources as reserved for this cycle
752 for (auto execUnitId : execUnitIds) {
753 panic_if(exeUnitReservations.at(execUnitId),
754 "Execution unit %d is reserved!!!\n"
755 "SIMD[%d] WV[%d]: %d: %s",
756 execUnitId, wf->simdId, wf->wfDynId,
757 gpu_dyn_inst->seqNum(),
758 gpu_dyn_inst->disassemble());
759 exeUnitReservations.at(execUnitId) = true;
760 }
761
762 // If wavefront::reserveResources reserved multiple resources,
763 // then we're executing a flat memory instruction. This means
764 // that we've reserved a global and local memory unit. Thus,
765 // we need to mark the latter execution unit as not available.
766 if (execUnitIds.size() > 1) {
767 [[maybe_unused]] int lm_exec_unit = wf->localMem;
768 assert(toExecute.dispatchStatus(lm_exec_unit)
769 == SKIP);
770 }
771 } else if (s == SKIP) {
772 // Shared Memory pipe reserved for FLAT instruction.
773 // Verify the GM pipe for this wave is ready to execute
774 // and the wave in the GM pipe is the same as the wave
775 // in the LM pipe
776 [[maybe_unused]] int gm_exec_unit = wf->globalMem;
777 assert(wf->wfDynId == toExecute
778 .readyInst(gm_exec_unit)->wfDynId);
779 assert(toExecute.dispatchStatus(gm_exec_unit)
780 == EXREADY);
781 }
782 }
783 }
784}
785
786void
787ScheduleStage::deleteFromSch(Wavefront *w)
788{
789 wavesInSch.erase(w->wfDynId);
790}
791
792ScheduleStage::ScheduleStageStats::ScheduleStageStats(
793 statistics::Group *parent, int num_exec_units)
794 : statistics::Group(parent, "ScheduleStage"),
795 ADD_STAT(rdyListEmpty ,"number of cycles no wave on ready list per "
796 "execution resource"),
797 ADD_STAT(rdyListNotEmpty, "number of cycles one or more wave on ready "
798 "list per execution resource"),
799 ADD_STAT(addToSchListStalls, "number of cycles a wave is not added to "
800 "schList per execution resource when ready list is not empty"),
801 ADD_STAT(schListToDispList, "number of cycles a wave is added to "
802 "dispatchList per execution resource"),
803 ADD_STAT(schListToDispListStalls, "number of cycles no wave is added to"
804 " dispatchList per execution resource"),
805 ADD_STAT(rfAccessStalls, "number of stalls due to RF access denied"),
806 ADD_STAT(ldsBusArbStalls, "number of stalls due to VRF->LDS bus "
807 "conflicts"),
808 ADD_STAT(opdNrdyStalls, "number of stalls in SCH due to operands not "
809 "ready"),
810 ADD_STAT(dispNrdyStalls, "number of stalls in SCH due to resource not "
811 "ready")
812{
813 rdyListNotEmpty.init(num_exec_units);
814 rdyListEmpty.init(num_exec_units);
815 addToSchListStalls.init(num_exec_units);
816 schListToDispList.init(num_exec_units);
817 schListToDispListStalls.init(num_exec_units);
818 opdNrdyStalls.init(SCH_RF_OPD_NRDY_CONDITIONS);
819 dispNrdyStalls.init(SCH_NRDY_CONDITIONS);
820 rfAccessStalls.init(SCH_RF_ACCESS_NRDY_CONDITIONS);
821
822 opdNrdyStalls.subname(SCH_VRF_OPD_NRDY, csprintf("VRF"));
823 opdNrdyStalls.subname(SCH_SRF_OPD_NRDY, csprintf("SRF"));
824 opdNrdyStalls.subname(SCH_RF_OPD_NRDY, csprintf("RF"));
825
826 dispNrdyStalls.subname(SCH_SCALAR_ALU_NRDY, csprintf("ScalarAlu"));
827 dispNrdyStalls.subname(SCH_VECTOR_ALU_NRDY, csprintf("VectorAlu"));
828 dispNrdyStalls.subname(SCH_VECTOR_MEM_ISSUE_NRDY,
829 csprintf("VectorMemIssue"));
830 dispNrdyStalls.subname(SCH_VECTOR_MEM_BUS_BUSY_NRDY,
831 csprintf("VectorMemBusBusy"));
832 dispNrdyStalls.subname(SCH_VECTOR_MEM_COALESCER_NRDY,
833 csprintf("VectorMemCoalescer"));
835 dispNrdyStalls.subname(SCH_SCALAR_MEM_ISSUE_NRDY,
836 csprintf("ScalarMemIssue"));
837 dispNrdyStalls.subname(SCH_SCALAR_MEM_BUS_BUSY_NRDY,
838 csprintf("ScalarMemBusBusy"));
839 dispNrdyStalls.subname(SCH_SCALAR_MEM_FIFO_NRDY,
840 csprintf("ScalarMemFIFO"));
841 dispNrdyStalls.subname(SCH_LOCAL_MEM_ISSUE_NRDY,
842 csprintf("LocalMemIssue"));
843 dispNrdyStalls.subname(SCH_LOCAL_MEM_BUS_BUSY_NRDY,
844 csprintf("LocalMemBusBusy"));
845 dispNrdyStalls.subname(SCH_LOCAL_MEM_FIFO_NRDY,
846 csprintf("LocalMemFIFO"));
847 dispNrdyStalls.subname(SCH_FLAT_MEM_ISSUE_NRDY,
848 csprintf("FlatMemIssue"));
849 dispNrdyStalls.subname(SCH_FLAT_MEM_BUS_BUSY_NRDY,
850 csprintf("FlatMemBusBusy"));
851 dispNrdyStalls.subname(SCH_FLAT_MEM_COALESCER_NRDY,
852 csprintf("FlatMemCoalescer"));
853 dispNrdyStalls.subname(SCH_FLAT_MEM_FIFO_NRDY,
854 csprintf("FlatMemFIFO"));
856
857 rfAccessStalls.subname(SCH_VRF_RD_ACCESS_NRDY, csprintf("VrfRd"));
858 rfAccessStalls.subname(SCH_VRF_WR_ACCESS_NRDY, csprintf("VrfWr"));
859 rfAccessStalls.subname(SCH_SRF_RD_ACCESS_NRDY, csprintf("SrfRd"));
860 rfAccessStalls.subname(SCH_SRF_WR_ACCESS_NRDY, csprintf("SrfWr"));
861 rfAccessStalls.subname(SCH_RF_ACCESS_NRDY, csprintf("Any"));
862}
863
864} // namespace gem5