gem5 [DEVELOP-FOR-25.0]
schedule_stage.cc
1/*
2 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include "gpu-compute/schedule_stage.hh"
33
34#include <unordered_set>
35
36#include "base/compiler.hh"
37#include "debug/GPUSched.hh"
38#include "debug/GPUVRF.hh"
39#include "gpu-compute/compute_unit.hh"
40#include "gpu-compute/gpu_static_inst.hh"
41#include "gpu-compute/scalar_register_file.hh"
42#include "gpu-compute/vector_register_file.hh"
43#include "gpu-compute/wavefront.hh"
44
45
46namespace gem5
47{
48
49ScheduleStage::ScheduleStage(const ComputeUnitParams &p, ComputeUnit &cu,
50 ScoreboardCheckToSchedule &from_scoreboard_check,
51 ScheduleToExecute &to_execute)
52 : computeUnit(cu), fromScoreboardCheck(from_scoreboard_check),
53 toExecute(to_execute),
54 _name(cu.name() + ".ScheduleStage"),
55 vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
56 scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
57 locMemBusRdy(false), locMemIssueRdy(false), stats(&cu, cu.numExeUnits())
58{
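// One Scheduler instance is created per execution resource, so each
// unit applies its wave-selection policy independently. init() later
// binds each scheduler to that unit's readyList from the scoreboard
// check stage.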
59 for (int j = 0; j < cu.numExeUnits(); ++j) {
60 scheduler.emplace_back(p);
61 }
62 wavesInSch.clear();
63 schList.resize(cu.numExeUnits());
64 for (auto &dq : schList) {
65 dq.clear();
66 }
67}
68
69ScheduleStage::~ScheduleStage()
70{
71 scheduler.clear();
72 wavesInSch.clear();
73 schList.clear();
74}
75
76void
77ScheduleStage::init()
78{
79
80 fatal_if(scheduler.size() != fromScoreboardCheck.numReadyLists(),
81 "Scheduler should have same number of entries as CU's readyList");
82 for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
83 scheduler[j].bindList(&fromScoreboardCheck.readyWFs(j));
84 }
85
86 assert(computeUnit.numVectorGlobalMemUnits == 1);
87 assert(computeUnit.numVectorSharedMemUnits == 1);
88}
89
90void
91ScheduleStage::exec()
92{
93 toExecute.reset();
94
95 // Update readyList
96 for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
97 /**
98 * Remove any wave that already has an instruction present in SCH
99 * waiting for RF reads to complete. This prevents out of order
100 * execution within a wave.
101 */
102 fromScoreboardCheck.updateReadyList(j);
103 for (auto wIt = fromScoreboardCheck.readyWFs(j).begin();
104 wIt != fromScoreboardCheck.readyWFs(j).end();) {
105 if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) {
106 *wIt = nullptr;
107 wIt = fromScoreboardCheck.readyWFs(j).erase(wIt);
108 } else {
109 wIt++;
110 }
111 }
112 }
113
114 // Attempt to add another wave for each EXE type to schList queues
115 // VMEM resources are iterated first, effectively giving priority
116 // to VMEM over VALU for scheduling read of operands to the RFs.
117 // Scalar memory is iterated after VMEM
118
119 // Iterate VMEM and SMEM
120 int firstMemUnit = computeUnit.firstMemUnit();
121 int lastMemUnit = computeUnit.lastMemUnit();
122 for (int j = firstMemUnit; j <= lastMemUnit; j++) {
123 int readyListSize = fromScoreboardCheck.readyWFs(j).size();
124 // If no wave is ready to be scheduled on the execution resource
125 // then skip scheduling for this execution resource
126 if (!readyListSize) {
127 stats.rdyListEmpty[j]++;
128 continue;
129 }
130 stats.rdyListNotEmpty[j]++;
131
132 // Pick a wave and attempt to add it to schList
133 Wavefront *wf = scheduler[j].chooseWave();
134 GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
135 assert(gpu_dyn_inst);
136 if (!addToSchList(j, gpu_dyn_inst)) {
137 // For waves not added to schList, increment count of cycles
138 // this wave spends in SCH stage.
139 wf->stats.schCycles++;
140 stats.addToSchListStalls[j]++;
141 } else {
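// Bookkeeping for the wavefront's outstanding-instruction counters
// (used for s_waitcnt accounting): scalar and LDS (group-segment)
// accesses count toward LGKM, other vector memory accesses toward
// VMEM, and global stores additionally toward EXP. FLAT accesses bump
// both VMEM and LGKM since they may resolve to global memory or LDS.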
142 if (gpu_dyn_inst->isScalar() || gpu_dyn_inst->isGroupSeg()) {
143 wf->incLGKMInstsIssued();
144 wf->trackLGKMInst(gpu_dyn_inst);
145 } else {
146 wf->incVMemInstsIssued();
147 wf->trackVMemInst(gpu_dyn_inst);
148 if (gpu_dyn_inst->isFlat()) {
149 wf->incLGKMInstsIssued();
150 wf->trackLGKMInst(gpu_dyn_inst);
151 }
152 }
153 if (gpu_dyn_inst->isStore() && gpu_dyn_inst->isGlobalSeg()) {
154 wf->incExpInstsIssued();
155 wf->trackExpInst(gpu_dyn_inst);
156 }
157 }
158 }
159
160 // Iterate everything else
161 for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
162 // skip the VMEM resources
163 if (j >= firstMemUnit && j <= lastMemUnit) {
164 continue;
165 }
166 int readyListSize = fromScoreboardCheck.readyWFs(j).size();
167 // If no wave is ready to be scheduled on the execution resource
168 // then skip scheduling for this execution resource
169 if (!readyListSize) {
170 stats.rdyListEmpty[j]++;
171 continue;
172 }
173 stats.rdyListNotEmpty[j]++;
174
175 // Pick a wave and attempt to add it to schList
176 Wavefront *wf = scheduler[j].chooseWave();
177 GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
178 assert(gpu_dyn_inst);
179 if (!addToSchList(j, gpu_dyn_inst)) {
180 // For waves not added to schList, increment count of cycles
181 // this wave spends in SCH stage.
182 wf->stats.schCycles++;
183 stats.addToSchListStalls[j]++;
184 }
185 }
186
187 // At this point, the schList queue per EXE type may contain
188 // multiple waves, in order of age (oldest to youngest).
189 // Waves may be in RFBUSY, indicating they are waiting for registers
190 // to be read, or in RFREADY, indicating they are candidates for
191 // the dispatchList and execution
192
193 // Iterate schList queues and check if any of the waves have finished
194 // reading their operands, moving those waves to RFREADY status
195 checkRfOperandReadComplete();
196
197 // Fill the dispatch list with the oldest wave of each EXE type that
198 // is ready to execute
199 // Wave is picked if status in schList is RFREADY and it passes resource
200 // ready checks similar to those currently in SCB
201 fillDispatchList();
202
203 // Resource arbitration on waves in dispatchList
204 // Losing waves are re-inserted to the schList at a location determined
205 // by wave age
206
207 // Arbitrate access to the VRF->LDS bus
208 arbitrateVrfToLdsBus();
209
210 // Schedule write operations to the register files
211 scheduleRfDestOperands();
212
213 // Lastly, reserve resources for waves that are ready to execute.
214 reserveResources();
215}
216
217void
218ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s,
219 const GPUDynInstPtr &gpu_dyn_inst)
220{
221 toExecute.dispatchTransition(gpu_dyn_inst, unitId, s);
222}
223
224void
225ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s)
226{
227 toExecute.dispatchTransition(unitId, s);
228}
229
230bool
231ScheduleStage::schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
232{
233 assert(gpu_dyn_inst);
234 Wavefront *wf = gpu_dyn_inst->wavefront();
235 bool accessVrfWr = true;
236 if (!gpu_dyn_inst->isScalar()) {
237 accessVrfWr = computeUnit.vrf[wf->simdId]
238 ->canScheduleWriteOperands(wf, gpu_dyn_inst);
239 }
240 bool accessSrfWr = computeUnit.srf[wf->simdId]
241 ->canScheduleWriteOperands(wf, gpu_dyn_inst);
242 bool accessRf = accessVrfWr && accessSrfWr;
243 if (accessRf) {
244 if (!gpu_dyn_inst->isScalar()) {
245 computeUnit.vrf[wf->simdId]->scheduleWriteOperands(wf,
246 gpu_dyn_inst);
247 }
248 computeUnit.srf[wf->simdId]->scheduleWriteOperands(wf, gpu_dyn_inst);
249 return true;
250 } else {
251 stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
252 if (!accessSrfWr) {
253 stats.rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++;
254 }
255 if (!accessVrfWr) {
256 stats.rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++;
257 }
258
259 // Increment stall counts for WF
260 wf->stats.schStalls++;
261 wf->stats.schRfAccessStalls++;
262 }
263 return false;
264}
265
266void
267ScheduleStage::scheduleRfDestOperands()
268{
269 for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
270 if (toExecute.dispatchStatus(j) == EMPTY ||
271 toExecute.dispatchStatus(j) == SKIP) {
272 continue;
273 }
274
275 // get the wave on dispatch list and attempt to allocate write
276 // resources in the RFs
277 const GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j);
278 assert(gpu_dyn_inst);
279 Wavefront *wf = gpu_dyn_inst->wavefront();
280 if (!schedRfWrites(j, gpu_dyn_inst)) {
281 reinsertToSchList(j, gpu_dyn_inst);
282 doDispatchListTransition(j, EMPTY);
283 // if this is a flat inst, also transition the LM pipe to empty
284 // Note: since FLAT/LM arbitration occurs before scheduling
285 // destination operands to the RFs, it is possible that a LM
286 // instruction lost arbitration, but would have been able to
287 // pass the RF destination operand check here, and execute
288 // instead of the FLAT.
289 if (wf->instructionBuffer.front()->isFlat()) {
290 assert(toExecute.dispatchStatus(wf->localMem)
291 == SKIP);
292 doDispatchListTransition(wf->localMem, EMPTY);
293 }
294 }
295 }
296}
297
298bool
299ScheduleStage::addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
300{
301 // Attempt to add the wave to the schList if the VRF can support the
302 // wave's next instruction
303 assert(gpu_dyn_inst);
304 Wavefront *wf = gpu_dyn_inst->wavefront();
305 bool accessVrf = true;
306 if (!gpu_dyn_inst->isScalar()) {
307 accessVrf = computeUnit.vrf[wf->simdId]
308 ->canScheduleReadOperands(wf, gpu_dyn_inst);
309 }
310 bool accessSrf = computeUnit.srf[wf->simdId]
311 ->canScheduleReadOperands(wf, gpu_dyn_inst);
312 // If RFs can support instruction, add to schList in RFBUSY state,
313 // place wave in wavesInSch and pipeMap, and schedule Rd/Wr operands
314 // to the VRF
315 bool accessRf = accessVrf && accessSrf;
316 wf->lastVrfStatus = accessVrf;
317 wf->lastSrfStatus = accessSrf;
318 if (accessRf) {
319 DPRINTF(GPUSched, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n",
320 exeType, wf->simdId, wf->wfDynId,
321 gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
322
323 computeUnit.insertInPipeMap(wf);
324 wavesInSch.emplace(wf->wfDynId);
325 schList.at(exeType).push_back(std::make_pair(gpu_dyn_inst, RFBUSY));
326 if (wf->isOldestInstBarrier() && wf->hasBarrier()) {
327 wf->setStatus(Wavefront::S_BARRIER);
328 }
329 if (wf->isOldestInstWaitcnt()) {
330 wf->setStatus(Wavefront::S_WAITCNT);
331 }
332 if (wf->isOldestInstSleep()) {
333 wf->setStatus(Wavefront::S_STALLED_SLEEP);
334 }
335 if (!gpu_dyn_inst->isScalar()) {
336 computeUnit.vrf[wf->simdId]
337 ->scheduleReadOperands(wf, gpu_dyn_inst);
338 }
339 computeUnit.srf[wf->simdId]->scheduleReadOperands(wf, gpu_dyn_inst);
340
341 DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
342 exeType, wf->simdId, wf->wfDynId,
343 gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
344 return true;
345 } else {
346 // Number of stall cycles due to RF access denied
347 stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
348 // Count number of denials due to each reason
349 // Multiple items may contribute to the denied request
350 if (!accessVrf) {
351 stats.rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++;
352 }
353 if (!accessSrf) {
354 stats.rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++;
355 }
356
357 // Increment stall counts for WF
358 wf->stats.schStalls++;
359 wf->stats.schRfAccessStalls++;
360 DPRINTF(GPUSched, "schList[%d]: Could not add: "
361 "SIMD[%d] WV[%d]: %d: %s\n",
362 exeType, wf->simdId, wf->wfDynId,
363 gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
364 }
365 return false;
366}
367
368void
369ScheduleStage::reinsertToSchList(int exeType,
370 const GPUDynInstPtr &gpu_dyn_inst)
371{
372 // Insert wave w into schList for specified exeType.
373 // Wave is inserted in age order, with oldest wave being at the
374 // front of the schList
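// Keeping the list age-sorted preserves the oldest-wave-first bias
// when a wave that lost arbitration competes for dispatch again in a
// later cycle.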
375 assert(gpu_dyn_inst);
376 auto schIter = schList.at(exeType).begin();
377 while (schIter != schList.at(exeType).end()
378 && schIter->first->wfDynId < gpu_dyn_inst->wfDynId) {
379 schIter++;
380 }
381 schList.at(exeType).insert(schIter, std::make_pair(gpu_dyn_inst, RFREADY));
382}
383
384void
385ScheduleStage::checkMemResources()
386{
387 // Check for resource availability in the next cycle
388 scalarMemBusRdy = false;
389 scalarMemIssueRdy = false;
390 // check if there is an SRF->Global Memory bus available
391 if (computeUnit.srfToScalarMemPipeBus.rdy(Cycles(1))) {
392 scalarMemBusRdy = true;
393 }
394 // check if we can issue a scalar memory instruction
395 if (computeUnit.scalarMemUnit.rdy(Cycles(1))) {
396 scalarMemIssueRdy = true;
397 }
398
399 glbMemBusRdy = false;
400 glbMemIssueRdy = false;
401 // check if there is a VRF->Global Memory bus available
402 if (computeUnit.vrfToGlobalMemPipeBus.rdy(Cycles(1))) {
403 glbMemBusRdy = true;
404 }
405 // check if we can issue a Global memory instruction
406 if (computeUnit.vectorGlobalMemUnit.rdy(Cycles(1))) {
407 glbMemIssueRdy = true;
408 }
409
410 locMemBusRdy = false;
411 locMemIssueRdy = false;
412 // check if there is a VRF->LDS bus available
413 if (computeUnit.vrfToLocalMemPipeBus.rdy(Cycles(1))) {
414 locMemBusRdy = true;
415 }
416 // check if we can issue a LDS instruction
417 if (computeUnit.vectorSharedMemUnit.rdy(Cycles(1))) {
418 locMemIssueRdy = true;
419 }
420}
421
422bool
423ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
424{
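// dispatchReady() only verifies that the required execution unit,
// buses, coalescer, and request FIFOs can accept this instruction in
// the next cycle; nothing is reserved here. Resources are reserved
// later in reserveResources() once the wave has won dispatch.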
425 assert(gpu_dyn_inst);
426 Wavefront *wf = gpu_dyn_inst->wavefront();
427 vectorAluRdy = false;
428 scalarAluRdy = false;
429 // check for available vector/scalar ALUs in the next cycle
430 if (computeUnit.vectorALUs[wf->simdId].rdy(Cycles(1))) {
431 vectorAluRdy = true;
432 }
433 if (computeUnit.scalarALUs[wf->scalarAlu].rdy(Cycles(1))) {
434 scalarAluRdy = true;
435 }
436
437 if (gpu_dyn_inst->isNop()) {
438 // S_NOP requires SALU. V_NOP requires VALU.
439 // TODO: Scalar NOP does not require SALU in hardware,
440 // and is executed out of IB directly.
441 if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
442 stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
443 return false;
444 } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
445 stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
446 return false;
447 }
448 } else if (gpu_dyn_inst->isEndOfKernel()) {
449 // EndPgm instruction
450 if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
451 stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
452 return false;
453 }
454 } else if (gpu_dyn_inst->isBarrier() || gpu_dyn_inst->isBranch()
455 || gpu_dyn_inst->isALU()) {
456 // Barrier, Branch, or ALU instruction
457 if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
458 stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
459 return false;
460 } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
461 stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
462 return false;
463 }
464 } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
465 // Vector Global Memory instruction
466 bool rdy = true;
467 if (!glbMemIssueRdy) {
468 rdy = false;
469 stats.dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
470 }
471 if (!glbMemBusRdy) {
472 rdy = false;
473 stats.dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
474 }
475 if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
476 rdy = false;
477 stats.dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
478 }
479 if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
480 rdy = false;
481 stats.dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
482 }
483 if (!rdy) {
484 return false;
485 }
486 } else if (gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
487 // Scalar Global Memory instruction
488 bool rdy = true;
489 if (!scalarMemIssueRdy) {
490 rdy = false;
491 stats.dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++;
492 }
493 if (!scalarMemBusRdy) {
494 rdy = false;
495 stats.dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
496 }
497 if (!computeUnit.scalarMemoryPipe
498 .isGMReqFIFOWrRdy(wf->scalarRdGmReqsInPipe
499 + wf->scalarWrGmReqsInPipe))
500 {
501 rdy = false;
502 stats.dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
503 }
504 if (!rdy) {
505 return false;
506 }
507 } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isLocalMem()) {
508 // Vector Local Memory instruction
509 bool rdy = true;
510 if (!locMemIssueRdy) {
511 rdy = false;
512 stats.dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++;
513 }
514 if (!locMemBusRdy) {
515 rdy = false;
516 stats.dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
517 }
518 if (!computeUnit.localMemoryPipe.
519 isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
520 rdy = false;
521 stats.dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
522 }
523 if (!rdy) {
524 return false;
525 }
526 } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isFlat()) {
527 // Vector Flat memory instruction
528 bool rdy = true;
529 if (!glbMemIssueRdy || !locMemIssueRdy) {
530 rdy = false;
531 stats.dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++;
532 }
533 if (!glbMemBusRdy || !locMemBusRdy) {
534 rdy = false;
535 stats.dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
536 }
537 if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
538 rdy = false;
539 stats.dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
540 }
541 if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
542 rdy = false;
543 stats.dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
544 }
545 if (!computeUnit.localMemoryPipe.
546 isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
547 rdy = false;
548 stats.dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
549 }
550 if (!rdy) {
551 return false;
552 }
553 } else {
554 panic("%s: unknown instr checked for readiness",
555 gpu_dyn_inst->disassemble());
556 return false;
557 }
558 stats.dispNrdyStalls[SCH_RDY]++;
559 return true;
560}
561
562void
563ScheduleStage::fillDispatchList()
564{
565 // update execution resource status
566 checkMemResources();
567 // iterate execution resources
568 for (int j = 0; j < computeUnit.numExeUnits(); j++) {
569 assert(toExecute.dispatchStatus(j) == EMPTY);
570
571 // iterate waves in schList to pick one for dispatch
572 auto schIter = schList.at(j).begin();
573 auto selected_iter = schList.at(j).end();
574
575 //find the earliest ready wave according to seqNum
576 for (auto iter = schList.at(j).begin();
577 iter != schList.at(j).end(); iter++) {
578 if (iter->second == RFREADY && dispatchReady(iter->first)) {
579 if (selected_iter == schList.at(j).end()) {
580 selected_iter = iter;
581 } else if
582 (selected_iter->first->seqNum() > iter->first->seqNum()) {
583 selected_iter = iter;
584 }
585 }
586 }
587 while (schIter != schList.at(j).end()) {
588 // only attempt to dispatch if status is RFREADY
589 if (schIter->second == RFREADY) {
590 //oldest wave selected for dispatch
591 if (schIter == selected_iter) {
592 GPUDynInstPtr mp = schIter->first;
593 if (!mp->isMemSync() && !mp->isScalar() &&
594 mp->needsToken()) {
595 computeUnit.globalMemoryPipe.acqCoalescerToken(mp);
596 }
597
598 // Set instruction's exec_mask if it's a mem operation
599 if (mp->isMemRef()) {
600 mp->exec_mask = mp->wavefront()->execMask();
601 }
602
603 doDispatchListTransition(j, EXREADY, schIter->first);
604 DPRINTF(GPUSched, "dispatchList[%d]: fillDispatchList: "
605 "EMPTY->EXREADY\n", j);
606 schIter->first = nullptr;
607 schIter++;
608 } else {
609 // Either another wave has been selected for dispatch,
610 // or this wave was not ready, so it is stalled this cycle
611 schIter->first->wavefront()->stats.schStalls++;
612 if (!dispatchReady(schIter->first)) {
613 // not ready for dispatch, increment stall stat
614 schIter->first->wavefront()->stats.schResourceStalls++;
615 }
616 // Examine next wave for this resource
617 schIter++;
618 }
619 } else {
620 // Wave not in RFREADY, try next wave
621 schIter++;
622 }
623 }
624
625 // Increment stall count if no wave sent to dispatchList for
626 // current execution resource
627 // No wave has been selected for dispatch = not dispatched
628 if (selected_iter == schList.at(j).end()) {
629 stats.schListToDispListStalls[j]++;
630 } else {
631 stats.schListToDispList[j]++;
632 //erase the dispatched wave
633 schList.at(j).erase(selected_iter);
634 }
635 }
636}
637
638void
639ScheduleStage::arbitrateVrfToLdsBus()
640{
641 // Arbitrate the VRF->GM and VRF->LDS buses for Flat memory ops
642
643 // iterate the GM pipelines
644 for (int i = 0; i < computeUnit.numVectorGlobalMemUnits; i++) {
645 // get the GM pipe index in the dispatchList
646 int gm_exe_unit = computeUnit.firstMemUnit() + i;
647 // get the wave in the dispatchList
648 GPUDynInstPtr &gpu_dyn_inst
649 = toExecute.readyInst(gm_exe_unit);
650 // If the WF is valid, ready to execute, and the instruction
651 // is a flat access, arbitrate with the WF's assigned LM pipe
652 if (gpu_dyn_inst && toExecute.dispatchStatus(gm_exe_unit)
653 == EXREADY && gpu_dyn_inst->isFlat()) {
654 Wavefront *wf = gpu_dyn_inst->wavefront();
655 // If the associated LM pipe also has a wave selected, block
656 // that wave and let the Flat instruction issue. The WF in the
657 // LM pipe is added back to the schList for consideration next
658 // cycle.
659 if (toExecute.dispatchStatus(wf->localMem) == EXREADY) {
660 reinsertToSchList(wf->localMem, toExecute
661 .readyInst(wf->localMem));
662 // Increment stall stats for LDS-VRF arbitration
663 stats.ldsBusArbStalls++;
664 toExecute.readyInst(wf->localMem)
665 ->wavefront()->stats.schLdsArbStalls++;
666 }
667 // With arbitration of LM pipe complete, transition the
668 // LM pipe to SKIP state in the dispatchList to inform EX stage
669 // that a Flat instruction is executing next cycle
670 doDispatchListTransition(wf->localMem, SKIP, gpu_dyn_inst);
671 DPRINTF(GPUSched, "dispatchList[%d]: arbVrfLds: "
672 "EXREADY->SKIP\n", wf->localMem);
673 }
674 }
675}
676
677void
678ScheduleStage::checkRfOperandReadComplete()
679{
680 // Iterate the schList queues and check if operand reads
681 // have completed in the RFs. If so, mark the wave as ready for
682 // selection for dispatchList
683 for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
684 for (auto &p : schList.at(j)) {
685 const GPUDynInstPtr &gpu_dyn_inst = p.first;
686 assert(gpu_dyn_inst);
687 Wavefront *wf = gpu_dyn_inst->wavefront();
688
689 // Increment the number of cycles the wave spends in the
690 // SCH stage, since this loop visits every wave in SCH.
691 wf->stats.schCycles++;
692
693 bool vrfRdy = true;
694 if (!gpu_dyn_inst->isScalar()) {
695 vrfRdy = computeUnit.vrf[wf->simdId]
696 ->operandReadComplete(wf, gpu_dyn_inst);
697 }
698 bool srfRdy = computeUnit.srf[wf->simdId]
699 ->operandReadComplete(wf, gpu_dyn_inst);
700 bool operandsReady = vrfRdy && srfRdy;
701 if (operandsReady) {
702 DPRINTF(GPUSched, "schList[%d]: WV[%d] operands ready for: "
703 "%d: %s\n", j, wf->wfDynId, gpu_dyn_inst->seqNum(),
704 gpu_dyn_inst->disassemble());
705 DPRINTF(GPUSched, "schList[%d]: WV[%d] RFBUSY->RFREADY\n",
706 j, wf->wfDynId);
707 p.second = RFREADY;
708 } else {
709 DPRINTF(GPUSched, "schList[%d]: WV[%d] operands not ready "
710 "for: %d: %s\n", j, wf->wfDynId,
711 gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
712
713 // operands not ready yet, increment SCH stage stats
714 // aggregate to all wavefronts on the CU
715 p.second = RFBUSY;
716
717 // Increment stall stats
718 wf->stats.schStalls++;
719 wf->stats.schOpdNrdyStalls++;
720
721 stats.opdNrdyStalls[SCH_RF_OPD_NRDY]++;
722 if (!vrfRdy) {
723 stats.opdNrdyStalls[SCH_VRF_OPD_NRDY]++;
724 }
725 if (!srfRdy) {
726 stats.opdNrdyStalls[SCH_SRF_OPD_NRDY]++;
727 }
728 }
729 }
730 }
731}
732
733void
734ScheduleStage::reserveResources()
735{
736 std::vector<bool> exeUnitReservations;
737 exeUnitReservations.resize(computeUnit.numExeUnits(), false);
738
739 for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
740 GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j);
741 if (gpu_dyn_inst) {
742 DISPATCH_STATUS s = toExecute.dispatchStatus(j);
743 Wavefront *wf = gpu_dyn_inst->wavefront();
744 if (s == EMPTY) {
745 continue;
746 } else if (s == EXREADY) {
747 // Wave is ready for execution
748 std::vector<int> execUnitIds = wf->reserveResources();
749
750 if (!gpu_dyn_inst->isScalar()) {
751 computeUnit.vrf[wf->simdId]
752 ->dispatchInstruction(gpu_dyn_inst);
753 }
754 computeUnit.srf[wf->simdId]->dispatchInstruction(gpu_dyn_inst);
755
756 std::stringstream ss;
757 for (auto id : execUnitIds) {
758 ss << id << " ";
759 }
760 DPRINTF(GPUSched, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s"
761 " Reserving ExeRes[ %s]\n",
762 j, wf->simdId, wf->wfDynId, gpu_dyn_inst->seqNum(),
763 gpu_dyn_inst->disassemble(), ss.str());
764 // mark the resources as reserved for this cycle
765 for (auto execUnitId : execUnitIds) {
766 panic_if(exeUnitReservations.at(execUnitId),
767 "Execution unit %d is reserved!!!\n"
768 "SIMD[%d] WV[%d]: %d: %s",
769 execUnitId, wf->simdId, wf->wfDynId,
770 gpu_dyn_inst->seqNum(),
771 gpu_dyn_inst->disassemble());
772 exeUnitReservations.at(execUnitId) = true;
773 }
774
775 // If wavefront::reserveResources reserved multiple resources,
776 // then we're executing a flat memory instruction. This means
777 // that we've reserved a global and local memory unit. Thus,
778 // we need to mark the latter execution unit as not available.
779 if (execUnitIds.size() > 1) {
780 [[maybe_unused]] int lm_exec_unit = wf->localMem;
781 assert(toExecute.dispatchStatus(lm_exec_unit)
782 == SKIP);
783 }
784 } else if (s == SKIP) {
785 // Shared Memory pipe reserved for FLAT instruction.
786 // Verify the GM pipe for this wave is ready to execute
787 // and the wave in the GM pipe is the same as the wave
788 // in the LM pipe
789 [[maybe_unused]] int gm_exec_unit = wf->globalMem;
790 assert(wf->wfDynId == toExecute
791 .readyInst(gm_exec_unit)->wfDynId);
792 assert(toExecute.dispatchStatus(gm_exec_unit)
793 == EXREADY);
794 }
795 }
796 }
797}
798
799void
800ScheduleStage::deleteFromSch(Wavefront *w)
801{
802 wavesInSch.erase(w->wfDynId);
803}
804
805ScheduleStage::ScheduleStageStats::ScheduleStageStats(
806 statistics::Group *parent, int num_exec_units)
807 : statistics::Group(parent, "ScheduleStage"),
808 ADD_STAT(rdyListEmpty ,"number of cycles no wave on ready list per "
809 "execution resource"),
810 ADD_STAT(rdyListNotEmpty, "number of cycles one or more wave on ready "
811 "list per execution resource"),
812 ADD_STAT(addToSchListStalls, "number of cycles a wave is not added to "
813 "schList per execution resource when ready list is not empty"),
814 ADD_STAT(schListToDispList, "number of cycles a wave is added to "
815 "dispatchList per execution resource"),
816 ADD_STAT(schListToDispListStalls, "number of cycles no wave is added to"
817 " dispatchList per execution resource"),
818 ADD_STAT(rfAccessStalls, "number of stalls due to RF access denied"),
819 ADD_STAT(ldsBusArbStalls, "number of stalls due to VRF->LDS bus "
820 "conflicts"),
821 ADD_STAT(opdNrdyStalls, "number of stalls in SCH due to operands not "
822 "ready"),
823 ADD_STAT(dispNrdyStalls, "number of stalls in SCH due to resource not "
824 "ready")
825{
826 rdyListNotEmpty.init(num_exec_units);
827 rdyListEmpty.init(num_exec_units);
828 addToSchListStalls.init(num_exec_units);
829 schListToDispList.init(num_exec_units);
830 schListToDispListStalls.init(num_exec_units);
831 rfAccessStalls.init(SCH_RF_ACCESS_NRDY_CONDITIONS);
832 opdNrdyStalls.init(SCH_RF_OPD_NRDY_CONDITIONS);
833 dispNrdyStalls.init(SCH_NRDY_CONDITIONS);
834
835 opdNrdyStalls.subname(SCH_VRF_OPD_NRDY, csprintf("VRF"));
836 opdNrdyStalls.subname(SCH_SRF_OPD_NRDY, csprintf("SRF"));
837 opdNrdyStalls.subname(SCH_RF_OPD_NRDY, csprintf("RF"));
838
839 dispNrdyStalls.subname(SCH_SCALAR_ALU_NRDY, csprintf("ScalarAlu"));
840 dispNrdyStalls.subname(SCH_VECTOR_ALU_NRDY, csprintf("VectorAlu"));
841 dispNrdyStalls.subname(SCH_VECTOR_MEM_ISSUE_NRDY,
842 csprintf("VectorMemIssue"));
843 dispNrdyStalls.subname(SCH_VECTOR_MEM_BUS_BUSY_NRDY,
844 csprintf("VectorMemBusBusy"));
845 dispNrdyStalls.subname(SCH_VECTOR_MEM_COALESCER_NRDY,
846 csprintf("VectorMemCoalescer"));
847 dispNrdyStalls.subname(SCH_CEDE_SIMD_NRDY, csprintf("CedeSimd"));
848 dispNrdyStalls.subname(SCH_SCALAR_MEM_ISSUE_NRDY,
849 csprintf("ScalarMemIssue"));
850 dispNrdyStalls.subname(SCH_SCALAR_MEM_BUS_BUSY_NRDY,
851 csprintf("ScalarMemBusBusy"));
852 dispNrdyStalls.subname(SCH_SCALAR_MEM_FIFO_NRDY,
853 csprintf("ScalarMemFIFO"));
854 dispNrdyStalls.subname(SCH_LOCAL_MEM_ISSUE_NRDY,
855 csprintf("LocalMemIssue"));
856 dispNrdyStalls.subname(SCH_LOCAL_MEM_BUS_BUSY_NRDY,
857 csprintf("LocalMemBusBusy"));
858 dispNrdyStalls.subname(SCH_LOCAL_MEM_FIFO_NRDY,
859 csprintf("LocalMemFIFO"));
860 dispNrdyStalls.subname(SCH_FLAT_MEM_ISSUE_NRDY,
861 csprintf("FlatMemIssue"));
862 dispNrdyStalls.subname(SCH_FLAT_MEM_BUS_BUSY_NRDY,
863 csprintf("FlatMemBusBusy"));
864 dispNrdyStalls.subname(SCH_FLAT_MEM_COALESCER_NRDY,
865 csprintf("FlatMemCoalescer"));
866 dispNrdyStalls.subname(SCH_FLAT_MEM_FIFO_NRDY,
867 csprintf("FlatMemFIFO"));
868 dispNrdyStalls.subname(SCH_RDY, csprintf("Ready"));
869
870 rfAccessStalls.subname(SCH_VRF_RD_ACCESS_NRDY, csprintf("VrfRd"));
871 rfAccessStalls.subname(SCH_VRF_WR_ACCESS_NRDY, csprintf("VrfWr"));
872 rfAccessStalls.subname(SCH_SRF_RD_ACCESS_NRDY, csprintf("SrfRd"));
873 rfAccessStalls.subname(SCH_SRF_WR_ACCESS_NRDY, csprintf("SrfWr"));
874 rfAccessStalls.subname(SCH_RF_ACCESS_NRDY, csprintf("Any"));
875}
876
877} // namespace gem5
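A minimal, self-contained sketch (not gem5 code) of the two schList policies
implemented above: reinsertToSchList() keeps entries sorted by wave age
(wfDynId), and fillDispatchList() picks the oldest ready entry by seqNum.
Names such as FakeInst, reinsert, and pickOldestReady are hypothetical and
exist only for this illustration.

#include <cstdint>
#include <deque>
#include <iostream>
#include <utility>

enum SchStatus { RFBUSY, RFREADY };

struct FakeInst
{
    uint64_t wfDynId;   // wave age: smaller means older
    uint64_t seqNum;    // instruction age within the program
    bool ready;         // stands in for dispatchReady()
};

using SchList = std::deque<std::pair<FakeInst, SchStatus>>;

// Age-ordered insertion, mirroring reinsertToSchList().
void
reinsert(SchList &schList, const FakeInst &inst)
{
    auto it = schList.begin();
    while (it != schList.end() && it->first.wfDynId < inst.wfDynId) {
        ++it;
    }
    schList.insert(it, std::make_pair(inst, RFREADY));
}

// Oldest-ready selection, mirroring the seqNum scan in fillDispatchList().
SchList::iterator
pickOldestReady(SchList &schList)
{
    auto selected = schList.end();
    for (auto it = schList.begin(); it != schList.end(); ++it) {
        if (it->second == RFREADY && it->first.ready) {
            if (selected == schList.end() ||
                it->first.seqNum < selected->first.seqNum) {
                selected = it;
            }
        }
    }
    return selected;
}

int
main()
{
    SchList schList;
    reinsert(schList, {3, 30, true});
    reinsert(schList, {1, 12, false});   // oldest wave, but not ready
    reinsert(schList, {2, 25, true});

    auto sel = pickOldestReady(schList);
    if (sel != schList.end()) {
        std::cout << "dispatch WV[" << sel->first.wfDynId << "] seq "
                  << sel->first.seqNum << std::endl;  // prints WV[2] seq 25
        schList.erase(sel);
    }
    return 0;
}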