gem5 v23.0.0.1
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
wavefront.cc
Go to the documentation of this file.
1/*
2 * Copyright (c) 2011-2017 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
33
34#include "base/bitfield.hh"
35#include "debug/GPUExec.hh"
36#include "debug/GPUInitAbi.hh"
37#include "debug/WavefrontStack.hh"
41#include "gpu-compute/shader.hh"
44
45namespace gem5
46{
47
// Wavefront constructor (the signature line was elided by the doc
// extraction; this is the member-initializer list and body).
// Initializes wait-counters to -1 (i.e. "no waitcnt pending"), issue
// counters to 0, and sizes all per-lane bookkeeping vectors to the
// wavefront width p.wf_size.
// NOTE(review): several member initializations between original lines
// 56 and 76 were dropped by the extraction — verify against upstream.
49 : SimObject(p), wfSlotId(p.wf_slot_id), simdId(p.simdId),
50 maxIbSize(p.max_ib_size), _gpuISA(*this),
51 vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
52 vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
53 sleepCnt(0), barId(WFBarrier::InvalidID), stats(this)
54{
55 lastTrace = 0;
56 execUnitId = -1;
76 ldsChunk = nullptr;
77
78 memTraceBusy = 0;
    // All-ones sentinels: presumably "no saved register tick/count yet"
    // — TODO confirm against upstream.
79 oldVgprTcnt = 0xffffffffffffffffll;
80 oldDgprTcnt = 0xffffffffffffffffll;
81 oldVgpr.resize(p.wf_size);
82
83 pendingFetch = false;
84 dropFetch = false;
85 maxVgprs = 0;
86 maxSgprs = 0;
87
    // Per-lane state, one entry per work item in the wavefront.
88 lastAddr.resize(p.wf_size);
89 workItemFlatId.resize(p.wf_size);
90 oldDgpr.resize(p.wf_size);
91 for (int i = 0; i < 3; ++i) {
92 workItemId[i].resize(p.wf_size);
93 }
94
    // Start with every lane active.
95 _execMask.set();
96 rawDist.clear();
97 lastInstExec = 0;
98 vecReads.clear();
99}
100
// Fragment of an initialization method (name and most of the body were
// elided by the extraction). The visible statements reset the base
// indices of this wavefront's vector/scalar register allocations.
// NOTE(review): original lines 102, 104-105 and 109-113 are missing
// from this extract — verify against upstream before editing.
101void
103{
106 startVgprIndex = 0;
107 startSgprIndex = 0;
108
114}
115
116void
117Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
118{
119 int regInitIdx = 0;
120
121 // Iterate over all the init fields and check which
122 // bits are enabled. Useful information can be found here:
123 // https://github.com/ROCm-Developer-Tools/ROCm-ComputeABI-Doc/
124 // blob/master/AMDGPU-ABI.md
125 for (int en_bit = 0; en_bit < NumScalarInitFields; ++en_bit) {
126
127 if (task->sgprBitEnabled(en_bit)) {
128 int physSgprIdx = 0;
129 uint32_t wiCount = 0;
130 uint32_t firstWave = 0;
131 int orderedAppendTerm = 0;
132 int numWfsInWg = 0;
133 uint32_t finalValue = 0;
134 Addr host_disp_pkt_addr = task->hostDispPktAddr();
135 Addr kernarg_addr = task->kernargAddr();
136 Addr hidden_priv_base(0);
137
138 switch (en_bit) {
139 case PrivateSegBuf:
140 physSgprIdx =
141 computeUnit->registerManager->mapSgpr(this, regInitIdx);
142 computeUnit->srf[simdId]->write(physSgprIdx,
144 ++regInitIdx;
145 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
146 "Setting PrivateSegBuffer: s[%d] = %x\n",
148 wfSlotId, wfDynId, physSgprIdx,
150
151 physSgprIdx =
152 computeUnit->registerManager->mapSgpr(this, regInitIdx);
153 computeUnit->srf[simdId]->write(physSgprIdx,
155 ++regInitIdx;
156 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
157 "Setting PrivateSegBuffer: s[%d] = %x\n",
159 wfSlotId, wfDynId, physSgprIdx,
161
162 physSgprIdx =
163 computeUnit->registerManager->mapSgpr(this, regInitIdx);
164 computeUnit->srf[simdId]->write(physSgprIdx,
166 ++regInitIdx;
167 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
168 "Setting PrivateSegBuffer: s[%d] = %x\n",
170 wfSlotId, wfDynId, physSgprIdx,
172
173 physSgprIdx =
174 computeUnit->registerManager->mapSgpr(this, regInitIdx);
175 computeUnit->srf[simdId]->write(physSgprIdx,
177
178 ++regInitIdx;
179 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
180 "Setting PrivateSegBuffer: s[%d] = %x\n",
182 wfSlotId, wfDynId, physSgprIdx,
184 break;
185 case DispatchPtr:
186 physSgprIdx =
187 computeUnit->registerManager->mapSgpr(this, regInitIdx);
188 computeUnit->srf[simdId]->write(physSgprIdx,
189 bits(host_disp_pkt_addr, 31, 0));
190 ++regInitIdx;
191 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
192 "Setting DispatchPtr: s[%d] = %x\n",
194 wfSlotId, wfDynId, physSgprIdx,
195 bits(host_disp_pkt_addr, 31, 0));
196
197 physSgprIdx =
198 computeUnit->registerManager->mapSgpr(this, regInitIdx);
199 computeUnit->srf[simdId]->write(physSgprIdx,
200 bits(host_disp_pkt_addr, 63, 32));
201 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
202 "Setting DispatchPtr: s[%d] = %x\n",
204 wfSlotId, wfDynId, physSgprIdx,
205 bits(host_disp_pkt_addr, 63, 32));
206
207 ++regInitIdx;
208 break;
209 case QueuePtr:
210 physSgprIdx =
211 computeUnit->registerManager->mapSgpr(this, regInitIdx);
212 computeUnit->srf[simdId]->write(physSgprIdx,
213 bits(task->hostAMDQueueAddr, 31, 0));
214 ++regInitIdx;
215 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
216 "Setting QueuePtr: s[%d] = %x\n",
218 wfSlotId, wfDynId, physSgprIdx,
219 bits(task->hostAMDQueueAddr, 31, 0));
220
221 physSgprIdx =
222 computeUnit->registerManager->mapSgpr(this, regInitIdx);
223 computeUnit->srf[simdId]->write(physSgprIdx,
224 bits(task->hostAMDQueueAddr, 63, 32));
225 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
226 "Setting QueuePtr: s[%d] = %x\n",
228 wfSlotId, wfDynId, physSgprIdx,
229 bits(task->hostAMDQueueAddr, 63, 32));
230
231 ++regInitIdx;
232 break;
233 case KernargSegPtr:
234 physSgprIdx =
235 computeUnit->registerManager->mapSgpr(this, regInitIdx);
236 computeUnit->srf[simdId]->write(physSgprIdx,
237 bits(kernarg_addr, 31, 0));
238 ++regInitIdx;
239 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
240 "Setting KernargSegPtr: s[%d] = %x\n",
242 wfSlotId, wfDynId, physSgprIdx,
243 bits(kernarg_addr, 31, 0));
244
245 physSgprIdx =
246 computeUnit->registerManager->mapSgpr(this, regInitIdx);
247 computeUnit->srf[simdId]->write(physSgprIdx,
248 bits(kernarg_addr, 63, 32));
249 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
250 "Setting KernargSegPtr: s[%d] = %x\n",
252 wfSlotId, wfDynId, physSgprIdx,
253 bits(kernarg_addr, 63, 32));
254
255 ++regInitIdx;
256 break;
257 case DispatchId:
258 physSgprIdx
259 = computeUnit->registerManager->mapSgpr(this, regInitIdx);
260 computeUnit->srf[simdId]->write(physSgprIdx,
261 task->dispatchId());
262 ++regInitIdx;
263 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
264 "Setting DispatchId: s[%d] = %x\n",
266 wfSlotId, wfDynId, physSgprIdx,
267 task->dispatchId());
268
269 // Dispatch ID in gem5 is an int. Set upper 32-bits to zero.
270 physSgprIdx
271 = computeUnit->registerManager->mapSgpr(this, regInitIdx);
272 computeUnit->srf[simdId]->write(physSgprIdx, 0);
273 ++regInitIdx;
274 break;
275 case FlatScratchInit:
276 physSgprIdx
277 = computeUnit->registerManager->mapSgpr(this, regInitIdx);
278 computeUnit->srf[simdId]->write(physSgprIdx,
279 (TheGpuISA::ScalarRegU32)(task->amdQueue
280 .scratch_backing_memory_location & 0xffffffff));
281 ++regInitIdx;
282 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
283 "Setting FlatScratch Addr: s[%d] = %x\n",
285 wfSlotId, wfDynId, physSgprIdx,
286 (TheGpuISA::ScalarRegU32)(task->amdQueue
287 .scratch_backing_memory_location & 0xffffffff));
288
289 physSgprIdx =
290 computeUnit->registerManager->mapSgpr(this, regInitIdx);
291 // This vallue should be sizeof(DWORD) aligned, that is
292 // 4 byte aligned
293 computeUnit->srf[simdId]->write(physSgprIdx,
295 ++regInitIdx;
296 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
297 "Setting FlatScratch size: s[%d] = %x\n",
299 wfSlotId, wfDynId, physSgprIdx,
324 hidden_priv_base =
325 (uint64_t)task->amdQueue.scratch_resource_descriptor[0] |
326 (((uint64_t)task->amdQueue.scratch_resource_descriptor[1]
327 & 0x000000000000ffff) << 32);
329 hidden_priv_base,
331 break;
332 case PrivateSegSize:
333 physSgprIdx
334 = computeUnit->registerManager->mapSgpr(this, regInitIdx);
335 computeUnit->srf[simdId]->write(physSgprIdx,
336 task->privMemPerItem());
337 ++regInitIdx;
338 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
339 "Setting private segment size: s[%d] = %x\n",
341 wfSlotId, wfDynId, physSgprIdx,
342 task->privMemPerItem());
343 break;
345 physSgprIdx =
346 computeUnit->registerManager->mapSgpr(this, regInitIdx);
347 wiCount = ((task->gridSize(0) +
348 task->wgSize(0) - 1) /
349 task->wgSize(0));
350 computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
351
352 ++regInitIdx;
353 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
354 "Setting num WG X: s[%d] = %x\n",
356 wfSlotId, wfDynId, physSgprIdx, wiCount);
357 break;
359 physSgprIdx =
360 computeUnit->registerManager->mapSgpr(this, regInitIdx);
361 wiCount = ((task->gridSize(1) +
362 task->wgSize(1) - 1) /
363 task->wgSize(1));
364 computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
365
366 ++regInitIdx;
367 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
368 "Setting num WG Y: s[%d] = %x\n",
370 wfSlotId, wfDynId, physSgprIdx, wiCount);
371 break;
373 physSgprIdx =
374 computeUnit->registerManager->mapSgpr(this, regInitIdx);
375 wiCount = ((task->gridSize(2) +
376 task->wgSize(2) - 1) /
377 task->wgSize(2));
378 computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
379
380 ++regInitIdx;
381 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
382 "Setting num WG Z: s[%d] = %x\n",
384 wfSlotId, wfDynId, physSgprIdx, wiCount);
385 break;
386 case WorkgroupIdX:
387 physSgprIdx =
388 computeUnit->registerManager->mapSgpr(this, regInitIdx);
389 computeUnit->srf[simdId]->write(physSgprIdx,
390 workGroupId[0]);
391
392 ++regInitIdx;
393 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
394 "Setting WG ID X: s[%d] = %x\n",
396 wfSlotId, wfDynId, physSgprIdx, workGroupId[0]);
397 break;
398 case WorkgroupIdY:
399 physSgprIdx =
400 computeUnit->registerManager->mapSgpr(this, regInitIdx);
401 computeUnit->srf[simdId]->write(physSgprIdx,
402 workGroupId[1]);
403
404 ++regInitIdx;
405 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
406 "Setting WG ID Y: s[%d] = %x\n",
408 wfSlotId, wfDynId, physSgprIdx, workGroupId[1]);
409 break;
410 case WorkgroupIdZ:
411 physSgprIdx =
412 computeUnit->registerManager->mapSgpr(this, regInitIdx);
413 computeUnit->srf[simdId]->write(physSgprIdx,
414 workGroupId[2]);
415
416 ++regInitIdx;
417 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
418 "Setting WG ID Z: s[%d] = %x\n",
420 wfSlotId, wfDynId, physSgprIdx, workGroupId[2]);
421 break;
423 physSgprIdx =
424 computeUnit->registerManager->mapSgpr(this, regInitIdx);
438 computeUnit->srf[simdId]->write(physSgprIdx, 1024 *
439 (wgId * (wgSz / 64) + wfId) *
441
442 ++regInitIdx;
443 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
444 "Setting Private Seg Offset: s[%d] = %x\n",
446 wfSlotId, wfDynId, physSgprIdx,
447 1024 * (wgId * (wgSz / 64) + wfId) *
449 break;
450 case WorkgroupInfo:
451 firstWave = (wfId == 0) ? 1 : 0;
452 numWfsInWg = divCeil(wgSizeInWorkItems,
454 finalValue = firstWave << ((sizeof(uint32_t) * 8) - 1);
455 finalValue |= (orderedAppendTerm << 6);
456 finalValue |= numWfsInWg;
457 physSgprIdx =
458 computeUnit->registerManager->mapSgpr(this, regInitIdx);
460 write(physSgprIdx, finalValue);
461
462 ++regInitIdx;
463 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
464 "Setting WG Info: s[%d] = %x\n",
466 wfSlotId, wfDynId, physSgprIdx, finalValue);
467 break;
468 default:
469 fatal("SGPR enable bit %i not supported\n", en_bit);
470 break;
471 }
472 }
473 }
474
475 regInitIdx = 0;
476
477 // iterate over all the init fields and check which
478 // bits are enabled
479 for (int en_bit = 0; en_bit < NumVectorInitFields; ++en_bit) {
480 if (task->vgprBitEnabled(en_bit)) {
481 uint32_t physVgprIdx = 0;
482 TheGpuISA::VecRegContainerU32 raw_vgpr;
483
484 switch (en_bit) {
485 case WorkitemIdX:
486 {
487 physVgprIdx = computeUnit->registerManager
488 ->mapVgpr(this, regInitIdx);
489 TheGpuISA::VecElemU32 *vgpr_x
490 = raw_vgpr.as<TheGpuISA::VecElemU32>();
491
492 for (int lane = 0; lane < workItemId[0].size(); ++lane) {
493 vgpr_x[lane] = workItemId[0][lane];
494 }
495
496 computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
497 rawDist[regInitIdx] = 0;
498 ++regInitIdx;
499 }
500 break;
501 case WorkitemIdY:
502 {
503 physVgprIdx = computeUnit->registerManager
504 ->mapVgpr(this, regInitIdx);
505 TheGpuISA::VecElemU32 *vgpr_y
506 = raw_vgpr.as<TheGpuISA::VecElemU32>();
507
508 for (int lane = 0; lane < workItemId[1].size(); ++lane) {
509 vgpr_y[lane] = workItemId[1][lane];
510 }
511
512 computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
513 rawDist[regInitIdx] = 0;
514 ++regInitIdx;
515 }
516 break;
517 case WorkitemIdZ:
518 {
519 physVgprIdx = computeUnit->registerManager->
520 mapVgpr(this, regInitIdx);
521 TheGpuISA::VecElemU32 *vgpr_z
522 = raw_vgpr.as<TheGpuISA::VecElemU32>();
523
524 for (int lane = 0; lane < workItemId[2].size(); ++lane) {
525 vgpr_z[lane] = workItemId[2][lane];
526 }
527
528 computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
529 rawDist[regInitIdx] = 0;
530 ++regInitIdx;
531 }
532 break;
533 }
534 }
535 }
536}
537
538void
539Wavefront::resizeRegFiles(int num_vregs, int num_sregs)
540{
541 maxVgprs = num_vregs;
542 maxSgprs = num_sregs;
543}
544
546{
547}
548
549void
551{
552 if (computeUnit->idleCUTimeout > 0) {
553 // Wavefront's status transitions to stalled or stopped
554 if ((newStatus == S_STOPPED || newStatus == S_STALLED ||
555 newStatus == S_WAITCNT || newStatus == S_BARRIER) &&
556 (status != newStatus)) {
558 assert(computeUnit->idleWfs <=
560 if (computeUnit->idleWfs ==
563 }
564 // Wavefront's status transitions to an active state (from
565 // a stopped or stalled state)
566 } else if ((status == S_STOPPED || status == S_STALLED ||
567 status == S_WAITCNT || status == S_BARRIER) &&
568 (status != newStatus)) {
569 // if all WFs in the CU were idle then check if the idleness
570 // period exceeded the timeout threshold
571 if (computeUnit->idleWfs ==
575 "CU%d has been idle for %d ticks at tick %d",
577 curTick());
578 }
580 assert(computeUnit->idleWfs >= 0);
581 }
582 }
583 status = newStatus;
584}
585
// Activate this wavefront: assign its dynamic id, set the starting PC,
// and size the per-virtual-register read-counter vector.
// NOTE(review): original line 593 was elided by the extraction
// (presumably a status transition) — verify against upstream.
586void
587Wavefront::start(uint64_t _wf_dyn_id, Addr init_pc)
588{
589 wfDynId = _wf_dyn_id;
590 _pc = init_pc;
591
593
    // One read counter per allocated VGPR, initialized to zero.
594 vecReads.resize(maxVgprs, 0);
595}
596
// Predicate (signature line elided by the extraction): true when the
// instruction is a global-memory access — either an explicit global
// op, or a FLAT op that resolved to the global aperture.
597bool
599{
600 if (ii->isGlobalMem() ||
601 (ii->isFlat() && ii->executedAs() == enums::SC_GLOBAL)) {
602 return true;
603 }
604
605 return false;
606}
607
// Predicate (signature line elided by the extraction): true when the
// instruction is a local/LDS-memory access — either an explicit local
// op, or a FLAT op that resolved to the group segment.
608bool
610{
611 if (ii->isLocalMem() ||
612 (ii->isFlat() && ii->executedAs() == enums::SC_GROUP)) {
613 return true;
614 }
615
616 return false;
617}
618
// Predicate (signature line elided by the extraction): true when the
// oldest (front) instruction in the instruction buffer is a sleep.
// Returns false for an empty buffer.
619bool
621{
622 if (instructionBuffer.empty())
623 return false;
624
625 GPUDynInstPtr ii = instructionBuffer.front();
626
627 if (ii->isSleep()) {
628 return true;
629 }
630 return false;
631}
632
// Predicate (signature line elided by the extraction): true when the
// oldest buffered instruction is a waitcnt. Returns false for an
// empty buffer.
633bool
635{
636 if (instructionBuffer.empty())
637 return false;
638
639 GPUDynInstPtr ii = instructionBuffer.front();
640
641 if (ii->isWaitcnt()) {
642 // waitcnt is a scalar
643 assert(ii->isScalar());
644 return true;
645 }
646
647 return false;
648}
649
// Predicate (signature line elided by the extraction): true when the
// wavefront is not stopped and the oldest buffered instruction is a
// scalar op eligible for the scalar ALU pipe (nop/return/end-of-kernel/
// branch/ALU, or a kernarg-segment load).
650bool
652{
653 assert(!instructionBuffer.empty());
654 GPUDynInstPtr ii = instructionBuffer.front();
655
656 if (status != S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn()
657 || ii->isEndOfKernel() || ii->isBranch() || ii->isALU() ||
658 (ii->isKernArgSeg() && ii->isLoad()))) {
659 return true;
660 }
661
662 return false;
663}
664
// Predicate (signature line elided by the extraction): vector-side
// counterpart of the scalar-ALU check above — same instruction classes
// but requires the op NOT to be scalar.
665bool
667{
668 assert(!instructionBuffer.empty());
669 GPUDynInstPtr ii = instructionBuffer.front();
670
671 if (status != S_STOPPED && !ii->isScalar() && (ii->isNop() ||
672 ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel()
673 || (ii->isKernArgSeg() && ii->isLoad()))) {
674 return true;
675 }
676
677 return false;
678}
679
// Predicate (signature line elided by the extraction): true when the
// wavefront is not stopped and the oldest buffered instruction is a
// barrier.
680bool
682{
683 assert(!instructionBuffer.empty());
684 GPUDynInstPtr ii = instructionBuffer.front();
685
686 if (status != S_STOPPED && ii->isBarrier()) {
687 return true;
688 }
689
690 return false;
691}
692
// Predicate (signature line elided by the extraction): true when the
// oldest buffered instruction is a non-scalar (vector) global-memory
// access and the wavefront is not stopped.
693bool
695{
696 assert(!instructionBuffer.empty());
697 GPUDynInstPtr ii = instructionBuffer.front();
698
699 if (status != S_STOPPED && !ii->isScalar() && ii->isGlobalMem()) {
700 return true;
701 }
702
703 return false;
704}
705
// Predicate (signature line elided by the extraction): true when the
// oldest buffered instruction is a scalar global-memory access and the
// wavefront is not stopped.
706bool
708{
709 assert(!instructionBuffer.empty());
710 GPUDynInstPtr ii = instructionBuffer.front();
711
712 if (status != S_STOPPED && ii->isScalar() && ii->isGlobalMem()) {
713 return true;
714 }
715
716 return false;
717}
718
// Predicate (signature line elided by the extraction): true when the
// oldest buffered instruction is a local (LDS) memory access and the
// wavefront is not stopped.
719bool
721{
722 assert(!instructionBuffer.empty());
723 GPUDynInstPtr ii = instructionBuffer.front();
724
725 if (status != S_STOPPED && ii->isLocalMem()) {
726 return true;
727 }
728
729 return false;
730}
731
// Predicate (signature line elided by the extraction): true when the
// oldest buffered instruction accesses the private segment and the
// wavefront is not stopped.
732bool
734{
735 assert(!instructionBuffer.empty());
736 GPUDynInstPtr ii = instructionBuffer.front();
737
738 if (status != S_STOPPED && ii->isPrivateSeg()) {
739 return true;
740 }
741
742 return false;
743}
744
// Predicate (signature line elided by the extraction): true when the
// oldest buffered instruction is a FLAT memory access and the
// wavefront is not stopped.
745bool
747{
748 assert(!instructionBuffer.empty());
749 GPUDynInstPtr ii = instructionBuffer.front();
750
751 if (status != S_STOPPED && ii->isFlat()) {
752 return true;
753 }
754
755 return false;
756}
757
// Scan (signature line elided by the extraction): true when ANY
// instruction currently in the buffer is control flow — a return,
// branch, or end-of-kernel. Unlike the predicates above this looks at
// the whole buffer, not just the front.
758bool
760{
761 for (auto it : instructionBuffer) {
762 GPUDynInstPtr ii = it;
763 if (ii->isReturn() || ii->isBranch() ||
764 ii->isEndOfKernel()) {
765 return true;
766 }
767 }
768
769 return false;
770}
771
// (Signature line elided by the extraction.) Resets the selected
// execution-unit id to the "none chosen" sentinel (-1) — presumably
// called when this wavefront releases its pipeline resources; confirm
// the caller against upstream.
772void
774{
775 execUnitId = -1;
776}
777
779{
781 wrLmReqsInPipe < 0 || rdLmReqsInPipe < 0 ||
782 outstandingReqs < 0,
783 "Negative requests in pipe for WF%d for slot%d"
784 " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d,"
785 " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d,"
786 " Outstanding Reqs=%d\n",
789}
790
791void
793{
794 if (!ii->isScalar()) {
795 if (ii->isLoad()) {
797 } else if (ii->isStore()) {
799 } else if (ii->isAtomic() || ii->isMemSync()) {
802 } else {
803 panic("Invalid memory operation!\n");
804 }
806 } else {
807 if (ii->isLoad()) {
809 } else if (ii->isStore()) {
811 } else if (ii->isAtomic() || ii->isMemSync()) {
814 } else {
815 panic("Invalid memory operation!\n");
816 }
818 }
819}
820
821void
823{
824 fatal_if(ii->isScalar(),
825 "Scalar instructions can not access Shared memory!!!");
826 if (ii->isLoad()) {
828 } else if (ii->isStore()) {
830 } else if (ii->isAtomic() || ii->isMemSync()) {
833 } else {
834 panic("Invalid memory operation!\n");
835 }
837}
838
841{
842 // vector of execution unit IDs to return to schedule stage
843 // this return is only used for debugging and an assertion...
844 std::vector<int> execUnitIds;
845
846 // Get current instruction
847 GPUDynInstPtr ii = instructionBuffer.front();
848 assert(ii);
849
850 // Single precision ALU or Branch or Return or Special instruction
851 if (ii->isALU() || ii->isSpecialOp() ||
852 ii->isBranch() || ii->isNop() ||
853 (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
854 ii->isReturn() || ii->isEndOfKernel()) {
855 if (!ii->isScalar()) {
857 } else {
859 }
860 // this is to enforce a fixed number of cycles per issue slot per SIMD
861 } else if (ii->isBarrier()) {
862 execUnitId = ii->isScalar() ? scalarAluGlobalIdx : simdId;
863 } else if (ii->isFlat()) {
864 assert(!ii->isScalar());
866 // add execUnitId, reserved by reserveLmResource, list before it is
867 // overwriten by reserveGmResource
868 execUnitIds.push_back(execUnitId);
872 execUnitIds.push_back(flatGmUnitId);
873 execUnitId = -1;
874 } else if (ii->isGlobalMem()) {
876 } else if (ii->isLocalMem()) {
878 } else if (ii->isPrivateSeg()) {
879 fatal_if(ii->isScalar(),
880 "Scalar instructions can not access Private memory!!!");
882 } else {
883 panic("reserveResources -> Couldn't process op!\n");
884 }
885
886 if (execUnitId != -1) {
887 execUnitIds.push_back(execUnitId);
888 }
889 assert(execUnitIds.size());
890 return execUnitIds;
891}
892
893void
895{
896 // ---- Exit if wavefront is inactive ----------------------------- //
897
898 if (status == S_STOPPED || status == S_RETURNING ||
899 status==S_STALLED || instructionBuffer.empty()) {
900 return;
901 }
902
903 if (status == S_WAITCNT) {
915 assert(isOldestInstWaitcnt());
916 }
917
918 // Get current instruction
919
920 GPUDynInstPtr ii = instructionBuffer.front();
921
922 const Addr old_pc = pc();
923 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
924 "(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId,
925 wfDynId, ii->disassemble(), old_pc, ii->seqNum());
926
927 ii->execute(ii);
928 // delete the dynamic instruction from the pipeline map
930 // update the instruction stats in the CU
932
933 // inform VRF of instruction execution to schedule write-back
934 // and scoreboard ready for registers
935 if (!ii->isScalar()) {
936 computeUnit->vrf[simdId]->waveExecuteInst(this, ii);
937 }
938 computeUnit->srf[simdId]->waveExecuteInst(this, ii);
939
940 computeUnit->shader->incVectorInstSrcOperand(ii->numSrcVecRegOperands());
941 computeUnit->shader->incVectorInstDstOperand(ii->numDstVecRegOperands());
950
951 if (lastInstExec) {
954 }
956
957 // want to track:
958 // number of reads that occur per value written
959
960 // vector RAW dependency tracking
961 for (const auto& srcVecOp : ii->srcVecRegOperands()) {
962 for (const auto& virtIdx : srcVecOp.virtIndices()) {
963 // This check should never fail, but to be safe we check
964 if (rawDist.find(virtIdx) != rawDist.end()) {
966 rawDist[virtIdx]);
967 }
968 // increment number of reads to this register
969 vecReads[virtIdx]++;
970 }
971 }
972
973 for (const auto& dstVecOp : ii->dstVecRegOperands()) {
974 for (const auto& virtIdx : dstVecOp.virtIndices()) {
975 // rawDist is set on writes, but will not be set for the first
976 // write to each physical register
977 if (rawDist.find(virtIdx) != rawDist.end()) {
978 // Sample the number of reads that were performed
980 }
981 // on a write, reset count of reads to 0
982 vecReads[virtIdx] = 0;
983
984 rawDist[virtIdx] = stats.numInstrExecuted.value();
985 }
986 }
987
988 if (pc() == old_pc) {
989 // PC not modified by instruction, proceed to next
990 _gpuISA.advancePC(ii);
991 instructionBuffer.pop_front();
992 } else {
993 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave%d %s taken branch\n",
995 ii->disassemble());
996 discardFetch();
997 }
998 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n",
1000
1002 const int num_active_lanes = execMask().count();
1004 computeUnit->stats.numVecOpsExecuted += num_active_lanes;
1005
1006 if (ii->isF16() && ii->isALU()) {
1007 if (ii->isF32() || ii->isF64()) {
1008 fatal("Instruction is tagged as both (1) F16, and (2)"
1009 "either F32 or F64.");
1010 }
1011 computeUnit->stats.numVecOpsExecutedF16 += num_active_lanes;
1012 if (ii->isFMA()) {
1013 computeUnit->stats.numVecOpsExecutedFMA16 += num_active_lanes;
1015 += num_active_lanes;
1016 }
1017 else if (ii->isMAC()) {
1018 computeUnit->stats.numVecOpsExecutedMAC16 += num_active_lanes;
1020 += num_active_lanes;
1021 }
1022 else if (ii->isMAD()) {
1023 computeUnit->stats.numVecOpsExecutedMAD16 += num_active_lanes;
1025 += num_active_lanes;
1026 }
1027 }
1028 if (ii->isF32() && ii->isALU()) {
1029 if (ii->isF16() || ii->isF64()) {
1030 fatal("Instruction is tagged as both (1) F32, and (2)"
1031 "either F16 or F64.");
1032 }
1033 computeUnit->stats.numVecOpsExecutedF32 += num_active_lanes;
1034 if (ii->isFMA()) {
1035 computeUnit->stats.numVecOpsExecutedFMA32 += num_active_lanes;
1037 += num_active_lanes;
1038 }
1039 else if (ii->isMAC()) {
1040 computeUnit->stats.numVecOpsExecutedMAC32 += num_active_lanes;
1042 += num_active_lanes;
1043 }
1044 else if (ii->isMAD()) {
1045 computeUnit->stats.numVecOpsExecutedMAD32 += num_active_lanes;
1047 += num_active_lanes;
1048 }
1049 }
1050 if (ii->isF64() && ii->isALU()) {
1051 if (ii->isF16() || ii->isF32()) {
1052 fatal("Instruction is tagged as both (1) F64, and (2)"
1053 "either F16 or F32.");
1054 }
1055 computeUnit->stats.numVecOpsExecutedF64 += num_active_lanes;
1056 if (ii->isFMA()) {
1057 computeUnit->stats.numVecOpsExecutedFMA64 += num_active_lanes;
1059 += num_active_lanes;
1060 }
1061 else if (ii->isMAC()) {
1062 computeUnit->stats.numVecOpsExecutedMAC64 += num_active_lanes;
1064 += num_active_lanes;
1065 }
1066 else if (ii->isMAD()) {
1067 computeUnit->stats.numVecOpsExecutedMAD64 += num_active_lanes;
1069 += num_active_lanes;
1070 }
1071 }
1072 if (isGmInstruction(ii)) {
1074 num_active_lanes);
1075 } else if (isLmInstruction(ii)) {
1077 num_active_lanes);
1078 }
1079 }
1080
1085 if (execMask().none() && ii->isFlat()) {
1087 return;
1088 }
1089
1090 // Update Vector ALU pipeline and other resources
1091 bool flat_as_gm = false;
1092 bool flat_as_lm = false;
1093 if (ii->isFlat()) {
1094 flat_as_gm = (ii->executedAs() == enums::SC_GLOBAL) ||
1095 (ii->executedAs() == enums::SC_PRIVATE);
1096 flat_as_lm = (ii->executedAs() == enums::SC_GROUP);
1097 }
1098
1099 // Single precision ALU or Branch or Return or Special instruction
1100 // Note, we use the same timing regardless of SP or DP ALU operation.
1101 if (ii->isALU() || ii->isSpecialOp() ||
1102 ii->isBranch() || ii->isNop() ||
1103 (ii->isKernArgSeg() && ii->isLoad()) ||
1104 ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) {
1105 // this is to enforce a fixed number of cycles per issue slot per SIMD
1106 if (!ii->isScalar()) {
1108 cyclesToTicks(computeUnit->issuePeriod));
1109 } else {
1111 cyclesToTicks(computeUnit->issuePeriod));
1112 }
1113 // Barrier on Scalar ALU
1114 } else if (ii->isBarrier()) {
1116 cyclesToTicks(computeUnit->issuePeriod));
1117 // GM or Flat as GM Load
1118 } else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) {
1119 if (!ii->isScalar()) {
1126 } else {
1128 cyclesToTicks(computeUnit->srf_scm_bus_latency));
1133 }
1134 // GM or Flat as GM Store
1135 } else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) {
1136 if (!ii->isScalar()) {
1138 cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
1143 } else {
1145 cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
1150 }
1151 } else if ((ii->isAtomic() || ii->isMemSync()) &&
1152 (ii->isGlobalMem() || flat_as_gm)) {
1153 if (!ii->isScalar()) {
1155 cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
1160 } else {
1162 cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
1167 }
1168 // LM or Flat as LM Load
1169 } else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) {
1171 cyclesToTicks(computeUnit->vrf_lm_bus_latency));
1176 // LM or Flat as LM Store
1177 } else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
1179 cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
1184 // LM or Flat as LM, Atomic or MemFence
1185 } else if ((ii->isAtomic() || ii->isMemSync()) &&
1186 (ii->isLocalMem() || flat_as_lm)) {
1188 cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
1193 } else {
1194 panic("Bad instruction type!\n");
1195 }
1196}
1197
// (Return type and signature elided by the extraction.) Returns the
// oldest instruction that has NOT yet been dispatched: if the front
// instruction is already in the CU's pipeline map, returns the second
// instruction (or nullptr if there is none); otherwise returns the
// front instruction itself.
1200{
1201 // Read next instruction from instruction buffer
1202 GPUDynInstPtr ii = instructionBuffer.front();
1203 // if the WF has been dispatched in the schedule stage then
1204 // check the next oldest instruction for readiness
1205 if (computeUnit->pipeMap.find(ii->seqNum()) !=
1206 computeUnit->pipeMap.end()) {
1207 if (instructionBuffer.size() > 1) {
1208 auto it = instructionBuffer.begin() + 1;
1209 return *it;
1210 } else { // No new instructions to check
1211 return nullptr;
1212 }
1213 }
1214 return ii;
1215}
1216
1217void
1219{
1220 instructionBuffer.clear();
1222
1228}
1229
// (Signature line elided by the extraction.) Checks whether every
// pending waitcnt condition is met. A count of -1 means "not set".
// If none are set, the waitcnt instruction itself has not executed
// yet, so the wavefront must keep blocking. When all set counts are
// satisfied, the counts are cleared and true is returned.
1230bool
1232{
1233 // Both vmWaitCnt && lgkmWaitCnt uninitialized means
1234 // waitCnt instruction has been dispatched but not executed yet: next
1235 // instruction should be blocked until waitCnt is executed.
1236 if (vmWaitCnt == -1 && expWaitCnt == -1 && lgkmWaitCnt == -1) {
1237 return false;
1238 }
1239
1245 if (vmWaitCnt != -1) {
1246 if (vmemInstsIssued > vmWaitCnt) {
1247 // vmWaitCnt not satisfied
1248 return false;
1249 }
1250 }
1251
1252 if (expWaitCnt != -1) {
1253 if (expInstsIssued > expWaitCnt) {
1254 // expWaitCnt not satisfied
1255 return false;
1256 }
1257 }
1258
1259 if (lgkmWaitCnt != -1) {
        // NOTE(review): the extraction dropped original line 1260 here
        // (presumably "if (lgkmInstsIssued > lgkmWaitCnt) {" to match
        // the two checks above) — verify against upstream.
1261 // lgkmWaitCnt not satisfied
1262 return false;
1263 }
1264 }
1265
1266 // if we get here all outstanding waitcnts must
1267 // be satisfied, so we resume normal operation
1268 clearWaitCnts();
1269
1270 return true;
1271}
1272
// (Signature line elided by the extraction.) Decrements the sleep
// counter; returns true only when the counter just reached zero, at
// which point the wavefront resumes running.
1273bool
1275{
1276 assert(status == S_STALLED_SLEEP);
1277
1278 // if the sleep count has not been set, then the sleep instruction has not
1279 // been executed yet, so we will return false without setting the wavefront
1280 // status
1281 if (sleepCnt == 0)
1282 return false;
1283
1284 sleepCnt--;
1285 if (sleepCnt != 0)
1286 return false;
1287
    // Sleep period elapsed — wake the wavefront.
1288 status = S_RUNNING;
1289 return true;
1290}
1291
// (Signature line elided by the extraction.) Arms the sleep counter;
// asserts that no sleep is already pending.
1292void
1294{
1295 assert(sleepCnt == 0);
1296 sleepCnt = sleep_time;
1297}
1298
// Latch the counts carried by a waitcnt instruction. A field set to
// its all-ones maximum means "don't wait on this counter" and leaves
// the corresponding member at -1 (unset).
// NOTE(review): original comment lines 1318-1323 and 1328-1333 were
// elided by the extraction.
1299void
1300Wavefront::setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt)
{
1301{
1343
// (Signature line elided by the extraction.) Invalidates all three
// waitcnt fields and returns the wavefront to the running state.
1344void
1346{
1347 // reset the waitcnts back to
1348 // -1, indicating they are no
1349 // longer valid
1350 vmWaitCnt = -1;
1351 expWaitCnt = -1;
1352 lgkmWaitCnt = -1;
1353
1354 // resume running normally
1355 status = S_RUNNING;
1356}
1357
1358void
1360{
1362}
1363
1364void
1366{
1368}
1369
1370void
1372{
1374}
1375
1376void
1378{
1380}
1381
1382void
1384{
1386}
1387
1388void
1390{
1392}
1393
// (Signature line elided by the extraction.) Accessor for the
// wavefront's current program counter.
1394Addr
1396{
1397 return _pc;
1398}
1399
// (Signature line elided by the extraction.) Sets the wavefront's
// program counter to new_pc.
1400void
1402{
1403 _pc = new_pc;
1404}
1405
// (Return type and signature elided by the extraction.) Accessor
// returning the full per-lane execute mask.
1408{
1409 return _execMask;
1410}
1411
1412bool
1413Wavefront::execMask(int lane) const
1414{
1415 return _execMask[lane];
1416}
1417
// (Signature line elided by the extraction.) Releases this wavefront's
// vector registers: clears the busy bit on every mapped VGPR, then
// frees the contiguous (wrap-around) region reserved at dispatch.
1418void
1420{
1421 /* clear busy registers */
1422 for (int i=0; i < maxVgprs; i++) {
1423 int vgprIdx = computeUnit->registerManager->mapVgpr(this, i);
1424 computeUnit->vrf[simdId]->markReg(vgprIdx, false);
1425 }
1426
1427 /* Free registers used by this wavefront */
    // endIndex wraps modulo the VRF size because the reserved region
    // may straddle the end of the physical register file.
1428 uint32_t endIndex = (startVgprIndex + reservedVectorRegs - 1) %
1429 computeUnit->vrf[simdId]->numRegs();
    // NOTE(review): original line 1430 was elided by the extraction
    // (presumably the pool-manager object on which freeRegion is
    // invoked) — verify against upstream.
1431 freeRegion(startVgprIndex, endIndex);
1432}
1433
// (Signature line elided by the extraction.) Computes the actual
// work-group size in each dimension, clamping the nominal size for the
// final (possibly partial) work group at the grid edge.
// NOTE(review): original line 1441 was elided by the extraction
// (presumably accumulating actualWgSzTotal *= actualWgSz[d]) — verify
// against upstream.
1434void
1436{
1437 actualWgSzTotal = 1;
1438 for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
1439 actualWgSz[d] = std::min(workGroupSz[d], gridSz[d]
1440 - task->wgId(d) * workGroupSz[d]);
1442 }
1443}
1444
// (Signature line elided by the extraction.) Assigns this wavefront a
// barrier slot id, validated against the CU's number of barrier slots.
1445void
1447{
1448 assert(bar_id >= WFBarrier::InvalidID);
1449 assert(bar_id < computeUnit->numBarrierSlots());
1450 barId = bar_id;
1451}
1452
// (Signature line elided by the extraction.) Accessor for this
// wavefront's barrier slot id.
1453int
1455{
1456 return barId;
1457}
1458
// (Signature line elided by the extraction.) True when this wavefront
// currently holds a valid barrier slot.
1459bool
1461{
1462 return barId > WFBarrier::InvalidID;
1463}
1464
1465void
1467{
1469}
1470
// WavefrontStats constructor (signature line elided by the
// extraction). Registers the per-wavefront-slot statistics with the
// stats framework and initializes the two distribution stats:
// vecRawDistance buckets 0..20 step 1, readsPerWrite buckets 0..4
// step 1.
1472 : statistics::Group(parent),
1473 ADD_STAT(numInstrExecuted,
1474 "number of instructions executed by this WF slot"),
1475 ADD_STAT(schCycles, "number of cycles spent in schedule stage"),
1476 ADD_STAT(schStalls, "number of cycles WF is stalled in SCH stage"),
1477 ADD_STAT(schRfAccessStalls, "number of cycles wave selected in SCH but "
1478 "RF denied adding instruction"),
1479 ADD_STAT(schResourceStalls, "number of cycles stalled in sch by resource"
1480 " not available"),
1481 ADD_STAT(schOpdNrdyStalls, "number of cycles stalled in sch waiting for "
1482 "RF reads to complete"),
1483 ADD_STAT(schLdsArbStalls,
1484 "number of cycles wave stalled due to LDS-VRF arbitration"),
1485 // FIXME: the name of the WF needs to be unique
1486 ADD_STAT(numTimesBlockedDueWAXDependencies, "number of times the wf's "
1487 "instructions are blocked due to WAW or WAR dependencies"),
1488 // FIXME: the name of the WF needs to be unique
1489 ADD_STAT(numTimesBlockedDueRAWDependencies, "number of times the wf's "
1490 "instructions are blocked due to RAW dependencies"),
1491 ADD_STAT(vecRawDistance,
1492 "Count of RAW distance in dynamic instructions for this WF"),
1493 ADD_STAT(readsPerWrite, "Count of Vector reads per write for this WF")
1494{
1495 vecRawDistance.init(0, 20, 1);
1496 readsPerWrite.init(0, 4, 1);
1497}
1498
1499} // namespace gem5
#define DPRINTF(x,...)
Definition trace.hh:210
Tick cyclesToTicks(Cycles c) const
int mapWaveToScalarAlu(Wavefront *w) const
std::vector< WaitClass > scalarALUs
WaitClass scalarMemUnit
std::vector< uint64_t > instExecPerSimd
std::unordered_set< uint64_t > pipeMap
void updateInstStats(GPUDynInstPtr gpuDynInst)
WaitClass vectorGlobalMemUnit
int mapWaveToGlobalMem(Wavefront *w) const
int mapWaveToLocalMem(Wavefront *w) const
WaitClass vrfToLocalMemPipeBus
TokenManager * getTokenManager()
WaitClass srfToScalarMemPipeBus
std::vector< uint64_t > lastExecCycle
std::vector< ScalarRegisterFile * > srf
std::vector< WaitClass > vectorALUs
int mapWaveToScalarMem(Wavefront *w) const
RegisterManager * registerManager
WaitClass vectorSharedMemUnit
int mapWaveToScalarAluGlobalIdx(Wavefront *w) const
std::vector< VectorRegisterFile * > vrf
FetchStage fetchStage
WaitClass vrfToGlobalMemPipeBus
void deleteFromPipeMap(Wavefront *w)
gem5::ComputeUnit::ComputeUnitStats stats
Cycles is a wrapper class for representing cycle counts, i.e.
Definition types.hh:79
FetchUnit & fetchUnit(int simdId)
void flushBuf(int wfSlotId)
_amd_queue_t amdQueue
Keep a copy of the AMD HSA queue because we need info from some of its fields to initialize register ...
bool sgprBitEnabled(int bit) const
int wgId(int dim) const
Addr hostDispPktAddr() const
static const int MAX_DIM
int wgSize(int dim) const
Addr hostAMDQueueAddr
Host-side addr of the amd_queue_t on which this task was queued.
bool vgprBitEnabled(int bit) const
int gridSize(int dim) const
int mapVgpr(Wavefront *w, int vgprIndex)
std::vector< PoolManager * > vrfPoolMgrs
int mapSgpr(Wavefront *w, int sgprIndex)
hsail_mode_e hsail_mode
Definition shader.hh:222
void initShHiddenPrivateBase(Addr queueBase, uint32_t offset)
Definition shader.hh:197
void incVectorInstDstOperand(int num_operands)
Definition shader.hh:312
void incVectorInstSrcOperand(int num_operands)
Definition shader.hh:306
Abstract superclass for simulation objects.
void recvTokens(int num_tokens)
Increment the number of available tokens by num_tokens.
WF barrier slots.
static const int InvalidID
void set(uint64_t i)
Definition misc.hh:82
uint32_t maxSgprs
Definition wavefront.hh:133
status_e status
Definition wavefront.hh:328
bool isOldestInstWaitcnt()
Definition wavefront.cc:634
Addr pc() const
bool hasBarrier() const
VectorMask _execMask
Definition wavefront.hh:330
uint32_t actualWgSzTotal
Definition wavefront.hh:164
void reserveGmResource(GPUDynInstPtr ii)
Definition wavefront.cc:792
uint64_t oldVgprTcnt
Definition wavefront.hh:209
std::vector< Addr > lastAddr
Definition wavefront.hh:153
void setStatus(status_e newStatus)
Definition wavefront.cc:550
bool waitCntsSatisfied()
void validateRequestCounters()
Definition wavefront.cc:778
const int simdId
Definition wavefront.hh:99
bool isOldestInstLMem()
Definition wavefront.cc:720
bool isOldestInstPrivMem()
Definition wavefront.cc:733
bool isOldestInstScalarMem()
Definition wavefront.cc:707
uint64_t oldDgprTcnt
Definition wavefront.hh:216
Wavefront(const Params &p)
Definition wavefront.cc:48
bool isOldestInstBarrier()
Definition wavefront.cc:681
void resizeRegFiles(int num_vregs, int num_sregs)
Definition wavefront.cc:539
int scalarOutstandingReqsWrGm
Definition wavefront.hh:183
uint32_t gridSz[3]
Definition wavefront.hh:159
void decExpInstsIssued()
std::vector< uint32_t > oldVgpr
Definition wavefront.hh:205
void initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
Definition wavefront.cc:117
void setSleepTime(int sleep_time)
ComputeUnit * computeUnit
Definition wavefront.hh:106
std::vector< uint32_t > workItemFlatId
Definition wavefront.hh:155
int vmWaitCnt
the following are used for waitcnt instructions vmWaitCnt: once set, we wait for the oustanding numbe...
Definition wavefront.hh:321
std::vector< int > vecReads
Definition wavefront.hh:237
std::deque< GPUDynInstPtr > instructionBuffer
Definition wavefront.hh:109
bool isOldestInstSleep()
Definition wavefront.cc:620
bool isLmInstruction(GPUDynInstPtr ii)
Definition wavefront.cc:609
GPUDynInstPtr nextInstr()
uint64_t lastTrace
Definition wavefront.hh:192
std::vector< uint32_t > workItemId[3]
Definition wavefront.hh:154
std::vector< uint64_t > oldDgpr
Definition wavefront.hh:212
bool isOldestInstScalarALU()
Definition wavefront.cc:651
void releaseBarrier()
bool isOldestInstFlatMem()
Definition wavefront.cc:746
WavefrontParams Params
Definition wavefront.hh:244
uint32_t maxVgprs
Definition wavefront.hh:131
void decVMemInstsIssued()
void computeActualWgSz(HSAQueueEntry *task)
uint32_t workGroupId[3]
Definition wavefront.hh:157
void setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt)
const int wfSlotId
Definition wavefront.hh:96
std::unordered_map< int, uint64_t > rawDist
Definition wavefront.hh:233
void incExpInstsIssued()
std::vector< int > reserveResources()
Definition wavefront.cc:840
uint32_t startSgprIndex
Definition wavefront.hh:202
void decLGKMInstsIssued()
void incLGKMInstsIssued()
int barrierId() const
virtual void init()
init() is called after all C++ SimObjects have been created and all ports are connected.
Definition wavefront.cc:102
uint32_t workGroupSz[3]
Definition wavefront.hh:158
bool isOldestInstVectorALU()
Definition wavefront.cc:666
uint64_t lastInstExec
Definition wavefront.hh:229
LdsChunk * ldsChunk
Definition wavefront.hh:223
uint32_t actualWgSz[3]
Definition wavefront.hh:163
int scalarOutstandingReqsRdGm
Definition wavefront.hh:181
void freeResources()
Definition wavefront.cc:773
void incVMemInstsIssued()
void reserveLmResource(GPUDynInstPtr ii)
Definition wavefront.cc:822
@ S_BARRIER
WF is stalled at a barrier.
Definition wavefront.hh:92
@ S_WAITCNT
wavefront has unsatisfied wait counts
Definition wavefront.hh:88
bool isOldestInstGMem()
Definition wavefront.cc:694
gem5::Wavefront::WavefrontStats stats
VectorMask & execMask()
uint64_t wfDynId
Definition wavefront.hh:226
void freeRegisterFile()
Freeing VRF space.
bool isGmInstruction(GPUDynInstPtr ii)
Definition wavefront.cc:598
uint32_t startVgprIndex
Definition wavefront.hh:199
void start(uint64_t _wfDynId, uint64_t _base_ptr)
Definition wavefront.cc:587
TheGpuISA::GPUISA _gpuISA
Definition wavefront.hh:300
void sample(const U &v, int n=1)
Add a value to the distribtion n times.
Distribution & init(Counter min, Counter max, Counter bkt)
Set the parameters of this distribution.
Statistics container.
Definition group.hh:93
Counter value() const
Return the current value of this stat as its base type.
STL vector class.
Definition stl.hh:37
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
Definition group.hh:75
static constexpr T divCeil(const T &a, const U &b)
Definition intmath.hh:110
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
Definition bitfield.hh:76
#define panic(...)
This implements a cprintf based panic() function.
Definition logging.hh:188
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
Definition logging.hh:236
#define fatal(...)
This implements a cprintf based fatal() function.
Definition logging.hh:200
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Definition logging.hh:214
Bitfield< 7 > i
Definition misc_types.hh:67
Bitfield< 12, 11 > set
Bitfield< 9 > d
Definition misc_types.hh:64
Bitfield< 0 > p
const FlagsType none
Nothing extra to print.
Definition info.hh:53
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
static void init_pc(py::module_ &m_native)
Definition core.cc:168
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition misc.hh:49
Tick curTick()
The universal simulation clock.
Definition cur_tick.hh:46
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition types.hh:147
std::bitset< std::numeric_limits< unsigned long long >::digits > VectorMask
Definition misc.hh:48
@ GridWorkgroupCountZ
@ WorkgroupIdX
@ DispatchId
@ NumScalarInitFields
@ DispatchPtr
@ QueuePtr
@ PrivSegWaveByteOffset
@ PrivateSegBuf
@ WorkgroupIdY
@ PrivateSegSize
@ WorkgroupInfo
@ GridWorkgroupCountY
@ WorkgroupIdZ
@ GridWorkgroupCountX
@ FlatScratchInit
@ KernargSegPtr
@ WorkitemIdX
@ WorkitemIdZ
@ NumVectorInitFields
@ WorkitemIdY
statistics::Distribution activeLanesPerLMemInstrDist
statistics::VectorDistribution instInterleave
statistics::Scalar numVecOpsExecutedTwoOpFP
statistics::Distribution controlFlowDivergenceDist
statistics::Distribution activeLanesPerGMemInstrDist
statistics::Distribution execRateDist
WavefrontStats(statistics::Group *parent)
statistics::Distribution vecRawDistance
Definition wavefront.hh:372
statistics::Distribution readsPerWrite
Definition wavefront.hh:376
statistics::Scalar numInstrExecuted
Definition wavefront.hh:340
uint32_t scratch_workitem_byte_size
Definition hsa_queue.hh:84
uint32_t compute_tmpring_size_wavesize
Definition hsa_queue.hh:79
uint64_t scratch_backing_memory_location
Definition hsa_queue.hh:82
uint32_t scratch_resource_descriptor[4]
Definition hsa_queue.hh:81

Generated on Mon Jul 10 2023 15:32:03 for gem5 by doxygen 1.9.7