gem5 v24.0.0.0
wavefront.cc
/*
 * Copyright (c) 2011-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/wavefront.hh"

#include "base/bitfield.hh"
#include "debug/GPUExec.hh"
#include "debug/GPUInitAbi.hh"
#include "debug/WavefrontStack.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/register_file_cache.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/simple_pool_manager.hh"
#include "gpu-compute/vector_register_file.hh"

namespace gem5
{

Wavefront::Wavefront(const Params &p)
    : SimObject(p), wfSlotId(p.wf_slot_id), simdId(p.simdId),
      maxIbSize(p.max_ib_size), _gpuISA(*this),
      vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
      vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
      sleepCnt(0), barId(WFBarrier::InvalidID), stats(this)
{
    lastTrace = 0;
    execUnitId = -1;
    // ... (zero-initialization of the wavefront status and the
    // outstanding/in-pipe memory request counters elided in this listing)
    ldsChunk = nullptr;

    memTraceBusy = 0;
    oldVgprTcnt = 0xffffffffffffffffll;
    oldDgprTcnt = 0xffffffffffffffffll;
    oldVgpr.resize(p.wf_size);

    pendingFetch = false;
    dropFetch = false;
    maxVgprs = 0;
    maxSgprs = 0;

    lastAddr.resize(p.wf_size);
    workItemFlatId.resize(p.wf_size);
    oldDgpr.resize(p.wf_size);
    for (int i = 0; i < 3; ++i) {
        workItemId[i].resize(p.wf_size);
    }

    _execMask.set();
    rawDist.clear();
    lastInstExec = 0;
    vecReads.clear();
}

void
Wavefront::init()
{
    // map this wavefront to its execution resources on the compute unit
    scalarAlu = computeUnit->mapWaveToScalarAlu(this);
    scalarAluGlobalIdx = computeUnit->mapWaveToScalarAluGlobalIdx(this);
    globalMem = computeUnit->mapWaveToGlobalMem(this);
    localMem = computeUnit->mapWaveToLocalMem(this);
    scalarMem = computeUnit->mapWaveToScalarMem(this);
}

void
Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
{
    int regInitIdx = 0;
    gfxVersion = task->gfxVersion();

    // Iterate over all the init fields and check which
    // bits are enabled. Useful information can be found here:
    // https://github.com/ROCm-Developer-Tools/ROCm-ComputeABI-Doc/
    // blob/master/AMDGPU-ABI.md
    for (int en_bit = 0; en_bit < NumScalarInitFields; ++en_bit) {

        if (task->sgprBitEnabled(en_bit)) {
            int physSgprIdx = 0;
            uint32_t firstWave = 0;
            int orderedAppendTerm = 0;
            int numWfsInWg = 0;
            uint32_t finalValue = 0;
            Addr host_disp_pkt_addr = task->hostDispPktAddr();
            Addr kernarg_addr = task->kernargAddr();
            Addr hidden_priv_base(0);

            switch (en_bit) {
            case PrivateSegBuf:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[0]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[0]);

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[1]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[1]);

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[2]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[2]);

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[3]);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[3]);
                break;
            case DispatchPtr:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(host_disp_pkt_addr, 31, 0));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting DispatchPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(host_disp_pkt_addr, 31, 0));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(host_disp_pkt_addr, 63, 32));
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting DispatchPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(host_disp_pkt_addr, 63, 32));

                ++regInitIdx;
                break;
            case QueuePtr:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(task->hostAMDQueueAddr, 31, 0));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting QueuePtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(task->hostAMDQueueAddr, 31, 0));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(task->hostAMDQueueAddr, 63, 32));
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting QueuePtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(task->hostAMDQueueAddr, 63, 32));

                ++regInitIdx;
                break;
            case KernargSegPtr:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(kernarg_addr, 31, 0));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting KernargSegPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(kernarg_addr, 31, 0));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(kernarg_addr, 63, 32));
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting KernargSegPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(kernarg_addr, 63, 32));

                ++regInitIdx;
                break;
            case DispatchId:
                physSgprIdx
                    = computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->dispatchId());
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting DispatchId: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->dispatchId());

                // Dispatch ID in gem5 is an int. Set upper 32-bits to zero.
                physSgprIdx
                    = computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx, 0);
                ++regInitIdx;
                break;
            case FlatScratchInit:
                physSgprIdx
                    = computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        (TheGpuISA::ScalarRegU32)(task->amdQueue
                            .scratch_backing_memory_location & 0xffffffff));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting FlatScratch Addr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        (TheGpuISA::ScalarRegU32)(task->amdQueue
                            .scratch_backing_memory_location & 0xffffffff));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                // This value should be sizeof(DWORD) aligned, that is
                // 4 byte aligned
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_workitem_byte_size);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting FlatScratch size: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_workitem_byte_size);

                // The 48-bit scratch base address for this queue lives in
                // the first two DWORDs of the scratch resource descriptor;
                // use it to initialize the hidden private base needed by
                // flat memory instructions.
                hidden_priv_base =
                    (uint64_t)task->amdQueue.scratch_resource_descriptor[0] |
                    (((uint64_t)task->amdQueue.scratch_resource_descriptor[1]
                        & 0x000000000000ffff) << 32);
                computeUnit->shader->initShHiddenPrivateBase(
                        hidden_priv_base,
                        task->amdQueue.scratch_first_wave_offset);
                break;
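                // (Worked example with illustrative values: if
                // scratch_resource_descriptor[0] = 0x20000000 and
                // scratch_resource_descriptor[1] = 0x800000ff, then
                // hidden_priv_base = 0xff20000000 above; descriptor[1]'s
                // low 16 bits supply bits [47:32] of the 48-bit base.)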
            case PrivateSegSize:
                physSgprIdx
                    = computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->privMemPerItem());
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting private segment size: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->privMemPerItem());
                break;
            case WorkgroupIdX:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        workGroupId[0]);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG ID X: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, workGroupId[0]);
                break;
            case WorkgroupIdY:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        workGroupId[1]);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG ID Y: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, workGroupId[1]);
                break;
            case WorkgroupIdZ:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        workGroupId[2]);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG ID Z: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, workGroupId[2]);
                break;
            case PrivSegWaveByteOffset:

                // For architected flat scratch, this enable is reused to set
                // the FLAT_SCRATCH register pair to the scratch backing
                // memory: https://llvm.org/docs/AMDGPUUsage.html#flat-scratch
                if (task->gfxVersion() == GfxVersion::gfx942) {
                    archFlatScratchAddr =
                        task->amdQueue.scratch_backing_memory_location;

                    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                            "Setting architected flat scratch = %x\n",
                            computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                            archFlatScratchAddr);

                    break;
                }

                // Not architected flat scratch. Write the scratch wavefront
                // offset: https://llvm.org/docs/AMDGPUUsage.html
                // #amdgpu-amdhsa-initial-kernel-execution-state
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);

                // compute_tmpring_size_wavesize gives the scratch allocated
                // per wavefront in KiB (hence the multiply by 1024); the
                // global wave ID (wgId * waves-per-WG + wfId) selects this
                // wave's slice of the scratch backing memory.
                computeUnit->srf[simdId]->write(physSgprIdx, 1024 *
                    (wgId * (wgSz / 64) + wfId) *
                    task->amdQueue.compute_tmpring_size_wavesize);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting Private Seg Offset: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        1024 * (wgId * (wgSz / 64) + wfId) *
                        task->amdQueue.compute_tmpring_size_wavesize);
                break;
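                // (Worked example, assuming 64-wide waves as the division
                // by 64 implies: a 128-work-item workgroup has wgSz / 64 = 2
                // waves; wave wfId = 1 of workgroup wgId = 3 with
                // compute_tmpring_size_wavesize = 2 KiB-units gets offset
                // 1024 * (3 * 2 + 1) * 2 = 14336 bytes into scratch.)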
            case WorkgroupInfo:
                firstWave = (wfId == 0) ? 1 : 0;
                numWfsInWg = divCeil(wgSizeInWorkItems,
                                     computeUnit->wfSize());
                finalValue = firstWave << ((sizeof(uint32_t) * 8) - 1);
                finalValue |= (orderedAppendTerm << 6);
                finalValue |= numWfsInWg;
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->
                    write(physSgprIdx, finalValue);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG Info: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, finalValue);
                break;
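                // (Worked example: the first wave (firstWave = 1) of a
                // 256-work-item workgroup with 64-wide waves yields
                // numWfsInWg = divCeil(256, 64) = 4, so finalValue =
                // (1u << 31) | 4 = 0x80000004.)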
            default:
                fatal("SGPR enable bit %i not supported\n", en_bit);
                break;
            }
        }
    }

    // Save the offset to the first accumulation VGPR number from HSA task.
    accumOffset = task->accumOffset();

    regInitIdx = 0;

    // VGPRs are initialized to the work item IDs for a given thread. There
    // are two ways to initialize the IDs based on number of dimensions. ISAs
    // will either have packed work-item IDs or not. LLVM lists them here:
    // https://llvm.org/docs/AMDGPUUsage.html#amdgpu-processor-table
    // Default to false and set to true for gem5 supported ISAs.
    bool packed_work_item_id = false;

    if (task->gfxVersion() == GfxVersion::gfx90a ||
        task->gfxVersion() == GfxVersion::gfx942) {
        packed_work_item_id = true;
    }

    // For ISAs with packed work item IDs, only one VGPR is used and the
    // (X,Y,Z) dimensions are packed into a single 32-bit VGPR with 10-bits
    // for each dimension
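    // (Example: a work item with (X, Y, Z) = (5, 2, 1) packs to
    // 5 | (2 << 10) | (1 << 20) = 0x00100805 in its lane of the packed
    // VGPR.)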
    if (packed_work_item_id) {
        TheGpuISA::VecRegContainerU32 raw_vgpr;
        TheGpuISA::VecElemU32 *packed_vgpr
            = raw_vgpr.as<TheGpuISA::VecElemU32>();

        uint32_t physVgprIdx = computeUnit->registerManager
            ->mapVgpr(this, regInitIdx);
        for (int lane = 0; lane < workItemId[0].size(); ++lane) {
            packed_vgpr[lane] = workItemId[0][lane] & 0x3ff;
        }
        if (task->vgprBitEnabled(1)) {
            for (int lane = 0; lane < workItemId[1].size(); ++lane) {
                packed_vgpr[lane] |= ((workItemId[1][lane] & 0x3ff) << 10);
            }
        }
        if (task->vgprBitEnabled(2)) {
            for (int lane = 0; lane < workItemId[2].size(); ++lane) {
                packed_vgpr[lane] |= ((workItemId[2][lane] & 0x3ff) << 20);
            }
        }
        computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);

        return;
    }

    // For ISAs with non-packed work item IDs, map and initialize one VGPR
    // per dimension. Do this by iterating over all the init fields and
    // checking which bits are enabled.
    for (int en_bit = 0; en_bit < NumVectorInitFields; ++en_bit) {
        if (task->vgprBitEnabled(en_bit)) {
            uint32_t physVgprIdx = 0;
            TheGpuISA::VecRegContainerU32 raw_vgpr;

            switch (en_bit) {
            case WorkitemIdX:
              {
                physVgprIdx = computeUnit->registerManager
                    ->mapVgpr(this, regInitIdx);
                TheGpuISA::VecElemU32 *vgpr_x
                    = raw_vgpr.as<TheGpuISA::VecElemU32>();

                for (int lane = 0; lane < workItemId[0].size(); ++lane) {
                    vgpr_x[lane] = workItemId[0][lane];
                }

                computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
                rawDist[regInitIdx] = 0;
                ++regInitIdx;
              }
              break;
            case WorkitemIdY:
              {
                physVgprIdx = computeUnit->registerManager
                    ->mapVgpr(this, regInitIdx);
                TheGpuISA::VecElemU32 *vgpr_y
                    = raw_vgpr.as<TheGpuISA::VecElemU32>();

                for (int lane = 0; lane < workItemId[1].size(); ++lane) {
                    vgpr_y[lane] = workItemId[1][lane];
                }

                computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
                rawDist[regInitIdx] = 0;
                ++regInitIdx;
              }
              break;
            case WorkitemIdZ:
              {
                physVgprIdx = computeUnit->registerManager->
                    mapVgpr(this, regInitIdx);
                TheGpuISA::VecElemU32 *vgpr_z
                    = raw_vgpr.as<TheGpuISA::VecElemU32>();

                for (int lane = 0; lane < workItemId[2].size(); ++lane) {
                    vgpr_z[lane] = workItemId[2][lane];
                }

                computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
                rawDist[regInitIdx] = 0;
                ++regInitIdx;
              }
              break;
            }
        }
    }
}

void
Wavefront::resizeRegFiles(int num_vregs, int num_sregs)
{
    maxVgprs = num_vregs;
    maxSgprs = num_sregs;
}

void
Wavefront::setStatus(status_e newStatus)
{
    if (computeUnit->idleCUTimeout > 0) {
        // Wavefront's status transitions to stalled or stopped
        if ((newStatus == S_STOPPED || newStatus == S_STALLED ||
             newStatus == S_WAITCNT || newStatus == S_BARRIER) &&
            (status != newStatus)) {
            computeUnit->idleWfs++;
            assert(computeUnit->idleWfs <=
                   (computeUnit->shader->n_wf * computeUnit->numVectorALUs));
            if (computeUnit->idleWfs ==
                (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
                lastNonIdleTick = curTick();
            }
            // Wavefront's status transitions to an active state (from
            // a stopped or stalled state)
        } else if ((status == S_STOPPED || status == S_STALLED ||
                    status == S_WAITCNT || status == S_BARRIER) &&
                   (status != newStatus)) {
            // if all WFs in the CU were idle then check if the idleness
            // period exceeded the timeout threshold
            if (computeUnit->idleWfs ==
                (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
                panic_if((curTick() - lastNonIdleTick) >=
                         computeUnit->idleCUTimeout,
                         "CU%d has been idle for %d ticks at tick %d",
                         computeUnit->cu_id, curTick() - lastNonIdleTick,
                         curTick());
            }
            computeUnit->idleWfs--;
            assert(computeUnit->idleWfs >= 0);
        }
    }
    status = newStatus;
}

void
Wavefront::start(uint64_t _wf_dyn_id, Addr init_pc)
{
    wfDynId = _wf_dyn_id;
    _pc = init_pc;

    status = S_RUNNING;

    vecReads.resize(maxVgprs, 0);
}

bool
Wavefront::isGmInstruction(GPUDynInstPtr ii)
{
    if (ii->isGlobalMem() ||
        (ii->isFlat() && ii->executedAs() == enums::SC_GLOBAL)) {
        return true;
    }

    return false;
}

bool
Wavefront::isLmInstruction(GPUDynInstPtr ii)
{
    if (ii->isLocalMem() ||
        (ii->isFlat() && ii->executedAs() == enums::SC_GROUP)) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstSleep()
{
    if (instructionBuffer.empty())
        return false;

    GPUDynInstPtr ii = instructionBuffer.front();

    if (ii->isSleep()) {
        return true;
    }
    return false;
}

bool
Wavefront::isOldestInstWaitcnt()
{
    if (instructionBuffer.empty())
        return false;

    GPUDynInstPtr ii = instructionBuffer.front();

    if (ii->isWaitcnt()) {
        // waitcnt is a scalar
        assert(ii->isScalar());
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstScalarALU()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn()
        || ii->isEndOfKernel() || ii->isBranch() || ii->isALU() ||
        (ii->isKernArgSeg() && ii->isLoad()))) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstVectorALU()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && !ii->isScalar() && (ii->isNop() ||
        ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel()
        || (ii->isKernArgSeg() && ii->isLoad()))) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstBarrier()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isBarrier()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstGMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && !ii->isScalar() && ii->isGlobalMem()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstScalarMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isScalar() && ii->isGlobalMem()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstLMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isLocalMem()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstPrivMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isPrivateSeg()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstFlatMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isFlat()) {
        return true;
    }

    return false;
}

bool
Wavefront::instructionBufferHasBranch()
{
    for (auto it : instructionBuffer) {
        GPUDynInstPtr ii = it;
        if (ii->isReturn() || ii->isBranch() ||
            ii->isEndOfKernel()) {
            return true;
        }
    }

    return false;
}

void
Wavefront::freeResources()
{
    execUnitId = -1;
}

void
Wavefront::validateRequestCounters()
{
    panic_if(wrGmReqsInPipe < 0 || rdGmReqsInPipe < 0 ||
             wrLmReqsInPipe < 0 || rdLmReqsInPipe < 0 ||
             outstandingReqs < 0,
             "Negative requests in pipe for WF%d for slot%d"
             " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d,"
             " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d,"
             " Outstanding Reqs=%d\n",
             wfDynId, wfSlotId, simdId, rdGmReqsInPipe, wrGmReqsInPipe,
             rdLmReqsInPipe, wrLmReqsInPipe, outstandingReqs);
}

void
Wavefront::reserveGmResource(GPUDynInstPtr ii)
{
    if (!ii->isScalar()) {
        if (ii->isLoad()) {
            rdGmReqsInPipe++;
        } else if (ii->isStore()) {
            wrGmReqsInPipe++;
        } else if (ii->isAtomic() || ii->isMemSync()) {
            rdGmReqsInPipe++;
            wrGmReqsInPipe++;
        } else {
            panic("Invalid memory operation!\n");
        }
        execUnitId = globalMem;
    } else {
        if (ii->isLoad()) {
            scalarRdGmReqsInPipe++;
        } else if (ii->isStore()) {
            scalarWrGmReqsInPipe++;
        } else if (ii->isAtomic() || ii->isMemSync()) {
            scalarWrGmReqsInPipe++;
            scalarRdGmReqsInPipe++;
        } else {
            panic("Invalid memory operation!\n");
        }
        execUnitId = scalarMem;
    }
}

void
Wavefront::reserveLmResource(GPUDynInstPtr ii)
{
    fatal_if(ii->isScalar(),
             "Scalar instructions can not access Shared memory!!!");
    if (ii->isLoad()) {
        rdLmReqsInPipe++;
    } else if (ii->isStore()) {
        wrLmReqsInPipe++;
    } else if (ii->isAtomic() || ii->isMemSync()) {
        wrLmReqsInPipe++;
        rdLmReqsInPipe++;
    } else {
        panic("Invalid memory operation!\n");
    }
    execUnitId = localMem;
}

std::vector<int>
Wavefront::reserveResources()
{
    // vector of execution unit IDs to return to schedule stage
    // this return is only used for debugging and an assertion...
    std::vector<int> execUnitIds;

    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();
    assert(ii);

    // Single precision ALU or Branch or Return or Special instruction
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() || ii->isNop() ||
        (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
        ii->isReturn() || ii->isEndOfKernel()) {
        if (!ii->isScalar()) {
            execUnitId = simdId;
        } else {
            execUnitId = scalarAlu;
        }
        // this is to enforce a fixed number of cycles per issue slot per SIMD
    } else if (ii->isBarrier()) {
        execUnitId = ii->isScalar() ? scalarAluGlobalIdx : simdId;
    } else if (ii->isFlat()) {
        assert(!ii->isScalar());
        reserveLmResource(ii);
        // add execUnitId (reserved by reserveLmResource) to the list
        // before it is overwritten by reserveGmResource
        execUnitIds.push_back(execUnitId);
        flatLmUnitId = execUnitId;
        reserveGmResource(ii);
        flatGmUnitId = execUnitId;
        execUnitIds.push_back(flatGmUnitId);
        execUnitId = -1;
    } else if (ii->isGlobalMem()) {
        reserveGmResource(ii);
    } else if (ii->isLocalMem()) {
        reserveLmResource(ii);
    } else if (ii->isPrivateSeg()) {
        fatal_if(ii->isScalar(),
                 "Scalar instructions can not access Private memory!!!");
        reserveGmResource(ii);
    } else {
        panic("reserveResources -> Couldn't process op!\n");
    }

    if (execUnitId != -1) {
        execUnitIds.push_back(execUnitId);
    }
    assert(execUnitIds.size());
    return execUnitIds;
}

void
Wavefront::exec()
{
    // ---- Exit if wavefront is inactive ----------------------------- //

    if (status == S_STOPPED || status == S_RETURNING ||
        status == S_STALLED || instructionBuffer.empty()) {
        return;
    }

    if (status == S_WAITCNT) {
        /**
         * A wave in S_WAITCNT may enter exec() only to execute the
         * waitcnt instruction itself, so the oldest instruction in the
         * buffer must be the waitcnt.
         */
        assert(isOldestInstWaitcnt());
    }

    // Get current instruction

    GPUDynInstPtr ii = instructionBuffer.front();

    const Addr old_pc = pc();
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
            "(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId,
            wfDynId, ii->disassemble(), old_pc, ii->seqNum());

    ii->execute(ii);
    // delete the dynamic instruction from the pipeline map
    computeUnit->deleteFromPipeMap(this);
    // update the instruction stats in the CU
    computeUnit->updateInstStats(ii);

    // inform VRF of instruction execution to schedule write-back
    // and scoreboard ready for registers
    if (!ii->isScalar()) {
        computeUnit->rfc[simdId]->waveExecuteInst(this, ii);
        computeUnit->vrf[simdId]->waveExecuteInst(this, ii);
    }
    computeUnit->srf[simdId]->waveExecuteInst(this, ii);

    computeUnit->shader->incVectorInstSrcOperand(ii->numSrcVecRegOperands());
    computeUnit->shader->incVectorInstDstOperand(ii->numDstVecRegOperands());
    computeUnit->stats.numInstrExecuted++;
    stats.numInstrExecuted++;
    computeUnit->instExecPerSimd[simdId]++;
    computeUnit->stats.execRateDist.sample(
        computeUnit->stats.totalCycles.value() -
        computeUnit->lastExecCycle[simdId]);
    computeUnit->lastExecCycle[simdId] =
        computeUnit->stats.totalCycles.value();

    if (lastInstExec) {
        computeUnit->stats.instInterleave[simdId].
            sample(computeUnit->instExecPerSimd[simdId] - lastInstExec);
    }
    lastInstExec = computeUnit->instExecPerSimd[simdId];

    // want to track:
    // number of reads that occur per value written

    // vector RAW dependency tracking
    for (const auto& srcVecOp : ii->srcVecRegOperands()) {
        for (const auto& virtIdx : srcVecOp.virtIndices()) {
            // This check should never fail, but to be safe we check
            if (rawDist.find(virtIdx) != rawDist.end()) {
                stats.vecRawDistance.sample(stats.numInstrExecuted.value() -
                                            rawDist[virtIdx]);
            }
            // increment number of reads to this register
            vecReads[virtIdx]++;
        }
    }

    for (const auto& dstVecOp : ii->dstVecRegOperands()) {
        for (const auto& virtIdx : dstVecOp.virtIndices()) {
            // rawDist is set on writes, but will not be set for the first
            // write to each physical register
            if (rawDist.find(virtIdx) != rawDist.end()) {
                // Sample the number of reads that were performed
                stats.readsPerWrite.sample(vecReads[virtIdx]);
            }
            // on a write, reset count of reads to 0
            vecReads[virtIdx] = 0;

            rawDist[virtIdx] = stats.numInstrExecuted.value();
        }
    }
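
    // (Example: a register written at dynamic instruction #100 and read at
    // instructions #103 and #110 samples RAW distances 3 and 10 into
    // vecRawDistance; when the register is next overwritten, readsPerWrite
    // samples 2.)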

    if (pc() == old_pc) {
        // PC not modified by instruction, proceed to next
        _gpuISA.advancePC(ii);
        instructionBuffer.pop_front();
    } else {
        DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave%d %s taken branch\n",
                computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                ii->disassemble());
        discardFetch();
    }
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId, pc());

    if (!ii->isScalar()) {
        const int num_active_lanes = execMask().count();
        computeUnit->stats.controlFlowDivergenceDist.sample(num_active_lanes);
        computeUnit->stats.numVecOpsExecuted += num_active_lanes;

        if (ii->isMFMA()) {
            computeUnit->stats.numVecOpsExecutedMFMA += num_active_lanes;
            if (ii->isI8()) {
                computeUnit->stats.numVecOpsExecutedMFMAI8
                    += num_active_lanes;
            }
        }

        if (ii->isF16() && ii->isALU()) {
            if (ii->isF32() || ii->isF64()) {
                fatal("Instruction is tagged as both (1) F16, and (2) "
                      "either F32 or F64.");
            }
            computeUnit->stats.numVecOpsExecutedF16 += num_active_lanes;
            if (ii->isFMA()) {
                computeUnit->stats.numVecOpsExecutedFMA16 += num_active_lanes;
                computeUnit->stats.numVecOpsExecutedTwoOpFP
                    += num_active_lanes;
            }
            else if (ii->isMAC()) {
                computeUnit->stats.numVecOpsExecutedMAC16 += num_active_lanes;
                computeUnit->stats.numVecOpsExecutedTwoOpFP
                    += num_active_lanes;
            }
            else if (ii->isMAD()) {
                computeUnit->stats.numVecOpsExecutedMAD16 += num_active_lanes;
                computeUnit->stats.numVecOpsExecutedTwoOpFP
                    += num_active_lanes;
            }
            else if (ii->isMFMA()) {
                computeUnit->stats.numVecOpsExecutedMFMAF16
                    += num_active_lanes;
            }
        }
        if (ii->isF32() && ii->isALU()) {
            if (ii->isF16() || ii->isF64()) {
                fatal("Instruction is tagged as both (1) F32, and (2) "
                      "either F16 or F64.");
            }
            computeUnit->stats.numVecOpsExecutedF32 += num_active_lanes;
            if (ii->isFMA()) {
                computeUnit->stats.numVecOpsExecutedFMA32 += num_active_lanes;
                computeUnit->stats.numVecOpsExecutedTwoOpFP
                    += num_active_lanes;
            }
            else if (ii->isMAC()) {
                computeUnit->stats.numVecOpsExecutedMAC32 += num_active_lanes;
                computeUnit->stats.numVecOpsExecutedTwoOpFP
                    += num_active_lanes;
            }
            else if (ii->isMAD()) {
                computeUnit->stats.numVecOpsExecutedMAD32 += num_active_lanes;
                computeUnit->stats.numVecOpsExecutedTwoOpFP
                    += num_active_lanes;
            }
            else if (ii->isMFMA()) {
                computeUnit->stats.numVecOpsExecutedMFMAF32
                    += num_active_lanes;
            }
        }
        if (ii->isF64() && ii->isALU()) {
            if (ii->isF16() || ii->isF32()) {
                fatal("Instruction is tagged as both (1) F64, and (2) "
                      "either F16 or F32.");
            }
            computeUnit->stats.numVecOpsExecutedF64 += num_active_lanes;
            if (ii->isFMA()) {
                computeUnit->stats.numVecOpsExecutedFMA64 += num_active_lanes;
                computeUnit->stats.numVecOpsExecutedTwoOpFP
                    += num_active_lanes;
            }
            else if (ii->isMAC()) {
                computeUnit->stats.numVecOpsExecutedMAC64 += num_active_lanes;
                computeUnit->stats.numVecOpsExecutedTwoOpFP
                    += num_active_lanes;
            }
            else if (ii->isMAD()) {
                computeUnit->stats.numVecOpsExecutedMAD64 += num_active_lanes;
                computeUnit->stats.numVecOpsExecutedTwoOpFP
                    += num_active_lanes;
            }
            else if (ii->isMFMA()) {
                computeUnit->stats.numVecOpsExecutedMFMAF64
                    += num_active_lanes;
            }
        }
        if (isGmInstruction(ii)) {
            computeUnit->stats.activeLanesPerGMemInstrDist.sample(
                num_active_lanes);
        } else if (isLmInstruction(ii)) {
            computeUnit->stats.activeLanesPerLMemInstrDist.sample(
                num_active_lanes);
        }
    }

    /**
     * If this is a memory operation that was issued a token but has no
     * active lanes, no request will ever be sent, so return the token to
     * the pool and bail out here.
     */
    if (execMask().none() && ii->needsToken()) {
        computeUnit->getTokenManager()->recvTokens(1);
        return;
    }

    // Update Vector ALU pipeline and other resources
    bool flat_as_gm = false;
    bool flat_as_lm = false;
    if (ii->isFlat()) {
        flat_as_gm = (ii->executedAs() == enums::SC_GLOBAL) ||
                     (ii->executedAs() == enums::SC_PRIVATE);
        flat_as_lm = (ii->executedAs() == enums::SC_GROUP);
    }

    // Single precision ALU or Branch or Return or Special instruction
    // Note, we use the same timing regardless of SP or DP ALU operation.
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() || ii->isNop() ||
        (ii->isKernArgSeg() && ii->isLoad()) ||
        ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) {
        // this is to enforce a fixed number of cycles per issue slot per SIMD
        if (!ii->isScalar()) {
            computeUnit->vectorALUs[simdId].set(computeUnit->
                cyclesToTicks(computeUnit->issuePeriod));
        } else {
            computeUnit->scalarALUs[scalarAlu].set(computeUnit->
                cyclesToTicks(computeUnit->issuePeriod));
        }
        // Barrier on Scalar ALU
    } else if (ii->isBarrier()) {
        computeUnit->scalarALUs[scalarAlu].set(computeUnit->
            cyclesToTicks(computeUnit->issuePeriod));
        // GM or Flat as GM Load
    } else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) {
        if (!ii->isScalar()) {
            computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
                cyclesToTicks(computeUnit->vrf_gm_bus_latency));
            computeUnit->vectorGlobalMemUnit.set(computeUnit->
                cyclesToTicks(computeUnit->issuePeriod));
            computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
                computeUnit->vrf_gm_bus_latency;
        } else {
            computeUnit->srfToScalarMemPipeBus.set(computeUnit->
                cyclesToTicks(computeUnit->srf_scm_bus_latency));
            computeUnit->scalarMemUnit.set(computeUnit->
                cyclesToTicks(computeUnit->issuePeriod));
            computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
                computeUnit->srf_scm_bus_latency;
        }
        // GM or Flat as GM Store
    } else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) {
        if (!ii->isScalar()) {
            computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
                cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
            computeUnit->vectorGlobalMemUnit.set(computeUnit->
                cyclesToTicks(computeUnit->issuePeriod));
            computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
                (2 * computeUnit->vrf_gm_bus_latency);
        } else {
            computeUnit->srfToScalarMemPipeBus.set(computeUnit->
                cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
            computeUnit->scalarMemUnit.set(computeUnit->
                cyclesToTicks(computeUnit->issuePeriod));
            computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
                (2 * computeUnit->srf_scm_bus_latency);
        }
    } else if ((ii->isAtomic() || ii->isMemSync()) &&
               (ii->isGlobalMem() || flat_as_gm)) {
        if (!ii->isScalar()) {
            computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
                cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
            computeUnit->vectorGlobalMemUnit.set(computeUnit->
                cyclesToTicks(computeUnit->issuePeriod));
            computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
                (2 * computeUnit->vrf_gm_bus_latency);
        } else {
            computeUnit->srfToScalarMemPipeBus.set(computeUnit->
                cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
            computeUnit->scalarMemUnit.set(computeUnit->
                cyclesToTicks(computeUnit->issuePeriod));
            computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
                (2 * computeUnit->srf_scm_bus_latency);
        }
        // LM or Flat as LM Load
    } else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) {
        computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
            cyclesToTicks(computeUnit->vrf_lm_bus_latency));
        computeUnit->vectorSharedMemUnit.set(computeUnit->
            cyclesToTicks(computeUnit->issuePeriod));
        computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
            computeUnit->vrf_lm_bus_latency;
        // LM or Flat as LM Store
    } else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
        computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
            cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
        computeUnit->vectorSharedMemUnit.set(computeUnit->
            cyclesToTicks(computeUnit->issuePeriod));
        computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
            (2 * computeUnit->vrf_lm_bus_latency);
        // LM or Flat as LM, Atomic or MemFence
    } else if ((ii->isAtomic() || ii->isMemSync()) &&
               (ii->isLocalMem() || flat_as_lm)) {
        computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
            cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
        computeUnit->vectorSharedMemUnit.set(computeUnit->
            cyclesToTicks(computeUnit->issuePeriod));
        computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
            (2 * computeUnit->vrf_lm_bus_latency);
    } else {
        panic("Bad instruction type!\n");
    }
}
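
// Note: WaitClass::set(t) above marks a pipeline or bus resource busy until
// tick t; subsequent readiness checks on those WaitClass objects hold back
// instructions that need the resource, which is how the modeled bus
// latencies throttle issue.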

GPUDynInstPtr
Wavefront::nextInstr()
{
    // Read next instruction from instruction buffer
    GPUDynInstPtr ii = instructionBuffer.front();
    // if the WF has been dispatched in the schedule stage then
    // check the next oldest instruction for readiness
    if (computeUnit->pipeMap.find(ii->seqNum()) !=
        computeUnit->pipeMap.end()) {
        if (instructionBuffer.size() > 1) {
            auto it = instructionBuffer.begin() + 1;
            return *it;
        } else { // No new instructions to check
            return nullptr;
        }
    }
    return ii;
}

void
Wavefront::discardFetch()
{
    instructionBuffer.clear();
    dropFetch |= pendingFetch;

    // clear the fetch buffer for this wave in case it already fetched
    // instructions past the taken branch
    computeUnit->fetchStage.fetchUnit(simdId).flushBuf(wfSlotId);
}

bool
Wavefront::waitCntsSatisfied()
{
    // All three counts uninitialized means the waitCnt instruction has been
    // dispatched but not executed yet: the next instruction should be
    // blocked until the waitCnt executes.
    if (vmWaitCnt == -1 && expWaitCnt == -1 && lgkmWaitCnt == -1) {
        return false;
    }

    /**
     * If we reach here, an s_waitcnt instruction has executed and set one
     * or more of the counts; check each valid count against the number of
     * outstanding operations of that class.
     */
    if (vmWaitCnt != -1) {
        if (vmemInstsIssued > vmWaitCnt) {
            // vmWaitCnt not satisfied
            return false;
        }
    }

    if (expWaitCnt != -1) {
        if (expInstsIssued > expWaitCnt) {
            // expWaitCnt not satisfied
            return false;
        }
    }

    if (lgkmWaitCnt != -1) {
        if (lgkmInstsIssued > lgkmWaitCnt) {
            // lgkmWaitCnt not satisfied
            return false;
        }
    }

    // if we get here all outstanding waitcnts must
    // be satisfied, so we resume normal operation
    clearWaitCnts();

    return true;
}
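
// (Example: with three vector memory ops outstanding (vmemInstsIssued == 3),
// an s_waitcnt encoding vmcnt(1) keeps the wave in S_WAITCNT until
// completions decrement vmemInstsIssued to 1; clearWaitCnts() then resumes
// normal execution.)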

bool
Wavefront::sleepDone()
{
    assert(status == S_STALLED_SLEEP);

    // if the sleep count has not been set, then the sleep instruction has
    // not been executed yet, so we return false without changing the
    // wavefront's status
    if (sleepCnt == 0)
        return false;

    sleepCnt--;
    if (sleepCnt != 0)
        return false;

    status = S_RUNNING;
    return true;
}

void
Wavefront::setSleepTime(int sleep_time)
{
    assert(sleepCnt == 0);
    sleepCnt = sleep_time;
}

void
Wavefront::setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt)
{
    // the scoreboard should have set the status
    // to S_WAITCNT once a waitcnt instruction
    // was marked as ready
    assert(status == S_WAITCNT);

    // waitcnt instruction shouldn't be sending
    // negative counts
    assert(vm_wait_cnt >= 0);
    assert(exp_wait_cnt >= 0);
    assert(lgkm_wait_cnt >= 0);
    // the encoded counts are bounded by their field widths: vm has
    // 4 bits (max 0xf), exp has 3 bits (max 0x7), and lgkm has
    // 5 bits (max 0x1f)
    assert(vm_wait_cnt <= 0xf);
    assert(exp_wait_cnt <= 0x7);
    assert(lgkm_wait_cnt <= 0x1f);
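
    // (Decoding example: "s_waitcnt vmcnt(0)" arrives here as
    // vm_wait_cnt = 0 with exp_wait_cnt = 0x7 and lgkm_wait_cnt = 0x1f,
    // i.e., the exp and lgkm fields are at their max encodings and are
    // treated below as "do not wait".)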

    /**
     * any prior waitcnts must have been satisfied already; gem5 does not
     * support the semantics of multiple pending waitcnt instructions.
     */
    assert(vmWaitCnt == -1);
    assert(expWaitCnt == -1);
    assert(lgkmWaitCnt == -1);

    /**
     * if a count field is encoded as its maximum value, the instruction
     * is not waiting on that counter, so leave it invalid (-1).
     */
    if (vm_wait_cnt != 0xf)
        vmWaitCnt = vm_wait_cnt;

    if (exp_wait_cnt != 0x7)
        expWaitCnt = exp_wait_cnt;

    if (lgkm_wait_cnt != 0x1f)
        lgkmWaitCnt = lgkm_wait_cnt;
}

void
Wavefront::clearWaitCnts()
{
    // reset the waitcnts back to
    // -1, indicating they are no
    // longer valid
    vmWaitCnt = -1;
    expWaitCnt = -1;
    lgkmWaitCnt = -1;

    // resume running normally
    status = S_RUNNING;
}

void
Wavefront::incVMemInstsIssued()
{
    vmemInstsIssued++;
}

void
Wavefront::incExpInstsIssued()
{
    expInstsIssued++;
}

void
Wavefront::incLGKMInstsIssued()
{
    lgkmInstsIssued++;
}

void
Wavefront::decVMemInstsIssued()
{
    vmemInstsIssued--;
}

void
Wavefront::decExpInstsIssued()
{
    expInstsIssued--;
}

void
Wavefront::decLGKMInstsIssued()
{
    lgkmInstsIssued--;
}
Addr
Wavefront::pc() const
{
    return _pc;
}

void
Wavefront::pc(Addr new_pc)
{
    _pc = new_pc;
}

VectorMask&
Wavefront::execMask()
{
    return _execMask;
}

bool
Wavefront::execMask(int lane) const
{
    return _execMask[lane];
}

void
Wavefront::freeRegisterFile()
{
    /* clear busy registers */
    for (int i = 0; i < maxVgprs; i++) {
        int vgprIdx = computeUnit->registerManager->mapVgpr(this, i);
        computeUnit->vrf[simdId]->markReg(vgprIdx, false);
    }

    /* Free registers used by this wavefront */
    uint32_t endIndex = (startVgprIndex + reservedVectorRegs - 1) %
        computeUnit->vrf[simdId]->numRegs();
    computeUnit->registerManager->vrfPoolMgrs[simdId]->
        freeRegion(startVgprIndex, endIndex);
}

void
Wavefront::computeActualWgSz(HSAQueueEntry *task)
{
    actualWgSzTotal = 1;
    for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
        actualWgSz[d] = std::min(workGroupSz[d], gridSz[d]
                                 - task->wgId(d) * workGroupSz[d]);
        actualWgSzTotal *= actualWgSz[d];
    }
}

void
Wavefront::barrierId(int bar_id)
{
    assert(bar_id >= WFBarrier::InvalidID);
    assert(bar_id < computeUnit->numBarrierSlots());
    barId = bar_id;
}

int
Wavefront::barrierId() const
{
    return barId;
}

bool
Wavefront::hasBarrier() const
{
    return barId > WFBarrier::InvalidID;
}

void
Wavefront::releaseBarrier()
{
    barId = WFBarrier::InvalidID;
}

Wavefront::WavefrontStats::WavefrontStats(statistics::Group *parent)
    : statistics::Group(parent),
      ADD_STAT(numInstrExecuted,
               "number of instructions executed by this WF slot"),
      ADD_STAT(schCycles, "number of cycles spent in schedule stage"),
      ADD_STAT(schStalls, "number of cycles WF is stalled in SCH stage"),
      ADD_STAT(schRfAccessStalls, "number of cycles wave selected in SCH but "
               "RF denied adding instruction"),
      ADD_STAT(schResourceStalls, "number of cycles stalled in sch by resource"
               " not available"),
      ADD_STAT(schOpdNrdyStalls, "number of cycles stalled in sch waiting for "
               "RF reads to complete"),
      ADD_STAT(schLdsArbStalls,
               "number of cycles wave stalled due to LDS-VRF arbitration"),
      // FIXME: the name of the WF needs to be unique
      ADD_STAT(numTimesBlockedDueWAXDependencies, "number of times the wf's "
               "instructions are blocked due to WAW or WAR dependencies"),
      // FIXME: the name of the WF needs to be unique
      ADD_STAT(numTimesBlockedDueRAWDependencies, "number of times the wf's "
               "instructions are blocked due to RAW dependencies"),
      ADD_STAT(vecRawDistance,
               "Count of RAW distance in dynamic instructions for this WF"),
      ADD_STAT(readsPerWrite, "Count of Vector reads per write for this WF")
{
    vecRawDistance.init(0, 20, 1);
    readsPerWrite.init(0, 4, 1);
}
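
// (statistics::Distribution::init(min, max, bkt) sets a linear histogram:
// vecRawDistance buckets RAW distances 0-20 in width-1 bins, and
// readsPerWrite buckets 0-4 reads per write.)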

} // namespace gem5