gem5 [DEVELOP-FOR-25.1]
Loading...
Searching...
No Matches
wavefront.cc
Go to the documentation of this file.
1/*
2 * Copyright (c) 2011-2017 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
33
34#include "base/bitfield.hh"
35#include "debug/GPUExec.hh"
36#include "debug/GPUInitAbi.hh"
37#include "debug/GPUTrace.hh"
38#include "debug/WavefrontStack.hh"
43#include "gpu-compute/shader.hh"
46
47namespace gem5
48{
49
// Wavefront constructor: member-initializer list plus default-initialization
// of per-lane bookkeeping vectors sized by the configured wavefront width
// (p.wf_size).
// NOTE(review): this listing is a doxygen export; the constructor signature
// line and several member lines (gaps in the embedded numbering, e.g. 54 and
// 59-77) were dropped — confirm against upstream gem5 wavefront.cc.
 51 : SimObject(p), wfSlotId(p.wf_slot_id), simdId(p.simdId),
 52 maxIbSize(p.max_ib_size), _gpuISA(*this),
 53 vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
 55 sleepCnt(0), barId(WFBarrier::InvalidID), stats(this)
 56{
 57 lastTrace = 0;
 58 execUnitId = -1;
 78 ldsChunk = nullptr;
 79
 80 memTraceBusy = 0;
    // Sentinel "all ones" tick counts for the old VGPR/DGPR snapshots.
 81 oldVgprTcnt = 0xffffffffffffffffll;
 82 oldDgprTcnt = 0xffffffffffffffffll;
 83 oldVgpr.resize(p.wf_size);
 84
 85 pendingFetch = false;
 86 dropFetch = false;
 87 maxVgprs = 0;
 88 maxSgprs = 0;
 89
    // Per-lane state: one entry per work item in the wavefront.
 90 lastAddr.resize(p.wf_size);
 91 workItemFlatId.resize(p.wf_size);
 92 oldDgpr.resize(p.wf_size);
    // Three dimensions (X, Y, Z) of work-item IDs.
 93 for (int i = 0; i < 3; ++i) {
 94 workItemId[i].resize(p.wf_size);
 95 }
 96
    // All lanes start active.
 97 _execMask.set();
 98 rawDist.clear();
 99 lastInstExec = 0;
 100 vecReads.clear();
 101
 102 lastInstSeqNum = 0;
 103 lastInstDisasm = "none";
 104}
105
 106void
// NOTE(review): the function signature line (doxygen source line 107,
// presumably Wavefront::init()) was dropped by this extraction.
 108{
    // Map this wavefront to its execution resources on the compute unit.
 111 startVgprIndex = 0;
 112 startSgprIndex = 0;
 113
 114 scalarAlu = computeUnit->mapWaveToScalarAlu(this);
 115 scalarAluGlobalIdx = computeUnit->mapWaveToScalarAluGlobalIdx(this);
 116 globalMem = computeUnit->mapWaveToGlobalMem(this);
 117 localMem = computeUnit->mapWaveToLocalMem(this);
 118 scalarMem = computeUnit->mapWaveToScalarMem(this);
 119}
120
 121void
// Initialize this wave's architectural register state per the AMDGPU HSA ABI:
// each enabled scalar init field is written into consecutive virtual SGPRs
// (mapped to physical SGPRs by the register manager), then the work-item IDs
// are written into VGPRs — packed into a single VGPR on gfx90a/gfx942/gfx950,
// one VGPR per dimension otherwise.
// NOTE(review): this listing is a doxygen export; source lines that carried
// cross-reference links were dropped (several write() value arguments and at
// least one case label are missing below) — confirm against upstream.
 122Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
 123{
 124 int regInitIdx = 0;
 125 gfxVersion = task->gfxVersion();
 126
 127 // Iterate over all the init fields and check which
 128 // bits are enabled. Useful information can be found here:
 129 // https://github.com/ROCm-Developer-Tools/ROCm-ComputeABI-Doc/
 130 // blob/master/AMDGPU-ABI.md
 131 for (int en_bit = 0; en_bit < NumScalarInitFields; ++en_bit) {
 132
 133 if (task->sgprBitEnabled(en_bit)) {
 134 int physSgprIdx = 0;
 135 uint32_t firstWave = 0;
 136 int orderedAppendTerm = 0;
 137 int numWfsInWg = 0;
 138 uint32_t finalValue = 0;
 139 Addr host_disp_pkt_addr = task->hostDispPktAddr();
 140 Addr kernarg_addr = task->kernargAddr();
 141 Addr hidden_priv_base(0);
 142
 143 switch (en_bit) {
            // Four consecutive SGPRs form the private segment buffer
            // descriptor. NOTE(review): the value argument of each write()
            // below was dropped by the extraction.
 144 case PrivateSegBuf:
 145 physSgprIdx =
 146 computeUnit->registerManager->mapSgpr(this, regInitIdx);
 147 computeUnit->srf[simdId]->write(physSgprIdx,
 149 ++regInitIdx;
 150 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
 151 "Setting PrivateSegBuffer: s[%d] = %x\n",
 152 computeUnit->cu_id, simdId,
 153 wfSlotId, wfDynId, physSgprIdx,
 155
 156 physSgprIdx =
 157 computeUnit->registerManager->mapSgpr(this, regInitIdx);
 158 computeUnit->srf[simdId]->write(physSgprIdx,
 160 ++regInitIdx;
 161 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
 162 "Setting PrivateSegBuffer: s[%d] = %x\n",
 163 computeUnit->cu_id, simdId,
 164 wfSlotId, wfDynId, physSgprIdx,
 166
 167 physSgprIdx =
 168 computeUnit->registerManager->mapSgpr(this, regInitIdx);
 169 computeUnit->srf[simdId]->write(physSgprIdx,
 171 ++regInitIdx;
 172 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
 173 "Setting PrivateSegBuffer: s[%d] = %x\n",
 174 computeUnit->cu_id, simdId,
 175 wfSlotId, wfDynId, physSgprIdx,
 177
 178 physSgprIdx =
 179 computeUnit->registerManager->mapSgpr(this, regInitIdx);
 180 computeUnit->srf[simdId]->write(physSgprIdx,
 182
 183 ++regInitIdx;
 184 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
 185 "Setting PrivateSegBuffer: s[%d] = %x\n",
 186 computeUnit->cu_id, simdId,
 187 wfSlotId, wfDynId, physSgprIdx,
 189 break;
            // 64-bit host dispatch packet address split across two SGPRs
            // (low 32 bits first, then high 32 bits).
 190 case DispatchPtr:
 191 physSgprIdx =
 192 computeUnit->registerManager->mapSgpr(this, regInitIdx);
 193 computeUnit->srf[simdId]->write(physSgprIdx,
 194 bits(host_disp_pkt_addr, 31, 0));
 195 ++regInitIdx;
 196 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
 197 "Setting DispatchPtr: s[%d] = %x\n",
 198 computeUnit->cu_id, simdId,
 199 wfSlotId, wfDynId, physSgprIdx,
 200 bits(host_disp_pkt_addr, 31, 0));
 201
 202 physSgprIdx =
 203 computeUnit->registerManager->mapSgpr(this, regInitIdx);
 204 computeUnit->srf[simdId]->write(physSgprIdx,
 205 bits(host_disp_pkt_addr, 63, 32));
 206 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
 207 "Setting DispatchPtr: s[%d] = %x\n",
 208 computeUnit->cu_id, simdId,
 209 wfSlotId, wfDynId, physSgprIdx,
 210 bits(host_disp_pkt_addr, 63, 32));
 211
 212 ++regInitIdx;
 213 break;
            // 64-bit AMD HSA queue address split across two SGPRs.
 214 case QueuePtr:
 215 physSgprIdx =
 216 computeUnit->registerManager->mapSgpr(this, regInitIdx);
 217 computeUnit->srf[simdId]->write(physSgprIdx,
 218 bits(task->hostAMDQueueAddr, 31, 0));
 219 ++regInitIdx;
 220 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
 221 "Setting QueuePtr: s[%d] = %x\n",
 222 computeUnit->cu_id, simdId,
 223 wfSlotId, wfDynId, physSgprIdx,
 224 bits(task->hostAMDQueueAddr, 31, 0));
 225
 226 physSgprIdx =
 227 computeUnit->registerManager->mapSgpr(this, regInitIdx);
 228 computeUnit->srf[simdId]->write(physSgprIdx,
 229 bits(task->hostAMDQueueAddr, 63, 32));
 230 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
 231 "Setting QueuePtr: s[%d] = %x\n",
 232 computeUnit->cu_id, simdId,
 233 wfSlotId, wfDynId, physSgprIdx,
 234 bits(task->hostAMDQueueAddr, 63, 32));
 235
 236 ++regInitIdx;
 237 break;
            // 64-bit kernel argument base address split across two SGPRs.
 238 case KernargSegPtr:
 239 physSgprIdx =
 240 computeUnit->registerManager->mapSgpr(this, regInitIdx);
 241 computeUnit->srf[simdId]->write(physSgprIdx,
 242 bits(kernarg_addr, 31, 0));
 243 ++regInitIdx;
 244 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
 245 "Setting KernargSegPtr: s[%d] = %x\n",
 246 computeUnit->cu_id, simdId,
 247 wfSlotId, wfDynId, physSgprIdx,
 248 bits(kernarg_addr, 31, 0));
 249
 250 physSgprIdx =
 251 computeUnit->registerManager->mapSgpr(this, regInitIdx);
 252 computeUnit->srf[simdId]->write(physSgprIdx,
 253 bits(kernarg_addr, 63, 32));
 254 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
 255 "Setting KernargSegPtr: s[%d] = %x\n",
 256 computeUnit->cu_id, simdId,
 257 wfSlotId, wfDynId, physSgprIdx,
 258 bits(kernarg_addr, 63, 32));
 259
 260 ++regInitIdx;
 261 break;
 262 case DispatchId:
 263 physSgprIdx
 264 = computeUnit->registerManager->mapSgpr(this, regInitIdx);
 265 computeUnit->srf[simdId]->write(physSgprIdx,
 266 task->dispatchId());
 267 ++regInitIdx;
 268 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
 269 "Setting DispatchId: s[%d] = %x\n",
 270 computeUnit->cu_id, simdId,
 271 wfSlotId, wfDynId, physSgprIdx,
 272 task->dispatchId());
 273
 274 // Dispatch ID in gem5 is an int. Set upper 32-bits to zero.
 275 physSgprIdx
 276 = computeUnit->registerManager->mapSgpr(this, regInitIdx);
 277 computeUnit->srf[simdId]->write(physSgprIdx, 0);
 278 ++regInitIdx;
 279 break;
            // Low 32 bits of the scratch backing memory address, followed
            // by the scratch size; also derives the hidden private base.
 280 case FlatScratchInit:
 281 physSgprIdx
 282 = computeUnit->registerManager->mapSgpr(this, regInitIdx);
 283 computeUnit->srf[simdId]->write(physSgprIdx,
 284 (TheGpuISA::ScalarRegU32)(task->amdQueue
 285 .scratch_backing_memory_location & 0xffffffff));
 286 ++regInitIdx;
 287 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
 288 "Setting FlatScratch Addr: s[%d] = %x\n",
 289 computeUnit->cu_id, simdId,
 290 wfSlotId, wfDynId, physSgprIdx,
 291 (TheGpuISA::ScalarRegU32)(task->amdQueue
 292 .scratch_backing_memory_location & 0xffffffff));
 293
 294 physSgprIdx =
 295 computeUnit->registerManager->mapSgpr(this, regInitIdx);
 296 // This vallue should be sizeof(DWORD) aligned, that is
 297 // 4 byte aligned
 298 computeUnit->srf[simdId]->write(physSgprIdx,
 300 ++regInitIdx;
 301 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
 302 "Setting FlatScratch size: s[%d] = %x\n",
 303 computeUnit->cu_id, simdId,
 304 wfSlotId, wfDynId, physSgprIdx,
                // NOTE(review): a large span (doxygen lines 305-328, likely a
                // comment block and the DPRINTF tail) was dropped here.
 329 hidden_priv_base =
 330 (uint64_t)task->amdQueue.scratch_resource_descriptor[0] |
 331 (((uint64_t)task->amdQueue.scratch_resource_descriptor[1]
 332 & 0x000000000000ffff) << 32);
 333 computeUnit->shader->initShHiddenPrivateBase(
 334 hidden_priv_base,
 336 break;
 337 case PrivateSegSize:
 338 physSgprIdx
 339 = computeUnit->registerManager->mapSgpr(this, regInitIdx);
 340 computeUnit->srf[simdId]->write(physSgprIdx,
 341 task->privMemPerItem());
 342 ++regInitIdx;
 343 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
 344 "Setting private segment size: s[%d] = %x\n",
 345 computeUnit->cu_id, simdId,
 346 wfSlotId, wfDynId, physSgprIdx,
 347 task->privMemPerItem());
 348 break;
            // Preloaded kernel arguments are copied directly into user SGPRs.
 349 case KernargPreload:
 350 DPRINTF(GPUInitAbi, "Preload %d user SGPRs starting at virtual"
 351 " SGPR s[%d]\n", task->preloadLength(), regInitIdx);
 352
 353 for (int idx = 0; idx < task->preloadLength(); ++idx) {
 354 uint32_t finalValue = task->preloadArgs()[idx];
 355 physSgprIdx =
 356 computeUnit->registerManager->mapSgpr(this,
 357 regInitIdx);
 358
 359 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] Setting "
 360 "s[%d] = %x\n", computeUnit->cu_id, simdId,
 361 wfSlotId, wfDynId, physSgprIdx, finalValue);
 362
 363 computeUnit->srf[simdId]->write(physSgprIdx, finalValue);
 364 ++regInitIdx;
 365 }
 366 break;
 367 case WorkgroupIdX:
 368 physSgprIdx =
 369 computeUnit->registerManager->mapSgpr(this, regInitIdx);
 370 computeUnit->srf[simdId]->write(physSgprIdx,
 371 workGroupId[0]);
 372
 373 ++regInitIdx;
 374 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
 375 "Setting WG ID X: s[%d] = %x\n",
 376 computeUnit->cu_id, simdId,
 377 wfSlotId, wfDynId, physSgprIdx, workGroupId[0]);
 378 break;
 379 case WorkgroupIdY:
 380 physSgprIdx =
 381 computeUnit->registerManager->mapSgpr(this, regInitIdx);
 382 computeUnit->srf[simdId]->write(physSgprIdx,
 383 workGroupId[1]);
 384
 385 ++regInitIdx;
 386 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
 387 "Setting WG ID Y: s[%d] = %x\n",
 388 computeUnit->cu_id, simdId,
 389 wfSlotId, wfDynId, physSgprIdx, workGroupId[1]);
 390 break;
 391 case WorkgroupIdZ:
 392 physSgprIdx =
 393 computeUnit->registerManager->mapSgpr(this, regInitIdx);
 394 computeUnit->srf[simdId]->write(physSgprIdx,
 395 workGroupId[2]);
 396
 397 ++regInitIdx;
 398 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
 399 "Setting WG ID Z: s[%d] = %x\n",
 400 computeUnit->cu_id, simdId,
 401 wfSlotId, wfDynId, physSgprIdx, workGroupId[2]);
 402 break;
            // NOTE(review): the case label for the following branch (doxygen
            // line 403, presumably the private-segment wave offset field)
            // was dropped by the extraction.
 404
 405 // For architected flat scratch, this enable is reused to set
 406 // the FLAT_SCRATCH register pair to the scratch backing
 407 // memory: https://llvm.org/docs/AMDGPUUsage.html#flat-scratch
 408 if (task->gfxVersion() == GfxVersion::gfx942 ||
 409 task->gfxVersion() == GfxVersion::gfx950) {
 410 uint32_t scratchPerWI =
 412
 415 + (scratchPerWI * 64 * wfId);
 416
 417 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
 418 "Setting architected flat scratch = %x\n",
 421
 422 break;
 423 }
 424
 425 // Not architected flat scratch. Write the scratch wavefront
 426 // offset: https://llvm.org/docs/AMDGPUUsage.html
 427 // #amdgpu-amdhsa-initial-kernel-execution-state
 428 physSgprIdx =
 429 computeUnit->registerManager->mapSgpr(this, regInitIdx);
 430
 444 computeUnit->srf[simdId]->write(physSgprIdx, 1024 *
 445 (wgId * (wgSz / 64) + wfId) *
 447
 448 ++regInitIdx;
 449 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
 450 "Setting Private Seg Offset: s[%d] = %x\n",
 451 computeUnit->cu_id, simdId,
 452 wfSlotId, wfDynId, physSgprIdx,
 453 1024 * (wgId * (wgSz / 64) + wfId) *
 455 break;
            // Pack first-wave flag, ordered-append term, and wave count
            // into a single SGPR.
 456 case WorkgroupInfo:
 457 firstWave = (wfId == 0) ? 1 : 0;
 458 numWfsInWg = divCeil(wgSizeInWorkItems,
 459 computeUnit->wfSize());
 460 finalValue = firstWave << ((sizeof(uint32_t) * 8) - 1);
 461 finalValue |= (orderedAppendTerm << 6);
 462 finalValue |= numWfsInWg;
 463 physSgprIdx =
 464 computeUnit->registerManager->mapSgpr(this, regInitIdx);
 465 computeUnit->srf[simdId]->
 466 write(physSgprIdx, finalValue);
 467
 468 ++regInitIdx;
 469 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
 470 "Setting WG Info: s[%d] = %x\n",
 471 computeUnit->cu_id, simdId,
 472 wfSlotId, wfDynId, physSgprIdx, finalValue);
 473 break;
 474 default:
 475 fatal("SGPR enable bit %i not supported\n", en_bit);
 476 break;
 477 }
 478 }
 479 }
 480
 481 // Save the offset to the first accumulation VGPR number from HSA task.
 482 accumOffset = task->accumOffset();
 483
 484 regInitIdx = 0;
 485
 486 // VGPRs are initialized to the work item IDs for a given thread. There
 487 // are two ways to initialize the IDs based on number of dimensions. ISAs
 488 // will either have packed work-item IDs or not. LLVM lists them here:
 489 // https://llvm.org/docs/AMDGPUUsage.html#amdgpu-processor-table
 490 // Default to false and set to true for gem5 supported ISAs.
 491 bool packed_work_item_id = false;
 492
 493 if (task->gfxVersion() == GfxVersion::gfx90a ||
 494 task->gfxVersion() == GfxVersion::gfx942 ||
 495 task->gfxVersion() == GfxVersion::gfx950) {
 496 packed_work_item_id = true;
 497 }
 498
 499 // For ISAs with packed work item IDs, only one VGPR is used and the
 500 // (X,Y,Z) dimensions are packed into a single 32-bit VGPR with 10-bits
 501 // for each dimension
 502 if (packed_work_item_id) {
 503 TheGpuISA::VecRegContainerU32 raw_vgpr;
 504 TheGpuISA::VecElemU32 *packed_vgpr
 505 = raw_vgpr.as<TheGpuISA::VecElemU32>();
 506
 507 uint32_t physVgprIdx = computeUnit->registerManager
 508 ->mapVgpr(this, regInitIdx);
        // X in bits [9:0]; Y and Z only if their enable bits are set.
 509 for (int lane = 0; lane < workItemId[0].size(); ++lane) {
 510 packed_vgpr[lane] = workItemId[0][lane] & 0x3ff;
 511 }
 512 if (task->vgprBitEnabled(1)) {
 513 for (int lane = 0; lane < workItemId[1].size(); ++lane) {
 514 packed_vgpr[lane] |= ((workItemId[1][lane] & 0x3ff) << 10);
 515 }
 516 }
 517 if (task->vgprBitEnabled(2)) {
 518 for (int lane = 0; lane < workItemId[2].size(); ++lane) {
 519 packed_vgpr[lane] |= ((workItemId[2][lane] & 0x3ff) << 20);
 520 }
 521 }
 522 computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
 523
 524 return;
 525 }
 526
 527 // For ISAs with non-packed work item IDs, map and initialize one VGPR
 528 // per dimensions. Do this by iterating over all the init fields and
 529 // checking which bits are enabled.
 530 for (int en_bit = 0; en_bit < NumVectorInitFields; ++en_bit) {
 531 if (task->vgprBitEnabled(en_bit)) {
 532 uint32_t physVgprIdx = 0;
 533 TheGpuISA::VecRegContainerU32 raw_vgpr;
 534
 535 switch (en_bit) {
 536 case WorkitemIdX:
 537 {
 538 physVgprIdx = computeUnit->registerManager
 539 ->mapVgpr(this, regInitIdx);
 540 TheGpuISA::VecElemU32 *vgpr_x
 541 = raw_vgpr.as<TheGpuISA::VecElemU32>();
 542
 543 for (int lane = 0; lane < workItemId[0].size(); ++lane) {
 544 vgpr_x[lane] = workItemId[0][lane];
 545 }
 546
 547 computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
 548 rawDist[regInitIdx] = 0;
 549 ++regInitIdx;
 550 }
 551 break;
 552 case WorkitemIdY:
 553 {
 554 physVgprIdx = computeUnit->registerManager
 555 ->mapVgpr(this, regInitIdx);
 556 TheGpuISA::VecElemU32 *vgpr_y
 557 = raw_vgpr.as<TheGpuISA::VecElemU32>();
 558
 559 for (int lane = 0; lane < workItemId[1].size(); ++lane) {
 560 vgpr_y[lane] = workItemId[1][lane];
 561 }
 562
 563 computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
 564 rawDist[regInitIdx] = 0;
 565 ++regInitIdx;
 566 }
 567 break;
 568 case WorkitemIdZ:
 569 {
 570 physVgprIdx = computeUnit->registerManager->
 571 mapVgpr(this, regInitIdx);
 572 TheGpuISA::VecElemU32 *vgpr_z
 573 = raw_vgpr.as<TheGpuISA::VecElemU32>();
 574
 575 for (int lane = 0; lane < workItemId[2].size(); ++lane) {
 576 vgpr_z[lane] = workItemId[2][lane];
 577 }
 578
 579 computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
 580 rawDist[regInitIdx] = 0;
 581 ++regInitIdx;
 582 }
 583 break;
 584 }
 585 }
 586 }
 587}
588
589void
590Wavefront::resizeRegFiles(int num_vregs, int num_sregs)
591{
592 maxVgprs = num_vregs;
593 maxSgprs = num_sregs;
594}
595
599
 600void
// Transition this wave's status, maintaining the CU-wide idle-wavefront count
// used for idle-timeout detection when idleCUTimeout is enabled.
// NOTE(review): the signature line (doxygen line 601, presumably
// Wavefront::setStatus(status_e newStatus)) and the bodies at dropped lines
// 613 and 624 were lost in this extraction — confirm against upstream.
 602{
 603 if (computeUnit->idleCUTimeout > 0) {
 604 // Wavefront's status transitions to stalled or stopped
 605 if ((newStatus == S_STOPPED || newStatus == S_STALLED ||
 606 newStatus == S_WAITCNT || newStatus == S_BARRIER) &&
 607 (status != newStatus)) {
 608 computeUnit->idleWfs++;
 609 assert(computeUnit->idleWfs <=
 610 (computeUnit->shader->n_wf * computeUnit->numVectorALUs));
 611 if (computeUnit->idleWfs ==
 612 (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
 614 }
 615 // Wavefront's status transitions to an active state (from
 616 // a stopped or stalled state)
 617 } else if ((status == S_STOPPED || status == S_STALLED ||
 618 status == S_WAITCNT || status == S_BARRIER) &&
 619 (status != newStatus)) {
 620 // if all WFs in the CU were idle then check if the idleness
 621 // period exceeded the timeout threshold
 622 if (computeUnit->idleWfs ==
 623 (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
 625 computeUnit->idleCUTimeout,
 626 "CU%d has been idle for %d ticks at tick %d",
 627 computeUnit->cu_id, computeUnit->idleCUTimeout,
 628 curTick());
 629 }
 630 computeUnit->idleWfs--;
 631 assert(computeUnit->idleWfs >= 0);
 632 }
 633 }
 634 status = newStatus;
 635}
636
 637void
// Launch this wavefront: record its dynamic wave ID and starting PC, and size
// the per-VGPR read counters (zero-initialized) now that maxVgprs is known.
 638Wavefront::start(uint64_t _wf_dyn_id, Addr init_pc)
 639{
 640 wfDynId = _wf_dyn_id;
 641 _pc = init_pc;
 642
    // NOTE(review): doxygen line 643 was dropped here — presumably the
    // status transition to running; confirm against upstream.
 644
 645 vecReads.resize(maxVgprs, 0);
 646}
647
 648bool
// True for global-memory instructions, including flat instructions that
// resolved to the global segment. (Signature line dropped by the extraction.)
 650{
 651 if (ii->isGlobalMem() ||
 652 (ii->isFlat() && ii->executedAs() == enums::SC_GLOBAL)) {
 653 return true;
 654 }
 655
 656 return false;
 657}
658
 659bool
// True for local (LDS) memory instructions, including flat instructions that
// resolved to the group segment. (Signature line dropped by the extraction.)
 661{
 662 if (ii->isLocalMem() ||
 663 (ii->isFlat() && ii->executedAs() == enums::SC_GROUP)) {
 664 return true;
 665 }
 666
 667 return false;
 668}
669
 670bool
// True when the oldest buffered instruction is a sleep. Returns false on an
// empty instruction buffer. (Signature line dropped by the extraction.)
 672{
 673 if (instructionBuffer.empty())
 674 return false;
 675
 676 GPUDynInstPtr ii = instructionBuffer.front();
 677
 678 if (ii->isSleep()) {
 679 return true;
 680 }
 681 return false;
 682}
683
 684bool
// True when the oldest buffered instruction is a waitcnt. Returns false on an
// empty instruction buffer. (Signature line dropped by the extraction.)
 686{
 687 if (instructionBuffer.empty())
 688 return false;
 689
 690 GPUDynInstPtr ii = instructionBuffer.front();
 691
 692 if (ii->isWaitcnt()) {
 693 // waitcnt is a scalar
 694 assert(ii->isScalar());
 695 return true;
 696 }
 697
 698 return false;
 699}
700
 701bool
// True when the wave is not stopped and its oldest instruction is a scalar
// ALU-class op (nop/return/end-of-kernel/branch/ALU/kernarg load).
// (Signature line dropped by the extraction.)
 703{
 704 assert(!instructionBuffer.empty());
 705 GPUDynInstPtr ii = instructionBuffer.front();
 706
 707 if (status != S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn()
 708 || ii->isEndOfKernel() || ii->isBranch() || ii->isALU() ||
 709 (ii->isKernArgSeg() && ii->isLoad()))) {
 710 return true;
 711 }
 712
 713 return false;
 714}
715
 716bool
// Vector counterpart of the scalar-ALU check: same instruction classes but
// requires a non-scalar instruction. (Signature line dropped by extraction.)
 718{
 719 assert(!instructionBuffer.empty());
 720 GPUDynInstPtr ii = instructionBuffer.front();
 721
 722 if (status != S_STOPPED && !ii->isScalar() && (ii->isNop() ||
 723 ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel()
 724 || (ii->isKernArgSeg() && ii->isLoad()))) {
 725 return true;
 726 }
 727
 728 return false;
 729}
730
 731bool
// True when the wave is not stopped and its oldest instruction is a barrier.
// (Signature line dropped by the extraction.)
 733{
 734 assert(!instructionBuffer.empty());
 735 GPUDynInstPtr ii = instructionBuffer.front();
 736
 737 if (status != S_STOPPED && ii->isBarrier()) {
 738 return true;
 739 }
 740
 741 return false;
 742}
743
 744bool
// True when the oldest instruction is a vector (non-scalar) global-memory op
// and the wave is not stopped. (Signature line dropped by the extraction.)
 746{
 747 assert(!instructionBuffer.empty());
 748 GPUDynInstPtr ii = instructionBuffer.front();
 749
 750 if (status != S_STOPPED && !ii->isScalar() && ii->isGlobalMem()) {
 751 return true;
 752 }
 753
 754 return false;
 755}
756
 757bool
// True when the oldest instruction is a scalar global-memory op and the wave
// is not stopped. (Signature line dropped by the extraction.)
 759{
 760 assert(!instructionBuffer.empty());
 761 GPUDynInstPtr ii = instructionBuffer.front();
 762
 763 if (status != S_STOPPED && ii->isScalar() && ii->isGlobalMem()) {
 764 return true;
 765 }
 766
 767 return false;
 768}
769
 770bool
// True when the oldest instruction is a local (LDS) memory op and the wave is
// not stopped. (Signature line dropped by the extraction.)
 772{
 773 assert(!instructionBuffer.empty());
 774 GPUDynInstPtr ii = instructionBuffer.front();
 775
 776 if (status != S_STOPPED && ii->isLocalMem()) {
 777 return true;
 778 }
 779
 780 return false;
 781}
782
 783bool
// True when the oldest instruction targets the private segment and the wave
// is not stopped. (Signature line dropped by the extraction.)
 785{
 786 assert(!instructionBuffer.empty());
 787 GPUDynInstPtr ii = instructionBuffer.front();
 788
 789 if (status != S_STOPPED && ii->isPrivateSeg()) {
 790 return true;
 791 }
 792
 793 return false;
 794}
795
 796bool
// True when the oldest instruction is a flat memory op and the wave is not
// stopped. (Signature line dropped by the extraction.)
 798{
 799 assert(!instructionBuffer.empty());
 800 GPUDynInstPtr ii = instructionBuffer.front();
 801
 802 if (status != S_STOPPED && ii->isFlat()) {
 803 return true;
 804 }
 805
 806 return false;
 807}
808
 809bool
// Scan the whole instruction buffer for any control-transfer instruction
// (return, branch, or end-of-kernel). (Signature line dropped by extraction.)
 811{
 812 for (auto it : instructionBuffer) {
 813 GPUDynInstPtr ii = it;
 814 if (ii->isReturn() || ii->isBranch() ||
 815 ii->isEndOfKernel()) {
 816 return true;
 817 }
 818 }
 819
 820 return false;
 821}
822
 823 void
// Sanity-check that none of the in-pipe / outstanding memory request counters
// have gone negative; the condition feeds a fatal/panic-style check.
// NOTE(review): the signature and several lines (doxygen 824-827, 829, 831,
// 838-839 — including the first operands of the condition and the format
// arguments) were dropped by this extraction — confirm against upstream.
 828
 830{
 832 wrLmReqsInPipe < 0 || rdLmReqsInPipe < 0 ||
 833 outstandingReqs < 0,
 834 "Negative requests in pipe for WF%d for slot%d"
 835 " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d,"
 836 " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d,"
 837 " Outstanding Reqs=%d\n",
 840}
841
 842void
// Classify a global/scalar memory instruction (load/store/atomic/sync) and
// reserve the corresponding issue resource; unknown operations panic.
// NOTE(review): the signature and the statement bodies of each branch
// (doxygen lines 843, 847, 849, 851-852, 856, 859, 861, 863-864, 868 —
// presumably counter increments and execUnitId assignment) were dropped by
// this extraction — confirm against upstream.
 844{
 845 if (!ii->isScalar()) {
 846 if (ii->isLoad()) {
 848 } else if (ii->isStore()) {
 850 } else if (ii->isAtomic() || ii->isMemSync()) {
 853 } else {
 854 panic("Invalid memory operation!\n");
 855 }
 857 } else {
 858 if (ii->isLoad()) {
 860 } else if (ii->isStore()) {
 862 } else if (ii->isAtomic() || ii->isMemSync()) {
 865 } else {
 866 panic("Invalid memory operation!\n");
 867 }
 869 }
 870}
871
 872void
// Reserve the local-memory (LDS) issue resource for a vector instruction;
// scalar instructions may not access shared memory.
// NOTE(review): the signature and the branch bodies (doxygen lines 873, 878,
// 880, 882-883, 887) were dropped by this extraction — confirm upstream.
 874{
 875 fatal_if(ii->isScalar(),
 876 "Scalar instructions can not access Shared memory!!!");
 877 if (ii->isLoad()) {
 879 } else if (ii->isStore()) {
 881 } else if (ii->isAtomic() || ii->isMemSync()) {
 884 } else {
 885 panic("Invalid memory operation!\n");
 886 }
 888}
889
// Decide which execution unit(s) the oldest instruction needs and return
// their IDs; flat instructions reserve both a local- and a global-memory
// unit. The returned vector is used by the schedule stage for checking.
// NOTE(review): the signature (doxygen lines 890-891) and several statement
// lines (907, 909, 916, 920-922, 926, 928, 932) were dropped by this
// extraction — confirm against upstream gem5 wavefront.cc.
 892{
 893 // vector of execution unit IDs to return to schedule stage
 894 // this return is only used for debugging and an assertion...
 895 std::vector<int> execUnitIds;
 896
 897 // Get current instruction
 898 GPUDynInstPtr ii = instructionBuffer.front();
 899 assert(ii);
 900
 901 // Single precision ALU or Branch or Return or Special instruction
 902 if (ii->isALU() || ii->isSpecialOp() ||
 903 ii->isBranch() || ii->isNop() ||
 904 (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
 905 ii->isReturn() || ii->isEndOfKernel()) {
 906 if (!ii->isScalar()) {
 908 } else {
 910 }
 911 // this is to enforce a fixed number of cycles per issue slot per SIMD
 912 } else if (ii->isBarrier()) {
 913 execUnitId = ii->isScalar() ? scalarAluGlobalIdx : simdId;
 914 } else if (ii->isFlat()) {
 915 assert(!ii->isScalar());
 917 // add execUnitId, reserved by reserveLmResource, list before it is
 918 // overwriten by reserveGmResource
 919 execUnitIds.push_back(execUnitId);
 923 execUnitIds.push_back(flatGmUnitId);
        // Sentinel: both flat unit IDs are already recorded above.
 924 execUnitId = -1;
 925 } else if (ii->isGlobalMem()) {
 927 } else if (ii->isLocalMem()) {
 929 } else if (ii->isPrivateSeg()) {
 930 fatal_if(ii->isScalar(),
 931 "Scalar instructions can not access Private memory!!!");
 933 } else {
 934 panic("reserveResources -> Couldn't process op!\n");
 935 }
 936
 937 if (execUnitId != -1) {
 938 execUnitIds.push_back(execUnitId);
 939 }
 940 assert(execUnitIds.size());
 941 return execUnitIds;
 942}
943
944void
946{
947 // ---- Exit if wavefront is inactive ----------------------------- //
948
949 if (status == S_STOPPED || status == S_RETURNING ||
950 status==S_STALLED || instructionBuffer.empty()) {
951 return;
952 }
953
954 if (status == S_WAITCNT) {
966 assert(isOldestInstWaitcnt());
967 }
968
969 // Get current instruction
970
971 GPUDynInstPtr ii = instructionBuffer.front();
972
973 const Addr old_pc = pc();
974 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
975 "(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId,
976 wfDynId, ii->disassemble(), old_pc, ii->seqNum());
977 DPRINTF(GPUTrace, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
978 "(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId,
979 wfDynId, ii->disassemble(), old_pc, ii->seqNum());
980
981 ii->execute(ii);
982 // delete the dynamic instruction from the pipeline map
983 computeUnit->deleteFromPipeMap(this);
984 // update the instruction stats in the CU
985 computeUnit->updateInstStats(ii);
986
987 // inform VRF of instruction execution to schedule write-back
988 // and scoreboard ready for registers
989 if (!ii->isScalar()) {
990 computeUnit->rfc[simdId]->waveExecuteInst(this, ii);
991 computeUnit->vrf[simdId]->waveExecuteInst(this, ii);
992 }
993 computeUnit->srf[simdId]->waveExecuteInst(this, ii);
994
995 computeUnit->shader->incVectorInstSrcOperand(ii->numSrcVecRegOperands());
996 computeUnit->shader->incVectorInstDstOperand(ii->numDstVecRegOperands());
997 computeUnit->stats.numInstrExecuted++;
998 stats.numInstrExecuted++;
999 computeUnit->instExecPerSimd[simdId]++;
1000 computeUnit->stats.execRateDist.sample(
1001 computeUnit->stats.totalCycles.value() -
1002 computeUnit->lastExecCycle[simdId]);
1003 computeUnit->lastExecCycle[simdId] =
1004 computeUnit->stats.totalCycles.value();
1005
1006 if (lastInstExec) {
1007 computeUnit->stats.instInterleave[simdId].
1008 sample(computeUnit->instExecPerSimd[simdId] - lastInstExec);
1009 }
1010 lastInstExec = computeUnit->instExecPerSimd[simdId];
1011
1012 // want to track:
1013 // number of reads that occur per value written
1014
1015 // vector RAW dependency tracking
1016 for (const auto& srcVecOp : ii->srcVecRegOperands()) {
1017 for (const auto& virtIdx : srcVecOp.virtIndices()) {
1018 // This check should never fail, but to be safe we check
1019 if (rawDist.find(virtIdx) != rawDist.end()) {
1020 stats.vecRawDistance.sample(stats.numInstrExecuted.value() -
1021 rawDist[virtIdx]);
1022 }
1023 // increment number of reads to this register
1024 vecReads[virtIdx]++;
1025 }
1026 }
1027
1028 for (const auto& dstVecOp : ii->dstVecRegOperands()) {
1029 for (const auto& virtIdx : dstVecOp.virtIndices()) {
1030 // rawDist is set on writes, but will not be set for the first
1031 // write to each physical register
1032 if (rawDist.find(virtIdx) != rawDist.end()) {
1033 // Sample the number of reads that were performed
1034 stats.readsPerWrite.sample(vecReads[virtIdx]);
1035 }
1036 // on a write, reset count of reads to 0
1037 vecReads[virtIdx] = 0;
1038
1039 rawDist[virtIdx] = stats.numInstrExecuted.value();
1040 }
1041 }
1042
1043 if (pc() == old_pc) {
1044 // PC not modified by instruction, proceed to next
1045 _gpuISA.advancePC(ii);
1046 instructionBuffer.pop_front();
1047 } else {
1048 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave%d %s taken branch\n",
1050 ii->disassemble());
1051 discardFetch();
1052 }
1053 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n",
1054 computeUnit->cu_id, simdId, wfSlotId, wfDynId, pc());
1055
1056 if (computeUnit->shader->hsail_mode==Shader::SIMT) {
1057 const int num_active_lanes = execMask().count();
1058 computeUnit->stats.controlFlowDivergenceDist.sample(num_active_lanes);
1059 computeUnit->stats.numVecOpsExecuted += num_active_lanes;
1060
1061 if (ii->isMFMA()) {
1062 computeUnit->stats.numVecOpsExecutedMFMA += num_active_lanes;
1063 if (ii->isI8()) {
1064 computeUnit->stats.numVecOpsExecutedMFMAI8
1065 += num_active_lanes;
1066 }
1067 }
1068
1069 if (ii->isF16() && ii->isALU()) {
1070 if (ii->isF32() || ii->isF64()) {
1071 fatal("Instruction is tagged as both (1) F16, and (2)"
1072 "either F32 or F64.");
1073 }
1074 computeUnit->stats.numVecOpsExecutedF16 += num_active_lanes;
1075 if (ii->isFMA()) {
1076 computeUnit->stats.numVecOpsExecutedFMA16 += num_active_lanes;
1077 computeUnit->stats.numVecOpsExecutedTwoOpFP
1078 += num_active_lanes;
1079 }
1080 else if (ii->isMAC()) {
1081 computeUnit->stats.numVecOpsExecutedMAC16 += num_active_lanes;
1082 computeUnit->stats.numVecOpsExecutedTwoOpFP
1083 += num_active_lanes;
1084 }
1085 else if (ii->isMAD()) {
1086 computeUnit->stats.numVecOpsExecutedMAD16 += num_active_lanes;
1087 computeUnit->stats.numVecOpsExecutedTwoOpFP
1088 += num_active_lanes;
1089 }
1090 else if (ii->isMFMA()) {
1091 computeUnit->stats.numVecOpsExecutedMFMAF16
1092 += num_active_lanes;
1093 }
1094 }
1095 if (ii->isF32() && ii->isALU()) {
1096 if (ii->isF16() || ii->isF64()) {
1097 fatal("Instruction is tagged as both (1) F32, and (2)"
1098 "either F16 or F64.");
1099 }
1100 computeUnit->stats.numVecOpsExecutedF32 += num_active_lanes;
1101 if (ii->isFMA()) {
1102 computeUnit->stats.numVecOpsExecutedFMA32 += num_active_lanes;
1103 computeUnit->stats.numVecOpsExecutedTwoOpFP
1104 += num_active_lanes;
1105 }
1106 else if (ii->isMAC()) {
1107 computeUnit->stats.numVecOpsExecutedMAC32 += num_active_lanes;
1108 computeUnit->stats.numVecOpsExecutedTwoOpFP
1109 += num_active_lanes;
1110 }
1111 else if (ii->isMAD()) {
1112 computeUnit->stats.numVecOpsExecutedMAD32 += num_active_lanes;
1113 computeUnit->stats.numVecOpsExecutedTwoOpFP
1114 += num_active_lanes;
1115 }
1116 else if (ii->isMFMA()) {
1117 computeUnit->stats.numVecOpsExecutedMFMAF32
1118 += num_active_lanes;
1119 }
1120 }
1121 if (ii->isF64() && ii->isALU()) {
1122 if (ii->isF16() || ii->isF32()) {
1123 fatal("Instruction is tagged as both (1) F64, and (2)"
1124 "either F16 or F32.");
1125 }
1126 computeUnit->stats.numVecOpsExecutedF64 += num_active_lanes;
1127 if (ii->isFMA()) {
1128 computeUnit->stats.numVecOpsExecutedFMA64 += num_active_lanes;
1129 computeUnit->stats.numVecOpsExecutedTwoOpFP
1130 += num_active_lanes;
1131 }
1132 else if (ii->isMAC()) {
1133 computeUnit->stats.numVecOpsExecutedMAC64 += num_active_lanes;
1134 computeUnit->stats.numVecOpsExecutedTwoOpFP
1135 += num_active_lanes;
1136 }
1137 else if (ii->isMAD()) {
1138 computeUnit->stats.numVecOpsExecutedMAD64 += num_active_lanes;
1139 computeUnit->stats.numVecOpsExecutedTwoOpFP
1140 += num_active_lanes;
1141 }
1142 else if (ii->isMFMA()) {
1143 computeUnit->stats.numVecOpsExecutedMFMAF64
1144 += num_active_lanes;
1145 }
1146 }
1147 if (isGmInstruction(ii)) {
1148 computeUnit->stats.activeLanesPerGMemInstrDist.sample(
1149 num_active_lanes);
1150 } else if (isLmInstruction(ii)) {
1151 computeUnit->stats.activeLanesPerLMemInstrDist.sample(
1152 num_active_lanes);
1153 }
1154 }
1155
1160 if (execMask().none() && ii->needsToken()) {
1161 computeUnit->getTokenManager()->recvTokens(1);
1162 return;
1163 }
1164
1165 // Update Vector ALU pipeline and other resources
1166 bool flat_as_gm = false;
1167 bool flat_as_lm = false;
1168 if (ii->isFlat()) {
1169 flat_as_gm = (ii->executedAs() == enums::SC_GLOBAL) ||
1170 (ii->executedAs() == enums::SC_PRIVATE);
1171 flat_as_lm = (ii->executedAs() == enums::SC_GROUP);
1172 }
1173
1174 // Single precision ALU or Branch or Return or Special instruction
1175 // Note, we use the same timing regardless of SP or DP ALU operation.
1176 if (ii->isALU() || ii->isSpecialOp() ||
1177 ii->isBranch() || ii->isNop() ||
1178 (ii->isKernArgSeg() && ii->isLoad()) ||
1179 ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) {
1180 // this is to enforce a fixed number of cycles per issue slot per SIMD
1181 if (!ii->isScalar()) {
1182 computeUnit->vectorALUs[simdId].set(computeUnit->
1183 cyclesToTicks(computeUnit->issuePeriod));
1184 } else {
1185 computeUnit->scalarALUs[scalarAlu].set(computeUnit->
1186 cyclesToTicks(computeUnit->issuePeriod));
1187 }
1188 // Barrier on Scalar ALU
1189 } else if (ii->isBarrier()) {
1190 computeUnit->scalarALUs[scalarAlu].set(computeUnit->
1191 cyclesToTicks(computeUnit->issuePeriod));
1192 // GM or Flat as GM Load
1193 } else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) {
1194 if (!ii->isScalar()) {
1195 computeUnit->vrfToGlobalMemPipeBus.set(
1196 computeUnit->cyclesToTicks(computeUnit->vrf_gm_bus_latency));
1197 computeUnit->vectorGlobalMemUnit.
1198 set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
1199 computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
1200 computeUnit->vrf_gm_bus_latency;
1201 } else {
1202 computeUnit->srfToScalarMemPipeBus.set(computeUnit->
1203 cyclesToTicks(computeUnit->srf_scm_bus_latency));
1204 computeUnit->scalarMemUnit.
1205 set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
1206 computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
1207 computeUnit->srf_scm_bus_latency;
1208 }
1209 // GM or Flat as GM Store
1210 } else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) {
1211 if (!ii->isScalar()) {
1212 computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
1213 cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
1214 computeUnit->vectorGlobalMemUnit.
1215 set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
1216 computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
1217 (2 * computeUnit->vrf_gm_bus_latency);
1218 } else {
1219 computeUnit->srfToScalarMemPipeBus.set(computeUnit->
1220 cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
1221 computeUnit->scalarMemUnit.
1222 set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
1223 computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
1224 (2 * computeUnit->srf_scm_bus_latency);
1225 }
1226 } else if ((ii->isAtomic() || ii->isMemSync()) &&
1227 (ii->isGlobalMem() || flat_as_gm)) {
1228 if (!ii->isScalar()) {
1229 computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
1230 cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
1231 computeUnit->vectorGlobalMemUnit.
1232 set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
1233 computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
1234 (2 * computeUnit->vrf_gm_bus_latency);
1235 } else {
1236 computeUnit->srfToScalarMemPipeBus.set(computeUnit->
1237 cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
1238 computeUnit->scalarMemUnit.
1239 set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
1240 computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
1241 (2 * computeUnit->srf_scm_bus_latency);
1242 }
1243 // LM or Flat as LM Load
1244 } else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) {
1245 computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
1246 cyclesToTicks(computeUnit->vrf_lm_bus_latency));
1247 computeUnit->vectorSharedMemUnit.
1248 set(computeUnit->shader->cyclesToTicks(computeUnit->issuePeriod));
1249 computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
1250 computeUnit->vrf_lm_bus_latency;
1251 // LM or Flat as LM Store
1252 } else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
1253 computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
1254 cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
1255 computeUnit->vectorSharedMemUnit.
1256 set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
1257 computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
1258 (2 * computeUnit->vrf_lm_bus_latency);
1259 // LM or Flat as LM, Atomic or MemFence
1260 } else if ((ii->isAtomic() || ii->isMemSync()) &&
1261 (ii->isLocalMem() || flat_as_lm)) {
1262 computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
1263 cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
1264 computeUnit->vectorSharedMemUnit.
1265 set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
1266 computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
1267 (2 * computeUnit->vrf_lm_bus_latency);
1268 } else {
1269 panic("Bad instruction type!\n");
1270 }
1271}
1272
1275{
1276 // Read next instruction from instruction buffer
1277 GPUDynInstPtr ii = instructionBuffer.front();
1278 // if the WF has been dispatched in the schedule stage then
1279 // check the next oldest instruction for readiness
1280 if (computeUnit->pipeMap.find(ii->seqNum()) !=
1281 computeUnit->pipeMap.end()) {
1282 if (instructionBuffer.size() > 1) {
1283 auto it = instructionBuffer.begin() + 1;
1284 return *it;
1285 } else { // No new instructions to check
1286 return nullptr;
1287 }
1288 }
1289 return ii;
1290}
1291
1292void
1294{
1295 instructionBuffer.clear();
1297
1302 computeUnit->fetchStage.fetchUnit(simdId).flushBuf(wfSlotId);
1303}
1304
1305bool
1307{
1308 // Both vmWaitCnt && lgkmWaitCnt uninitialized means
1309 // waitCnt instruction has been dispatched but not executed yet: next
1310 // instruction should be blocked until waitCnt is executed.
1311 if (vmWaitCnt == -1 && expWaitCnt == -1 && lgkmWaitCnt == -1) {
1312 return false;
1313 }
1314
1320 if (vmWaitCnt != -1) {
1321 if (vmemInstsIssued > vmWaitCnt) {
1322 // vmWaitCnt not satisfied
1323 return false;
1324 }
1325 }
1326
1327 if (expWaitCnt != -1) {
1328 if (expInstsIssued > expWaitCnt) {
1329 // expWaitCnt not satisfied
1330 return false;
1331 }
1332 }
1333
1334 if (lgkmWaitCnt != -1) {
1336 // lgkmWaitCnt not satisfied
1337 return false;
1338 }
1339 }
1340
1341 // if we get here all outstanding waitcnts must
1342 // be satisfied, so we resume normal operation
1343 clearWaitCnts();
1344
1345 return true;
1346}
1347
1348bool
1350{
1351 assert(status == S_STALLED_SLEEP);
1352
1353 // if the sleep count has not been set, then the sleep instruction has not
1354 // been executed yet, so we will return true without setting the wavefront
1355 // status
1356 if (sleepCnt == 0)
1357 return false;
1358
1359 sleepCnt--;
1360 if (sleepCnt != 0)
1361 return false;
1362
1363 status = S_RUNNING;
1364 return true;
1365}
1366
1367void
1369{
1370 assert(sleepCnt == 0);
1371 sleepCnt = sleep_time;
1372}
1373
void
Wavefront::setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt)
{
    // the scoreboard should have set the status
    // to S_WAITCNT once a waitcnt instruction
    // was marked as ready
    assert(status == S_WAITCNT);

    // waitcnt instruction shouldn't be sending
    // negative counts
    assert(vm_wait_cnt >= 0);
    assert(exp_wait_cnt >= 0);
    assert(lgkm_wait_cnt >= 0);
    // each count is limited by the width of its encoding field in the
    // waitcnt instruction: vm has 4 bits (max 0xf), exp has 3 bits
    // (max 0x7), and lgkm has 5 bits (max 0x1f)
    assert(vm_wait_cnt <= 0xf);
    assert(exp_wait_cnt <= 0x7);
    assert(lgkm_wait_cnt <= 0x1f);

    // a new waitcnt must not arrive while a previous one is still
    // pending; the counters are reset to -1 (invalid) by
    // clearWaitCnts() once all outstanding counts are satisfied
    assert(vmWaitCnt == -1);
    assert(expWaitCnt == -1);
    assert(lgkmWaitCnt == -1);

    // a count equal to the field's maximum encodable value means
    // "do not wait on this counter," so the corresponding member is
    // left at -1 and ignored by waitCntsSatisfied()
    if (vm_wait_cnt != 0xf)
        vmWaitCnt = vm_wait_cnt;

    if (exp_wait_cnt != 0x7)
        expWaitCnt = exp_wait_cnt;

    if (lgkm_wait_cnt != 0x1f)
        lgkmWaitCnt = lgkm_wait_cnt;
}
1418
1419void
1421{
1422 // reset the waitcnts back to
1423 // -1, indicating they are no
1424 // longer valid
1425 vmWaitCnt = -1;
1426 expWaitCnt = -1;
1427 lgkmWaitCnt = -1;
1428
1429 // resume running normally
1430 status = S_RUNNING;
1431}
1432
1433void
1438
1439void
1444
1445void
1450
1451void
1456
1457void
1462
1463void
1468
1469void
1471{
1472 if (!computeUnit->shader->getProgressInterval()) {
1473 return;
1474 }
1475
1476 assert(!vmemIssued.count(gpu_dyn_inst->seqNum()));
1477 vmemIssued.insert(gpu_dyn_inst->seqNum());
1478 trackInst(gpu_dyn_inst);
1479}
1480
1481void
1483{
1484 if (!computeUnit->shader->getProgressInterval()) {
1485 return;
1486 }
1487
1488 assert(!lgkmIssued.count(gpu_dyn_inst->seqNum()));
1489 lgkmIssued.insert(gpu_dyn_inst->seqNum());
1490 trackInst(gpu_dyn_inst);
1491}
1492
1493void
1495{
1496 if (!computeUnit->shader->getProgressInterval()) {
1497 return;
1498 }
1499
1500 assert(!expIssued.count(gpu_dyn_inst->seqNum()));
1501 expIssued.insert(gpu_dyn_inst->seqNum());
1502 trackInst(gpu_dyn_inst);
1503}
1504
1505void
1507{
1508 if (!computeUnit->shader->getProgressInterval()) {
1509 return;
1510 }
1511
1512 cntInsts.insert({gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble()});
1513}
1514
1515void
1517{
1518 if (!computeUnit->shader->getProgressInterval()) {
1519 return;
1520 }
1521
1522 warn_if(!vmemIssued.count(gpu_dyn_inst->seqNum()),
1523 "%d not in VMEM issued!\n", gpu_dyn_inst->seqNum());
1524 vmemIssued.erase(gpu_dyn_inst->seqNum());
1525 untrackInst(gpu_dyn_inst->seqNum());
1526}
1527
1528void
1530{
1531 if (!computeUnit->shader->getProgressInterval()) {
1532 return;
1533 }
1534
1535 warn_if(!lgkmIssued.count(gpu_dyn_inst->seqNum()),
1536 "%d not in LGKM issued!\n", gpu_dyn_inst->seqNum());
1537 lgkmIssued.erase(gpu_dyn_inst->seqNum());
1538 untrackInst(gpu_dyn_inst->seqNum());
1539}
1540
1541void
1543{
1544 if (!computeUnit->shader->getProgressInterval()) {
1545 return;
1546 }
1547
1548 warn_if(!expIssued.count(gpu_dyn_inst->seqNum()),
1549 "%d not in EXP issued!\n", gpu_dyn_inst->seqNum());
1550 expIssued.erase(gpu_dyn_inst->seqNum());
1551 untrackInst(gpu_dyn_inst->seqNum());
1552}
1553
1554void
1556{
1557 if (!computeUnit->shader->getProgressInterval()) {
1558 return;
1559 }
1560
1561 if (!vmemIssued.count(seqNum) &&
1562 !lgkmIssued.count(seqNum) &&
1563 !expIssued.count(seqNum)) {
1564 cntInsts.erase(seqNum);
1565 }
1566}
1567
1568Addr
1570{
1571 return _pc;
1572}
1573
1574void
1576{
1577 _pc = new_pc;
1578}
1579
1582{
1583 return _execMask;
1584}
1585
bool
Wavefront::execMask(int lane) const
{
    // Return whether the given SIMD lane is enabled in this
    // wavefront's current execute mask.
    return _execMask[lane];
}
1591
1592void
1594{
1595 /* clear busy registers */
1596 for (int i=0; i < maxVgprs; i++) {
1597 int vgprIdx = computeUnit->registerManager->mapVgpr(this, i);
1598 computeUnit->vrf[simdId]->markReg(vgprIdx, false);
1599 }
1600
1601 /* Free registers used by this wavefront */
1602 uint32_t endIndex = (startVgprIndex + reservedVectorRegs - 1) %
1603 computeUnit->vrf[simdId]->numRegs();
1604 computeUnit->registerManager->vrfPoolMgrs[simdId]->
1605 freeRegion(startVgprIndex, endIndex);
1606}
1607
1608void
1610{
1611 actualWgSzTotal = 1;
1612 for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
1613 actualWgSz[d] = std::min(workGroupSz[d], gridSz[d]
1614 - task->wgId(d) * workGroupSz[d]);
1616 }
1617}
1618
1619void
1621{
1622 assert(bar_id >= WFBarrier::InvalidID);
1623 assert(bar_id < computeUnit->numBarrierSlots());
1624 barId = bar_id;
1625}
1626
1627int
1629{
1630 return barId;
1631}
1632
1633bool
1635{
1636 return barId > WFBarrier::InvalidID;
1637}
1638
1639void
1644
1645std::string
1647{
1648 switch (status) {
1649 case S_STOPPED: return "S_STOPPED";
1650 case S_RETURNING: return "S_RETURNING";
1651 case S_RUNNING: return "S_RUNNING";
1652 case S_STALLED: return "S_STALLED";
1653 case S_STALLED_SLEEP: return "S_STALLED_SLEEP";
1654 case S_WAITCNT: return "S_WAITCNT";
1655 case S_BARRIER: return "S_BARRIER";
1656 default: break;
1657 }
1658
1659 return "Unknown";
1660}
1661
1662void
1664{
1665 std::cout << "wave[" << wfDynId << "] status: "
1666 << statusToString(getStatus()) << " last inst: "
1667 << lastInstDisasm << " waitcnts: vmem: " << vmemInstsIssued
1668 << "/" << vmWaitCnt << "(";
1669 for (auto &elem : vmemIssued) {
1670 std::cout << elem << ' ';
1671 }
1672 std::cout << ") exp: " << expInstsIssued << "/"
1673 << expWaitCnt << "(";
1674 for (auto &elem : expIssued) {
1675 std::cout << elem << ' ';
1676 }
1677
1678 std::cout << ") lgkm: " << lgkmInstsIssued << " / "
1679 << lgkmWaitCnt << "(";
1680 for (auto &elem : lgkmIssued) {
1681 std::cout << elem << ' ';
1682 }
1683 std::cout << ") last ready status: " << lastInstRdyStatus
1684 << " status VRF/SRF: " << lastVrfStatus << "/" << lastSrfStatus
1685 << " wait insts:\n";
1686
1687 for (auto &elem : vmemIssued) {
1688 std::cout << "\t" << cntInsts[elem] << "\n";
1689 }
1690 for (auto &elem : lgkmIssued) {
1691 std::cout << "\t" << cntInsts[elem] << "\n";
1692 }
1693 for (auto &elem : expIssued) {
1694 std::cout << "\t" << cntInsts[elem] << "\n";
1695 }
1696}
1697
1698void
1699Wavefront::setMfmaAScale(int idx, uint8_t value)
1700{
1701 assert(idx < VegaISA::NumVecElemPerVecReg);
1702 mfmaAScale[idx] = value;
1703}
1704
1705void
1706Wavefront::setMfmaBScale(int idx, uint8_t value)
1707{
1708 assert(idx < VegaISA::NumVecElemPerVecReg);
1709 mfmaBScale[idx] = value;
1710}
1711
1712uint8_t
1714{
1715 assert(idx < VegaISA::NumVecElemPerVecReg);
1716 uint8_t rv = mfmaAScale[idx];
1717 mfmaAScale[idx] = 0;
1718
1719 return rv;
1720}
1721
1722uint8_t
1724{
1725 assert(idx < VegaISA::NumVecElemPerVecReg);
1726 uint8_t rv = mfmaBScale[idx];
1727 mfmaBScale[idx] = 0;
1728
1729 return rv;
1730}
1731
1733 : statistics::Group(parent),
1735 "number of instructions executed by this WF slot"),
1736 ADD_STAT(schCycles, "number of cycles spent in schedule stage"),
1737 ADD_STAT(schStalls, "number of cycles WF is stalled in SCH stage"),
1738 ADD_STAT(schRfAccessStalls, "number of cycles wave selected in SCH but "
1739 "RF denied adding instruction"),
1740 ADD_STAT(schResourceStalls, "number of cycles stalled in sch by resource"
1741 " not available"),
1742 ADD_STAT(schOpdNrdyStalls, "number of cycles stalled in sch waiting for "
1743 "RF reads to complete"),
1745 "number of cycles wave stalled due to LDS-VRF arbitration"),
1746 // FIXME: the name of the WF needs to be unique
1747 ADD_STAT(numTimesBlockedDueWAXDependencies, "number of times the wf's "
1748 "instructions are blocked due to WAW or WAR dependencies"),
1749 // FIXME: the name of the WF needs to be unique
1750 ADD_STAT(numTimesBlockedDueRAWDependencies, "number of times the wf's "
1751 "instructions are blocked due to RAW dependencies"),
1753 "Count of RAW distance in dynamic instructions for this WF"),
1754 ADD_STAT(readsPerWrite, "Count of Vector reads per write for this WF")
1755{
1756 vecRawDistance.init(0, 20, 1);
1757 readsPerWrite.init(0, 4, 1);
1758}
1759
1760} // namespace gem5
#define DPRINTF(x,...)
Definition trace.hh:209
Cycles is a wrapper class for representing cycle counts, i.e.
Definition types.hh:79
_amd_queue_t amdQueue
Keep a copy of the AMD HSA queue because we need info from some of its fields to initialize register ...
bool sgprBitEnabled(int bit) const
int wgId(int dim) const
const GfxVersion & gfxVersion() const
void preloadLength(unsigned val)
Addr hostDispPktAddr() const
static const int MAX_DIM
Addr hostAMDQueueAddr
Host-side addr of the amd_queue_t on which this task was queued.
bool vgprBitEnabled(int bit) const
unsigned accumOffset() const
WF barrier slots.
static const int InvalidID
uint32_t maxSgprs
Definition wavefront.hh:136
status_e status
Definition wavefront.hh:369
bool isOldestInstWaitcnt()
Definition wavefront.cc:685
Addr pc() const
bool hasBarrier() const
VectorMask _execMask
Definition wavefront.hh:371
uint32_t actualWgSzTotal
Definition wavefront.hh:170
InstSeqNum lastInstSeqNum
Definition wavefront.hh:327
void reserveGmResource(GPUDynInstPtr ii)
Definition wavefront.cc:843
uint64_t oldVgprTcnt
Definition wavefront.hh:218
std::vector< Addr > lastAddr
Definition wavefront.hh:159
std::set< InstSeqNum > expIssued
Definition wavefront.hh:293
void setStatus(status_e newStatus)
Definition wavefront.cc:601
void untrackInst(InstSeqNum seqNum)
bool waitCntsSatisfied()
std::array< uint8_t, VegaISA::NumVecElemPerVecReg > mfmaAScale
Definition wavefront.hh:377
void validateRequestCounters()
Definition wavefront.cc:829
uint8_t getMfmaAScale(int idx)
void trackInst(GPUDynInstPtr gpu_dyn_inst)
void trackVMemInst(GPUDynInstPtr gpu_dyn_inst)
const int simdId
Definition wavefront.hh:102
bool isOldestInstLMem()
Definition wavefront.cc:771
bool isOldestInstPrivMem()
Definition wavefront.cc:784
bool isOldestInstScalarMem()
Definition wavefront.cc:758
uint64_t oldDgprTcnt
Definition wavefront.hh:225
Wavefront(const Params &p)
Definition wavefront.cc:50
uint8_t getMfmaBScale(int idx)
bool isOldestInstBarrier()
Definition wavefront.cc:732
void resizeRegFiles(int num_vregs, int num_sregs)
Definition wavefront.cc:590
int scalarOutstandingReqsWrGm
Definition wavefront.hh:189
uint32_t gridSz[3]
Definition wavefront.hh:165
void decExpInstsIssued()
std::array< uint8_t, VegaISA::NumVecElemPerVecReg > mfmaBScale
Definition wavefront.hh:378
std::set< InstSeqNum > lgkmIssued
Definition wavefront.hh:292
std::vector< uint32_t > oldVgpr
Definition wavefront.hh:214
void initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
Definition wavefront.cc:122
void setSleepTime(int sleep_time)
ComputeUnit * computeUnit
Definition wavefront.hh:109
std::vector< uint32_t > workItemFlatId
Definition wavefront.hh:161
int vmWaitCnt
the following are used for waitcnt instructions vmWaitCnt: once set, we wait for the oustanding numbe...
Definition wavefront.hh:362
std::vector< int > vecReads
Definition wavefront.hh:246
std::deque< GPUDynInstPtr > instructionBuffer
Definition wavefront.hh:112
bool isOldestInstSleep()
Definition wavefront.cc:671
uint32_t accumOffset
Definition wavefront.hh:138
bool isLmInstruction(GPUDynInstPtr ii)
Definition wavefront.cc:660
GPUDynInstPtr nextInstr()
uint64_t lastTrace
Definition wavefront.hh:198
std::vector< uint32_t > workItemId[3]
Definition wavefront.hh:160
std::vector< uint64_t > oldDgpr
Definition wavefront.hh:221
bool isOldestInstScalarALU()
Definition wavefront.cc:702
void untrackExpInst(GPUDynInstPtr gpu_dyn_inst)
void releaseBarrier()
bool isOldestInstFlatMem()
Definition wavefront.cc:797
status_e getStatus()
Definition wavefront.hh:142
WavefrontParams Params
Definition wavefront.hh:253
uint32_t maxVgprs
Definition wavefront.hh:134
void decVMemInstsIssued()
void computeActualWgSz(HSAQueueEntry *task)
std::string lastInstDisasm
Definition wavefront.hh:328
uint32_t workGroupId[3]
Definition wavefront.hh:163
void setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt)
const int wfSlotId
Definition wavefront.hh:99
std::unordered_map< int, uint64_t > rawDist
Definition wavefront.hh:242
void incExpInstsIssued()
void untrackLGKMInst(GPUDynInstPtr gpu_dyn_inst)
std::vector< int > reserveResources()
Definition wavefront.cc:891
uint32_t startSgprIndex
Definition wavefront.hh:208
GfxVersion gfxVersion
Definition wavefront.hh:97
void decLGKMInstsIssued()
void incLGKMInstsIssued()
int barrierId() const
virtual void init()
init() is called after all C++ SimObjects have been created and all ports are connected.
Definition wavefront.cc:107
uint32_t workGroupSz[3]
Definition wavefront.hh:164
void untrackVMemInst(GPUDynInstPtr gpu_dyn_inst)
void trackExpInst(GPUDynInstPtr gpu_dyn_inst)
bool isOldestInstVectorALU()
Definition wavefront.cc:717
uint64_t lastInstExec
Definition wavefront.hh:238
LdsChunk * ldsChunk
Definition wavefront.hh:232
std::unordered_map< InstSeqNum, std::string > cntInsts
Definition wavefront.hh:294
uint32_t actualWgSz[3]
Definition wavefront.hh:169
Addr archFlatScratchAddr
Definition wavefront.hh:211
std::set< InstSeqNum > vmemIssued
Definition wavefront.hh:291
void setMfmaBScale(int idx, uint8_t value)
void trackLGKMInst(GPUDynInstPtr gpu_dyn_inst)
int scalarOutstandingReqsRdGm
Definition wavefront.hh:187
void freeResources()
Definition wavefront.cc:824
void incVMemInstsIssued()
std::string statusToString(status_e status)
void reserveLmResource(GPUDynInstPtr ii)
Definition wavefront.cc:873
std::string lastInstRdyStatus
Definition wavefront.hh:329
@ S_BARRIER
WF is stalled at a barrier.
Definition wavefront.hh:93
@ S_WAITCNT
wavefront has unsatisfied wait counts
Definition wavefront.hh:89
bool isOldestInstGMem()
Definition wavefront.cc:745
gem5::Wavefront::WavefrontStats stats
VectorMask & execMask()
void setMfmaAScale(int idx, uint8_t value)
uint64_t wfDynId
Definition wavefront.hh:235
void freeRegisterFile()
Freeing VRF space.
bool isGmInstruction(GPUDynInstPtr ii)
Definition wavefront.cc:649
uint32_t startVgprIndex
Definition wavefront.hh:205
void start(uint64_t _wfDynId, uint64_t _base_ptr)
Definition wavefront.cc:638
TheGpuISA::GPUISA _gpuISA
Definition wavefront.hh:341
Statistics container.
Definition group.hh:93
STL vector class.
Definition stl.hh:37
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
Definition group.hh:75
static constexpr T divCeil(const T &a, const U &b)
Definition intmath.hh:110
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
Definition bitfield.hh:79
#define panic(...)
This implements a cprintf based panic() function.
Definition logging.hh:220
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
Definition logging.hh:268
#define fatal(...)
This implements a cprintf based fatal() function.
Definition logging.hh:232
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Definition logging.hh:246
SimObject(const Params &p)
Definition sim_object.cc:58
#define warn_if(cond,...)
Conditional warning macro that checks the supplied condition and only prints a warning if the conditi...
Definition logging.hh:315
Bitfield< 7 > i
Definition misc_types.hh:67
Bitfield< 12, 11 > set
Bitfield< 9 > d
Definition misc_types.hh:64
Bitfield< 0 > p
const int NumVecElemPerVecReg(64)
const FlagsType none
Nothing extra to print.
Definition info.hh:53
Copyright (c) 2024 Arm Limited All rights reserved.
Definition binary32.hh:36
static void init_pc(py::module_ &m_native)
Definition core.cc:168
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition misc.hh:49
Tick curTick()
The universal simulation clock.
Definition cur_tick.hh:46
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition types.hh:147
std::bitset< std::numeric_limits< unsigned long long >::digits > VectorMask
Definition misc.hh:48
@ WorkgroupIdX
@ DispatchId
@ NumScalarInitFields
@ DispatchPtr
@ QueuePtr
@ PrivSegWaveByteOffset
@ PrivateSegBuf
@ WorkgroupIdY
@ PrivateSegSize
@ WorkgroupInfo
@ WorkgroupIdZ
@ FlatScratchInit
@ KernargPreload
@ KernargSegPtr
@ WorkitemIdX
@ WorkitemIdZ
@ NumVectorInitFields
@ WorkitemIdY
uint64_t InstSeqNum
Definition inst_seq.hh:40
statistics::Scalar numTimesBlockedDueRAWDependencies
Definition wavefront.hh:415
statistics::Scalar schResourceStalls
Definition wavefront.hh:402
WavefrontStats(statistics::Group *parent)
statistics::Distribution vecRawDistance
Definition wavefront.hh:419
statistics::Distribution readsPerWrite
Definition wavefront.hh:423
statistics::Scalar schCycles
Definition wavefront.hh:390
statistics::Scalar numTimesBlockedDueWAXDependencies
Definition wavefront.hh:412
statistics::Scalar schRfAccessStalls
Definition wavefront.hh:400
statistics::Scalar schOpdNrdyStalls
Definition wavefront.hh:404
statistics::Scalar numInstrExecuted
Definition wavefront.hh:387
statistics::Scalar schStalls
Definition wavefront.hh:393
statistics::Scalar schLdsArbStalls
Definition wavefront.hh:408
uint32_t scratch_workitem_byte_size
Definition hsa_queue.hh:84
uint32_t compute_tmpring_size_wavesize
Definition hsa_queue.hh:79
uint64_t scratch_backing_memory_location
Definition hsa_queue.hh:82
uint32_t scratch_resource_descriptor[4]
Definition hsa_queue.hh:81

Generated on Mon Oct 27 2025 04:13:02 for gem5 by doxygen 1.14.0