gem5 [DEVELOP-FOR-25.0]
Loading...
Searching...
No Matches
wavefront.cc
Go to the documentation of this file.
1/*
2 * Copyright (c) 2011-2017 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
33
34#include "base/bitfield.hh"
35#include "debug/GPUExec.hh"
36#include "debug/GPUInitAbi.hh"
37#include "debug/GPUTrace.hh"
38#include "debug/WavefrontStack.hh"
43#include "gpu-compute/shader.hh"
46
47namespace gem5
48{
49
// Wavefront constructor: establishes default per-wave bookkeeping state.
// NOTE(review): this listing was extracted from generated documentation;
// the constructor signature line and several initializer-list/body lines
// are missing (the fused source-line numbers jump, e.g. 58 -> 78).
// Confirm details against the complete wavefront.cc.
51 : SimObject(p), wfSlotId(p.wf_slot_id), simdId(p.simdId),
52 maxIbSize(p.max_ib_size), _gpuISA(*this),
53 vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
55 sleepCnt(0), barId(WFBarrier::InvalidID), stats(this)
56{
// No execution unit assigned yet; no trace emitted yet.
57 lastTrace = 0;
58 execUnitId = -1;
78 ldsChunk = nullptr;
79
80 memTraceBusy = 0;
// All-ones sentinel values -- presumably "no previous register ticket";
// TODO(review): confirm the meaning of these counters in wavefront.hh.
81 oldVgprTcnt = 0xffffffffffffffffll;
82 oldDgprTcnt = 0xffffffffffffffffll;
83 oldVgpr.resize(p.wf_size);
84
85 pendingFetch = false;
86 dropFetch = false;
// Register-file limits are set later via resizeRegFiles().
87 maxVgprs = 0;
88 maxSgprs = 0;
89
// Per-lane vectors are sized to the configured wavefront width.
90 lastAddr.resize(p.wf_size);
91 workItemFlatId.resize(p.wf_size);
92 oldDgpr.resize(p.wf_size);
93 for (int i = 0; i < 3; ++i) {
94 workItemId[i].resize(p.wf_size);
95 }
96
// Start with every lane enabled in the execution mask.
97 _execMask.set();
98 rawDist.clear();
99 lastInstExec = 0;
100 vecReads.clear();
101
102 lastInstSeqNum = 0;
103 lastInstDisasm = "none";
104}
105
106void
// NOTE(review): the function signature line (source line 107) is missing
// from this extract; given the mapWaveTo* calls below this is presumably
// Wavefront::init() -- confirm against the complete wavefront.cc.
108{
// Reset the base indices into this wave's physical register allocations.
111 startVgprIndex = 0;
112 startSgprIndex = 0;
113
// Cache the execution-resource IDs this wave is mapped to on its CU so
// they need not be recomputed on every issue.
114 scalarAlu = computeUnit->mapWaveToScalarAlu(this);
115 scalarAluGlobalIdx = computeUnit->mapWaveToScalarAluGlobalIdx(this);
116 globalMem = computeUnit->mapWaveToGlobalMem(this);
117 localMem = computeUnit->mapWaveToLocalMem(this);
118 scalarMem = computeUnit->mapWaveToScalarMem(this);
119}
120
121void
122Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
123{
// Writes the initial SGPR and VGPR state mandated by the AMDGPU HSA ABI
// for this wavefront, driven by the per-field enable bits in the HSA task.
// NOTE(review): this listing came from generated documentation and a
// number of source lines are missing (the fused source-line numbers jump);
// in several srf write() calls below the value-operand line was dropped.
// Confirm details against the complete wavefront.cc before relying on them.
124 int regInitIdx = 0;
125 gfxVersion = task->gfxVersion();
126
127 // Iterate over all the init fields and check which
128 // bits are enabled. Useful information can be found here:
129 // https://github.com/ROCm-Developer-Tools/ROCm-ComputeABI-Doc/
130 // blob/master/AMDGPU-ABI.md
131 for (int en_bit = 0; en_bit < NumScalarInitFields; ++en_bit) {
132
133 if (task->sgprBitEnabled(en_bit)) {
134 int physSgprIdx = 0;
135 uint32_t firstWave = 0;
136 int orderedAppendTerm = 0;
137 int numWfsInWg = 0;
138 uint32_t finalValue = 0;
139 Addr host_disp_pkt_addr = task->hostDispPktAddr();
140 Addr kernarg_addr = task->kernargAddr();
141 Addr hidden_priv_base(0);
142
// Each enabled ABI field consumes one or more consecutive virtual SGPRs;
// regInitIdx tracks the next virtual SGPR to map.
143 switch (en_bit) {
144 case PrivateSegBuf:
// Four consecutive SGPRs form the private-segment buffer descriptor.
// NOTE(review): each write() below lost its value-operand line in this
// extract (source lines 148/159/170/181 missing).
145 physSgprIdx =
146 computeUnit->registerManager->mapSgpr(this, regInitIdx);
147 computeUnit->srf[simdId]->write(physSgprIdx,
149 ++regInitIdx;
150 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
151 "Setting PrivateSegBuffer: s[%d] = %x\n",
152 computeUnit->cu_id, simdId,
153 wfSlotId, wfDynId, physSgprIdx,
155
156 physSgprIdx =
157 computeUnit->registerManager->mapSgpr(this, regInitIdx);
158 computeUnit->srf[simdId]->write(physSgprIdx,
160 ++regInitIdx;
161 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
162 "Setting PrivateSegBuffer: s[%d] = %x\n",
163 computeUnit->cu_id, simdId,
164 wfSlotId, wfDynId, physSgprIdx,
166
167 physSgprIdx =
168 computeUnit->registerManager->mapSgpr(this, regInitIdx);
169 computeUnit->srf[simdId]->write(physSgprIdx,
171 ++regInitIdx;
172 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
173 "Setting PrivateSegBuffer: s[%d] = %x\n",
174 computeUnit->cu_id, simdId,
175 wfSlotId, wfDynId, physSgprIdx,
177
178 physSgprIdx =
179 computeUnit->registerManager->mapSgpr(this, regInitIdx);
180 computeUnit->srf[simdId]->write(physSgprIdx,
182
183 ++regInitIdx;
184 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
185 "Setting PrivateSegBuffer: s[%d] = %x\n",
186 computeUnit->cu_id, simdId,
187 wfSlotId, wfDynId, physSgprIdx,
189 break;
190 case DispatchPtr:
// 64-bit host dispatch-packet address split low/high across two SGPRs.
191 physSgprIdx =
192 computeUnit->registerManager->mapSgpr(this, regInitIdx);
193 computeUnit->srf[simdId]->write(physSgprIdx,
194 bits(host_disp_pkt_addr, 31, 0));
195 ++regInitIdx;
196 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
197 "Setting DispatchPtr: s[%d] = %x\n",
198 computeUnit->cu_id, simdId,
199 wfSlotId, wfDynId, physSgprIdx,
200 bits(host_disp_pkt_addr, 31, 0));
201
202 physSgprIdx =
203 computeUnit->registerManager->mapSgpr(this, regInitIdx);
204 computeUnit->srf[simdId]->write(physSgprIdx,
205 bits(host_disp_pkt_addr, 63, 32));
206 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
207 "Setting DispatchPtr: s[%d] = %x\n",
208 computeUnit->cu_id, simdId,
209 wfSlotId, wfDynId, physSgprIdx,
210 bits(host_disp_pkt_addr, 63, 32));
211
212 ++regInitIdx;
213 break;
214 case QueuePtr:
// 64-bit host AMD queue address split low/high across two SGPRs.
215 physSgprIdx =
216 computeUnit->registerManager->mapSgpr(this, regInitIdx);
217 computeUnit->srf[simdId]->write(physSgprIdx,
218 bits(task->hostAMDQueueAddr, 31, 0));
219 ++regInitIdx;
220 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
221 "Setting QueuePtr: s[%d] = %x\n",
222 computeUnit->cu_id, simdId,
223 wfSlotId, wfDynId, physSgprIdx,
224 bits(task->hostAMDQueueAddr, 31, 0));
225
226 physSgprIdx =
227 computeUnit->registerManager->mapSgpr(this, regInitIdx);
228 computeUnit->srf[simdId]->write(physSgprIdx,
229 bits(task->hostAMDQueueAddr, 63, 32));
230 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
231 "Setting QueuePtr: s[%d] = %x\n",
232 computeUnit->cu_id, simdId,
233 wfSlotId, wfDynId, physSgprIdx,
234 bits(task->hostAMDQueueAddr, 63, 32));
235
236 ++regInitIdx;
237 break;
238 case KernargSegPtr:
// 64-bit kernel-argument base address split low/high across two SGPRs.
239 physSgprIdx =
240 computeUnit->registerManager->mapSgpr(this, regInitIdx);
241 computeUnit->srf[simdId]->write(physSgprIdx,
242 bits(kernarg_addr, 31, 0));
243 ++regInitIdx;
244 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
245 "Setting KernargSegPtr: s[%d] = %x\n",
246 computeUnit->cu_id, simdId,
247 wfSlotId, wfDynId, physSgprIdx,
248 bits(kernarg_addr, 31, 0));
249
250 physSgprIdx =
251 computeUnit->registerManager->mapSgpr(this, regInitIdx);
252 computeUnit->srf[simdId]->write(physSgprIdx,
253 bits(kernarg_addr, 63, 32));
254 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
255 "Setting KernargSegPtr: s[%d] = %x\n",
256 computeUnit->cu_id, simdId,
257 wfSlotId, wfDynId, physSgprIdx,
258 bits(kernarg_addr, 63, 32));
259
260 ++regInitIdx;
261 break;
262 case DispatchId:
263 physSgprIdx
264 = computeUnit->registerManager->mapSgpr(this, regInitIdx);
265 computeUnit->srf[simdId]->write(physSgprIdx,
266 task->dispatchId());
267 ++regInitIdx;
268 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
269 "Setting DispatchId: s[%d] = %x\n",
270 computeUnit->cu_id, simdId,
271 wfSlotId, wfDynId, physSgprIdx,
272 task->dispatchId());
273
274 // Dispatch ID in gem5 is an int. Set upper 32-bits to zero.
275 physSgprIdx
276 = computeUnit->registerManager->mapSgpr(this, regInitIdx);
277 computeUnit->srf[simdId]->write(physSgprIdx, 0);
278 ++regInitIdx;
279 break;
280 case FlatScratchInit:
// Low 32 bits of the scratch backing-memory location.
281 physSgprIdx
282 = computeUnit->registerManager->mapSgpr(this, regInitIdx);
283 computeUnit->srf[simdId]->write(physSgprIdx,
284 (TheGpuISA::ScalarRegU32)(task->amdQueue
285 .scratch_backing_memory_location & 0xffffffff));
286 ++regInitIdx;
287 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
288 "Setting FlatScratch Addr: s[%d] = %x\n",
289 computeUnit->cu_id, simdId,
290 wfSlotId, wfDynId, physSgprIdx,
291 (TheGpuISA::ScalarRegU32)(task->amdQueue
292 .scratch_backing_memory_location & 0xffffffff));
293
294 physSgprIdx =
295 computeUnit->registerManager->mapSgpr(this, regInitIdx);
296 // This vallue should be sizeof(DWORD) aligned, that is
297 // 4 byte aligned
298 computeUnit->srf[simdId]->write(physSgprIdx,
300 ++regInitIdx;
301 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
302 "Setting FlatScratch size: s[%d] = %x\n",
303 computeUnit->cu_id, simdId,
304 wfSlotId, wfDynId, physSgprIdx,
// NOTE(review): source lines 305-328 are missing from this extract (the
// remainder of the FlatScratch-size write/DPRINTF and related handling).
329 hidden_priv_base =
330 (uint64_t)task->amdQueue.scratch_resource_descriptor[0] |
331 (((uint64_t)task->amdQueue.scratch_resource_descriptor[1]
332 & 0x000000000000ffff) << 32);
333 computeUnit->shader->initShHiddenPrivateBase(
334 hidden_priv_base,
336 break;
337 case PrivateSegSize:
338 physSgprIdx
339 = computeUnit->registerManager->mapSgpr(this, regInitIdx);
340 computeUnit->srf[simdId]->write(physSgprIdx,
341 task->privMemPerItem());
342 ++regInitIdx;
343 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
344 "Setting private segment size: s[%d] = %x\n",
345 computeUnit->cu_id, simdId,
346 wfSlotId, wfDynId, physSgprIdx,
347 task->privMemPerItem());
348 break;
349 case KernargPreload:
// Copy preloaded kernel arguments into consecutive user SGPRs.
350 DPRINTF(GPUInitAbi, "Preload %d user SGPRs starting at virtual"
351 " SGPR s[%d]\n", task->preloadLength(), regInitIdx);
352
353 for (int idx = 0; idx < task->preloadLength(); ++idx) {
354 uint32_t finalValue = task->preloadArgs()[idx];
355 physSgprIdx =
356 computeUnit->registerManager->mapSgpr(this,
357 regInitIdx);
358
359 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] Setting "
360 "s[%d] = %x\n", computeUnit->cu_id, simdId,
361 wfSlotId, wfDynId, physSgprIdx, finalValue);
362
363 computeUnit->srf[simdId]->write(physSgprIdx, finalValue);
364 ++regInitIdx;
365 }
366 break;
367 case WorkgroupIdX:
368 physSgprIdx =
369 computeUnit->registerManager->mapSgpr(this, regInitIdx);
370 computeUnit->srf[simdId]->write(physSgprIdx,
371 workGroupId[0]);
372
373 ++regInitIdx;
374 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
375 "Setting WG ID X: s[%d] = %x\n",
376 computeUnit->cu_id, simdId,
377 wfSlotId, wfDynId, physSgprIdx, workGroupId[0]);
378 break;
379 case WorkgroupIdY:
380 physSgprIdx =
381 computeUnit->registerManager->mapSgpr(this, regInitIdx);
382 computeUnit->srf[simdId]->write(physSgprIdx,
383 workGroupId[1]);
384
385 ++regInitIdx;
386 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
387 "Setting WG ID Y: s[%d] = %x\n",
388 computeUnit->cu_id, simdId,
389 wfSlotId, wfDynId, physSgprIdx, workGroupId[1]);
390 break;
391 case WorkgroupIdZ:
392 physSgprIdx =
393 computeUnit->registerManager->mapSgpr(this, regInitIdx);
394 computeUnit->srf[simdId]->write(physSgprIdx,
395 workGroupId[2]);
396
397 ++regInitIdx;
398 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
399 "Setting WG ID Z: s[%d] = %x\n",
400 computeUnit->cu_id, simdId,
401 wfSlotId, wfDynId, physSgprIdx, workGroupId[2]);
402 break;
// NOTE(review): the case label for this arm (source line 403, presumably
// PrivateSegWaveByteOffset) is missing from this extract -- confirm.
404
405 // For architected flat scratch, this enable is reused to set
406 // the FLAT_SCRATCH register pair to the scratch backing
407 // memory: https://llvm.org/docs/AMDGPUUsage.html#flat-scratch
408 if (task->gfxVersion() == GfxVersion::gfx942) {
// NOTE(review): source lines 410, 412-413, and 418-419 are missing here
// (the scratchPerWI initializer, the flat-scratch assignment target, and
// the DPRINTF arguments).
409 uint32_t scratchPerWI =
411
414 + (scratchPerWI * 64 * wfId);
415
416 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
417 "Setting architected flat scratch = %x\n",
420
421 break;
422 }
423
424 // Not architected flat scratch. Write the scratch wavefront
425 // offset: https://llvm.org/docs/AMDGPUUsage.html
426 // #amdgpu-amdhsa-initial-kernel-execution-state
427 physSgprIdx =
428 computeUnit->registerManager->mapSgpr(this, regInitIdx);
429
// NOTE(review): source lines 430-442 and 445 are missing from this
// extract (comments and the final multiplicand of the offset product).
443 computeUnit->srf[simdId]->write(physSgprIdx, 1024 *
444 (wgId * (wgSz / 64) + wfId) *
446
447 ++regInitIdx;
448 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
449 "Setting Private Seg Offset: s[%d] = %x\n",
450 computeUnit->cu_id, simdId,
451 wfSlotId, wfDynId, physSgprIdx,
452 1024 * (wgId * (wgSz / 64) + wfId) *
454 break;
455 case WorkgroupInfo:
// Pack first-wave flag (MSB), ordered-append term, and wave count.
456 firstWave = (wfId == 0) ? 1 : 0;
457 numWfsInWg = divCeil(wgSizeInWorkItems,
458 computeUnit->wfSize());
459 finalValue = firstWave << ((sizeof(uint32_t) * 8) - 1);
460 finalValue |= (orderedAppendTerm << 6);
461 finalValue |= numWfsInWg;
462 physSgprIdx =
463 computeUnit->registerManager->mapSgpr(this, regInitIdx);
464 computeUnit->srf[simdId]->
465 write(physSgprIdx, finalValue);
466
467 ++regInitIdx;
468 DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
469 "Setting WG Info: s[%d] = %x\n",
470 computeUnit->cu_id, simdId,
471 wfSlotId, wfDynId, physSgprIdx, finalValue);
472 break;
473 default:
474 fatal("SGPR enable bit %i not supported\n", en_bit);
475 break;
476 }
477 }
478 }
479
480 // Save the offset to the first accumulation VGPR number from HSA task.
481 accumOffset = task->accumOffset();
482
483 regInitIdx = 0;
484
485 // VGPRs are initialized to the work item IDs for a given thread. There
486 // are two ways to initialize the IDs based on number of dimensions. ISAs
487 // will either have packed work-item IDs or not. LLVM lists them here:
488 // https://llvm.org/docs/AMDGPUUsage.html#amdgpu-processor-table
489 // Default to false and set to true for gem5 supported ISAs.
490 bool packed_work_item_id = false;
491
492 if (task->gfxVersion() == GfxVersion::gfx90a ||
493 task->gfxVersion() == GfxVersion::gfx942) {
494 packed_work_item_id = true;
495 }
496
497 // For ISAs with packed work item IDs, only one VGPR is used and the
498 // (X,Y,Z) dimensions are packed into a single 32-bit VGPR with 10-bits
499 // for each dimension
500 if (packed_work_item_id) {
501 TheGpuISA::VecRegContainerU32 raw_vgpr;
502 TheGpuISA::VecElemU32 *packed_vgpr
503 = raw_vgpr.as<TheGpuISA::VecElemU32>();
504
505 uint32_t physVgprIdx = computeUnit->registerManager
506 ->mapVgpr(this, regInitIdx);
// X in bits [9:0], Y in [19:10], Z in [29:20]; Y/Z only if enabled.
507 for (int lane = 0; lane < workItemId[0].size(); ++lane) {
508 packed_vgpr[lane] = workItemId[0][lane] & 0x3ff;
509 }
510 if (task->vgprBitEnabled(1)) {
511 for (int lane = 0; lane < workItemId[1].size(); ++lane) {
512 packed_vgpr[lane] |= ((workItemId[1][lane] & 0x3ff) << 10);
513 }
514 }
515 if (task->vgprBitEnabled(2)) {
516 for (int lane = 0; lane < workItemId[2].size(); ++lane) {
517 packed_vgpr[lane] |= ((workItemId[2][lane] & 0x3ff) << 20);
518 }
519 }
520 computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
521
522 return;
523 }
524
525 // For ISAs with non-packed work item IDs, map and initialize one VGPR
526 // per dimensions. Do this by iterating over all the init fields and
527 // checking which bits are enabled.
528 for (int en_bit = 0; en_bit < NumVectorInitFields; ++en_bit) {
529 if (task->vgprBitEnabled(en_bit)) {
530 uint32_t physVgprIdx = 0;
531 TheGpuISA::VecRegContainerU32 raw_vgpr;
532
533 switch (en_bit) {
534 case WorkitemIdX:
535 {
536 physVgprIdx = computeUnit->registerManager
537 ->mapVgpr(this, regInitIdx);
538 TheGpuISA::VecElemU32 *vgpr_x
539 = raw_vgpr.as<TheGpuISA::VecElemU32>();
540
541 for (int lane = 0; lane < workItemId[0].size(); ++lane) {
542 vgpr_x[lane] = workItemId[0][lane];
543 }
544
545 computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
546 rawDist[regInitIdx] = 0;
547 ++regInitIdx;
548 }
549 break;
550 case WorkitemIdY:
551 {
552 physVgprIdx = computeUnit->registerManager
553 ->mapVgpr(this, regInitIdx);
554 TheGpuISA::VecElemU32 *vgpr_y
555 = raw_vgpr.as<TheGpuISA::VecElemU32>();
556
557 for (int lane = 0; lane < workItemId[1].size(); ++lane) {
558 vgpr_y[lane] = workItemId[1][lane];
559 }
560
561 computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
562 rawDist[regInitIdx] = 0;
563 ++regInitIdx;
564 }
565 break;
566 case WorkitemIdZ:
567 {
568 physVgprIdx = computeUnit->registerManager->
569 mapVgpr(this, regInitIdx);
570 TheGpuISA::VecElemU32 *vgpr_z
571 = raw_vgpr.as<TheGpuISA::VecElemU32>();
572
573 for (int lane = 0; lane < workItemId[2].size(); ++lane) {
574 vgpr_z[lane] = workItemId[2][lane];
575 }
576
577 computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
578 rawDist[regInitIdx] = 0;
579 ++regInitIdx;
580 }
581 break;
582 }
583 }
584 }
585}
586
587void
588Wavefront::resizeRegFiles(int num_vregs, int num_sregs)
589{
590 maxVgprs = num_vregs;
591 maxSgprs = num_sregs;
592}
593
597
598void
// NOTE(review): the signature line (source line 599) is missing from this
// extract; from the uses of newStatus below this is presumably
// Wavefront::setStatus(status_e newStatus) -- confirm.
600{
// Maintains the CU's idle-wavefront count so an idle-CU timeout can be
// detected when idleCUTimeout is enabled.
601 if (computeUnit->idleCUTimeout > 0) {
602 // Wavefront's status transitions to stalled or stopped
603 if ((newStatus == S_STOPPED || newStatus == S_STALLED ||
604 newStatus == S_WAITCNT || newStatus == S_BARRIER) &&
605 (status != newStatus)) {
606 computeUnit->idleWfs++;
607 assert(computeUnit->idleWfs <=
608 (computeUnit->shader->n_wf * computeUnit->numVectorALUs));
609 if (computeUnit->idleWfs ==
610 (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
// NOTE(review): source line 611 is missing here -- presumably records the
// tick at which the whole CU went idle; confirm.
612 }
613 // Wavefront's status transitions to an active state (from
614 // a stopped or stalled state)
615 } else if ((status == S_STOPPED || status == S_STALLED ||
616 status == S_WAITCNT || status == S_BARRIER) &&
617 (status != newStatus)) {
618 // if all WFs in the CU were idle then check if the idleness
619 // period exceeded the timeout threshold
620 if (computeUnit->idleWfs ==
621 (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
// NOTE(review): source line 622 is missing -- presumably the panic_if()
// head whose arguments follow; confirm.
623 computeUnit->idleCUTimeout,
624 "CU%d has been idle for %d ticks at tick %d",
625 computeUnit->cu_id, computeUnit->idleCUTimeout,
626 curTick());
627 }
628 computeUnit->idleWfs--;
629 assert(computeUnit->idleWfs >= 0);
630 }
631 }
// Commit the new status after the idle bookkeeping above.
632 status = newStatus;
633}
634
635void
636Wavefront::start(uint64_t _wf_dyn_id, Addr init_pc)
637{
// Launches the wavefront: assigns its dynamic ID and initial PC.
638 wfDynId = _wf_dyn_id;
639 _pc = init_pc;
640
// NOTE(review): source line 641 is missing from this extract -- presumably
// sets the wavefront status to a running state; confirm.
642
// Per-VGPR read counters used for RAW-distance statistics.
643 vecReads.resize(maxVgprs, 0);
644}
645
646bool
648{
649 if (ii->isGlobalMem() ||
650 (ii->isFlat() && ii->executedAs() == enums::SC_GLOBAL)) {
651 return true;
652 }
653
654 return false;
655}
656
657bool
659{
660 if (ii->isLocalMem() ||
661 (ii->isFlat() && ii->executedAs() == enums::SC_GROUP)) {
662 return true;
663 }
664
665 return false;
666}
667
668bool
670{
671 if (instructionBuffer.empty())
672 return false;
673
674 GPUDynInstPtr ii = instructionBuffer.front();
675
676 if (ii->isSleep()) {
677 return true;
678 }
679 return false;
680}
681
682bool
684{
685 if (instructionBuffer.empty())
686 return false;
687
688 GPUDynInstPtr ii = instructionBuffer.front();
689
690 if (ii->isWaitcnt()) {
691 // waitcnt is a scalar
692 assert(ii->isScalar());
693 return true;
694 }
695
696 return false;
697}
698
699bool
701{
702 assert(!instructionBuffer.empty());
703 GPUDynInstPtr ii = instructionBuffer.front();
704
705 if (status != S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn()
706 || ii->isEndOfKernel() || ii->isBranch() || ii->isALU() ||
707 (ii->isKernArgSeg() && ii->isLoad()))) {
708 return true;
709 }
710
711 return false;
712}
713
714bool
716{
717 assert(!instructionBuffer.empty());
718 GPUDynInstPtr ii = instructionBuffer.front();
719
720 if (status != S_STOPPED && !ii->isScalar() && (ii->isNop() ||
721 ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel()
722 || (ii->isKernArgSeg() && ii->isLoad()))) {
723 return true;
724 }
725
726 return false;
727}
728
729bool
731{
732 assert(!instructionBuffer.empty());
733 GPUDynInstPtr ii = instructionBuffer.front();
734
735 if (status != S_STOPPED && ii->isBarrier()) {
736 return true;
737 }
738
739 return false;
740}
741
742bool
744{
745 assert(!instructionBuffer.empty());
746 GPUDynInstPtr ii = instructionBuffer.front();
747
748 if (status != S_STOPPED && !ii->isScalar() && ii->isGlobalMem()) {
749 return true;
750 }
751
752 return false;
753}
754
755bool
757{
758 assert(!instructionBuffer.empty());
759 GPUDynInstPtr ii = instructionBuffer.front();
760
761 if (status != S_STOPPED && ii->isScalar() && ii->isGlobalMem()) {
762 return true;
763 }
764
765 return false;
766}
767
768bool
770{
771 assert(!instructionBuffer.empty());
772 GPUDynInstPtr ii = instructionBuffer.front();
773
774 if (status != S_STOPPED && ii->isLocalMem()) {
775 return true;
776 }
777
778 return false;
779}
780
781bool
783{
784 assert(!instructionBuffer.empty());
785 GPUDynInstPtr ii = instructionBuffer.front();
786
787 if (status != S_STOPPED && ii->isPrivateSeg()) {
788 return true;
789 }
790
791 return false;
792}
793
794bool
796{
797 assert(!instructionBuffer.empty());
798 GPUDynInstPtr ii = instructionBuffer.front();
799
800 if (status != S_STOPPED && ii->isFlat()) {
801 return true;
802 }
803
804 return false;
805}
806
807bool
809{
810 for (auto it : instructionBuffer) {
811 GPUDynInstPtr ii = it;
812 if (ii->isReturn() || ii->isBranch() ||
813 ii->isEndOfKernel()) {
814 return true;
815 }
816 }
817
818 return false;
819}
820
821void
// NOTE(review): source lines 822-825 and 827 are missing from this
// extract. Line 821's "void" plus the gap presumably covers a short
// freeResources() definition followed by the signature of the
// request-counter validation function whose body begins below -- confirm
// against the complete wavefront.cc.
826
828{
// Sanity check: none of the wave's in-pipe / outstanding memory request
// counters may go negative. NOTE(review): line 829 (the head of this
// condition, presumably panic_if with the GM counters) is missing.
830 wrLmReqsInPipe < 0 || rdLmReqsInPipe < 0 ||
831 outstandingReqs < 0,
832 "Negative requests in pipe for WF%d for slot%d"
833 " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d,"
834 " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d,"
835 " Outstanding Reqs=%d\n",
838}
839
840void
// NOTE(review): the signature line (source line 841) is missing from this
// extract; from the body this is presumably
// Wavefront::reserveGmResource(GPUDynInstPtr ii) -- confirm. The counter
// increments inside each branch (lines 845/847/849-850/854/857/859/
// 861-862/866) were also dropped by the extract.
842{
// Classify the memory op and bump the matching in-pipe request counters;
// vector ops use the vector global-memory path, scalar ops the scalar one.
843 if (!ii->isScalar()) {
844 if (ii->isLoad()) {
846 } else if (ii->isStore()) {
848 } else if (ii->isAtomic() || ii->isMemSync()) {
851 } else {
852 panic("Invalid memory operation!\n");
853 }
855 } else {
856 if (ii->isLoad()) {
858 } else if (ii->isStore()) {
860 } else if (ii->isAtomic() || ii->isMemSync()) {
863 } else {
864 panic("Invalid memory operation!\n");
865 }
867 }
868}
869
870void
// NOTE(review): the signature line (source line 871) is missing from this
// extract; presumably Wavefront::reserveLmResource(GPUDynInstPtr ii) --
// confirm. The counter increments inside each branch (lines 876/878/
// 880-881/885) were also dropped by the extract.
872{
// LDS requests are vector-only; scalar instructions must not reach here.
873 fatal_if(ii->isScalar(),
874 "Scalar instructions can not access Shared memory!!!");
875 if (ii->isLoad()) {
877 } else if (ii->isStore()) {
879 } else if (ii->isAtomic() || ii->isMemSync()) {
882 } else {
883 panic("Invalid memory operation!\n");
884 }
886}
887
// NOTE(review): the return-type and signature lines (source lines 888-889)
// are missing from this extract; from the return statement below this is
// presumably std::vector<int> Wavefront::reserveResources() -- confirm.
// Several execUnitId assignments inside the branches (lines 905/907/914/
// 918-920/924/926/930) were also dropped by the extract.
890{
891 // vector of execution unit IDs to return to schedule stage
892 // this return is only used for debugging and an assertion...
893 std::vector<int> execUnitIds;
894
895 // Get current instruction
896 GPUDynInstPtr ii = instructionBuffer.front();
897 assert(ii);
898
899 // Single precision ALU or Branch or Return or Special instruction
900 if (ii->isALU() || ii->isSpecialOp() ||
901 ii->isBranch() || ii->isNop() ||
902 (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
903 ii->isReturn() || ii->isEndOfKernel()) {
904 if (!ii->isScalar()) {
906 } else {
908 }
909 // this is to enforce a fixed number of cycles per issue slot per SIMD
910 } else if (ii->isBarrier()) {
911 execUnitId = ii->isScalar() ? scalarAluGlobalIdx : simdId;
912 } else if (ii->isFlat()) {
// FLAT ops reserve both the LDS and global-memory pipes, since the
// segment is not known until execute time.
913 assert(!ii->isScalar());
915 // add execUnitId, reserved by reserveLmResource, list before it is
916 // overwriten by reserveGmResource
917 execUnitIds.push_back(execUnitId);
921 execUnitIds.push_back(flatGmUnitId);
922 execUnitId = -1;
923 } else if (ii->isGlobalMem()) {
925 } else if (ii->isLocalMem()) {
927 } else if (ii->isPrivateSeg()) {
928 fatal_if(ii->isScalar(),
929 "Scalar instructions can not access Private memory!!!");
931 } else {
932 panic("reserveResources -> Couldn't process op!\n");
933 }
934
935 if (execUnitId != -1) {
936 execUnitIds.push_back(execUnitId);
937 }
938 assert(execUnitIds.size());
939 return execUnitIds;
940}
941
942void
944{
945 // ---- Exit if wavefront is inactive ----------------------------- //
946
947 if (status == S_STOPPED || status == S_RETURNING ||
948 status==S_STALLED || instructionBuffer.empty()) {
949 return;
950 }
951
952 if (status == S_WAITCNT) {
964 assert(isOldestInstWaitcnt());
965 }
966
967 // Get current instruction
968
969 GPUDynInstPtr ii = instructionBuffer.front();
970
971 const Addr old_pc = pc();
972 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
973 "(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId,
974 wfDynId, ii->disassemble(), old_pc, ii->seqNum());
975 DPRINTF(GPUTrace, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
976 "(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId,
977 wfDynId, ii->disassemble(), old_pc, ii->seqNum());
978
979 ii->execute(ii);
980 // delete the dynamic instruction from the pipeline map
981 computeUnit->deleteFromPipeMap(this);
982 // update the instruction stats in the CU
983 computeUnit->updateInstStats(ii);
984
985 // inform VRF of instruction execution to schedule write-back
986 // and scoreboard ready for registers
987 if (!ii->isScalar()) {
988 computeUnit->rfc[simdId]->waveExecuteInst(this, ii);
989 computeUnit->vrf[simdId]->waveExecuteInst(this, ii);
990 }
991 computeUnit->srf[simdId]->waveExecuteInst(this, ii);
992
993 computeUnit->shader->incVectorInstSrcOperand(ii->numSrcVecRegOperands());
994 computeUnit->shader->incVectorInstDstOperand(ii->numDstVecRegOperands());
995 computeUnit->stats.numInstrExecuted++;
996 stats.numInstrExecuted++;
997 computeUnit->instExecPerSimd[simdId]++;
998 computeUnit->stats.execRateDist.sample(
999 computeUnit->stats.totalCycles.value() -
1000 computeUnit->lastExecCycle[simdId]);
1001 computeUnit->lastExecCycle[simdId] =
1002 computeUnit->stats.totalCycles.value();
1003
1004 if (lastInstExec) {
1005 computeUnit->stats.instInterleave[simdId].
1006 sample(computeUnit->instExecPerSimd[simdId] - lastInstExec);
1007 }
1008 lastInstExec = computeUnit->instExecPerSimd[simdId];
1009
1010 // want to track:
1011 // number of reads that occur per value written
1012
1013 // vector RAW dependency tracking
1014 for (const auto& srcVecOp : ii->srcVecRegOperands()) {
1015 for (const auto& virtIdx : srcVecOp.virtIndices()) {
1016 // This check should never fail, but to be safe we check
1017 if (rawDist.find(virtIdx) != rawDist.end()) {
1018 stats.vecRawDistance.sample(stats.numInstrExecuted.value() -
1019 rawDist[virtIdx]);
1020 }
1021 // increment number of reads to this register
1022 vecReads[virtIdx]++;
1023 }
1024 }
1025
1026 for (const auto& dstVecOp : ii->dstVecRegOperands()) {
1027 for (const auto& virtIdx : dstVecOp.virtIndices()) {
1028 // rawDist is set on writes, but will not be set for the first
1029 // write to each physical register
1030 if (rawDist.find(virtIdx) != rawDist.end()) {
1031 // Sample the number of reads that were performed
1032 stats.readsPerWrite.sample(vecReads[virtIdx]);
1033 }
1034 // on a write, reset count of reads to 0
1035 vecReads[virtIdx] = 0;
1036
1037 rawDist[virtIdx] = stats.numInstrExecuted.value();
1038 }
1039 }
1040
1041 if (pc() == old_pc) {
1042 // PC not modified by instruction, proceed to next
1043 _gpuISA.advancePC(ii);
1044 instructionBuffer.pop_front();
1045 } else {
1046 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave%d %s taken branch\n",
1048 ii->disassemble());
1049 discardFetch();
1050 }
1051 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n",
1052 computeUnit->cu_id, simdId, wfSlotId, wfDynId, pc());
1053
1054 if (computeUnit->shader->hsail_mode==Shader::SIMT) {
1055 const int num_active_lanes = execMask().count();
1056 computeUnit->stats.controlFlowDivergenceDist.sample(num_active_lanes);
1057 computeUnit->stats.numVecOpsExecuted += num_active_lanes;
1058
1059 if (ii->isMFMA()) {
1060 computeUnit->stats.numVecOpsExecutedMFMA += num_active_lanes;
1061 if (ii->isI8()) {
1062 computeUnit->stats.numVecOpsExecutedMFMAI8
1063 += num_active_lanes;
1064 }
1065 }
1066
1067 if (ii->isF16() && ii->isALU()) {
1068 if (ii->isF32() || ii->isF64()) {
1069 fatal("Instruction is tagged as both (1) F16, and (2)"
1070 "either F32 or F64.");
1071 }
1072 computeUnit->stats.numVecOpsExecutedF16 += num_active_lanes;
1073 if (ii->isFMA()) {
1074 computeUnit->stats.numVecOpsExecutedFMA16 += num_active_lanes;
1075 computeUnit->stats.numVecOpsExecutedTwoOpFP
1076 += num_active_lanes;
1077 }
1078 else if (ii->isMAC()) {
1079 computeUnit->stats.numVecOpsExecutedMAC16 += num_active_lanes;
1080 computeUnit->stats.numVecOpsExecutedTwoOpFP
1081 += num_active_lanes;
1082 }
1083 else if (ii->isMAD()) {
1084 computeUnit->stats.numVecOpsExecutedMAD16 += num_active_lanes;
1085 computeUnit->stats.numVecOpsExecutedTwoOpFP
1086 += num_active_lanes;
1087 }
1088 else if (ii->isMFMA()) {
1089 computeUnit->stats.numVecOpsExecutedMFMAF16
1090 += num_active_lanes;
1091 }
1092 }
1093 if (ii->isF32() && ii->isALU()) {
1094 if (ii->isF16() || ii->isF64()) {
1095 fatal("Instruction is tagged as both (1) F32, and (2)"
1096 "either F16 or F64.");
1097 }
1098 computeUnit->stats.numVecOpsExecutedF32 += num_active_lanes;
1099 if (ii->isFMA()) {
1100 computeUnit->stats.numVecOpsExecutedFMA32 += num_active_lanes;
1101 computeUnit->stats.numVecOpsExecutedTwoOpFP
1102 += num_active_lanes;
1103 }
1104 else if (ii->isMAC()) {
1105 computeUnit->stats.numVecOpsExecutedMAC32 += num_active_lanes;
1106 computeUnit->stats.numVecOpsExecutedTwoOpFP
1107 += num_active_lanes;
1108 }
1109 else if (ii->isMAD()) {
1110 computeUnit->stats.numVecOpsExecutedMAD32 += num_active_lanes;
1111 computeUnit->stats.numVecOpsExecutedTwoOpFP
1112 += num_active_lanes;
1113 }
1114 else if (ii->isMFMA()) {
1115 computeUnit->stats.numVecOpsExecutedMFMAF32
1116 += num_active_lanes;
1117 }
1118 }
1119 if (ii->isF64() && ii->isALU()) {
1120 if (ii->isF16() || ii->isF32()) {
1121 fatal("Instruction is tagged as both (1) F64, and (2)"
1122 "either F16 or F32.");
1123 }
1124 computeUnit->stats.numVecOpsExecutedF64 += num_active_lanes;
1125 if (ii->isFMA()) {
1126 computeUnit->stats.numVecOpsExecutedFMA64 += num_active_lanes;
1127 computeUnit->stats.numVecOpsExecutedTwoOpFP
1128 += num_active_lanes;
1129 }
1130 else if (ii->isMAC()) {
1131 computeUnit->stats.numVecOpsExecutedMAC64 += num_active_lanes;
1132 computeUnit->stats.numVecOpsExecutedTwoOpFP
1133 += num_active_lanes;
1134 }
1135 else if (ii->isMAD()) {
1136 computeUnit->stats.numVecOpsExecutedMAD64 += num_active_lanes;
1137 computeUnit->stats.numVecOpsExecutedTwoOpFP
1138 += num_active_lanes;
1139 }
1140 else if (ii->isMFMA()) {
1141 computeUnit->stats.numVecOpsExecutedMFMAF64
1142 += num_active_lanes;
1143 }
1144 }
1145 if (isGmInstruction(ii)) {
1146 computeUnit->stats.activeLanesPerGMemInstrDist.sample(
1147 num_active_lanes);
1148 } else if (isLmInstruction(ii)) {
1149 computeUnit->stats.activeLanesPerLMemInstrDist.sample(
1150 num_active_lanes);
1151 }
1152 }
1153
1158 if (execMask().none() && ii->needsToken()) {
1159 computeUnit->getTokenManager()->recvTokens(1);
1160 return;
1161 }
1162
1163 // Update Vector ALU pipeline and other resources
1164 bool flat_as_gm = false;
1165 bool flat_as_lm = false;
1166 if (ii->isFlat()) {
1167 flat_as_gm = (ii->executedAs() == enums::SC_GLOBAL) ||
1168 (ii->executedAs() == enums::SC_PRIVATE);
1169 flat_as_lm = (ii->executedAs() == enums::SC_GROUP);
1170 }
1171
1172 // Single precision ALU or Branch or Return or Special instruction
1173 // Note, we use the same timing regardless of SP or DP ALU operation.
1174 if (ii->isALU() || ii->isSpecialOp() ||
1175 ii->isBranch() || ii->isNop() ||
1176 (ii->isKernArgSeg() && ii->isLoad()) ||
1177 ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) {
1178 // this is to enforce a fixed number of cycles per issue slot per SIMD
1179 if (!ii->isScalar()) {
1180 computeUnit->vectorALUs[simdId].set(computeUnit->
1181 cyclesToTicks(computeUnit->issuePeriod));
1182 } else {
1183 computeUnit->scalarALUs[scalarAlu].set(computeUnit->
1184 cyclesToTicks(computeUnit->issuePeriod));
1185 }
1186 // Barrier on Scalar ALU
1187 } else if (ii->isBarrier()) {
1188 computeUnit->scalarALUs[scalarAlu].set(computeUnit->
1189 cyclesToTicks(computeUnit->issuePeriod));
1190 // GM or Flat as GM Load
1191 } else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) {
1192 if (!ii->isScalar()) {
1193 computeUnit->vrfToGlobalMemPipeBus.set(
1194 computeUnit->cyclesToTicks(computeUnit->vrf_gm_bus_latency));
1195 computeUnit->vectorGlobalMemUnit.
1196 set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
1197 computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
1198 computeUnit->vrf_gm_bus_latency;
1199 } else {
1200 computeUnit->srfToScalarMemPipeBus.set(computeUnit->
1201 cyclesToTicks(computeUnit->srf_scm_bus_latency));
1202 computeUnit->scalarMemUnit.
1203 set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
1204 computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
1205 computeUnit->srf_scm_bus_latency;
1206 }
1207 // GM or Flat as GM Store
1208 } else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) {
1209 if (!ii->isScalar()) {
1210 computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
1211 cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
1212 computeUnit->vectorGlobalMemUnit.
1213 set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
1214 computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
1215 (2 * computeUnit->vrf_gm_bus_latency);
1216 } else {
1217 computeUnit->srfToScalarMemPipeBus.set(computeUnit->
1218 cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
1219 computeUnit->scalarMemUnit.
1220 set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
1221 computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
1222 (2 * computeUnit->srf_scm_bus_latency);
1223 }
1224 } else if ((ii->isAtomic() || ii->isMemSync()) &&
1225 (ii->isGlobalMem() || flat_as_gm)) {
1226 if (!ii->isScalar()) {
1227 computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
1228 cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
1229 computeUnit->vectorGlobalMemUnit.
1230 set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
1231 computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
1232 (2 * computeUnit->vrf_gm_bus_latency);
1233 } else {
1234 computeUnit->srfToScalarMemPipeBus.set(computeUnit->
1235 cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
1236 computeUnit->scalarMemUnit.
1237 set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
1238 computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
1239 (2 * computeUnit->srf_scm_bus_latency);
1240 }
1241 // LM or Flat as LM Load
1242 } else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) {
1243 computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
1244 cyclesToTicks(computeUnit->vrf_lm_bus_latency));
1245 computeUnit->vectorSharedMemUnit.
1246 set(computeUnit->shader->cyclesToTicks(computeUnit->issuePeriod));
1247 computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
1248 computeUnit->vrf_lm_bus_latency;
1249 // LM or Flat as LM Store
1250 } else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
1251 computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
1252 cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
1253 computeUnit->vectorSharedMemUnit.
1254 set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
1255 computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
1256 (2 * computeUnit->vrf_lm_bus_latency);
1257 // LM or Flat as LM, Atomic or MemFence
1258 } else if ((ii->isAtomic() || ii->isMemSync()) &&
1259 (ii->isLocalMem() || flat_as_lm)) {
1260 computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
1261 cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
1262 computeUnit->vectorSharedMemUnit.
1263 set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
1264 computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
1265 (2 * computeUnit->vrf_lm_bus_latency);
1266 } else {
1267 panic("Bad instruction type!\n");
1268 }
1269}
1270
1273{
1274 // Read next instruction from instruction buffer
1275 GPUDynInstPtr ii = instructionBuffer.front();
1276 // if the WF has been dispatched in the schedule stage then
1277 // check the next oldest instruction for readiness
1278 if (computeUnit->pipeMap.find(ii->seqNum()) !=
1279 computeUnit->pipeMap.end()) {
1280 if (instructionBuffer.size() > 1) {
1281 auto it = instructionBuffer.begin() + 1;
1282 return *it;
1283 } else { // No new instructions to check
1284 return nullptr;
1285 }
1286 }
1287 return ii;
1288}
1289
void
Wavefront::discardFetch()
{
    // Drop every instruction currently buffered for this wavefront.
    instructionBuffer.clear();

    // Also flush this WF slot's fetch buffer so stale fetched bytes are not
    // consumed later.
    computeUnit->fetchStage.fetchUnit(simdId).flushBuf(wfSlotId);
}
1302
1303bool
1305{
1306 // Both vmWaitCnt && lgkmWaitCnt uninitialized means
1307 // waitCnt instruction has been dispatched but not executed yet: next
1308 // instruction should be blocked until waitCnt is executed.
1309 if (vmWaitCnt == -1 && expWaitCnt == -1 && lgkmWaitCnt == -1) {
1310 return false;
1311 }
1312
1318 if (vmWaitCnt != -1) {
1319 if (vmemInstsIssued > vmWaitCnt) {
1320 // vmWaitCnt not satisfied
1321 return false;
1322 }
1323 }
1324
1325 if (expWaitCnt != -1) {
1326 if (expInstsIssued > expWaitCnt) {
1327 // expWaitCnt not satisfied
1328 return false;
1329 }
1330 }
1331
1332 if (lgkmWaitCnt != -1) {
1334 // lgkmWaitCnt not satisfied
1335 return false;
1336 }
1337 }
1338
1339 // if we get here all outstanding waitcnts must
1340 // be satisfied, so we resume normal operation
1341 clearWaitCnts();
1342
1343 return true;
1344}
1345
1346bool
1348{
1349 assert(status == S_STALLED_SLEEP);
1350
1351 // if the sleep count has not been set, then the sleep instruction has not
1352 // been executed yet, so we will return true without setting the wavefront
1353 // status
1354 if (sleepCnt == 0)
1355 return false;
1356
1357 sleepCnt--;
1358 if (sleepCnt != 0)
1359 return false;
1360
1361 status = S_RUNNING;
1362 return true;
1363}
1364
void
Wavefront::setSleepTime(int sleep_time)
{
    // A new sleep may only be installed when no countdown is in progress.
    assert(sleepCnt == 0);
    // Countdown consumed one step at a time by sleepDone().
    sleepCnt = sleep_time;
}
1371
void
Wavefront::setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt)
{
    // the scoreboard should have set the status
    // to S_WAITCNT once a waitcnt instruction
    // was marked as ready
    assert(status == S_WAITCNT);

    // waitcnt instruction shouldn't be sending
    // negative counts
    assert(vm_wait_cnt >= 0);
    assert(exp_wait_cnt >= 0);
    assert(lgkm_wait_cnt >= 0);
    // field widths per the s_waitcnt encoding: vm counts fit in 4 bits,
    // exp in 3 bits, and lgkm in 5 bits
    assert(vm_wait_cnt <= 0xf);
    assert(exp_wait_cnt <= 0x7);
    assert(lgkm_wait_cnt <= 0x1f);

    // any previously installed counts must have been cleared (-1) before a
    // new waitcnt installs fresh ones
    assert(vmWaitCnt == -1);
    assert(expWaitCnt == -1);
    assert(lgkmWaitCnt == -1);

    // a max-valued field means "don't wait on this category"; in that case
    // the corresponding count stays at -1 and is ignored by
    // waitCntsSatisfied()
    if (vm_wait_cnt != 0xf)
        vmWaitCnt = vm_wait_cnt;

    if (exp_wait_cnt != 0x7)
        expWaitCnt = exp_wait_cnt;

    if (lgkm_wait_cnt != 0x1f)
        lgkmWaitCnt = lgkm_wait_cnt;
}
1416
void
Wavefront::clearWaitCnts()
{
    // reset the waitcnts back to
    // -1, indicating they are no
    // longer valid
    vmWaitCnt = -1;
    expWaitCnt = -1;
    lgkmWaitCnt = -1;

    // resume running normally
    status = S_RUNNING;
}
1430
1431void
1436
1437void
1442
1443void
1448
1449void
1454
1455void
1460
1461void
1466
1467void
1469{
1470 if (!computeUnit->shader->getProgressInterval()) {
1471 return;
1472 }
1473
1474 assert(!vmemIssued.count(gpu_dyn_inst->seqNum()));
1475 vmemIssued.insert(gpu_dyn_inst->seqNum());
1476 trackInst(gpu_dyn_inst);
1477}
1478
1479void
1481{
1482 if (!computeUnit->shader->getProgressInterval()) {
1483 return;
1484 }
1485
1486 assert(!lgkmIssued.count(gpu_dyn_inst->seqNum()));
1487 lgkmIssued.insert(gpu_dyn_inst->seqNum());
1488 trackInst(gpu_dyn_inst);
1489}
1490
1491void
1493{
1494 if (!computeUnit->shader->getProgressInterval()) {
1495 return;
1496 }
1497
1498 assert(!expIssued.count(gpu_dyn_inst->seqNum()));
1499 expIssued.insert(gpu_dyn_inst->seqNum());
1500 trackInst(gpu_dyn_inst);
1501}
1502
void
Wavefront::trackInst(GPUDynInstPtr gpu_dyn_inst)
{
    // Only pay the bookkeeping cost when progress reporting is enabled.
    if (!computeUnit->shader->getProgressInterval()) {
        return;
    }

    // Remember the disassembly so outstanding instructions can later be
    // printed by sequence number.
    cntInsts.insert({gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble()});
}
1512
1513void
1515{
1516 if (!computeUnit->shader->getProgressInterval()) {
1517 return;
1518 }
1519
1520 warn_if(!vmemIssued.count(gpu_dyn_inst->seqNum()),
1521 "%d not in VMEM issued!\n", gpu_dyn_inst->seqNum());
1522 vmemIssued.erase(gpu_dyn_inst->seqNum());
1523 untrackInst(gpu_dyn_inst->seqNum());
1524}
1525
1526void
1528{
1529 if (!computeUnit->shader->getProgressInterval()) {
1530 return;
1531 }
1532
1533 warn_if(!lgkmIssued.count(gpu_dyn_inst->seqNum()),
1534 "%d not in LGKM issued!\n", gpu_dyn_inst->seqNum());
1535 lgkmIssued.erase(gpu_dyn_inst->seqNum());
1536 untrackInst(gpu_dyn_inst->seqNum());
1537}
1538
1539void
1541{
1542 if (!computeUnit->shader->getProgressInterval()) {
1543 return;
1544 }
1545
1546 warn_if(!expIssued.count(gpu_dyn_inst->seqNum()),
1547 "%d not in EXP issued!\n", gpu_dyn_inst->seqNum());
1548 expIssued.erase(gpu_dyn_inst->seqNum());
1549 untrackInst(gpu_dyn_inst->seqNum());
1550}
1551
1552void
1554{
1555 if (!computeUnit->shader->getProgressInterval()) {
1556 return;
1557 }
1558
1559 if (!vmemIssued.count(seqNum) &&
1560 !lgkmIssued.count(seqNum) &&
1561 !expIssued.count(seqNum)) {
1562 cntInsts.erase(seqNum);
1563 }
1564}
1565
Addr
Wavefront::pc() const
{
    // Current program counter of this wavefront.
    return _pc;
}
1571
void
Wavefront::pc(Addr new_pc)
{
    // Redirect this wavefront to a new program counter.
    _pc = new_pc;
}
1577
VectorMask&
Wavefront::execMask()
{
    // Mutable reference to the per-lane execution mask.
    return _execMask;
}
1583
bool
Wavefront::execMask(int lane) const
{
    // True when the given lane is active under the current exec mask.
    return _execMask[lane];
}
1589
void
Wavefront::freeRegisterFile()
{
    /* clear busy registers */
    for (int i=0; i < maxVgprs; i++) {
        // Translate this WF's logical VGPR index to its physical slot
        // before clearing the busy bit.
        int vgprIdx = computeUnit->registerManager->mapVgpr(this, i);
        computeUnit->vrf[simdId]->markReg(vgprIdx, false);
    }

    /* Free registers used by this wavefront */
    // The region's end index wraps modulo the physical register count, so
    // the allocation may straddle the end of the register file.
    uint32_t endIndex = (startVgprIndex + reservedVectorRegs - 1) %
        computeUnit->vrf[simdId]->numRegs();
    computeUnit->registerManager->vrfPoolMgrs[simdId]->
        freeRegion(startVgprIndex, endIndex);
}
1605
void
Wavefront::computeActualWgSz(HSAQueueEntry *task)
{
    // Effective work-group size: the last work-group in each dimension may
    // be clipped when the grid size is not a multiple of the WG size.
    actualWgSzTotal = 1;
    for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
        actualWgSz[d] = std::min(workGroupSz[d], gridSz[d]
            - task->wgId(d) * workGroupSz[d]);
        actualWgSzTotal *= actualWgSz[d];
    }
}
1616
void
Wavefront::barrierId(int bar_id)
{
    // WFBarrier::InvalidID is a legal value here; hasBarrier() treats it
    // as "no barrier assigned".
    assert(bar_id >= WFBarrier::InvalidID);
    assert(bar_id < computeUnit->numBarrierSlots());
    barId = bar_id;
}
1624
int
Wavefront::barrierId() const
{
    // Barrier slot this WF is attached to (WFBarrier::InvalidID if none).
    return barId;
}
1630
bool
Wavefront::hasBarrier() const
{
    // Valid barrier IDs are strictly greater than WFBarrier::InvalidID.
    return barId > WFBarrier::InvalidID;
}
1636
1637void
1642
1643std::string
1645{
1646 switch (status) {
1647 case S_STOPPED: return "S_STOPPED";
1648 case S_RETURNING: return "S_RETURNING";
1649 case S_RUNNING: return "S_RUNNING";
1650 case S_STALLED: return "S_STALLED";
1651 case S_STALLED_SLEEP: return "S_STALLED_SLEEP";
1652 case S_WAITCNT: return "S_WAITCNT";
1653 case S_BARRIER: return "S_BARRIER";
1654 default: break;
1655 }
1656
1657 return "Unknown";
1658}
1659
void
Wavefront::printProgress()
{
    // NOTE(review): this definition's header line is not visible in this
    // extract; the name used here follows wavefront.hh — confirm against
    // the header.
    //
    // Dump this wavefront's status, last instruction, and per-category
    // waitcnt state (issued counts, thresholds, and outstanding sequence
    // numbers) to stdout.
    std::cout << "wave[" << wfDynId << "] status: "
        << statusToString(getStatus()) << " last inst: "
        << lastInstDisasm << " waitcnts: vmem: " << vmemInstsIssued
        << "/" << vmWaitCnt << "(";
    for (auto &elem : vmemIssued) {
        std::cout << elem << ' ';
    }
    std::cout << ") exp: " << expInstsIssued << "/"
        << expWaitCnt << "(";
    for (auto &elem : expIssued) {
        std::cout << elem << ' ';
    }

    std::cout << ") lgkm: " << lgkmInstsIssued << " / "
        << lgkmWaitCnt << "(";
    for (auto &elem : lgkmIssued) {
        std::cout << elem << ' ';
    }
    std::cout << ") last ready status: " << lastInstRdyStatus
        << " status VRF/SRF: " << lastVrfStatus << "/" << lastSrfStatus
        << " wait insts:\n";

    // Disassembly of every instruction still outstanding in each category.
    for (auto &elem : vmemIssued) {
        std::cout << "\t" << cntInsts[elem] << "\n";
    }
    for (auto &elem : lgkmIssued) {
        std::cout << "\t" << cntInsts[elem] << "\n";
    }
    for (auto &elem : expIssued) {
        std::cout << "\t" << cntInsts[elem] << "\n";
    }
}
1695
Wavefront::WavefrontStats::WavefrontStats(statistics::Group *parent)
    : statistics::Group(parent),
      ADD_STAT(numInstrExecuted,
               "number of instructions executed by this WF slot"),
      ADD_STAT(schCycles, "number of cycles spent in schedule stage"),
      ADD_STAT(schStalls, "number of cycles WF is stalled in SCH stage"),
      ADD_STAT(schRfAccessStalls, "number of cycles wave selected in SCH but "
               "RF denied adding instruction"),
      ADD_STAT(schResourceStalls, "number of cycles stalled in sch by resource"
               " not available"),
      ADD_STAT(schOpdNrdyStalls, "number of cycles stalled in sch waiting for "
               "RF reads to complete"),
      ADD_STAT(schLdsArbStalls,
               "number of cycles wave stalled due to LDS-VRF arbitration"),
      // FIXME: the name of the WF needs to be unique
      ADD_STAT(numTimesBlockedDueWAXDependencies, "number of times the wf's "
               "instructions are blocked due to WAW or WAR dependencies"),
      // FIXME: the name of the WF needs to be unique
      ADD_STAT(numTimesBlockedDueRAWDependencies, "number of times the wf's "
               "instructions are blocked due to RAW dependencies"),
      ADD_STAT(vecRawDistance,
               "Count of RAW distance in dynamic instructions for this WF"),
      ADD_STAT(readsPerWrite, "Count of Vector reads per write for this WF")
{
    // Histogram buckets: RAW distances over [0, 20] with bucket size 1,
    // and reads-per-write over [0, 4] with bucket size 1.
    vecRawDistance.init(0, 20, 1);
    readsPerWrite.init(0, 4, 1);
}
1723
1724} // namespace gem5
#define DPRINTF(x,...)
Definition trace.hh:209
Cycles is a wrapper class for representing cycle counts, i.e.
Definition types.hh:79
_amd_queue_t amdQueue
Keep a copy of the AMD HSA queue because we need info from some of its fields to initialize register ...
bool sgprBitEnabled(int bit) const
int wgId(int dim) const
const GfxVersion & gfxVersion() const
void preloadLength(unsigned val)
Addr hostDispPktAddr() const
static const int MAX_DIM
Addr hostAMDQueueAddr
Host-side addr of the amd_queue_t on which this task was queued.
bool vgprBitEnabled(int bit) const
unsigned accumOffset() const
WF barrier slots.
static const int InvalidID
uint32_t maxSgprs
Definition wavefront.hh:136
status_e status
Definition wavefront.hh:361
bool isOldestInstWaitcnt()
Definition wavefront.cc:683
Addr pc() const
bool hasBarrier() const
VectorMask _execMask
Definition wavefront.hh:363
uint32_t actualWgSzTotal
Definition wavefront.hh:170
InstSeqNum lastInstSeqNum
Definition wavefront.hh:327
void reserveGmResource(GPUDynInstPtr ii)
Definition wavefront.cc:841
uint64_t oldVgprTcnt
Definition wavefront.hh:218
std::vector< Addr > lastAddr
Definition wavefront.hh:159
std::set< InstSeqNum > expIssued
Definition wavefront.hh:293
void setStatus(status_e newStatus)
Definition wavefront.cc:599
void untrackInst(InstSeqNum seqNum)
bool waitCntsSatisfied()
void validateRequestCounters()
Definition wavefront.cc:827
void trackInst(GPUDynInstPtr gpu_dyn_inst)
void trackVMemInst(GPUDynInstPtr gpu_dyn_inst)
const int simdId
Definition wavefront.hh:102
bool isOldestInstLMem()
Definition wavefront.cc:769
bool isOldestInstPrivMem()
Definition wavefront.cc:782
bool isOldestInstScalarMem()
Definition wavefront.cc:756
uint64_t oldDgprTcnt
Definition wavefront.hh:225
Wavefront(const Params &p)
Definition wavefront.cc:50
bool isOldestInstBarrier()
Definition wavefront.cc:730
void resizeRegFiles(int num_vregs, int num_sregs)
Definition wavefront.cc:588
int scalarOutstandingReqsWrGm
Definition wavefront.hh:189
uint32_t gridSz[3]
Definition wavefront.hh:165
void decExpInstsIssued()
std::set< InstSeqNum > lgkmIssued
Definition wavefront.hh:292
std::vector< uint32_t > oldVgpr
Definition wavefront.hh:214
void initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
Definition wavefront.cc:122
void setSleepTime(int sleep_time)
ComputeUnit * computeUnit
Definition wavefront.hh:109
std::vector< uint32_t > workItemFlatId
Definition wavefront.hh:161
int vmWaitCnt
the following are used for waitcnt instructions vmWaitCnt: once set, we wait for the oustanding numbe...
Definition wavefront.hh:354
std::vector< int > vecReads
Definition wavefront.hh:246
std::deque< GPUDynInstPtr > instructionBuffer
Definition wavefront.hh:112
bool isOldestInstSleep()
Definition wavefront.cc:669
uint32_t accumOffset
Definition wavefront.hh:138
bool isLmInstruction(GPUDynInstPtr ii)
Definition wavefront.cc:658
GPUDynInstPtr nextInstr()
uint64_t lastTrace
Definition wavefront.hh:198
std::vector< uint32_t > workItemId[3]
Definition wavefront.hh:160
std::vector< uint64_t > oldDgpr
Definition wavefront.hh:221
bool isOldestInstScalarALU()
Definition wavefront.cc:700
void untrackExpInst(GPUDynInstPtr gpu_dyn_inst)
void releaseBarrier()
bool isOldestInstFlatMem()
Definition wavefront.cc:795
status_e getStatus()
Definition wavefront.hh:142
WavefrontParams Params
Definition wavefront.hh:253
uint32_t maxVgprs
Definition wavefront.hh:134
void decVMemInstsIssued()
void computeActualWgSz(HSAQueueEntry *task)
std::string lastInstDisasm
Definition wavefront.hh:328
uint32_t workGroupId[3]
Definition wavefront.hh:163
void setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt)
const int wfSlotId
Definition wavefront.hh:99
std::unordered_map< int, uint64_t > rawDist
Definition wavefront.hh:242
void incExpInstsIssued()
void untrackLGKMInst(GPUDynInstPtr gpu_dyn_inst)
std::vector< int > reserveResources()
Definition wavefront.cc:889
uint32_t startSgprIndex
Definition wavefront.hh:208
GfxVersion gfxVersion
Definition wavefront.hh:97
void decLGKMInstsIssued()
void incLGKMInstsIssued()
int barrierId() const
virtual void init()
init() is called after all C++ SimObjects have been created and all ports are connected.
Definition wavefront.cc:107
uint32_t workGroupSz[3]
Definition wavefront.hh:164
void untrackVMemInst(GPUDynInstPtr gpu_dyn_inst)
void trackExpInst(GPUDynInstPtr gpu_dyn_inst)
bool isOldestInstVectorALU()
Definition wavefront.cc:715
uint64_t lastInstExec
Definition wavefront.hh:238
LdsChunk * ldsChunk
Definition wavefront.hh:232
std::unordered_map< InstSeqNum, std::string > cntInsts
Definition wavefront.hh:294
uint32_t actualWgSz[3]
Definition wavefront.hh:169
Addr archFlatScratchAddr
Definition wavefront.hh:211
std::set< InstSeqNum > vmemIssued
Definition wavefront.hh:291
void trackLGKMInst(GPUDynInstPtr gpu_dyn_inst)
int scalarOutstandingReqsRdGm
Definition wavefront.hh:187
void freeResources()
Definition wavefront.cc:822
void incVMemInstsIssued()
std::string statusToString(status_e status)
void reserveLmResource(GPUDynInstPtr ii)
Definition wavefront.cc:871
std::string lastInstRdyStatus
Definition wavefront.hh:329
@ S_BARRIER
WF is stalled at a barrier.
Definition wavefront.hh:93
@ S_WAITCNT
wavefront has unsatisfied wait counts
Definition wavefront.hh:89
bool isOldestInstGMem()
Definition wavefront.cc:743
gem5::Wavefront::WavefrontStats stats
VectorMask & execMask()
uint64_t wfDynId
Definition wavefront.hh:235
void freeRegisterFile()
Freeing VRF space.
bool isGmInstruction(GPUDynInstPtr ii)
Definition wavefront.cc:647
uint32_t startVgprIndex
Definition wavefront.hh:205
void start(uint64_t _wfDynId, uint64_t _base_ptr)
Definition wavefront.cc:636
TheGpuISA::GPUISA _gpuISA
Definition wavefront.hh:333
Statistics container.
Definition group.hh:93
STL vector class.
Definition stl.hh:37
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
Definition group.hh:75
static constexpr T divCeil(const T &a, const U &b)
Definition intmath.hh:110
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
Definition bitfield.hh:79
#define panic(...)
This implements a cprintf based panic() function.
Definition logging.hh:220
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
Definition logging.hh:268
#define fatal(...)
This implements a cprintf based fatal() function.
Definition logging.hh:232
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Definition logging.hh:246
SimObject(const Params &p)
Definition sim_object.cc:58
#define warn_if(cond,...)
Conditional warning macro that checks the supplied condition and only prints a warning if the conditi...
Definition logging.hh:315
Bitfield< 7 > i
Definition misc_types.hh:67
Bitfield< 12, 11 > set
Bitfield< 9 > d
Definition misc_types.hh:64
Bitfield< 0 > p
const FlagsType none
Nothing extra to print.
Definition info.hh:53
Copyright (c) 2024 Arm Limited All rights reserved.
Definition binary32.hh:36
static void init_pc(py::module_ &m_native)
Definition core.cc:168
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition misc.hh:49
Tick curTick()
The universal simulation clock.
Definition cur_tick.hh:46
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition types.hh:147
std::bitset< std::numeric_limits< unsigned long long >::digits > VectorMask
Definition misc.hh:48
@ WorkgroupIdX
@ DispatchId
@ NumScalarInitFields
@ DispatchPtr
@ QueuePtr
@ PrivSegWaveByteOffset
@ PrivateSegBuf
@ WorkgroupIdY
@ PrivateSegSize
@ WorkgroupInfo
@ WorkgroupIdZ
@ FlatScratchInit
@ KernargPreload
@ KernargSegPtr
@ WorkitemIdX
@ WorkitemIdZ
@ NumVectorInitFields
@ WorkitemIdY
uint64_t InstSeqNum
Definition inst_seq.hh:40
statistics::Scalar numTimesBlockedDueRAWDependencies
Definition wavefront.hh:401
statistics::Scalar schResourceStalls
Definition wavefront.hh:388
WavefrontStats(statistics::Group *parent)
statistics::Distribution vecRawDistance
Definition wavefront.hh:405
statistics::Distribution readsPerWrite
Definition wavefront.hh:409
statistics::Scalar schCycles
Definition wavefront.hh:376
statistics::Scalar numTimesBlockedDueWAXDependencies
Definition wavefront.hh:398
statistics::Scalar schRfAccessStalls
Definition wavefront.hh:386
statistics::Scalar schOpdNrdyStalls
Definition wavefront.hh:390
statistics::Scalar numInstrExecuted
Definition wavefront.hh:373
statistics::Scalar schStalls
Definition wavefront.hh:379
statistics::Scalar schLdsArbStalls
Definition wavefront.hh:394
uint32_t scratch_workitem_byte_size
Definition hsa_queue.hh:84
uint32_t compute_tmpring_size_wavesize
Definition hsa_queue.hh:79
uint64_t scratch_backing_memory_location
Definition hsa_queue.hh:82
uint32_t scratch_resource_descriptor[4]
Definition hsa_queue.hh:81

Generated on Mon May 26 2025 09:19:11 for gem5 by doxygen 1.13.2