gem5  v22.1.0.0
wavefront.cc
1 /*
2  * Copyright (c) 2011-2017 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from this
17  * software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include "gpu-compute/wavefront.hh"
33 
34 #include "base/bitfield.hh"
35 #include "debug/GPUExec.hh"
36 #include "debug/GPUInitAbi.hh"
37 #include "debug/WavefrontStack.hh"
38 #include "gpu-compute/compute_unit.hh"
39 #include "gpu-compute/gpu_dyn_inst.hh"
40 #include "gpu-compute/scalar_register_file.hh"
41 #include "gpu-compute/shader.hh"
42 #include "gpu-compute/simple_pool_manager.hh"
43 #include "gpu-compute/vector_register_file.hh"
44 
45 namespace gem5
46 {
47 
48 Wavefront::Wavefront(const Params &p)
49  : SimObject(p), wfSlotId(p.wf_slot_id), simdId(p.simdId),
50  maxIbSize(p.max_ib_size), _gpuISA(*this),
51  vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
52  vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
53  sleepCnt(0), barId(WFBarrier::InvalidID), stats(this)
54 {
55  lastTrace = 0;
56  execUnitId = -1;
57  status = S_STOPPED;
58  reservedVectorRegs = 0;
59  reservedScalarRegs = 0;
60  startVgprIndex = 0;
61  startSgprIndex = 0;
62  outstandingReqs = 0;
63  outstandingReqsWrGm = 0;
64  outstandingReqsWrLm = 0;
65  outstandingReqsRdGm = 0;
66  outstandingReqsRdLm = 0;
67  rdLmReqsInPipe = 0;
68  rdGmReqsInPipe = 0;
69  wrLmReqsInPipe = 0;
70  wrGmReqsInPipe = 0;
71  scalarRdGmReqsInPipe = 0;
72  scalarWrGmReqsInPipe = 0;
73  scalarOutstandingReqsRdGm = 0;
74  scalarOutstandingReqsWrGm = 0;
75  lastNonIdleTick = 0;
76  ldsChunk = nullptr;
77 
78  memTraceBusy = 0;
79  oldVgprTcnt = 0xffffffffffffffffll;
80  oldDgprTcnt = 0xffffffffffffffffll;
81  oldVgpr.resize(p.wf_size);
82 
83  pendingFetch = false;
84  dropFetch = false;
85  maxVgprs = 0;
86  maxSgprs = 0;
87 
88  lastAddr.resize(p.wf_size);
89  workItemFlatId.resize(p.wf_size);
90  oldDgpr.resize(p.wf_size);
91  for (int i = 0; i < 3; ++i) {
92  workItemId[i].resize(p.wf_size);
93  }
94 
95  _execMask.set();
96  rawDist.clear();
97  lastInstExec = 0;
98  vecReads.clear();
99 }
100 
101 void
102 Wavefront::init()
103 {
104  reservedVectorRegs = 0;
105  reservedScalarRegs = 0;
106  startVgprIndex = 0;
107  startSgprIndex = 0;
108 
109  scalarAlu = computeUnit->mapWaveToScalarAlu(this);
110  scalarAluGlobalIdx = computeUnit->mapWaveToScalarAluGlobalIdx(this);
111  globalMem = computeUnit->mapWaveToGlobalMem(this);
112  localMem = computeUnit->mapWaveToLocalMem(this);
113  scalarMem = computeUnit->mapWaveToScalarMem(this);
114 }
115 
116 void
117 Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
118 {
119  int regInitIdx = 0;
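 // regInitIdx walks the wave's logical SGPRs in ABI order: each enabled
 // init field below is mapped to the next physical SGPR(s), and regInitIdx
 // is advanced once per register written.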
120 
121  // iterate over all the init fields and check which
122  // bits are enabled
123  for (int en_bit = 0; en_bit < NumScalarInitFields; ++en_bit) {
124 
125  if (task->sgprBitEnabled(en_bit)) {
126  int physSgprIdx = 0;
127  uint32_t wiCount = 0;
128  uint32_t firstWave = 0;
129  int orderedAppendTerm = 0;
130  int numWfsInWg = 0;
131  uint32_t finalValue = 0;
132  Addr host_disp_pkt_addr = task->hostDispPktAddr();
133  Addr kernarg_addr = task->kernargAddr();
134  Addr hidden_priv_base(0);
135 
136  switch (en_bit) {
137  case PrivateSegBuf:
138  physSgprIdx =
139  computeUnit->registerManager->mapSgpr(this, regInitIdx);
140  computeUnit->srf[simdId]->write(physSgprIdx,
141  task->amdQueue.scratch_resource_descriptor[0]);
142  ++regInitIdx;
143  DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
144  "Setting PrivateSegBuffer: s[%d] = %x\n",
145  computeUnit->cu_id, simdId,
146  wfSlotId, wfDynId, physSgprIdx,
147  task->amdQueue.scratch_resource_descriptor[0]);
148 
149  physSgprIdx =
150  computeUnit->registerManager->mapSgpr(this, regInitIdx);
151  computeUnit->srf[simdId]->write(physSgprIdx,
152  task->amdQueue.scratch_resource_descriptor[1]);
153  ++regInitIdx;
154  DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
155  "Setting PrivateSegBuffer: s[%d] = %x\n",
156  computeUnit->cu_id, simdId,
157  wfSlotId, wfDynId, physSgprIdx,
158  task->amdQueue.scratch_resource_descriptor[1]);
159 
160  physSgprIdx =
161  computeUnit->registerManager->mapSgpr(this, regInitIdx);
162  computeUnit->srf[simdId]->write(physSgprIdx,
163  task->amdQueue.scratch_resource_descriptor[2]);
164  ++regInitIdx;
165  DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
166  "Setting PrivateSegBuffer: s[%d] = %x\n",
167  computeUnit->cu_id, simdId,
168  wfSlotId, wfDynId, physSgprIdx,
169  task->amdQueue.scratch_resource_descriptor[2]);
170 
171  physSgprIdx =
172  computeUnit->registerManager->mapSgpr(this, regInitIdx);
173  computeUnit->srf[simdId]->write(physSgprIdx,
174  task->amdQueue.scratch_resource_descriptor[3]);
175 
176  ++regInitIdx;
177  DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
178  "Setting PrivateSegBuffer: s[%d] = %x\n",
179  computeUnit->cu_id, simdId,
180  wfSlotId, wfDynId, physSgprIdx,
181  task->amdQueue.scratch_resource_descriptor[3]);
182  break;
183  case DispatchPtr:
184  physSgprIdx =
185  computeUnit->registerManager->mapSgpr(this, regInitIdx);
186  computeUnit->srf[simdId]->write(physSgprIdx,
187  bits(host_disp_pkt_addr, 31, 0));
188  ++regInitIdx;
189  DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
190  "Setting DispatchPtr: s[%d] = %x\n",
191  computeUnit->cu_id, simdId,
192  wfSlotId, wfDynId, physSgprIdx,
193  bits(host_disp_pkt_addr, 31, 0));
194 
195  physSgprIdx =
196  computeUnit->registerManager->mapSgpr(this, regInitIdx);
197  computeUnit->srf[simdId]->write(physSgprIdx,
198  bits(host_disp_pkt_addr, 63, 32));
199  DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
200  "Setting DispatchPtr: s[%d] = %x\n",
201  computeUnit->cu_id, simdId,
202  wfSlotId, wfDynId, physSgprIdx,
203  bits(host_disp_pkt_addr, 63, 32));
204 
205  ++regInitIdx;
206  break;
207  case QueuePtr:
208  physSgprIdx =
209  computeUnit->registerManager->mapSgpr(this, regInitIdx);
210  computeUnit->srf[simdId]->write(physSgprIdx,
211  bits(task->hostAMDQueueAddr, 31, 0));
212  ++regInitIdx;
213  DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
214  "Setting QueuePtr: s[%d] = %x\n",
215  computeUnit->cu_id, simdId,
216  wfSlotId, wfDynId, physSgprIdx,
217  bits(task->hostAMDQueueAddr, 31, 0));
218 
219  physSgprIdx =
220  computeUnit->registerManager->mapSgpr(this, regInitIdx);
221  computeUnit->srf[simdId]->write(physSgprIdx,
222  bits(task->hostAMDQueueAddr, 63, 32));
223  DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
224  "Setting QueuePtr: s[%d] = %x\n",
225  computeUnit->cu_id, simdId,
226  wfSlotId, wfDynId, physSgprIdx,
227  bits(task->hostAMDQueueAddr, 63, 32));
228 
229  ++regInitIdx;
230  break;
231  case KernargSegPtr:
232  physSgprIdx =
233  computeUnit->registerManager->mapSgpr(this, regInitIdx);
234  computeUnit->srf[simdId]->write(physSgprIdx,
235  bits(kernarg_addr, 31, 0));
236  ++regInitIdx;
237  DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
238  "Setting KernargSegPtr: s[%d] = %x\n",
239  computeUnit->cu_id, simdId,
240  wfSlotId, wfDynId, physSgprIdx,
241  bits(kernarg_addr, 31, 0));
242 
243  physSgprIdx =
244  computeUnit->registerManager->mapSgpr(this, regInitIdx);
245  computeUnit->srf[simdId]->write(physSgprIdx,
246  bits(kernarg_addr, 63, 32));
247  DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
248  "Setting KernargSegPtr: s[%d] = %x\n",
249  computeUnit->cu_id, simdId,
250  wfSlotId, wfDynId, physSgprIdx,
251  bits(kernarg_addr, 63, 32));
252 
253  ++regInitIdx;
254  break;
255  case DispatchId:
256  physSgprIdx
257  = computeUnit->registerManager->mapSgpr(this, regInitIdx);
258  computeUnit->srf[simdId]->write(physSgprIdx,
259  task->dispatchId());
260  ++regInitIdx;
261  DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
262  "Setting DispatchId: s[%d] = %x\n",
263  computeUnit->cu_id, simdId,
264  wfSlotId, wfDynId, physSgprIdx,
265  task->dispatchId());
266  break;
267  case FlatScratchInit:
268  physSgprIdx
269  = computeUnit->registerManager->mapSgpr(this, regInitIdx);
270  computeUnit->srf[simdId]->write(physSgprIdx,
272  .scratch_backing_memory_location & 0xffffffff));
273  ++regInitIdx;
274  DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
275  "Setting FlatScratch Addr: s[%d] = %x\n",
276  computeUnit->cu_id, simdId,
277  wfSlotId, wfDynId, physSgprIdx,
279  .scratch_backing_memory_location & 0xffffffff));
280 
281  physSgprIdx =
282  computeUnit->registerManager->mapSgpr(this, regInitIdx);
283  // This value should be sizeof(DWORD) aligned, that is
284  // 4 byte aligned
285  computeUnit->srf[simdId]->write(physSgprIdx,
287  ++regInitIdx;
288  DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
289  "Setting FlatScratch size: s[%d] = %x\n",
290  computeUnit->cu_id, simdId,
291  wfSlotId, wfDynId, physSgprIdx,
316  hidden_priv_base =
317  (uint64_t)task->amdQueue.scratch_resource_descriptor[0] |
318  (((uint64_t)task->amdQueue.scratch_resource_descriptor[1]
319  & 0x000000000000ffff) << 32);
320  computeUnit->shader->initShHiddenPrivateBase(
321  hidden_priv_base,
323  break;
324  case PrivateSegSize:
325  physSgprIdx
326  = computeUnit->registerManager->mapSgpr(this, regInitIdx);
327  computeUnit->srf[simdId]->write(physSgprIdx,
328  task->privMemPerItem());
329  ++regInitIdx;
330  DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
331  "Setting private segment size: s[%d] = %x\n",
332  computeUnit->cu_id, simdId,
333  wfSlotId, wfDynId, physSgprIdx,
334  task->privMemPerItem());
335  break;
336  case GridWorkgroupCountX:
337  physSgprIdx =
338  computeUnit->registerManager->mapSgpr(this, regInitIdx);
339  wiCount = ((task->gridSize(0) +
340  task->wgSize(0) - 1) /
341  task->wgSize(0));
342  computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
343 
344  ++regInitIdx;
345  DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
346  "Setting num WG X: s[%d] = %x\n",
347  computeUnit->cu_id, simdId,
348  wfSlotId, wfDynId, physSgprIdx, wiCount);
349  break;
350  case GridWorkgroupCountY:
351  physSgprIdx =
352  computeUnit->registerManager->mapSgpr(this, regInitIdx);
353  wiCount = ((task->gridSize(1) +
354  task->wgSize(1) - 1) /
355  task->wgSize(1));
356  computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
357 
358  ++regInitIdx;
359  DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
360  "Setting num WG Y: s[%d] = %x\n",
361  computeUnit->cu_id, simdId,
362  wfSlotId, wfDynId, physSgprIdx, wiCount);
363  break;
364  case GridWorkgroupCountZ:
365  physSgprIdx =
366  computeUnit->registerManager->mapSgpr(this, regInitIdx);
367  wiCount = ((task->gridSize(2) +
368  task->wgSize(2) - 1) /
369  task->wgSize(2));
370  computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
371 
372  ++regInitIdx;
373  DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
374  "Setting num WG Z: s[%d] = %x\n",
375  computeUnit->cu_id, simdId,
376  wfSlotId, wfDynId, physSgprIdx, wiCount);
377  break;
378  case WorkgroupIdX:
379  physSgprIdx =
380  computeUnit->registerManager->mapSgpr(this, regInitIdx);
381  computeUnit->srf[simdId]->write(physSgprIdx,
382  workGroupId[0]);
383 
384  ++regInitIdx;
385  DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
386  "Setting WG ID X: s[%d] = %x\n",
387  computeUnit->cu_id, simdId,
388  wfSlotId, wfDynId, physSgprIdx, workGroupId[0]);
389  break;
390  case WorkgroupIdY:
391  physSgprIdx =
392  computeUnit->registerManager->mapSgpr(this, regInitIdx);
393  computeUnit->srf[simdId]->write(physSgprIdx,
394  workGroupId[1]);
395 
396  ++regInitIdx;
397  DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
398  "Setting WG ID Y: s[%d] = %x\n",
399  computeUnit->cu_id, simdId,
400  wfSlotId, wfDynId, physSgprIdx, workGroupId[1]);
401  break;
402  case WorkgroupIdZ:
403  physSgprIdx =
404  computeUnit->registerManager->mapSgpr(this, regInitIdx);
405  computeUnit->srf[simdId]->write(physSgprIdx,
406  workGroupId[2]);
407 
408  ++regInitIdx;
409  DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
410  "Setting WG ID Z: s[%d] = %x\n",
411  computeUnit->cu_id, simdId,
412  wfSlotId, wfDynId, physSgprIdx, workGroupId[2]);
413  break;
414  case PrivSegWaveByteOffset:
415  physSgprIdx =
416  computeUnit->registerManager->mapSgpr(this, regInitIdx);
430  computeUnit->srf[simdId]->write(physSgprIdx, 1024 *
431  (wgId * (wgSz / 64) + wfId) *
433 
434  ++regInitIdx;
435  DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
436  "Setting Private Seg Offset: s[%d] = %x\n",
437  computeUnit->cu_id, simdId,
438  wfSlotId, wfDynId, physSgprIdx,
439  1024 * (wgId * (wgSz / 64) + wfId) *
441  break;
442  case WorkgroupInfo:
443  firstWave = (wfId == 0) ? 1 : 0;
444  numWfsInWg = divCeil(wgSizeInWorkItems,
445  computeUnit->wfSize());
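 // finalValue packs three fields: bit 31 flags the first wavefront of the
 // workgroup, the ordered-append term sits at bit 6, and the low bits hold
 // the number of wavefronts in the workgroup. For example, the first wave
 // of a 256-work-item workgroup on a 64-lane machine gets firstWave = 1 and
 // numWfsInWg = 4, so finalValue = 0x80000004 (orderedAppendTerm stays 0).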
446  finalValue = firstWave << ((sizeof(uint32_t) * 8) - 1);
447  finalValue |= (orderedAppendTerm << 6);
448  finalValue |= numWfsInWg;
449  physSgprIdx =
450  computeUnit->registerManager->mapSgpr(this, regInitIdx);
451  computeUnit->srf[simdId]->
452  write(physSgprIdx, finalValue);
453 
454  ++regInitIdx;
455  DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
456  "Setting WG Info: s[%d] = %x\n",
457  computeUnit->cu_id, simdId,
458  wfSlotId, wfDynId, physSgprIdx, finalValue);
459  break;
460  default:
461  fatal("SGPR enable bit %i not supported\n", en_bit);
462  break;
463  }
464  }
465  }
466 
467  regInitIdx = 0;
468 
469  // iterate over all the init fields and check which
470  // bits are enabled
471  for (int en_bit = 0; en_bit < NumVectorInitFields; ++en_bit) {
472  if (task->vgprBitEnabled(en_bit)) {
473  uint32_t physVgprIdx = 0;
474  TheGpuISA::VecRegContainerU32 raw_vgpr;
475 
476  switch (en_bit) {
477  case WorkitemIdX:
478  {
479  physVgprIdx = computeUnit->registerManager
480  ->mapVgpr(this, regInitIdx);
481  TheGpuISA::VecElemU32 *vgpr_x
482  = raw_vgpr.as<TheGpuISA::VecElemU32>();
483 
484  for (int lane = 0; lane < workItemId[0].size(); ++lane) {
485  vgpr_x[lane] = workItemId[0][lane];
486  }
487 
488  computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
489  rawDist[regInitIdx] = 0;
490  ++regInitIdx;
491  }
492  break;
493  case WorkitemIdY:
494  {
495  physVgprIdx = computeUnit->registerManager
496  ->mapVgpr(this, regInitIdx);
497  TheGpuISA::VecElemU32 *vgpr_y
498  = raw_vgpr.as<TheGpuISA::VecElemU32>();
499 
500  for (int lane = 0; lane < workItemId[1].size(); ++lane) {
501  vgpr_y[lane] = workItemId[1][lane];
502  }
503 
504  computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
505  rawDist[regInitIdx] = 0;
506  ++regInitIdx;
507  }
508  break;
509  case WorkitemIdZ:
510  {
511  physVgprIdx = computeUnit->registerManager->
512  mapVgpr(this, regInitIdx);
513  TheGpuISA::VecElemU32 *vgpr_z
514  = raw_vgpr.as<TheGpuISA::VecElemU32>();
515 
516  for (int lane = 0; lane < workItemId[2].size(); ++lane) {
517  vgpr_z[lane] = workItemId[2][lane];
518  }
519 
520  computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
521  rawDist[regInitIdx] = 0;
522  ++regInitIdx;
523  }
524  break;
525  }
526  }
527  }
528 }
529 
530 void
531 Wavefront::resizeRegFiles(int num_vregs, int num_sregs)
532 {
533  maxVgprs = num_vregs;
534  maxSgprs = num_sregs;
535 }
536 
537 Wavefront::~Wavefront()
538 {
539 }
540 
541 void
542 Wavefront::setStatus(status_e newStatus)
543 {
544  if (computeUnit->idleCUTimeout > 0) {
545  // Wavefront's status transitions to stalled or stopped
546  if ((newStatus == S_STOPPED || newStatus == S_STALLED ||
547  newStatus == S_WAITCNT || newStatus == S_BARRIER) &&
548  (status != newStatus)) {
549  computeUnit->idleWfs++;
550  assert(computeUnit->idleWfs <=
551  (computeUnit->shader->n_wf * computeUnit->numVectorALUs));
552  if (computeUnit->idleWfs ==
553  (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
554  lastNonIdleTick = curTick();
555  }
556  // Wavefront's status transitions to an active state (from
557  // a stopped or stalled state)
558  } else if ((status == S_STOPPED || status == S_STALLED ||
559  status == S_WAITCNT || status == S_BARRIER) &&
560  (status != newStatus)) {
561  // if all WFs in the CU were idle then check if the idleness
562  // period exceeded the timeout threshold
563  if (computeUnit->idleWfs ==
564  (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
565  panic_if((curTick() - lastNonIdleTick) >=
566  computeUnit->idleCUTimeout,
567  "CU%d has been idle for %d ticks at tick %d",
568  computeUnit->cu_id, computeUnit->idleCUTimeout,
569  curTick());
570  }
571  computeUnit->idleWfs--;
572  assert(computeUnit->idleWfs >= 0);
573  }
574  }
575  status = newStatus;
576 }
577 
578 void
579 Wavefront::start(uint64_t _wf_dyn_id, Addr init_pc)
580 {
581  wfDynId = _wf_dyn_id;
582  _pc = init_pc;
583 
584  status = S_RUNNING;
585 
586  vecReads.resize(maxVgprs, 0);
587 }
588 
589 bool
590 Wavefront::isGmInstruction(GPUDynInstPtr ii)
591 {
592  if (ii->isGlobalMem() ||
593  (ii->isFlat() && ii->executedAs() == enums::SC_GLOBAL)) {
594  return true;
595  }
596 
597  return false;
598 }
599 
600 bool
601 Wavefront::isLmInstruction(GPUDynInstPtr ii)
602 {
603  if (ii->isLocalMem() ||
604  (ii->isFlat() && ii->executedAs() == enums::SC_GROUP)) {
605  return true;
606  }
607 
608  return false;
609 }
610 
611 bool
612 Wavefront::isOldestInstSleep()
613 {
614  if (instructionBuffer.empty())
615  return false;
616 
617  GPUDynInstPtr ii = instructionBuffer.front();
618 
619  if (ii->isSleep()) {
620  return true;
621  }
622  return false;
623 }
624 
625 bool
626 Wavefront::isOldestInstWaitcnt()
627 {
628  if (instructionBuffer.empty())
629  return false;
630 
631  GPUDynInstPtr ii = instructionBuffer.front();
632 
633  if (ii->isWaitcnt()) {
634  // waitcnt is a scalar
635  assert(ii->isScalar());
636  return true;
637  }
638 
639  return false;
640 }
641 
642 bool
643 Wavefront::isOldestInstScalarALU()
644 {
645  assert(!instructionBuffer.empty());
646  GPUDynInstPtr ii = instructionBuffer.front();
647 
648  if (status != S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn()
649  || ii->isEndOfKernel() || ii->isBranch() || ii->isALU() ||
650  (ii->isKernArgSeg() && ii->isLoad()))) {
651  return true;
652  }
653 
654  return false;
655 }
656 
657 bool
658 Wavefront::isOldestInstVectorALU()
659 {
660  assert(!instructionBuffer.empty());
661  GPUDynInstPtr ii = instructionBuffer.front();
662 
663  if (status != S_STOPPED && !ii->isScalar() && (ii->isNop() ||
664  ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel()
665  || (ii->isKernArgSeg() && ii->isLoad()))) {
666  return true;
667  }
668 
669  return false;
670 }
671 
672 bool
673 Wavefront::isOldestInstBarrier()
674 {
675  assert(!instructionBuffer.empty());
676  GPUDynInstPtr ii = instructionBuffer.front();
677 
678  if (status != S_STOPPED && ii->isBarrier()) {
679  return true;
680  }
681 
682  return false;
683 }
684 
685 bool
686 Wavefront::isOldestInstGMem()
687 {
688  assert(!instructionBuffer.empty());
689  GPUDynInstPtr ii = instructionBuffer.front();
690 
691  if (status != S_STOPPED && !ii->isScalar() && ii->isGlobalMem()) {
692  return true;
693  }
694 
695  return false;
696 }
697 
698 bool
699 Wavefront::isOldestInstScalarMem()
700 {
701  assert(!instructionBuffer.empty());
702  GPUDynInstPtr ii = instructionBuffer.front();
703 
704  if (status != S_STOPPED && ii->isScalar() && ii->isGlobalMem()) {
705  return true;
706  }
707 
708  return false;
709 }
710 
711 bool
712 Wavefront::isOldestInstLMem()
713 {
714  assert(!instructionBuffer.empty());
715  GPUDynInstPtr ii = instructionBuffer.front();
716 
717  if (status != S_STOPPED && ii->isLocalMem()) {
718  return true;
719  }
720 
721  return false;
722 }
723 
724 bool
725 Wavefront::isOldestInstPrivMem()
726 {
727  assert(!instructionBuffer.empty());
728  GPUDynInstPtr ii = instructionBuffer.front();
729 
730  if (status != S_STOPPED && ii->isPrivateSeg()) {
731  return true;
732  }
733 
734  return false;
735 }
736 
737 bool
738 Wavefront::isOldestInstFlatMem()
739 {
740  assert(!instructionBuffer.empty());
741  GPUDynInstPtr ii = instructionBuffer.front();
742 
743  if (status != S_STOPPED && ii->isFlat()) {
744  return true;
745  }
746 
747  return false;
748 }
749 
750 bool
751 Wavefront::stopFetch()
752 {
753  for (auto it : instructionBuffer) {
754  GPUDynInstPtr ii = it;
755  if (ii->isReturn() || ii->isBranch() ||
756  ii->isEndOfKernel()) {
757  return true;
758  }
759  }
760 
761  return false;
762 }
763 
764 void
765 Wavefront::freeResources()
766 {
767  execUnitId = -1;
768 }
769 
770 void Wavefront::validateRequestCounters()
771 {
772  panic_if(wrGmReqsInPipe < 0 || rdGmReqsInPipe < 0 ||
773  wrLmReqsInPipe < 0 || rdLmReqsInPipe < 0 ||
774  outstandingReqs < 0,
775  "Negative requests in pipe for WF%d for slot%d"
776  " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d,"
777  " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d,"
778  " Outstanding Reqs=%d\n",
779  wfDynId, wfSlotId, simdId, rdGmReqsInPipe, wrGmReqsInPipe,
780  rdLmReqsInPipe, wrLmReqsInPipe, outstandingReqs);
781 }
782 
783 void
784 Wavefront::reserveGmResource(GPUDynInstPtr ii)
785 {
786  if (!ii->isScalar()) {
787  if (ii->isLoad()) {
788  rdGmReqsInPipe++;
789  } else if (ii->isStore()) {
790  wrGmReqsInPipe++;
791  } else if (ii->isAtomic() || ii->isMemSync()) {
792  rdGmReqsInPipe++;
793  wrGmReqsInPipe++;
794  } else {
795  panic("Invalid memory operation!\n");
796  }
797  execUnitId = globalMem;
798  } else {
799  if (ii->isLoad()) {
800  scalarRdGmReqsInPipe++;
801  } else if (ii->isStore()) {
802  scalarWrGmReqsInPipe++;
803  } else if (ii->isAtomic() || ii->isMemSync()) {
804  scalarWrGmReqsInPipe++;
805  scalarRdGmReqsInPipe++;
806  } else {
807  panic("Invalid memory operation!\n");
808  }
809  execUnitId = scalarMem;
810  }
811 }
812 
813 void
814 Wavefront::reserveLmResource(GPUDynInstPtr ii)
815 {
816  fatal_if(ii->isScalar(),
817  "Scalar instructions can not access Shared memory!!!");
818  if (ii->isLoad()) {
819  rdLmReqsInPipe++;
820  } else if (ii->isStore()) {
821  wrLmReqsInPipe++;
822  } else if (ii->isAtomic() || ii->isMemSync()) {
823  wrLmReqsInPipe++;
824  rdLmReqsInPipe++;
825  } else {
826  panic("Invalid memory operation!\n");
827  }
828  execUnitId = localMem;
829 }
830 
831 std::vector<int>
832 Wavefront::reserveResources()
833 {
834  // vector of execution unit IDs to return to schedule stage
835  // this return is only used for debugging and an assertion...
836  std::vector<int> execUnitIds;
837 
838  // Get current instruction
839  GPUDynInstPtr ii = instructionBuffer.front();
840  assert(ii);
841 
842  // Single precision ALU or Branch or Return or Special instruction
843  if (ii->isALU() || ii->isSpecialOp() ||
844  ii->isBranch() || ii->isNop() ||
845  (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
846  ii->isReturn() || ii->isEndOfKernel()) {
847  if (!ii->isScalar()) {
848  execUnitId = simdId;
849  } else {
850  execUnitId = scalarAlu;
851  }
852  // this is to enforce a fixed number of cycles per issue slot per SIMD
853  } else if (ii->isBarrier()) {
854  execUnitId = ii->isScalar() ? scalarAluGlobalIdx : simdId;
855  } else if (ii->isFlat()) {
856  assert(!ii->isScalar());
857  reserveLmResource(ii);
858  // add execUnitId, reserved by reserveLmResource, to the list before
859  // it is overwritten by reserveGmResource
860  execUnitIds.push_back(execUnitId);
861  flatLmUnitId = execUnitId;
862  reserveGmResource(ii);
863  flatGmUnitId = execUnitId;
864  execUnitIds.push_back(flatGmUnitId);
865  execUnitId = -1;
866  } else if (ii->isGlobalMem()) {
867  reserveGmResource(ii);
868  } else if (ii->isLocalMem()) {
869  reserveLmResource(ii);
870  } else if (ii->isPrivateSeg()) {
871  fatal_if(ii->isScalar(),
872  "Scalar instructions can not access Private memory!!!");
873  reserveGmResource(ii);
874  } else {
875  panic("reserveResources -> Couldn't process op!\n");
876  }
877 
878  if (execUnitId != -1) {
879  execUnitIds.push_back(execUnitId);
880  }
881  assert(execUnitIds.size());
882  return execUnitIds;
883 }
884 
885 void
886 Wavefront::exec()
887 {
888  // ---- Exit if wavefront is inactive ----------------------------- //
889 
890  if (status == S_STOPPED || status == S_RETURNING ||
891  status==S_STALLED || instructionBuffer.empty()) {
892  return;
893  }
894 
895  if (status == S_WAITCNT) {
907  assert(isOldestInstWaitcnt());
908  }
909 
910  // Get current instruction
911 
912  GPUDynInstPtr ii = instructionBuffer.front();
913 
914  const Addr old_pc = pc();
915  DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
916  "(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId,
917  wfDynId, ii->disassemble(), old_pc, ii->seqNum());
918 
919  ii->execute(ii);
920  // delete the dynamic instruction from the pipeline map
921  computeUnit->deleteFromPipeMap(this);
922  // update the instruction stats in the CU
923  computeUnit->updateInstStats(ii);
924 
925  // inform VRF of instruction execution to schedule write-back
926  // and scoreboard ready for registers
927  if (!ii->isScalar()) {
928  computeUnit->vrf[simdId]->waveExecuteInst(this, ii);
929  }
930  computeUnit->srf[simdId]->waveExecuteInst(this, ii);
931 
932  computeUnit->shader->incVectorInstSrcOperand(ii->numSrcVecRegOperands());
933  computeUnit->shader->incVectorInstDstOperand(ii->numDstVecRegOperands());
942 
943  if (lastInstExec) {
946  }
948 
949  // want to track:
950  // number of reads that occur per value written
951 
952  // vector RAW dependency tracking
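 // rawDist maps each virtual VGPR to the dynamic instruction count at its
 // last write, so the distance sampled on a subsequent read is measured in
 // dynamically executed instructions, not in cycles.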
953  for (const auto& srcVecOp : ii->srcVecRegOperands()) {
954  for (const auto& virtIdx : srcVecOp.virtIndices()) {
955  // This check should never fail, but to be safe we check
956  if (rawDist.find(virtIdx) != rawDist.end()) {
957  stats.vecRawDistance.sample(stats.numInstrExecuted.value() -
958  rawDist[virtIdx]);
959  }
960  // increment number of reads to this register
961  vecReads[virtIdx]++;
962  }
963  }
964 
965  for (const auto& dstVecOp : ii->dstVecRegOperands()) {
966  for (const auto& virtIdx : dstVecOp.virtIndices()) {
967  // rawDist is set on writes, but will not be set for the first
968  // write to each physical register
969  if (rawDist.find(virtIdx) != rawDist.end()) {
970  // Sample the number of reads that were performed
971  stats.readsPerWrite.sample(vecReads[virtIdx]);
972  }
973  // on a write, reset count of reads to 0
974  vecReads[virtIdx] = 0;
975 
976  rawDist[virtIdx] = stats.numInstrExecuted.value();
977  }
978  }
979 
980  if (pc() == old_pc) {
981  // PC not modified by instruction, proceed to next
982  _gpuISA.advancePC(ii);
983  instructionBuffer.pop_front();
984  } else {
985  DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave%d %s taken branch\n",
986  computeUnit->cu_id, simdId, wfSlotId, wfDynId,
987  ii->disassemble());
988  discardFetch();
989  }
990  DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n",
991  computeUnit->cu_id, simdId, wfSlotId, wfDynId, pc());
992 
994  const int num_active_lanes = execMask().count();
996  computeUnit->stats.numVecOpsExecuted += num_active_lanes;
997 
998  if (ii->isF16() && ii->isALU()) {
999  if (ii->isF32() || ii->isF64()) {
1000  fatal("Instruction is tagged as both (1) F16, and (2) "
1001  "either F32 or F64.");
1002  }
1003  computeUnit->stats.numVecOpsExecutedF16 += num_active_lanes;
1004  if (ii->isFMA()) {
1005  computeUnit->stats.numVecOpsExecutedFMA16 += num_active_lanes;
1007  += num_active_lanes;
1008  }
1009  else if (ii->isMAC()) {
1010  computeUnit->stats.numVecOpsExecutedMAC16 += num_active_lanes;
1012  += num_active_lanes;
1013  }
1014  else if (ii->isMAD()) {
1015  computeUnit->stats.numVecOpsExecutedMAD16 += num_active_lanes;
1017  += num_active_lanes;
1018  }
1019  }
1020  if (ii->isF32() && ii->isALU()) {
1021  if (ii->isF16() || ii->isF64()) {
1022  fatal("Instruction is tagged as both (1) F32, and (2) "
1023  "either F16 or F64.");
1024  }
1025  computeUnit->stats.numVecOpsExecutedF32 += num_active_lanes;
1026  if (ii->isFMA()) {
1027  computeUnit->stats.numVecOpsExecutedFMA32 += num_active_lanes;
1029  += num_active_lanes;
1030  }
1031  else if (ii->isMAC()) {
1032  computeUnit->stats.numVecOpsExecutedMAC32 += num_active_lanes;
1034  += num_active_lanes;
1035  }
1036  else if (ii->isMAD()) {
1037  computeUnit->stats.numVecOpsExecutedMAD32 += num_active_lanes;
1039  += num_active_lanes;
1040  }
1041  }
1042  if (ii->isF64() && ii->isALU()) {
1043  if (ii->isF16() || ii->isF32()) {
1044  fatal("Instruction is tagged as both (1) F64, and (2) "
1045  "either F16 or F32.");
1046  }
1047  computeUnit->stats.numVecOpsExecutedF64 += num_active_lanes;
1048  if (ii->isFMA()) {
1049  computeUnit->stats.numVecOpsExecutedFMA64 += num_active_lanes;
1051  += num_active_lanes;
1052  }
1053  else if (ii->isMAC()) {
1054  computeUnit->stats.numVecOpsExecutedMAC64 += num_active_lanes;
1056  += num_active_lanes;
1057  }
1058  else if (ii->isMAD()) {
1059  computeUnit->stats.numVecOpsExecutedMAD64 += num_active_lanes;
1061  += num_active_lanes;
1062  }
1063  }
1064  if (isGmInstruction(ii)) {
1065  computeUnit->stats.activeLanesPerGMemInstrDist.sample(
1066  num_active_lanes);
1067  } else if (isLmInstruction(ii)) {
1068  computeUnit->stats.activeLanesPerLMemInstrDist.sample(
1069  num_active_lanes);
1070  }
1071  }
1072 
1077  if (execMask().none() && ii->isFlat()) {
1078  computeUnit->getTokenManager()->recvTokens(1);
1079  return;
1080  }
1081 
1082  // Update Vector ALU pipeline and other resources
1083  bool flat_as_gm = false;
1084  bool flat_as_lm = false;
1085  if (ii->isFlat()) {
1086  flat_as_gm = (ii->executedAs() == enums::SC_GLOBAL) ||
1087  (ii->executedAs() == enums::SC_PRIVATE);
1088  flat_as_lm = (ii->executedAs() == enums::SC_GROUP);
1089  }
1090 
1091  // Single precision ALU or Branch or Return or Special instruction
1092  // Note, we use the same timing regardless of SP or DP ALU operation.
1093  if (ii->isALU() || ii->isSpecialOp() ||
1094  ii->isBranch() || ii->isNop() ||
1095  (ii->isKernArgSeg() && ii->isLoad()) ||
1096  ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) {
1097  // this is to enforce a fixed number of cycles per issue slot per SIMD
1098  if (!ii->isScalar()) {
1100  cyclesToTicks(computeUnit->issuePeriod));
1101  } else {
1103  cyclesToTicks(computeUnit->issuePeriod));
1104  }
1105  // Barrier on Scalar ALU
1106  } else if (ii->isBarrier()) {
1108  cyclesToTicks(computeUnit->issuePeriod));
1109  // GM or Flat as GM Load
1110  } else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) {
1111  if (!ii->isScalar()) {
1118  } else {
1120  cyclesToTicks(computeUnit->srf_scm_bus_latency));
1125  }
1126  // GM or Flat as GM Store
1127  } else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) {
1128  if (!ii->isScalar()) {
1130  cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
1135  } else {
1137  cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
1142  }
1143  } else if ((ii->isAtomic() || ii->isMemSync()) &&
1144  (ii->isGlobalMem() || flat_as_gm)) {
1145  if (!ii->isScalar()) {
1147  cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
1152  } else {
1154  cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
1159  }
1160  // LM or Flat as LM Load
1161  } else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) {
1163  cyclesToTicks(computeUnit->vrf_lm_bus_latency));
1168  // LM or Flat as LM Store
1169  } else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
1171  cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
1176  // LM or Flat as LM, Atomic or MemFence
1177  } else if ((ii->isAtomic() || ii->isMemSync()) &&
1178  (ii->isLocalMem() || flat_as_lm)) {
1180  cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
1185  } else {
1186  panic("Bad instruction type!\n");
1187  }
1188 }
1189 
1190 GPUDynInstPtr
1191 Wavefront::nextInstr()
1192 {
1193  // Read next instruction from instruction buffer
1194  GPUDynInstPtr ii = instructionBuffer.front();
1195  // if the WF has been dispatched in the schedule stage then
1196  // check the next oldest instruction for readiness
1197  if (computeUnit->pipeMap.find(ii->seqNum()) !=
1198  computeUnit->pipeMap.end()) {
1199  if (instructionBuffer.size() > 1) {
1200  auto it = instructionBuffer.begin() + 1;
1201  return *it;
1202  } else { // No new instructions to check
1203  return nullptr;
1204  }
1205  }
1206  return ii;
1207 }
1208 
1209 void
1210 Wavefront::discardFetch()
1211 {
1212  instructionBuffer.clear();
1213  dropFetch |= pendingFetch;
1214 
1219  computeUnit->fetchStage.fetchUnit(simdId).flushBuf(wfSlotId);
1220 }
1221 
1222 bool
1223 Wavefront::waitCntsSatisfied()
1224 {
1225  // vmWaitCnt, expWaitCnt, and lgkmWaitCnt all being uninitialized means the
1226  // waitCnt instruction has been dispatched but not executed yet: next
1227  // instruction should be blocked until waitCnt is executed.
1228  if (vmWaitCnt == -1 && expWaitCnt == -1 && lgkmWaitCnt == -1) {
1229  return false;
1230  }
1231 
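 // A counter is only checked if the waitcnt instruction set it (i.e., it is
 // not -1); the wave may resume once the number of outstanding instructions
 // of each waited-on class has drained to the requested count.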
1237  if (vmWaitCnt != -1) {
1238  if (vmemInstsIssued > vmWaitCnt) {
1239  // vmWaitCnt not satisfied
1240  return false;
1241  }
1242  }
1243 
1244  if (expWaitCnt != -1) {
1245  if (expInstsIssued > expWaitCnt) {
1246  // expWaitCnt not satisfied
1247  return false;
1248  }
1249  }
1250 
1251  if (lgkmWaitCnt != -1) {
1252  if (lgkmInstsIssued > lgkmWaitCnt) {
1253  // lgkmWaitCnt not satisfied
1254  return false;
1255  }
1256  }
1257 
1258  // if we get here all outstanding waitcnts must
1259  // be satisfied, so we resume normal operation
1260  clearWaitCnts();
1261 
1262  return true;
1263 }
1264 
1265 bool
1266 Wavefront::sleepDone()
1267 {
1268  assert(status == S_STALLED_SLEEP);
1269 
1270  // if the sleep count has not been set, then the sleep instruction has not
1271  // been executed yet, so we will return false without setting the wavefront
1272  // status
1273  if (sleepCnt == 0)
1274  return false;
1275 
1276  sleepCnt--;
1277  if (sleepCnt != 0)
1278  return false;
1279 
1280  status = S_RUNNING;
1281  return true;
1282 }
1283 
1284 void
1285 Wavefront::setSleepTime(int sleep_time)
1286 {
1287  assert(sleepCnt == 0);
1288  sleepCnt = sleep_time;
1289 }
1290 
1291 void
1292 Wavefront::setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt)
1293 {
1294  // the scoreboard should have set the status
1295  // to S_WAITCNT once a waitcnt instruction
1296  // was marked as ready
1297  assert(status == S_WAITCNT);
1298 
1299  // waitcnt instruction shouldn't be sending
1300  // negative counts
1301  assert(vm_wait_cnt >= 0);
1302  assert(exp_wait_cnt >= 0);
1303  assert(lgkm_wait_cnt >= 0);
1304  // waitcnts are a max of 15 because we have
1305  // only 1 nibble (4 bits) to set the counts
1306  assert(vm_wait_cnt <= 0xf);
1307  assert(exp_wait_cnt <= 0x7);
1308  assert(lgkm_wait_cnt <= 0x1f);
1309 
1316  assert(vmWaitCnt == -1);
1317  assert(expWaitCnt == -1);
1318  assert(lgkmWaitCnt == -1);
1319 
1326  if (vm_wait_cnt != 0xf)
1327  vmWaitCnt = vm_wait_cnt;
1328 
1329  if (exp_wait_cnt != 0x7)
1330  expWaitCnt = exp_wait_cnt;
1331 
1332  if (lgkm_wait_cnt != 0x1f)
1333  lgkmWaitCnt = lgkm_wait_cnt;
1334 }
1335 
1336 void
1337 Wavefront::clearWaitCnts()
1338 {
1339  // reset the waitcnts back to
1340  // -1, indicating they are no
1341  // longer valid
1342  vmWaitCnt = -1;
1343  expWaitCnt = -1;
1344  lgkmWaitCnt = -1;
1345 
1346  // resume running normally
1347  status = S_RUNNING;
1348 }
1349 
1350 void
1351 Wavefront::incVMemInstsIssued()
1352 {
1353  ++vmemInstsIssued;
1354 }
1355 
1356 void
1357 Wavefront::incExpInstsIssued()
1358 {
1359  ++expInstsIssued;
1360 }
1361 
1362 void
1363 Wavefront::incLGKMInstsIssued()
1364 {
1365  ++lgkmInstsIssued;
1366 }
1367 
1368 void
1369 Wavefront::decVMemInstsIssued()
1370 {
1371  --vmemInstsIssued;
1372 }
1373 
1374 void
1375 Wavefront::decExpInstsIssued()
1376 {
1377  --expInstsIssued;
1378 }
1379 
1380 void
1381 Wavefront::decLGKMInstsIssued()
1382 {
1383  --lgkmInstsIssued;
1384 }
1385 
1386 Addr
1387 Wavefront::pc() const
1388 {
1389  return _pc;
1390 }
1391 
1392 void
1393 Wavefront::pc(Addr new_pc)
1394 {
1395  _pc = new_pc;
1396 }
1397 
1398 VectorMask&
1399 Wavefront::execMask()
1400 {
1401  return _execMask;
1402 }
1403 
1404 bool
1405 Wavefront::execMask(int lane) const
1406 {
1407  return _execMask[lane];
1408 }
1409 
1410 void
1411 Wavefront::freeRegisterFile()
1412 {
1413  /* clear busy registers */
1414  for (int i=0; i < maxVgprs; i++) {
1415  int vgprIdx = computeUnit->registerManager->mapVgpr(this, i);
1416  computeUnit->vrf[simdId]->markReg(vgprIdx, false);
1417  }
1418 
1419  /* Free registers used by this wavefront */
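 // The reserved VGPR region can wrap around the end of the physical register
 // file, hence the modulo when computing the end index of the region to free.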
1420  uint32_t endIndex = (startVgprIndex + reservedVectorRegs - 1) %
1421  computeUnit->vrf[simdId]->numRegs();
1422  computeUnit->registerManager->vrfPoolMgrs[simdId]->
1423  freeRegion(startVgprIndex, endIndex);
1424 }
1425 
1426 void
1427 Wavefront::computeActualWgSz(HSAQueueEntry *task)
1428 {
1429  actualWgSzTotal = 1;
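 // Workgroups at the upper edge of the grid may be partial in a dimension,
 // so clamp each dimension's actual size to whatever of the grid remains.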
1430  for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
1431  actualWgSz[d] = std::min(workGroupSz[d], gridSz[d]
1432  - task->wgId(d) * workGroupSz[d]);
1433  actualWgSzTotal *= actualWgSz[d];
1434  }
1435 }
1436 
1437 void
1438 Wavefront::barrierId(int bar_id)
1439 {
1440  assert(bar_id >= WFBarrier::InvalidID);
1441  assert(bar_id < computeUnit->numBarrierSlots());
1442  barId = bar_id;
1443 }
1444 
1445 int
1446 Wavefront::barrierId() const
1447 {
1448  return barId;
1449 }
1450 
1451 bool
1452 Wavefront::hasBarrier() const
1453 {
1454  return barId > WFBarrier::InvalidID;
1455 }
1456 
1457 void
1458 Wavefront::releaseBarrier()
1459 {
1460  barId = WFBarrier::InvalidID;
1461 }
1462 
1463 Wavefront::WavefrontStats::WavefrontStats(statistics::Group *parent)
1464  : statistics::Group(parent),
1465  ADD_STAT(numInstrExecuted,
1466  "number of instructions executed by this WF slot"),
1467  ADD_STAT(schCycles, "number of cycles spent in schedule stage"),
1468  ADD_STAT(schStalls, "number of cycles WF is stalled in SCH stage"),
1469  ADD_STAT(schRfAccessStalls, "number of cycles wave selected in SCH but "
1470  "RF denied adding instruction"),
1471  ADD_STAT(schResourceStalls, "number of cycles stalled in sch by resource"
1472  " not available"),
1473  ADD_STAT(schOpdNrdyStalls, "number of cycles stalled in sch waiting for "
1474  "RF reads to complete"),
1475  ADD_STAT(schLdsArbStalls,
1476  "number of cycles wave stalled due to LDS-VRF arbitration"),
1477  // FIXME: the name of the WF needs to be unique
1478  ADD_STAT(numTimesBlockedDueWAXDependencies, "number of times the wf's "
1479  "instructions are blocked due to WAW or WAR dependencies"),
1480  // FIXME: the name of the WF needs to be unique
1481  ADD_STAT(numTimesBlockedDueRAWDependencies, "number of times the wf's "
1482  "instructions are blocked due to RAW dependencies"),
1483  ADD_STAT(vecRawDistance,
1484  "Count of RAW distance in dynamic instructions for this WF"),
1485  ADD_STAT(readsPerWrite, "Count of Vector reads per write for this WF")
1486 {
1487  vecRawDistance.init(0, 20, 1);
1488  readsPerWrite.init(0, 4, 1);
1489 }
1490 
1491 } // namespace gem5