gem5 v22.0.0.1
compute_unit.cc
1 /*
2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from this
17  * software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include "gpu-compute/compute_unit.hh"
33 
34 #include <limits>
35 
38 #include "base/output.hh"
39 #include "debug/GPUDisp.hh"
40 #include "debug/GPUExec.hh"
41 #include "debug/GPUFetch.hh"
42 #include "debug/GPUMem.hh"
43 #include "debug/GPUPort.hh"
44 #include "debug/GPUPrefetch.hh"
45 #include "debug/GPUReg.hh"
46 #include "debug/GPURename.hh"
47 #include "debug/GPUSync.hh"
48 #include "debug/GPUTLB.hh"
49 #include "gpu-compute/dispatcher.hh"
50 #include "gpu-compute/gpu_command_processor.hh"
51 #include "gpu-compute/gpu_dyn_inst.hh"
52 #include "gpu-compute/gpu_static_inst.hh"
53 #include "gpu-compute/scalar_register_file.hh"
54 #include "gpu-compute/shader.hh"
55 #include "gpu-compute/simple_pool_manager.hh"
56 #include "gpu-compute/vector_register_file.hh"
57 #include "gpu-compute/wavefront.hh"
58 #include "mem/page_table.hh"
59 #include "sim/process.hh"
60 #include "sim/sim_exit.hh"
61 
62 namespace gem5
63 {
64 
65 ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p),
66  numVectorGlobalMemUnits(p.num_global_mem_pipes),
67  numVectorSharedMemUnits(p.num_shared_mem_pipes),
68  numScalarMemUnits(p.num_scalar_mem_pipes),
69  numVectorALUs(p.num_SIMDs),
70  numScalarALUs(p.num_scalar_cores),
71  vrfToCoalescerBusWidth(p.vrf_to_coalescer_bus_width),
72  coalescerToVrfBusWidth(p.coalescer_to_vrf_bus_width),
73  registerManager(p.register_manager),
74  fetchStage(p, *this),
75  scoreboardCheckStage(p, *this, scoreboardCheckToSchedule),
76  scheduleStage(p, *this, scoreboardCheckToSchedule, scheduleToExecute),
77  execStage(p, *this, scheduleToExecute),
78  globalMemoryPipe(p, *this),
79  localMemoryPipe(p, *this),
80  scalarMemoryPipe(p, *this),
81  tickEvent([this]{ exec(); }, "Compute unit tick event",
82  false, Event::CPU_Tick_Pri),
83  cu_id(p.cu_id),
84  vrf(p.vector_register_file), srf(p.scalar_register_file),
85  simdWidth(p.simd_width),
86  spBypassPipeLength(p.spbypass_pipe_length),
87  dpBypassPipeLength(p.dpbypass_pipe_length),
88  scalarPipeStages(p.scalar_pipe_length),
89  operandNetworkLength(p.operand_network_length),
90  issuePeriod(p.issue_period),
91  vrf_gm_bus_latency(p.vrf_gm_bus_latency),
92  srf_scm_bus_latency(p.srf_scm_bus_latency),
93  vrf_lm_bus_latency(p.vrf_lm_bus_latency),
94  perLaneTLB(p.perLaneTLB), prefetchDepth(p.prefetch_depth),
95  prefetchStride(p.prefetch_stride), prefetchType(p.prefetch_prev_type),
96  debugSegFault(p.debugSegFault),
97  functionalTLB(p.functionalTLB), localMemBarrier(p.localMemBarrier),
98  countPages(p.countPages),
99  req_tick_latency(p.mem_req_latency * p.clk_domain->clockPeriod()),
100  resp_tick_latency(p.mem_resp_latency * p.clk_domain->clockPeriod()),
101  _requestorId(p.system->getRequestorId(this, "ComputeUnit")),
102  lds(*p.localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
103  ldsPort(csprintf("%s-port", name()), this),
104  scalarDataPort(csprintf("%s-port", name()), this),
105  scalarDTLBPort(csprintf("%s-port", name()), this),
106  sqcPort(csprintf("%s-port", name()), this),
107  sqcTLBPort(csprintf("%s-port", name()), this),
108  _cacheLineSize(p.system->cacheLineSize()),
109  _numBarrierSlots(p.num_barrier_slots),
110  globalSeqNum(0), wavefrontSize(p.wf_size),
111  scoreboardCheckToSchedule(p),
112  scheduleToExecute(p),
113  stats(this, p.n_wf)
114 {
115  // This is not currently supported and would require adding more handling
116  // for system vs. device memory requests on the functional paths, so we
117  // fatal immediately in the constructor if this configuration is seen.
118  fatal_if(functionalTLB && FullSystem,
119  "Functional TLB not supported in full-system GPU simulation");
120 
121     /**
122      * This check is necessary because std::bitset only provides conversion
123      * to unsigned long or unsigned long long via to_ulong() or to_ullong().
124      * There are a few places in the code where to_ullong() is used, however
125      * if wavefrontSize is larger than a value the host can support then
126      * bitset will throw a runtime exception. We should remove all use of
127      * to_long() or to_ullong() so we can have wavefrontSize greater than
128      * 64b, however until that is done this assert is required.
129      */
130  fatal_if(p.wf_size > std::numeric_limits<unsigned long long>::digits ||
131  p.wf_size <= 0,
132  "WF size is larger than the host can support");
133  fatal_if(!isPowerOf2(wavefrontSize),
134  "Wavefront size should be a power of 2");
135  // calculate how many cycles a vector load or store will need to transfer
136  // its data over the corresponding buses
137  numCyclesPerStoreTransfer =
138  (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
139  (double)vrfToCoalescerBusWidth);
140 
141  numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
142  / coalescerToVrfBusWidth;
143 
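    // Editor's note: worked example, assuming a 64-lane wavefront and a
    // 32 B vrf-to-coalescer bus (illustrative values, not necessarily the
    // configured defaults): ceil(64 lanes * 4 B / 32 B) = 8 cycles per
    // store transfer. Note the load path above uses plain integer division,
    // so it truncates rather than rounding up.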
144  // Initialization: all WF slots are assumed STOPPED
145  idleWfs = p.n_wf * numVectorALUs;
146  lastVaddrWF.resize(numVectorALUs);
147  wfList.resize(numVectorALUs);
148 
149  wfBarrierSlots.resize(p.num_barrier_slots, WFBarrier());
150 
151  for (int i = 0; i < p.num_barrier_slots; ++i) {
152  freeBarrierIds.insert(i);
153  }
154 
155  for (int j = 0; j < numVectorALUs; ++j) {
156  lastVaddrWF[j].resize(p.n_wf);
157 
158  for (int i = 0; i < p.n_wf; ++i) {
159  lastVaddrWF[j][i].resize(wfSize());
160 
161  wfList[j].push_back(p.wavefronts[j * p.n_wf + i]);
162  wfList[j][i]->setParent(this);
163 
164  for (int k = 0; k < wfSize(); ++k) {
165  lastVaddrWF[j][i][k] = 0;
166  }
167  }
168  }
169 
170  lastVaddrSimd.resize(numVectorALUs);
171 
172  for (int i = 0; i < numVectorALUs; ++i) {
173  lastVaddrSimd[i].resize(wfSize(), 0);
174  }
175 
176  lastVaddrCU.resize(wfSize());
177 
178  lds.setParent(this);
179 
180  if (p.execPolicy == "OLDEST-FIRST") {
181  exec_policy = EXEC_POLICY::OLDEST;
182  } else if (p.execPolicy == "ROUND-ROBIN") {
183  exec_policy = EXEC_POLICY::RR;
184  } else {
185  fatal("Invalid WF execution policy (CU)\n");
186  }
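    // Editor's note: the execPolicy strings checked above come from the
    // Python-side CU configuration; a sketch (parameter name as checked
    // here, other arguments elided):
    //   ComputeUnit(execPolicy="OLDEST-FIRST")  # or "ROUND-ROBIN"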
187 
188  for (int i = 0; i < p.port_memory_port_connection_count; ++i) {
189  memPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
190  }
191 
192  for (int i = 0; i < p.port_translation_port_connection_count; ++i) {
193  tlbPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
194  }
195 
196  // Setup tokens for response ports. The number of tokens in memPortTokens
197  // is the total token count for the entire vector port (i.e., this CU).
198  memPortTokens = new TokenManager(p.max_cu_tokens);
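    // Editor's note: each in-flight vector memory request must first acquire
    // a token from this CU-wide pool (p.max_cu_tokens); tokens are returned
    // as responses drain (see the getTokenManager()->recvTokens(1) call in
    // the DTLB response path below), which throttles outstanding requests.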
199 
200  registerExitCallback([this]() { exitCallback(); });
201 
202  lastExecCycle.resize(numVectorALUs, 0);
203 
204  for (int i = 0; i < vrf.size(); ++i) {
205  vrf[i]->setParent(this);
206  }
207  for (int i = 0; i < srf.size(); ++i) {
208  srf[i]->setParent(this);
209  }
210  numVecRegsPerSimd = vrf[0]->numRegs();
211  numScalarRegsPerSimd = srf[0]->numRegs();
212 
213  registerManager->setParent(this);
214 
215  activeWaves = 0;
216 
217  instExecPerSimd.resize(numVectorALUs, 0);
218 
219  // Calculate the number of bits to address a cache line
220  panic_if(!isPowerOf2(_cacheLineSize),
221  "Cache line size should be a power of two.");
222  cacheLineBits = floorLog2(_cacheLineSize);
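    // Editor's note: e.g., for a typical 64 B cache line (an illustrative
    // value), floorLog2(64) = 6, i.e., the low 6 address bits select a byte
    // within the line.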
223 }
224 
225 ComputeUnit::~ComputeUnit()
226 {
227  // Delete wavefront slots
228  for (int j = 0; j < numVectorALUs; ++j) {
229  for (int i = 0; i < shader->n_wf; ++i) {
230  delete wfList[j][i];
231  }
232  lastVaddrSimd[j].clear();
233  }
234  lastVaddrCU.clear();
235 }
236 
237 int
238 ComputeUnit::numExeUnits() const
239 {
240     return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits +
241         numVectorSharedMemUnits + numScalarMemUnits;
242 }
243 
244 // index into readyList of the first memory unit
245 int
246 ComputeUnit::firstMemUnit() const
247 {
248  return numVectorALUs + numScalarALUs;
249 }
250 
251 // index into readyList of the last memory unit
252 int
253 ComputeUnit::lastMemUnit() const
254 {
255  return numExeUnits() - 1;
256 }
257 
258 // index into scalarALUs vector of SALU used by the wavefront
259 int
260 ComputeUnit::mapWaveToScalarAlu(Wavefront *w) const
261 {
262  if (numScalarALUs == 1) {
263  return 0;
264  } else {
265  return w->simdId % numScalarALUs;
266  }
267 }
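// Editor's note: e.g., with 4 scalar ALUs (an illustrative value), waves on
// SIMDs 0..7 map to SALUs 0,1,2,3,0,1,2,3 via the modulo above.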
268 
269 // index into readyList of Scalar ALU unit used by wavefront
270 int
271 ComputeUnit::mapWaveToScalarAluGlobalIdx(Wavefront *w) const
272 {
273     return mapWaveToScalarAlu(w) + numVectorALUs;
274 }
275 
276 // index into readyList of Global Memory unit used by wavefront
277 int
278 ComputeUnit::mapWaveToGlobalMem(Wavefront *w) const
279 {
280  // TODO: FIXME if more than 1 GM pipe supported
281  return numVectorALUs + numScalarALUs;
282 }
283 
284 // index into readyList of Local Memory unit used by wavefront
285 int
286 ComputeUnit::mapWaveToLocalMem(Wavefront *w) const
287 {
288     // TODO: FIXME if more than 1 LM pipe supported
289     return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits;
290 }
291 
292 // index into readyList of Scalar Memory unit used by wavefront
293 int
294 ComputeUnit::mapWaveToScalarMem(Wavefront *w) const
295 {
296     // TODO: FIXME if more than 1 ScM pipe supported
297     return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits +
298         numVectorSharedMemUnits;
299 }
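// Editor's note: taken together, the map* functions above lay out the
// readyList execution-unit indices as: [0, numVectorALUs) vector ALUs, then
// numScalarALUs scalar ALUs, then one global-memory, one local-memory, and
// one scalar-memory pipe, in that order (cf. firstMemUnit/lastMemUnit).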
300 
301 void
302 ComputeUnit::fillKernelState(Wavefront *w, HSAQueueEntry *task)
303 {
304  w->resizeRegFiles(task->numVectorRegs(), task->numScalarRegs());
305  w->workGroupSz[0] = task->wgSize(0);
306  w->workGroupSz[1] = task->wgSize(1);
307  w->workGroupSz[2] = task->wgSize(2);
308  w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
309  w->gridSz[0] = task->gridSize(0);
310  w->gridSz[1] = task->gridSize(1);
311  w->gridSz[2] = task->gridSize(2);
312  w->computeActualWgSz(task);
313 }
314 
315 void
316 ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
317  HSAQueueEntry *task, int bar_id, bool fetchContext)
318 {
319  static int _n_wave = 0;
320 
321  VectorMask init_mask;
322  init_mask.reset();
323 
324  for (int k = 0; k < wfSize(); ++k) {
325  if (k + waveId * wfSize() < w->actualWgSzTotal)
326  init_mask[k] = 1;
327  }
328 
329  w->execMask() = init_mask;
330 
331  w->kernId = task->dispatchId();
332  w->wfId = waveId;
333  w->initMask = init_mask.to_ullong();
334 
335  if (bar_id > WFBarrier::InvalidID) {
336  w->barrierId(bar_id);
337  } else {
338  assert(!w->hasBarrier());
339  }
340 
341  for (int k = 0; k < wfSize(); ++k) {
342  w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
343  w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
344  w->actualWgSz[1];
345  w->workItemId[2][k] = (k + waveId * wfSize()) /
346  (w->actualWgSz[0] * w->actualWgSz[1]);
347 
348  w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
349  w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
350  w->workItemId[0][k];
351  }
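    // Editor's note: worked example of the mapping above, assuming an actual
    // WG size of (16, 16, 1) and 64-wide waves (illustrative values): lane k
    // of wave 0 gets x = k % 16, y = (k / 16) % 16, z = 0, and
    // workItemFlatId = z*(16*16) + y*16 + x = k, the usual linearization.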
352 
353  // WG state
354  w->wgId = task->globalWgId();
355  w->dispatchId = task->dispatchId();
356  w->workGroupId[0] = w->wgId % task->numWg(0);
357  w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1);
358  w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1));
359 
360  // set the wavefront context to have a pointer to this section of the LDS
361  w->ldsChunk = ldsChunk;
362 
363  [[maybe_unused]] int32_t refCount =
364  lds.increaseRefCounter(w->dispatchId, w->wgId);
365  DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
366  cu_id, w->wgId, refCount);
367 
368  w->instructionBuffer.clear();
369 
370  if (w->pendingFetch)
371  w->dropFetch = true;
372 
373  DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
374  "WF[%d][%d]. Ref cnt:%d\n", _n_wave, w->barrierId(), cu_id,
375  w->simdId, w->wfSlotId, refCount);
376 
377  w->initRegState(task, w->actualWgSzTotal);
378  w->start(_n_wave++, task->codeAddr());
379 
380     stats.waveLevelParallelism.sample(activeWaves);
381  activeWaves++;
382 }
383 
384 /**
385  * trigger invalidate operation in the CU
386  *
387  * req: request initialized in shader, carrying the invalidate flags
388  */
389 void
390 ComputeUnit::doInvalidate(RequestPtr req, int kernId){
391     GPUDynInstPtr gpuDynInst
392         = std::make_shared<GPUDynInst>(this, nullptr,
393             new KernelLaunchStaticInst(), getAndIncSeqNum());
394 
395  // kern_id will be used in inv responses
396  gpuDynInst->kern_id = kernId;
397  // update contextId field
398  req->setContext(gpuDynInst->wfDynId);
399 
400  injectGlobalMemFence(gpuDynInst, true, req);
401 }
402 
403 /**
404  * trigger flush operation in the CU
405  *
406  * gpuDynInst: inst passed to the request
407  */
408 void
409 ComputeUnit::doFlush(GPUDynInstPtr gpuDynInst) {
410     injectGlobalMemFence(gpuDynInst, true);
411 }
412 
413 // resetting SIMD register pools
414 // I couldn't think of any other place and
415 // I think it is needed in my implementation
416 void
417 ComputeUnit::resetRegisterPool()
418 {
419     for (int i=0; i<numVectorALUs; i++)
420     {
421         registerManager->vrfPoolMgrs[i]->resetRegion(numVecRegsPerSimd);
422         registerManager->srfPoolMgrs[i]->resetRegion(numScalarRegsPerSimd);
423     }
424 }
425 
426 void
427 ComputeUnit::dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg)
428 {
429  // If we aren't ticking, start it up!
430  if (!tickEvent.scheduled()) {
431  DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id);
432         schedule(tickEvent, nextCycle());
433  }
434 
435  // the kernel's invalidate must have finished before any wg dispatch
436  assert(task->isInvDone());
437 
438  // reserve the LDS capacity allocated to the work group
439  // disambiguated by the dispatch ID and workgroup ID, which should be
440  // globally unique
441  LdsChunk *ldsChunk = lds.reserveSpace(task->dispatchId(),
442  task->globalWgId(),
443  task->ldsSize());
444 
445  panic_if(!ldsChunk, "was not able to reserve space for this WG");
446 
447  // calculate the number of 32-bit vector registers required
448  // by each work item
449  int vregDemand = task->numVectorRegs();
450  int sregDemand = task->numScalarRegs();
451  int wave_id = 0;
452 
453  int barrier_id = WFBarrier::InvalidID;
454 
455     /**
456      * If this WG only has one WF it will not consume any barrier
457      * resources because it has no need of them.
458      */
459     if (num_wfs_in_wg > 1) {
460         /**
461          * Find a free barrier slot for this WG. Each WF in the WG will
462          * receive the same barrier ID.
463          */
464  barrier_id = getFreeBarrierId();
465  auto &wf_barrier = barrierSlot(barrier_id);
466  assert(!wf_barrier.maxBarrierCnt());
467  assert(!wf_barrier.numAtBarrier());
468  wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);
469 
470  DPRINTF(GPUSync, "CU[%d] - Dispatching WG with barrier Id%d. "
471  "%d waves using this barrier.\n", cu_id, barrier_id,
472  num_wfs_in_wg);
473  }
474 
475  // Assign WFs according to numWfsToSched vector, which is computed by
476  // hasDispResources()
477  for (int j = 0; j < shader->n_wf; ++j) {
478  for (int i = 0; i < numVectorALUs; ++i) {
479  Wavefront *w = wfList[i][j];
480  // Check if this wavefront slot is available and there are WFs
481  // remaining to be dispatched to current SIMD:
482  // WF slot must be stopped and not waiting
483  // for a release to complete S_RETURNING
484  if (w->getStatus() == Wavefront::S_STOPPED &&
485  numWfsToSched[i] > 0) {
486  // decrement number of WFs awaiting dispatch to current SIMD
487  numWfsToSched[i] -= 1;
488 
489  fillKernelState(w, task);
490 
491  DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] "
492  "vregDemand[%d] sregDemand[%d]\n", i, j, w->wfDynId,
493  vregDemand, sregDemand);
494 
495  registerManager->allocateRegisters(w, vregDemand, sregDemand);
496 
497  startWavefront(w, wave_id, ldsChunk, task, barrier_id);
498  ++wave_id;
499  }
500  }
501  }
502 }
503 
504 void
505 ComputeUnit::insertInPipeMap(Wavefront *w)
506 {
507  panic_if(w->instructionBuffer.empty(),
508  "Instruction Buffer of WF%d can't be empty", w->wgId);
509  GPUDynInstPtr ii = w->instructionBuffer.front();
510  pipeMap.emplace(ii->seqNum());
511 }
512 
513 void
514 ComputeUnit::deleteFromPipeMap(Wavefront *w)
515 {
516  panic_if(w->instructionBuffer.empty(),
517  "Instruction Buffer of WF%d can't be empty", w->wgId);
518  GPUDynInstPtr ii = w->instructionBuffer.front();
519  // delete the dynamic instruction from the pipeline map
520  auto it = pipeMap.find(ii->seqNum());
521  panic_if(it == pipeMap.end(), "Pipeline Map is empty\n");
522  pipeMap.erase(it);
523 }
524 
525 bool
526 ComputeUnit::hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg)
527 {
528  // compute true size of workgroup (after clamping to grid size)
529  int trueWgSize[HSAQueueEntry::MAX_DIM];
530  int trueWgSizeTotal = 1;
531 
532  for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
533  trueWgSize[d] = std::min(task->wgSize(d), task->gridSize(d) -
534  task->wgId(d) * task->wgSize(d));
535 
536  trueWgSizeTotal *= trueWgSize[d];
537  DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
538  }
539 
540  DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
541 
542  // calculate the number of WFs in this WG
543  int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
544  num_wfs_in_wg = numWfs;
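    // Editor's note: the expression above is a ceiling division; e.g., a
    // 300-work-item WG with 64-wide wavefronts needs (300 + 63) / 64 = 5 WFs
    // (illustrative sizes).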
545 
546  bool barrier_avail = true;
547 
548  if (numWfs > 1 && !freeBarrierIds.size()) {
549  barrier_avail = false;
550  }
551 
552  // calculate the number of 32-bit vector registers required by each
553  // work item of the work group
554  int vregDemandPerWI = task->numVectorRegs();
555  // calculate the number of 32-bit scalar registers required by each
556  // work item of the work group
557  int sregDemandPerWI = task->numScalarRegs();
558 
559  // check if the total number of VGPRs and SGPRs required by all WFs
560  // of the WG fit in the VRFs of all SIMD units and the CU's SRF
561  panic_if((numWfs * vregDemandPerWI) > (numVectorALUs * numVecRegsPerSimd),
562  "WG with %d WFs and %d VGPRs per WI can not be allocated to CU "
563  "that has %d VGPRs\n",
564  numWfs, vregDemandPerWI, numVectorALUs * numVecRegsPerSimd);
565  panic_if((numWfs * sregDemandPerWI) > numScalarRegsPerSimd,
566  "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU "
567  "with %d SGPRs\n",
568  numWfs, sregDemandPerWI, numScalarRegsPerSimd);
569 
570  // number of WF slots that are not occupied
571  int freeWfSlots = 0;
572  // number of Wfs from WG that were successfully mapped to a SIMD
573  int numMappedWfs = 0;
574  numWfsToSched.clear();
575  numWfsToSched.resize(numVectorALUs, 0);
576 
577  // attempt to map WFs to the SIMDs, based on WF slot availability
578  // and register file availability
579  for (int j = 0; j < shader->n_wf; ++j) {
580  for (int i = 0; i < numVectorALUs; ++i) {
581  if (wfList[i][j]->getStatus() == Wavefront::S_STOPPED) {
582  ++freeWfSlots;
583  // check if current WF will fit onto current SIMD/VRF
584  // if all WFs have not yet been mapped to the SIMDs
585                 if (numMappedWfs < numWfs &&
586                     registerManager->canAllocateSgprs(i, numWfsToSched[i] + 1,
587                                                       sregDemandPerWI) &&
588                     registerManager->canAllocateVgprs(i, numWfsToSched[i] + 1,
589                                                       vregDemandPerWI)) {
590  numWfsToSched[i]++;
591  numMappedWfs++;
592  }
593  }
594  }
595  }
596 
597  // check that the number of mapped WFs is not greater
598  // than the actual number of WFs
599  assert(numMappedWfs <= numWfs);
600 
601  bool vregAvail = true;
602  bool sregAvail = true;
603  // if a WF to SIMD mapping was not found, find the limiting resource
604  if (numMappedWfs < numWfs) {
605 
606  for (int j = 0; j < numVectorALUs; ++j) {
607  // find if there are enough free VGPRs in the SIMD's VRF
608             // to accommodate the WFs of the new WG that would be mapped
609  // to this SIMD unit
610  vregAvail &= registerManager->
611  canAllocateVgprs(j, numWfsToSched[j], vregDemandPerWI);
612  // find if there are enough free SGPRs in the SIMD's SRF
613             // to accommodate the WFs of the new WG that would be mapped
614  // to this SIMD unit
615  sregAvail &= registerManager->
616  canAllocateSgprs(j, numWfsToSched[j], sregDemandPerWI);
617  }
618  }
619 
620  DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, \
621  VGPR Availability = %d, SGPR Availability = %d\n",
622  freeWfSlots, numMappedWfs, vregAvail, sregAvail);
623 
624  if (!vregAvail) {
625         stats.numTimesWgBlockedDueVgprAlloc++;
626  }
627 
628  if (!sregAvail) {
629         stats.numTimesWgBlockedDueSgprAlloc++;
630  }
631 
632  // Return true if enough WF slots to submit workgroup and if there are
633  // enough VGPRs to schedule all WFs to their SIMD units
634  bool ldsAvail = lds.canReserve(task->ldsSize());
635  if (!ldsAvail) {
636         stats.wgBlockedDueLdsAllocation++;
637  }
638 
639  if (!barrier_avail) {
640         stats.wgBlockedDueBarrierAllocation++;
641  }
642 
643  // Return true if the following are all true:
644  // (a) all WFs of the WG were mapped to free WF slots
645  // (b) there are enough VGPRs to schedule all WFs to their SIMD units
646  // (c) there are enough SGPRs on the CU to schedule all WFs
647  // (d) there is enough space in LDS to allocate for all WFs
648  bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
649  && ldsAvail && barrier_avail;
650  return can_dispatch;
651 }
652 
653 int
654 ComputeUnit::numYetToReachBarrier(int bar_id)
655 {
656  auto &wf_barrier = barrierSlot(bar_id);
657  return wf_barrier.numYetToReachBarrier();
658 }
659 
660 bool
661 ComputeUnit::allAtBarrier(int bar_id)
662 {
663  auto &wf_barrier = barrierSlot(bar_id);
664  return wf_barrier.allAtBarrier();
665 }
666 
667 void
668 ComputeUnit::incNumAtBarrier(int bar_id)
669 {
670  auto &wf_barrier = barrierSlot(bar_id);
671  wf_barrier.incNumAtBarrier();
672 }
673 
674 int
675 ComputeUnit::numAtBarrier(int bar_id)
676 {
677  auto &wf_barrier = barrierSlot(bar_id);
678  return wf_barrier.numAtBarrier();
679 }
680 
681 int
682 ComputeUnit::maxBarrierCnt(int bar_id)
683 {
684  auto &wf_barrier = barrierSlot(bar_id);
685  return wf_barrier.maxBarrierCnt();
686 }
687 
688 void
689 ComputeUnit::resetBarrier(int bar_id)
690 {
691  auto &wf_barrier = barrierSlot(bar_id);
692  wf_barrier.reset();
693 }
694 
695 void
696 ComputeUnit::decMaxBarrierCnt(int bar_id)
697 {
698  auto &wf_barrier = barrierSlot(bar_id);
699  wf_barrier.decMaxBarrierCnt();
700 }
701 
702 void
703 ComputeUnit::releaseBarrier(int bar_id)
704 {
705  auto &wf_barrier = barrierSlot(bar_id);
706  wf_barrier.release();
707  freeBarrierIds.insert(bar_id);
708 }
709 
710 void
711 ComputeUnit::releaseWFsFromBarrier(int bar_id)
712 {
713  for (int i = 0; i < numVectorALUs; ++i) {
714  for (int j = 0; j < shader->n_wf; ++j) {
715  Wavefront *wf = wfList[i][j];
716  if (wf->barrierId() == bar_id) {
717  assert(wf->getStatus() == Wavefront::S_BARRIER);
718                 wf->setStatus(Wavefront::S_RUNNING);
719  }
720  }
721  }
722 }
723 
724 // Execute one clock worth of work on the ComputeUnit.
725 void
726 ComputeUnit::exec()
727 {
728  // process reads and writes in the RFs
729  for (auto &vecRegFile : vrf) {
730  vecRegFile->exec();
731  }
732 
733  for (auto &scRegFile : srf) {
734  scRegFile->exec();
735  }
736 
737  // Execute pipeline stages in reverse order to simulate
738  // the pipeline latency
739     scalarMemoryPipe.exec();
740     globalMemoryPipe.exec();
741     localMemoryPipe.exec();
742     execStage.exec();
743     scheduleStage.exec();
744     scoreboardCheckStage.exec();
745     fetchStage.exec();
746 
747  stats.totalCycles++;
748 
749  // Put this CU to sleep if there is no more work to be done.
750     if (!isDone()) {
751         schedule(tickEvent, nextCycle());
752     } else {
753         shader->notifyCuSleep();
754         DPRINTF(GPUDisp, "CU%d: Going to sleep\n", cu_id);
755  }
756 }
757 
758 void
759 ComputeUnit::init()
760 {
761  // Initialize CU Bus models and execution resources
762 
763  // Vector ALUs
764  vectorALUs.clear();
765  for (int i = 0; i < numVectorALUs; i++) {
766  vectorALUs.emplace_back(this, clockPeriod());
767  }
768 
769  // Scalar ALUs
770  scalarALUs.clear();
771  for (int i = 0; i < numScalarALUs; i++) {
772  scalarALUs.emplace_back(this, clockPeriod());
773  }
774 
775     // Vector Global Memory
776     fatal_if(numVectorGlobalMemUnits > 1,
777              "No support for multiple Global Memory Pipelines exists!!!");
778     vectorGlobalMemUnit.init(this, clockPeriod());
779     vrfToGlobalMemPipeBus.init(this, clockPeriod());
780     glbMemToVrfBus.init(this, clockPeriod());
781 
782     // Vector Local/Shared Memory
783     fatal_if(numVectorSharedMemUnits > 1,
784              "No support for multiple Local Memory Pipelines exists!!!");
785     vectorSharedMemUnit.init(this, clockPeriod());
786     vrfToLocalMemPipeBus.init(this, clockPeriod());
787     locMemToVrfBus.init(this, clockPeriod());
788 
789     // Scalar Memory
790     fatal_if(numScalarMemUnits > 1,
791              "No support for multiple Scalar Memory Pipelines exists!!!");
792     scalarMemUnit.init(this, clockPeriod());
793     srfToScalarMemPipeBus.init(this, clockPeriod());
794     scalarMemToSrfBus.init(this, clockPeriod());
795 
796     vectorRegsReserved.resize(numVectorALUs, 0);
797     scalarRegsReserved.resize(numVectorALUs, 0);
798 
799     fetchStage.init();
800     scheduleStage.init();
801     execStage.init();
802     globalMemoryPipe.init();
803 
804     gmTokenPort.setTokenManager(memPortTokens);
805 }
806 
807 bool
808 ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
809 {
810  return handleResponse(pkt);
811 }
812 
813 bool
814 ComputeUnit::DataPort::handleResponse(PacketPtr pkt)
815 {
816  // Ruby has completed the memory op. Schedule the mem_resp_event at the
817  // appropriate cycle to process the timing memory response
818  // This delay represents the pipeline delay
819  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
820  PortID index = sender_state->port_index;
821  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
822  GPUDispatcher &dispatcher = computeUnit->shader->dispatcher();
823 
824  // MemSyncResp + WriteAckResp are handled completely here and we don't
825  // schedule a MemRespEvent to process the responses further
826  if (pkt->cmd == MemCmd::MemSyncResp) {
827  // This response is for 1 of the following request types:
828  // - kernel launch
829  // - kernel end
830  // - non-kernel mem sync
831 
832  // Kernel Launch
833  // wavefront was nullptr when launching kernel, so it is meaningless
834  // here (simdId=-1, wfSlotId=-1)
835  if (gpuDynInst->isKernelLaunch()) {
836  // for kernel launch, the original request must be both kernel-type
837  // and INV_L1
838  assert(pkt->req->isKernel());
839  assert(pkt->req->isInvL1());
840 
841  // one D-Cache inv is done, decrement counter
842  dispatcher.updateInvCounter(gpuDynInst->kern_id);
843 
844  delete pkt->senderState;
845  delete pkt;
846  return true;
847  }
848 
849  // retrieve wavefront from inst
850  Wavefront *w = gpuDynInst->wavefront();
851 
852  // Check if we are waiting on Kernel End Flush
853  if (w->getStatus() == Wavefront::S_RETURNING
854  && gpuDynInst->isEndOfKernel()) {
855  // for kernel end, the original request must be both kernel-type
856  // and last-level GPU cache should be flushed if it contains
857  // dirty data. This request may have been quiesced and
858  // immediately responded to if the GL2 is a write-through /
859  // read-only cache.
860  assert(pkt->req->isKernel());
861  assert(pkt->req->isGL2CacheFlush());
862 
863  // once flush done, decrement counter, and return whether all
864  // dirty writeback operations are done for the kernel
865  bool isWbDone = dispatcher.updateWbCounter(gpuDynInst->kern_id);
866 
867  // not all wbs are done for the kernel, just release pkt
868  // resources
869  if (!isWbDone) {
870  delete pkt->senderState;
871  delete pkt;
872  return true;
873  }
874 
875  // all wbs are completed for the kernel, do retirement work
876  // for the workgroup
877  DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
878  computeUnit->cu_id, w->simdId, w->wfSlotId,
879  w->wfDynId, w->wgId);
880 
881  dispatcher.notifyWgCompl(w);
882  w->setStatus(Wavefront::S_STOPPED);
883  }
884 
885  if (!pkt->req->isKernel()) {
886  w = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
887  DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing "
888  "outstanding reqs %d => %d\n", gpuDynInst->simdId,
889  gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
890  gpuDynInst->disassemble(), w->outstandingReqs,
891  w->outstandingReqs - 1);
892  computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
893  }
894 
895  delete pkt->senderState;
896  delete pkt;
897  return true;
898  }
899 
900  EventFunctionWrapper *mem_resp_event =
901  computeUnit->memPort[index].createMemRespEvent(pkt);
902 
903  DPRINTF(GPUPort,
904  "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
905  computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
906  gpuDynInst->seqNum(), index, pkt->req->getPaddr());
907 
908  computeUnit->schedule(mem_resp_event,
909  curTick() + computeUnit->resp_tick_latency);
910 
911  return true;
912 }
913 
914 bool
915 ComputeUnit::ScalarDataPort::recvTimingResp(PacketPtr pkt)
916 {
917  return handleResponse(pkt);
918 }
919 
920 bool
921 ComputeUnit::ScalarDataPort::handleResponse(PacketPtr pkt)
922 {
923  assert(!pkt->req->isKernel());
924 
925  // retrieve sender state
926  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
927  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
928 
929  assert(pkt->isRead() || pkt->isWrite());
930  assert(gpuDynInst->numScalarReqs > 0);
931 
932  gpuDynInst->numScalarReqs--;
933 
942  if (!gpuDynInst->numScalarReqs) {
943  if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
944  computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
945  gpuDynInst);
946  } else {
947  computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
948  gpuDynInst);
949  }
950  }
951 
952  delete pkt->senderState;
953  delete pkt;
954 
955  return true;
956 }
957 
958 void
959 ComputeUnit::ScalarDataPort::recvReqRetry()
960 {
961  for (const auto &pkt : retries) {
962  if (!sendTimingReq(pkt)) {
963  break;
964  } else {
965  retries.pop_front();
966  }
967  }
968 }
969 
970 void
971 ComputeUnit::DataPort::recvReqRetry()
972 {
973  int len = retries.size();
974 
975  assert(len > 0);
976 
977  for (int i = 0; i < len; ++i) {
978  PacketPtr pkt = retries.front().first;
979  [[maybe_unused]] GPUDynInstPtr gpuDynInst = retries.front().second;
980  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
981  computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
982  pkt->req->getPaddr());
983 
984         /** Currently Ruby can return false due to conflicts for the
985          *  particular cache block or address. Thus other requests should be
986          *  allowed to pass and the data port should expect multiple retries. */
987  if (!sendTimingReq(pkt)) {
988  DPRINTF(GPUMem, "failed again!\n");
989  break;
990  } else {
991  DPRINTF(GPUMem, "successful!\n");
992  retries.pop_front();
993  }
994  }
995 }
996 
997 bool
998 ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
999 {
1000  computeUnit->handleSQCReturn(pkt);
1001 
1002  return true;
1003 }
1004 
1005 void
1006 ComputeUnit::handleSQCReturn(PacketPtr pkt)
1007 {
1008     fetchStage.processFetchReturn(pkt);
1009 }
1010 
1011 void
1012 ComputeUnit::SQCPort::recvReqRetry()
1013 {
1014  int len = retries.size();
1015 
1016  assert(len > 0);
1017 
1018  for (int i = 0; i < len; ++i) {
1019  PacketPtr pkt = retries.front().first;
1020  [[maybe_unused]] Wavefront *wavefront = retries.front().second;
1021  DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
1022  computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
1023  pkt->req->getPaddr());
1024  if (!sendTimingReq(pkt)) {
1025  DPRINTF(GPUFetch, "failed again!\n");
1026  break;
1027  } else {
1028  DPRINTF(GPUFetch, "successful!\n");
1029  retries.pop_front();
1030  }
1031  }
1032 }
1033 
1034 void
1035 ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
1036 {
1037  // There must be a way around this check to do the globalMemStart...
1038  Addr tmp_vaddr = pkt->req->getVaddr();
1039 
1040  updatePageDivergenceDist(tmp_vaddr);
1041 
1042  // set PC in request
1043  pkt->req->setPC(gpuDynInst->wavefront()->pc());
1044 
1045  pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());
1046 
1047  // figure out the type of the request to set read/write
1048  BaseMMU::Mode TLB_mode;
1049  assert(pkt->isRead() || pkt->isWrite());
1050 
1051  // only do some things if actually accessing data
1052  bool isDataAccess = pkt->isWrite() || pkt->isRead();
1053 
1054  // For dGPUs, real hardware will extract MTYPE from the PTE. SE mode
1055  // uses x86 pagetables which don't have fields to track GPU MTYPEs.
1056  // Rather than hacking up the pagetable to add these bits in, we just
1057  // keep a structure local to our GPUs that are populated in our
1058  // emulated driver whenever memory is allocated. Consult that structure
1059  // here in case we need a memtype override.
1060  //
1061  // In full system mode these can be extracted from the PTE and assigned
1062  // after address translation takes place.
1063  if (!FullSystem) {
1064  shader->gpuCmdProc.driver()->setMtype(pkt->req);
1065  }
1066 
1067  // Check write before read for atomic operations
1068  // since atomic operations should use BaseMMU::Write
1069  if (pkt->isWrite()) {
1070  TLB_mode = BaseMMU::Write;
1071  } else if (pkt->isRead()) {
1072  TLB_mode = BaseMMU::Read;
1073  } else {
1074  fatal("pkt is not a read nor a write\n");
1075  }
1076 
1077  stats.tlbCycles -= curTick();
1078  ++stats.tlbRequests;
1079 
1080  PortID tlbPort_index = perLaneTLB ? index : 0;
1081 
1082  if (shader->timingSim) {
1083  if (!FullSystem && debugSegFault) {
1084             Process *p = shader->gpuTc->getProcessPtr();
1085  Addr vaddr = pkt->req->getVaddr();
1086  unsigned size = pkt->getSize();
1087 
1088  if ((vaddr + size - 1) % 64 < vaddr % 64) {
1089  panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
1090  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
1091  }
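            // Editor's note: the modulo test above flags accesses that
            // straddle a 64 B boundary; e.g., vaddr = 0x7C, size = 8:
            // (0x7C + 7) % 64 = 3, which is < 0x7C % 64 = 60, so it panics.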
1092 
1093  Addr paddr;
1094 
1095  if (!p->pTable->translate(vaddr, paddr)) {
1096  if (!p->fixupFault(vaddr)) {
1097  panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
1098  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1099  vaddr);
1100  }
1101  }
1102  }
1103 
1104  // This is the SenderState needed upon return
1105  pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);
1106 
1107  // This is the senderState needed by the TLB hierarchy to function
1108  GpuTranslationState *translation_state =
1109  new GpuTranslationState(TLB_mode, shader->gpuTc, false,
1110  pkt->senderState);
1111 
1112  pkt->senderState = translation_state;
1113 
1114  if (functionalTLB) {
1115  tlbPort[tlbPort_index].sendFunctional(pkt);
1116 
1117  // update the hitLevel distribution
1118  int hit_level = translation_state->hitLevel;
1119  assert(hit_level != -1);
1120  stats.hitsPerTLBLevel[hit_level]++;
1121 
1122  // New SenderState for the memory access
1123  GpuTranslationState *sender_state =
1124  safe_cast<GpuTranslationState*>(pkt->senderState);
1125 
1126  delete sender_state->tlbEntry;
1127  delete sender_state->saved;
1128  delete sender_state;
1129 
1130  assert(pkt->req->hasPaddr());
1131  assert(pkt->req->hasSize());
1132 
1133  // this is necessary because the GPU TLB receives packets instead
1134             // of requests. When the translation is complete, all relevant
1135  // fields in the request will be populated, but not in the packet.
1136  // here we create the new packet so we can set the size, addr,
1137  // and proper flags.
1138  PacketPtr oldPkt = pkt;
1139  pkt = new Packet(oldPkt->req, oldPkt->cmd);
1140  if (isDataAccess) {
1141  uint8_t *tmpData = oldPkt->getPtr<uint8_t>();
1142  pkt->dataStatic(tmpData);
1143  }
1144  delete oldPkt;
1145 
1146 
1147  // New SenderState for the memory access
1148  pkt->senderState =
1149  new ComputeUnit::DataPort::SenderState(gpuDynInst, index,
1150  nullptr);
1151 
1152  gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
1153  gpuDynInst->tlbHitLevel[index] = hit_level;
1154 
1155  // translation is done. Schedule the mem_req_event at the
1156  // appropriate cycle to send the timing memory request to ruby
1157  EventFunctionWrapper *mem_req_event =
1158  memPort[index].createMemReqEvent(pkt);
1159 
1160  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
1161  "scheduled\n", cu_id, gpuDynInst->simdId,
1162  gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
1163 
1164  schedule(mem_req_event, curTick() + req_tick_latency);
1165  } else if (tlbPort[tlbPort_index].isStalled()) {
1166  assert(tlbPort[tlbPort_index].retries.size() > 0);
1167 
1168  DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
1169  "failed!\n", cu_id, gpuDynInst->simdId,
1170  gpuDynInst->wfSlotId, tmp_vaddr);
1171 
1172  tlbPort[tlbPort_index].retries.push_back(pkt);
1173  } else if (!tlbPort[tlbPort_index].sendTimingReq(pkt)) {
1174  // Stall the data port;
1175  // No more packet will be issued till
1176  // ruby indicates resources are freed by
1177  // a recvReqRetry() call back on this port.
1178  tlbPort[tlbPort_index].stallPort();
1179 
1180  DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
1181  "failed!\n", cu_id, gpuDynInst->simdId,
1182  gpuDynInst->wfSlotId, tmp_vaddr);
1183 
1184  tlbPort[tlbPort_index].retries.push_back(pkt);
1185  } else {
1186  DPRINTF(GPUTLB,
1187  "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
1188  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
1189  }
1190  } else {
1191  if (pkt->cmd == MemCmd::MemSyncReq) {
1192  gpuDynInst->resetEntireStatusVector();
1193  } else {
1194  gpuDynInst->decrementStatusVector(index);
1195  }
1196 
1197  // New SenderState for the memory access
1198  delete pkt->senderState;
1199 
1200  // Because it's atomic operation, only need TLB translation state
1201  pkt->senderState = new GpuTranslationState(TLB_mode,
1202  shader->gpuTc);
1203 
1204  tlbPort[tlbPort_index].sendFunctional(pkt);
1205 
1206  // the addr of the packet is not modified, so we need to create a new
1207  // packet, or otherwise the memory access will have the old virtual
1208  // address sent in the translation packet, instead of the physical
1209  // address returned by the translation.
1210  PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
1211  new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1212 
1213  // Translation is done. It is safe to send the packet to memory.
1214  memPort[0].sendFunctional(new_pkt);
1215 
1216  DPRINTF(GPUMem, "Functional sendRequest\n");
1217  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
1218  gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
1219  new_pkt->req->getPaddr());
1220 
1221  // safe_cast the senderState
1222  GpuTranslationState *sender_state =
1223  safe_cast<GpuTranslationState*>(pkt->senderState);
1224 
1225  delete sender_state->tlbEntry;
1226  delete new_pkt;
1227  delete pkt->senderState;
1228  delete pkt;
1229  }
1230 }
1231 
1232 void
1233 ComputeUnit::sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt)
1234 {
1235  assert(pkt->isWrite() || pkt->isRead());
1236 
1237  BaseMMU::Mode tlb_mode = pkt->isRead() ? BaseMMU::Read : BaseMMU::Write;
1238 
1239  pkt->senderState =
1240         new ComputeUnit::ScalarDTLBPort::SenderState(gpuDynInst);
1241 
1242  pkt->senderState =
1243  new GpuTranslationState(tlb_mode, shader->gpuTc, false,
1244  pkt->senderState);
1245 
1246  if (scalarDTLBPort.isStalled()) {
1247  assert(scalarDTLBPort.retries.size());
1248  scalarDTLBPort.retries.push_back(pkt);
1249  } else if (!scalarDTLBPort.sendTimingReq(pkt)) {
1250         scalarDTLBPort.stallPort();
1251  scalarDTLBPort.retries.push_back(pkt);
1252  } else {
1253  DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n",
1254  tlb_mode == BaseMMU::Read ? "read" : "write",
1255  pkt->req->getVaddr());
1256  }
1257 }
1258 
1259 void
1260 ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
1261  bool kernelMemSync,
1262  RequestPtr req)
1263 {
1264  assert(gpuDynInst->isGlobalSeg() ||
1265  gpuDynInst->executedAs() == enums::SC_GLOBAL);
1266 
1267  // Fences will never be issued to system memory, so we can mark the
1268  // requestor as a device memory ID here.
1269  if (!req) {
1270  req = std::make_shared<Request>(
1271  0, 0, 0, vramRequestorId(), 0, gpuDynInst->wfDynId);
1272  } else {
1273  req->requestorId(vramRequestorId());
1274  }
1275 
1276  // all mem sync requests have Paddr == 0
1277  req->setPaddr(0);
1278 
1279  PacketPtr pkt = nullptr;
1280 
1281  if (kernelMemSync) {
1282  if (gpuDynInst->isKernelLaunch()) {
1283  req->setCacheCoherenceFlags(Request::INV_L1);
1284  req->setReqInstSeqNum(gpuDynInst->seqNum());
1285  req->setFlags(Request::KERNEL);
1286  pkt = new Packet(req, MemCmd::MemSyncReq);
1287  pkt->pushSenderState(
1288  new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1289 
1290  EventFunctionWrapper *mem_req_event =
1291  memPort[0].createMemReqEvent(pkt);
1292 
1293  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1294  "an acquire\n", cu_id, gpuDynInst->simdId,
1295  gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
1296 
1297  schedule(mem_req_event, curTick() + req_tick_latency);
1298  } else {
1299  // kernel end flush of GL2 cache may be quiesced by Ruby if the
1300  // GL2 is a read-only cache
1301  assert(shader->impl_kern_end_rel);
1302  assert(gpuDynInst->isEndOfKernel());
1303 
1304  req->setCacheCoherenceFlags(Request::FLUSH_L2);
1305  req->setReqInstSeqNum(gpuDynInst->seqNum());
1306  req->setFlags(Request::KERNEL);
1307  pkt = new Packet(req, MemCmd::MemSyncReq);
1308  pkt->pushSenderState(
1309  new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1310 
1311  EventFunctionWrapper *mem_req_event =
1312  memPort[0].createMemReqEvent(pkt);
1313 
1314  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1315  "a release\n", cu_id, gpuDynInst->simdId,
1316  gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
1317 
1318  schedule(mem_req_event, curTick() + req_tick_latency);
1319  }
1320  } else {
1321  gpuDynInst->setRequestFlags(req);
1322 
1323  req->setReqInstSeqNum(gpuDynInst->seqNum());
1324 
1325  pkt = new Packet(req, MemCmd::MemSyncReq);
1326  pkt->pushSenderState(
1327  new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1328 
1329  EventFunctionWrapper *mem_req_event =
1330  memPort[0].createMemReqEvent(pkt);
1331 
1332  DPRINTF(GPUPort,
1333  "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
1334  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
1335  pkt->req->getPaddr());
1336 
1337  schedule(mem_req_event, curTick() + req_tick_latency);
1338  }
1339 }
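// Editor's note: summarizing the three paths above: a kernel launch sends a
// MemSyncReq with INV_L1 (invalidate the L1), a kernel end sends one with
// FLUSH_L2 (write back dirty GL2 data), and any other memory sync uses the
// flags computed by gpuDynInst->setRequestFlags(); all use Paddr == 0.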
1340 
1341 void
1342 ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
1343 {
1344  DataPort::SenderState *sender_state =
1345  safe_cast<DataPort::SenderState*>(pkt->senderState);
1346 
1347  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1348  ComputeUnit *compute_unit = computeUnit;
1349 
1350  assert(gpuDynInst);
1351 
1352  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
1353  compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1354  pkt->req->getPaddr(), id);
1355 
1356  Addr paddr = pkt->req->getPaddr();
1357 
1358  // mem sync resp callback must be handled already in
1359  // DataPort::recvTimingResp
1360  assert(pkt->cmd != MemCmd::MemSyncResp);
1361 
1362  // The status vector and global memory response for WriteResp packets get
1363  // handled by the WriteCompleteResp packets.
1364  if (pkt->cmd == MemCmd::WriteResp) {
1365  if (!FullSystem || !pkt->req->systemReq()) {
1366  delete pkt;
1367  return;
1368  }
1369  }
1370 
1371  // this is for read, write and atomic
1372  int index = gpuDynInst->memStatusVector[paddr].back();
1373 
1374  DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
1375  pkt->req->getPaddr(), id);
1376 
1377  gpuDynInst->memStatusVector[paddr].pop_back();
1378  gpuDynInst->pAddr = pkt->req->getPaddr();
1379 
1380  gpuDynInst->decrementStatusVector(index);
1381  DPRINTF(GPUMem, "bitvector is now %s\n", gpuDynInst->printStatusVector());
1382 
1383  if (gpuDynInst->allLanesZero()) {
1384  auto iter = gpuDynInst->memStatusVector.begin();
1385  auto end = gpuDynInst->memStatusVector.end();
1386 
1387  while (iter != end) {
1388  assert(iter->second.empty());
1389  ++iter;
1390  }
1391 
1392  // Calculate the difference between the arrival of the first cache
1393  // block and the last cache block to arrive if we have the time
1394  // for the first cache block.
1395  if (compute_unit->headTailMap.count(gpuDynInst)) {
1396  Tick headTick = compute_unit->headTailMap.at(gpuDynInst);
1397  compute_unit->stats.headTailLatency.sample(curTick() - headTick);
1398  compute_unit->headTailMap.erase(gpuDynInst);
1399  }
1400 
1401  gpuDynInst->memStatusVector.clear();
1402 
1403  gpuDynInst->
1404  profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue);
1405  compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
1406 
1407  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
1408  compute_unit->cu_id, gpuDynInst->simdId,
1409  gpuDynInst->wfSlotId);
1410  } else {
1411  if (pkt->isRead()) {
1412  if (!compute_unit->headTailMap.count(gpuDynInst)) {
1413  compute_unit->headTailMap
1414  .insert(std::make_pair(gpuDynInst, curTick()));
1415  }
1416  }
1417  }
1418 
1419  delete pkt->senderState;
1420  delete pkt;
1421 }
1422 
1423 bool
1424 ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
1425 {
1426  Addr line = pkt->req->getPaddr();
1427 
1428  DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
1429  pkt->req->getVaddr(), line);
1430 
1431  assert(pkt->senderState);
1432  computeUnit->stats.tlbCycles += curTick();
1433 
1434  // pop off the TLB translation state
1435  GpuTranslationState *translation_state =
1436  safe_cast<GpuTranslationState*>(pkt->senderState);
1437 
1438  // no PageFaults are permitted for data accesses
1439  if (!translation_state->tlbEntry) {
1440  DTLBPort::SenderState *sender_state =
1441  safe_cast<DTLBPort::SenderState*>(translation_state->saved);
1442 
1443  [[maybe_unused]] Wavefront *w =
1444  computeUnit->wfList[sender_state->_gpuDynInst->simdId]
1445  [sender_state->_gpuDynInst->wfSlotId];
1446 
1447         DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
1448  pkt->req->getVaddr());
1449  }
1450 
1451  // update the hitLevel distribution
1452  int hit_level = translation_state->hitLevel;
1453  computeUnit->stats.hitsPerTLBLevel[hit_level]++;
1454 
1455  delete translation_state->tlbEntry;
1456  assert(!translation_state->ports.size());
1457  pkt->senderState = translation_state->saved;
1458 
1459  // for prefetch pkt
1460  BaseMMU::Mode TLB_mode = translation_state->tlbMode;
1461 
1462  delete translation_state;
1463 
1464  // use the original sender state to know how to close this transaction
1465  DTLBPort::SenderState *sender_state =
1466  safe_cast<DTLBPort::SenderState*>(pkt->senderState);
1467 
1468  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1469  PortID mp_index = sender_state->portIndex;
1470  Addr vaddr = pkt->req->getVaddr();
1471  gpuDynInst->memStatusVector[line].push_back(mp_index);
1472  gpuDynInst->tlbHitLevel[mp_index] = hit_level;
1473 
1474  MemCmd requestCmd;
1475 
1476  if (pkt->cmd == MemCmd::ReadResp) {
1477  requestCmd = MemCmd::ReadReq;
1478  } else if (pkt->cmd == MemCmd::WriteResp) {
1479  requestCmd = MemCmd::WriteReq;
1480  } else if (pkt->cmd == MemCmd::SwapResp) {
1481  requestCmd = MemCmd::SwapReq;
1482  } else {
1483  panic("unsupported response to request conversion %s\n",
1484  pkt->cmd.toString());
1485  }
1486 
1487  if (computeUnit->prefetchDepth) {
1488  int simdId = gpuDynInst->simdId;
1489  int wfSlotId = gpuDynInst->wfSlotId;
1490  Addr last = 0;
1491 
1492  switch(computeUnit->prefetchType) {
1493  case enums::PF_CU:
1494  last = computeUnit->lastVaddrCU[mp_index];
1495  break;
1496  case enums::PF_PHASE:
1497  last = computeUnit->lastVaddrSimd[simdId][mp_index];
1498  break;
1499  case enums::PF_WF:
1500  last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
1501  default:
1502  break;
1503  }
1504 
1505  DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
1506  computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
1507 
1508  int stride = last ? (roundDown(vaddr, X86ISA::PageBytes) -
1509                      roundDown(last, X86ISA::PageBytes)) >> X86ISA::PageShift
1510  : 0;
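        // Editor's note: the stride is measured in whole pages; e.g., with
        // 4 KiB pages, last on page 0x1000 and vaddr on page 0x3000 gives
        // (0x3000 - 0x1000) >> 12 = 2 (illustrative addresses).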
1511 
1512  DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
1513 
1514  computeUnit->lastVaddrCU[mp_index] = vaddr;
1515  computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
1516  computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
1517 
1518  stride = (computeUnit->prefetchType == enums::PF_STRIDE) ?
1519  computeUnit->prefetchStride: stride;
1520 
1521  DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
1522  computeUnit->cu_id, simdId, wfSlotId, mp_index);
1523 
1524  DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);
1525 
1526  // Prefetch Next few pages atomically
1527  for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
1528  DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
1529                     vaddr + stride * pf * X86ISA::PageBytes);
1530 
1531  if (!stride)
1532  break;
1533 
1534  RequestPtr prefetch_req = std::make_shared<Request>(
1535                 vaddr + stride * pf * X86ISA::PageBytes,
1536  sizeof(uint8_t), 0,
1537  computeUnit->requestorId(),
1538  0, 0, nullptr);
1539 
1540  PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
1541  uint8_t foo = 0;
1542  prefetch_pkt->dataStatic(&foo);
1543 
1544  // Because it's atomic operation, only need TLB translation state
1545  prefetch_pkt->senderState =
1546  new GpuTranslationState(TLB_mode,
1547  computeUnit->shader->gpuTc, true);
1548 
1549  // Currently prefetches are zero-latency, hence the sendFunctional
1550  sendFunctional(prefetch_pkt);
1551 
1552  /* safe_cast the senderState */
1553  GpuTranslationState *tlb_state =
1554  safe_cast<GpuTranslationState*>(
1555  prefetch_pkt->senderState);
1556 
1557 
1558  delete tlb_state->tlbEntry;
1559  delete tlb_state;
1560  delete prefetch_pkt;
1561  }
1562  }
1563 
1564  // First we must convert the response cmd back to a request cmd so that
1565  // the request can be sent through the cu's request port
1566  PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
1567  new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1568  delete pkt->senderState;
1569  delete pkt;
1570 
1571  // New SenderState for the memory access
1572  new_pkt->senderState =
1573  new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
1574  nullptr);
1575 
1576  // Set VRAM ID for device requests
1577  // For now, system vmem requests use functional reads. This is not that
1578  // critical to model as the region of interest should always be accessing
1579  // device memory. System vmem requests are used by blit kernels to do
1580  // memcpys and load code objects into device memory.
1581  if (new_pkt->req->systemReq()) {
1582  // There will be multiple packets returned for the same gpuDynInst,
1583  // so first check if systemReq is not already set and if so, return
1584  // the token acquired when the dispatch list is filled as system
1585  // requests do not require a GPU coalescer token.
1586  if (!gpuDynInst->isSystemReq()) {
1587  computeUnit->getTokenManager()->recvTokens(1);
1588  gpuDynInst->setSystemReq();
1589  }
1590  } else {
1591  new_pkt->req->requestorId(computeUnit->vramRequestorId());
1592  }
1593 
1594  // translation is done. Schedule the mem_req_event at the appropriate
1595  // cycle to send the timing memory request to ruby
1596  EventFunctionWrapper *mem_req_event =
1597  computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);
1598 
1599  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
1600  computeUnit->cu_id, gpuDynInst->simdId,
1601  gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());
1602 
1603  computeUnit->schedule(mem_req_event, curTick() +
1604  computeUnit->req_tick_latency);
1605 
1606  return true;
1607 }
1608 
1609 EventFunctionWrapper*
1610 ComputeUnit::DataPort::createMemReqEvent(PacketPtr pkt)
1611 {
1612  return new EventFunctionWrapper(
1613  [this, pkt]{ processMemReqEvent(pkt); },
1614  "ComputeUnit memory request event", true);
1615 }
1616 
1617 EventFunctionWrapper*
1618 ComputeUnit::DataPort::createMemRespEvent(PacketPtr pkt)
1619 {
1620  return new EventFunctionWrapper(
1621  [this, pkt]{ processMemRespEvent(pkt); },
1622  "ComputeUnit memory response event", true);
1623 }
1624 
1625 void
1626 ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt)
1627 {
1628  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1629  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1630  [[maybe_unused]] ComputeUnit *compute_unit = computeUnit;
1631 
1632  if (pkt->req->systemReq()) {
1633  assert(compute_unit->shader->systemHub);
1634  SystemHubEvent *resp_event = new SystemHubEvent(pkt, this);
1635  compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
1636  } else if (!(sendTimingReq(pkt))) {
1637  retries.push_back(std::make_pair(pkt, gpuDynInst));
1638 
1639  DPRINTF(GPUPort,
1640  "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
1641  compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1642  id, pkt->req->getPaddr());
1643  } else {
1644  DPRINTF(GPUPort,
1645  "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data "
1646  "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1647  gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id,
1648  pkt->req->getPaddr());
1649  }
1650 }
1651 
1652 const char*
1653 ComputeUnit::ScalarDataPort::MemReqEvent::description() const
1654 {
1655  return "ComputeUnit scalar memory request event";
1656 }
1657 
1658 void
1659 ComputeUnit::ScalarDataPort::MemReqEvent::process()
1660 {
1661  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1662  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1663  [[maybe_unused]] ComputeUnit *compute_unit = scalarDataPort.computeUnit;
1664 
1665  if (pkt->req->systemReq()) {
1666  assert(compute_unit->shader->systemHub);
1667  SystemHubEvent *resp_event = new SystemHubEvent(pkt, &scalarDataPort);
1668  compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
1669  } else if (!(scalarDataPort.sendTimingReq(pkt))) {
1670  scalarDataPort.retries.push_back(pkt);
1671 
1672  DPRINTF(GPUPort,
1673  "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
1674  compute_unit->cu_id, gpuDynInst->simdId,
1675  gpuDynInst->wfSlotId, pkt->req->getPaddr());
1676  } else {
1677  DPRINTF(GPUPort,
1678  "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data "
1679  "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1680  gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
1681  pkt->req->getPaddr());
1682  }
1683 }
1684 
1685 /*
1686  * The initial translation request could have been rejected,
1687  * if <retries> queue is not empty. Retry sending the translation
1688  * request. sendRetry() is called from the peer port whenever
1689  * a translation completes.
1690  */
1691 void
1692 ComputeUnit::DTLBPort::recvReqRetry()
1693 {
1694  int len = retries.size();
1695 
1696  DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
1697  computeUnit->cu_id, len);
1698 
1699  assert(len > 0);
1700  assert(isStalled());
1701  // recvReqRetry is an indication that the resource on which this
1702  // port was stalling on is freed. So, remove the stall first
1703  unstallPort();
1704 
1705  for (int i = 0; i < len; ++i) {
1706  PacketPtr pkt = retries.front();
1707  [[maybe_unused]] Addr vaddr = pkt->req->getVaddr();
1708         DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x",
1708                 computeUnit->cu_id, vaddr);
1709 
1710  if (!sendTimingReq(pkt)) {
1711  // Stall port
1712  stallPort();
1713  DPRINTF(GPUTLB, ": failed again\n");
1714  break;
1715  } else {
1716  DPRINTF(GPUTLB, ": successful\n");
1717  retries.pop_front();
1718  }
1719  }
1720 }
1721 
1722 bool
1723 ComputeUnit::ScalarDTLBPort::recvTimingResp(PacketPtr pkt)
1724 {
1725  assert(pkt->senderState);
1726 
1727  GpuTranslationState *translation_state =
1728  safe_cast<GpuTranslationState*>(pkt->senderState);
1729 
1730  // Page faults are not allowed
1731  fatal_if(!translation_state->tlbEntry,
1732  "Translation of vaddr %#x failed\n", pkt->req->getVaddr());
1733 
1734  delete translation_state->tlbEntry;
1735  assert(!translation_state->ports.size());
1736 
1737  pkt->senderState = translation_state->saved;
1738  delete translation_state;
1739 
1740  ScalarDTLBPort::SenderState *sender_state =
1741  safe_cast<ScalarDTLBPort::SenderState*>(pkt->senderState);
1742 
1743  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1744  delete pkt->senderState;
1745 
1746  [[maybe_unused]] Wavefront *w = gpuDynInst->wavefront();
1747 
1748  DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
1749  "translation: PA %#x -> %#x\n", computeUnit->cu_id, w->simdId,
1750  w->wfSlotId, w->kernId, pkt->req->getVaddr(), pkt->req->getPaddr());
1751 
1752  MemCmd mem_cmd;
1753 
1754  if (pkt->cmd == MemCmd::ReadResp) {
1755  mem_cmd = MemCmd::ReadReq;
1756  } else if (pkt->cmd == MemCmd::WriteResp) {
1757  mem_cmd = MemCmd::WriteReq;
1758  } else {
1759         fatal("Scalar DTLB received unexpected MemCmd response %s\n",
1760  pkt->cmd.toString());
1761  }
1762 
1763  PacketPtr req_pkt = new Packet(pkt->req, mem_cmd);
1764  req_pkt->dataStatic(pkt->getPtr<uint8_t>());
1765  delete pkt;
1766 
1767  req_pkt->senderState =
1768         new ComputeUnit::ScalarDataPort::SenderState(gpuDynInst);
1769 
1770  // For a system request we want to mark the GPU instruction as a system
1771  // load/store so that after the request is issued to system memory we can
1772  // return any token acquired for the request. Since tokens are returned
1773  // by the coalescer and system requests do not take that path, this needs
1774  // to be tracked.
1775  //
1776  // Device requests change the requestor ID to something in the device
1777  // memory Ruby network.
1778  if (req_pkt->req->systemReq()) {
1779  gpuDynInst->setSystemReq();
1780  } else {
1781  req_pkt->req->requestorId(computeUnit->vramRequestorId());
1782  }
1783 
1784  ComputeUnit::ScalarDataPort::MemReqEvent *scalar_mem_req_event
1785         = new ComputeUnit::ScalarDataPort::MemReqEvent
1786  (computeUnit->scalarDataPort, req_pkt);
1787  computeUnit->schedule(scalar_mem_req_event, curTick() +
1788  computeUnit->req_tick_latency);
1789 
1790  return true;
1791 }
1792 
1793 bool
1794 ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
1795 {
1796  [[maybe_unused]] Addr line = pkt->req->getPaddr();
1797  DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
1798  computeUnit->cu_id, pkt->req->getVaddr(), line);
1799 
1800  assert(pkt->senderState);
1801 
1802  // pop off the TLB translation state
1803  GpuTranslationState *translation_state
1804  = safe_cast<GpuTranslationState*>(pkt->senderState);
1805 
1806  bool success = translation_state->tlbEntry != nullptr;
1807  delete translation_state->tlbEntry;
1808  assert(!translation_state->ports.size());
1809  pkt->senderState = translation_state->saved;
1810  delete translation_state;
1811 
1812  // use the original sender state to know how to close this transaction
1813  ITLBPort::SenderState *sender_state =
1814  safe_cast<ITLBPort::SenderState*>(pkt->senderState);
1815 
1816  // get the wavefront associated with this translation request
1817  Wavefront *wavefront = sender_state->wavefront;
1818  delete pkt->senderState;
1819 
1820  if (success) {
1821  // pkt is reused in fetch(), don't delete it here. However, we must
1822  // reset the command to be a request so that it can be sent through
1823  // the cu's request port
1824  assert(pkt->cmd == MemCmd::ReadResp);
1825  pkt->cmd = MemCmd::ReadReq;
1826 
1827  computeUnit->fetchStage.fetch(pkt, wavefront);
1828  } else {
1829  if (wavefront->dropFetch) {
1830  assert(wavefront->instructionBuffer.empty());
1831  wavefront->dropFetch = false;
1832  }
1833 
1834  wavefront->pendingFetch = 0;
1835  }
1836 
1837  return true;
1838 }
1839 
1840 /*
1841  * The initial translation request could have been rejected, if
1842  * <retries> queue is not empty. Retry sending the translation
1843  * request. sendRetry() is called from the peer port whenever
1844  * a translation completes.
1845  */
1846 void
1847 ComputeUnit::ITLBPort::recvReqRetry()
1848 {
1849 
1850  int len = retries.size();
1851     DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n",
1851             computeUnit->cu_id, len);
1852 
1853  assert(len > 0);
1854  assert(isStalled());
1855 
1856  // recvReqRetry is an indication that the resource on which this
1857  // port was stalling on is freed. So, remove the stall first
1858  unstallPort();
1859 
1860  for (int i = 0; i < len; ++i) {
1861  PacketPtr pkt = retries.front();
1862  [[maybe_unused]] Addr vaddr = pkt->req->getVaddr();
1863         DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x",
1863                 computeUnit->cu_id, vaddr);
1864 
1865  if (!sendTimingReq(pkt)) {
1866  stallPort(); // Stall port
1867  DPRINTF(GPUTLB, ": failed again\n");
1868  break;
1869  } else {
1870  DPRINTF(GPUTLB, ": successful\n");
1871  retries.pop_front();
1872  }
1873  }
1874 }
1875 
1876 void
1877 ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
1878 {
1879  if (gpuDynInst->isScalar()) {
1880  if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
1881             stats.sALUInsts++;
1882             stats.instCyclesSALU++;
1883         } else if (gpuDynInst->isLoad()) {
1884             stats.scalarMemReads++;
1885         } else if (gpuDynInst->isStore()) {
1886             stats.scalarMemWrites++;
1887         }
1888  } else {
1889         if (gpuDynInst->isALU()) {
1890             shader->total_valu_insts++;
1891             if (shader->total_valu_insts == shader->max_valu_insts) {
1892                 exitSimLoop("max vALU insts");
1893             }
1894             stats.vALUInsts++;
1895             stats.instCyclesVALU++;
1896             stats.threadCyclesVALU
1897                 += gpuDynInst->wavefront()->execMask().count();
1898  } else if (gpuDynInst->isFlat()) {
1899  if (gpuDynInst->isLocalMem()) {
1900  stats.flatLDSInsts++;
1901  } else {
1902  stats.flatVMemInsts++;
1903  }
1904  } else if (gpuDynInst->isFlatGlobal()) {
1905  stats.flatVMemInsts++;
1906  } else if (gpuDynInst->isLocalMem()) {
1907  stats.ldsNoFlatInsts++;
1908  } else if (gpuDynInst->isLoad()) {
1909  stats.vectorMemReads++;
1910  } else if (gpuDynInst->isStore()) {
1911  stats.vectorMemWrites++;
1912  }
1913 
1914  if (gpuDynInst->isLoad()) {
1915  switch (gpuDynInst->executedAs()) {
1916  case enums::SC_SPILL:
1917  stats.spillReads++;
1918  break;
1919  case enums::SC_GLOBAL:
1920  stats.globalReads++;
1921  break;
1922  case enums::SC_GROUP:
1923  stats.groupReads++;
1924  break;
1925  case enums::SC_PRIVATE:
1926  stats.privReads++;
1927  break;
1928  case enums::SC_READONLY:
1929  stats.readonlyReads++;
1930  break;
1931  case enums::SC_KERNARG:
1932  stats.kernargReads++;
1933  break;
1934  case enums::SC_ARG:
1935  stats.argReads++;
1936  break;
1937  case enums::SC_NONE:
1938  /**
1939  * this case can occur for flat mem insts
1940  * who execute with EXEC = 0
1941  */
1942  break;
1943  default:
1944  fatal("%s has no valid segment\n", gpuDynInst->disassemble());
1945  break;
1946  }
1947  } else if (gpuDynInst->isStore()) {
1948  switch (gpuDynInst->executedAs()) {
1949  case enums::SC_SPILL:
1950  stats.spillWrites++;
1951  break;
1952  case enums::SC_GLOBAL:
1953  stats.globalWrites++;
1954  break;
1955  case enums::SC_GROUP:
1956  stats.groupWrites++;
1957  break;
1958  case enums::SC_PRIVATE:
1959  stats.privWrites++;
1960  break;
1961  case enums::SC_READONLY:
1962  stats.readonlyWrites++;
1963  break;
1964  case enums::SC_KERNARG:
1965  stats.kernargWrites++;
1966  break;
1967  case enums::SC_ARG:
1968  stats.argWrites++;
1969  break;
1970  case enums::SC_NONE:
1971  /**
1972  * this case can occur for flat mem insts
1973  * who execute with EXEC = 0
1974  */
1975  break;
1976  default:
1977  fatal("%s has no valid segment\n", gpuDynInst->disassemble());
1978  break;
1979  }
1980  }
1981  }
1982 }
1983 
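 // Record a touch of the virtual page containing addr; the per-page
 // counts feed the pageDivergenceDist histogram.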
1984 void
1985 ComputeUnit::updatePageDivergenceDist(Addr addr)
1986 {
1987  Addr virt_page_addr = roundDown(addr, X86ISA::PageBytes);
1988 
1989  if (!pagesTouched.count(virt_page_addr))
1990  pagesTouched[virt_page_addr] = 1;
1991  else
1992  pagesTouched[virt_page_addr]++;
1993 }
1994 
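 // Exit callback: if page counting is enabled, dump the per-page access
 // counts (wavefront and work-item touches) as CSV to a file named after
 // this CU.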
1995 void
1996 ComputeUnit::exitCallback()
1997 {
1998  if (countPages) {
1999  std::ostream *page_stat_file = simout.create(name().c_str())->stream();
2000 
2001  *page_stat_file << "page, wavefront accesses, workitem accesses" <<
2002  std::endl;
2003 
2004  for (auto iter : pageAccesses) {
2005  *page_stat_file << std::hex << iter.first << ",";
2006  *page_stat_file << std::dec << iter.second.first << ",";
2007  *page_stat_file << std::dec << iter.second.second << std::endl;
2008  }
2009  }
2010 }
2011 
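 // A CU is done only when every vector ALU is idle and all of its operand
 // buses and memory pipelines have drained.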
2012 bool
2013 ComputeUnit::isDone() const
2014 {
2015  for (int i = 0; i < numVectorALUs; ++i) {
2016  if (!isVectorAluIdle(i)) {
2017  return false;
2018  }
2019  }
2020 
2021  // TODO: FIXME if more than 1 of any memory pipe supported
2022  if (!srfToScalarMemPipeBus.rdy()) {
2023  return false;
2024  }
2025  if (!vrfToGlobalMemPipeBus.rdy()) {
2026  return false;
2027  }
2028  if (!vrfToLocalMemPipeBus.rdy()) {
2029  return false;
2030  }
2031 
2032  if (!globalMemoryPipe.isGMReqFIFOWrRdy()
2033  || !localMemoryPipe.isLMReqFIFOWrRdy()
2034  || !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy()
2035  || !glbMemToVrfBus.rdy() || !scalarMemToSrfBus.rdy()) {
2036  return false;
2037  }
2038 
2039  return true;
2040 }
2041 
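 // Thin wrapper around the LDS: returns how many wavefronts of the given
 // workgroup still hold a reference to their LDS allocation.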
2042 int32_t
2043 ComputeUnit::getRefCounter(const uint32_t dispatchId,
2044  const uint32_t wgId) const
2045 {
2046  return lds.getRefCounter(dispatchId, wgId);
2047 }
2048 
2049 bool
2050 ComputeUnit::isVectorAluIdle(uint32_t simdId) const
2051 {
2052  assert(simdId < numVectorALUs);
2053 
2054  for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){
2055  if (wfList[simdId][i_wf]->getStatus() != Wavefront::S_STOPPED) {
2056  return false;
2057  }
2058  }
2059 
2060  return true;
2061 }
2062 
2063 /**
2064  * send a general request to the LDS
2065  * make sure to look at the return value here as your request might be
2066  * NACK'd and returning false means that you have to have some backup plan
2067  */
2068 bool
2069 ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
2070 {
2071  // this is just a request to carry the GPUDynInstPtr
2072  // back and forth
2073  RequestPtr newRequest = std::make_shared<Request>();
2074  newRequest->setPaddr(0x0);
2075 
2076  // ReadReq is not evaluated by the LDS but the Packet ctor requires this
2077  PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq);
2078 
2079  // This is the SenderState needed upon return
2080  newPacket->senderState = new LDSPort::SenderState(gpuDynInst);
2081 
2082  return ldsPort.sendTimingReq(newPacket);
2083 }
2084 
2085 /**
2086  * Forward the VRAM requestor ID needed for device memory from shader.
2087  */
2088 RequestorID
2089 ComputeUnit::vramRequestorId()
2090 {
2091  return FullSystem ? shader->vramRequestorId() : requestorId();
2092 }
2093 
2094 /**
2095  * get the result of packets sent to the LDS when they return
2096  */
2097 bool
2098 ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
2099 {
2100  const ComputeUnit::LDSPort::SenderState *senderState =
2101  dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState);
2102 
2103  fatal_if(!senderState, "did not get the right sort of sender state");
2104 
2105  GPUDynInstPtr gpuDynInst = senderState->getMemInst();
2106 
2107  delete packet->senderState;
2108  delete packet;
2109 
2110  computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
2111  return true;
2112 }
2113 
2114 /**
2115  * attempt to send this packet, either the port is already stalled, the
2116  * request is nack'd and must stall or the request goes through
2117  * when a request cannot be sent, add it to the retries queue
2118  */
2119 bool
2120 ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
2121 {
2122  ComputeUnit::LDSPort::SenderState *sender_state =
2123  dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
2124  fatal_if(!sender_state, "packet without a valid sender state");
2125 
2126  [[maybe_unused]] GPUDynInstPtr gpuDynInst = sender_state->getMemInst();
2127 
2128  if (isStalled()) {
2129  fatal_if(retries.empty(), "must have retries waiting to be stalled");
2130 
2131  retries.push(pkt);
2132 
2133  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
2134  computeUnit->cu_id, gpuDynInst->simdId,
2135  gpuDynInst->wfSlotId);
2136  return false;
2137  } else if (!RequestPort::sendTimingReq(pkt)) {
2138  // need to stall the LDS port until a recvReqRetry() is received
2139  // this indicates that there is more space
2140  stallPort();
2141  retries.push(pkt);
2142 
2143  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
2144  computeUnit->cu_id, gpuDynInst->simdId,
2145  gpuDynInst->wfSlotId, pkt->req->getPaddr());
2146  return false;
2147  } else {
2148  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
2149  computeUnit->cu_id, gpuDynInst->simdId,
2150  gpuDynInst->wfSlotId, pkt->req->getPaddr());
2151  return true;
2152  }
2153 }
2154 
2155 /**
2156  * the bus is telling the port that there is now space so retrying
2157  * stalled requests should work now
2158  * this allows the port to have a request be nack'd and then have the
2159  * receiver say when there is space, rather than retrying every cycle
2160  */
2161 void
2162 ComputeUnit::LDSPort::recvReqRetry()
2163 {
2164  auto queueSize = retries.size();
2165 
2166  DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
2167  computeUnit->cu_id, queueSize);
2168 
2169  fatal_if(queueSize < 1,
2170  "why was there a recvReqRetry() with no pending reqs?");
2171  fatal_if(!isStalled(),
2172  "recvReqRetry() happened when the port was not stalled");
2173 
2174  unstallPort();
2175 
2176  while (!retries.empty()) {
2177  PacketPtr packet = retries.front();
2178 
2179  DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);
2180 
2181  if (!RequestPort::sendTimingReq(packet)) {
2182  // Stall port
2183  stallPort();
2184  DPRINTF(GPUPort, ": LDS send failed again\n");
2185  break;
2186  } else {
2187  DPRINTF(GPUPort, ": LDS send successful\n");
2188  retries.pop();
2189  }
2190  }
2191 }
2192 
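 // The stats constructor registers every counter via ADD_STAT and then
 // wires up the derived formulas (per-wavefront, per-kilo-instruction,
 // and per-cycle rates) that are evaluated at stat-dump time.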
2193 ComputeUnit::ComputeUnitStats::ComputeUnitStats(statistics::Group *parent,
2194  int n_wf)
2195  : statistics::Group(parent),
2196  ADD_STAT(vALUInsts, "Number of vector ALU insts issued."),
2197  ADD_STAT(vALUInstsPerWF, "The avg. number of vector ALU insts issued "
2198  "per-wavefront."),
2199  ADD_STAT(sALUInsts, "Number of scalar ALU insts issued."),
2200  ADD_STAT(sALUInstsPerWF, "The avg. number of scalar ALU insts issued "
2201  "per-wavefront."),
2202  ADD_STAT(instCyclesVALU,
2203  "Number of cycles needed to execute VALU insts."),
2204  ADD_STAT(instCyclesSALU,
2205  "Number of cycles needed to execute SALU insts."),
2206  ADD_STAT(threadCyclesVALU, "Number of thread cycles used to execute "
2207  "vector ALU ops. Similar to instCyclesVALU but multiplied by "
2208  "the number of active threads."),
2209  ADD_STAT(vALUUtilization,
2210  "Percentage of active vector ALU threads in a wave."),
2211  ADD_STAT(ldsNoFlatInsts, "Number of LDS insts issued, not including FLAT"
2212  " accesses that resolve to LDS."),
2213  ADD_STAT(ldsNoFlatInstsPerWF, "The avg. number of LDS insts (not "
2214  "including FLAT accesses that resolve to LDS) per-wavefront."),
2215  ADD_STAT(flatVMemInsts,
2216  "The number of FLAT insts that resolve to vmem issued."),
2217  ADD_STAT(flatVMemInstsPerWF, "The average number of FLAT insts that "
2218  "resolve to vmem issued per-wavefront."),
2219  ADD_STAT(flatLDSInsts,
2220  "The number of FLAT insts that resolve to LDS issued."),
2221  ADD_STAT(flatLDSInstsPerWF, "The average number of FLAT insts that "
2222  "resolve to LDS issued per-wavefront."),
2223  ADD_STAT(vectorMemWrites,
2224  "Number of vector mem write insts (excluding FLAT insts)."),
2225  ADD_STAT(vectorMemWritesPerWF, "The average number of vector mem write "
2226  "insts (excluding FLAT insts) per-wavefront."),
2227  ADD_STAT(vectorMemReads,
2228  "Number of vector mem read insts (excluding FLAT insts)."),
2229  ADD_STAT(vectorMemReadsPerWF, "The avg. number of vector mem read insts "
2230  "(excluding FLAT insts) per-wavefront."),
2231  ADD_STAT(scalarMemWrites, "Number of scalar mem write insts."),
2232  ADD_STAT(scalarMemWritesPerWF,
2233  "The average number of scalar mem write insts per-wavefront."),
2234  ADD_STAT(scalarMemReads, "Number of scalar mem read insts."),
2235  ADD_STAT(scalarMemReadsPerWF,
2236  "The average number of scalar mem read insts per-wavefront."),
2237  ADD_STAT(vectorMemReadsPerKiloInst,
2238  "Number of vector mem reads per kilo-instruction"),
2239  ADD_STAT(vectorMemWritesPerKiloInst,
2240  "Number of vector mem writes per kilo-instruction"),
2241  ADD_STAT(vectorMemInstsPerKiloInst,
2242  "Number of vector mem insts per kilo-instruction"),
2243  ADD_STAT(scalarMemReadsPerKiloInst,
2244  "Number of scalar mem reads per kilo-instruction"),
2245  ADD_STAT(scalarMemWritesPerKiloInst,
2246  "Number of scalar mem writes per kilo-instruction"),
2247  ADD_STAT(scalarMemInstsPerKiloInst,
2248  "Number of scalar mem insts per kilo-instruction"),
2249  ADD_STAT(instCyclesVMemPerSimd, "Number of cycles to send address, "
2250  "command, data from VRF to vector memory unit, per SIMD"),
2251  ADD_STAT(instCyclesScMemPerSimd, "Number of cycles to send address, "
2252  "command, data from SRF to scalar memory unit, per SIMD"),
2253  ADD_STAT(instCyclesLdsPerSimd, "Number of cycles to send address, "
2254  "command, data from VRF to LDS unit, per SIMD"),
2255  ADD_STAT(globalReads, "Number of reads to the global segment"),
2256  ADD_STAT(globalWrites, "Number of writes to the global segment"),
2257  ADD_STAT(globalMemInsts,
2258  "Number of memory instructions sent to the global segment"),
2259  ADD_STAT(argReads, "Number of reads to the arg segment"),
2260  ADD_STAT(argWrites, "Number of writes to the arg segment"),
2261  ADD_STAT(argMemInsts,
2262  "Number of memory instructions sent to the arg segment"),
2263  ADD_STAT(spillReads, "Number of reads to the spill segment"),
2264  ADD_STAT(spillWrites, "Number of writes to the spill segment"),
2265  ADD_STAT(spillMemInsts,
2266  "Number of memory instructions sent to the spill segment"),
2267  ADD_STAT(groupReads, "Number of reads to the group segment"),
2268  ADD_STAT(groupWrites, "Number of writes to the group segment"),
2269  ADD_STAT(groupMemInsts,
2270  "Number of memory instructions sent to the group segment"),
2271  ADD_STAT(privReads, "Number of reads to the private segment"),
2272  ADD_STAT(privWrites, "Number of writes to the private segment"),
2273  ADD_STAT(privMemInsts,
2274  "Number of memory instructions sent to the private segment"),
2275  ADD_STAT(readonlyReads, "Number of reads to the readonly segment"),
2276  ADD_STAT(readonlyWrites,
2277  "Number of writes to the readonly segment"),
2278  ADD_STAT(readonlyMemInsts,
2279  "Number of memory instructions sent to the readonly segment"),
2280  ADD_STAT(kernargReads, "Number of reads sent to the kernarg segment"),
2281  ADD_STAT(kernargWrites,
2282  "Number of writes to the kernarg segment"),
2283  ADD_STAT(kernargMemInsts,
2284  "Number of memory instructions sent to the kernarg segment"),
2285  ADD_STAT(waveLevelParallelism,
2286  "wave level parallelism: count of active waves at wave launch"),
2287  ADD_STAT(tlbRequests, "number of uncoalesced requests"),
2288  ADD_STAT(tlbCycles,
2289  "total number of cycles for all uncoalesced requests"),
2290  ADD_STAT(tlbLatency, "Avg. translation latency for data translations"),
2291  ADD_STAT(hitsPerTLBLevel,
2292  "TLB hits distribution (0 for page table, x for Lx-TLB)"),
2293  ADD_STAT(ldsBankAccesses, "Total number of LDS bank accesses"),
2294  ADD_STAT(ldsBankConflictDist,
2295  "Number of bank conflicts per LDS memory packet"),
2296  ADD_STAT(pageDivergenceDist,
2297  "pages touched per wf (over all mem. instr.)"),
2298  ADD_STAT(dynamicGMemInstrCnt,
2299  "dynamic non-flat global memory instruction count"),
2300  ADD_STAT(dynamicFlatMemInstrCnt,
2301  "dynamic flat global memory instruction count"),
2302  ADD_STAT(dynamicLMemInstrCnt, "dynamic local memory instruction count"),
2303  ADD_STAT(wgBlockedDueBarrierAllocation,
2304  "WG dispatch was blocked due to lack of barrier resources"),
2305  ADD_STAT(wgBlockedDueLdsAllocation,
2306  "Workgroup blocked due to LDS capacity"),
2307  ADD_STAT(numInstrExecuted, "number of instructions executed"),
2308  ADD_STAT(execRateDist, "Instruction Execution Rate: Number of executed "
2309  "vector instructions per cycle"),
2310  ADD_STAT(numVecOpsExecuted,
2311  "number of vec ops executed (e.g. WF size/inst)"),
2312  ADD_STAT(numVecOpsExecutedF16,
2313  "number of f16 vec ops executed (e.g. WF size/inst)"),
2314  ADD_STAT(numVecOpsExecutedF32,
2315  "number of f32 vec ops executed (e.g. WF size/inst)"),
2316  ADD_STAT(numVecOpsExecutedF64,
2317  "number of f64 vec ops executed (e.g. WF size/inst)"),
2318  ADD_STAT(numVecOpsExecutedFMA16,
2319  "number of fma16 vec ops executed (e.g. WF size/inst)"),
2320  ADD_STAT(numVecOpsExecutedFMA32,
2321  "number of fma32 vec ops executed (e.g. WF size/inst)"),
2322  ADD_STAT(numVecOpsExecutedFMA64,
2323  "number of fma64 vec ops executed (e.g. WF size/inst)"),
2324  ADD_STAT(numVecOpsExecutedMAC16,
2325  "number of mac16 vec ops executed (e.g. WF size/inst)"),
2326  ADD_STAT(numVecOpsExecutedMAC32,
2327  "number of mac32 vec ops executed (e.g. WF size/inst)"),
2328  ADD_STAT(numVecOpsExecutedMAC64,
2329  "number of mac64 vec ops executed (e.g. WF size/inst)"),
2330  ADD_STAT(numVecOpsExecutedMAD16,
2331  "number of mad16 vec ops executed (e.g. WF size/inst)"),
2332  ADD_STAT(numVecOpsExecutedMAD32,
2333  "number of mad32 vec ops executed (e.g. WF size/inst)"),
2334  ADD_STAT(numVecOpsExecutedMAD64,
2335  "number of mad64 vec ops executed (e.g. WF size/inst)"),
2336  ADD_STAT(numVecOpsExecutedTwoOpFP,
2337  "number of two op FP vec ops executed (e.g. WF size/inst)"),
2338  ADD_STAT(totalCycles, "number of cycles the CU ran for"),
2339  ADD_STAT(vpc, "Vector Operations per cycle (this CU only)"),
2340  ADD_STAT(vpc_f16, "F16 Vector Operations per cycle (this CU only)"),
2341  ADD_STAT(vpc_f32, "F32 Vector Operations per cycle (this CU only)"),
2342  ADD_STAT(vpc_f64, "F64 Vector Operations per cycle (this CU only)"),
2343  ADD_STAT(ipc, "Instructions per cycle (this CU only)"),
2344  ADD_STAT(controlFlowDivergenceDist, "number of lanes active per "
2345  "instruction (over all instructions)"),
2346  ADD_STAT(activeLanesPerGMemInstrDist,
2347  "number of active lanes per global memory instruction"),
2348  ADD_STAT(activeLanesPerLMemInstrDist,
2349  "number of active lanes per local memory instruction"),
2350  ADD_STAT(numALUInstsExecuted,
2351  "Number of dynamic non-GM memory insts executed"),
2352  ADD_STAT(numTimesWgBlockedDueVgprAlloc, "Number of times WGs are "
2353  "blocked due to VGPR allocation per SIMD"),
2354  ADD_STAT(numTimesWgBlockedDueSgprAlloc, "Number of times WGs are "
2355  "blocked due to SGPR allocation per SIMD"),
2356  ADD_STAT(numCASOps, "number of compare and swap operations"),
2357  ADD_STAT(numFailedCASOps,
2358  "number of compare and swap operations that failed"),
2359  ADD_STAT(completedWfs, "number of completed wavefronts"),
2360  ADD_STAT(completedWGs, "number of completed workgroups"),
2361  ADD_STAT(headTailLatency, "ticks between first and last cache block "
2362  "arrival at coalescer"),
2363  ADD_STAT(instInterleave, "Measure of instruction interleaving per SIMD")
2364 {
2365  ComputeUnit *cu = static_cast<ComputeUnit*>(parent);
2366 
2370 
2371  hitsPerTLBLevel.init(4);
2372  execRateDist.init(0, 10, 2);
2373  ldsBankConflictDist.init(0, cu->wfSize(), 2);
2374 
2375  pageDivergenceDist.init(1, cu->wfSize(), 4);
2376  controlFlowDivergenceDist.init(1, cu->wfSize(), 4);
2377  activeLanesPerGMemInstrDist.init(1, cu->wfSize(), 4);
2378  activeLanesPerLMemInstrDist.init(1, cu->wfSize(), 4);
2379 
2380  headTailLatency.init(0, 1000000, 10000).flags(statistics::pdf |
2381  statistics::oneline);
2382  waveLevelParallelism.init(0, n_wf * cu->numVectorALUs, 1);
2383  instInterleave.init(cu->numVectorALUs, 0, 20, 1);
2384 
2385  vALUInstsPerWF = vALUInsts / completedWfs;
2386  sALUInstsPerWF = sALUInsts / completedWfs;
2387  vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
2388  ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
2389  flatVMemInstsPerWF = flatVMemInsts / completedWfs;
2390  flatLDSInstsPerWF = flatLDSInsts / completedWfs;
2391  vectorMemWritesPerWF = vectorMemWrites / completedWfs;
2392  vectorMemReadsPerWF = vectorMemReads / completedWfs;
2393  scalarMemWritesPerWF = scalarMemWrites / completedWfs;
2394  scalarMemReadsPerWF = scalarMemReads / completedWfs;
2395 
2396  vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000;
2397  vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000;
2398  vectorMemInstsPerKiloInst =
2399  ((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000;
2400  scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000;
2401  scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000;
2402  scalarMemInstsPerKiloInst =
2403  ((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000;
2404 
2405  vpc = numVecOpsExecuted / totalCycles;
2406  vpc_f16 = numVecOpsExecutedF16 / totalCycles;
2407  vpc_f32 = numVecOpsExecutedF32 / totalCycles;
2408  vpc_f64 = numVecOpsExecutedF64 / totalCycles;
2409  ipc = numInstrExecuted / totalCycles;
2410  numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
2411  dynamicLMemInstrCnt;
2412 
2413  tlbLatency = tlbCycles / tlbRequests;
2414 
2415  // fixed number of TLB levels
2416  for (int i = 0; i < 4; ++i) {
2417  if (!i)
2418  hitsPerTLBLevel.subname(i,"page_table");
2419  else
2420  hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
2421  }
2422 
2428 
2431 }
2432 
2433 } // namespace gem5
gem5::ComputeUnit::ComputeUnitStats::tlbRequests
statistics::Scalar tlbRequests
Definition: compute_unit.hh:1060
gem5::ComputeUnit::ComputeUnitStats::sALUInstsPerWF
statistics::Formula sALUInstsPerWF
Definition: compute_unit.hh:1001
gem5::curTick
Tick curTick()
The universal simulation clock.
Definition: cur_tick.hh:46
fatal
#define fatal(...)
This implements a cprintf based fatal() function.
Definition: logging.hh:190
gem5::PortID
int16_t PortID
Port index/ID type, and a symbolic name for an invalid port id.
Definition: types.hh:245
gem5::ComputeUnit::ComputeUnitStats::vALUUtilization
statistics::Formula vALUUtilization
Definition: compute_unit.hh:1005
gem5::ComputeUnit::getAndIncSeqNum
InstSeqNum getAndIncSeqNum()
Definition: compute_unit.hh:932
gem5::GMEnqueue
@ GMEnqueue
Definition: misc.hh:56
gem5::HSAQueueEntry::numWg
int numWg(int dim) const
Definition: hsa_queue_entry.hh:235
gem5::ComputeUnit::wfList
std::vector< std::vector< Wavefront * > > wfList
Definition: compute_unit.hh:291
gem5::ComputeUnit::ComputeUnit
ComputeUnit(const Params &p)
Definition: compute_unit.cc:65
gem5::BaseMMU::Read
@ Read
Definition: mmu.hh:56
gem5::ComputeUnit::ComputeUnitStats::scalarMemReadsPerWF
statistics::Formula scalarMemReadsPerWF
Definition: compute_unit.hh:1019
gem5::RequestPort::sendTimingReq
bool sendTimingReq(PacketPtr pkt)
Attempt to send a timing request to the responder port by calling its corresponding receive function.
Definition: port.hh:495
gem5::LdsState::getRefCounter
int getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
return the current reference count for this workgroup id
Definition: lds_state.hh:330
gem5::ComputeUnit::ComputeUnitStats::instCyclesSALU
statistics::Scalar instCyclesSALU
Definition: compute_unit.hh:1003
gem5::Shader::gpuTc
ThreadContext * gpuTc
Definition: shader.hh:109
simple_pool_manager.hh
gem5::Wavefront::S_RUNNING
@ S_RUNNING
Definition: wavefront.hh:70
gem5::ComputeUnit::fetchStage
FetchStage fetchStage
Definition: compute_unit.hh:280
gem5::ComputeUnit::ComputeUnitStats::instInterleave
statistics::VectorDistribution instInterleave
Definition: compute_unit.hh:1139
gem5::ComputeUnit::ComputeUnitStats::flatVMemInsts
statistics::Scalar flatVMemInsts
Definition: compute_unit.hh:1008
gem5::ComputeUnit::ScalarDTLBPort::retries
std::deque< PacketPtr > retries
Definition: compute_unit.hh:763
gem5::ComputeUnit::sendRequest
void sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
Definition: compute_unit.cc:1035
gem5::ScalarMemPipeline::exec
void exec()
Definition: scalar_memory_pipeline.cc:54
gem5::FetchStage::processFetchReturn
void processFetchReturn(PacketPtr pkt)
Definition: fetch_stage.cc:73
gem5::ComputeUnit::debugSegFault
bool debugSegFault
Definition: compute_unit.hh:341
gem5::FetchStage::exec
void exec()
Definition: fetch_stage.cc:65
shader.hh
gem5::ComputeUnit::DataPort::processMemReqEvent
void processMemReqEvent(PacketPtr pkt)
Definition: compute_unit.cc:1626
gem5::ComputeUnit::localMemoryPipe
LocalMemPipeline localMemoryPipe
Definition: compute_unit.hh:285
gem5::ComputeUnit::ComputeUnitStats::privWrites
statistics::Scalar privWrites
Definition: compute_unit.hh:1047
gem5::ComputeUnit::ComputeUnitStats::kernargWrites
statistics::Scalar kernargWrites
Definition: compute_unit.hh:1053
gem5::MemCmd::SwapReq
@ SwapReq
Definition: packet.hh:119
gem5::ComputeUnit::numVecRegsPerSimd
int numVecRegsPerSimd
Definition: compute_unit.hh:371
gem5::GpuTranslationState
GPU TranslationState: this currently is a somewhat bastardization of the usage of SenderState,...
Definition: gpu_translation_state.hh:58
gem5::ComputeUnit::srf
std::vector< ScalarRegisterFile * > srf
Definition: compute_unit.hh:297
gem5::GpuTranslationState::ports
std::vector< ResponsePort * > ports
Definition: gpu_translation_state.hh:79
gem5::ComputeUnit::ComputeUnitStats::scalarMemWritesPerWF
statistics::Formula scalarMemWritesPerWF
Definition: compute_unit.hh:1017
gem5::ComputeUnit::ComputeUnitStats::argMemInsts
statistics::Formula argMemInsts
Definition: compute_unit.hh:1039
gem5::ComputeUnit::ITLBPort::recvTimingResp
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
Definition: compute_unit.cc:1794
gem5::ComputeUnit::ComputeUnitStats::spillWrites
statistics::Scalar spillWrites
Definition: compute_unit.hh:1041
gem5::ComputeUnit::ComputeUnitStats::spillMemInsts
statistics::Formula spillMemInsts
Definition: compute_unit.hh:1042
gem5::ComputeUnit::ComputeUnitStats::scalarMemWritesPerKiloInst
statistics::Formula scalarMemWritesPerKiloInst
Definition: compute_unit.hh:1025
gem5::MipsISA::index
Bitfield< 30, 0 > index
Definition: pra_constants.hh:47
gem5::ComputeUnit::DataPort::handleResponse
bool handleResponse(PacketPtr pkt)
Definition: compute_unit.cc:814
gem5::ComputeUnit::ComputeUnitStats::readonlyReads
statistics::Scalar readonlyReads
Definition: compute_unit.hh:1049
gem5::ComputeUnit::handleSQCReturn
void handleSQCReturn(PacketPtr pkt)
Definition: compute_unit.cc:1006
gem5::ComputeUnit::LDSPort::SenderState
SenderState is information carried along with the packet, esp.
Definition: compute_unit.hh:834
gem5::ComputeUnit::ComputeUnitStats::wgBlockedDueBarrierAllocation
statistics::Scalar wgBlockedDueBarrierAllocation
Definition: compute_unit.hh:1079
gem5::ComputeUnit::DTLBPort::recvTimingResp
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
Definition: compute_unit.cc:1424
gem5::Packet::pushSenderState
void pushSenderState(SenderState *sender_state)
Push a new sender state to the packet and make the current sender state the predecessor of the new on...
Definition: packet.cc:330
gem5::ComputeUnit::ComputeUnitStats::vectorMemReads
statistics::Scalar vectorMemReads
Definition: compute_unit.hh:1014
gem5::BaseMMU::Mode
Mode
Definition: mmu.hh:56
gem5::Packet::req
RequestPtr req
A pointer to the original request.
Definition: packet.hh:374
gem5::Request::KERNEL
@ KERNEL
The request should be marked with KERNEL.
Definition: request.hh:183
gem5::Shader::vramRequestorId
RequestorID vramRequestorId()
Forward the VRAM requestor ID needed for device memory from CP.
Definition: shader.cc:530
gem5::ComputeUnit::lastVaddrSimd
std::vector< std::vector< Addr > > lastVaddrSimd
Definition: compute_unit.hh:336
gem5::BaseMMU::Write
@ Write
Definition: mmu.hh:56
gem5::Wavefront
Definition: wavefront.hh:60
gem5::ComputeUnit::ScalarDTLBPort::isStalled
bool isStalled() const
Definition: compute_unit.hh:759
gem5::FetchStage::init
void init()
Definition: fetch_stage.cc:56
gem5::ComputeUnit::ComputeUnitStats::numVecOpsExecutedF64
statistics::Scalar numVecOpsExecutedF64
Definition: compute_unit.hh:1095
gem5::ComputeUnit::ComputeUnitStats::dynamicGMemInstrCnt
statistics::Scalar dynamicGMemInstrCnt
Definition: compute_unit.hh:1074
gem5::HSAQueueEntry
Definition: hsa_queue_entry.hh:59
compute_unit.hh
gem5::ComputeUnit::firstMemUnit
int firstMemUnit() const
Definition: compute_unit.cc:246
gem5::ComputeUnit::pagesTouched
std::map< Addr, int > pagesTouched
Definition: compute_unit.hh:378
gpu_static_inst.hh
gem5::VectorMask
std::bitset< std::numeric_limits< unsigned long long >::digits > VectorMask
Definition: misc.hh:45
gem5::ComputeUnit::DataPort::SystemHubEvent
Definition: compute_unit.hh:532
gem5::ComputeUnit::scoreboardCheckStage
ScoreboardCheckStage scoreboardCheckStage
Definition: compute_unit.hh:281
gem5::ComputeUnit::stats
gem5::ComputeUnit::ComputeUnitStats stats
gem5::ComputeUnit::headTailMap
std::unordered_map< GPUDynInstPtr, Tick > headTailMap
Definition: compute_unit.hh:988
gem5::ComputeUnit::ComputeUnitStats::vpc_f16
statistics::Formula vpc_f16
Definition: compute_unit.hh:1113
gem5::floorLog2
static constexpr std::enable_if_t< std::is_integral_v< T >, int > floorLog2(T x)
Definition: intmath.hh:59
gem5::ComputeUnit::ComputeUnitStats::tlbLatency
statistics::Formula tlbLatency
Definition: compute_unit.hh:1062
gem5::simout
OutputDirectory simout
Definition: output.cc:62
gem5::ComputeUnit::DataPort::processMemRespEvent
void processMemRespEvent(PacketPtr pkt)
Definition: compute_unit.cc:1342
gem5::ComputeUnit::lastVaddrCU
std::vector< Addr > lastVaddrCU
Definition: compute_unit.hh:335
gem5::MemCmd::SwapResp
@ SwapResp
Definition: packet.hh:120
gem5::ComputeUnit::ScalarDTLBPort::recvTimingResp
bool recvTimingResp(PacketPtr pkt) override
Receive a timing response from the peer.
Definition: compute_unit.cc:1723
gem5::statistics::DataWrapVec::subname
Derived & subname(off_type index, const std::string &name)
Set the subfield name for the given index, and marks this stat to print at the end of simulation.
Definition: statistics.hh:402
gem5::ComputeUnit::ComputeUnitStats::sALUInsts
statistics::Scalar sALUInsts
Definition: compute_unit.hh:1000
gem5::Packet::isWrite
bool isWrite() const
Definition: packet.hh:591
gem5::ComputeUnit::exec
void exec()
Definition: compute_unit.cc:726
gem5::Wavefront::pendingFetch
bool pendingFetch
Definition: wavefront.hh:111
gem5::ComputeUnit::srfToScalarMemPipeBus
WaitClass srfToScalarMemPipeBus
Definition: compute_unit.hh:239
gem5::ComputeUnit::releaseBarrier
void releaseBarrier(int bar_id)
Definition: compute_unit.cc:703
gem5::ComputeUnit::ComputeUnitStats::instCyclesScMemPerSimd
statistics::Vector instCyclesScMemPerSimd
Definition: compute_unit.hh:1031
gem5::AMDGPUSystemHub::sendRequest
void sendRequest(PacketPtr pkt, Event *callback)
Definition: system_hub.cc:40
gem5::Shader::notifyCuSleep
void notifyCuSleep()
Definition: shader.cc:517
gem5::ComputeUnit::numYetToReachBarrier
int numYetToReachBarrier(int bar_id)
Definition: compute_unit.cc:654
gem5::HSAQueueEntry::wgId
int wgId(int dim) const
Definition: hsa_queue_entry.hh:209
gem5::VegaISA::w
Bitfield< 6 > w
Definition: pagetable.hh:59
gem5::GPUDispatcher::shader
Shader * shader
Definition: dispatcher.hh:85
gem5::ComputeUnit::ComputeUnitStats::ldsBankConflictDist
statistics::Distribution ldsBankConflictDist
Definition: compute_unit.hh:1068
gem5::ComputeUnit::getRefCounter
int32_t getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
Definition: compute_unit.cc:2043
gem5::EventManager::schedule
void schedule(Event &event, Tick when)
Definition: eventq.hh:1019
gem5::ComputeUnit::ITLBPort::SenderState::wavefront
Wavefront * wavefront
Definition: compute_unit.hh:793
gem5::ComputeUnit::ScalarDataPort::handleResponse
bool handleResponse(PacketPtr pkt)
Definition: compute_unit.cc:921
gem5::OutputDirectory::create
OutputStream * create(const std::string &name, bool binary=false, bool no_gz=false)
Creates a file in this directory (optionally compressed).
Definition: output.cc:210
gem5::ComputeUnit::ComputeUnitStats::kernargReads
statistics::Scalar kernargReads
Definition: compute_unit.hh:1052
gem5::csprintf
std::string csprintf(const char *format, const Args &...args)
Definition: cprintf.hh:161
gem5::Wavefront::S_STOPPED
@ S_STOPPED
Definition: wavefront.hh:66
gem5::X86ISA::PageShift
const Addr PageShift
Definition: page_size.hh:48
gem5::ComputeUnit::vrfToGlobalMemPipeBus
WaitClass vrfToGlobalMemPipeBus
Definition: compute_unit.hh:223
gem5::ComputeUnit::ComputeUnitStats::ldsNoFlatInsts
statistics::Scalar ldsNoFlatInsts
Definition: compute_unit.hh:1006
gem5::ComputeUnit::resetBarrier
void resetBarrier(int bar_id)
Definition: compute_unit.cc:689
gem5::ComputeUnit::ComputeUnitStats::globalReads
statistics::Scalar globalReads
Definition: compute_unit.hh:1034
gem5::ComputeUnit::ComputeUnitStats::groupMemInsts
statistics::Formula groupMemInsts
Definition: compute_unit.hh:1045
gem5::ComputeUnit::ComputeUnitStats::vpc
statistics::Formula vpc
Definition: compute_unit.hh:1112
gem5::RegisterManager::vrfPoolMgrs
std::vector< PoolManager * > vrfPoolMgrs
Definition: register_manager.hh:80
gem5::ComputeUnit::ComputeUnitStats::activeLanesPerLMemInstrDist
statistics::Distribution activeLanesPerLMemInstrDist
Definition: compute_unit.hh:1119
gem5::ComputeUnit::memPortTokens
TokenManager * memPortTokens
Definition: compute_unit.hh:507
gem5::GlobalMemPipeline::init
void init()
Definition: global_memory_pipeline.cc:57
gem5::ArmISA::i
Bitfield< 7 > i
Definition: misc_types.hh:67
gem5::ComputeUnit::numVectorSharedMemUnits
int numVectorSharedMemUnits
Definition: compute_unit.hh:227
gem5::ComputeUnit::shader
Shader * shader
Definition: compute_unit.hh:353
gem5::ComputeUnit::req_tick_latency
Tick req_tick_latency
Definition: compute_unit.hh:355
gem5::ComputeUnit::ComputeUnitStats::vectorMemWritesPerWF
statistics::Formula vectorMemWritesPerWF
Definition: compute_unit.hh:1013
sim_exit.hh
gem5::HSAQueueEntry::numScalarRegs
int numScalarRegs() const
Definition: hsa_queue_entry.hh:141
gem5::isPowerOf2
static constexpr bool isPowerOf2(const T &n)
Definition: intmath.hh:98
output.hh
gem5::ComputeUnit::ComputeUnitStats::headTailLatency
statistics::Distribution headTailLatency
Definition: compute_unit.hh:1133
gem5::ComputeUnit::scalarDataPort
ScalarDataPort scalarDataPort
Definition: compute_unit.hh:900
gem5::ComputeUnit::ComputeUnitStats::threadCyclesVALU
statistics::Scalar threadCyclesVALU
Definition: compute_unit.hh:1004
gem5::ComputeUnit::cu_id
int cu_id
Definition: compute_unit.hh:292
gem5::ComputeUnit::ComputeUnitStats::vectorMemReadsPerWF
statistics::Formula vectorMemReadsPerWF
Definition: compute_unit.hh:1015
gem5::statistics::DistBase::sample
void sample(const U &v, int n=1)
Add a value to the distribtion n times.
Definition: statistics.hh:1328
gem5::ComputeUnit::vrf
std::vector< VectorRegisterFile * > vrf
Definition: compute_unit.hh:295
gem5::ComputeUnit::ComputeUnitStats::instCyclesVMemPerSimd
statistics::Vector instCyclesVMemPerSimd
Definition: compute_unit.hh:1030
wavefront.hh
gem5::exitSimLoop
void exitSimLoop(const std::string &message, int exit_code, Tick when, Tick repeat, bool serialize)
Schedule an event to exit the simulation loop (returning to Python) at the end of the current cycle (...
Definition: sim_events.cc:88
gem5::TokenRequestPort::setTokenManager
void setTokenManager(TokenManager *_tokenManager)
Specify a token manger, which will handle tracking of tokens for a TokenRequestPort/ResponseRequestPo...
Definition: token_port.cc:72
gem5::ComputeUnit::SQCPort::recvReqRetry
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
Definition: compute_unit.cc:1012
gem5::ComputeUnit::ComputeUnitStats::groupReads
statistics::Scalar groupReads
Definition: compute_unit.hh:1043
gem5::GPUComputeDriver::setMtype
void setMtype(RequestPtr req)
Called by the compute units right before a request is issued to ruby.
Definition: gpu_compute_driver.cc:1022
gem5::ComputeUnit::ComputeUnitStats::vpc_f64
statistics::Formula vpc_f64
Definition: compute_unit.hh:1115
gem5::ComputeUnit::injectGlobalMemFence
void injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelMemSync, RequestPtr req=nullptr)
Definition: compute_unit.cc:1260
gem5::ComputeUnit::ScalarDataPort::computeUnit
ComputeUnit * computeUnit
Definition: compute_unit.hh:650
gem5::ComputeUnit::LDSPort::sendTimingReq
virtual bool sendTimingReq(PacketPtr pkt)
attempt to send this packet, either the port is already stalled, the request is nack'd and must stall...
Definition: compute_unit.cc:2120
gem5::ComputeUnit::locMemToVrfBus
WaitClass locMemToVrfBus
Definition: compute_unit.hh:229
gem5::MemCmd
Definition: packet.hh:75
gem5::statistics::pdf
const FlagsType pdf
Print the percent of the total that this entry represents.
Definition: info.hh:62
gem5::Shader::systemHub
AMDGPUSystemHub * systemHub
Definition: shader.hh:229
gem5::ComputeUnit::ComputeUnitStats::kernargMemInsts
statistics::Formula kernargMemInsts
Definition: compute_unit.hh:1054
gem5::ComputeUnit::ComputeUnitStats::flatVMemInstsPerWF
statistics::Formula flatVMemInstsPerWF
Definition: compute_unit.hh:1009
gem5::Packet::dataStatic
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
Definition: packet.hh:1147
gem5::Wavefront::setStatus
void setStatus(status_e newStatus)
Definition: wavefront.cc:518
gem5::LdsState::canReserve
bool canReserve(uint32_t x_size) const
can this much space be reserved for a workgroup?
Definition: lds_state.hh:478
gem5::ComputeUnit::numScalarMemUnits
int numScalarMemUnits
Definition: compute_unit.hh:235
gem5::GPUDispatcher::updateWbCounter
bool updateWbCounter(int kern_id, int val=-1)
update the counter of oustanding wb requests for the kernel kern_id: kernel id val: +1/-1,...
Definition: dispatcher.cc:266
gem5::ComputeUnit::DTLBPort::recvReqRetry
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
Definition: compute_unit.cc:1692
gem5::ComputeUnit::ITLBPort::SenderState
SenderState is information carried along with the packet throughout the TLB hierarchy.
Definition: compute_unit.hh:790
gem5::ArmISA::j
Bitfield< 24 > j
Definition: misc_types.hh:57
gem5::ComputeUnit
Definition: compute_unit.hh:201
gem5::ComputeUnit::ScalarDataPort::MemReqEvent::process
void process()
Definition: compute_unit.cc:1659
gem5::ComputeUnit::pageAccesses
pageDataStruct pageAccesses
Definition: compute_unit.hh:486
gem5::HSAQueueEntry::MAX_DIM
const static int MAX_DIM
Definition: hsa_queue_entry.hh:310
gem5::ComputeUnit::ScalarDataPort::retries
std::deque< PacketPtr > retries
Definition: compute_unit.hh:647
gem5::OutputStream::stream
std::ostream * stream() const
Get the output underlying output stream.
Definition: output.hh:62
gem5::ComputeUnit::ScalarDataPort::SenderState::_gpuDynInst
GPUDynInstPtr _gpuDynInst
Definition: compute_unit.hh:602
gem5::ComputeUnit::ComputeUnitStats::flatLDSInsts
statistics::Scalar flatLDSInsts
Definition: compute_unit.hh:1010
gem5::ComputeUnit::numScalarALUs
int numScalarALUs
Definition: compute_unit.hh:248
gem5::statistics::VectorDistribution::init
VectorDistribution & init(size_type size, Counter min, Counter max, Counter bkt)
Initialize storage and parameters for this distribution.
Definition: statistics.hh:2278
gem5::ComputeUnit::numVectorALUs
int numVectorALUs
Definition: compute_unit.hh:244
vector_register_file.hh
gem5::Packet::isRead
bool isRead() const
Definition: packet.hh:590
gem5::LocalMemPipeline::exec
void exec()
Definition: local_memory_pipeline.cc:52
gem5::ComputeUnit::startWavefront
void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, HSAQueueEntry *task, int bar_id, bool fetchContext=false)
Definition: compute_unit.cc:316
gem5::WaitClass::init
void init(ClockedObject *_clockedObject, uint64_t _numStages=0)
Definition: misc.hh:76
gem5::ComputeUnit::ComputeUnitStats::privReads
statistics::Scalar privReads
Definition: compute_unit.hh:1046
gem5::ComputeUnit::functionalTLB
bool functionalTLB
Definition: compute_unit.hh:345
gem5::ComputeUnit::numAtBarrier
int numAtBarrier(int bar_id)
Definition: compute_unit.cc:675
gem5::ComputeUnit::incNumAtBarrier
void incNumAtBarrier(int bar_id)
Definition: compute_unit.cc:668
gem5::statistics::Distribution::init
Distribution & init(Counter min, Counter max, Counter bkt)
Set the parameters of this distribution.
Definition: statistics.hh:2113
gem5::MemCmd::WriteResp
@ WriteResp
Definition: packet.hh:90
gem5::ComputeUnit::ComputeUnitStats::completedWfs
statistics::Scalar completedWfs
Definition: compute_unit.hh:1128
gem5::HSAQueueEntry::numVectorRegs
int numVectorRegs() const
Definition: hsa_queue_entry.hh:135
gem5::ComputeUnit::ComputeUnitStats::numVecOpsExecuted
statistics::Scalar numVecOpsExecuted
Definition: compute_unit.hh:1089
gem5::Named::name
virtual std::string name() const
Definition: named.hh:47
gem5::WFBarrier::InvalidID
static const int InvalidID
Definition: compute_unit.hh:97
gem5::VegaISA::p
Bitfield< 54 > p
Definition: pagetable.hh:70
gem5::ScheduleStage::init
void init()
Definition: schedule_stage.cc:76
gem5::ComputeUnit::decMaxBarrierCnt
void decMaxBarrierCnt(int bar_id)
Definition: compute_unit.cc:696
gem5::ComputeUnit::vectorSharedMemUnit
WaitClass vectorSharedMemUnit
Definition: compute_unit.hh:233
gem5::ComputeUnit::releaseWFsFromBarrier
void releaseWFsFromBarrier(int bar_id)
Definition: compute_unit.cc:711
gem5::ComputeUnit::ComputeUnitStats::activeLanesPerGMemInstrDist
statistics::Distribution activeLanesPerGMemInstrDist
Definition: compute_unit.hh:1118
DPRINTF
#define DPRINTF(x,...)
Definition: trace.hh:186
ADD_STAT
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
Definition: group.hh:75
gem5::ComputeUnit::scalarMemUnit
WaitClass scalarMemUnit
Definition: compute_unit.hh:241
gem5::Packet
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
Definition: packet.hh:291
gem5::ArmISA::d
Bitfield< 9 > d
Definition: misc_types.hh:64
gem5::ComputeUnit::execStage
ExecStage execStage
Definition: compute_unit.hh:283
gem5::ComputeUnit::ScalarDataPort::MemReqEvent::description
const char * description() const
Return a C string describing the event.
Definition: compute_unit.cc:1653
gem5::ComputeUnit::ComputeUnitStats::vALUInsts
statistics::Scalar vALUInsts
Definition: compute_unit.hh:998
gem5::probing::Packet
ProbePointArg< PacketInfo > Packet
Packet probe point.
Definition: mem.hh:109
gem5::ComputeUnit::ComputeUnitStats::instCyclesVALU
statistics::Scalar instCyclesVALU
Definition: compute_unit.hh:1002
gem5::Tick
uint64_t Tick
Tick count type.
Definition: types.hh:58
gem5::Wavefront::wfSlotId
const int wfSlotId
Definition: wavefront.hh:96
gem5::RequestPtr
std::shared_ptr< Request > RequestPtr
Definition: request.hh:92
gem5::ComputeUnit::tickEvent
EventFunctionWrapper tickEvent
Definition: compute_unit.hh:288
gem5::LocalMemPipeline::isLMRespFIFOWrRdy
bool isLMRespFIFOWrRdy() const
Definition: local_memory_pipeline.hh:68
gem5::MemCmd::ReadReq
@ ReadReq
Definition: packet.hh:86
gem5::RR
@ RR
Definition: compute_unit.hh:75
gem5::MemCmd::MemSyncReq
@ MemSyncReq
Definition: packet.hh:123
gem5::ComputeUnit::vramRequestorId
RequestorID vramRequestorId()
Forward the VRAM requestor ID needed for device memory from shader.
Definition: compute_unit.cc:2089
process.hh
gem5::ComputeUnit::globalMemoryPipe
GlobalMemPipeline globalMemoryPipe
Definition: compute_unit.hh:284
gem5::ComputeUnit::resetRegisterPool
void resetRegisterPool()
Definition: compute_unit.cc:417
gem5::GpuTranslationState::tlbEntry
Serializable * tlbEntry
Definition: gpu_translation_state.hh:73
len
uint16_t len
Definition: helpers.cc:62
gem5::ComputeUnit::registerManager
RegisterManager * registerManager
Definition: compute_unit.hh:278
gem5::ComputeUnit::ComputeUnitStats::numInstrExecuted
statistics::Scalar numInstrExecuted
Definition: compute_unit.hh:1084
gem5::ComputeUnit::ScalarDataPort::recvTimingResp
bool recvTimingResp(PacketPtr pkt) override
Receive a timing response from the peer.
Definition: compute_unit.cc:915
gem5::HSAQueueEntry::isInvDone
bool isInvDone() const
Is invalidate done?
Definition: hsa_queue_entry.hh:354
gem5::ComputeUnit::ITLBPort::recvReqRetry
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
Definition: compute_unit.cc:1847
gem5::ComputeUnit::ScalarDTLBPort::stallPort
void stallPort()
Definition: compute_unit.hh:760
gem5::GlobalMemPipeline::isGMReqFIFOWrRdy
bool isGMReqFIFOWrRdy(uint32_t pendReqs=0) const
Definition: global_memory_pipeline.hh:95
gem5::Wavefront::S_BARRIER
@ S_BARRIER
WF is stalled at a barrier.
Definition: wavefront.hh:92
gem5::ComputeUnit::DataPort::createMemReqEvent
EventFunctionWrapper * createMemReqEvent(PacketPtr pkt)
Definition: compute_unit.cc:1610
gem5::ComputeUnit::ComputeUnitStats::vectorMemInstsPerKiloInst
statistics::Formula vectorMemInstsPerKiloInst
Definition: compute_unit.hh:1023
gem5::ComputeUnit::~ComputeUnit
~ComputeUnit()
Definition: compute_unit.cc:225
scalar_register_file.hh
gpu_dyn_inst.hh
gem5::ComputeUnit::DTLBPort::SenderState::_gpuDynInst
GPUDynInstPtr _gpuDynInst
Definition: compute_unit.hh:719
gem5::HSAQueueEntry::wgSize
int wgSize(int dim) const
Definition: hsa_queue_entry.hh:121
gem5::ComputeUnit::activeWaves
int activeWaves
Definition: compute_unit.hh:992
gem5::ComputeUnit::ComputeUnitStats::numTimesWgBlockedDueVgprAlloc
statistics::Scalar numTimesWgBlockedDueVgprAlloc
Definition: compute_unit.hh:1123
gem5::RegisterManager::srfPoolMgrs
std::vector< PoolManager * > srfPoolMgrs
Definition: register_manager.hh:79
gem5::GpuTranslationState::hitLevel
int hitLevel
Definition: gpu_translation_state.hh:85
gem5::HSAQueueEntry::codeAddr
Addr codeAddr() const
Definition: hsa_queue_entry.hh:177
gem5::LdsChunk
this represents a slice of the overall LDS, intended to be associated with an individual workgroup
Definition: lds_state.hh:56
gem5::ComputeUnit::mapWaveToScalarMem
int mapWaveToScalarMem(Wavefront *w) const
Definition: compute_unit.cc:294
gpu_command_processor.hh
gem5::ComputeUnit::mapWaveToGlobalMem
int mapWaveToGlobalMem(Wavefront *w) const
Definition: compute_unit.cc:278
gem5::roundDown
static constexpr T roundDown(const T &val, const U &align)
This function is used to align addresses in memory.
Definition: intmath.hh:279
gem5::ComputeUnit::deleteFromPipeMap
void deleteFromPipeMap(Wavefront *w)
Definition: compute_unit.cc:514
gpu_translation_state.hh
gem5::ExecStage::init
void init()
Definition: exec_stage.cc:59
gem5::ComputeUnit::doFlush
void doFlush(GPUDynInstPtr gpuDynInst)
trigger flush operation in the cu
Definition: compute_unit.cc:409
gem5::ComputeUnit::DataPort::SenderState::port_index
PortID port_index
Definition: compute_unit.hh:522
gem5::ComputeUnit::init
virtual void init() override
init() is called after all C++ SimObjects have been created and all ports are connected.
Definition: compute_unit.cc:759
gem5::HSAQueueEntry::globalWgId
int globalWgId() const
Definition: hsa_queue_entry.hh:223
gem5::HSAQueueEntry::gridSize
int gridSize(int dim) const
Definition: hsa_queue_entry.hh:128
gem5::ComputeUnit::scalarALUs
std::vector< WaitClass > scalarALUs
Definition: compute_unit.hh:249
gem5::ComputeUnit::DataPort::SenderState::_gpuDynInst
GPUDynInstPtr _gpuDynInst
Definition: compute_unit.hh:521
gem5::ComputeUnit::memPort
std::vector< DataPort > memPort
The memory port for SIMD data accesses.
Definition: compute_unit.hh:896
gem5::OLDEST
@ OLDEST
Definition: compute_unit.hh:74
gem5::ComputeUnit::ComputeUnitStats::scalarMemReadsPerKiloInst
statistics::Formula scalarMemReadsPerKiloInst
Definition: compute_unit.hh:1024
gem5::X86ISA::pf
Bitfield< 2 > pf
Definition: misc.hh:550
gem5::ComputeUnit::ComputeUnitStats::vectorMemReadsPerKiloInst
statistics::Formula vectorMemReadsPerKiloInst
Definition: compute_unit.hh:1021
gem5::Packet::cmd
MemCmd cmd
The command field of the packet.
Definition: packet.hh:369
gem5::ComputeUnit::DTLBPort::SenderState::portIndex
PortID portIndex
Definition: compute_unit.hh:723
gem5::ComputeUnit::perLaneTLB
bool perLaneTLB
Definition: compute_unit.hh:329
gem5::ComputeUnit::lastMemUnit
int lastMemUnit() const
Definition: compute_unit.cc:253
gem5::LocalMemPipeline::isLMReqFIFOWrRdy
bool isLMReqFIFOWrRdy(uint32_t pendReqs=0) const
Definition: local_memory_pipeline.hh:74
gem5::ComputeUnit::ScalarDTLBPort::SenderState::_gpuDynInst
GPUDynInstPtr _gpuDynInst
Definition: compute_unit.hh:753
gem5::ComputeUnit::ComputeUnitStats::groupWrites
statistics::Scalar groupWrites
Definition: compute_unit.hh:1044
gem5::Addr
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition: types.hh:147
gem5::ComputeUnit::ComputeUnitStats::globalWrites
statistics::Scalar globalWrites
Definition: compute_unit.hh:1035
gem5::ComputeUnit::ComputeUnitStats::vALUInstsPerWF
statistics::Formula vALUInstsPerWF
Definition: compute_unit.hh:999
gem5::LdsState::increaseRefCounter
int increaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
use the dynamic wave id to create or just increase the reference count
Definition: lds_state.hh:295
tlb.hh
gem5::Packet::senderState
SenderState * senderState
This packet's sender state.
Definition: packet.hh:542
gem5::ComputeUnit::ComputeUnitStats::numTimesWgBlockedDueSgprAlloc
statistics::Scalar numTimesWgBlockedDueSgprAlloc
Definition: compute_unit.hh:1125
gem5::ComputeUnit::ComputeUnitStats::numVecOpsExecutedF16
statistics::Scalar numVecOpsExecutedF16
Definition: compute_unit.hh:1091
gem5::ComputeUnit::barrierSlot
WFBarrier & barrierSlot(int bar_id)
Definition: compute_unit.hh:418
name
const std::string & name()
Definition: trace.cc:49
gem5::ComputeUnit::exitCallback
void exitCallback()
Definition: compute_unit.cc:1996
gem5::ComputeUnit::SQCPort::recvTimingResp
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
Definition: compute_unit.cc:998
gem5::ComputeUnit::ComputeUnitStats::privMemInsts
statistics::Formula privMemInsts
Definition: compute_unit.hh:1048
gem5::ComputeUnit::mapWaveToScalarAlu
int mapWaveToScalarAlu(Wavefront *w) const
Definition: compute_unit.cc:260
gem5::GPUDynInstPtr
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition: misc.hh:49
gem5::ComputeUnit::hasDispResources
bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg)
Definition: compute_unit.cc:526
gem5::ComputeUnit::getFreeBarrierId
int getFreeBarrierId()
Definition: compute_unit.hh:425
gem5::ComputeUnit::wfSize
int wfSize() const
Definition: compute_unit.hh:394
gem5::ClockedObject
The ClockedObject class extends the SimObject with a clock and accessor functions to relate ticks to ...
Definition: clocked_object.hh:234
gem5::GpuTranslationState::saved
Packet::SenderState * saved
Definition: gpu_translation_state.hh:86
gem5::ComputeUnit::pipeMap
std::unordered_set< uint64_t > pipeMap
Definition: compute_unit.hh:276
gem5::ComputeUnit::LDSPort::recvReqRetry
virtual void recvReqRetry()
the bus is telling the port that there is now space so retrying stalled requests should work now this...
Definition: compute_unit.cc:2162
gem5::MemCmd::toString
const std::string & toString() const
Return the string to a cmd given by idx.
Definition: packet.hh:273
gem5::Shader::timingSim
bool timingSim
Definition: shader.hh:192
gem5::Process
Definition: process.hh:68
gem5::GPUDispatcher::notifyWgCompl
void notifyWgCompl(Wavefront *wf)
When an end program instruction detects that the last WF in a WG has completed it will call this meth...
Definition: dispatcher.cc:295
gem5::EventFunctionWrapper
Definition: eventq.hh:1115
gem5::ThreadContext::getProcessPtr
virtual Process * getProcessPtr()=0
gem5::Clocked::nextCycle
Tick nextCycle() const
Based on the clock of the object, determine the start tick of the first cycle that is at least one cy...
Definition: clocked_object.hh:213
gem5::FullSystem
bool FullSystem
The FullSystem variable can be used to determine the current mode of simulation.
Definition: root.cc:220
gem5::ComputeUnit::updateInstStats
void updateInstStats(GPUDynInstPtr gpuDynInst)
Definition: compute_unit.cc:1877
gem5::ComputeUnit::ComputeUnitStats::numALUInstsExecuted
statistics::Formula numALUInstsExecuted
Definition: compute_unit.hh:1121
gem5::ComputeUnit::ComputeUnitStats::instCyclesLdsPerSimd
statistics::Vector instCyclesLdsPerSimd
Definition: compute_unit.hh:1032
gem5::ComputeUnit::ComputeUnitStats::argReads
statistics::Scalar argReads
Definition: compute_unit.hh:1037
gem5::ComputeUnit::ComputeUnitStats::globalMemInsts
statistics::Formula globalMemInsts
Definition: compute_unit.hh:1036
gem5::ComputeUnit::ComputeUnitStats::wgBlockedDueLdsAllocation
statistics::Scalar wgBlockedDueLdsAllocation
Definition: compute_unit.hh:1080
gem5::ComputeUnit::LDSPort::recvTimingResp
virtual bool recvTimingResp(PacketPtr pkt)
get the result of packets sent to the LDS when they return
Definition: compute_unit.cc:2098
panic_if
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Definition: logging.hh:204
gem5::ComputeUnit::numVectorGlobalMemUnits
int numVectorGlobalMemUnits
Definition: compute_unit.hh:219
gem5::Request::FLUSH_L2
@ FLUSH_L2
Definition: request.hh:329
gem5::Wavefront::barrierId
void barrierId(int bar_id)
Definition: wavefront.cc:1414
gem5::ComputeUnit::Params
ComputeUnitParams Params
Definition: compute_unit.hh:290
gem5::Wavefront::S_RETURNING
@ S_RETURNING
Definition: wavefront.hh:68
gem5::ComputeUnit::ComputeUnitStats::ipc
statistics::Formula ipc
Definition: compute_unit.hh:1116
gem5::RegisterManager::allocateRegisters
void allocateRegisters(Wavefront *w, int vectorDemand, int scalarDemand)
Definition: register_manager.cc:122
gem5::ComputeUnit::updatePageDivergenceDist
void updatePageDivergenceDist(Addr addr)
Definition: compute_unit.cc:1985
gem5::ComputeUnit::vectorRegsReserved
std::vector< int > vectorRegsReserved
Definition: compute_unit.hh:367
gem5::ComputeUnit::ComputeUnitStats::readonlyWrites
statistics::Scalar readonlyWrites
Definition: compute_unit.hh:1050
gem5::Shader::dispatcher
GPUDispatcher & dispatcher()
Definition: shader.cc:99
gem5::ComputeUnit::ComputeUnitStats::waveLevelParallelism
statistics::Distribution waveLevelParallelism
Definition: compute_unit.hh:1056
gem5::ComputeUnit::ComputeUnitStats::scalarMemWrites
statistics::Scalar scalarMemWrites
Definition: compute_unit.hh:1016
gem5::ComputeUnit::ComputeUnitStats::controlFlowDivergenceDist
statistics::Distribution controlFlowDivergenceDist
Definition: compute_unit.hh:1117
gem5::ScheduleStage::exec
void exec()
Definition: schedule_stage.cc:90
gem5::ComputeUnit::ComputeUnitStats::vectorMemWrites
statistics::Scalar vectorMemWrites
Definition: compute_unit.hh:1012
gem5::ComputeUnit::insertInPipeMap
void insertInPipeMap(Wavefront *w)
Definition: compute_unit.cc:505
gem5::ComputeUnit::ScalarDTLBPort::SenderState
Definition: compute_unit.hh:750
gem5::statistics::oneline
const FlagsType oneline
Print all values on a single line.
Definition: info.hh:72
gem5::GpuTranslationState::tlbMode
BaseMMU::Mode tlbMode
Definition: gpu_translation_state.hh:61
gem5::GPUDispatcher::updateInvCounter
void updateInvCounter(int kern_id, int val=-1)
update the counter of oustanding inv requests for the kernel kern_id: kernel id val: +1/-1,...
Definition: dispatcher.cc:246
gem5::ComputeUnit::mapWaveToLocalMem
int mapWaveToLocalMem(Wavefront *w) const
Definition: compute_unit.cc:286
gem5::ComputeUnit::ldsPort
LDSPort ldsPort
The port to access the Local Data Store Can be connected to a LDS object.
Definition: compute_unit.hh:885
gem5::MemCmd::ReadResp
@ ReadResp
Definition: packet.hh:87
gem5::ComputeUnit::ComputeUnitStats::flatLDSInstsPerWF
statistics::Formula flatLDSInstsPerWF
Definition: compute_unit.hh:1011
gem5::WFBarrier
WF barrier slots.
Definition: compute_unit.hh:90
gem5::ComputeUnit::isDone
bool isDone() const
Definition: compute_unit.cc:2013
gem5::ComputeUnit::LDSPort::SenderState::getMemInst
GPUDynInstPtr getMemInst() const
Definition: compute_unit.hh:847
gem5::ComputeUnit::ComputeUnitStats::hitsPerTLBLevel
statistics::Vector hitsPerTLBLevel
Definition: compute_unit.hh:1065
gem5::Shader::gpuCmdProc
GPUCommandProcessor & gpuCmdProc
Definition: shader.hh:227
gem5::ComputeUnit::maxBarrierCnt
int maxBarrierCnt(int bar_id)
Definition: compute_unit.cc:682
gem5::Shader::n_wf
int n_wf
Definition: shader.hh:206
gem5::ComputeUnit::scalarRegsReserved
std::vector< int > scalarRegsReserved
Definition: compute_unit.hh:369
gem5::ComputeUnit::fillKernelState
void fillKernelState(Wavefront *w, HSAQueueEntry *task)
Definition: compute_unit.cc:302
gem5::MemCmd::WriteReq
@ WriteReq
Definition: packet.hh:89
gem5::ComputeUnit::lds
LdsState & lds
Definition: compute_unit.hh:471
gem5::ComputeUnit::DTLBPort::SenderState
SenderState is information carried along with the packet throughout the TLB hierarchy.
Definition: compute_unit.hh:716
gem5::ComputeUnit::vrfToLocalMemPipeBus
WaitClass vrfToLocalMemPipeBus
Definition: compute_unit.hh:231
gem5::statistics::Group
Statistics container.
Definition: group.hh:93
gem5::ComputeUnit::ComputeUnitStats::execRateDist
statistics::Distribution execRateDist
Definition: compute_unit.hh:1087
gem5::ComputeUnit::tlbPort
std::vector< DTLBPort > tlbPort
Definition: compute_unit.hh:898
gem5::ComputeUnit::ComputeUnitStats::numVecOpsExecutedF32
statistics::Scalar numVecOpsExecutedF32
Definition: compute_unit.hh:1093
gem5::ComputeUnit::isVectorAluIdle
bool isVectorAluIdle(uint32_t simdId) const
Definition: compute_unit.cc:2050
gem5::ComputeUnit::numScalarRegsPerSimd
int numScalarRegsPerSimd
Definition: compute_unit.hh:373
gem5::ComputeUnit::vectorALUs
std::vector< WaitClass > vectorALUs
Definition: compute_unit.hh:245
gem5::ComputeUnit::sendScalarRequest
void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt)
Definition: compute_unit.cc:1233
gem5::ComputeUnit::countPages
bool countPages
Definition: compute_unit.hh:351
gem5::ComputeUnit::freeBarrierIds
std::unordered_set< int > freeBarrierIds
A set used to easily retrieve a free barrier ID.
Definition: compute_unit.hh:983
sc_core::SC_NONE
@ SC_NONE
Definition: sc_report.hh:50
gem5::Wavefront::instructionBuffer
std::deque< GPUDynInstPtr > instructionBuffer
Definition: wavefront.hh:109
gem5::ComputeUnit::ComputeUnitStats::ComputeUnitStats
ComputeUnitStats(statistics::Group *parent, int n_wf)
Definition: compute_unit.cc:2193
gem5::RegisterManager::canAllocateSgprs
bool canAllocateSgprs(int simdId, int nWfs, int demandPerWf)
Definition: register_manager.cc:115
gem5::ComputeUnit::scalarMemToSrfBus
WaitClass scalarMemToSrfBus
Definition: compute_unit.hh:237
gem5::MipsISA::k
Bitfield< 23 > k
Definition: dt_constants.hh:81
gem5::ComputeUnit::scalarDTLBPort
ScalarDTLBPort scalarDTLBPort
Definition: compute_unit.hh:902
gem5::ComputeUnit::ComputeUnitStats::pageDivergenceDist
statistics::Distribution pageDivergenceDist
Definition: compute_unit.hh:1072
gem5::Shader::max_valu_insts
int64_t max_valu_insts
Definition: shader.hh:231
gem5::RequestorID
uint16_t RequestorID
Definition: request.hh:95
gem5::ExecStage::exec
void exec()
Definition: exec_stage.cc:152
gem5::GPUDispatcher
Definition: dispatcher.hh:62
dispatcher.hh
DPRINTFN
#define DPRINTFN(...)
Definition: trace.hh:214
gem5::ComputeUnit::ScalarDataPort::MemReqEvent
Definition: compute_unit.hh:606
gem5::statistics::DataWrap::flags
Derived & flags(Flags _flags)
Set the flags and marks this stat to print at the end of simulation.
Definition: statistics.hh:358
gem5::MipsISA::vaddr
vaddr
Definition: pra_constants.hh:278
gem5::Request::INV_L1
@ INV_L1
Definition: request.hh:324
gem5::ComputeUnit::ComputeUnitStats::argWrites
statistics::Scalar argWrites
Definition: compute_unit.hh:1038
gem5::ComputeUnit::ComputeUnitStats::vpc_f32
statistics::Formula vpc_f32
Definition: compute_unit.hh:1114
gem5::HSAQueueEntry::ldsSize
int ldsSize() const
Definition: hsa_queue_entry.hh:189
gem5::Packet::getAddr
Addr getAddr() const
Definition: packet.hh:790
gem5::EventBase::CPU_Tick_Pri
static const Priority CPU_Tick_Pri
CPU ticks must come after other associated CPU events (such as writebacks).
Definition: eventq.hh:204
gem5::RegisterManager::canAllocateVgprs
bool canAllocateVgprs(int simdId, int nWfs, int demandPerWf)
Definition: register_manager.cc:109
gem5::ComputeUnit::sendToLds
bool sendToLds(GPUDynInstPtr gpuDynInst)
send a general request to the LDS make sure to look at the return value here as your request might be...
Definition: compute_unit.cc:2069
gem5::X86ISA::PageBytes
const Addr PageBytes
Definition: page_size.hh:49
gem5::registerExitCallback
void registerExitCallback(const std::function< void()> &callback)
Register an exit callback.
Definition: core.cc:146
gem5::Wavefront::getStatus
status_e getStatus()
Definition: wavefront.hh:137
gem5::ComputeUnit::ComputeUnitStats::dynamicLMemInstrCnt
statistics::Scalar dynamicLMemInstrCnt
Definition: compute_unit.hh:1077
gem5::ComputeUnit::scalarMemoryPipe
ScalarMemPipeline scalarMemoryPipe
Definition: compute_unit.hh:286
fatal_if
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
Definition: logging.hh:226
page_table.hh
gem5
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
Definition: gpu_translation_state.hh:37
gem5::ComputeUnit::DataPort::recvTimingResp
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
Definition: compute_unit.cc:808
gem5::ComputeUnit::vectorGlobalMemUnit
WaitClass vectorGlobalMemUnit
Definition: compute_unit.hh:225
gem5::MemCmd::MemSyncResp
@ MemSyncResp
Definition: packet.hh:124
gem5::ComputeUnit::ScalarDataPort::SystemHubEvent
Definition: compute_unit.hh:623
gem5::ScoreboardCheckStage::exec
void exec()
Definition: scoreboard_check_stage.cc:248
gem5::ComputeUnit::ComputeUnitStats::readonlyMemInsts
statistics::Formula readonlyMemInsts
Definition: compute_unit.hh:1051
gem5::ComputeUnit::ScalarDataPort::recvReqRetry
void recvReqRetry() override
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on the peer) and was unsuccessful.
Definition: compute_unit.cc:959
gem5::ComputeUnit::ComputeUnitStats::scalarMemReads
statistics::Scalar scalarMemReads
Definition: compute_unit.hh:1018
gem5::ComputeUnit::ComputeUnitStats::totalCycles
statistics::Scalar totalCycles
Definition: compute_unit.hh:1111
gem5::statistics::VectorBase::init
Derived & init(size_type size)
Set this vector to have the given size.
Definition: statistics.hh:1040
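Both init() and flags() (listed above) return the stat itself, so stats-group constructors conventionally chain them; a hedged sketch (stat name and size illustrative):

    // Hypothetical per-SIMD vector stat, hidden when all entries are zero.
    myPerSimdCycles
        .init(numVectorALUs)
        .flags(statistics::nozero);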
gem5::ComputeUnit::dispWorkgroup
void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg)
Definition: compute_unit.cc:427
gem5::HSAQueueEntry::dispatchId
int dispatchId() const
Definition: hsa_queue_entry.hh:153
gem5::ArmISA::stride
Bitfield< 21, 20 > stride
Definition: misc_types.hh:447
gem5::ComputeUnit::ComputeUnitStats::tlbCycles
statistics::Scalar tlbCycles
Definition: compute_unit.hh:1061
gem5::ComputeUnit::mapWaveToScalarAluGlobalIdx
int mapWaveToScalarAluGlobalIdx(Wavefront *w) const
Definition: compute_unit.cc:271
gem5::ComputeUnit::gmTokenPort
GMTokenPort gmTokenPort
Definition: compute_unit.hh:508
gem5::LdsState::reserveSpace
LdsChunk * reserveSpace(const uint32_t dispatchId, const uint32_t wgId, const uint32_t size)
Assign a parent and request that this amount of space be set aside for this wgid.
Definition: lds_state.hh:363
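A dispatch path might carve out a workgroup's LDS share like this, using the dispatchId() and ldsSize() accessors listed elsewhere on this page (globalWgId() and the error check are assumptions):

    // Reserve the workgroup's LDS allocation (illustrative call site).
    LdsChunk *ldsChunk = lds.reserveSpace(task->dispatchId(),
                                          task->globalWgId(),
                                          task->ldsSize());
    fatal_if(!ldsChunk, "could not reserve LDS space for workgroup");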
gem5::Shader::total_valu_insts
int64_t total_valu_insts
Definition: shader.hh:232
gem5::ComputeUnit::DataPort::recvReqRetry
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on the peer) and was unsuccessful.
Definition: compute_unit.cc:971
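Both recvReqRetry() overrides follow the standard gem5 retry idiom: packets that failed sendTimingReq() are queued and re-sent when the peer signals a retry; a generic sketch (port and queue names illustrative):

    // Re-send deferred packets until one is refused again (illustrative).
    void MyReqPort::recvReqRetry()
    {
        while (!retries.empty()) {
            PacketPtr pkt = retries.front();
            if (!sendTimingReq(pkt))
                break;              // still blocked; wait for next retry
            retries.pop_front();    // accepted; drop it from the queue
        }
    }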
gem5::ComputeUnit::doInvalidate
void doInvalidate(RequestPtr req, int kernId)
Trigger an invalidate operation in the CU.
Definition: compute_unit.cc:390
gem5::ComputeUnit::ComputeUnitStats::spillReads
statistics::Scalar spillReads
Definition: compute_unit.hh:1040
gem5::WaitClass::rdy
bool rdy(Cycles cycles=Cycles(0)) const
Definition: misc.hh:93
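The optional Cycles argument asks whether the resource will be free that many cycles from now; a hedged sketch using the glbMemToVrfBus member listed below:

    // Issue only if the GM-to-VRF bus is free one cycle out (illustrative).
    if (glbMemToVrfBus.rdy(Cycles(1))) {
        // ... schedule the VRF writeback
    }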
gem5::ComputeUnit::allAtBarrier
bool allAtBarrier(int bar_id)
Definition: compute_unit.cc:661
gem5::ComputeUnit::numWfsToSched
std::vector< int > numWfsToSched
Number of WFs to schedule to each SIMD.
Definition: compute_unit.hh:364
gem5::ComputeUnit::ComputeUnitStats::ldsNoFlatInstsPerWF
statistics::Formula ldsNoFlatInstsPerWF
Definition: compute_unit.hh:1007
gem5::GlobalMemPipeline::exec
void exec()
Definition: global_memory_pipeline.cc:111
gem5::KernelLaunchStaticInst
Definition: gpu_static_inst.hh:325
gem5::ComputeUnit::DataPort::SenderState
Definition: compute_unit.hh:519
gem5::Packet::getSize
unsigned getSize() const
Definition: packet.hh:800
gem5::Event::scheduled
bool scheduled() const
Determine if the current event is scheduled.
Definition: eventq.hh:465
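scheduled() enables the common gem5 guard against double-scheduling an event; a sketch inside a ClockedObject (event name illustrative):

    // Kick the pipeline only if a tick is not already pending.
    if (!myTickEvent.scheduled())
        schedule(myTickEvent, nextCycle());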
gem5::ComputeUnit::ScalarDataPort::SenderState
Definition: compute_unit.hh:594
gem5::GlobalMemPipeline::handleResponse
void handleResponse(GPUDynInstPtr gpuDynInst)
This method handles responses sent to this GM pipeline by the CU.
Definition: global_memory_pipeline.cc:305
gem5::ComputeUnit::scheduleStage
ScheduleStage scheduleStage
Definition: compute_unit.hh:282
gem5::Wavefront::dropFetch
bool dropFetch
Definition: wavefront.hh:112
gem5::Shader::impl_kern_end_rel
int impl_kern_end_rel
Definition: shader.hh:198
panic
#define panic(...)
This implements a cprintf-based panic() function.
Definition: logging.hh:178
gem5::GPUCommandProcessor::driver
GPUComputeDriver * driver()
Definition: gpu_command_processor.cc:303
gem5::ComputeUnit::DataPort::createMemRespEvent
EventFunctionWrapper * createMemRespEvent(PacketPtr pkt)
Definition: compute_unit.cc:1618
gem5::Clocked::clockPeriod
Tick clockPeriod() const
Definition: clocked_object.hh:217
gem5::ComputeUnit::requestorId
RequestorID requestorId()
Definition: compute_unit.hh:460
gem5::X86ISA::addr
Bitfield< 3 > addr
Definition: types.hh:84
gem5::ComputeUnit::ComputeUnitStats::scalarMemInstsPerKiloInst
statistics::Formula scalarMemInstsPerKiloInst
Definition: compute_unit.hh:1026
gem5::ComputeUnit::ComputeUnitStats::vectorMemWritesPerKiloInst
statistics::Formula vectorMemWritesPerKiloInst
Definition: compute_unit.hh:1022
gem5::SenderState
RubyTester::SenderState SenderState
Definition: Check.cc:40
gem5::ComputeUnit::numExeUnits
int numExeUnits() const
Definition: compute_unit.cc:238
gem5::Packet::getPtr
T * getPtr()
Get a pointer to the packet's data.
Definition: packet.hh:1197
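Together with getAddr() and getSize() listed above, this supports inspecting a response in a handler; a minimal sketch (variable names illustrative):

    // Examine a returned packet's address, length, and payload.
    Addr addr = pkt->getAddr();
    unsigned size = pkt->getSize();
    uint8_t *data = pkt->getPtr<uint8_t>();   // typed view of the payload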
gem5::ComputeUnit::glbMemToVrfBus
WaitClass glbMemToVrfBus
Definition: compute_unit.hh:221
gem5::Wavefront::simdId
const int simdId
Definition: wavefront.hh:99
gem5::MipsISA::vpc
Bitfield< 1 > vpc
Definition: mt_constants.hh:44

Generated on Wed Jul 13 2022 10:39:21 for gem5 by doxygen 1.8.17