compute_unit.cc
1 /*
2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from this
17  * software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include "gpu-compute/compute_unit.hh"
33 
34 #include <limits>
35 
36 #include "arch/amdgpu/common/gpu_translation_state.hh"
37 #include "arch/x86/page_size.hh"
38 #include "base/output.hh"
38 #include "base/output.hh"
39 #include "debug/GPUDisp.hh"
40 #include "debug/GPUExec.hh"
41 #include "debug/GPUFetch.hh"
42 #include "debug/GPUMem.hh"
43 #include "debug/GPUPort.hh"
44 #include "debug/GPUPrefetch.hh"
45 #include "debug/GPUReg.hh"
46 #include "debug/GPURename.hh"
47 #include "debug/GPUSync.hh"
48 #include "debug/GPUTLB.hh"
49 #include "gpu-compute/dispatcher.hh"
50 #include "gpu-compute/gpu_command_processor.hh"
51 #include "gpu-compute/gpu_dyn_inst.hh"
52 #include "gpu-compute/gpu_static_inst.hh"
53 #include "gpu-compute/scalar_register_file.hh"
54 #include "gpu-compute/shader.hh"
55 #include "gpu-compute/simple_pool_manager.hh"
56 #include "gpu-compute/vector_register_file.hh"
57 #include "gpu-compute/wavefront.hh"
58 #include "mem/page_table.hh"
59 #include "sim/process.hh"
60 #include "sim/sim_exit.hh"
61 
62 namespace gem5
63 {
64 
65 ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p),
66  numVectorGlobalMemUnits(p.num_global_mem_pipes),
67  numVectorSharedMemUnits(p.num_shared_mem_pipes),
68  numScalarMemUnits(p.num_scalar_mem_pipes),
69  numVectorALUs(p.num_SIMDs),
70  numScalarALUs(p.num_scalar_cores),
71  vrfToCoalescerBusWidth(p.vrf_to_coalescer_bus_width),
72  coalescerToVrfBusWidth(p.coalescer_to_vrf_bus_width),
73  registerManager(p.register_manager),
74  fetchStage(p, *this),
75  scoreboardCheckStage(p, *this, scoreboardCheckToSchedule),
76  scheduleStage(p, *this, scoreboardCheckToSchedule, scheduleToExecute),
77  execStage(p, *this, scheduleToExecute),
78  globalMemoryPipe(p, *this),
79  localMemoryPipe(p, *this),
80  scalarMemoryPipe(p, *this),
81  tickEvent([this]{ exec(); }, "Compute unit tick event",
82  false, Event::CPU_Tick_Pri),
83  cu_id(p.cu_id),
84  vrf(p.vector_register_file), srf(p.scalar_register_file),
85  simdWidth(p.simd_width),
86  spBypassPipeLength(p.spbypass_pipe_length),
87  dpBypassPipeLength(p.dpbypass_pipe_length),
88  scalarPipeStages(p.scalar_pipe_length),
89  operandNetworkLength(p.operand_network_length),
90  issuePeriod(p.issue_period),
91  vrf_gm_bus_latency(p.vrf_gm_bus_latency),
92  srf_scm_bus_latency(p.srf_scm_bus_latency),
93  vrf_lm_bus_latency(p.vrf_lm_bus_latency),
94  perLaneTLB(p.perLaneTLB), prefetchDepth(p.prefetch_depth),
95  prefetchStride(p.prefetch_stride), prefetchType(p.prefetch_prev_type),
96  debugSegFault(p.debugSegFault),
97  functionalTLB(p.functionalTLB), localMemBarrier(p.localMemBarrier),
98  countPages(p.countPages),
99  req_tick_latency(p.mem_req_latency * p.clk_domain->clockPeriod()),
100  resp_tick_latency(p.mem_resp_latency * p.clk_domain->clockPeriod()),
101  scalar_req_tick_latency(
102  p.scalar_mem_req_latency * p.clk_domain->clockPeriod()),
103  scalar_resp_tick_latency(
104  p.scalar_mem_resp_latency * p.clk_domain->clockPeriod()),
105  _requestorId(p.system->getRequestorId(this, "ComputeUnit")),
106  lds(*p.localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
107  ldsPort(csprintf("%s-port", name()), this),
108  scalarDataPort(csprintf("%s-port", name()), this),
109  scalarDTLBPort(csprintf("%s-port", name()), this),
110  sqcPort(csprintf("%s-port", name()), this),
111  sqcTLBPort(csprintf("%s-port", name()), this),
112  _cacheLineSize(p.system->cacheLineSize()),
113  _numBarrierSlots(p.num_barrier_slots),
114  globalSeqNum(0), wavefrontSize(p.wf_size),
115  scoreboardCheckToSchedule(p),
116  scheduleToExecute(p),
117  stats(this, p.n_wf)
118 {
119  // This is not currently supported and would require adding more handling
120  // for system vs. device memory requests on the functional paths, so we
121  // fatal immediately in the constructor if this configuration is seen.
122  fatal_if(functionalTLB && FullSystem,
123  "Functional TLB not supported in full-system GPU simulation");
124 
125  /**
126  * This check is necessary because std::bitset only provides conversion
127  * to unsigned long or unsigned long long via to_ulong() or to_ullong().
128  * There are a few places in the code where to_ullong() is used, and if
129  * the wavefront size is larger than the host can support, bitset will
130  * throw a runtime exception. Until all use of to_ulong()/to_ullong()
131  * is removed, wavefront sizes greater than 64 lanes cannot be
132  * supported, so this check is required.
133  */
134  fatal_if(p.wf_size > std::numeric_limits<unsigned long long>::digits ||
135  p.wf_size <= 0,
136  "WF size is larger than the host can support");
137  fatal_if(!isPowerOf2(wavefrontSize),
138  "Wavefront size should be a power of 2");
139  // calculate how many cycles a vector load or store will need to transfer
140  // its data over the corresponding buses
141  numCyclesPerStoreTransfer =
142  (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
143  (double)vrfToCoalescerBusWidth);
144 
145  numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
146  / coalescerToVrfBusWidth;
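 // Worked example (hypothetical configuration): with wf_size = 64 lanes
 // of 4-byte registers, a wavefront moves 256 bytes per vector access.
 // A 32-byte vrf_to_coalescer_bus_width then gives ceil(256 / 32) = 8
 // cycles per store transfer, and a 32-byte coalescer_to_vrf_bus_width
 // gives 256 / 32 = 8 cycles per load transfer (note the load path
 // truncates rather than rounding up).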
147 
148  // Initialization: all WF slots are assumed STOPPED
149  idleWfs = p.n_wf * numVectorALUs;
150  lastVaddrWF.resize(numVectorALUs);
151  wfList.resize(numVectorALUs);
152 
153  wfBarrierSlots.resize(p.num_barrier_slots, WFBarrier());
154 
155  for (int i = 0; i < p.num_barrier_slots; ++i) {
156  freeBarrierIds.insert(i);
157  }
158 
159  for (int j = 0; j < numVectorALUs; ++j) {
160  lastVaddrWF[j].resize(p.n_wf);
161 
162  for (int i = 0; i < p.n_wf; ++i) {
163  lastVaddrWF[j][i].resize(wfSize());
164 
165  wfList[j].push_back(p.wavefronts[j * p.n_wf + i]);
166  wfList[j][i]->setParent(this);
167 
168  for (int k = 0; k < wfSize(); ++k) {
169  lastVaddrWF[j][i][k] = 0;
170  }
171  }
172  }
173 
174  lastVaddrSimd.resize(numVectorALUs);
175 
176  for (int i = 0; i < numVectorALUs; ++i) {
177  lastVaddrSimd[i].resize(wfSize(), 0);
178  }
179 
180  lastVaddrCU.resize(wfSize());
181 
182  lds.setParent(this);
183 
184  if (p.execPolicy == "OLDEST-FIRST") {
185  exec_policy = EXEC_POLICY::OLDEST;
186  } else if (p.execPolicy == "ROUND-ROBIN") {
187  exec_policy = EXEC_POLICY::RR;
188  } else {
189  fatal("Invalid WF execution policy (CU)\n");
190  }
191 
192  for (int i = 0; i < p.port_memory_port_connection_count; ++i) {
193  memPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
194  }
195 
196  for (int i = 0; i < p.port_translation_port_connection_count; ++i) {
197  tlbPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
198  }
199 
200  // Setup tokens for response ports. The number of tokens in memPortTokens
201  // is the total token count for the entire vector port (i.e., this CU).
202  memPortTokens = new TokenManager(p.max_cu_tokens);
203 
204  registerExitCallback([this]() { exitCallback(); });
205 
206  lastExecCycle.resize(numVectorALUs, 0);
207 
208  for (int i = 0; i < vrf.size(); ++i) {
209  vrf[i]->setParent(this);
210  }
211  for (int i = 0; i < srf.size(); ++i) {
212  srf[i]->setParent(this);
213  }
214  numVecRegsPerSimd = vrf[0]->numRegs();
215  numScalarRegsPerSimd = srf[0]->numRegs();
216 
217  registerManager->setParent(this);
218 
219  activeWaves = 0;
220 
221  instExecPerSimd.resize(numVectorALUs, 0);
222 
223  // Calculate the number of bits to address a cache line
224  panic_if(!isPowerOf2(_cacheLineSize),
225  "Cache line size should be a power of two.");
226  cacheLineBits = floorLog2(_cacheLineSize);
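 // e.g., for a typical 64-byte cache line, cacheLineBits =
 // floorLog2(64) = 6: the low 6 bits of an address select a byte
 // within the line.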
227 }
228 
229 ComputeUnit::~ComputeUnit()
230 {
231  // Delete wavefront slots
232  for (int j = 0; j < numVectorALUs; ++j) {
233  for (int i = 0; i < shader->n_wf; ++i) {
234  delete wfList[j][i];
235  }
236  lastVaddrSimd[j].clear();
237  }
238  lastVaddrCU.clear();
239 }
240 
241 int
242 ComputeUnit::numExeUnits() const
243 {
244  return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits +
245  numVectorSharedMemUnits + numScalarMemUnits;
246 }
247 
248 // index into readyList of the first memory unit
249 int
250 ComputeUnit::firstMemUnit() const
251 {
252  return numVectorALUs + numScalarALUs;
253 }
254 
255 // index into readyList of the last memory unit
256 int
257 ComputeUnit::lastMemUnit() const
258 {
259  return numExeUnits() - 1;
260 }
261 
262 // index into scalarALUs vector of SALU used by the wavefront
263 int
264 ComputeUnit::mapWaveToScalarAlu(Wavefront *w) const
265 {
266  if (numScalarALUs == 1) {
267  return 0;
268  } else {
269  return w->simdId % numScalarALUs;
270  }
271 }
272 
273 // index into readyList of Scalar ALU unit used by wavefront
274 int
275 ComputeUnit::mapWaveToScalarAluGlobalIdx(Wavefront *w) const
276 {
277  return mapWaveToScalarAlu(w) + numVectorALUs;
278 }
279 
280 // index into readyList of Global Memory unit used by wavefront
281 int
282 ComputeUnit::mapWaveToGlobalMem(Wavefront *w) const
283 {
284  // TODO: FIXME if more than 1 GM pipe supported
285  return numVectorALUs + numScalarALUs;
286 }
287 
288 // index into readyList of Local Memory unit used by wavefront
289 int
290 ComputeUnit::mapWaveToLocalMem(Wavefront *w) const
291 {
292  // TODO: FIXME if more than 1 LM pipe supported
293  return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits;
294 }
295 
296 // index into readyList of Scalar Memory unit used by wavefront
297 int
298 ComputeUnit::mapWaveToScalarMem(Wavefront *w) const
299 {
300  // TODO: FIXME if more than 1 ScM pipe supported
301  return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits +
302  numVectorSharedMemUnits;
303 }
304 
305 void
306 ComputeUnit::fillKernelState(Wavefront *w, HSAQueueEntry *task)
307 {
308  w->resizeRegFiles(task->numVectorRegs(), task->numScalarRegs());
309  w->workGroupSz[0] = task->wgSize(0);
310  w->workGroupSz[1] = task->wgSize(1);
311  w->workGroupSz[2] = task->wgSize(2);
312  w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
313  w->gridSz[0] = task->gridSize(0);
314  w->gridSz[1] = task->gridSize(1);
315  w->gridSz[2] = task->gridSize(2);
316  w->computeActualWgSz(task);
317 }
318 
319 void
320 ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
321  HSAQueueEntry *task, int bar_id, bool fetchContext)
322 {
323  static int _n_wave = 0;
324 
325  VectorMask init_mask;
326  init_mask.reset();
327 
328  for (int k = 0; k < wfSize(); ++k) {
329  if (k + waveId * wfSize() < w->actualWgSzTotal)
330  init_mask[k] = 1;
331  }
332 
333  w->execMask() = init_mask;
334 
335  w->kernId = task->dispatchId();
336  w->wfId = waveId;
337  w->initMask = init_mask.to_ullong();
338 
339  if (bar_id > WFBarrier::InvalidID) {
340  w->barrierId(bar_id);
341  } else {
342  assert(!w->hasBarrier());
343  }
344 
345  for (int k = 0; k < wfSize(); ++k) {
346  w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
347  w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
348  w->actualWgSz[1];
349  w->workItemId[2][k] = (k + waveId * wfSize()) /
350  (w->actualWgSz[0] * w->actualWgSz[1]);
351 
352  w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
353  w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
354  w->workItemId[0][k];
355  }
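 // The three IDs above are the row-major decomposition of the lane's
 // flat work-item ID. For example (hypothetical sizes), with
 // wfSize() = 64 and actualWgSz = {16, 4, 4}, lane k = 5 of wave
 // waveId = 1 has flat ID 5 + 1 * 64 = 69, giving
 // workItemId[0] = 69 % 16 = 5, workItemId[1] = (69 / 16) % 4 = 0, and
 // workItemId[2] = 69 / (16 * 4) = 1; workItemFlatId recomputes
 // 1 * 16 * 4 + 0 * 16 + 5 = 69.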
356 
357  // WG state
358  w->wgId = task->globalWgId();
359  w->dispatchId = task->dispatchId();
360  w->workGroupId[0] = w->wgId % task->numWg(0);
361  w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1);
362  w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1));
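 // The same row-major unflattening applies to the WG ID: with
 // numWg = {8, 4, 2} (hypothetical) and wgId = 21, workGroupId =
 // {21 % 8, (21 / 8) % 4, 21 / 32} = {5, 2, 0}, which re-flattens to
 // 5 + 2 * 8 + 0 * 32 = 21.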
363 
364  // set the wavefront context to have a pointer to this section of the LDS
365  w->ldsChunk = ldsChunk;
366 
367  [[maybe_unused]] int32_t refCount =
368  lds.increaseRefCounter(w->dispatchId, w->wgId);
369  DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
370  cu_id, w->wgId, refCount);
371 
372  w->instructionBuffer.clear();
373 
374  if (w->pendingFetch)
375  w->dropFetch = true;
376 
377  DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
378  "WF[%d][%d]. Ref cnt:%d\n", _n_wave, w->barrierId(), cu_id,
379  w->simdId, w->wfSlotId, refCount);
380 
381  w->initRegState(task, w->actualWgSzTotal);
382  w->start(_n_wave++, task->codeAddr());
383 
384  stats.waveLevelParallelism.sample(activeWaves);
385  activeWaves++;
386 }
387 
388 /**
389  * trigger invalidate operation in the CU
390  *
391  * req: request initialized in shader, carrying the invalidate flags
392  */
393 void
394 ComputeUnit::doInvalidate(RequestPtr req, int kernId){
395  GPUDynInstPtr gpuDynInst
396  = std::make_shared<GPUDynInst>(this, nullptr,
397  new KernelLaunchStaticInst(), getAndIncSeqNum());
398 
399  // kern_id will be used in inv responses
400  gpuDynInst->kern_id = kernId;
401  // update contextId field
402  req->setContext(gpuDynInst->wfDynId);
403 
404  injectGlobalMemFence(gpuDynInst, true, req);
405 }
406 
407 /**
408  * trigger flush operation in the CU
409  *
410  * gpuDynInst: inst passed to the request
411  */
412 void
413 ComputeUnit::doFlush(GPUDynInstPtr gpuDynInst) {
414  injectGlobalMemFence(gpuDynInst, true);
415 }
416 
417 // resetting the SIMD register pools
418 // there is no obvious better place for this, and the register
419 // allocation implementation requires it
420 void
421 ComputeUnit::resetRegisterPool()
422 {
423  for (int i=0; i<numVectorALUs; i++)
424  {
425  registerManager->vrfPoolMgrs[i]->resetRegion(numVecRegsPerSimd);
426  registerManager->srfPoolMgrs[i]->resetRegion(numScalarRegsPerSimd);
427  }
428 }
429 
430 void
431 ComputeUnit::dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg)
432 {
433  // If we aren't ticking, start it up!
434  if (!tickEvent.scheduled()) {
435  DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id);
436  schedule(tickEvent, nextCycle());
437  }
438 
439  // the kernel's invalidate must have finished before any wg dispatch
440  assert(task->isInvDone());
441 
442  // reserve the LDS capacity allocated to the work group
443  // disambiguated by the dispatch ID and workgroup ID, which should be
444  // globally unique
445  LdsChunk *ldsChunk = lds.reserveSpace(task->dispatchId(),
446  task->globalWgId(),
447  task->ldsSize());
448 
449  panic_if(!ldsChunk, "was not able to reserve space for this WG");
450 
451  // calculate the number of 32-bit vector registers required
452  // by each work item
453  int vregDemand = task->numVectorRegs();
454  int sregDemand = task->numScalarRegs();
455  int wave_id = 0;
456 
457  int barrier_id = WFBarrier::InvalidID;
458 
459  // A WG with a single WF never waits at a barrier, so a barrier
460  // slot is only allocated when the WG has more than one WF;
461  // hasDispResources() has already guaranteed that a free slot
462  // exists in that case.
463  if (num_wfs_in_wg > 1) {
464  /**
465  * Claim a free barrier slot for this WG and set the number of WFs
466  * that must arrive before the barrier releases. The slot must be
467  * completely unused when handed out, as the asserts verify.
468  */
469  barrier_id = getFreeBarrierId();
469  auto &wf_barrier = barrierSlot(barrier_id);
470  assert(!wf_barrier.maxBarrierCnt());
471  assert(!wf_barrier.numAtBarrier());
472  wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);
473 
474  DPRINTF(GPUSync, "CU[%d] - Dispatching WG with barrier Id%d. "
475  "%d waves using this barrier.\n", cu_id, barrier_id,
476  num_wfs_in_wg);
477  }
478 
479  // Assign WFs according to numWfsToSched vector, which is computed by
480  // hasDispResources()
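 // Note that the WF slot index (j) is the outer loop and the SIMD
 // index (i) is the inner loop, so successive waves of the WG are
 // spread breadth-first across the SIMDs (subject to each SIMD's
 // numWfsToSched budget) rather than filling one SIMD's slots first.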
481  for (int j = 0; j < shader->n_wf; ++j) {
482  for (int i = 0; i < numVectorALUs; ++i) {
483  Wavefront *w = wfList[i][j];
484  // Check if this wavefront slot is available and there are WFs
485  // remaining to be dispatched to current SIMD:
486  // WF slot must be stopped and not waiting
487  // for a release to complete S_RETURNING
488  if (w->getStatus() == Wavefront::S_STOPPED &&
489  numWfsToSched[i] > 0) {
490  // decrement number of WFs awaiting dispatch to current SIMD
491  numWfsToSched[i] -= 1;
492 
493  fillKernelState(w, task);
494 
495  DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] "
496  "vregDemand[%d] sregDemand[%d]\n", i, j, w->wfDynId,
497  vregDemand, sregDemand);
498 
499  registerManager->allocateRegisters(w, vregDemand, sregDemand);
500 
501  startWavefront(w, wave_id, ldsChunk, task, barrier_id);
502  ++wave_id;
503  }
504  }
505  }
506 }
507 
508 void
509 ComputeUnit::insertInPipeMap(Wavefront *w)
510 {
511  panic_if(w->instructionBuffer.empty(),
512  "Instruction Buffer of WF%d can't be empty", w->wgId);
513  GPUDynInstPtr ii = w->instructionBuffer.front();
514  pipeMap.emplace(ii->seqNum());
515 }
516 
517 void
518 ComputeUnit::deleteFromPipeMap(Wavefront *w)
519 {
520  panic_if(w->instructionBuffer.empty(),
521  "Instruction Buffer of WF%d can't be empty", w->wgId);
522  GPUDynInstPtr ii = w->instructionBuffer.front();
523  // delete the dynamic instruction from the pipeline map
524  auto it = pipeMap.find(ii->seqNum());
525  panic_if(it == pipeMap.end(), "Pipeline Map is empty\n");
526  pipeMap.erase(it);
527 }
528 
529 bool
530 ComputeUnit::hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg)
531 {
532  // compute true size of workgroup (after clamping to grid size)
533  int trueWgSize[HSAQueueEntry::MAX_DIM];
534  int trueWgSizeTotal = 1;
535 
536  for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
537  trueWgSize[d] = std::min(task->wgSize(d), task->gridSize(d) -
538  task->wgId(d) * task->wgSize(d));
539 
540  trueWgSizeTotal *= trueWgSize[d];
541  DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
542  }
543 
544  DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
545 
546  // calculate the number of WFs in this WG
547  int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
548  num_wfs_in_wg = numWfs;
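 // Ceiling division: e.g., a WG of 300 work items with a 64-wide
 // wavefront yields (300 + 63) / 64 = 5 WFs, the last of which has
 // only 300 - 4 * 64 = 44 active lanes in its init mask.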
549 
550  bool barrier_avail = true;
551 
552  if (numWfs > 1 && !freeBarrierIds.size()) {
553  barrier_avail = false;
554  }
555 
556  // calculate the number of 32-bit vector registers required by each
557  // work item of the work group
558  int vregDemandPerWI = task->numVectorRegs();
559  // calculate the number of 32-bit scalar registers required by each
560  // work item of the work group
561  int sregDemandPerWI = task->numScalarRegs();
562 
563  // check if the total number of VGPRs and SGPRs required by all WFs
564  // of the WG fit in the VRFs of all SIMD units and the CU's SRF
565  panic_if((numWfs * vregDemandPerWI) > (numVectorALUs * numVecRegsPerSimd),
566  "WG with %d WFs and %d VGPRs per WI can not be allocated to CU "
567  "that has %d VGPRs\n",
568  numWfs, vregDemandPerWI, numVectorALUs * numVecRegsPerSimd);
569  panic_if((numWfs * sregDemandPerWI) > numScalarRegsPerSimd,
570  "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU "
571  "with %d SGPRs\n",
572  numWfs, sregDemandPerWI, numScalarRegsPerSimd);
573 
574  // number of WF slots that are not occupied
575  int freeWfSlots = 0;
576  // number of Wfs from WG that were successfully mapped to a SIMD
577  int numMappedWfs = 0;
578  numWfsToSched.clear();
579  numWfsToSched.resize(numVectorALUs, 0);
580 
581  // attempt to map WFs to the SIMDs, based on WF slot availability
582  // and register file availability
583  for (int j = 0; j < shader->n_wf; ++j) {
584  for (int i = 0; i < numVectorALUs; ++i) {
585  if (wfList[i][j]->getStatus() == Wavefront::S_STOPPED) {
586  ++freeWfSlots;
587  // check if current WF will fit onto current SIMD/VRF
588  // if all WFs have not yet been mapped to the SIMDs
589  if (numMappedWfs < numWfs &&
590  registerManager->canAllocateSgprs(i, numWfsToSched[i] + 1,
591  sregDemandPerWI) &&
592  registerManager->canAllocateVgprs(i, numWfsToSched[i] + 1,
593  vregDemandPerWI)) {
594  numWfsToSched[i]++;
595  numMappedWfs++;
596  }
597  }
598  }
599  }
600 
601  // check that the number of mapped WFs is not greater
602  // than the actual number of WFs
603  assert(numMappedWfs <= numWfs);
604 
605  bool vregAvail = true;
606  bool sregAvail = true;
607  // if a WF to SIMD mapping was not found, find the limiting resource
608  if (numMappedWfs < numWfs) {
609 
610  for (int j = 0; j < numVectorALUs; ++j) {
611  // find if there are enough free VGPRs in the SIMD's VRF
612  // to accommodate the WFs of the new WG that would be mapped
613  // to this SIMD unit
614  vregAvail &= registerManager->
615  canAllocateVgprs(j, numWfsToSched[j], vregDemandPerWI);
616  // find if there are enough free SGPRs in the SIMD's SRF
617  // to accommodate the WFs of the new WG that would be mapped
618  // to this SIMD unit
619  sregAvail &= registerManager->
620  canAllocateSgprs(j, numWfsToSched[j], sregDemandPerWI);
621  }
622  }
623 
624  DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, \
625  VGPR Availability = %d, SGPR Availability = %d\n",
626  freeWfSlots, numMappedWfs, vregAvail, sregAvail);
627 
628  if (!vregAvail) {
629  stats.numTimesWgBlockedDueVgprAlloc++;
630  }
631 
632  if (!sregAvail) {
633  stats.numTimesWgBlockedDueSgprAlloc++;
634  }
635 
636  // Return true if enough WF slots to submit workgroup and if there are
637  // enough VGPRs to schedule all WFs to their SIMD units
638  bool ldsAvail = lds.canReserve(task->ldsSize());
639  if (!ldsAvail) {
640  stats.wgBlockedDueLdsAllocation++;
641  }
642 
643  if (!barrier_avail) {
644  stats.wgBlockedDueBarrierAllocation++;
645  }
646 
647  // Return true if the following are all true:
648  // (a) all WFs of the WG were mapped to free WF slots
649  // (b) there are enough VGPRs to schedule all WFs to their SIMD units
650  // (c) there are enough SGPRs on the CU to schedule all WFs
651  // (d) there is enough space in LDS to allocate for all WFs
652  bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
653  && ldsAvail && barrier_avail;
654  return can_dispatch;
655 }
656 
657 int
658 ComputeUnit::numYetToReachBarrier(int bar_id)
659 {
660  auto &wf_barrier = barrierSlot(bar_id);
661  return wf_barrier.numYetToReachBarrier();
662 }
663 
664 bool
665 ComputeUnit::allAtBarrier(int bar_id)
666 {
667  auto &wf_barrier = barrierSlot(bar_id);
668  return wf_barrier.allAtBarrier();
669 }
670 
671 void
672 ComputeUnit::incNumAtBarrier(int bar_id)
673 {
674  auto &wf_barrier = barrierSlot(bar_id);
675  wf_barrier.incNumAtBarrier();
676 }
677 
678 int
679 ComputeUnit::numAtBarrier(int bar_id)
680 {
681  auto &wf_barrier = barrierSlot(bar_id);
682  return wf_barrier.numAtBarrier();
683 }
684 
685 int
686 ComputeUnit::maxBarrierCnt(int bar_id)
687 {
688  auto &wf_barrier = barrierSlot(bar_id);
689  return wf_barrier.maxBarrierCnt();
690 }
691 
692 void
693 ComputeUnit::resetBarrier(int bar_id)
694 {
695  auto &wf_barrier = barrierSlot(bar_id);
696  wf_barrier.reset();
697 }
698 
699 void
700 ComputeUnit::decMaxBarrierCnt(int bar_id)
701 {
702  auto &wf_barrier = barrierSlot(bar_id);
703  wf_barrier.decMaxBarrierCnt();
704 }
705 
706 void
707 ComputeUnit::releaseBarrier(int bar_id)
708 {
709  auto &wf_barrier = barrierSlot(bar_id);
710  wf_barrier.release();
711  freeBarrierIds.insert(bar_id);
712 }
713 
714 void
715 ComputeUnit::releaseWFsFromBarrier(int bar_id)
716 {
717  for (int i = 0; i < numVectorALUs; ++i) {
718  for (int j = 0; j < shader->n_wf; ++j) {
719  Wavefront *wf = wfList[i][j];
720  if (wf->barrierId() == bar_id) {
721  assert(wf->getStatus() == Wavefront::S_BARRIER);
722  wf->setStatus(Wavefront::S_RUNNING);
723  }
724  }
725  }
726 }
727 
728 // Execute one clock worth of work on the ComputeUnit.
729 void
730 ComputeUnit::exec()
731 {
732  // process reads and writes in the RFs
733  for (auto &vecRegFile : vrf) {
734  vecRegFile->exec();
735  }
736 
737  for (auto &scRegFile : srf) {
738  scRegFile->exec();
739  }
740 
741  // Execute pipeline stages in reverse order to simulate
742  // the pipeline latency
743  scalarMemoryPipe.exec();
744  globalMemoryPipe.exec();
745  localMemoryPipe.exec();
746  execStage.exec();
747  scheduleStage.exec();
748  scoreboardCheckStage.exec();
749  fetchStage.exec();
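 // Running the stages back-to-front within a single tick lets each
 // stage consume the state its upstream stage produced on the
 // *previous* cycle, modeling pipeline latency without
 // double-buffering the inter-stage communication structures.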
750 
751  stats.totalCycles++;
752 
753  // Put this CU to sleep if there is no more work to be done.
754  if (!isDone()) {
755  schedule(tickEvent, nextCycle());
756  } else {
757  shader->notifyCuSleep();
758  DPRINTF(GPUDisp, "CU%d: Going to sleep\n", cu_id);
759  }
760 }
761 
762 void
763 ComputeUnit::init()
764 {
765  // Initialize CU Bus models and execution resources
766 
767  // Vector ALUs
768  vectorALUs.clear();
769  for (int i = 0; i < numVectorALUs; i++) {
770  vectorALUs.emplace_back(this, clockPeriod());
771  }
772 
773  // Scalar ALUs
774  scalarALUs.clear();
775  for (int i = 0; i < numScalarALUs; i++) {
776  scalarALUs.emplace_back(this, clockPeriod());
777  }
778 
779  // Vector Global Memory
780  fatal_if(numVectorGlobalMemUnits > 1,
781  "No support for multiple Global Memory Pipelines exists!!!");
782  vectorGlobalMemUnit.init(this, clockPeriod());
783  vrfToGlobalMemPipeBus.init(this, clockPeriod());
784  glbMemToVrfBus.init(this, clockPeriod());
785 
786  // Vector Local/Shared Memory
787  fatal_if(numVectorSharedMemUnits > 1,
788  "No support for multiple Local Memory Pipelines exists!!!");
789  vectorSharedMemUnit.init(this, clockPeriod());
790  vrfToLocalMemPipeBus.init(this, clockPeriod());
791  locMemToVrfBus.init(this, clockPeriod());
792 
793  // Scalar Memory
794  fatal_if(numScalarMemUnits > 1,
795  "No support for multiple Scalar Memory Pipelines exists!!!");
796  scalarMemUnit.init(this, clockPeriod());
797  srfToScalarMemPipeBus.init(this, clockPeriod());
798  scalarMemToSrfBus.init(this, clockPeriod());
799 
800  vectorRegsReserved.resize(numVectorALUs, 0);
801  scalarRegsReserved.resize(numVectorALUs, 0);
802 
803  fetchStage.init();
804  scheduleStage.init();
805  execStage.init();
806  globalMemoryPipe.init();
807 
808  gmTokenPort.setTokenManager(memPortTokens);
809 }
810 
811 bool
812 ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
813 {
814  return handleResponse(pkt);
815 }
816 
817 bool
818 ComputeUnit::DataPort::handleResponse(PacketPtr pkt)
819 {
820  // Ruby has completed the memory op. Schedule the mem_resp_event at the
821  // appropriate cycle to process the timing memory response
822  // This delay represents the pipeline delay
823  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
824  PortID index = sender_state->port_index;
825  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
826  GPUDispatcher &dispatcher = computeUnit->shader->dispatcher();
827 
828  // MemSyncResp + WriteAckResp are handled completely here and we don't
829  // schedule a MemRespEvent to process the responses further
830  if (pkt->cmd == MemCmd::MemSyncResp) {
831  // This response is for 1 of the following request types:
832  // - kernel launch
833  // - kernel end
834  // - non-kernel mem sync
835 
836  // Kernel Launch
837  // wavefront was nullptr when launching kernel, so it is meaningless
838  // here (simdId=-1, wfSlotId=-1)
839  if (gpuDynInst->isKernelLaunch()) {
840  // for kernel launch, the original request must be both kernel-type
841  // and INV_L1
842  assert(pkt->req->isKernel());
843  assert(pkt->req->isInvL1());
844 
845  // one D-Cache inv is done, decrement counter
846  dispatcher.updateInvCounter(gpuDynInst->kern_id);
847 
848  delete pkt->senderState;
849  delete pkt;
850  return true;
851  }
852 
853  // retrieve wavefront from inst
854  Wavefront *w = gpuDynInst->wavefront();
855 
856  // Check if we are waiting on Kernel End Flush
857  if (w->getStatus() == Wavefront::S_RETURNING
858  && gpuDynInst->isEndOfKernel()) {
859  // for kernel end, the original request must be both kernel-type
860  // and last-level GPU cache should be flushed if it contains
861  // dirty data. This request may have been quiesced and
862  // immediately responded to if the GL2 is a write-through /
863  // read-only cache.
864  assert(pkt->req->isKernel());
865  assert(pkt->req->isGL2CacheFlush());
866 
867  // once flush done, decrement counter, and return whether all
868  // dirty writeback operations are done for the kernel
869  bool isWbDone = dispatcher.updateWbCounter(gpuDynInst->kern_id);
870 
871  // not all wbs are done for the kernel, just release pkt
872  // resources
873  if (!isWbDone) {
874  delete pkt->senderState;
875  delete pkt;
876  return true;
877  }
878 
879  // all wbs are completed for the kernel, do retirement work
880  // for the workgroup
881  DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
882  computeUnit->cu_id, w->simdId, w->wfSlotId,
883  w->wfDynId, w->wgId);
884 
885  dispatcher.notifyWgCompl(w);
886  w->setStatus(Wavefront::S_STOPPED);
887  }
888 
889  if (!pkt->req->isKernel()) {
890  w = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
891  DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing "
892  "outstanding reqs %d => %d\n", gpuDynInst->simdId,
893  gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
894  gpuDynInst->disassemble(), w->outstandingReqs,
895  w->outstandingReqs - 1);
896  computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
897  }
898 
899  delete pkt->senderState;
900  delete pkt;
901  return true;
902  }
903 
904  EventFunctionWrapper *mem_resp_event =
905  computeUnit->memPort[index].createMemRespEvent(pkt);
906 
907  DPRINTF(GPUPort,
908  "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
909  computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
910  gpuDynInst->seqNum(), index, pkt->req->getPaddr());
911 
912  computeUnit->schedule(mem_resp_event,
913  curTick() + computeUnit->resp_tick_latency);
914 
915  return true;
916 }
917 
918 bool
919 ComputeUnit::ScalarDataPort::recvTimingResp(PacketPtr pkt)
920 {
921  return handleResponse(pkt);
922 }
923 
924 bool
925 ComputeUnit::ScalarDataPort::handleResponse(PacketPtr pkt)
926 {
927  assert(!pkt->req->isKernel());
928 
929  // retrieve sender state
930  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
931  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
932 
933  assert(pkt->isRead() || pkt->isWrite());
934  assert(gpuDynInst->numScalarReqs > 0);
935 
936  gpuDynInst->numScalarReqs--;
937 
938  /**
939  * The instruction is not passed on to the scalar memory pipeline
940  * until its last outstanding scalar request has returned, i.e.,
941  * numScalarReqs reaches zero. Loads and atomics are then pushed to
942  * the load-response FIFO and stores to the store-response FIFO,
943  * where the pipeline completes the instruction.
944  */
945 
946  if (!gpuDynInst->numScalarReqs) {
947  if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
948  computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
949  gpuDynInst);
950  } else {
951  computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
952  gpuDynInst);
953  }
954  }
955 
956  delete pkt->senderState;
957  delete pkt;
958 
959  return true;
960 }
961 
962 void
963 ComputeUnit::ScalarDataPort::recvReqRetry()
964 {
965  for (const auto &pkt : retries) {
966  if (!sendTimingReq(pkt)) {
967  break;
968  } else {
969  retries.pop_front();
970  }
971  }
972 }
973 
974 void
975 ComputeUnit::DataPort::recvReqRetry()
976 {
977  int len = retries.size();
978 
979  assert(len > 0);
980 
981  for (int i = 0; i < len; ++i) {
982  PacketPtr pkt = retries.front().first;
983  [[maybe_unused]] GPUDynInstPtr gpuDynInst = retries.front().second;
984  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
985  computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
986  pkt->req->getPaddr());
987 
991  if (!sendTimingReq(pkt)) {
992  DPRINTF(GPUMem, "failed again!\n");
993  break;
994  } else {
995  DPRINTF(GPUMem, "successful!\n");
996  retries.pop_front();
997  }
998  }
999 }
1000 
1001 bool
1002 ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
1003 {
1004  computeUnit->handleSQCReturn(pkt);
1005 
1006  return true;
1007 }
1008 
1009 void
1010 ComputeUnit::handleSQCReturn(PacketPtr pkt)
1011 {
1012  fetchStage.processFetchReturn(pkt);
1013 }
1014 
1015 void
1016 ComputeUnit::SQCPort::recvReqRetry()
1017 {
1018  int len = retries.size();
1019 
1020  assert(len > 0);
1021 
1022  for (int i = 0; i < len; ++i) {
1023  PacketPtr pkt = retries.front().first;
1024  [[maybe_unused]] Wavefront *wavefront = retries.front().second;
1025  DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
1026  computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
1027  pkt->req->getPaddr());
1028  if (!sendTimingReq(pkt)) {
1029  DPRINTF(GPUFetch, "failed again!\n");
1030  break;
1031  } else {
1032  DPRINTF(GPUFetch, "successful!\n");
1033  retries.pop_front();
1034  }
1035  }
1036 }
1037 
1038 void
1039 ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
1040 {
1041  // There must be a way around this check to do the globalMemStart...
1042  Addr tmp_vaddr = pkt->req->getVaddr();
1043 
1044  updatePageDivergenceDist(tmp_vaddr);
1045 
1046  // set PC in request
1047  pkt->req->setPC(gpuDynInst->wavefront()->pc());
1048 
1049  pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());
1050 
1051  // figure out the type of the request to set read/write
1052  BaseMMU::Mode TLB_mode;
1053  assert(pkt->isRead() || pkt->isWrite());
1054 
1055  // only do some things if actually accessing data
1056  bool isDataAccess = pkt->isWrite() || pkt->isRead();
1057 
1058  // For dGPUs, real hardware will extract MTYPE from the PTE. SE mode
1059  // uses x86 pagetables which don't have fields to track GPU MTYPEs.
1060  // Rather than hacking up the pagetable to add these bits in, we just
1061  // keep a structure local to our GPUs that are populated in our
1062  // emulated driver whenever memory is allocated. Consult that structure
1063  // here in case we need a memtype override.
1064  //
1065  // In full system mode these can be extracted from the PTE and assigned
1066  // after address translation takes place.
1067  if (!FullSystem) {
1068  shader->gpuCmdProc.driver()->setMtype(pkt->req);
1069  }
1070 
1071  // Check write before read for atomic operations
1072  // since atomic operations should use BaseMMU::Write
1073  if (pkt->isWrite()) {
1074  TLB_mode = BaseMMU::Write;
1075  } else if (pkt->isRead()) {
1076  TLB_mode = BaseMMU::Read;
1077  } else {
1078  fatal("pkt is not a read nor a write\n");
1079  }
1080 
1081  if (!functionalTLB) {
1082  stats.tlbCycles -= curTick();
1083  }
1084  ++stats.tlbRequests;
1085 
1086  PortID tlbPort_index = perLaneTLB ? index : 0;
1087 
1088  if (shader->timingSim) {
1089  if (!FullSystem && debugSegFault) {
1090  Process *p = shader->gpuTc->getProcessPtr();
1091  Addr vaddr = pkt->req->getVaddr();
1092  unsigned size = pkt->getSize();
1093 
1094  if ((vaddr + size - 1) % 64 < vaddr % 64) {
1095  panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
1096  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
1097  }
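 // The modular check above flags accesses that straddle a 64-byte
 // boundary: e.g., vaddr = 0x7C with size = 8 gives
 // (0x7C + 7) % 64 = 3 < 0x7C % 64 = 60, i.e., the access spills
 // into the next 64-byte block.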
1098 
1099  Addr paddr;
1100 
1101  if (!p->pTable->translate(vaddr, paddr)) {
1102  if (!p->fixupFault(vaddr)) {
1103  panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
1104  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1105  vaddr);
1106  }
1107  }
1108  }
1109 
1110  // This is the SenderState needed upon return
1111  pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);
1112 
1113  // This is the senderState needed by the TLB hierarchy to function
1114  GpuTranslationState *translation_state =
1115  new GpuTranslationState(TLB_mode, shader->gpuTc, false,
1116  pkt->senderState);
1117 
1118  pkt->senderState = translation_state;
1119 
1120  if (functionalTLB) {
1121  tlbPort[tlbPort_index].sendFunctional(pkt);
1122 
1123  // update the hitLevel distribution
1124  int hit_level = translation_state->hitLevel;
1125  assert(hit_level != -1);
1126  stats.hitsPerTLBLevel[hit_level]++;
1127 
1128  // New SenderState for the memory access
1129  GpuTranslationState *sender_state =
1130  safe_cast<GpuTranslationState*>(pkt->senderState);
1131 
1132  delete sender_state->tlbEntry;
1133  delete sender_state->saved;
1134  delete sender_state;
1135 
1136  assert(pkt->req->hasPaddr());
1137  assert(pkt->req->hasSize());
1138 
1139  // this is necessary because the GPU TLB receives packets instead
1140  // of requests. when the translation is complete, all relevant
1141  // fields in the request will be populated, but not in the packet.
1142  // here we create the new packet so we can set the size, addr,
1143  // and proper flags.
1144  PacketPtr oldPkt = pkt;
1145  pkt = new Packet(oldPkt->req, oldPkt->cmd);
1146  if (isDataAccess) {
1147  uint8_t *tmpData = oldPkt->getPtr<uint8_t>();
1148  pkt->dataStatic(tmpData);
1149  }
1150  delete oldPkt;
1151 
1152 
1153  // New SenderState for the memory access
1154  pkt->senderState =
1155  new ComputeUnit::DataPort::SenderState(gpuDynInst, index,
1156  nullptr);
1157 
1158  gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
1159  gpuDynInst->tlbHitLevel[index] = hit_level;
1160 
1161  // translation is done. Schedule the mem_req_event at the
1162  // appropriate cycle to send the timing memory request to ruby
1163  EventFunctionWrapper *mem_req_event =
1164  memPort[index].createMemReqEvent(pkt);
1165 
1166  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
1167  "scheduled\n", cu_id, gpuDynInst->simdId,
1168  gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
1169 
1170  schedule(mem_req_event, curTick() + req_tick_latency);
1171  } else if (tlbPort[tlbPort_index].isStalled()) {
1172  assert(tlbPort[tlbPort_index].retries.size() > 0);
1173 
1174  DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
1175  "failed!\n", cu_id, gpuDynInst->simdId,
1176  gpuDynInst->wfSlotId, tmp_vaddr);
1177 
1178  tlbPort[tlbPort_index].retries.push_back(pkt);
1179  } else if (!tlbPort[tlbPort_index].sendTimingReq(pkt)) {
1180  // Stall the data port;
1181  // No more packet will be issued till
1182  // ruby indicates resources are freed by
1183  // a recvReqRetry() call back on this port.
1184  tlbPort[tlbPort_index].stallPort();
1185 
1186  DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
1187  "failed!\n", cu_id, gpuDynInst->simdId,
1188  gpuDynInst->wfSlotId, tmp_vaddr);
1189 
1190  tlbPort[tlbPort_index].retries.push_back(pkt);
1191  } else {
1192  DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x from "
1193  "instruction %s sent!\n", cu_id, gpuDynInst->simdId,
1194  gpuDynInst->wfSlotId, tmp_vaddr,
1195  gpuDynInst->disassemble().c_str());
1196  }
1197  } else {
1198  if (pkt->cmd == MemCmd::MemSyncReq) {
1199  gpuDynInst->resetEntireStatusVector();
1200  } else {
1201  gpuDynInst->decrementStatusVector(index);
1202  }
1203 
1204  // New SenderState for the memory access
1205  delete pkt->senderState;
1206 
1207  // Because it's atomic operation, only need TLB translation state
1208  pkt->senderState = new GpuTranslationState(TLB_mode,
1209  shader->gpuTc);
1210 
1211  tlbPort[tlbPort_index].sendFunctional(pkt);
1212 
1213  // the addr of the packet is not modified, so we need to create a new
1214  // packet, or otherwise the memory access will have the old virtual
1215  // address sent in the translation packet, instead of the physical
1216  // address returned by the translation.
1217  PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
1218  new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1219 
1220  // Translation is done. It is safe to send the packet to memory.
1221  memPort[0].sendFunctional(new_pkt);
1222 
1223  DPRINTF(GPUMem, "Functional sendRequest\n");
1224  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
1225  gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
1226  new_pkt->req->getPaddr());
1227 
1228  // safe_cast the senderState
1229  GpuTranslationState *sender_state =
1230  safe_cast<GpuTranslationState*>(pkt->senderState);
1231 
1232  delete sender_state->tlbEntry;
1233  delete new_pkt;
1234  delete pkt->senderState;
1235  delete pkt;
1236  }
1237 }
1238 
1239 void
1240 ComputeUnit::sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt)
1241 {
1242  assert(pkt->isWrite() || pkt->isRead());
1243 
1244  BaseMMU::Mode tlb_mode = pkt->isRead() ? BaseMMU::Read : BaseMMU::Write;
1245 
1246  pkt->senderState =
1247  new ComputeUnit::ScalarDTLBPort::SenderState(gpuDynInst);
1248 
1249  pkt->senderState =
1250  new GpuTranslationState(tlb_mode, shader->gpuTc, false,
1251  pkt->senderState);
1252 
1253  if (scalarDTLBPort.isStalled()) {
1254  assert(scalarDTLBPort.retries.size());
1255  scalarDTLBPort.retries.push_back(pkt);
1256  } else if (!scalarDTLBPort.sendTimingReq(pkt)) {
1257  scalarDTLBPort.stallPort();
1258  scalarDTLBPort.retries.push_back(pkt);
1259  } else {
1260  DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n",
1261  tlb_mode == BaseMMU::Read ? "read" : "write",
1262  pkt->req->getVaddr());
1263  }
1264 }
1265 
1266 void
1267 ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
1268  bool kernelMemSync,
1269  RequestPtr req)
1270 {
1271  assert(gpuDynInst->isGlobalSeg() ||
1272  gpuDynInst->executedAs() == enums::SC_GLOBAL);
1273 
1274  // Fences will never be issued to system memory, so we can mark the
1275  // requestor as a device memory ID here.
1276  if (!req) {
1277  req = std::make_shared<Request>(
1278  0, 0, 0, vramRequestorId(), 0, gpuDynInst->wfDynId);
1279  } else {
1280  req->requestorId(vramRequestorId());
1281  }
1282 
1283  // all mem sync requests have Paddr == 0
1284  req->setPaddr(0);
1285 
1286  PacketPtr pkt = nullptr;
1287 
1288  if (kernelMemSync) {
1289  if (gpuDynInst->isKernelLaunch()) {
1290  req->setCacheCoherenceFlags(Request::INV_L1);
1291  req->setReqInstSeqNum(gpuDynInst->seqNum());
1292  req->setFlags(Request::KERNEL);
1293  pkt = new Packet(req, MemCmd::MemSyncReq);
1294  pkt->pushSenderState(
1295  new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1296 
1297  EventFunctionWrapper *mem_req_event =
1298  memPort[0].createMemReqEvent(pkt);
1299 
1300  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1301  "an acquire\n", cu_id, gpuDynInst->simdId,
1302  gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
1303 
1304  schedule(mem_req_event, curTick() + req_tick_latency);
1305  } else {
1306  // kernel end flush of GL2 cache may be quiesced by Ruby if the
1307  // GL2 is a read-only cache
1308  assert(shader->impl_kern_end_rel);
1309  assert(gpuDynInst->isEndOfKernel());
1310 
1311  req->setCacheCoherenceFlags(Request::FLUSH_L2);
1312  req->setReqInstSeqNum(gpuDynInst->seqNum());
1313  req->setFlags(Request::KERNEL);
1314  pkt = new Packet(req, MemCmd::MemSyncReq);
1315  pkt->pushSenderState(
1316  new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1317 
1318  EventFunctionWrapper *mem_req_event =
1319  memPort[0].createMemReqEvent(pkt);
1320 
1321  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1322  "a release\n", cu_id, gpuDynInst->simdId,
1323  gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
1324 
1325  schedule(mem_req_event, curTick() + req_tick_latency);
1326  }
1327  } else {
1328  gpuDynInst->setRequestFlags(req);
1329 
1330  req->setReqInstSeqNum(gpuDynInst->seqNum());
1331 
1332  pkt = new Packet(req, MemCmd::MemSyncReq);
1333  pkt->pushSenderState(
1334  new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1335 
1336  EventFunctionWrapper *mem_req_event =
1337  memPort[0].createMemReqEvent(pkt);
1338 
1339  DPRINTF(GPUPort,
1340  "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
1341  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
1342  pkt->req->getPaddr());
1343 
1344  schedule(mem_req_event, curTick() + req_tick_latency);
1345  }
1346 }
1347 
1348 void
1349 ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
1350 {
1351  DataPort::SenderState *sender_state =
1352  safe_cast<DataPort::SenderState*>(pkt->senderState);
1353 
1354  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1355  ComputeUnit *compute_unit = computeUnit;
1356 
1357  assert(gpuDynInst);
1358 
1359  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
1360  compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1361  pkt->req->getPaddr(), id);
1362 
1363  Addr paddr = pkt->req->getPaddr();
1364 
1365  // mem sync resp callback must be handled already in
1366  // DataPort::recvTimingResp
1367  assert(pkt->cmd != MemCmd::MemSyncResp);
1368 
1369  // The status vector and global memory response for WriteResp packets get
1370  // handled by the WriteCompleteResp packets.
1371  if (pkt->cmd == MemCmd::WriteResp) {
1372  if (!FullSystem || !pkt->req->systemReq()) {
1373  delete pkt;
1374  return;
1375  }
1376  }
1377 
1378  // this is for read, write and atomic
1379  int index = gpuDynInst->memStatusVector[paddr].back();
1380 
1381  DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
1382  pkt->req->getPaddr(), id);
1383 
1384  gpuDynInst->memStatusVector[paddr].pop_back();
1385  gpuDynInst->pAddr = pkt->req->getPaddr();
1386 
1387  gpuDynInst->decrementStatusVector(index);
1388  DPRINTF(GPUMem, "bitvector is now %s\n", gpuDynInst->printStatusVector());
1389 
1390  if (gpuDynInst->allLanesZero()) {
1391  auto iter = gpuDynInst->memStatusVector.begin();
1392  auto end = gpuDynInst->memStatusVector.end();
1393 
1394  while (iter != end) {
1395  assert(iter->second.empty());
1396  ++iter;
1397  }
1398 
1399  // Calculate the difference between the arrival of the first cache
1400  // block and the last cache block to arrive if we have the time
1401  // for the first cache block.
1402  if (compute_unit->headTailMap.count(gpuDynInst)) {
1403  Tick headTick = compute_unit->headTailMap.at(gpuDynInst);
1404  compute_unit->stats.headTailLatency.sample(curTick() - headTick);
1405  compute_unit->headTailMap.erase(gpuDynInst);
1406  }
1407 
1408  gpuDynInst->memStatusVector.clear();
1409 
1410  gpuDynInst->
1411  profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue);
1412  compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
1413 
1414  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
1415  compute_unit->cu_id, gpuDynInst->simdId,
1416  gpuDynInst->wfSlotId);
1417  } else {
1418  if (pkt->isRead()) {
1419  if (!compute_unit->headTailMap.count(gpuDynInst)) {
1420  compute_unit->headTailMap
1421  .insert(std::make_pair(gpuDynInst, curTick()));
1422  }
1423  }
1424  }
1425 
1426  delete pkt->senderState;
1427  delete pkt;
1428 }
1429 
1430 bool
1431 ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
1432 {
1433  Addr line = pkt->req->getPaddr();
1434 
1435  DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
1436  pkt->req->getVaddr(), line);
1437 
1438  assert(pkt->senderState);
1439  computeUnit->stats.tlbCycles += curTick();
1440 
1441  // pop off the TLB translation state
1442  GpuTranslationState *translation_state =
1443  safe_cast<GpuTranslationState*>(pkt->senderState);
1444 
1445  // no PageFaults are permitted for data accesses
1446  if (!translation_state->tlbEntry) {
1447  DTLBPort::SenderState *sender_state =
1448  safe_cast<DTLBPort::SenderState*>(translation_state->saved);
1449 
1450  [[maybe_unused]] Wavefront *w =
1451  computeUnit->wfList[sender_state->_gpuDynInst->simdId]
1452  [sender_state->_gpuDynInst->wfSlotId];
1453 
1454  DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
1455  pkt->req->getVaddr());
1456  }
1457 
1458  // update the hitLevel distribution
1459  int hit_level = translation_state->hitLevel;
1460  computeUnit->stats.hitsPerTLBLevel[hit_level]++;
1461 
1462  delete translation_state->tlbEntry;
1463  assert(!translation_state->ports.size());
1464  pkt->senderState = translation_state->saved;
1465 
1466  // for prefetch pkt
1467  BaseMMU::Mode TLB_mode = translation_state->tlbMode;
1468 
1469  delete translation_state;
1470 
1471  // use the original sender state to know how to close this transaction
1472  DTLBPort::SenderState *sender_state =
1473  safe_cast<DTLBPort::SenderState*>(pkt->senderState);
1474 
1475  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1476  PortID mp_index = sender_state->portIndex;
1477  Addr vaddr = pkt->req->getVaddr();
1478  gpuDynInst->memStatusVector[line].push_back(mp_index);
1479  gpuDynInst->tlbHitLevel[mp_index] = hit_level;
1480 
1481  MemCmd requestCmd;
1482 
1483  if (pkt->cmd == MemCmd::ReadResp) {
1484  requestCmd = MemCmd::ReadReq;
1485  } else if (pkt->cmd == MemCmd::WriteResp) {
1486  requestCmd = MemCmd::WriteReq;
1487  } else if (pkt->cmd == MemCmd::SwapResp) {
1488  requestCmd = MemCmd::SwapReq;
1489  } else {
1490  panic("unsupported response to request conversion %s\n",
1491  pkt->cmd.toString());
1492  }
1493 
1494  if (computeUnit->prefetchDepth) {
1495  int simdId = gpuDynInst->simdId;
1496  int wfSlotId = gpuDynInst->wfSlotId;
1497  Addr last = 0;
1498 
1499  switch(computeUnit->prefetchType) {
1500  case enums::PF_CU:
1501  last = computeUnit->lastVaddrCU[mp_index];
1502  break;
1503  case enums::PF_PHASE:
1504  last = computeUnit->lastVaddrSimd[simdId][mp_index];
1505  break;
1506  case enums::PF_WF:
1507  last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
1508  default:
1509  break;
1510  }
1511 
1512  DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
1513  computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
1514 
1515  int stride = last ? (roundDown(vaddr, X86ISA::PageBytes) -
1516  roundDown(last, X86ISA::PageBytes)) >> X86ISA::PageShift
1517  : 0;
1518 
1519  DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
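 // e.g., if the previous access touched page 0x10000 and this one
 // touches page 0x12000 (4 KiB pages, hypothetical addresses), the
 // stride is (0x12000 - 0x10000) >> 12 = 2 pages, and the loop below
 // prefetches vaddr + 2, 4, 6, ... pages ahead, up to prefetchDepth.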
1520 
1521  computeUnit->lastVaddrCU[mp_index] = vaddr;
1522  computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
1523  computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
1524 
1525  stride = (computeUnit->prefetchType == enums::PF_STRIDE) ?
1526  computeUnit->prefetchStride: stride;
1527 
1528  DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
1529  computeUnit->cu_id, simdId, wfSlotId, mp_index);
1530 
1531  DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);
1532 
1533  // Prefetch Next few pages atomically
1534  for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
1535  DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
1536  vaddr + stride * pf * X86ISA::PageBytes);
1537 
1538  if (!stride)
1539  break;
1540 
1541  RequestPtr prefetch_req = std::make_shared<Request>(
1542  vaddr + stride * pf * X86ISA::PageBytes,
1543  sizeof(uint8_t), 0,
1544  computeUnit->requestorId(),
1545  0, 0, nullptr);
1546 
1547  PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
1548  uint8_t foo = 0;
1549  prefetch_pkt->dataStatic(&foo);
1550 
1551  // Because it's atomic operation, only need TLB translation state
1552  prefetch_pkt->senderState =
1553  new GpuTranslationState(TLB_mode,
1554  computeUnit->shader->gpuTc, true);
1555 
1556  // Currently prefetches are zero-latency, hence the sendFunctional
1557  sendFunctional(prefetch_pkt);
1558 
1559  /* safe_cast the senderState */
1560  GpuTranslationState *tlb_state =
1561  safe_cast<GpuTranslationState*>(
1562  prefetch_pkt->senderState);
1563 
1564 
1565  delete tlb_state->tlbEntry;
1566  delete tlb_state;
1567  delete prefetch_pkt;
1568  }
1569  }
1570 
1571  // First we must convert the response cmd back to a request cmd so that
1572  // the request can be sent through the cu's request port
1573  PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
1574  new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1575  delete pkt->senderState;
1576  delete pkt;
1577 
1578  // New SenderState for the memory access
1579  new_pkt->senderState =
1580  new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
1581  nullptr);
1582 
1583  // Set VRAM ID for device requests
1584  // For now, system vmem requests use functional reads. This is not that
1585  // critical to model as the region of interest should always be accessing
1586  // device memory. System vmem requests are used by blit kernels to do
1587  // memcpys and load code objects into device memory.
1588  if (new_pkt->req->systemReq()) {
1589  // There will be multiple packets returned for the same gpuDynInst,
1590  // so first check if systemReq is not already set and if so, return
1591  // the token acquired when the dispatch list is filled as system
1592  // requests do not require a GPU coalescer token.
1593  if (!gpuDynInst->isSystemReq()) {
1594  computeUnit->getTokenManager()->recvTokens(1);
1595  gpuDynInst->setSystemReq();
1596  }
1597  } else {
1598  new_pkt->req->requestorId(computeUnit->vramRequestorId());
1599  }
1600 
1601  // translation is done. Schedule the mem_req_event at the appropriate
1602  // cycle to send the timing memory request to ruby
1603  EventFunctionWrapper *mem_req_event =
1604  computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);
1605 
1606  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
1607  computeUnit->cu_id, gpuDynInst->simdId,
1608  gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());
1609 
1610  computeUnit->schedule(mem_req_event, curTick() +
1611  computeUnit->req_tick_latency);
1612 
1613  return true;
1614 }
1615 
1616 EventFunctionWrapper*
1617 ComputeUnit::DataPort::createMemReqEvent(PacketPtr pkt)
1618 {
1619  return new EventFunctionWrapper(
1620  [this, pkt]{ processMemReqEvent(pkt); },
1621  "ComputeUnit memory request event", true);
1622 }
1623 
1624 EventFunctionWrapper*
1625 ComputeUnit::DataPort::createMemRespEvent(PacketPtr pkt)
1626 {
1627  return new EventFunctionWrapper(
1628  [this, pkt]{ processMemRespEvent(pkt); },
1629  "ComputeUnit memory response event", true);
1630 }
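 // In both factory methods above, the final 'true' argument marks the
 // EventFunctionWrapper as auto-deleting: each of these one-shot,
 // heap-allocated events frees itself after process() runs.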
1631 
1632 void
1633 ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt)
1634 {
1635  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1636  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1637  [[maybe_unused]] ComputeUnit *compute_unit = computeUnit;
1638 
1639  if (pkt->req->systemReq()) {
1640  assert(compute_unit->shader->systemHub);
1641  SystemHubEvent *resp_event = new SystemHubEvent(pkt, this);
1642  compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
1643  } else if (!(sendTimingReq(pkt))) {
1644  retries.push_back(std::make_pair(pkt, gpuDynInst));
1645 
1646  DPRINTF(GPUPort,
1647  "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
1648  compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1649  id, pkt->req->getPaddr());
1650  } else {
1651  DPRINTF(GPUPort,
1652  "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data "
1653  "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1654  gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id,
1655  pkt->req->getPaddr());
1656  }
1657 }
1658 
1659 const char*
1660 ComputeUnit::ScalarDataPort::MemReqEvent::description() const
1661 {
1662  return "ComputeUnit scalar memory request event";
1663 }
1664 
1665 void
1666 ComputeUnit::ScalarDataPort::MemReqEvent::process()
1667 {
1668  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1669  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1670  [[maybe_unused]] ComputeUnit *compute_unit = scalarDataPort.computeUnit;
1671 
1672  if (pkt->req->systemReq()) {
1673  assert(compute_unit->shader->systemHub);
1674  SystemHubEvent *resp_event = new SystemHubEvent(pkt, &scalarDataPort);
1675  compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
1676  } else if (!(scalarDataPort.sendTimingReq(pkt))) {
1677  scalarDataPort.retries.push_back(pkt);
1678 
1679  DPRINTF(GPUPort,
1680  "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
1681  compute_unit->cu_id, gpuDynInst->simdId,
1682  gpuDynInst->wfSlotId, pkt->req->getPaddr());
1683  } else {
1684  DPRINTF(GPUPort,
1685  "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data "
1686  "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1687  gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
1688  pkt->req->getPaddr());
1689  }
1690 }
1691 
1692 /*
1693  * The initial translation request could have been rejected, if
1694  * <retries> queue is not empty. Retry sending the translation
1695  * request. sendRetry() is called from the peer port whenever
1696  * a translation completes.
1697  */
1698 void
1699 ComputeUnit::DTLBPort::recvReqRetry()
1700 {
1701  int len = retries.size();
1702 
1703  DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
1704  computeUnit->cu_id, len);
1705 
1706  assert(len > 0);
1707  assert(isStalled());
1708  // recvReqRetry is an indication that the resource on which this
1709  // port was stalling on is freed. So, remove the stall first
1710  unstallPort();
1711 
1712  for (int i = 0; i < len; ++i) {
1713  PacketPtr pkt = retries.front();
1714  [[maybe_unused]] Addr vaddr = pkt->req->getVaddr();
1715  DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x",
1716  computeUnit->cu_id, vaddr);
1717  if (!sendTimingReq(pkt)) {
1718  // Stall port
1719  stallPort();
1720  DPRINTF(GPUTLB, ": failed again\n");
1721  break;
1722  } else {
1723  DPRINTF(GPUTLB, ": successful\n");
1724  retries.pop_front();
1725  }
1726  }
1727 }
1728 
1729 bool
1730 ComputeUnit::ScalarDTLBPort::recvTimingResp(PacketPtr pkt)
1731 {
1732  assert(pkt->senderState);
1733 
1734  GpuTranslationState *translation_state =
1735  safe_cast<GpuTranslationState*>(pkt->senderState);
1736 
1737  // Page faults are not allowed
1738  fatal_if(!translation_state->tlbEntry,
1739  "Translation of vaddr %#x failed\n", pkt->req->getVaddr());
1740 
1741  delete translation_state->tlbEntry;
1742  assert(!translation_state->ports.size());
1743 
1744  pkt->senderState = translation_state->saved;
1745  delete translation_state;
1746 
1747  ScalarDTLBPort::SenderState *sender_state =
1748  safe_cast<ScalarDTLBPort::SenderState*>(pkt->senderState);
1749 
1750  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1751  delete pkt->senderState;
1752 
1753  [[maybe_unused]] Wavefront *w = gpuDynInst->wavefront();
1754 
1755  DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
1756  "translation: VA %#x -> PA %#x\n", computeUnit->cu_id, w->simdId,
1757  w->wfSlotId, w->kernId, pkt->req->getVaddr(), pkt->req->getPaddr());
1758 
1759  MemCmd mem_cmd;
1760 
1761  if (pkt->cmd == MemCmd::ReadResp) {
1762  mem_cmd = MemCmd::ReadReq;
1763  } else if (pkt->cmd == MemCmd::WriteResp) {
1764  mem_cmd = MemCmd::WriteReq;
1765  } else {
1766  fatal("Scalar DTLB received unexpected MemCmd response %s\n",
1767  pkt->cmd.toString());
1768  }
1769 
1770  PacketPtr req_pkt = new Packet(pkt->req, mem_cmd);
1771  req_pkt->dataStatic(pkt->getPtr<uint8_t>());
1772  delete pkt;
1773 
1774  req_pkt->senderState =
1775  new ComputeUnit::ScalarDataPort::SenderState(gpuDynInst);
1776 
1777  // For a system request we want to mark the GPU instruction as a system
1778  // load/store so that after the request is issued to system memory we can
1779  // return any token acquired for the request. Since tokens are returned
1780  // by the coalescer and system requests do not take that path, this needs
1781  // to be tracked.
1782  //
1783  // Device requests change the requestor ID to something in the device
1784  // memory Ruby network.
1785  if (req_pkt->req->systemReq()) {
1786  gpuDynInst->setSystemReq();
1787  } else {
1788  req_pkt->req->requestorId(computeUnit->vramRequestorId());
1789  }
1790 
1791  ComputeUnit::ScalarDataPort::MemReqEvent *scalar_mem_req_event
1792  = new ComputeUnit::ScalarDataPort::MemReqEvent
1793  (computeUnit->scalarDataPort, req_pkt);
1794  computeUnit->schedule(scalar_mem_req_event, curTick() +
1795  computeUnit->scalar_req_tick_latency);
1796 
1797  return true;
1798 }
1799 
1800 bool
1801 ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
1802 {
1803  [[maybe_unused]] Addr line = pkt->req->getPaddr();
1804  DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
1805  computeUnit->cu_id, pkt->req->getVaddr(), line);
1806 
1807  assert(pkt->senderState);
1808 
1809  // pop off the TLB translation state
1810  GpuTranslationState *translation_state
1811  = safe_cast<GpuTranslationState*>(pkt->senderState);
1812 
1813  bool success = translation_state->tlbEntry != nullptr;
1814  delete translation_state->tlbEntry;
1815  assert(!translation_state->ports.size());
1816  pkt->senderState = translation_state->saved;
1817  delete translation_state;
1818 
1819  // use the original sender state to know how to close this transaction
1820  ITLBPort::SenderState *sender_state =
1821  safe_cast<ITLBPort::SenderState*>(pkt->senderState);
1822 
1823  // get the wavefront associated with this translation request
1824  Wavefront *wavefront = sender_state->wavefront;
1825  delete pkt->senderState;
1826 
1827  if (success) {
1828  // pkt is reused in fetch(), don't delete it here. However, we must
1829  // reset the command to be a request so that it can be sent through
1830  // the cu's request port
1831  assert(pkt->cmd == MemCmd::ReadResp);
1832  pkt->cmd = MemCmd::ReadReq;
1833 
1834  computeUnit->fetchStage.fetch(pkt, wavefront);
1835  } else {
1836  if (wavefront->dropFetch) {
1837  assert(wavefront->instructionBuffer.empty());
1838  wavefront->dropFetch = false;
1839  }
1840 
1841  wavefront->pendingFetch = 0;
1842  }
1843 
1844  return true;
1845 }
1846 
1847 /*
1848  * The initial translation request could have been rejected, if
1849  * <retries> queue is not empty. Retry sending the translation
1850  * request. sendRetry() is called from the peer port whenever
1851  * a translation completes.
1852  */
1853 void
1854 ComputeUnit::ITLBPort::recvReqRetry()
1855 {
1856 
1857  int len = retries.size();
1858  DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n",
1859  computeUnit->cu_id, len);
1860  assert(len > 0);
1861  assert(isStalled());
1862 
1863  // recvReqRetry is an indication that the resource on which this
1864  // port was stalling on is freed. So, remove the stall first
1865  unstallPort();
1866 
1867  for (int i = 0; i < len; ++i) {
1868  PacketPtr pkt = retries.front();
1869  [[maybe_unused]] Addr vaddr = pkt->req->getVaddr();
1870  DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x",
1871  computeUnit->cu_id, vaddr);
1872  if (!sendTimingReq(pkt)) {
1873  stallPort(); // Stall port
1874  DPRINTF(GPUTLB, ": failed again\n");
1875  break;
1876  } else {
1877  DPRINTF(GPUTLB, ": successful\n");
1878  retries.pop_front();
1879  }
1880  }
1881 }
1882 
1883 void
1884 ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
1885 {
1886  if (gpuDynInst->isScalar()) {
1887  if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
1888  stats.sALUInsts++;
1889  stats.instCyclesSALU++;
1890  } else if (gpuDynInst->isLoad()) {
1891  stats.scalarMemReads++;
1892  } else if (gpuDynInst->isStore()) {
1893  stats.scalarMemWrites++;
1894  }
1895  } else {
1896  if (gpuDynInst->isALU()) {
1897  shader->total_valu_insts++;
1898  if (shader->total_valu_insts == shader->max_valu_insts) {
1899  exitSimLoop("max vALU insts");
1900  }
1901  stats.vALUInsts++;
1902  stats.instCyclesVALU++;
1903  stats.threadCyclesVALU
1904  += gpuDynInst->wavefront()->execMask().count();
1905  } else if (gpuDynInst->isFlat()) {
1906  if (gpuDynInst->isLocalMem()) {
1907  stats.flatLDSInsts++;
1908  } else {
1909  stats.flatVMemInsts++;
1910  }
1911  } else if (gpuDynInst->isFlatGlobal()) {
1912  stats.flatVMemInsts++;
1913  } else if (gpuDynInst->isLocalMem()) {
1914  stats.ldsNoFlatInsts++;
1915  } else if (gpuDynInst->isLoad()) {
1916  stats.vectorMemReads++;
1917  } else if (gpuDynInst->isStore()) {
1918  stats.vectorMemWrites++;
1919  }
1920 
1921  if (gpuDynInst->isLoad()) {
1922  switch (gpuDynInst->executedAs()) {
1923  case enums::SC_SPILL:
1924  stats.spillReads++;
1925  break;
1926  case enums::SC_GLOBAL:
1927  stats.globalReads++;
1928  break;
1929  case enums::SC_GROUP:
1930  stats.groupReads++;
1931  break;
1932  case enums::SC_PRIVATE:
1933  stats.privReads++;
1934  break;
1935  case enums::SC_READONLY:
1936  stats.readonlyReads++;
1937  break;
1938  case enums::SC_KERNARG:
1939  stats.kernargReads++;
1940  break;
1941  case enums::SC_ARG:
1942  stats.argReads++;
1943  break;
1944  case enums::SC_NONE:
1945  /**
1946  * this case can occur for flat mem insts
1947  * who execute with EXEC = 0
1948  */
1949  break;
1950  default:
1951  fatal("%s has no valid segment\n", gpuDynInst->disassemble());
1952  break;
1953  }
1954  } else if (gpuDynInst->isStore()) {
1955  switch (gpuDynInst->executedAs()) {
1956  case enums::SC_SPILL:
1957  stats.spillWrites++;
1958  break;
1959  case enums::SC_GLOBAL:
1960  stats.globalWrites++;
1961  break;
1962  case enums::SC_GROUP:
1963  stats.groupWrites++;
1964  break;
1965  case enums::SC_PRIVATE:
1966  stats.privWrites++;
1967  break;
1968  case enums::SC_READONLY:
1969  stats.readonlyWrites++;
1970  break;
1971  case enums::SC_KERNARG:
1972  stats.kernargWrites++;
1973  break;
1974  case enums::SC_ARG:
1975  stats.argWrites++;
1976  break;
1977  case enums::SC_NONE:
1978  /**
1979  * this case can occur for flat mem insts
1980  * who execute with EXEC = 0
1981  */
1982  break;
1983  default:
1984  fatal("%s has no valid segment\n", gpuDynInst->disassemble());
1985  break;
1986  }
1987  }
1988  }
1989 }
1990 
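 // Record that this wavefront's memory instruction touched the page
 // containing 'addr'; the per-page counts later feed pageDivergenceDist.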
1991 void
1992 ComputeUnit::updatePageDivergenceDist(Addr addr)
1993 {
1994  Addr virt_page_addr = roundDown(addr, X86ISA::PageBytes);
1995 
1996  if (!pagesTouched.count(virt_page_addr))
1997  pagesTouched[virt_page_addr] = 1;
1998  else
1999  pagesTouched[virt_page_addr]++;
2000 }
2001 
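 // Registered as an exit callback: when countPages is set, dump one CSV
 // row per virtual page with its wavefront- and workitem-level access
 // counts.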
2002 void
2003 ComputeUnit::exitCallback()
2004 {
2005  if (countPages) {
2006  std::ostream *page_stat_file = simout.create(name().c_str())->stream();
2007 
2008  *page_stat_file << "page, wavefront accesses, workitem accesses" <<
2009  std::endl;
2010 
2011  for (auto iter : pageAccesses) {
2012  *page_stat_file << std::hex << iter.first << ",";
2013  *page_stat_file << std::dec << iter.second.first << ",";
2014  *page_stat_file << std::dec << iter.second.second << std::endl;
2015  }
2016  }
2017 }
2018 
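 // A CU is considered done only when every vector ALU is idle and all
 // register-file-to-memory-pipe buses and memory pipelines have drained.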
2019 bool
2020 ComputeUnit::isDone() const
2021 {
2022  for (int i = 0; i < numVectorALUs; ++i) {
2023  if (!isVectorAluIdle(i)) {
2024  return false;
2025  }
2026  }
2027 
2028  // TODO: FIXME if more than 1 of any memory pipe supported
2029  if (!srfToScalarMemPipeBus.rdy()) {
2030  return false;
2031  }
2032  if (!vrfToGlobalMemPipeBus.rdy()) {
2033  return false;
2034  }
2035  if (!vrfToLocalMemPipeBus.rdy()) {
2036  return false;
2037  }
2038 
2039  if (!globalMemoryPipe.isGMReqFIFOWrRdy()
2040  || !localMemoryPipe.isLMReqFIFOWrRdy()
2041  || !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy()
2042  || !glbMemToVrfBus.rdy() || !scalarMemToSrfBus.rdy()) {
2043  return false;
2044  }
2045 
2046  return true;
2047 }
2048 
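 // Thin wrapper: the LDS state tracks how many wavefronts of the given
 // workgroup still hold a reference to their LDS allocation.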
2049 int32_t
2050 ComputeUnit::getRefCounter(const uint32_t dispatchId,
2051  const uint32_t wgId) const
2052 {
2053  return lds.getRefCounter(dispatchId, wgId);
2054 }
2055 
2056 bool
2057 ComputeUnit::isVectorAluIdle(uint32_t simdId) const
2058 {
2059  assert(simdId < numVectorALUs);
2060 
2061  for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){
2062  if (wfList[simdId][i_wf]->getStatus() != Wavefront::S_STOPPED) {
2063  return false;
2064  }
2065  }
2066 
2067  return true;
2068 }
2069 
2070 /**
2071  * send a general request to the LDS
2072  * make sure to look at the return value here as your request might be
2073  * NACK'd and returning false means that you have to have some backup plan
2074  */
2075 bool
2076 ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
2077 {
2078  // this is just a request to carry the GPUDynInstPtr
2079  // back and forth
2080  RequestPtr newRequest = std::make_shared<Request>();
2081  newRequest->setPaddr(0x0);
2082 
2083  // ReadReq is not evaluated by the LDS, but the Packet ctor requires it
2084  PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq);
2085 
2086  // This is the SenderState needed upon return
2087  newPacket->senderState = new LDSPort::SenderState(gpuDynInst);
2088 
2089  return ldsPort.sendTimingReq(newPacket);
2090 }
2091 
2092 /**
2093  * Forward the VRAM requestor ID needed for device memory from shader.
2094  */
2095 RequestorID
2096 ComputeUnit::vramRequestorId()
2097 {
2098  return FullSystem ? shader->vramRequestorId() : requestorId();
2099 }
2100 
2101 /**
2102  * get the result of packets sent to the LDS when they return
2103  */
2104 bool
2105 ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
2106 {
2107  const ComputeUnit::LDSPort::SenderState *senderState =
2108  dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState);
2109 
2110  fatal_if(!senderState, "did not get the right sort of sender state");
2111 
2112  GPUDynInstPtr gpuDynInst = senderState->getMemInst();
2113 
2114  delete packet->senderState;
2115  delete packet;
2116 
2117  computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
2118  return true;
2119 }
2120 
2121 /**
2122  * attempt to send this packet, either the port is already stalled, the
2123  * request is nack'd and must stall or the request goes through
2124  * when a request cannot be sent, add it to the retries queue
2125  */
2126 bool
2127 ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
2128 {
2129  ComputeUnit::LDSPort::SenderState *sender_state =
2130  dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
2131  fatal_if(!sender_state, "packet without a valid sender state");
2132 
2133  [[maybe_unused]] GPUDynInstPtr gpuDynInst = sender_state->getMemInst();
2134 
2135  if (isStalled()) {
2136  fatal_if(retries.empty(), "must have retries waiting to be stalled");
2137 
2138  retries.push(pkt);
2139 
2140  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
2141  computeUnit->cu_id, gpuDynInst->simdId,
2142  gpuDynInst->wfSlotId);
2143  return false;
2144  } else if (!RequestPort::sendTimingReq(pkt)) {
2145  // need to stall the LDS port until a recvReqRetry() is received
2146  // this indicates that there is more space
2147  stallPort();
2148  retries.push(pkt);
2149 
2150  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
2151  computeUnit->cu_id, gpuDynInst->simdId,
2152  gpuDynInst->wfSlotId, pkt->req->getPaddr());
2153  return false;
2154  } else {
2155  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
2156  computeUnit->cu_id, gpuDynInst->simdId,
2157  gpuDynInst->wfSlotId, pkt->req->getPaddr());
2158  return true;
2159  }
2160 }
2161 
2162 /**
2163  * the bus is telling the port that there is now space so retrying
2164  * stalled requests should work now
2165  * this allows the port to have a request be nack'd and then have the
2166  * receiver say when there is space available until the request is sent
2167  */
2168 void
2169 ComputeUnit::LDSPort::recvReqRetry()
2170 {
2171  auto queueSize = retries.size();
2172 
2173  DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
2174  computeUnit->cu_id, queueSize);
2175 
2176  fatal_if(queueSize < 1,
2177  "why was there a recvReqRetry() with no pending reqs?");
2178  fatal_if(!isStalled(),
2179  "recvReqRetry() happened when the port was not stalled");
2180 
2181  unstallPort();
2182 
2183  while (!retries.empty()) {
2184  PacketPtr packet = retries.front();
2185 
2186  DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);
2187 
2188  if (!RequestPort::sendTimingReq(packet)) {
2189  // Stall port
2190  stallPort();
2191  DPRINTF(GPUPort, ": LDS send failed again\n");
2192  break;
2193  } else {
2194  DPRINTF(GPUPort, ": LDS send successful\n");
2195  retries.pop();
2196  }
2197  }
2198 }
2199 
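 /*
  * All per-CU statistics are registered in this group. The ADD_STAT
  * calls below name the raw counters; the constructor body then sizes
  * the distributions from the CU configuration and wires up derived
  * formulas such as per-wavefront and per-kilo-instruction rates.
  */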
2200 ComputeUnit::ComputeUnitStats::ComputeUnitStats(statistics::Group *parent,
2201  int n_wf)
2202  : statistics::Group(parent),
2203  ADD_STAT(vALUInsts, "Number of vector ALU insts issued."),
2204  ADD_STAT(vALUInstsPerWF, "The avg. number of vector ALU insts issued "
2205  "per-wavefront."),
2206  ADD_STAT(sALUInsts, "Number of scalar ALU insts issued."),
2207  ADD_STAT(sALUInstsPerWF, "The avg. number of scalar ALU insts issued "
2208  "per-wavefront."),
2209  ADD_STAT(instCyclesVALU,
2210  "Number of cycles needed to execute VALU insts."),
2211  ADD_STAT(instCyclesSALU,
2212  "Number of cycles needed to execute SALU insts."),
2213  ADD_STAT(threadCyclesVALU, "Number of thread cycles used to execute "
2214  "vector ALU ops. Similar to instCyclesVALU but multiplied by "
2215  "the number of active threads."),
2216  ADD_STAT(vALUUtilization,
2217  "Percentage of active vector ALU threads in a wave."),
2218  ADD_STAT(ldsNoFlatInsts, "Number of LDS insts issued, not including FLAT"
2219  " accesses that resolve to LDS."),
2220  ADD_STAT(ldsNoFlatInstsPerWF, "The avg. number of LDS insts (not "
2221  "including FLAT accesses that resolve to LDS) per-wavefront."),
2222  ADD_STAT(flatVMemInsts,
2223  "The number of FLAT insts that resolve to vmem issued."),
2224  ADD_STAT(flatVMemInstsPerWF, "The average number of FLAT insts that "
2225  "resolve to vmem issued per-wavefront."),
2226  ADD_STAT(flatLDSInsts,
2227  "The number of FLAT insts that resolve to LDS issued."),
2228  ADD_STAT(flatLDSInstsPerWF, "The average number of FLAT insts that "
2229  "resolve to LDS issued per-wavefront."),
2230  ADD_STAT(vectorMemWrites,
2231  "Number of vector mem write insts (excluding FLAT insts)."),
2232  ADD_STAT(vectorMemWritesPerWF, "The average number of vector mem write "
2233  "insts (excluding FLAT insts) per-wavefront."),
2234  ADD_STAT(vectorMemReads,
2235  "Number of vector mem read insts (excluding FLAT insts)."),
2236  ADD_STAT(vectorMemReadsPerWF, "The avg. number of vector mem read insts "
2237  "(excluding FLAT insts) per-wavefront."),
2238  ADD_STAT(scalarMemWrites, "Number of scalar mem write insts."),
2239  ADD_STAT(scalarMemWritesPerWF,
2240  "The average number of scalar mem write insts per-wavefront."),
2241  ADD_STAT(scalarMemReads, "Number of scalar mem read insts."),
2242  ADD_STAT(scalarMemReadsPerWF,
2243  "The average number of scalar mem read insts per-wavefront."),
2244  ADD_STAT(vectorMemReadsPerKiloInst,
2245  "Number of vector mem reads per kilo-instruction"),
2246  ADD_STAT(vectorMemWritesPerKiloInst,
2247  "Number of vector mem writes per kilo-instruction"),
2248  ADD_STAT(vectorMemInstsPerKiloInst,
2249  "Number of vector mem insts per kilo-instruction"),
2250  ADD_STAT(scalarMemReadsPerKiloInst,
2251  "Number of scalar mem reads per kilo-instruction"),
2252  ADD_STAT(scalarMemWritesPerKiloInst,
2253  "Number of scalar mem writes per kilo-instruction"),
2254  ADD_STAT(scalarMemInstsPerKiloInst,
2255  "Number of scalar mem insts per kilo-instruction"),
2256  ADD_STAT(instCyclesVMemPerSimd, "Number of cycles to send address, "
2257  "command, data from VRF to vector memory unit, per SIMD"),
2258  ADD_STAT(instCyclesScMemPerSimd, "Number of cycles to send address, "
2259  "command, data from SRF to scalar memory unit, per SIMD"),
2260  ADD_STAT(instCyclesLdsPerSimd, "Number of cycles to send address, "
2261  "command, data from VRF to LDS unit, per SIMD"),
2262  ADD_STAT(globalReads, "Number of reads to the global segment"),
2263  ADD_STAT(globalWrites, "Number of writes to the global segment"),
2264  ADD_STAT(globalMemInsts,
2265  "Number of memory instructions sent to the global segment"),
2266  ADD_STAT(argReads, "Number of reads to the arg segment"),
2267  ADD_STAT(argWrites, "Number of writes to the arg segment"),
2268  ADD_STAT(argMemInsts,
2269  "Number of memory instructions sent to the arg segment"),
2270  ADD_STAT(spillReads, "Number of reads to the spill segment"),
2271  ADD_STAT(spillWrites, "Number of writes to the spill segment"),
2272  ADD_STAT(spillMemInsts,
2273  "Number of memory instructions sent to the spill segment"),
2274  ADD_STAT(groupReads, "Number of reads to the group segment"),
2275  ADD_STAT(groupWrites, "Number of writes to the group segment"),
2276  ADD_STAT(groupMemInsts,
2277  "Number of memory instructions sent to the group segment"),
2278  ADD_STAT(privReads, "Number of reads to the private segment"),
2279  ADD_STAT(privWrites, "Number of writes to the private segment"),
2280  ADD_STAT(privMemInsts,
2281  "Number of memory instructions sent to the private segment"),
2282  ADD_STAT(readonlyReads, "Number of reads to the readonly segment"),
2283  ADD_STAT(readonlyWrites,
2284  "Number of writes to the readonly segment"),
2285  ADD_STAT(readonlyMemInsts,
2286  "Number of memory instructions sent to the readonly segment"),
2287  ADD_STAT(kernargReads, "Number of reads sent to the kernarg segment"),
2288  ADD_STAT(kernargWrites,
2289  "Number of memory instructions sent to the kernarg segment"),
2290  ADD_STAT(kernargMemInsts,
2291  "Number of memory instructions sent to the kernarg segment"),
2292  ADD_STAT(waveLevelParallelism,
2293  "wave level parallelism: count of active waves at wave launch"),
2294  ADD_STAT(tlbRequests, "number of uncoalesced requests"),
2295  ADD_STAT(tlbCycles,
2296  "total number of cycles for all uncoalesced requests"),
2297  ADD_STAT(tlbLatency, "Avg. translation latency for data translations"),
2298  ADD_STAT(hitsPerTLBLevel,
2299  "TLB hits distribution (0 for page table, x for Lx-TLB)"),
2300  ADD_STAT(ldsBankAccesses, "Total number of LDS bank accesses"),
2301  ADD_STAT(ldsBankConflictDist,
2302  "Number of bank conflicts per LDS memory packet"),
2303  ADD_STAT(pageDivergenceDist,
2304  "pages touched per wf (over all mem. instr.)"),
2305  ADD_STAT(dynamicGMemInstrCnt,
2306  "dynamic non-flat global memory instruction count"),
2307  ADD_STAT(dynamicFlatMemInstrCnt,
2308  "dynamic flat global memory instruction count"),
2309  ADD_STAT(dynamicLMemInstrCnt, "dynamic local memory instruction count"),
2310  ADD_STAT(wgBlockedDueBarrierAllocation,
2311  "WG dispatch was blocked due to lack of barrier resources"),
2312  ADD_STAT(wgBlockedDueLdsAllocation,
2313  "Workgroup blocked due to LDS capacity"),
2314  ADD_STAT(numInstrExecuted, "number of instructions executed"),
2315  ADD_STAT(execRateDist, "Instruction Execution Rate: Number of executed "
2316  "vector instructions per cycle"),
2317  ADD_STAT(numVecOpsExecuted,
2318  "number of vec ops executed (e.g. WF size/inst)"),
2319  ADD_STAT(numVecOpsExecutedF16,
2320  "number of f16 vec ops executed (e.g. WF size/inst)"),
2321  ADD_STAT(numVecOpsExecutedF32,
2322  "number of f32 vec ops executed (e.g. WF size/inst)"),
2323  ADD_STAT(numVecOpsExecutedF64,
2324  "number of f64 vec ops executed (e.g. WF size/inst)"),
2325  ADD_STAT(numVecOpsExecutedFMA16,
2326  "number of fma16 vec ops executed (e.g. WF size/inst)"),
2327  ADD_STAT(numVecOpsExecutedFMA32,
2328  "number of fma32 vec ops executed (e.g. WF size/inst)"),
2329  ADD_STAT(numVecOpsExecutedFMA64,
2330  "number of fma64 vec ops executed (e.g. WF size/inst)"),
2331  ADD_STAT(numVecOpsExecutedMAC16,
2332  "number of mac16 vec ops executed (e.g. WF size/inst)"),
2333  ADD_STAT(numVecOpsExecutedMAC32,
2334  "number of mac32 vec ops executed (e.g. WF size/inst)"),
2335  ADD_STAT(numVecOpsExecutedMAC64,
2336  "number of mac64 vec ops executed (e.g. WF size/inst)"),
2337  ADD_STAT(numVecOpsExecutedMAD16,
2338  "number of mad16 vec ops executed (e.g. WF size/inst)"),
2339  ADD_STAT(numVecOpsExecutedMAD32,
2340  "number of mad32 vec ops executed (e.g. WF size/inst)"),
2341  ADD_STAT(numVecOpsExecutedMAD64,
2342  "number of mad64 vec ops executed (e.g. WF size/inst)"),
2343  ADD_STAT(numVecOpsExecutedTwoOpFP,
2344  "number of two op FP vec ops executed (e.g. WF size/inst)"),
2345  ADD_STAT(totalCycles, "number of cycles the CU ran for"),
2346  ADD_STAT(vpc, "Vector Operations per cycle (this CU only)"),
2347  ADD_STAT(vpc_f16, "F16 Vector Operations per cycle (this CU only)"),
2348  ADD_STAT(vpc_f32, "F32 Vector Operations per cycle (this CU only)"),
2349  ADD_STAT(vpc_f64, "F64 Vector Operations per cycle (this CU only)"),
2350  ADD_STAT(ipc, "Instructions per cycle (this CU only)"),
2351  ADD_STAT(controlFlowDivergenceDist, "number of lanes active per "
2352  "instruction (over all instructions)"),
2353  ADD_STAT(activeLanesPerGMemInstrDist,
2354  "number of active lanes per global memory instruction"),
2355  ADD_STAT(activeLanesPerLMemInstrDist,
2356  "number of active lanes per local memory instruction"),
2357  ADD_STAT(numALUInstsExecuted,
2358  "Number of dynamic non-GM memory insts executed"),
2359  ADD_STAT(numTimesWgBlockedDueVgprAlloc, "Number of times WGs are "
2360  "blocked due to VGPR allocation per SIMD"),
2361  ADD_STAT(numTimesWgBlockedDueSgprAlloc, "Number of times WGs are "
2362  "blocked due to SGPR allocation per SIMD"),
2363  ADD_STAT(numCASOps, "number of compare and swap operations"),
2364  ADD_STAT(numFailedCASOps,
2365  "number of compare and swap operations that failed"),
2366  ADD_STAT(completedWfs, "number of completed wavefronts"),
2367  ADD_STAT(completedWGs, "number of completed workgroups"),
2368  ADD_STAT(headTailLatency, "ticks between first and last cache block "
2369  "arrival at coalescer"),
2370  ADD_STAT(instInterleave, "Measure of instruction interleaving per SIMD")
2371 {
2372  ComputeUnit *cu = static_cast<ComputeUnit*>(parent);
2373 
2374  instCyclesVMemPerSimd.init(cu->numVectorALUs);
2375  instCyclesScMemPerSimd.init(cu->numVectorALUs);
2376  instCyclesLdsPerSimd.init(cu->numVectorALUs);
2377 
2378  hitsPerTLBLevel.init(4);
2379  execRateDist.init(0, 10, 2);
2380  ldsBankConflictDist.init(0, cu->wfSize(), 2);
2381 
2382  pageDivergenceDist.init(1, cu->wfSize(), 4);
2383  controlFlowDivergenceDist.init(1, cu->wfSize(), 4);
2384  activeLanesPerGMemInstrDist.init(1, cu->wfSize(), 4);
2385  activeLanesPerLMemInstrDist.init(1, cu->wfSize(), 4);
2386 
2387  headTailLatency.init(0, 1000000, 10000).flags(statistics::pdf |
2388  statistics::oneline);
2389  waveLevelParallelism.init(0, n_wf * cu->numVectorALUs, 1);
2390  instInterleave.init(cu->numVectorALUs, 0, 20, 1);
2391 
2392  vALUInstsPerWF = vALUInsts / completedWfs;
2393  sALUInstsPerWF = sALUInsts / completedWfs;
2394  vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
2395  ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
2396  flatVMemInstsPerWF = flatVMemInsts / completedWfs;
2397  flatLDSInstsPerWF = flatLDSInsts / completedWfs;
2398  vectorMemWritesPerWF = vectorMemWrites / completedWfs;
2399  vectorMemReadsPerWF = vectorMemReads / completedWfs;
2400  scalarMemWritesPerWF = scalarMemWrites / completedWfs;
2401  scalarMemReadsPerWF = scalarMemReads / completedWfs;
2402 
2403  vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000;
2404  vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000;
2405  vectorMemInstsPerKiloInst =
2406  ((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000;
2407  scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000;
2408  scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000;
2409  scalarMemInstsPerKiloInst =
2410  ((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000;
2411 
2412  vpc = numVecOpsExecuted / totalCycles;
2413  vpc_f16 = numVecOpsExecutedF16 / totalCycles;
2414  vpc_f32 = numVecOpsExecutedF32 / totalCycles;
2415  vpc_f64 = numVecOpsExecutedF64 / totalCycles;
2416  ipc = numInstrExecuted / totalCycles;
2417  numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
2418  dynamicLMemInstrCnt;
2419 
2420  tlbLatency = tlbCycles / tlbRequests;
2421 
2422  // fixed number of TLB levels
2423  for (int i = 0; i < 4; ++i) {
2424  if (!i)
2425  hitsPerTLBLevel.subname(i,"page_table");
2426  else
2427  hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
2428  }
2429 
2435 
2438 }
2439 
2440 } // namespace gem5
gem5::ComputeUnit::ComputeUnitStats::tlbRequests
statistics::Scalar tlbRequests
Definition: compute_unit.hh:1062
gem5::ComputeUnit::ComputeUnitStats::sALUInstsPerWF
statistics::Formula sALUInstsPerWF
Definition: compute_unit.hh:1003
gem5::curTick
Tick curTick()
The universal simulation clock.
Definition: cur_tick.hh:46
fatal
#define fatal(...)
This implements a cprintf based fatal() function.
Definition: logging.hh:200
gem5::PortID
int16_t PortID
Port index/ID type, and a symbolic name for an invalid port id.
Definition: types.hh:245
gem5::ComputeUnit::ComputeUnitStats::vALUUtilization
statistics::Formula vALUUtilization
Definition: compute_unit.hh:1007
gem5::ComputeUnit::getAndIncSeqNum
InstSeqNum getAndIncSeqNum()
Definition: compute_unit.hh:934
gem5::GMEnqueue
@ GMEnqueue
Definition: misc.hh:56
gem5::HSAQueueEntry::numWg
int numWg(int dim) const
Definition: hsa_queue_entry.hh:259
gem5::ComputeUnit::wfList
std::vector< std::vector< Wavefront * > > wfList
Definition: compute_unit.hh:291
gem5::ComputeUnit::ComputeUnit
ComputeUnit(const Params &p)
Definition: compute_unit.cc:65
gem5::BaseMMU::Read
@ Read
Definition: mmu.hh:56
gem5::ComputeUnit::ComputeUnitStats::scalarMemReadsPerWF
statistics::Formula scalarMemReadsPerWF
Definition: compute_unit.hh:1021
gem5::RequestPort::sendTimingReq
bool sendTimingReq(PacketPtr pkt)
Attempt to send a timing request to the responder port by calling its corresponding receive function.
Definition: port.hh:587
gem5::LdsState::getRefCounter
int getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
return the current reference count for this workgroup id
Definition: lds_state.hh:351
gem5::ComputeUnit::ComputeUnitStats::instCyclesSALU
statistics::Scalar instCyclesSALU
Definition: compute_unit.hh:1005
gem5::Shader::gpuTc
ThreadContext * gpuTc
Definition: shader.hh:112
simple_pool_manager.hh
gem5::Wavefront::S_RUNNING
@ S_RUNNING
Definition: wavefront.hh:70
gem5::ComputeUnit::fetchStage
FetchStage fetchStage
Definition: compute_unit.hh:280
gem5::ComputeUnit::ComputeUnitStats::instInterleave
statistics::VectorDistribution instInterleave
Definition: compute_unit.hh:1141
gem5::ComputeUnit::ComputeUnitStats::flatVMemInsts
statistics::Scalar flatVMemInsts
Definition: compute_unit.hh:1010
gem5::ComputeUnit::ScalarDTLBPort::retries
std::deque< PacketPtr > retries
Definition: compute_unit.hh:765
gem5::ComputeUnit::sendRequest
void sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
Definition: compute_unit.cc:1039
gem5::ScalarMemPipeline::exec
void exec()
Definition: scalar_memory_pipeline.cc:54
gem5::FetchStage::processFetchReturn
void processFetchReturn(PacketPtr pkt)
Definition: fetch_stage.cc:73
gem5::ComputeUnit::debugSegFault
bool debugSegFault
Definition: compute_unit.hh:341
gem5::FetchStage::exec
void exec()
Definition: fetch_stage.cc:65
shader.hh
gem5::ComputeUnit::DataPort::processMemReqEvent
void processMemReqEvent(PacketPtr pkt)
Definition: compute_unit.cc:1633
gem5::ComputeUnit::localMemoryPipe
LocalMemPipeline localMemoryPipe
Definition: compute_unit.hh:285
gem5::ComputeUnit::ComputeUnitStats::privWrites
statistics::Scalar privWrites
Definition: compute_unit.hh:1049
gem5::ComputeUnit::ComputeUnitStats::kernargWrites
statistics::Scalar kernargWrites
Definition: compute_unit.hh:1055
gem5::MemCmd::SwapReq
@ SwapReq
Definition: packet.hh:120
gem5::ComputeUnit::numVecRegsPerSimd
int numVecRegsPerSimd
Definition: compute_unit.hh:373
gem5::GpuTranslationState
GPU TranslationState: this currently is a somewhat bastardization of the usage of SenderState,...
Definition: gpu_translation_state.hh:58
gem5::ComputeUnit::srf
std::vector< ScalarRegisterFile * > srf
Definition: compute_unit.hh:297
gem5::GpuTranslationState::ports
std::vector< ResponsePort * > ports
Definition: gpu_translation_state.hh:79
gem5::ComputeUnit::ComputeUnitStats::scalarMemWritesPerWF
statistics::Formula scalarMemWritesPerWF
Definition: compute_unit.hh:1019
gem5::ComputeUnit::ComputeUnitStats::argMemInsts
statistics::Formula argMemInsts
Definition: compute_unit.hh:1041
gem5::ComputeUnit::ITLBPort::recvTimingResp
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
Definition: compute_unit.cc:1801
gem5::ComputeUnit::ComputeUnitStats::spillWrites
statistics::Scalar spillWrites
Definition: compute_unit.hh:1043
gem5::ComputeUnit::ComputeUnitStats::spillMemInsts
statistics::Formula spillMemInsts
Definition: compute_unit.hh:1044
gem5::ComputeUnit::ComputeUnitStats::scalarMemWritesPerKiloInst
statistics::Formula scalarMemWritesPerKiloInst
Definition: compute_unit.hh:1027
gem5::MipsISA::index
Bitfield< 30, 0 > index
Definition: pra_constants.hh:47
gem5::ComputeUnit::DataPort::handleResponse
bool handleResponse(PacketPtr pkt)
Definition: compute_unit.cc:818
gem5::ComputeUnit::ComputeUnitStats::readonlyReads
statistics::Scalar readonlyReads
Definition: compute_unit.hh:1051
gem5::ComputeUnit::handleSQCReturn
void handleSQCReturn(PacketPtr pkt)
Definition: compute_unit.cc:1010
gem5::ComputeUnit::LDSPort::SenderState
SenderState is information carried along with the packet, esp.
Definition: compute_unit.hh:836
gem5::ComputeUnit::ComputeUnitStats::wgBlockedDueBarrierAllocation
statistics::Scalar wgBlockedDueBarrierAllocation
Definition: compute_unit.hh:1081
gem5::ComputeUnit::DTLBPort::recvTimingResp
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
Definition: compute_unit.cc:1431
gem5::Packet::pushSenderState
void pushSenderState(SenderState *sender_state)
Push a new sender state to the packet and make the current sender state the predecessor of the new on...
Definition: packet.cc:334
gem5::ComputeUnit::ComputeUnitStats::vectorMemReads
statistics::Scalar vectorMemReads
Definition: compute_unit.hh:1016
gem5::BaseMMU::Mode
Mode
Definition: mmu.hh:56
gem5::Packet::req
RequestPtr req
A pointer to the original request.
Definition: packet.hh:377
gem5::Shader::vramRequestorId
RequestorID vramRequestorId()
Forward the VRAM requestor ID needed for device memory from CP.
Definition: shader.cc:530
gem5::ComputeUnit::lastVaddrSimd
std::vector< std::vector< Addr > > lastVaddrSimd
Definition: compute_unit.hh:336
gem5::BaseMMU::Write
@ Write
Definition: mmu.hh:56
gem5::Wavefront
Definition: wavefront.hh:60
gem5::ComputeUnit::ScalarDTLBPort::isStalled
bool isStalled() const
Definition: compute_unit.hh:761
gem5::FetchStage::init
void init()
Definition: fetch_stage.cc:56
gem5::ComputeUnit::ComputeUnitStats::numVecOpsExecutedF64
statistics::Scalar numVecOpsExecutedF64
Definition: compute_unit.hh:1097
gem5::ComputeUnit::ComputeUnitStats::dynamicGMemInstrCnt
statistics::Scalar dynamicGMemInstrCnt
Definition: compute_unit.hh:1076
gem5::HSAQueueEntry
Definition: hsa_queue_entry.hh:60
compute_unit.hh
gem5::ComputeUnit::firstMemUnit
int firstMemUnit() const
Definition: compute_unit.cc:250
gem5::ComputeUnit::pagesTouched
std::map< Addr, int > pagesTouched
Definition: compute_unit.hh:380
gpu_static_inst.hh
gem5::VectorMask
std::bitset< std::numeric_limits< unsigned long long >::digits > VectorMask
Definition: misc.hh:45
gem5::ComputeUnit::DataPort::SystemHubEvent
Definition: compute_unit.hh:534
gem5::ComputeUnit::scoreboardCheckStage
ScoreboardCheckStage scoreboardCheckStage
Definition: compute_unit.hh:281
gem5::ComputeUnit::stats
gem5::ComputeUnit::ComputeUnitStats stats
gem5::ComputeUnit::headTailMap
std::unordered_map< GPUDynInstPtr, Tick > headTailMap
Definition: compute_unit.hh:990
gem5::ComputeUnit::ComputeUnitStats::vpc_f16
statistics::Formula vpc_f16
Definition: compute_unit.hh:1115
gem5::floorLog2
static constexpr std::enable_if_t< std::is_integral_v< T >, int > floorLog2(T x)
Definition: intmath.hh:59
gem5::ComputeUnit::ComputeUnitStats::tlbLatency
statistics::Formula tlbLatency
Definition: compute_unit.hh:1064
gem5::simout
OutputDirectory simout
Definition: output.cc:62
gem5::ComputeUnit::DataPort::processMemRespEvent
void processMemRespEvent(PacketPtr pkt)
Definition: compute_unit.cc:1349
gem5::ComputeUnit::lastVaddrCU
std::vector< Addr > lastVaddrCU
Definition: compute_unit.hh:335
gem5::MemCmd::SwapResp
@ SwapResp
Definition: packet.hh:121
gem5::ComputeUnit::ScalarDTLBPort::recvTimingResp
bool recvTimingResp(PacketPtr pkt) override
Receive a timing response from the peer.
Definition: compute_unit.cc:1730
gem5::statistics::DataWrapVec::subname
Derived & subname(off_type index, const std::string &name)
Set the subfield name for the given index, and marks this stat to print at the end of simulation.
Definition: statistics.hh:401
gem5::ComputeUnit::ComputeUnitStats::sALUInsts
statistics::Scalar sALUInsts
Definition: compute_unit.hh:1002
gem5::Packet::isWrite
bool isWrite() const
Definition: packet.hh:594
gem5::ComputeUnit::exec
void exec()
Definition: compute_unit.cc:730
gem5::Wavefront::pendingFetch
bool pendingFetch
Definition: wavefront.hh:111
gem5::ComputeUnit::srfToScalarMemPipeBus
WaitClass srfToScalarMemPipeBus
Definition: compute_unit.hh:239
gem5::ComputeUnit::releaseBarrier
void releaseBarrier(int bar_id)
Definition: compute_unit.cc:707
gem5::ComputeUnit::ComputeUnitStats::instCyclesScMemPerSimd
statistics::Vector instCyclesScMemPerSimd
Definition: compute_unit.hh:1033
gem5::AMDGPUSystemHub::sendRequest
void sendRequest(PacketPtr pkt, Event *callback)
Definition: system_hub.cc:40
gem5::Shader::notifyCuSleep
void notifyCuSleep()
Definition: shader.cc:517
gem5::ComputeUnit::numYetToReachBarrier
int numYetToReachBarrier(int bar_id)
Definition: compute_unit.cc:658
gem5::HSAQueueEntry::wgId
int wgId(int dim) const
Definition: hsa_queue_entry.hh:233
gem5::VegaISA::w
Bitfield< 6 > w
Definition: pagetable.hh:59
gem5::GPUDispatcher::shader
Shader * shader
Definition: dispatcher.hh:85
gem5::ComputeUnit::ComputeUnitStats::ldsBankConflictDist
statistics::Distribution ldsBankConflictDist
Definition: compute_unit.hh:1070
gem5::ComputeUnit::getRefCounter
int32_t getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
Definition: compute_unit.cc:2050
gem5::EventManager::schedule
void schedule(Event &event, Tick when)
Definition: eventq.hh:1012
gem5::ComputeUnit::ITLBPort::SenderState::wavefront
Wavefront * wavefront
Definition: compute_unit.hh:795
gem5::ComputeUnit::ScalarDataPort::handleResponse
bool handleResponse(PacketPtr pkt)
Definition: compute_unit.cc:925
gem5::OutputDirectory::create
OutputStream * create(const std::string &name, bool binary=false, bool no_gz=false)
Creates a file in this directory (optionally compressed).
Definition: output.cc:210
gem5::ComputeUnit::ComputeUnitStats::kernargReads
statistics::Scalar kernargReads
Definition: compute_unit.hh:1054
gem5::csprintf
std::string csprintf(const char *format, const Args &...args)
Definition: cprintf.hh:161
gem5::Wavefront::S_STOPPED
@ S_STOPPED
Definition: wavefront.hh:66
gem5::X86ISA::PageShift
const Addr PageShift
Definition: page_size.hh:48
gem5::ComputeUnit::vrfToGlobalMemPipeBus
WaitClass vrfToGlobalMemPipeBus
Definition: compute_unit.hh:223
gem5::ComputeUnit::ComputeUnitStats::ldsNoFlatInsts
statistics::Scalar ldsNoFlatInsts
Definition: compute_unit.hh:1008
gem5::ComputeUnit::resetBarrier
void resetBarrier(int bar_id)
Definition: compute_unit.cc:693
gem5::ComputeUnit::ComputeUnitStats::globalReads
statistics::Scalar globalReads
Definition: compute_unit.hh:1036
gem5::ComputeUnit::ComputeUnitStats::groupMemInsts
statistics::Formula groupMemInsts
Definition: compute_unit.hh:1047
gem5::ComputeUnit::ComputeUnitStats::vpc
statistics::Formula vpc
Definition: compute_unit.hh:1114
gem5::RegisterManager::vrfPoolMgrs
std::vector< PoolManager * > vrfPoolMgrs
Definition: register_manager.hh:80
gem5::ComputeUnit::ComputeUnitStats::activeLanesPerLMemInstrDist
statistics::Distribution activeLanesPerLMemInstrDist
Definition: compute_unit.hh:1121
gem5::ComputeUnit::memPortTokens
TokenManager * memPortTokens
Definition: compute_unit.hh:509
gem5::GlobalMemPipeline::init
void init()
Definition: global_memory_pipeline.cc:57
gem5::ArmISA::i
Bitfield< 7 > i
Definition: misc_types.hh:67
gem5::ComputeUnit::numVectorSharedMemUnits
int numVectorSharedMemUnits
Definition: compute_unit.hh:227
gem5::ComputeUnit::shader
Shader * shader
Definition: compute_unit.hh:353
gem5::ComputeUnit::req_tick_latency
Tick req_tick_latency
Definition: compute_unit.hh:355
gem5::ComputeUnit::ComputeUnitStats::vectorMemWritesPerWF
statistics::Formula vectorMemWritesPerWF
Definition: compute_unit.hh:1015
sim_exit.hh
gem5::HSAQueueEntry::numScalarRegs
int numScalarRegs() const
Definition: hsa_queue_entry.hh:165
gem5::isPowerOf2
static constexpr bool isPowerOf2(const T &n)
Definition: intmath.hh:98
output.hh
gem5::ComputeUnit::ComputeUnitStats::headTailLatency
statistics::Distribution headTailLatency
Definition: compute_unit.hh:1135
gem5::ComputeUnit::scalarDataPort
ScalarDataPort scalarDataPort
Definition: compute_unit.hh:902
gem5::ComputeUnit::ComputeUnitStats::threadCyclesVALU
statistics::Scalar threadCyclesVALU
Definition: compute_unit.hh:1006
gem5::ComputeUnit::cu_id
int cu_id
Definition: compute_unit.hh:292
gem5::ComputeUnit::ComputeUnitStats::vectorMemReadsPerWF
statistics::Formula vectorMemReadsPerWF
Definition: compute_unit.hh:1017
gem5::statistics::DistBase::sample
void sample(const U &v, int n=1)
Add a value to the distribtion n times.
Definition: statistics.hh:1327
gem5::ComputeUnit::vrf
std::vector< VectorRegisterFile * > vrf
Definition: compute_unit.hh:295
gem5::ComputeUnit::ComputeUnitStats::instCyclesVMemPerSimd
statistics::Vector instCyclesVMemPerSimd
Definition: compute_unit.hh:1032
wavefront.hh
gem5::exitSimLoop
void exitSimLoop(const std::string &message, int exit_code, Tick when, Tick repeat, bool serialize)
Schedule an event to exit the simulation loop (returning to Python) at the end of the current cycle (...
Definition: sim_events.cc:88
gem5::TokenRequestPort::setTokenManager
void setTokenManager(TokenManager *_tokenManager)
Specify a token manger, which will handle tracking of tokens for a TokenRequestPort/ResponseRequestPo...
Definition: token_port.cc:72
gem5::ComputeUnit::SQCPort::recvReqRetry
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
Definition: compute_unit.cc:1016
gem5::ComputeUnit::ComputeUnitStats::groupReads
statistics::Scalar groupReads
Definition: compute_unit.hh:1045
gem5::GPUComputeDriver::setMtype
void setMtype(RequestPtr req)
Called by the compute units right before a request is issued to ruby.
Definition: gpu_compute_driver.cc:1022
gem5::ComputeUnit::ComputeUnitStats::vpc_f64
statistics::Formula vpc_f64
Definition: compute_unit.hh:1117
gem5::ComputeUnit::injectGlobalMemFence
void injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelMemSync, RequestPtr req=nullptr)
Definition: compute_unit.cc:1267
gem5::ComputeUnit::ScalarDataPort::computeUnit
ComputeUnit * computeUnit
Definition: compute_unit.hh:652
gem5::ComputeUnit::LDSPort::sendTimingReq
virtual bool sendTimingReq(PacketPtr pkt)
attempt to send this packet, either the port is already stalled, the request is nack'd and must stall...
Definition: compute_unit.cc:2127
gem5::ComputeUnit::locMemToVrfBus
WaitClass locMemToVrfBus
Definition: compute_unit.hh:229
gem5::MemCmd
Definition: packet.hh:76
gem5::statistics::pdf
const FlagsType pdf
Print the percent of the total that this entry represents.
Definition: info.hh:61
gem5::Shader::systemHub
AMDGPUSystemHub * systemHub
Definition: shader.hh:258
gem5::ComputeUnit::ComputeUnitStats::kernargMemInsts
statistics::Formula kernargMemInsts
Definition: compute_unit.hh:1056
gem5::ComputeUnit::ComputeUnitStats::flatVMemInstsPerWF
statistics::Formula flatVMemInstsPerWF
Definition: compute_unit.hh:1011
gem5::Packet::dataStatic
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
Definition: packet.hh:1175
gem5::Wavefront::setStatus
void setStatus(status_e newStatus)
Definition: wavefront.cc:550
gem5::LdsState::canReserve
bool canReserve(uint32_t x_size) const
can this much space be reserved for a workgroup?
Definition: lds_state.hh:499
gem5::ComputeUnit::numScalarMemUnits
int numScalarMemUnits
Definition: compute_unit.hh:235
gem5::GPUDispatcher::updateWbCounter
bool updateWbCounter(int kern_id, int val=-1)
update the counter of oustanding wb requests for the kernel kern_id: kernel id val: +1/-1,...
Definition: dispatcher.cc:268
gem5::ComputeUnit::DTLBPort::recvReqRetry
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
Definition: compute_unit.cc:1699
gem5::ComputeUnit::ITLBPort::SenderState
SenderState is information carried along with the packet throughout the TLB hierarchy.
Definition: compute_unit.hh:792
gem5::ArmISA::j
Bitfield< 24 > j
Definition: misc_types.hh:57
gem5::ComputeUnit
Definition: compute_unit.hh:201
gem5::ComputeUnit::ScalarDataPort::MemReqEvent::process
void process()
Definition: compute_unit.cc:1666
gem5::ComputeUnit::pageAccesses
pageDataStruct pageAccesses
Definition: compute_unit.hh:488
gem5::HSAQueueEntry::MAX_DIM
const static int MAX_DIM
Definition: hsa_queue_entry.hh:334
gem5::ComputeUnit::ScalarDataPort::retries
std::deque< PacketPtr > retries
Definition: compute_unit.hh:649
gem5::OutputStream::stream
std::ostream * stream() const
Get the output underlying output stream.
Definition: output.hh:62
gem5::ComputeUnit::ScalarDataPort::SenderState::_gpuDynInst
GPUDynInstPtr _gpuDynInst
Definition: compute_unit.hh:604
gem5::ComputeUnit::ComputeUnitStats::flatLDSInsts
statistics::Scalar flatLDSInsts
Definition: compute_unit.hh:1012
gem5::ComputeUnit::numScalarALUs
int numScalarALUs
Definition: compute_unit.hh:248
gem5::statistics::VectorDistribution::init
VectorDistribution & init(size_type size, Counter min, Counter max, Counter bkt)
Initialize storage and parameters for this distribution.
Definition: statistics.hh:2277
gem5::ComputeUnit::numVectorALUs
int numVectorALUs
Definition: compute_unit.hh:244
vector_register_file.hh
gem5::Packet::isRead
bool isRead() const
Definition: packet.hh:593
gem5::LocalMemPipeline::exec
void exec()
Definition: local_memory_pipeline.cc:52
gem5::ComputeUnit::startWavefront
void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, HSAQueueEntry *task, int bar_id, bool fetchContext=false)
Definition: compute_unit.cc:320
gem5::WaitClass::init
void init(ClockedObject *_clockedObject, uint64_t _numStages=0)
Definition: misc.hh:76
gem5::ComputeUnit::ComputeUnitStats::privReads
statistics::Scalar privReads
Definition: compute_unit.hh:1048
gem5::ComputeUnit::functionalTLB
bool functionalTLB
Definition: compute_unit.hh:345
gem5::ComputeUnit::numAtBarrier
int numAtBarrier(int bar_id)
Definition: compute_unit.cc:679
gem5::ComputeUnit::incNumAtBarrier
void incNumAtBarrier(int bar_id)
Definition: compute_unit.cc:672
gem5::statistics::Distribution::init
Distribution & init(Counter min, Counter max, Counter bkt)
Set the parameters of this distribution.
Definition: statistics.hh:2112
gem5::MemCmd::WriteResp
@ WriteResp
Definition: packet.hh:91
gem5::ComputeUnit::ComputeUnitStats::completedWfs
statistics::Scalar completedWfs
Definition: compute_unit.hh:1130
gem5::HSAQueueEntry::numVectorRegs
int numVectorRegs() const
Definition: hsa_queue_entry.hh:159
gem5::ComputeUnit::ComputeUnitStats::numVecOpsExecuted
statistics::Scalar numVecOpsExecuted
Definition: compute_unit.hh:1091
gem5::Named::name
virtual std::string name() const
Definition: named.hh:47
gem5::WFBarrier::InvalidID
static const int InvalidID
Definition: compute_unit.hh:97
gem5::VegaISA::p
Bitfield< 54 > p
Definition: pagetable.hh:70
gem5::ScheduleStage::init
void init()
Definition: schedule_stage.cc:76
gem5::ComputeUnit::decMaxBarrierCnt
void decMaxBarrierCnt(int bar_id)
Definition: compute_unit.cc:700
gem5::ComputeUnit::vectorSharedMemUnit
WaitClass vectorSharedMemUnit
Definition: compute_unit.hh:233
gem5::ComputeUnit::releaseWFsFromBarrier
void releaseWFsFromBarrier(int bar_id)
Definition: compute_unit.cc:715
gem5::ComputeUnit::ComputeUnitStats::activeLanesPerGMemInstrDist
statistics::Distribution activeLanesPerGMemInstrDist
Definition: compute_unit.hh:1120
DPRINTF
#define DPRINTF(x,...)
Definition: trace.hh:210
ADD_STAT
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
Definition: group.hh:75
gem5::ComputeUnit::scalarMemUnit
WaitClass scalarMemUnit
Definition: compute_unit.hh:241
gem5::Packet
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
Definition: packet.hh:294
gem5::ArmISA::d
Bitfield< 9 > d
Definition: misc_types.hh:64
gem5::ComputeUnit::execStage
ExecStage execStage
Definition: compute_unit.hh:283
gem5::ComputeUnit::ScalarDataPort::MemReqEvent::description
const char * description() const
Return a C string describing the event.
Definition: compute_unit.cc:1660
gem5::ComputeUnit::ComputeUnitStats::vALUInsts
statistics::Scalar vALUInsts
Definition: compute_unit.hh:1000
gem5::probing::Packet
ProbePointArg< PacketInfo > Packet
Packet probe point.
Definition: mem.hh:108
gem5::ComputeUnit::ComputeUnitStats::instCyclesVALU
statistics::Scalar instCyclesVALU
Definition: compute_unit.hh:1004
gem5::Tick
uint64_t Tick
Tick count type.
Definition: types.hh:58
gem5::Wavefront::wfSlotId
const int wfSlotId
Definition: wavefront.hh:96
gem5::RequestPtr
std::shared_ptr< Request > RequestPtr
Definition: request.hh:92
gem5::ComputeUnit::tickEvent
EventFunctionWrapper tickEvent
Definition: compute_unit.hh:288
gem5::LocalMemPipeline::isLMRespFIFOWrRdy
bool isLMRespFIFOWrRdy() const
Definition: local_memory_pipeline.hh:68
gem5::MemCmd::ReadReq
@ ReadReq
Definition: packet.hh:87
gem5::RR
@ RR
Definition: compute_unit.hh:75
gem5::MemCmd::MemSyncReq
@ MemSyncReq
Definition: packet.hh:124
gem5::ComputeUnit::vramRequestorId
RequestorID vramRequestorId()
Forward the VRAM requestor ID needed for device memory from shader.
Definition: compute_unit.cc:2096
process.hh
gem5::ComputeUnit::globalMemoryPipe
GlobalMemPipeline globalMemoryPipe
Definition: compute_unit.hh:284
gem5::ComputeUnit::resetRegisterPool
void resetRegisterPool()
Definition: compute_unit.cc:421
gem5::GpuTranslationState::tlbEntry
Serializable * tlbEntry
Definition: gpu_translation_state.hh:73
len
uint16_t len
Definition: helpers.cc:62
gem5::ComputeUnit::registerManager
RegisterManager * registerManager
Definition: compute_unit.hh:278
gem5::ComputeUnit::ComputeUnitStats::numInstrExecuted
statistics::Scalar numInstrExecuted
Definition: compute_unit.hh:1086
gem5::ComputeUnit::ScalarDataPort::recvTimingResp
bool recvTimingResp(PacketPtr pkt) override
Receive a timing response from the peer.
Definition: compute_unit.cc:919
gem5::HSAQueueEntry::isInvDone
bool isInvDone() const
Is invalidate done?
Definition: hsa_queue_entry.hh:378
gem5::ComputeUnit::ITLBPort::recvReqRetry
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
Definition: compute_unit.cc:1854
gem5::ComputeUnit::ScalarDTLBPort::stallPort
void stallPort()
Definition: compute_unit.hh:762
gem5::GlobalMemPipeline::isGMReqFIFOWrRdy
bool isGMReqFIFOWrRdy(uint32_t pendReqs=0) const
Definition: global_memory_pipeline.hh:95
gem5::Wavefront::S_BARRIER
@ S_BARRIER
WF is stalled at a barrier.
Definition: wavefront.hh:92
gem5::ComputeUnit::DataPort::createMemReqEvent
EventFunctionWrapper * createMemReqEvent(PacketPtr pkt)
Definition: compute_unit.cc:1617
gem5::ComputeUnit::ComputeUnitStats::vectorMemInstsPerKiloInst
statistics::Formula vectorMemInstsPerKiloInst
Definition: compute_unit.hh:1025
gem5::ComputeUnit::~ComputeUnit
~ComputeUnit()
Definition: compute_unit.cc:229
scalar_register_file.hh
gpu_dyn_inst.hh
gem5::ComputeUnit::DTLBPort::SenderState::_gpuDynInst
GPUDynInstPtr _gpuDynInst
Definition: compute_unit.hh:721
gem5::HSAQueueEntry::wgSize
int wgSize(int dim) const
Definition: hsa_queue_entry.hh:145
gem5::ComputeUnit::activeWaves
int activeWaves
Definition: compute_unit.hh:994
gem5::ComputeUnit::ComputeUnitStats::numTimesWgBlockedDueVgprAlloc
statistics::Scalar numTimesWgBlockedDueVgprAlloc
Definition: compute_unit.hh:1125
gem5::RegisterManager::srfPoolMgrs
std::vector< PoolManager * > srfPoolMgrs
Definition: register_manager.hh:79
gem5::GpuTranslationState::hitLevel
int hitLevel
Definition: gpu_translation_state.hh:85
gem5::HSAQueueEntry::codeAddr
Addr codeAddr() const
Definition: hsa_queue_entry.hh:201
gem5::LdsChunk
this represents a slice of the overall LDS, intended to be associated with an individual workgroup
Definition: lds_state.hh:56
gem5::ComputeUnit::mapWaveToScalarMem
int mapWaveToScalarMem(Wavefront *w) const
Definition: compute_unit.cc:298
gpu_command_processor.hh
gem5::ComputeUnit::mapWaveToGlobalMem
int mapWaveToGlobalMem(Wavefront *w) const
Definition: compute_unit.cc:282
gem5::roundDown
static constexpr T roundDown(const T &val, const U &align)
This function is used to align addresses in memory.
Definition: intmath.hh:279
gem5::ComputeUnit::deleteFromPipeMap
void deleteFromPipeMap(Wavefront *w)
Definition: compute_unit.cc:518
gpu_translation_state.hh
gem5::ExecStage::init
void init()
Definition: exec_stage.cc:59
gem5::ComputeUnit::doFlush
void doFlush(GPUDynInstPtr gpuDynInst)
trigger flush operation in the cu
Definition: compute_unit.cc:413
gem5::ComputeUnit::DataPort::SenderState::port_index
PortID port_index
Definition: compute_unit.hh:524
gem5::ComputeUnit::init
virtual void init() override
init() is called after all C++ SimObjects have been created and all ports are connected.
Definition: compute_unit.cc:763
gem5::HSAQueueEntry::globalWgId
int globalWgId() const
Definition: hsa_queue_entry.hh:247
gem5::HSAQueueEntry::gridSize
int gridSize(int dim) const
Definition: hsa_queue_entry.hh:152
gem5::ComputeUnit::scalarALUs
std::vector< WaitClass > scalarALUs
Definition: compute_unit.hh:249
gem5::ComputeUnit::DataPort::SenderState::_gpuDynInst
GPUDynInstPtr _gpuDynInst
Definition: compute_unit.hh:523
gem5::ComputeUnit::memPort
std::vector< DataPort > memPort
The memory port for SIMD data accesses.
Definition: compute_unit.hh:898
gem5::OLDEST
@ OLDEST
Definition: compute_unit.hh:74
gem5::ComputeUnit::ComputeUnitStats::scalarMemReadsPerKiloInst
statistics::Formula scalarMemReadsPerKiloInst
Definition: compute_unit.hh:1026
gem5::X86ISA::pf
Bitfield< 2 > pf
Definition: misc.hh:555
gem5::ComputeUnit::ComputeUnitStats::vectorMemReadsPerKiloInst
statistics::Formula vectorMemReadsPerKiloInst
Definition: compute_unit.hh:1023
gem5::Packet::cmd
MemCmd cmd
The command field of the packet.
Definition: packet.hh:372
gem5::ComputeUnit::DTLBPort::SenderState::portIndex
PortID portIndex
Definition: compute_unit.hh:725
gem5::ComputeUnit::perLaneTLB
bool perLaneTLB
Definition: compute_unit.hh:329
gem5::ComputeUnit::lastMemUnit
int lastMemUnit() const
Definition: compute_unit.cc:257
gem5::LocalMemPipeline::isLMReqFIFOWrRdy
bool isLMReqFIFOWrRdy(uint32_t pendReqs=0) const
Definition: local_memory_pipeline.hh:74
gem5::ComputeUnit::ScalarDTLBPort::SenderState::_gpuDynInst
GPUDynInstPtr _gpuDynInst
Definition: compute_unit.hh:755
gem5::ComputeUnit::ComputeUnitStats::groupWrites
statistics::Scalar groupWrites
Definition: compute_unit.hh:1046
gem5::Addr
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition: types.hh:147
gem5::ComputeUnit::ComputeUnitStats::globalWrites
statistics::Scalar globalWrites
Definition: compute_unit.hh:1037
gem5::ComputeUnit::ComputeUnitStats::vALUInstsPerWF
statistics::Formula vALUInstsPerWF
Definition: compute_unit.hh:1001
gem5::LdsState::increaseRefCounter
int increaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
use the dynamic wave id to create or just increase the reference count
Definition: lds_state.hh:316
tlb.hh
gem5::Packet::senderState
SenderState * senderState
This packet's sender state.
Definition: packet.hh:545
gem5::ComputeUnit::ComputeUnitStats::numTimesWgBlockedDueSgprAlloc
statistics::Scalar numTimesWgBlockedDueSgprAlloc
Definition: compute_unit.hh:1127
gem5::ComputeUnit::ComputeUnitStats::numVecOpsExecutedF16
statistics::Scalar numVecOpsExecutedF16
Definition: compute_unit.hh:1093
gem5::ComputeUnit::barrierSlot
WFBarrier & barrierSlot(int bar_id)
Definition: compute_unit.hh:420
name
const std::string & name()
Definition: trace.cc:48
gem5::ComputeUnit::exitCallback
void exitCallback()
Definition: compute_unit.cc:2003
gem5::ComputeUnit::SQCPort::recvTimingResp
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
Definition: compute_unit.cc:1002
gem5::ComputeUnit::ComputeUnitStats::privMemInsts
statistics::Formula privMemInsts
Definition: compute_unit.hh:1050
gem5::ComputeUnit::mapWaveToScalarAlu
int mapWaveToScalarAlu(Wavefront *w) const
Definition: compute_unit.cc:264
gem5::GPUDynInstPtr
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition: misc.hh:49
gem5::ComputeUnit::hasDispResources
bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg)
Definition: compute_unit.cc:530
gem5::ComputeUnit::getFreeBarrierId
int getFreeBarrierId()
Definition: compute_unit.hh:427
gem5::ComputeUnit::wfSize
int wfSize() const
Definition: compute_unit.hh:396
gem5::ClockedObject
The ClockedObject class extends the SimObject with a clock and accessor functions to relate ticks to ...
Definition: clocked_object.hh:234
gem5::GpuTranslationState::saved
Packet::SenderState * saved
Definition: gpu_translation_state.hh:86
gem5::ComputeUnit::pipeMap
std::unordered_set< uint64_t > pipeMap
Definition: compute_unit.hh:276
gem5::ComputeUnit::LDSPort::recvReqRetry
virtual void recvReqRetry()
the bus is telling the port that there is now space so retrying stalled requests should work now this...
Definition: compute_unit.cc:2169
gem5::MemCmd::toString
const std::string & toString() const
Return the string to a cmd given by idx.
Definition: packet.hh:276
gem5::Shader::timingSim
bool timingSim
Definition: shader.hh:221
gem5::Request::FLUSH_L2
@ FLUSH_L2
Definition: request.hh:329
gem5::Process
Definition: process.hh:67
gem5::GPUDispatcher::notifyWgCompl
void notifyWgCompl(Wavefront *wf)
When an end program instruction detects that the last WF in a WG has completed it will call this meth...
Definition: dispatcher.cc:297
gem5::EventFunctionWrapper
Definition: eventq.hh:1136
gem5::ThreadContext::getProcessPtr
virtual Process * getProcessPtr()=0
gem5::Clocked::nextCycle
Tick nextCycle() const
Based on the clock of the object, determine the start tick of the first cycle that is at least one cy...
Definition: clocked_object.hh:213
gem5::FullSystem
bool FullSystem
The FullSystem variable can be used to determine the current mode of simulation.
Definition: root.cc:220
gem5::ComputeUnit::updateInstStats
void updateInstStats(GPUDynInstPtr gpuDynInst)
Definition: compute_unit.cc:1884
gem5::ComputeUnit::ComputeUnitStats::numALUInstsExecuted
statistics::Formula numALUInstsExecuted
Definition: compute_unit.hh:1123
gem5::ComputeUnit::ComputeUnitStats::instCyclesLdsPerSimd
statistics::Vector instCyclesLdsPerSimd
Definition: compute_unit.hh:1034
gem5::ComputeUnit::ComputeUnitStats::argReads
statistics::Scalar argReads
Definition: compute_unit.hh:1039
gem5::ComputeUnit::ComputeUnitStats::globalMemInsts
statistics::Formula globalMemInsts
Definition: compute_unit.hh:1038
gem5::ComputeUnit::ComputeUnitStats::wgBlockedDueLdsAllocation
statistics::Scalar wgBlockedDueLdsAllocation
Definition: compute_unit.hh:1082
gem5::ComputeUnit::LDSPort::recvTimingResp
virtual bool recvTimingResp(PacketPtr pkt)
get the result of packets sent to the LDS when they return
Definition: compute_unit.cc:2105
panic_if
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Definition: logging.hh:214
gem5::ComputeUnit::numVectorGlobalMemUnits
int numVectorGlobalMemUnits
Definition: compute_unit.hh:219
gem5::Wavefront::barrierId
void barrierId(int bar_id)
Definition: wavefront.cc:1446
gem5::ComputeUnit::Params
ComputeUnitParams Params
Definition: compute_unit.hh:290
gem5::Wavefront::S_RETURNING
@ S_RETURNING
Definition: wavefront.hh:68
gem5::ComputeUnit::ComputeUnitStats::ipc
statistics::Formula ipc
Definition: compute_unit.hh:1118
gem5::RegisterManager::allocateRegisters
void allocateRegisters(Wavefront *w, int vectorDemand, int scalarDemand)
Definition: register_manager.cc:122
gem5::ComputeUnit::updatePageDivergenceDist
void updatePageDivergenceDist(Addr addr)
Definition: compute_unit.cc:1992
gem5::ComputeUnit::vectorRegsReserved
std::vector< int > vectorRegsReserved
Definition: compute_unit.hh:369
gem5::ComputeUnit::ComputeUnitStats::readonlyWrites
statistics::Scalar readonlyWrites
Definition: compute_unit.hh:1052
gem5::Shader::dispatcher
GPUDispatcher & dispatcher()
Definition: shader.cc:99
gem5::ComputeUnit::ComputeUnitStats::waveLevelParallelism
statistics::Distribution waveLevelParallelism
Definition: compute_unit.hh:1058
gem5::ComputeUnit::ComputeUnitStats::scalarMemWrites
statistics::Scalar scalarMemWrites
Definition: compute_unit.hh:1018
gem5::ComputeUnit::ComputeUnitStats::controlFlowDivergenceDist
statistics::Distribution controlFlowDivergenceDist
Definition: compute_unit.hh:1119
gem5::ScheduleStage::exec
void exec()
Definition: schedule_stage.cc:90
gem5::ComputeUnit::ComputeUnitStats::vectorMemWrites
statistics::Scalar vectorMemWrites
Definition: compute_unit.hh:1014
gem5::ComputeUnit::insertInPipeMap
void insertInPipeMap(Wavefront *w)
Definition: compute_unit.cc:509
gem5::ComputeUnit::ScalarDTLBPort::SenderState
Definition: compute_unit.hh:752
gem5::statistics::oneline
const FlagsType oneline
Print all values on a single line.
Definition: info.hh:71
gem5::GpuTranslationState::tlbMode
BaseMMU::Mode tlbMode
Definition: gpu_translation_state.hh:61
gem5::GPUDispatcher::updateInvCounter
void updateInvCounter(int kern_id, int val=-1)
update the counter of oustanding inv requests for the kernel kern_id: kernel id val: +1/-1,...
Definition: dispatcher.cc:248
gem5::ComputeUnit::mapWaveToLocalMem
int mapWaveToLocalMem(Wavefront *w) const
Definition: compute_unit.cc:290
gem5::ComputeUnit::ldsPort
LDSPort ldsPort
The port to access the Local Data Store Can be connected to a LDS object.
Definition: compute_unit.hh:887
gem5::MemCmd::ReadResp
@ ReadResp
Definition: packet.hh:88
gem5::ComputeUnit::ComputeUnitStats::flatLDSInstsPerWF
statistics::Formula flatLDSInstsPerWF
Definition: compute_unit.hh:1013
gem5::WFBarrier
WF barrier slots.
Definition: compute_unit.hh:90
gem5::ComputeUnit::isDone
bool isDone() const
Definition: compute_unit.cc:2020
gem5::ComputeUnit::LDSPort::SenderState::getMemInst
GPUDynInstPtr getMemInst() const
Definition: compute_unit.hh:849
gem5::Request::INV_L1
@ INV_L1
Definition: request.hh:324
gem5::ComputeUnit::ComputeUnitStats::hitsPerTLBLevel
statistics::Vector hitsPerTLBLevel
Definition: compute_unit.hh:1067
gem5::Shader::gpuCmdProc
GPUCommandProcessor & gpuCmdProc
Definition: shader.hh:256
gem5::ComputeUnit::maxBarrierCnt
int maxBarrierCnt(int bar_id)
Definition: compute_unit.cc:686
gem5::Shader::n_wf
int n_wf
Definition: shader.hh:235
gem5::ComputeUnit::scalarRegsReserved
std::vector< int > scalarRegsReserved
Definition: compute_unit.hh:371
gem5::ComputeUnit::fillKernelState
void fillKernelState(Wavefront *w, HSAQueueEntry *task)
Definition: compute_unit.cc:306
gem5::MemCmd::WriteReq
@ WriteReq
Definition: packet.hh:90
gem5::ComputeUnit::lds
LdsState & lds
Definition: compute_unit.hh:473
gem5::ComputeUnit::DTLBPort::SenderState
SenderState is information carried along with the packet throughout the TLB hierarchy.
Definition: compute_unit.hh:718
gem5::ComputeUnit::vrfToLocalMemPipeBus
WaitClass vrfToLocalMemPipeBus
Definition: compute_unit.hh:231
gem5::statistics::Group
Statistics container.
Definition: group.hh:92
gem5::ComputeUnit::ComputeUnitStats::execRateDist
statistics::Distribution execRateDist
Definition: compute_unit.hh:1089
gem5::ComputeUnit::tlbPort
std::vector< DTLBPort > tlbPort
Definition: compute_unit.hh:900
gem5::ComputeUnit::ComputeUnitStats::numVecOpsExecutedF32
statistics::Scalar numVecOpsExecutedF32
Definition: compute_unit.hh:1095
gem5::ComputeUnit::isVectorAluIdle
bool isVectorAluIdle(uint32_t simdId) const
Definition: compute_unit.cc:2057
gem5::ComputeUnit::numScalarRegsPerSimd
int numScalarRegsPerSimd
Definition: compute_unit.hh:375
gem5::ComputeUnit::vectorALUs
std::vector< WaitClass > vectorALUs
Definition: compute_unit.hh:245
gem5::Request::KERNEL
@ KERNEL
The request should be marked with KERNEL.
Definition: request.hh:183
gem5::ComputeUnit::sendScalarRequest
void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt)
Definition: compute_unit.cc:1240
gem5::ComputeUnit::countPages
bool countPages
Definition: compute_unit.hh:351
gem5::ComputeUnit::freeBarrierIds
std::unordered_set< int > freeBarrierIds
A set used to easily retrieve a free barrier ID.
Definition: compute_unit.hh:985
sc_core::SC_NONE
@ SC_NONE
Definition: sc_report.hh:50
gem5::Wavefront::instructionBuffer
std::deque< GPUDynInstPtr > instructionBuffer
Definition: wavefront.hh:109
gem5::ComputeUnit::ComputeUnitStats::ComputeUnitStats
ComputeUnitStats(statistics::Group *parent, int n_wf)
Definition: compute_unit.cc:2200
gem5::RegisterManager::canAllocateSgprs
bool canAllocateSgprs(int simdId, int nWfs, int demandPerWf)
Definition: register_manager.cc:115
gem5::ComputeUnit::scalarMemToSrfBus
WaitClass scalarMemToSrfBus
Definition: compute_unit.hh:237
gem5::MipsISA::k
Bitfield< 23 > k
Definition: dt_constants.hh:81
gem5::ComputeUnit::scalarDTLBPort
ScalarDTLBPort scalarDTLBPort
Definition: compute_unit.hh:904
gem5::ComputeUnit::ComputeUnitStats::pageDivergenceDist
statistics::Distribution pageDivergenceDist
Definition: compute_unit.hh:1074
gem5::Shader::max_valu_insts
int64_t max_valu_insts
Definition: shader.hh:260
gem5::RequestorID
uint16_t RequestorID
Definition: request.hh:95
gem5::ExecStage::exec
void exec()
Definition: exec_stage.cc:152
gem5::GPUDispatcher
Definition: dispatcher.hh:62
dispatcher.hh
DPRINTFN
#define DPRINTFN(...)
Definition: trace.hh:238
gem5::ComputeUnit::ScalarDataPort::MemReqEvent
Definition: compute_unit.hh:608
gem5::statistics::DataWrap::flags
Derived & flags(Flags _flags)
Set the flags and marks this stat to print at the end of simulation.
Definition: statistics.hh:357
gem5::MipsISA::vaddr
vaddr
Definition: pra_constants.hh:278
gem5::ComputeUnit::ComputeUnitStats::argWrites
statistics::Scalar argWrites
Definition: compute_unit.hh:1040
gem5::ComputeUnit::ComputeUnitStats::vpc_f32
statistics::Formula vpc_f32
Definition: compute_unit.hh:1116
gem5::HSAQueueEntry::ldsSize
int ldsSize() const
Definition: hsa_queue_entry.hh:213
gem5::Packet::getAddr
Addr getAddr() const
Definition: packet.hh:807
gem5::EventBase::CPU_Tick_Pri
static const Priority CPU_Tick_Pri
CPU ticks must come after other associated CPU events (such as writebacks).
Definition: eventq.hh:207
gem5::RegisterManager::canAllocateVgprs
bool canAllocateVgprs(int simdId, int nWfs, int demandPerWf)
Definition: register_manager.cc:109
gem5::ComputeUnit::sendToLds
bool sendToLds(GPUDynInstPtr gpuDynInst)
send a general request to the LDS make sure to look at the return value here as your request might be...
Definition: compute_unit.cc:2076
gem5::X86ISA::PageBytes
const Addr PageBytes
Definition: page_size.hh:49
gem5::registerExitCallback
void registerExitCallback(const std::function< void()> &callback)
Register an exit callback.
Definition: core.cc:143
gem5::Wavefront::getStatus
status_e getStatus()
Definition: wavefront.hh:137
gem5::ComputeUnit::ComputeUnitStats::dynamicLMemInstrCnt
statistics::Scalar dynamicLMemInstrCnt
Definition: compute_unit.hh:1079
gem5::ComputeUnit::scalarMemoryPipe
ScalarMemPipeline scalarMemoryPipe
Definition: compute_unit.hh:286
fatal_if
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condition is true, while allowing the programmer to specify diagnostic printout.
Definition: logging.hh:236
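A one-line sketch of the idiom, with an illustrative condition and message:
// Equivalent to: if (simdWidth == 0) fatal("simd_width must be non-zero");
fatal_if(simdWidth == 0, "simd_width must be non-zero");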
page_table.hh
gem5
Definition: gpu_translation_state.hh:37
gem5::ComputeUnit::DataPort::recvTimingResp
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
Definition: compute_unit.cc:812
gem5::ComputeUnit::vectorGlobalMemUnit
WaitClass vectorGlobalMemUnit
Definition: compute_unit.hh:225
gem5::MemCmd::MemSyncResp
@ MemSyncResp
Definition: packet.hh:125
gem5::ComputeUnit::ScalarDataPort::SystemHubEvent
Definition: compute_unit.hh:625
gem5::ScoreboardCheckStage::exec
void exec()
Definition: scoreboard_check_stage.cc:248
gem5::ComputeUnit::ComputeUnitStats::readonlyMemInsts
statistics::Formula readonlyMemInsts
Definition: compute_unit.hh:1053
gem5::ComputeUnit::ScalarDataPort::recvReqRetry
void recvReqRetry() override
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on the peer) and was unsuccessful.
Definition: compute_unit.cc:963
gem5::ComputeUnit::ComputeUnitStats::scalarMemReads
statistics::Scalar scalarMemReads
Definition: compute_unit.hh:1020
gem5::ComputeUnit::ComputeUnitStats::totalCycles
statistics::Scalar totalCycles
Definition: compute_unit.hh:1113
gem5::statistics::VectorBase::init
Derived & init(size_type size)
Set this vector to have the given size.
Definition: statistics.hh:1039
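Vector stats are sized this way inside stat-group constructors; a hedged sketch (the stat name is illustrative, oneline is a real gem5 flag):
instCyclesPerSimd
    .init(numVectorALUs)            // one bucket per SIMD
    .flags(statistics::oneline);    // print the whole vector on one line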
gem5::ComputeUnit::dispWorkgroup
void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg)
Definition: compute_unit.cc:431
gem5::HSAQueueEntry::dispatchId
int dispatchId() const
Definition: hsa_queue_entry.hh:177
gem5::ArmISA::stride
Bitfield< 21, 20 > stride
Definition: misc_types.hh:504
gem5::ComputeUnit::ComputeUnitStats::tlbCycles
statistics::Scalar tlbCycles
Definition: compute_unit.hh:1063
gem5::ComputeUnit::mapWaveToScalarAluGlobalIdx
int mapWaveToScalarAluGlobalIdx(Wavefront *w) const
Definition: compute_unit.cc:275
gem5::ComputeUnit::gmTokenPort
GMTokenPort gmTokenPort
Definition: compute_unit.hh:510
gem5::LdsState::reserveSpace
LdsChunk * reserveSpace(const uint32_t dispatchId, const uint32_t wgId, const uint32_t size)
assign a parent and request that this amount of space be set aside for this wgid
Definition: lds_state.hh:384
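A sketch of the dispatch-time call, assuming a task (an HSAQueueEntry) and its workgroup ID wgId are in hand; the arguments simply follow the signature above:
LdsChunk *ldsChunk =
    lds.reserveSpace(task->dispatchId(), wgId, task->ldsSize());
// ldsChunk is this workgroup's private window into the LDS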
gem5::Shader::total_valu_insts
int64_t total_valu_insts
Definition: shader.hh:261
gem5::ComputeUnit::DataPort::recvReqRetry
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on the peer) and was unsuccessful.
Definition: compute_unit.cc:975
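Both retry hooks follow gem5's standard port handshake: a sendTimingReq() that returned false is re-issued when the peer calls back. A hedged sketch of the pattern (MyPort and retries are illustrative names):
void MyPort::recvReqRetry()
{
    while (!retries.empty()) {
        PacketPtr pkt = retries.front();
        if (!sendTimingReq(pkt))
            break;              // still blocked; wait for the next retry
        retries.pop_front();    // accepted; try the next queued packet
    }
}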
gem5::ComputeUnit::doInvalidate
void doInvalidate(RequestPtr req, int kernId)
trigger an invalidate operation in the CU
Definition: compute_unit.cc:394
gem5::ComputeUnit::ComputeUnitStats::spillReads
statistics::Scalar spillReads
Definition: compute_unit.hh:1042
gem5::WaitClass::rdy
bool rdy(Cycles cycles=Cycles(0)) const
Definition: misc.hh:93
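Readiness checks like this typically gate instruction issue; a small sketch using the vectorGlobalMemUnit WaitClass listed elsewhere in this index:
// Cycles(0), the default, asks whether the unit is ready right now.
if (vectorGlobalMemUnit.rdy()) {
    // safe to issue to the global memory pipeline this cycle
}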
gem5::ComputeUnit::allAtBarrier
bool allAtBarrier(int bar_id)
Definition: compute_unit.cc:665
gem5::ComputeUnit::numWfsToSched
std::vector< int > numWfsToSched
Number of WFs to schedule to each SIMD.
Definition: compute_unit.hh:366
gem5::ComputeUnit::ComputeUnitStats::ldsNoFlatInstsPerWF
statistics::Formula ldsNoFlatInstsPerWF
Definition: compute_unit.hh:1009
gem5::GlobalMemPipeline::exec
void exec()
Definition: global_memory_pipeline.cc:111
gem5::KernelLaunchStaticInst
Definition: gpu_static_inst.hh:325
gem5::ComputeUnit::DataPort::SenderState
Definition: compute_unit.hh:521
gem5::Packet::getSize
unsigned getSize() const
Definition: packet.hh:817
gem5::Event::scheduled
bool scheduled() const
Determine if the current event is scheduled.
Definition: eventq.hh:458
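This check backs the usual guard against double-scheduling an event, the same pattern compute_unit.cc uses for its tick event; a minimal sketch:
if (!tickEvent.scheduled())
    schedule(tickEvent, nextCycle());   // arm the next tick exactly once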
gem5::ComputeUnit::ScalarDataPort::SenderState
Definition: compute_unit.hh:596
gem5::GlobalMemPipeline::handleResponse
void handleResponse(GPUDynInstPtr gpuDynInst)
This method handles responses sent to this GM pipeline by the CU.
Definition: global_memory_pipeline.cc:305
gem5::ComputeUnit::scheduleStage
ScheduleStage scheduleStage
Definition: compute_unit.hh:282
gem5::Wavefront::dropFetch
bool dropFetch
Definition: wavefront.hh:112
gem5::Shader::impl_kern_end_rel
int impl_kern_end_rel
Definition: shader.hh:227
panic
#define panic(...)
This implements a cprintf based panic() function.
Definition: logging.hh:188
gem5::GPUCommandProcessor::driver
GPUComputeDriver * driver()
Definition: gpu_command_processor.cc:318
gem5::ComputeUnit::DataPort::createMemRespEvent
EventFunctionWrapper * createMemRespEvent(PacketPtr pkt)
Definition: compute_unit.cc:1625
gem5::Clocked::clockPeriod
Tick clockPeriod() const
Definition: clocked_object.hh:217
gem5::ComputeUnit::requestorId
RequestorID requestorId()
Definition: compute_unit.hh:462
gem5::X86ISA::addr
Bitfield< 3 > addr
Definition: types.hh:84
gem5::ComputeUnit::ComputeUnitStats::scalarMemInstsPerKiloInst
statistics::Formula scalarMemInstsPerKiloInst
Definition: compute_unit.hh:1028
gem5::ComputeUnit::ComputeUnitStats::vectorMemWritesPerKiloInst
statistics::Formula vectorMemWritesPerKiloInst
Definition: compute_unit.hh:1024
gem5::SenderState
RubyTester::SenderState SenderState
Definition: Check.cc:40
gem5::ComputeUnit::numExeUnits
int numExeUnits() const
Definition: compute_unit.cc:242
gem5::Packet::getPtr
T * getPtr()
get a pointer to the packet's data.
Definition: packet.hh:1225
gem5::ComputeUnit::glbMemToVrfBus
WaitClass glbMemToVrfBus
Definition: compute_unit.hh:221
gem5::Wavefront::simdId
const int simdId
Definition: wavefront.hh:99
gem5::MipsISA::vpc
Bitfield< 1 > vpc
Definition: mt_constants.hh:44

Generated on Sun Jul 30 2023 01:56:56 for gem5 by doxygen 1.8.17