gem5  v21.2.1.0
compute_unit.cc
1 /*
2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from this
17  * software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include "gpu-compute/compute_unit.hh"
33 
34 #include <limits>
35 
37 #include "base/output.hh"
38 #include "debug/GPUDisp.hh"
39 #include "debug/GPUExec.hh"
40 #include "debug/GPUFetch.hh"
41 #include "debug/GPUMem.hh"
42 #include "debug/GPUPort.hh"
43 #include "debug/GPUPrefetch.hh"
44 #include "debug/GPUReg.hh"
45 #include "debug/GPURename.hh"
46 #include "debug/GPUSync.hh"
47 #include "debug/GPUTLB.hh"
53 #include "gpu-compute/shader.hh"
56 #include "gpu-compute/wavefront.hh"
57 #include "mem/page_table.hh"
58 #include "sim/process.hh"
59 #include "sim/sim_exit.hh"
60 
61 namespace gem5
62 {
63 
64 ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p),
65  numVectorGlobalMemUnits(p.num_global_mem_pipes),
66  numVectorSharedMemUnits(p.num_shared_mem_pipes),
67  numScalarMemUnits(p.num_scalar_mem_pipes),
68  numVectorALUs(p.num_SIMDs),
69  numScalarALUs(p.num_scalar_cores),
70  vrfToCoalescerBusWidth(p.vrf_to_coalescer_bus_width),
71  coalescerToVrfBusWidth(p.coalescer_to_vrf_bus_width),
72  registerManager(p.register_manager),
73  fetchStage(p, *this),
74  scoreboardCheckStage(p, *this, scoreboardCheckToSchedule),
75  scheduleStage(p, *this, scoreboardCheckToSchedule, scheduleToExecute),
76  execStage(p, *this, scheduleToExecute),
77  globalMemoryPipe(p, *this),
78  localMemoryPipe(p, *this),
79  scalarMemoryPipe(p, *this),
80  tickEvent([this]{ exec(); }, "Compute unit tick event",
81  false, Event::CPU_Tick_Pri),
82  cu_id(p.cu_id),
83  vrf(p.vector_register_file), srf(p.scalar_register_file),
84  simdWidth(p.simd_width),
85  spBypassPipeLength(p.spbypass_pipe_length),
86  dpBypassPipeLength(p.dpbypass_pipe_length),
87  scalarPipeStages(p.scalar_pipe_length),
88  operandNetworkLength(p.operand_network_length),
89  issuePeriod(p.issue_period),
90  vrf_gm_bus_latency(p.vrf_gm_bus_latency),
91  srf_scm_bus_latency(p.srf_scm_bus_latency),
92  vrf_lm_bus_latency(p.vrf_lm_bus_latency),
93  perLaneTLB(p.perLaneTLB), prefetchDepth(p.prefetch_depth),
94  prefetchStride(p.prefetch_stride), prefetchType(p.prefetch_prev_type),
95  debugSegFault(p.debugSegFault),
96  functionalTLB(p.functionalTLB), localMemBarrier(p.localMemBarrier),
97  countPages(p.countPages),
98  req_tick_latency(p.mem_req_latency * p.clk_domain->clockPeriod()),
99  resp_tick_latency(p.mem_resp_latency * p.clk_domain->clockPeriod()),
100  _requestorId(p.system->getRequestorId(this, "ComputeUnit")),
101  lds(*p.localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
102  ldsPort(csprintf("%s-port", name()), this),
103  scalarDataPort(csprintf("%s-port", name()), this),
104  scalarDTLBPort(csprintf("%s-port", name()), this),
105  sqcPort(csprintf("%s-port", name()), this),
106  sqcTLBPort(csprintf("%s-port", name()), this),
107  _cacheLineSize(p.system->cacheLineSize()),
108  _numBarrierSlots(p.num_barrier_slots),
109  globalSeqNum(0), wavefrontSize(p.wf_size),
110  scoreboardCheckToSchedule(p),
111  scheduleToExecute(p),
112  stats(this, p.n_wf)
113 {
123  fatal_if(p.wf_size > std::numeric_limits<unsigned long long>::digits ||
124  p.wf_size <= 0,
125  "WF size is larger than the host can support");
126  fatal_if(!isPowerOf2(wavefrontSize),
127  "Wavefront size should be a power of 2");
128  // calculate how many cycles a vector load or store will need to transfer
129  // its data over the corresponding buses
130  numCyclesPerStoreTransfer =
131  (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
132  (double)vrfToCoalescerBusWidth);
133 
134  numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
135  / coalescerToVrfBusWidth;
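// [Editorial note] Unlike the store path above, which rounds up with
// ceil(), this load path uses truncating integer division: when
// coalescer_to_vrf_bus_width does not evenly divide wfSize() *
// sizeof(uint32_t) bytes, the computed load-transfer cycle count is
// rounded down rather than up.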
136 
137  // Initialization: all WF slots are assumed STOPPED
138  idleWfs = p.n_wf * numVectorALUs;
139  lastVaddrWF.resize(numVectorALUs);
140  wfList.resize(numVectorALUs);
141 
142  wfBarrierSlots.resize(p.num_barrier_slots, WFBarrier());
143 
144  for (int i = 0; i < p.num_barrier_slots; ++i) {
145  freeBarrierIds.insert(i);
146  }
147 
148  for (int j = 0; j < numVectorALUs; ++j) {
149  lastVaddrWF[j].resize(p.n_wf);
150 
151  for (int i = 0; i < p.n_wf; ++i) {
152  lastVaddrWF[j][i].resize(wfSize());
153 
154  wfList[j].push_back(p.wavefronts[j * p.n_wf + i]);
155  wfList[j][i]->setParent(this);
156 
157  for (int k = 0; k < wfSize(); ++k) {
158  lastVaddrWF[j][i][k] = 0;
159  }
160  }
161  }
162 
163  lastVaddrSimd.resize(numVectorALUs);
164 
165  for (int i = 0; i < numVectorALUs; ++i) {
166  lastVaddrSimd[i].resize(wfSize(), 0);
167  }
168 
169  lastVaddrCU.resize(wfSize());
170 
171  lds.setParent(this);
172 
173  if (p.execPolicy == "OLDEST-FIRST") {
174  exec_policy = EXEC_POLICY::OLDEST;
175  } else if (p.execPolicy == "ROUND-ROBIN") {
176  exec_policy = EXEC_POLICY::RR;
177  } else {
178  fatal("Invalid WF execution policy (CU)\n");
179  }
180 
181  for (int i = 0; i < p.port_memory_port_connection_count; ++i) {
182  memPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
183  }
184 
185  for (int i = 0; i < p.port_translation_port_connection_count; ++i) {
186  tlbPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
187  }
188 
189  // Setup tokens for response ports. The number of tokens in memPortTokens
190  // is the total token count for the entire vector port (i.e., this CU).
191  memPortTokens = new TokenManager(p.max_cu_tokens);
192 
193  registerExitCallback([this]() { exitCallback(); });
194 
195  lastExecCycle.resize(numVectorALUs, 0);
196 
197  for (int i = 0; i < vrf.size(); ++i) {
198  vrf[i]->setParent(this);
199  }
200  for (int i = 0; i < srf.size(); ++i) {
201  srf[i]->setParent(this);
202  }
203  numVecRegsPerSimd = vrf[0]->numRegs();
204  numScalarRegsPerSimd = srf[0]->numRegs();
205 
206  registerManager->setParent(this);
207 
208  activeWaves = 0;
209 
210  instExecPerSimd.resize(numVectorALUs, 0);
211 
212  // Calculate the number of bits to address a cache line
213  panic_if(!isPowerOf2(_cacheLineSize),
214  "Cache line size should be a power of two.");
215  cacheLineBits = floorLog2(_cacheLineSize);
216 }
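// [Editorial sketch] The bus-width arithmetic in the constructor above
// converts one wavefront's worth of 32-bit operands into whole bus cycles.
// The standalone snippet below illustrates the same computation with
// hypothetical parameter values (64 lanes, 32-byte bus); it is not part of
// compute_unit.cc:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const int wfSize = 64;          // lanes per wavefront (p.wf_size)
        const int busWidthBytes = 32;   // assumed vrf_to_coalescer_bus_width
        const int bytes = wfSize * sizeof(uint32_t);  // 256 bytes per store
        const uint32_t cycles =
            (uint32_t)std::ceil((double)bytes / (double)busWidthBytes);
        std::printf("%u cycles per store transfer\n", cycles);  // prints 8
        return 0;
    }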
217 
218 ComputeUnit::~ComputeUnit()
219 {
220  // Delete wavefront slots
221  for (int j = 0; j < numVectorALUs; ++j) {
222  for (int i = 0; i < shader->n_wf; ++i) {
223  delete wfList[j][i];
224  }
225  lastVaddrSimd[j].clear();
226  }
227  lastVaddrCU.clear();
228 }
229 
230 int
231 ComputeUnit::numExeUnits() const
232 {
233  return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits +
234  numVectorSharedMemUnits + numScalarMemUnits;
235 }
236 
237 // index into readyList of the first memory unit
238 int
239 ComputeUnit::firstMemUnit() const
240 {
241  return numVectorALUs + numScalarALUs;
242 }
243 
244 // index into readyList of the last memory unit
245 int
246 ComputeUnit::lastMemUnit() const
247 {
248  return numExeUnits() - 1;
249 }
250 
251 // index into scalarALUs vector of SALU used by the wavefront
252 int
253 ComputeUnit::mapWaveToScalarAlu(Wavefront *w) const
254 {
255  if (numScalarALUs == 1) {
256  return 0;
257  } else {
258  return w->simdId % numScalarALUs;
259  }
260 }
261 
262 // index into readyList of Scalar ALU unit used by wavefront
263 int
264 ComputeUnit::mapWaveToScalarAluGlobalIdx(Wavefront *w) const
265 {
266  return mapWaveToScalarAlu(w) + numVectorALUs;
267 }
268 
269 // index into readyList of Global Memory unit used by wavefront
270 int
271 ComputeUnit::mapWaveToGlobalMem(Wavefront *w) const
272 {
273  // TODO: FIXME if more than 1 GM pipe supported
274  return numVectorALUs + numScalarALUs;
275 }
276 
277 // index into readyList of Local Memory unit used by wavefront
278 int
279 ComputeUnit::mapWaveToLocalMem(Wavefront *w) const
280 {
281  // TODO: FIXME if more than 1 LM pipe supported
282  return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits;
283 }
284 
285 // index into readyList of Scalar Memory unit used by wavefront
286 int
287 ComputeUnit::mapWaveToScalarMem(Wavefront *w) const
288 {
289  // TODO: FIXME if more than 1 ScM pipe supported
290  return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits +
291  numVectorSharedMemUnits;
292 }
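// [Editorial sketch] Taken together, the mapWaveTo* helpers above define a
// contiguous readyList index space: vector ALUs first, then scalar ALUs,
// then the single global, local, and scalar memory pipes. A standalone
// illustration with a hypothetical 4-SIMD, 1-SALU configuration (not gem5
// code):

    #include <cstdio>

    int main()
    {
        const int numVectorALUs = 4, numScalarALUs = 1;
        const int firstSALU = numVectorALUs;                  // vALUs: 0..3
        const int globalMem = numVectorALUs + numScalarALUs;  // GM pipe: 5
        const int localMem  = globalMem + 1;                  // LM pipe: 6
        const int scalarMem = localMem + 1;                   // ScM pipe: 7
        // with one pipe each, numExeUnits() == 8 and lastMemUnit() == 7
        std::printf("%d %d %d %d\n", firstSALU, globalMem, localMem,
                    scalarMem);
        return 0;
    }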
293 
294 void
295 ComputeUnit::fillKernelState(Wavefront *w, HSAQueueEntry *task)
296 {
297  w->resizeRegFiles(task->numVectorRegs(), task->numScalarRegs());
298  w->workGroupSz[0] = task->wgSize(0);
299  w->workGroupSz[1] = task->wgSize(1);
300  w->workGroupSz[2] = task->wgSize(2);
301  w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
302  w->gridSz[0] = task->gridSize(0);
303  w->gridSz[1] = task->gridSize(1);
304  w->gridSz[2] = task->gridSize(2);
305  w->computeActualWgSz(task);
306 }
307 
308 void
309 ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
310  HSAQueueEntry *task, int bar_id, bool fetchContext)
311 {
312  static int _n_wave = 0;
313 
314  VectorMask init_mask;
315  init_mask.reset();
316 
317  for (int k = 0; k < wfSize(); ++k) {
318  if (k + waveId * wfSize() < w->actualWgSzTotal)
319  init_mask[k] = 1;
320  }
321 
322  w->execMask() = init_mask;
323 
324  w->kernId = task->dispatchId();
325  w->wfId = waveId;
326  w->initMask = init_mask.to_ullong();
327 
328  if (bar_id > WFBarrier::InvalidID) {
329  w->barrierId(bar_id);
330  } else {
331  assert(!w->hasBarrier());
332  }
333 
334  for (int k = 0; k < wfSize(); ++k) {
335  w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
336  w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
337  w->actualWgSz[1];
338  w->workItemId[2][k] = (k + waveId * wfSize()) /
339  (w->actualWgSz[0] * w->actualWgSz[1]);
340 
341  w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
342  w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
343  w->workItemId[0][k];
344  }
345 
346  // WG state
347  w->wgId = task->globalWgId();
348  w->dispatchId = task->dispatchId();
349  w->workGroupId[0] = w->wgId % task->numWg(0);
350  w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1);
351  w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1));
352 
353  // set the wavefront context to have a pointer to this section of the LDS
354  w->ldsChunk = ldsChunk;
355 
356  [[maybe_unused]] int32_t refCount =
357  lds.increaseRefCounter(w->dispatchId, w->wgId);
358  DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
359  cu_id, w->wgId, refCount);
360 
361  w->instructionBuffer.clear();
362 
363  if (w->pendingFetch)
364  w->dropFetch = true;
365 
366  DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
367  "WF[%d][%d]. Ref cnt:%d\n", _n_wave, w->barrierId(), cu_id,
368  w->simdId, w->wfSlotId, refCount);
369 
370  w->initRegState(task, w->actualWgSzTotal);
371  w->start(_n_wave++, task->codeAddr());
372 
373  stats.waveLevelParallelism.sample(activeWaves);
374  activeWaves++;
375 }
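// [Editorial sketch] The work-item loops in startWavefront() decompose each
// lane's flattened index into X/Y/Z coordinates and back. A standalone
// illustration with a hypothetical 8x4x2 workgroup (not gem5 code):

    #include <cassert>
    #include <cstdio>

    int main()
    {
        const int wgSz[3] = {8, 4, 2};   // assumed actualWgSz
        const int flat = 37;             // k + waveId * wfSize()
        const int x = flat % wgSz[0];                         // 5
        const int y = (flat / wgSz[0]) % wgSz[1];             // 0
        const int z = flat / (wgSz[0] * wgSz[1]);             // 1
        // recombining must reproduce the flattened id (workItemFlatId)
        assert(z * wgSz[0] * wgSz[1] + y * wgSz[0] + x == flat);
        std::printf("(%d, %d, %d)\n", x, y, z);
        return 0;
    }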
376 
382 void
383 ComputeUnit::doInvalidate(RequestPtr req, int kernId){
384  GPUDynInstPtr gpuDynInst
385  = std::make_shared<GPUDynInst>(this, nullptr,
386  new KernelLaunchStaticInst(), getAndIncSeqNum());
387 
388  // kern_id will be used in inv responses
389  gpuDynInst->kern_id = kernId;
390  // update contextId field
391  req->setContext(gpuDynInst->wfDynId);
392 
393  injectGlobalMemFence(gpuDynInst, true, req);
394 }
395 
401 void
402 ComputeUnit::doFlush(GPUDynInstPtr gpuDynInst) {
403  injectGlobalMemFence(gpuDynInst, true);
404 }
405 
406 // resetting SIMD register pools
407 // I couldn't think of any other place and
408 // I think it is needed in my implementation
409 void
410 ComputeUnit::resetRegisterPool()
411 {
412  for (int i=0; i<numVectorALUs; i++)
413  {
414  registerManager->vrfPoolMgrs[i]->resetRegion(numVecRegsPerSimd);
415  registerManager->srfPoolMgrs[i]->resetRegion(numScalarRegsPerSimd);
416  }
417 }
418 
419 void
420 ComputeUnit::dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg)
421 {
422  // If we aren't ticking, start it up!
423  if (!tickEvent.scheduled()) {
424  DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id);
425  schedule(tickEvent, nextCycle());
426  }
427 
428  // the kernel's invalidate must have finished before any wg dispatch
429  assert(task->isInvDone());
430 
431  // reserve the LDS capacity allocated to the work group
432  // disambiguated by the dispatch ID and workgroup ID, which should be
433  // globally unique
434  LdsChunk *ldsChunk = lds.reserveSpace(task->dispatchId(),
435  task->globalWgId(),
436  task->ldsSize());
437 
438  panic_if(!ldsChunk, "was not able to reserve space for this WG");
439 
440  // calculate the number of 32-bit vector registers required
441  // by each work item
442  int vregDemand = task->numVectorRegs();
443  int sregDemand = task->numScalarRegs();
444  int wave_id = 0;
445 
446  int barrier_id = WFBarrier::InvalidID;
447 
448  /**
449  * If this WG only has one WF it will not consume any barrier
450  * resources because it has no need of them.
451  */
452  if (num_wfs_in_wg > 1) {
453  /**
454  * Find a free barrier slot for this WG. Each WF in the WG will
455  * receive the same barrier ID.
456  */
457  barrier_id = getFreeBarrierId();
458  auto &wf_barrier = barrierSlot(barrier_id);
459  assert(!wf_barrier.maxBarrierCnt());
460  assert(!wf_barrier.numAtBarrier());
461  wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);
462 
463  DPRINTF(GPUSync, "CU[%d] - Dispatching WG with barrier Id%d. "
464  "%d waves using this barrier.\n", cu_id, barrier_id,
465  num_wfs_in_wg);
466  }
467 
468  // Assign WFs according to numWfsToSched vector, which is computed by
469  // hasDispResources()
470  for (int j = 0; j < shader->n_wf; ++j) {
471  for (int i = 0; i < numVectorALUs; ++i) {
472  Wavefront *w = wfList[i][j];
473  // Check if this wavefront slot is available and there are WFs
474  // remaining to be dispatched to current SIMD:
475  // WF slot must be stopped and not waiting
476  // for a release to complete S_RETURNING
477  if (w->getStatus() == Wavefront::S_STOPPED &&
478  numWfsToSched[i] > 0) {
479  // decrement number of WFs awaiting dispatch to current SIMD
480  numWfsToSched[i] -= 1;
481 
482  fillKernelState(w, task);
483 
484  DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] "
485  "vregDemand[%d] sregDemand[%d]\n", i, j, w->wfDynId,
486  vregDemand, sregDemand);
487 
488  registerManager->allocateRegisters(w, vregDemand, sregDemand);
489 
490  startWavefront(w, wave_id, ldsChunk, task, barrier_id);
491  ++wave_id;
492  }
493  }
494  }
495 }
496 
497 void
499 {
500  panic_if(w->instructionBuffer.empty(),
501  "Instruction Buffer of WF%d can't be empty", w->wgId);
502  GPUDynInstPtr ii = w->instructionBuffer.front();
503  pipeMap.emplace(ii->seqNum());
504 }
505 
506 void
508 {
509  panic_if(w->instructionBuffer.empty(),
510  "Instruction Buffer of WF%d can't be empty", w->wgId);
511  GPUDynInstPtr ii = w->instructionBuffer.front();
512  // delete the dynamic instruction from the pipeline map
513  auto it = pipeMap.find(ii->seqNum());
514  panic_if(it == pipeMap.end(), "Pipeline Map is empty\n");
515  pipeMap.erase(it);
516 }
517 
518 bool
520 {
521  // compute true size of workgroup (after clamping to grid size)
522  int trueWgSize[HSAQueueEntry::MAX_DIM];
523  int trueWgSizeTotal = 1;
524 
525  for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
526  trueWgSize[d] = std::min(task->wgSize(d), task->gridSize(d) -
527  task->wgId(d) * task->wgSize(d));
528 
529  trueWgSizeTotal *= trueWgSize[d];
530  DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
531  }
532 
533  DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
534 
535  // calculate the number of WFs in this WG
536  int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
537  num_wfs_in_wg = numWfs;
538 
539  bool barrier_avail = true;
540 
541  if (numWfs > 1 && !freeBarrierIds.size()) {
542  barrier_avail = false;
543  }
544 
545  // calculate the number of 32-bit vector registers required by each
546  // work item of the work group
547  int vregDemandPerWI = task->numVectorRegs();
548  // calculate the number of 32-bit scalar registers required by each
549  // work item of the work group
550  int sregDemandPerWI = task->numScalarRegs();
551 
552  // check if the total number of VGPRs and SGPRs required by all WFs
553  // of the WG fit in the VRFs of all SIMD units and the CU's SRF
554  panic_if((numWfs * vregDemandPerWI) > (numVectorALUs * numVecRegsPerSimd),
555  "WG with %d WFs and %d VGPRs per WI can not be allocated to CU "
556  "that has %d VGPRs\n",
557  numWfs, vregDemandPerWI, numVectorALUs * numVecRegsPerSimd);
558  panic_if((numWfs * sregDemandPerWI) > numScalarRegsPerSimd,
559  "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU "
560  "with %d SGPRs\n",
561  numWfs, sregDemandPerWI, numScalarRegsPerSimd);
562 
563  // number of WF slots that are not occupied
564  int freeWfSlots = 0;
565  // number of Wfs from WG that were successfully mapped to a SIMD
566  int numMappedWfs = 0;
567  numWfsToSched.clear();
568  numWfsToSched.resize(numVectorALUs, 0);
569 
570  // attempt to map WFs to the SIMDs, based on WF slot availability
571  // and register file availability
572  for (int j = 0; j < shader->n_wf; ++j) {
573  for (int i = 0; i < numVectorALUs; ++i) {
574  if (wfList[i][j]->getStatus() == Wavefront::S_STOPPED) {
575  ++freeWfSlots;
576  // check if current WF will fit onto current SIMD/VRF
577  // if all WFs have not yet been mapped to the SIMDs
578  if (numMappedWfs < numWfs &&
579  registerManager->canAllocateSgprs(i, numWfsToSched[i] + 1,
580  sregDemandPerWI) &&
581  registerManager->canAllocateVgprs(i, numWfsToSched[i] + 1,
582  vregDemandPerWI)) {
583  numWfsToSched[i]++;
584  numMappedWfs++;
585  }
586  }
587  }
588  }
589 
590  // check that the number of mapped WFs is not greater
591  // than the actual number of WFs
592  assert(numMappedWfs <= numWfs);
593 
594  bool vregAvail = true;
595  bool sregAvail = true;
596  // if a WF to SIMD mapping was not found, find the limiting resource
597  if (numMappedWfs < numWfs) {
598 
599  for (int j = 0; j < numVectorALUs; ++j) {
600  // find if there are enough free VGPRs in the SIMD's VRF
601  // to accommodate the WFs of the new WG that would be mapped
602  // to this SIMD unit
603  vregAvail &= registerManager->
604  canAllocateVgprs(j, numWfsToSched[j], vregDemandPerWI);
605  // find if there are enough free SGPRs in the SIMD's SRF
606  // to accommodate the WFs of the new WG that would be mapped
607  // to this SIMD unit
608  sregAvail &= registerManager->
609  canAllocateSgprs(j, numWfsToSched[j], sregDemandPerWI);
610  }
611  }
612 
613  DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, \
614  VGPR Availability = %d, SGPR Availability = %d\n",
615  freeWfSlots, numMappedWfs, vregAvail, sregAvail);
616 
617  if (!vregAvail) {
618  stats.numTimesWgBlockedDueVgprAlloc++;
619  }
620 
621  if (!sregAvail) {
622  stats.numTimesWgBlockedDueSgprAlloc++;
623  }
624 
625  // Return true if enough WF slots to submit workgroup and if there are
626  // enough VGPRs to schedule all WFs to their SIMD units
627  bool ldsAvail = lds.canReserve(task->ldsSize());
628  if (!ldsAvail) {
629  stats.wgBlockedDueLdsAllocation++;
630  }
631 
632  if (!barrier_avail) {
633  stats.wgBlockedDueBarrierAllocation++;
634  }
635 
636  // Return true if the following are all true:
637  // (a) all WFs of the WG were mapped to free WF slots
638  // (b) there are enough VGPRs to schedule all WFs to their SIMD units
639  // (c) there are enough SGPRs on the CU to schedule all WFs
640  // (d) there is enough space in LDS to allocate for all WFs
641  bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
642  && ldsAvail && barrier_avail;
643  return can_dispatch;
644 }
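// [Editorial sketch] The wavefront count above is a ceiling division of the
// clamped workgroup size by the wavefront width. A worked example with
// hypothetical numbers (not gem5 code):

    #include <cstdio>

    int main()
    {
        const int trueWgSizeTotal = 300;  // assumed work-items after clamping
        const int wfSize = 64;            // lanes per wavefront
        const int numWfs = (trueWgSizeTotal + wfSize - 1) / wfSize;
        // five wavefronts; the fifth runs with only 44 active lanes
        std::printf("%d wavefronts\n", numWfs);   // prints 5
        return 0;
    }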
645 
646 int
647 ComputeUnit::numYetToReachBarrier(int bar_id)
648 {
649  auto &wf_barrier = barrierSlot(bar_id);
650  return wf_barrier.numYetToReachBarrier();
651 }
652 
653 bool
654 ComputeUnit::allAtBarrier(int bar_id)
655 {
656  auto &wf_barrier = barrierSlot(bar_id);
657  return wf_barrier.allAtBarrier();
658 }
659 
660 void
661 ComputeUnit::incNumAtBarrier(int bar_id)
662 {
663  auto &wf_barrier = barrierSlot(bar_id);
664  wf_barrier.incNumAtBarrier();
665 }
666 
667 int
668 ComputeUnit::numAtBarrier(int bar_id)
669 {
670  auto &wf_barrier = barrierSlot(bar_id);
671  return wf_barrier.numAtBarrier();
672 }
673 
674 int
675 ComputeUnit::maxBarrierCnt(int bar_id)
676 {
677  auto &wf_barrier = barrierSlot(bar_id);
678  return wf_barrier.maxBarrierCnt();
679 }
680 
681 void
682 ComputeUnit::resetBarrier(int bar_id)
683 {
684  auto &wf_barrier = barrierSlot(bar_id);
685  wf_barrier.reset();
686 }
687 
688 void
689 ComputeUnit::decMaxBarrierCnt(int bar_id)
690 {
691  auto &wf_barrier = barrierSlot(bar_id);
692  wf_barrier.decMaxBarrierCnt();
693 }
694 
695 void
696 ComputeUnit::releaseBarrier(int bar_id)
697 {
698  auto &wf_barrier = barrierSlot(bar_id);
699  wf_barrier.release();
700  freeBarrierIds.insert(bar_id);
701 }
702 
703 void
704 ComputeUnit::releaseWFsFromBarrier(int bar_id)
705 {
706  for (int i = 0; i < numVectorALUs; ++i) {
707  for (int j = 0; j < shader->n_wf; ++j) {
708  Wavefront *wf = wfList[i][j];
709  if (wf->barrierId() == bar_id) {
710  assert(wf->getStatus() == Wavefront::S_BARRIER);
711  wf->setStatus(Wavefront::S_RUNNING);
712  }
713  }
714  }
715 }
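// [Editorial sketch] Lifecycle of one barrier slot, condensed into a toy
// model (an assumed simplification; the real bookkeeping lives in
// WFBarrier):

    #include <cassert>

    struct ToyBarrier
    {
        int maxCnt = 0;      // WFs sharing the slot (setMaxBarrierCnt)
        int atBarrier = 0;   // WFs that have reached the barrier
        bool allAt() const { return atBarrier == maxCnt; }
    };

    int main()
    {
        ToyBarrier b;
        b.maxCnt = 3;                  // dispWorkgroup: 3 WFs share the slot
        for (int wf = 0; wf < 3; ++wf)
            ++b.atBarrier;             // incNumAtBarrier per arriving WF
        assert(b.allAt());             // releaseWFsFromBarrier wakes them all
        b = ToyBarrier{};              // resetBarrier readies the slot again
        return 0;
    }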
716 
717 // Execute one clock worth of work on the ComputeUnit.
718 void
719 ComputeUnit::exec()
720 {
721  // process reads and writes in the RFs
722  for (auto &vecRegFile : vrf) {
723  vecRegFile->exec();
724  }
725 
726  for (auto &scRegFile : srf) {
727  scRegFile->exec();
728  }
729 
730  // Execute pipeline stages in reverse order to simulate
731  // the pipeline latency
732  scalarMemoryPipe.exec();
733  globalMemoryPipe.exec();
734  localMemoryPipe.exec();
735  execStage.exec();
736  scheduleStage.exec();
737  scoreboardCheckStage.exec();
738  fetchStage.exec();
739 
740  stats.totalCycles++;
741 
742  // Put this CU to sleep if there is no more work to be done.
743  if (!isDone()) {
744  schedule(tickEvent, nextCycle());
745  } else {
746  shader->notifyCuSleep();
747  DPRINTF(GPUDisp, "CU%d: Going to sleep\n", cu_id);
748  }
749 }
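// [Editorial note] Executing the stages back-to-front (memory pipes, then
// execute, schedule, scoreboard check, and finally fetch) lets each stage
// consume the state its upstream neighbor produced on the previous tick,
// so a single call to exec() advances every in-flight instruction by at
// most one stage and the pipeline latency falls out naturally.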
750 
751 void
752 ComputeUnit::init()
753 {
754  // Initialize CU Bus models and execution resources
755 
756  // Vector ALUs
757  vectorALUs.clear();
758  for (int i = 0; i < numVectorALUs; i++) {
759  vectorALUs.emplace_back(this, clockPeriod());
760  }
761 
762  // Scalar ALUs
763  scalarALUs.clear();
764  for (int i = 0; i < numScalarALUs; i++) {
765  scalarALUs.emplace_back(this, clockPeriod());
766  }
767 
768  // Vector Global Memory
770  "No support for multiple Global Memory Pipelines exists!!!");
774 
775  // Vector Local/Shared Memory
777  "No support for multiple Local Memory Pipelines exists!!!");
781 
782  // Scalar Memory
784  "No support for multiple Scalar Memory Pipelines exists!!!");
785  scalarMemUnit.init(this, clockPeriod());
788 
791 
792  fetchStage.init();
793  scoreboardCheckStage.init();
794  execStage.init();
795  globalMemoryPipe.init();
796 
797  gmTokenPort.setTokenManager(memPortTokens);
798 }
799 
800 bool
801 ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
802 {
803  // Ruby has completed the memory op. Schedule the mem_resp_event at the
804  // appropriate cycle to process the timing memory response
805  // This delay represents the pipeline delay
806  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
807  PortID index = sender_state->port_index;
808  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
809  GPUDispatcher &dispatcher = computeUnit->shader->dispatcher();
810 
811  // MemSyncResp + WriteAckResp are handled completely here and we don't
812  // schedule a MemRespEvent to process the responses further
813  if (pkt->cmd == MemCmd::MemSyncResp) {
814  // This response is for 1 of the following request types:
815  // - kernel launch
816  // - kernel end
817  // - non-kernel mem sync
818 
819  // Kernel Launch
820  // wavefront was nullptr when launching kernel, so it is meaningless
821  // here (simdId=-1, wfSlotId=-1)
822  if (gpuDynInst->isKernelLaunch()) {
823  // for kernel launch, the original request must be both kernel-type
824  // and INV_L1
825  assert(pkt->req->isKernel());
826  assert(pkt->req->isInvL1());
827 
828  // one D-Cache inv is done, decrement counter
829  dispatcher.updateInvCounter(gpuDynInst->kern_id);
830 
831  delete pkt->senderState;
832  delete pkt;
833  return true;
834  }
835 
836  // retrieve wavefront from inst
837  Wavefront *w = gpuDynInst->wavefront();
838 
839  // Check if we are waiting on Kernel End Flush
840  if (w->getStatus() == Wavefront::S_RETURNING
841  && gpuDynInst->isEndOfKernel()) {
842  // for kernel end, the original request must be both kernel-type
843  // and last-level GPU cache should be flushed if it contains
844  // dirty data. This request may have been quiesced and
845  // immediately responded to if the GL2 is a write-through /
846  // read-only cache.
847  assert(pkt->req->isKernel());
848  assert(pkt->req->isGL2CacheFlush());
849 
850  // once flush done, decrement counter, and return whether all
851  // dirty writeback operations are done for the kernel
852  bool isWbDone = dispatcher.updateWbCounter(gpuDynInst->kern_id);
853 
854  // not all wbs are done for the kernel, just release pkt
855  // resources
856  if (!isWbDone) {
857  delete pkt->senderState;
858  delete pkt;
859  return true;
860  }
861 
862  // all wbs are completed for the kernel, do retirement work
863  // for the workgroup
864  DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
865  computeUnit->cu_id, w->simdId, w->wfSlotId,
866  w->wfDynId, w->wgId);
867 
868  dispatcher.notifyWgCompl(w);
869  w->setStatus(Wavefront::S_STOPPED);
870  }
871 
872  if (!pkt->req->isKernel()) {
873  w = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
874  DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing "
875  "outstanding reqs %d => %d\n", gpuDynInst->simdId,
876  gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
877  gpuDynInst->disassemble(), w->outstandingReqs,
878  w->outstandingReqs - 1);
879  --w->outstandingReqs;
880  }
881 
882  delete pkt->senderState;
883  delete pkt;
884  return true;
885  }
886 
887  EventFunctionWrapper *mem_resp_event =
888  computeUnit->memPort[index].createMemRespEvent(pkt);
889 
890  DPRINTF(GPUPort,
891  "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
892  computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
893  gpuDynInst->seqNum(), index, pkt->req->getPaddr());
894 
895  computeUnit->schedule(mem_resp_event,
896  curTick() + computeUnit->resp_tick_latency);
897 
898  return true;
899 }
900 
901 bool
902 ComputeUnit::ScalarDataPort::recvTimingResp(PacketPtr pkt)
903 {
904  assert(!pkt->req->isKernel());
905 
906  // retrieve sender state
907  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
908  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
909 
910  assert(pkt->isRead() || pkt->isWrite());
911  assert(gpuDynInst->numScalarReqs > 0);
912 
913  gpuDynInst->numScalarReqs--;
914 
915  /**
916  * The counter is decremented above once per returned packet.
917  * A scalar memory operation may have generated several
918  * packets; only when every one of them has come back is the
919  * operation complete, at which point the instruction is
920  * pushed to the load or store response FIFO of the scalar
921  * memory pipeline for writeback.
922  */
923  if (!gpuDynInst->numScalarReqs) {
924  if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
925  computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
926  gpuDynInst);
927  } else {
928  computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
929  gpuDynInst);
930  }
931  }
932 
933  delete pkt->senderState;
934  delete pkt;
935 
936  return true;
937 }
938 
939 void
940 ComputeUnit::ScalarDataPort::recvReqRetry()
941 {
942  for (const auto &pkt : retries) {
943  if (!sendTimingReq(pkt)) {
944  break;
945  } else {
946  retries.pop_front();
947  }
948  }
949 }
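// [Editorial note] The range-for above pops the front of `retries` while
// iterating over it, which invalidates the deque iterator the loop holds.
// An iterator-safe formulation would be (sketch, not the gem5 source):
//
//     while (!retries.empty()) {
//         if (!sendTimingReq(retries.front()))
//             break;              // port still blocked; wait for next retry
//         retries.pop_front();    // request accepted
//     }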
950 
951 void
952 ComputeUnit::DataPort::recvReqRetry()
953 {
954  int len = retries.size();
955 
956  assert(len > 0);
957 
958  for (int i = 0; i < len; ++i) {
959  PacketPtr pkt = retries.front().first;
960  [[maybe_unused]] GPUDynInstPtr gpuDynInst = retries.front().second;
961  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
962  computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
963  pkt->req->getPaddr());
964 
965  /** Currently Ruby can return false due to conflicts for the
966  * particular cache block or address. Can we do something
967  * similar to the store buffer in TSO? */
968  if (!sendTimingReq(pkt)) {
969  DPRINTF(GPUMem, "failed again!\n");
970  break;
971  } else {
972  DPRINTF(GPUMem, "successful!\n");
973  retries.pop_front();
974  }
975  }
976 }
977 
978 bool
979 ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
980 {
981  computeUnit->fetchStage.processFetchReturn(pkt);
982  return true;
983 }
984 
985 void
986 ComputeUnit::SQCPort::recvReqRetry()
987 {
988  int len = retries.size();
989 
990  assert(len > 0);
991 
992  for (int i = 0; i < len; ++i) {
993  PacketPtr pkt = retries.front().first;
994  [[maybe_unused]] Wavefront *wavefront = retries.front().second;
995  DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
996  computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
997  pkt->req->getPaddr());
998  if (!sendTimingReq(pkt)) {
999  DPRINTF(GPUFetch, "failed again!\n");
1000  break;
1001  } else {
1002  DPRINTF(GPUFetch, "successful!\n");
1003  retries.pop_front();
1004  }
1005  }
1006 }
1007 
1008 void
1009 ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
1010 {
1011  // There must be a way around this check to do the globalMemStart...
1012  Addr tmp_vaddr = pkt->req->getVaddr();
1013 
1014  updatePageDivergenceDist(tmp_vaddr);
1015 
1016  // set PC in request
1017  pkt->req->setPC(gpuDynInst->wavefront()->pc());
1018 
1019  pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());
1020 
1021  // figure out the type of the request to set read/write
1022  BaseMMU::Mode TLB_mode;
1023  assert(pkt->isRead() || pkt->isWrite());
1024 
1025  // only do some things if actually accessing data
1026  bool isDataAccess = pkt->isWrite() || pkt->isRead();
1027 
1028  // For dGPUs, real hardware will extract MTYPE from the PTE. Our model
1029  // uses x86 pagetables which don't have fields to track GPU MTYPEs.
1030  // Rather than hacking up the pagetable to add these bits in, we just
1031  // keep a structure local to our GPUs that are populated in our
1032  // emulated driver whenever memory is allocated. Consult that structure
1033  // here in case we need a memtype override.
1034  shader->gpuCmdProc.driver()->setMtype(pkt->req);
1035 
1036  // Check write before read for atomic operations
1037  // since atomic operations should use BaseMMU::Write
1038  if (pkt->isWrite()) {
1039  TLB_mode = BaseMMU::Write;
1040  } else if (pkt->isRead()) {
1041  TLB_mode = BaseMMU::Read;
1042  } else {
1043  fatal("pkt is not a read nor a write\n");
1044  }
1045 
1046  stats.tlbCycles -= curTick();
1047  ++stats.tlbRequests;
1048 
1049  PortID tlbPort_index = perLaneTLB ? index : 0;
1050 
1051  if (shader->timingSim) {
1052  if (debugSegFault) {
1053  Process *p = shader->gpuTc->getProcessPtr();
1054  Addr vaddr = pkt->req->getVaddr();
1055  unsigned size = pkt->getSize();
1056 
1057  if ((vaddr + size - 1) % 64 < vaddr % 64) {
1058  panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
1059  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
1060  }
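// [Editorial note] The modulo test above detects a 64-byte boundary
// crossing: the offset of the access's last byte within its 64 B chunk,
// (vaddr + size - 1) % 64, can only be smaller than the first byte's
// offset, vaddr % 64, if the access wrapped into the next chunk. For
// example, vaddr = 60 and size = 8 gives 67 % 64 = 3 < 60, so it panics.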
1061 
1062  Addr paddr;
1063 
1064  if (!p->pTable->translate(vaddr, paddr)) {
1065  if (!p->fixupFault(vaddr)) {
1066  panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
1067  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1068  vaddr);
1069  }
1070  }
1071  }
1072 
1073  // This is the SenderState needed upon return
1074  pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);
1075 
1076  // This is the senderState needed by the TLB hierarchy to function
1077  GpuTranslationState *translation_state =
1078  new GpuTranslationState(TLB_mode, shader->gpuTc, false,
1079  pkt->senderState);
1080 
1081  pkt->senderState = translation_state;
1082 
1083  if (functionalTLB) {
1084  tlbPort[tlbPort_index].sendFunctional(pkt);
1085 
1086  // update the hitLevel distribution
1087  int hit_level = translation_state->hitLevel;
1088  assert(hit_level != -1);
1089  stats.hitsPerTLBLevel[hit_level]++;
1090 
1091  // New SenderState for the memory access
1092  GpuTranslationState *sender_state =
1093  safe_cast<GpuTranslationState*>(pkt->senderState);
1094 
1095  delete sender_state->tlbEntry;
1096  delete sender_state->saved;
1097  delete sender_state;
1098 
1099  assert(pkt->req->hasPaddr());
1100  assert(pkt->req->hasSize());
1101 
1102  // this is necessary because the GPU TLB receives packets instead
1103  // of requests. when the translation is complete, all relevent
1104  // fields in the request will be populated, but not in the packet.
1105  // here we create the new packet so we can set the size, addr,
1106  // and proper flags.
1107  PacketPtr oldPkt = pkt;
1108  pkt = new Packet(oldPkt->req, oldPkt->cmd);
1109  if (isDataAccess) {
1110  uint8_t *tmpData = oldPkt->getPtr<uint8_t>();
1111  pkt->dataStatic(tmpData);
1112  }
1113  delete oldPkt;
1114 
1115 
1116  // New SenderState for the memory access
1117  pkt->senderState =
1118  new ComputeUnit::DataPort::SenderState(gpuDynInst, index,
1119  nullptr);
1120 
1121  gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
1122  gpuDynInst->tlbHitLevel[index] = hit_level;
1123 
1124  // translation is done. Schedule the mem_req_event at the
1125  // appropriate cycle to send the timing memory request to ruby
1126  EventFunctionWrapper *mem_req_event =
1127  memPort[index].createMemReqEvent(pkt);
1128 
1129  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
1130  "scheduled\n", cu_id, gpuDynInst->simdId,
1131  gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
1132 
1133  schedule(mem_req_event, curTick() + req_tick_latency);
1134  } else if (tlbPort[tlbPort_index].isStalled()) {
1135  assert(tlbPort[tlbPort_index].retries.size() > 0);
1136 
1137  DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
1138  "failed!\n", cu_id, gpuDynInst->simdId,
1139  gpuDynInst->wfSlotId, tmp_vaddr);
1140 
1141  tlbPort[tlbPort_index].retries.push_back(pkt);
1142  } else if (!tlbPort[tlbPort_index].sendTimingReq(pkt)) {
1143  // Stall the data port;
1144  // No more packet will be issued till
1145  // ruby indicates resources are freed by
1146  // a recvReqRetry() call back on this port.
1147  tlbPort[tlbPort_index].stallPort();
1148 
1149  DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
1150  "failed!\n", cu_id, gpuDynInst->simdId,
1151  gpuDynInst->wfSlotId, tmp_vaddr);
1152 
1153  tlbPort[tlbPort_index].retries.push_back(pkt);
1154  } else {
1155  DPRINTF(GPUTLB,
1156  "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
1157  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
1158  }
1159  } else {
1160  if (pkt->cmd == MemCmd::MemSyncReq) {
1161  gpuDynInst->resetEntireStatusVector();
1162  } else {
1163  gpuDynInst->decrementStatusVector(index);
1164  }
1165 
1166  // New SenderState for the memory access
1167  delete pkt->senderState;
1168 
1169  // Because it's atomic operation, only need TLB translation state
1170  pkt->senderState = new GpuTranslationState(TLB_mode,
1171  shader->gpuTc);
1172 
1173  tlbPort[tlbPort_index].sendFunctional(pkt);
1174 
1175  // the addr of the packet is not modified, so we need to create a new
1176  // packet, or otherwise the memory access will have the old virtual
1177  // address sent in the translation packet, instead of the physical
1178  // address returned by the translation.
1179  PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
1180  new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1181 
1182  // Translation is done. It is safe to send the packet to memory.
1183  memPort[0].sendFunctional(new_pkt);
1184 
1185  DPRINTF(GPUMem, "Functional sendRequest\n");
1186  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
1187  gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
1188  new_pkt->req->getPaddr());
1189 
1190  // safe_cast the senderState
1191  GpuTranslationState *sender_state =
1192  safe_cast<GpuTranslationState*>(pkt->senderState);
1193 
1194  delete sender_state->tlbEntry;
1195  delete new_pkt;
1196  delete pkt->senderState;
1197  delete pkt;
1198  }
1199 }
1200 
1201 void
1202 ComputeUnit::sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt)
1203 {
1204  assert(pkt->isWrite() || pkt->isRead());
1205 
1206  BaseMMU::Mode tlb_mode = pkt->isRead() ? BaseMMU::Read : BaseMMU::Write;
1207 
1208  pkt->senderState =
1209  new ScalarDTLBPort::SenderState(gpuDynInst);
1210 
1211  pkt->senderState =
1212  new GpuTranslationState(tlb_mode, shader->gpuTc, false,
1213  pkt->senderState);
1214 
1215  if (scalarDTLBPort.isStalled()) {
1216  assert(scalarDTLBPort.retries.size());
1217  scalarDTLBPort.retries.push_back(pkt);
1218  } else if (!scalarDTLBPort.sendTimingReq(pkt)) {
1219  scalarDTLBPort.stallPort();
1220  scalarDTLBPort.retries.push_back(pkt);
1221  } else {
1222  DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n",
1223  tlb_mode == BaseMMU::Read ? "read" : "write",
1224  pkt->req->getVaddr());
1225  }
1226 }
1227 
1228 void
1229 ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
1230  bool kernelMemSync,
1231  RequestPtr req)
1232 {
1233  assert(gpuDynInst->isGlobalSeg() ||
1234  gpuDynInst->executedAs() == enums::SC_GLOBAL);
1235 
1236  if (!req) {
1237  req = std::make_shared<Request>(
1238  0, 0, 0, requestorId(), 0, gpuDynInst->wfDynId);
1239  }
1240 
1241  // all mem sync requests have Paddr == 0
1242  req->setPaddr(0);
1243 
1244  PacketPtr pkt = nullptr;
1245 
1246  if (kernelMemSync) {
1247  if (gpuDynInst->isKernelLaunch()) {
1248  req->setCacheCoherenceFlags(Request::INV_L1);
1249  req->setReqInstSeqNum(gpuDynInst->seqNum());
1250  req->setFlags(Request::KERNEL);
1251  pkt = new Packet(req, MemCmd::MemSyncReq);
1252  pkt->pushSenderState(
1253  new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1254 
1255  EventFunctionWrapper *mem_req_event =
1256  memPort[0].createMemReqEvent(pkt);
1257 
1258  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1259  "an acquire\n", cu_id, gpuDynInst->simdId,
1260  gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
1261 
1262  schedule(mem_req_event, curTick() + req_tick_latency);
1263  } else {
1264  // kernel end flush of GL2 cache may be quiesced by Ruby if the
1265  // GL2 is a read-only cache
1266  assert(shader->impl_kern_end_rel);
1267  assert(gpuDynInst->isEndOfKernel());
1268 
1269  req->setCacheCoherenceFlags(Request::FLUSH_L2);
1270  req->setReqInstSeqNum(gpuDynInst->seqNum());
1271  req->setFlags(Request::KERNEL);
1272  pkt = new Packet(req, MemCmd::MemSyncReq);
1273  pkt->pushSenderState(
1274  new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1275 
1276  EventFunctionWrapper *mem_req_event =
1277  memPort[0].createMemReqEvent(pkt);
1278 
1279  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1280  "a release\n", cu_id, gpuDynInst->simdId,
1281  gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
1282 
1283  schedule(mem_req_event, curTick() + req_tick_latency);
1284  }
1285  } else {
1286  gpuDynInst->setRequestFlags(req);
1287 
1288  req->setReqInstSeqNum(gpuDynInst->seqNum());
1289 
1290  pkt = new Packet(req, MemCmd::MemSyncReq);
1291  pkt->pushSenderState(
1292  new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1293 
1294  EventFunctionWrapper *mem_req_event =
1295  memPort[0].createMemReqEvent(pkt);
1296 
1297  DPRINTF(GPUPort,
1298  "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
1299  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
1300  pkt->req->getPaddr());
1301 
1302  schedule(mem_req_event, curTick() + req_tick_latency);
1303  }
1304 }
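// [Editorial note] All three paths above build a MemSyncReq with Paddr 0:
// sync requests carry no data, so the memory system acts on their
// coherence flags instead (INV_L1 for a kernel launch, FLUSH_L2 for a
// kernel end, and the instruction's own flags for a regular memory fence).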
1305 
1306 void
1307 ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
1308 {
1309  DataPort::SenderState *sender_state =
1310  safe_cast<DataPort::SenderState*>(pkt->senderState);
1311 
1312  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1313  ComputeUnit *compute_unit = computeUnit;
1314 
1315  assert(gpuDynInst);
1316 
1317  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
1318  compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1319  pkt->req->getPaddr(), id);
1320 
1321  Addr paddr = pkt->req->getPaddr();
1322 
1323  // mem sync resp callback must be handled already in
1324  // DataPort::recvTimingResp
1325  assert(pkt->cmd != MemCmd::MemSyncResp);
1326 
1327  // The status vector and global memory response for WriteResp packets get
1328  // handled by the WriteCompleteResp packets.
1329  if (pkt->cmd == MemCmd::WriteResp) {
1330  delete pkt;
1331  return;
1332  }
1333 
1334  // this is for read, write and atomic
1335  int index = gpuDynInst->memStatusVector[paddr].back();
1336 
1337  DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
1338  pkt->req->getPaddr(), id);
1339 
1340  gpuDynInst->memStatusVector[paddr].pop_back();
1341  gpuDynInst->pAddr = pkt->req->getPaddr();
1342 
1343  gpuDynInst->decrementStatusVector(index);
1344  DPRINTF(GPUMem, "bitvector is now %s\n", gpuDynInst->printStatusVector());
1345 
1346  if (gpuDynInst->allLanesZero()) {
1347  auto iter = gpuDynInst->memStatusVector.begin();
1348  auto end = gpuDynInst->memStatusVector.end();
1349 
1350  while (iter != end) {
1351  assert(iter->second.empty());
1352  ++iter;
1353  }
1354 
1355  // Calculate the difference between the arrival of the first cache
1356  // block and the last cache block to arrive if we have the time
1357  // for the first cache block.
1358  if (compute_unit->headTailMap.count(gpuDynInst)) {
1359  Tick headTick = compute_unit->headTailMap.at(gpuDynInst);
1360  compute_unit->stats.headTailLatency.sample(curTick() - headTick);
1361  compute_unit->headTailMap.erase(gpuDynInst);
1362  }
1363 
1364  gpuDynInst->memStatusVector.clear();
1365 
1366  gpuDynInst->
1367  profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue);
1368  compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
1369 
1370  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
1371  compute_unit->cu_id, gpuDynInst->simdId,
1372  gpuDynInst->wfSlotId);
1373  } else {
1374  if (pkt->isRead()) {
1375  if (!compute_unit->headTailMap.count(gpuDynInst)) {
1376  compute_unit->headTailMap
1377  .insert(std::make_pair(gpuDynInst, curTick()));
1378  }
1379  }
1380  }
1381 
1382  delete pkt->senderState;
1383  delete pkt;
1384 }
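// [Editorial note] headTailMap records, per vector memory instruction, the
// tick at which its first cache block arrived; when the last block retires,
// the code above samples (curTick() - headTick) into stats.headTailLatency,
// i.e. the spread between the first and last beat of a coalesced access.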
1385 
1386 bool
1387 ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
1388 {
1389  Addr line = pkt->req->getPaddr();
1390 
1391  DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
1392  pkt->req->getVaddr(), line);
1393 
1394  assert(pkt->senderState);
1395  computeUnit->stats.tlbCycles += curTick();
1396 
1397  // pop off the TLB translation state
1398  GpuTranslationState *translation_state =
1399  safe_cast<GpuTranslationState*>(pkt->senderState);
1400 
1401  // no PageFaults are permitted for data accesses
1402  if (!translation_state->tlbEntry) {
1403  DTLBPort::SenderState *sender_state =
1404  safe_cast<DTLBPort::SenderState*>(translation_state->saved);
1405 
1406  [[maybe_unused]] Wavefront *w =
1407  computeUnit->wfList[sender_state->_gpuDynInst->simdId]
1408  [sender_state->_gpuDynInst->wfSlotId];
1409 
1410  DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
1411  pkt->req->getVaddr());
1412  }
1413 
1414  // update the hitLevel distribution
1415  int hit_level = translation_state->hitLevel;
1416  computeUnit->stats.hitsPerTLBLevel[hit_level]++;
1417 
1418  delete translation_state->tlbEntry;
1419  assert(!translation_state->ports.size());
1420  pkt->senderState = translation_state->saved;
1421 
1422  // for prefetch pkt
1423  BaseMMU::Mode TLB_mode = translation_state->tlbMode;
1424 
1425  delete translation_state;
1426 
1427  // use the original sender state to know how to close this transaction
1428  DTLBPort::SenderState *sender_state =
1429  safe_cast<DTLBPort::SenderState*>(pkt->senderState);
1430 
1431  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1432  PortID mp_index = sender_state->portIndex;
1433  Addr vaddr = pkt->req->getVaddr();
1434  gpuDynInst->memStatusVector[line].push_back(mp_index);
1435  gpuDynInst->tlbHitLevel[mp_index] = hit_level;
1436 
1437  MemCmd requestCmd;
1438 
1439  if (pkt->cmd == MemCmd::ReadResp) {
1440  requestCmd = MemCmd::ReadReq;
1441  } else if (pkt->cmd == MemCmd::WriteResp) {
1442  requestCmd = MemCmd::WriteReq;
1443  } else if (pkt->cmd == MemCmd::SwapResp) {
1444  requestCmd = MemCmd::SwapReq;
1445  } else {
1446  panic("unsupported response to request conversion %s\n",
1447  pkt->cmd.toString());
1448  }
1449 
1450  if (computeUnit->prefetchDepth) {
1451  int simdId = gpuDynInst->simdId;
1452  int wfSlotId = gpuDynInst->wfSlotId;
1453  Addr last = 0;
1454 
1455  switch(computeUnit->prefetchType) {
1456  case enums::PF_CU:
1457  last = computeUnit->lastVaddrCU[mp_index];
1458  break;
1459  case enums::PF_PHASE:
1460  last = computeUnit->lastVaddrSimd[simdId][mp_index];
1461  break;
1462  case enums::PF_WF:
1463  last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
1464  default:
1465  break;
1466  }
1467 
1468  DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
1469  computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
1470 
1471  int stride = last ? (roundDown(vaddr, X86ISA::PageBytes) -
1472  roundDown(last, X86ISA::PageBytes)) >> X86ISA::PageShift
1473  : 0;
1474 
1475  DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
1476 
1477  computeUnit->lastVaddrCU[mp_index] = vaddr;
1478  computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
1479  computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
1480 
1481  stride = (computeUnit->prefetchType == enums::PF_STRIDE) ?
1482  computeUnit->prefetchStride: stride;
1483 
1484  DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
1485  computeUnit->cu_id, simdId, wfSlotId, mp_index);
1486 
1487  DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);
1488 
1489  // Prefetch Next few pages atomically
1490  for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
1491  DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
1492  vaddr + stride * pf * X86ISA::PageBytes);
1493 
1494  if (!stride)
1495  break;
1496 
1497  RequestPtr prefetch_req = std::make_shared<Request>(
1498  vaddr + stride * pf * X86ISA::PageBytes,
1499  sizeof(uint8_t), 0,
1500  computeUnit->requestorId(),
1501  0, 0, nullptr);
1502 
1503  PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
1504  uint8_t foo = 0;
1505  prefetch_pkt->dataStatic(&foo);
1506 
1507  // Because it's atomic operation, only need TLB translation state
1508  prefetch_pkt->senderState =
1509  new GpuTranslationState(TLB_mode,
1510  computeUnit->shader->gpuTc, true);
1511 
1512  // Currently prefetches are zero-latency, hence the sendFunctional
1513  sendFunctional(prefetch_pkt);
1514 
1515  /* safe_cast the senderState */
1516  GpuTranslationState *tlb_state =
1517  safe_cast<GpuTranslationState*>(
1518  prefetch_pkt->senderState);
1519 
1520 
1521  delete tlb_state->tlbEntry;
1522  delete tlb_state;
1523  delete prefetch_pkt;
1524  }
1525  }
1526 
1527  // First we must convert the response cmd back to a request cmd so that
1528  // the request can be sent through the cu's request port
1529  PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
1530  new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1531  delete pkt->senderState;
1532  delete pkt;
1533 
1534  // New SenderState for the memory access
1535  new_pkt->senderState =
1536  new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
1537  nullptr);
1538 
1539  // translation is done. Schedule the mem_req_event at the appropriate
1540  // cycle to send the timing memory request to ruby
1541  EventFunctionWrapper *mem_req_event =
1542  computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);
1543 
1544  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
1545  computeUnit->cu_id, gpuDynInst->simdId,
1546  gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());
1547 
1548  computeUnit->schedule(mem_req_event, curTick() +
1549  computeUnit->req_tick_latency);
1550 
1551  return true;
1552 }
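// [Editorial note] A worked example of the page-stride computation above,
// with hypothetical addresses and 4 KiB x86 pages: if the previous access
// touched 0x11234 and the current one touches 0x14010, then
// (0x14000 - 0x11000) >> 12 = 3 pages, so with prefetchDepth = 2 the loop
// functionally touches the pages 3*1 and 3*2 pages beyond vaddr.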
1553 
1554 EventFunctionWrapper*
1555 ComputeUnit::DataPort::createMemReqEvent(PacketPtr pkt)
1556 {
1557  return new EventFunctionWrapper(
1558  [this, pkt]{ processMemReqEvent(pkt); },
1559  "ComputeUnit memory request event", true);
1560 }
1561 
1562 EventFunctionWrapper*
1563 ComputeUnit::DataPort::createMemRespEvent(PacketPtr pkt)
1564 {
1565  return new EventFunctionWrapper(
1566  [this, pkt]{ processMemRespEvent(pkt); },
1567  "ComputeUnit memory response event", true);
1568 }
1569 
1570 void
1571 ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt)
1572 {
1573  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1574  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1575  [[maybe_unused]] ComputeUnit *compute_unit = computeUnit;
1576 
1577  if (!(sendTimingReq(pkt))) {
1578  retries.push_back(std::make_pair(pkt, gpuDynInst));
1579 
1580  DPRINTF(GPUPort,
1581  "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
1582  compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1583  id, pkt->req->getPaddr());
1584  } else {
1585  DPRINTF(GPUPort,
1586  "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data "
1587  "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1588  gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id,
1589  pkt->req->getPaddr());
1590  }
1591 }
1592 
1593 const char*
1594 ComputeUnit::ScalarDataPort::MemReqEvent::description() const
1595 {
1596  return "ComputeUnit scalar memory request event";
1597 }
1598 
1599 void
1600 ComputeUnit::ScalarDataPort::MemReqEvent::process()
1601 {
1602  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1603  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1604  [[maybe_unused]] ComputeUnit *compute_unit = scalarDataPort.computeUnit;
1605 
1606  if (!(scalarDataPort.sendTimingReq(pkt))) {
1607  scalarDataPort.retries.push_back(pkt);
1608 
1609  DPRINTF(GPUPort,
1610  "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
1611  compute_unit->cu_id, gpuDynInst->simdId,
1612  gpuDynInst->wfSlotId, pkt->req->getPaddr());
1613  } else {
1614  DPRINTF(GPUPort,
1615  "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data "
1616  "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1617  gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
1618  pkt->req->getPaddr());
1619  }
1620 }
1621 
1622 /*
1623  * The initial translation request could have been rejected,
1624  * if <retries> queue is not empty. Retry sending the translation
1625  * request. sendRetry() is called from the peer port whenever
1626  * a translation completes.
1627  */
1628 void
1629 ComputeUnit::DTLBPort::recvReqRetry()
1630 {
1631  int len = retries.size();
1632 
1633  DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
1634  computeUnit->cu_id, len);
1635 
1636  assert(len > 0);
1637  assert(isStalled());
1638  // recvReqRetry is an indication that the resource on which this
1639  // port was stalling on is freed. So, remove the stall first
1640  unstallPort();
1641 
1642  for (int i = 0; i < len; ++i) {
1643  PacketPtr pkt = retries.front();
1644  [[maybe_unused]] Addr vaddr = pkt->req->getVaddr();
1645  DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x", computeUnit->cu_id, vaddr);
1646 
1647  if (!sendTimingReq(pkt)) {
1648  // Stall port
1649  stallPort();
1650  DPRINTF(GPUTLB, ": failed again\n");
1651  break;
1652  } else {
1653  DPRINTF(GPUTLB, ": successful\n");
1654  retries.pop_front();
1655  }
1656  }
1657 }
1658 
1659 bool
1660 ComputeUnit::ScalarDTLBPort::recvTimingResp(PacketPtr pkt)
1661 {
1662  assert(pkt->senderState);
1663 
1664  GpuTranslationState *translation_state =
1665  safe_cast<GpuTranslationState*>(pkt->senderState);
1666 
1667  // Page faults are not allowed
1668  fatal_if(!translation_state->tlbEntry,
1669  "Translation of vaddr %#x failed\n", pkt->req->getVaddr());
1670 
1671  delete translation_state->tlbEntry;
1672  assert(!translation_state->ports.size());
1673 
1674  pkt->senderState = translation_state->saved;
1675  delete translation_state;
1676 
1677  ScalarDTLBPort::SenderState *sender_state =
1678  safe_cast<ScalarDTLBPort::SenderState*>(pkt->senderState);
1679 
1680  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1681  delete pkt->senderState;
1682 
1683  [[maybe_unused]] Wavefront *w = gpuDynInst->wavefront();
1684 
1685  DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
1686  "translation: PA %#x -> %#x\n", computeUnit->cu_id, w->simdId,
1687  w->wfSlotId, w->kernId, pkt->req->getVaddr(), pkt->req->getPaddr());
1688 
1689  MemCmd mem_cmd;
1690 
1691  if (pkt->cmd == MemCmd::ReadResp) {
1692  mem_cmd = MemCmd::ReadReq;
1693  } else if (pkt->cmd == MemCmd::WriteResp) {
1694  mem_cmd = MemCmd::WriteReq;
1695  } else {
1696  fatal("Scalar DTLB receieved unexpected MemCmd response %s\n",
1697  pkt->cmd.toString());
1698  }
1699 
1700  PacketPtr req_pkt = new Packet(pkt->req, mem_cmd);
1701  req_pkt->dataStatic(pkt->getPtr<uint8_t>());
1702  delete pkt;
1703 
1704  req_pkt->senderState =
1705  new ComputeUnit::ScalarDataPort::SenderState(gpuDynInst);
1706 
1707  if (!computeUnit->scalarDataPort.sendTimingReq(req_pkt)) {
1708  computeUnit->scalarDataPort.retries.push_back(req_pkt);
1709  DPRINTF(GPUMem, "send scalar req failed for: %s\n",
1710  gpuDynInst->disassemble());
1711  } else {
1712  DPRINTF(GPUMem, "send scalar req for: %s\n",
1713  gpuDynInst->disassemble());
1714  }
1715 
1716  return true;
1717 }
1718 
1719 bool
1720 ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
1721 {
1722  [[maybe_unused]] Addr line = pkt->req->getPaddr();
1723  DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
1724  computeUnit->cu_id, pkt->req->getVaddr(), line);
1725 
1726  assert(pkt->senderState);
1727 
1728  // pop off the TLB translation state
1729  GpuTranslationState *translation_state
1730  = safe_cast<GpuTranslationState*>(pkt->senderState);
1731 
1732  bool success = translation_state->tlbEntry != nullptr;
1733  delete translation_state->tlbEntry;
1734  assert(!translation_state->ports.size());
1735  pkt->senderState = translation_state->saved;
1736  delete translation_state;
1737 
1738  // use the original sender state to know how to close this transaction
1739  ITLBPort::SenderState *sender_state =
1740  safe_cast<ITLBPort::SenderState*>(pkt->senderState);
1741 
1742  // get the wavefront associated with this translation request
1743  Wavefront *wavefront = sender_state->wavefront;
1744  delete pkt->senderState;
1745 
1746  if (success) {
1747  // pkt is reused in fetch(), don't delete it here. However, we must
1748  // reset the command to be a request so that it can be sent through
1749  // the cu's request port
1750  assert(pkt->cmd == MemCmd::ReadResp);
1751  pkt->cmd = MemCmd::ReadReq;
1752 
1753  computeUnit->fetchStage.fetch(pkt, wavefront);
1754  } else {
1755  if (wavefront->dropFetch) {
1756  assert(wavefront->instructionBuffer.empty());
1757  wavefront->dropFetch = false;
1758  }
1759 
1760  wavefront->pendingFetch = 0;
1761  }
1762 
1763  return true;
1764 }
1765 
1766 /*
1767  * The initial translation request could have been rejected, if
1768  * <retries> queue is not empty. Retry sending the translation
1769  * request. sendRetry() is called from the peer port whenever
1770  * a translation completes.
1771  */
1772 void
1773 ComputeUnit::ITLBPort::recvReqRetry()
1774 {
1775 
1776  int len = retries.size();
1777  DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n",
1778  computeUnit->cu_id, len);
1778 
1779  assert(len > 0);
1780  assert(isStalled());
1781 
1782  // recvReqRetry is an indication that the resource on which this
1783  // port was stalling on is freed. So, remove the stall first
1784  unstallPort();
1785 
1786  for (int i = 0; i < len; ++i) {
1787  PacketPtr pkt = retries.front();
1788  [[maybe_unused]] Addr vaddr = pkt->req->getVaddr();
1789  DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x", computeUnit->cu_id, vaddr);
1790 
1791  if (!sendTimingReq(pkt)) {
1792  stallPort(); // Stall port
1793  DPRINTF(GPUTLB, ": failed again\n");
1794  break;
1795  } else {
1796  DPRINTF(GPUTLB, ": successful\n");
1797  retries.pop_front();
1798  }
1799  }
1800 }
1801 
1802 void
1803 ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
1804 {
1805  if (gpuDynInst->isScalar()) {
1806  if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
1807  stats.sALUInsts++;
1808  stats.instCyclesSALU++;
1809  } else if (gpuDynInst->isLoad()) {
1810  stats.scalarMemReads++;
1811  } else if (gpuDynInst->isStore()) {
1812  stats.scalarMemWrites++;
1813  }
1814  } else {
1815  if (gpuDynInst->isALU()) {
1816  shader->total_valu_insts++;
1817  if (shader->total_valu_insts == shader->max_valu_insts) {
1818  exitSimLoop("max vALU insts");
1819  }
1820  stats.vALUInsts++;
1821  stats.instCyclesVALU++;
1822  stats.threadCyclesVALU
1823  += gpuDynInst->wavefront()->execMask().count();
1824  } else if (gpuDynInst->isFlat()) {
1825  if (gpuDynInst->isLocalMem()) {
1826  stats.flatLDSInsts++;
1827  } else {
1828  stats.flatVMemInsts++;
1829  }
1830  } else if (gpuDynInst->isFlatGlobal()) {
1831  stats.flatVMemInsts++;
1832  } else if (gpuDynInst->isLocalMem()) {
1833  stats.ldsNoFlatInsts++;
1834  } else if (gpuDynInst->isLoad()) {
1835  stats.vectorMemReads++;
1836  } else if (gpuDynInst->isStore()) {
1837  stats.vectorMemWrites++;
1838  }
1839 
1840  if (gpuDynInst->isLoad()) {
1841  switch (gpuDynInst->executedAs()) {
1842  case enums::SC_SPILL:
1843  stats.spillReads++;
1844  break;
1845  case enums::SC_GLOBAL:
1846  stats.globalReads++;
1847  break;
1848  case enums::SC_GROUP:
1849  stats.groupReads++;
1850  break;
1851  case enums::SC_PRIVATE:
1852  stats.privReads++;
1853  break;
1854  case enums::SC_READONLY:
1855  stats.readonlyReads++;
1856  break;
1857  case enums::SC_KERNARG:
1858  stats.kernargReads++;
1859  break;
1860  case enums::SC_ARG:
1861  stats.argReads++;
1862  break;
1863  case enums::SC_NONE:
1864  /**
1865  * this case can occur for flat mem insts
1866  * who execute with EXEC = 0
1867  */
1868  break;
1869  default:
1870  fatal("%s has no valid segment\n", gpuDynInst->disassemble());
1871  break;
1872  }
1873  } else if (gpuDynInst->isStore()) {
1874  switch (gpuDynInst->executedAs()) {
1875  case enums::SC_SPILL:
1876  stats.spillWrites++;
1877  break;
1878  case enums::SC_GLOBAL:
1879  stats.globalWrites++;
1880  break;
1881  case enums::SC_GROUP:
1882  stats.groupWrites++;
1883  break;
1884  case enums::SC_PRIVATE:
1885  stats.privWrites++;
1886  break;
1887  case enums::SC_READONLY:
1888  stats.readonlyWrites++;
1889  break;
1890  case enums::SC_KERNARG:
1891  stats.kernargWrites++;
1892  break;
1893  case enums::SC_ARG:
1894  stats.argWrites++;
1895  break;
1896  case enums::SC_NONE:
1897  /**
1898  * this case can occur for flat mem insts
1899  * who execute with EXEC = 0
1900  */
1901  break;
1902  default:
1903  fatal("%s has no valid segment\n", gpuDynInst->disassemble());
1904  break;
1905  }
1906  }
1907  }
1908 }
1909 
1910 void
1911 ComputeUnit::updatePageDivergenceDist(Addr addr)
1912 {
1913  Addr virt_page_addr = roundDown(addr, X86ISA::PageBytes);
1914 
1915  if (!pagesTouched.count(virt_page_addr))
1916  pagesTouched[virt_page_addr] = 1;
1917  else
1918  pagesTouched[virt_page_addr]++;
1919 }
1920 
1921 void
1922 ComputeUnit::exitCallback()
1923 {
1924  if (countPages) {
1925  std::ostream *page_stat_file = simout.create(name().c_str())->stream();
1926 
1927  *page_stat_file << "page, wavefront accesses, workitem accesses" <<
1928  std::endl;
1929 
1930  for (auto iter : pageAccesses) {
1931  *page_stat_file << std::hex << iter.first << ",";
1932  *page_stat_file << std::dec << iter.second.first << ",";
1933  *page_stat_file << std::dec << iter.second.second << std::endl;
1934  }
1935  }
1936 }
1937 
1938 bool
1939 ComputeUnit::isDone() const
1940 {
1941  for (int i = 0; i < numVectorALUs; ++i) {
1942  if (!isVectorAluIdle(i)) {
1943  return false;
1944  }
1945  }
1946 
1947  // TODO: FIXME if more than 1 of any memory pipe supported
1948  if (!srfToScalarMemPipeBus.rdy()) {
1949  return false;
1950  }
1951  if (!vrfToGlobalMemPipeBus.rdy()) {
1952  return false;
1953  }
1954  if (!vrfToLocalMemPipeBus.rdy()) {
1955  return false;
1956  }
1957 
1958  if (!globalMemoryPipe.isGMReqFIFOWrRdy()
1959  || !localMemoryPipe.isLMReqFIFOWrRdy()
1960  || !localMemoryPipe.isLMRespFIFOWrRdy()
1961  || !locMemToVrfBus.rdy() || !glbMemToVrfBus.rdy()) {
1962  return false;
1963  }
1964 
1965  return true;
1966 }
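
isDone() is a conjunction of per-unit readiness checks that bails out on the first busy unit. The same shape expressed generically, with a toy Unit standing in for gem5's WaitClass:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct Unit {
        bool ready = true;
        bool rdy() const { return ready; } // WaitClass::rdy() stand-in
    };

    bool allIdle(const std::vector<Unit> &units)
    {
        // Like isDone(), this fails on the first busy unit.
        return std::all_of(units.begin(), units.end(),
                           [](const Unit &u) { return u.rdy(); });
    }

    int main()
    {
        std::vector<Unit> units(4);
        units[2].ready = false; // one busy pipe keeps the CU "not done"
        std::printf("%s\n", allIdle(units) ? "done" : "busy");
    }
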
1967 
1968 int32_t
1969 ComputeUnit::getRefCounter(const uint32_t dispatchId,
1970  const uint32_t wgId) const
1971 {
1972  return lds.getRefCounter(dispatchId, wgId);
1973 }
1974 
1975 bool
1976 ComputeUnit::isVectorAluIdle(uint32_t simdId) const
1977 {
1978  assert(simdId < numVectorALUs);
1979 
1980  for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){
1981  if (wfList[simdId][i_wf]->getStatus() != Wavefront::S_STOPPED) {
1982  return false;
1983  }
1984  }
1985 
1986  return true;
1987 }
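
isVectorAluIdle() scans one row of the wfList[simdId][slot] grid: a SIMD is idle only when every wavefront slot mapped to it is S_STOPPED. A toy version of that row scan:

    #include <cstdio>
    #include <vector>

    enum class WfStatus { Stopped, Running, Barrier };

    // One row of a wfList-like grid: the wavefront slots of a single SIMD.
    bool simdIdle(const std::vector<WfStatus> &slots)
    {
        for (WfStatus s : slots)
            if (s != WfStatus::Stopped) // any live wave means "busy"
                return false;
        return true;
    }

    int main()
    {
        std::vector<WfStatus> simd0(8, WfStatus::Stopped);
        simd0[3] = WfStatus::Running;
        std::printf("simd0 idle: %s\n", simdIdle(simd0) ? "yes" : "no");
    }
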
1988 
1989 /**
1990  * send a general request to the LDS
1991  * make sure to look at the return value here as your request might be
1992  * NACK'd and returning false means that you have to have some backup plan
1993  */
1994 bool
1995 ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
1996 {
1997  // this is just a request to carry the GPUDynInstPtr
1998  // back and forth
1999  RequestPtr newRequest = std::make_shared<Request>();
2000  newRequest->setPaddr(0x0);
2001 
2002  // ReadReq is not evaluated by the LDS but the Packet ctor requires this
2003  PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq);
2004 
2005  // This is the SenderState needed upon return
2006  newPacket->senderState = new LDSPort::SenderState(gpuDynInst);
2007 
2008  return ldsPort.sendTimingReq(newPacket);
2009 }
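
The dummy ReadReq packet above exists only to ferry a shared GPUDynInstPtr to the LDS and back via SenderState. A standalone sketch of that carry pattern, with Inst, Packet, and SenderState as simplified stand-ins (not gem5 classes):

    #include <cassert>
    #include <memory>

    struct Inst { int id; };
    using InstPtr = std::shared_ptr<Inst>;

    struct SenderState { InstPtr inst; };
    struct Packet { SenderState *senderState = nullptr; };

    int main()
    {
        auto inst = std::make_shared<Inst>(Inst{42});

        auto *pkt = new Packet;
        pkt->senderState = new SenderState{inst}; // attach on the way out

        // ... the packet travels to the LDS and comes back ...

        InstPtr back = pkt->senderState->inst;    // recover on return
        assert(back->id == 42);
        delete pkt->senderState;                  // response path frees both
        delete pkt;
    }
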
2010 
2011 /**
2012  * get the result of packets sent to the LDS when they return
2013  */
2014 bool
2015 ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
2016 {
2017  const ComputeUnit::LDSPort::SenderState *senderState =
2018  dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState);
2019 
2020  fatal_if(!senderState, "did not get the right sort of sender state");
2021 
2022  GPUDynInstPtr gpuDynInst = senderState->getMemInst();
2023 
2024  delete packet->senderState;
2025  delete packet;
2026 
2027  computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
2028  return true;
2029 }
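
The dynamic_cast plus fatal_if pair above is a checked downcast: a null result means the packet arrived carrying the wrong kind of sender state. A minimal sketch of the idiom, with illustrative Base/Derived types:

    #include <cstdio>
    #include <cstdlib>

    struct Base { virtual ~Base() = default; };
    struct Derived : Base { int payload = 7; };

    int main()
    {
        Base *state = new Derived;
        auto *d = dynamic_cast<Derived *>(state);
        if (!d) { // the fatal_if(!senderState, ...) equivalent
            std::fprintf(stderr, "wrong sort of sender state\n");
            std::exit(1);
        }
        std::printf("payload: %d\n", d->payload);
        delete state;
    }
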
2030 
2031 /**
2032  * attempt to send this packet, either the port is already stalled, the
2033  * request is nack'd and must stall or the request goes through
2034  * when a request cannot be sent, add it to the retries queue
2035  */
2036 bool
2037 ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
2038 {
2039  ComputeUnit::LDSPort::SenderState *sender_state =
2040  dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
2041  fatal_if(!sender_state, "packet without a valid sender state");
2042 
2043  [[maybe_unused]] GPUDynInstPtr gpuDynInst = sender_state->getMemInst();
2044 
2045  if (isStalled()) {
2046  fatal_if(retries.empty(), "a stalled port must have pending retries");
2047 
2048  retries.push(pkt);
2049 
2050  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
2051  computeUnit->cu_id, gpuDynInst->simdId,
2052  gpuDynInst->wfSlotId);
2053  return false;
2054  } else if (!RequestPort::sendTimingReq(pkt)) {
2055  // need to stall the LDS port until a recvReqRetry() is received
2056  // this indicates that there is more space
2057  stallPort();
2058  retries.push(pkt);
2059 
2060  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
2061  computeUnit->cu_id, gpuDynInst->simdId,
2062  gpuDynInst->wfSlotId, pkt->req->getPaddr());
2063  return false;
2064  } else {
2065  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
2066  computeUnit->cu_id, gpuDynInst->simdId,
2067  gpuDynInst->wfSlotId, pkt->req->getPaddr());
2068  return true;
2069  }
2070 }
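
sendTimingReq() thus has three outcomes: queue behind an existing stall, stall and queue on a fresh rejection, or send successfully. A condensed toy model of that logic (StallingPort and tryLowerLevelSend() are illustrative; the latter stands in for RequestPort::sendTimingReq() and always reports "busy" here):

    #include <cstdio>
    #include <queue>

    struct Pkt { int id; };

    struct StallingPort {
        bool stalled = false;
        std::queue<Pkt *> retries;

        // Stand-in for RequestPort::sendTimingReq(); always "busy" here.
        bool tryLowerLevelSend(Pkt *) { return false; }

        bool send(Pkt *p)
        {
            if (stalled) {               // already waiting on a retry
                retries.push(p);
                return false;
            }
            if (!tryLowerLevelSend(p)) { // nack'd: stall and queue
                stalled = true;
                retries.push(p);
                return false;
            }
            return true;                 // accepted by the peer
        }
    };

    int main()
    {
        StallingPort port;
        Pkt a{1}, b{2};
        port.send(&a); // nack'd: port stalls, a is queued
        port.send(&b); // stalled: b is queued behind a
        std::printf("stalled=%d queued=%zu\n",
                    (int)port.stalled, port.retries.size());
    }
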
2071 
2072 /**
2073  * the bus is telling the port that there is now space so retrying stalled
2074  * requests should work now
2075  * this allows the port to have a request be nack'd and then have the receiver
2076  * say when there is space, rather than simply retrying the send every cycle
2077  */
2078 void
2079 ComputeUnit::LDSPort::recvReqRetry()
2080 {
2081  auto queueSize = retries.size();
2082 
2083  DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
2084  computeUnit->cu_id, queueSize);
2085 
2086  fatal_if(queueSize < 1,
2087  "why was there a recvReqRetry() with no pending reqs?");
2088  fatal_if(!isStalled(),
2089  "recvReqRetry() happened when the port was not stalled");
2090 
2091  unstallPort();
2092 
2093  while (!retries.empty()) {
2094  PacketPtr packet = retries.front();
2095 
2096  DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);
2097 
2098  if (!RequestPort::sendTimingReq(packet)) {
2099  // Stall port
2100  stallPort();
2101  DPRINTF(GPUPort, ": LDS send failed again\n");
2102  break;
2103  } else {
2104  DPRINTF(GPUPort, ": LDS send successful\n");
2105  retries.pop();
2106  }
2107  }
2108 }
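
recvReqRetry() drains the retry queue in order and re-stalls on the first rejection, so packet order is preserved across stalls. Continuing the toy StallingPort sketch above:

    void onRetry(StallingPort &port)
    {
        port.stalled = false;            // unstallPort()
        while (!port.retries.empty()) {
            Pkt *p = port.retries.front();
            if (!port.tryLowerLevelSend(p)) {
                port.stalled = true;     // peer still busy; await next retry
                break;                   // the rest stay queued, in order
            }
            port.retries.pop();          // sent: drop from the retry queue
        }
    }
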
2109 
2110 ComputeUnit::ComputeUnitStats::ComputeUnitStats(statistics::Group *parent,
2111  int n_wf)
2112  : statistics::Group(parent),
2113  ADD_STAT(vALUInsts, "Number of vector ALU insts issued."),
2114  ADD_STAT(vALUInstsPerWF, "The avg. number of vector ALU insts issued "
2115  "per-wavefront."),
2116  ADD_STAT(sALUInsts, "Number of scalar ALU insts issued."),
2117  ADD_STAT(sALUInstsPerWF, "The avg. number of scalar ALU insts issued "
2118  "per-wavefront."),
2119  ADD_STAT(instCyclesVALU,
2120  "Number of cycles needed to execute VALU insts."),
2121  ADD_STAT(instCyclesSALU,
2122  "Number of cycles needed to execute SALU insts."),
2123  ADD_STAT(threadCyclesVALU, "Number of thread cycles used to execute "
2124  "vector ALU ops. Similar to instCyclesVALU but multiplied by "
2125  "the number of active threads."),
2126  ADD_STAT(vALUUtilization,
2127  "Percentage of active vector ALU threads in a wave."),
2128  ADD_STAT(ldsNoFlatInsts, "Number of LDS insts issued, not including FLAT"
2129  " accesses that resolve to LDS."),
2130  ADD_STAT(ldsNoFlatInstsPerWF, "The avg. number of LDS insts (not "
2131  "including FLAT accesses that resolve to LDS) per-wavefront."),
2132  ADD_STAT(flatVMemInsts,
2133  "The number of FLAT insts that resolve to vmem issued."),
2134  ADD_STAT(flatVMemInstsPerWF, "The average number of FLAT insts that "
2135  "resolve to vmem issued per-wavefront."),
2136  ADD_STAT(flatLDSInsts,
2137  "The number of FLAT insts that resolve to LDS issued."),
2138  ADD_STAT(flatLDSInstsPerWF, "The average number of FLAT insts that "
2139  "resolve to LDS issued per-wavefront."),
2140  ADD_STAT(vectorMemWrites,
2141  "Number of vector mem write insts (excluding FLAT insts)."),
2142  ADD_STAT(vectorMemWritesPerWF, "The average number of vector mem write "
2143  "insts (excluding FLAT insts) per-wavefront."),
2144  ADD_STAT(vectorMemReads,
2145  "Number of vector mem read insts (excluding FLAT insts)."),
2146  ADD_STAT(vectorMemReadsPerWF, "The avg. number of vector mem read insts "
2147  "(excluding FLAT insts) per-wavefront."),
2148  ADD_STAT(scalarMemWrites, "Number of scalar mem write insts."),
2149  ADD_STAT(scalarMemWritesPerWF,
2150  "The average number of scalar mem write insts per-wavefront."),
2151  ADD_STAT(scalarMemReads, "Number of scalar mem read insts."),
2152  ADD_STAT(scalarMemReadsPerWF,
2153  "The average number of scalar mem read insts per-wavefront."),
2154  ADD_STAT(vectorMemReadsPerKiloInst,
2155  "Number of vector mem reads per kilo-instruction"),
2156  ADD_STAT(vectorMemWritesPerKiloInst,
2157  "Number of vector mem writes per kilo-instruction"),
2158  ADD_STAT(vectorMemInstsPerKiloInst,
2159  "Number of vector mem insts per kilo-instruction"),
2160  ADD_STAT(scalarMemReadsPerKiloInst,
2161  "Number of scalar mem reads per kilo-instruction"),
2162  ADD_STAT(scalarMemWritesPerKiloInst,
2163  "Number of scalar mem writes per kilo-instruction"),
2164  ADD_STAT(scalarMemInstsPerKiloInst,
2165  "Number of scalar mem insts per kilo-instruction"),
2166  ADD_STAT(instCyclesVMemPerSimd, "Number of cycles to send address, "
2167  "command, data from VRF to vector memory unit, per SIMD"),
2168  ADD_STAT(instCyclesScMemPerSimd, "Number of cycles to send address, "
2169  "command, data from SRF to scalar memory unit, per SIMD"),
2170  ADD_STAT(instCyclesLdsPerSimd, "Number of cycles to send address, "
2171  "command, data from VRF to LDS unit, per SIMD"),
2172  ADD_STAT(globalReads, "Number of reads to the global segment"),
2173  ADD_STAT(globalWrites, "Number of writes to the global segment"),
2174  ADD_STAT(globalMemInsts,
2175  "Number of memory instructions sent to the global segment"),
2176  ADD_STAT(argReads, "Number of reads to the arg segment"),
2177  ADD_STAT(argWrites, "Number of writes to the arg segment"),
2178  ADD_STAT(argMemInsts,
2179  "Number of memory instructions sent to the arg segment"),
2180  ADD_STAT(spillReads, "Number of reads to the spill segment"),
2181  ADD_STAT(spillWrites, "Number of writes to the spill segment"),
2182  ADD_STAT(spillMemInsts,
2183  "Number of memory instructions sent to the spill segment"),
2184  ADD_STAT(groupReads, "Number of reads to the group segment"),
2185  ADD_STAT(groupWrites, "Number of writes to the group segment"),
2186  ADD_STAT(groupMemInsts,
2187  "Number of memory instructions sent to the group segment"),
2188  ADD_STAT(privReads, "Number of reads to the private segment"),
2189  ADD_STAT(privWrites, "Number of writes to the private segment"),
2190  ADD_STAT(privMemInsts,
2191  "Number of memory instructions sent to the private segment"),
2192  ADD_STAT(readonlyReads, "Number of reads to the readonly segment"),
2193  ADD_STAT(readonlyWrites,
2194  "Number of writes to the readonly segment"),
2195  ADD_STAT(readonlyMemInsts,
2196  "Number of memory instructions sent to the readonly segment"),
2197  ADD_STAT(kernargReads, "Number of reads to the kernarg segment"),
2198  ADD_STAT(kernargWrites,
2199  "Number of writes to the kernarg segment"),
2200  ADD_STAT(kernargMemInsts,
2201  "Number of memory instructions sent to the kernarg segment"),
2202  ADD_STAT(waveLevelParallelism,
2203  "wave level parallelism: count of active waves at wave launch"),
2204  ADD_STAT(tlbRequests, "number of uncoalesced requests"),
2205  ADD_STAT(tlbCycles,
2206  "total number of cycles for all uncoalesced requests"),
2207  ADD_STAT(tlbLatency, "Avg. translation latency for data translations"),
2208  ADD_STAT(hitsPerTLBLevel,
2209  "TLB hits distribution (0 for page table, x for Lx-TLB)"),
2210  ADD_STAT(ldsBankAccesses, "Total number of LDS bank accesses"),
2211  ADD_STAT(ldsBankConflictDist,
2212  "Number of bank conflicts per LDS memory packet"),
2213  ADD_STAT(pageDivergenceDist,
2214  "pages touched per wf (over all mem. instr.)"),
2215  ADD_STAT(dynamicGMemInstrCnt,
2216  "dynamic non-flat global memory instruction count"),
2217  ADD_STAT(dynamicFlatMemInstrCnt,
2218  "dynamic flat global memory instruction count"),
2219  ADD_STAT(dynamicLMemInstrCnt, "dynamic local memory instruction count"),
2220  ADD_STAT(wgBlockedDueBarrierAllocation,
2221  "WG dispatch was blocked due to lack of barrier resources"),
2222  ADD_STAT(wgBlockedDueLdsAllocation,
2223  "Workgroup blocked due to LDS capacity"),
2224  ADD_STAT(numInstrExecuted, "number of instructions executed"),
2225  ADD_STAT(execRateDist, "Instruction Execution Rate: Number of executed "
2226  "vector instructions per cycle"),
2227  ADD_STAT(numVecOpsExecuted,
2228  "number of vec ops executed (e.g. WF size/inst)"),
2229  ADD_STAT(numVecOpsExecutedF16,
2230  "number of f16 vec ops executed (e.g. WF size/inst)"),
2231  ADD_STAT(numVecOpsExecutedF32,
2232  "number of f32 vec ops executed (e.g. WF size/inst)"),
2233  ADD_STAT(numVecOpsExecutedF64,
2234  "number of f64 vec ops executed (e.g. WF size/inst)"),
2235  ADD_STAT(numVecOpsExecutedFMA16,
2236  "number of fma16 vec ops executed (e.g. WF size/inst)"),
2237  ADD_STAT(numVecOpsExecutedFMA32,
2238  "number of fma32 vec ops executed (e.g. WF size/inst)"),
2239  ADD_STAT(numVecOpsExecutedFMA64,
2240  "number of fma64 vec ops executed (e.g. WF size/inst)"),
2241  ADD_STAT(numVecOpsExecutedMAC16,
2242  "number of mac16 vec ops executed (e.g. WF size/inst)"),
2243  ADD_STAT(numVecOpsExecutedMAC32,
2244  "number of mac32 vec ops executed (e.g. WF size/inst)"),
2245  ADD_STAT(numVecOpsExecutedMAC64,
2246  "number of mac64 vec ops executed (e.g. WF size/inst)"),
2247  ADD_STAT(numVecOpsExecutedMAD16,
2248  "number of mad16 vec ops executed (e.g. WF size/inst)"),
2249  ADD_STAT(numVecOpsExecutedMAD32,
2250  "number of mad32 vec ops executed (e.g. WF size/inst)"),
2251  ADD_STAT(numVecOpsExecutedMAD64,
2252  "number of mad64 vec ops executed (e.g. WF size/inst)"),
2253  ADD_STAT(numVecOpsExecutedTwoOpFP,
2254  "number of two op FP vec ops executed (e.g. WF size/inst)"),
2255  ADD_STAT(totalCycles, "number of cycles the CU ran for"),
2256  ADD_STAT(vpc, "Vector Operations per cycle (this CU only)"),
2257  ADD_STAT(vpc_f16, "F16 Vector Operations per cycle (this CU only)"),
2258  ADD_STAT(vpc_f32, "F32 Vector Operations per cycle (this CU only)"),
2259  ADD_STAT(vpc_f64, "F64 Vector Operations per cycle (this CU only)"),
2260  ADD_STAT(ipc, "Instructions per cycle (this CU only)"),
2261  ADD_STAT(controlFlowDivergenceDist, "number of lanes active per "
2262  "instruction (over all instructions)"),
2263  ADD_STAT(activeLanesPerGMemInstrDist,
2264  "number of active lanes per global memory instruction"),
2265  ADD_STAT(activeLanesPerLMemInstrDist,
2266  "number of active lanes per local memory instruction"),
2267  ADD_STAT(numALUInstsExecuted,
2268  "Number of dynamic non-GM memory insts executed"),
2269  ADD_STAT(numTimesWgBlockedDueVgprAlloc, "Number of times WGs are "
2270  "blocked due to VGPR allocation per SIMD"),
2271  ADD_STAT(numTimesWgBlockedDueSgprAlloc, "Number of times WGs are "
2272  "blocked due to SGPR allocation per SIMD"),
2273  ADD_STAT(numCASOps, "number of compare and swap operations"),
2274  ADD_STAT(numFailedCASOps,
2275  "number of compare and swap operations that failed"),
2276  ADD_STAT(completedWfs, "number of completed wavefronts"),
2277  ADD_STAT(completedWGs, "number of completed workgroups"),
2278  ADD_STAT(headTailLatency, "ticks between first and last cache block "
2279  "arrival at coalescer"),
2280  ADD_STAT(instInterleave, "Measure of instruction interleaving per SIMD")
2281 {
2282  ComputeUnit *cu = static_cast<ComputeUnit*>(parent);
2283 
2284  instCyclesVMemPerSimd.init(cu->numVectorALUs);
2285  instCyclesScMemPerSimd.init(cu->numVectorALUs);
2286  instCyclesLdsPerSimd.init(cu->numVectorALUs);
2287 
2288  hitsPerTLBLevel.init(4);
2289  execRateDist.init(0, 10, 2);
2290  ldsBankConflictDist.init(0, cu->wfSize(), 2);
2291 
2292  pageDivergenceDist.init(1, cu->wfSize(), 4);
2293  controlFlowDivergenceDist.init(1, cu->wfSize(), 4);
2294  activeLanesPerGMemInstrDist.init(1, cu->wfSize(), 4);
2295  activeLanesPerLMemInstrDist.init(1, cu->wfSize(), 4);
2296 
2297  headTailLatency.init(0, 1000000, 10000).flags(statistics::pdf |
2298  statistics::oneline);
2299  waveLevelParallelism.init(0, n_wf * cu->numVectorALUs, 1);
2300  instInterleave.init(cu->numVectorALUs, 0, 20, 1);
2301 
2302  vALUInstsPerWF = vALUInsts / completedWfs;
2303  sALUInstsPerWF = sALUInsts / completedWfs;
2304  vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
2305  ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
2306  flatVMemInstsPerWF = flatVMemInsts / completedWfs;
2307  flatLDSInstsPerWF = flatLDSInsts / completedWfs;
2308  vectorMemWritesPerWF = vectorMemWrites / completedWfs;
2309  vectorMemReadsPerWF = vectorMemReads / completedWfs;
2310  scalarMemWritesPerWF = scalarMemWrites / completedWfs;
2311  scalarMemReadsPerWF = scalarMemReads / completedWfs;
2312 
2313  vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000;
2314  vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000;
2315  vectorMemInstsPerKiloInst =
2316  ((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000;
2317  scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000;
2318  scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000;
2319  scalarMemInstsPerKiloInst =
2320  ((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000;
2321 
2322  vpc = numVecOpsExecuted / totalCycles;
2323  vpc_f16 = numVecOpsExecutedF16 / totalCycles;
2324  vpc_f32 = numVecOpsExecutedF32 / totalCycles;
2325  vpc_f64 = numVecOpsExecutedF64 / totalCycles;
2326  ipc = numInstrExecuted / totalCycles;
2327  numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
2328  dynamicLMemInstrCnt;
2329 
2330  tlbLatency = tlbCycles / tlbRequests;
2331 
2332  // fixed number of TLB levels
2333  for (int i = 0; i < 4; ++i) {
2334  if (!i)
2335  hitsPerTLBLevel.subname(i,"page_table");
2336  else
2337  hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
2338  }
2339 
2345 
2348 }
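
As a worked example of the vALUUtilization formula set in this constructor (the hard-coded 64 is the wavefront width): with the illustrative values instCyclesVALU = 100 and threadCyclesVALU = 3200, utilization is 3200 / (64 * 100) * 100 = 50%, i.e. on average half of a wavefront's lanes were active per VALU instruction:

    #include <cstdio>

    int main()
    {
        double instCyclesVALU = 100;    // VALU instructions issued
        double threadCyclesVALU = 3200; // active lanes summed over them
        double util = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
        std::printf("vALUUtilization = %.1f%%\n", util); // prints 50.0%
    }
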
2349 
2350 } // namespace gem5
gem5::ComputeUnit::ComputeUnitStats::tlbRequests
statistics::Scalar tlbRequests
Definition: compute_unit.hh:1009
gem5::ComputeUnit::ComputeUnitStats::sALUInstsPerWF
statistics::Formula sALUInstsPerWF
Definition: compute_unit.hh:950
gem5::curTick
Tick curTick()
The universal simulation clock.
Definition: cur_tick.hh:46
fatal
#define fatal(...)
This implements a cprintf based fatal() function.
Definition: logging.hh:190
gem5::PortID
int16_t PortID
Port index/ID type, and a symbolic name for an invalid port id.
Definition: types.hh:252
gem5::ComputeUnit::ComputeUnitStats::vALUUtilization
statistics::Formula vALUUtilization
Definition: compute_unit.hh:954
gem5::ComputeUnit::getAndIncSeqNum
InstSeqNum getAndIncSeqNum()
Definition: compute_unit.hh:881
gem5::GMEnqueue
@ GMEnqueue
Definition: misc.hh:56
gem5::HSAQueueEntry::numWg
int numWg(int dim) const
Definition: hsa_queue_entry.hh:235
gem5::ComputeUnit::wfList
std::vector< std::vector< Wavefront * > > wfList
Definition: compute_unit.hh:291
gem5::ComputeUnit::ComputeUnit
ComputeUnit(const Params &p)
Definition: compute_unit.cc:64
gem5::BaseMMU::Read
@ Read
Definition: mmu.hh:56
gem5::ComputeUnit::ComputeUnitStats::scalarMemReadsPerWF
statistics::Formula scalarMemReadsPerWF
Definition: compute_unit.hh:968
gem5::RequestPort::sendTimingReq
bool sendTimingReq(PacketPtr pkt)
Attempt to send a timing request to the responder port by calling its corresponding receive function.
Definition: port.hh:495
gem5::ArmISA::len
Bitfield< 18, 16 > len
Definition: misc_types.hh:445
gem5::LdsState::getRefCounter
int getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
return the current reference count for this workgroup id
Definition: lds_state.hh:330
gem5::ComputeUnit::ComputeUnitStats::instCyclesSALU
statistics::Scalar instCyclesSALU
Definition: compute_unit.hh:952
gem5::Shader::gpuTc
ThreadContext * gpuTc
Definition: shader.hh:108
simple_pool_manager.hh
gem5::Wavefront::S_RUNNING
@ S_RUNNING
Definition: wavefront.hh:70
gem5::ComputeUnit::fetchStage
FetchStage fetchStage
Definition: compute_unit.hh:280
gem5::ComputeUnit::ComputeUnitStats::instInterleave
statistics::VectorDistribution instInterleave
Definition: compute_unit.hh:1088
gem5::ComputeUnit::ComputeUnitStats::flatVMemInsts
statistics::Scalar flatVMemInsts
Definition: compute_unit.hh:957
gem5::ComputeUnit::ScalarDTLBPort::retries
std::deque< PacketPtr > retries
Definition: compute_unit.hh:712
gem5::ComputeUnit::sendRequest
void sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
Definition: compute_unit.cc:1009
gem5::ScalarMemPipeline::exec
void exec()
Definition: scalar_memory_pipeline.cc:54
gem5::MipsISA::w
Bitfield< 0 > w
Definition: pra_constants.hh:281
gem5::ComputeUnit::debugSegFault
bool debugSegFault
Definition: compute_unit.hh:341
gem5::FetchStage::exec
void exec()
Definition: fetch_stage.cc:65
shader.hh
gem5::ComputeUnit::DataPort::processMemReqEvent
void processMemReqEvent(PacketPtr pkt)
Definition: compute_unit.cc:1571
gem5::ComputeUnit::localMemoryPipe
LocalMemPipeline localMemoryPipe
Definition: compute_unit.hh:285
gem5::ComputeUnit::ComputeUnitStats::privWrites
statistics::Scalar privWrites
Definition: compute_unit.hh:996
gem5::ComputeUnit::ComputeUnitStats::kernargWrites
statistics::Scalar kernargWrites
Definition: compute_unit.hh:1002
gem5::MemCmd::SwapReq
@ SwapReq
Definition: packet.hh:115
gem5::ComputeUnit::numVecRegsPerSimd
int numVecRegsPerSimd
Definition: compute_unit.hh:371
gem5::ComputeUnit::srf
std::vector< ScalarRegisterFile * > srf
Definition: compute_unit.hh:297
gem5::ComputeUnit::ComputeUnitStats::scalarMemWritesPerWF
statistics::Formula scalarMemWritesPerWF
Definition: compute_unit.hh:966
gem5::ComputeUnit::ComputeUnitStats::argMemInsts
statistics::Formula argMemInsts
Definition: compute_unit.hh:988
gem5::ComputeUnit::ITLBPort::recvTimingResp
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
Definition: compute_unit.cc:1720
gem5::ComputeUnit::ComputeUnitStats::spillWrites
statistics::Scalar spillWrites
Definition: compute_unit.hh:990
gem5::ComputeUnit::ComputeUnitStats::spillMemInsts
statistics::Formula spillMemInsts
Definition: compute_unit.hh:991
gem5::ComputeUnit::ComputeUnitStats::scalarMemWritesPerKiloInst
statistics::Formula scalarMemWritesPerKiloInst
Definition: compute_unit.hh:974
gem5::MipsISA::index
Bitfield< 30, 0 > index
Definition: pra_constants.hh:47
gem5::ComputeUnit::ComputeUnitStats::readonlyReads
statistics::Scalar readonlyReads
Definition: compute_unit.hh:998
gem5::ComputeUnit::LDSPort::SenderState
SenderState is information carried along with the packet, esp.
Definition: compute_unit.hh:783
gem5::ComputeUnit::ComputeUnitStats::wgBlockedDueBarrierAllocation
statistics::Scalar wgBlockedDueBarrierAllocation
Definition: compute_unit.hh:1028
gem5::ComputeUnit::DTLBPort::recvTimingResp
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
Definition: compute_unit.cc:1387
gem5::Packet::pushSenderState
void pushSenderState(SenderState *sender_state)
Push a new sender state to the packet and make the current sender state the predecessor of the new on...
Definition: packet.cc:316
gem5::ComputeUnit::ComputeUnitStats::vectorMemReads
statistics::Scalar vectorMemReads
Definition: compute_unit.hh:963
gem5::BaseMMU::Mode
Mode
Definition: mmu.hh:56
gem5::Packet::req
RequestPtr req
A pointer to the original request.
Definition: packet.hh:366
gem5::ComputeUnit::lastVaddrSimd
std::vector< std::vector< Addr > > lastVaddrSimd
Definition: compute_unit.hh:336
gem5::BaseMMU::Write
@ Write
Definition: mmu.hh:56
gem5::Wavefront
Definition: wavefront.hh:60
gem5::ComputeUnit::ScalarDTLBPort::isStalled
bool isStalled() const
Definition: compute_unit.hh:708
gem5::FetchStage::init
void init()
Definition: fetch_stage.cc:56
gem5::ComputeUnit::ComputeUnitStats::numVecOpsExecutedF64
statistics::Scalar numVecOpsExecutedF64
Definition: compute_unit.hh:1044
gem5::ComputeUnit::ComputeUnitStats::dynamicGMemInstrCnt
statistics::Scalar dynamicGMemInstrCnt
Definition: compute_unit.hh:1023
gem5::HSAQueueEntry
Definition: hsa_queue_entry.hh:59
compute_unit.hh
gem5::ComputeUnit::firstMemUnit
int firstMemUnit() const
Definition: compute_unit.cc:239
gem5::X86ISA::GpuTLB::TranslationState::saved
Packet::SenderState * saved
Definition: tlb.hh:308
gem5::ComputeUnit::pagesTouched
std::map< Addr, int > pagesTouched
Definition: compute_unit.hh:378
gpu_static_inst.hh
gem5::VectorMask
std::bitset< std::numeric_limits< unsigned long long >::digits > VectorMask
Definition: misc.hh:45
gem5::ComputeUnit::scoreboardCheckStage
ScoreboardCheckStage scoreboardCheckStage
Definition: compute_unit.hh:281
gem5::ComputeUnit::stats
gem5::ComputeUnit::ComputeUnitStats stats
gem5::ComputeUnit::headTailMap
std::unordered_map< GPUDynInstPtr, Tick > headTailMap
Definition: compute_unit.hh:937
gem5::ComputeUnit::ComputeUnitStats::vpc_f16
statistics::Formula vpc_f16
Definition: compute_unit.hh:1062
gem5::floorLog2
static constexpr std::enable_if_t< std::is_integral_v< T >, int > floorLog2(T x)
Definition: intmath.hh:59
gem5::ComputeUnit::ComputeUnitStats::tlbLatency
statistics::Formula tlbLatency
Definition: compute_unit.hh:1011
gem5::simout
OutputDirectory simout
Definition: output.cc:62
gem5::ComputeUnit::DataPort::processMemRespEvent
void processMemRespEvent(PacketPtr pkt)
Definition: compute_unit.cc:1307
gem5::ComputeUnit::lastVaddrCU
std::vector< Addr > lastVaddrCU
Definition: compute_unit.hh:335
gem5::MemCmd::SwapResp
@ SwapResp
Definition: packet.hh:116
gem5::ComputeUnit::ScalarDTLBPort::recvTimingResp
bool recvTimingResp(PacketPtr pkt) override
Receive a timing response from the peer.
Definition: compute_unit.cc:1660
gem5::ComputeUnit::resp_tick_latency
Tick resp_tick_latency
Definition: compute_unit.hh:356
gem5::statistics::DataWrapVec::subname
Derived & subname(off_type index, const std::string &name)
Set the subfield name for the given index, and marks this stat to print at the end of simulation.
Definition: statistics.hh:402
gem5::ComputeUnit::ComputeUnitStats::sALUInsts
statistics::Scalar sALUInsts
Definition: compute_unit.hh:949
gem5::Packet::isWrite
bool isWrite() const
Definition: packet.hh:583
gem5::ComputeUnit::exec
void exec()
Definition: compute_unit.cc:719
gem5::X86ISA::GpuTLB::TranslationState::tlbMode
Mode tlbMode
Definition: tlb.hh:286
gem5::Wavefront::pendingFetch
bool pendingFetch
Definition: wavefront.hh:111
gem5::ComputeUnit::srfToScalarMemPipeBus
WaitClass srfToScalarMemPipeBus
Definition: compute_unit.hh:239
gem5::ComputeUnit::releaseBarrier
void releaseBarrier(int bar_id)
Definition: compute_unit.cc:696
gem5::ComputeUnit::ComputeUnitStats::instCyclesScMemPerSimd
statistics::Vector instCyclesScMemPerSimd
Definition: compute_unit.hh:980
gem5::Shader::notifyCuSleep
void notifyCuSleep()
Definition: shader.cc:516
gem5::ComputeUnit::numYetToReachBarrier
int numYetToReachBarrier(int bar_id)
Definition: compute_unit.cc:647
gem5::HSAQueueEntry::wgId
int wgId(int dim) const
Definition: hsa_queue_entry.hh:209
gem5::ComputeUnit::ComputeUnitStats::ldsBankConflictDist
statistics::Distribution ldsBankConflictDist
Definition: compute_unit.hh:1017
gem5::ComputeUnit::getRefCounter
int32_t getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
Definition: compute_unit.cc:1969
gem5::EventManager::schedule
void schedule(Event &event, Tick when)
Definition: eventq.hh:1019
gem5::ComputeUnit::ITLBPort::SenderState::wavefront
Wavefront * wavefront
Definition: compute_unit.hh:742
gem5::OutputDirectory::create
OutputStream * create(const std::string &name, bool binary=false, bool no_gz=false)
Creates a file in this directory (optionally compressed).
Definition: output.cc:210
gem5::ComputeUnit::ComputeUnitStats::kernargReads
statistics::Scalar kernargReads
Definition: compute_unit.hh:1001
gem5::csprintf
std::string csprintf(const char *format, const Args &...args)
Definition: cprintf.hh:161
gem5::Wavefront::S_STOPPED
@ S_STOPPED
Definition: wavefront.hh:66
gem5::X86ISA::PageShift
const Addr PageShift
Definition: page_size.hh:48
gem5::ComputeUnit::vrfToGlobalMemPipeBus
WaitClass vrfToGlobalMemPipeBus
Definition: compute_unit.hh:223
gem5::ComputeUnit::ComputeUnitStats::ldsNoFlatInsts
statistics::Scalar ldsNoFlatInsts
Definition: compute_unit.hh:955
gem5::ComputeUnit::resetBarrier
void resetBarrier(int bar_id)
Definition: compute_unit.cc:682
gem5::ComputeUnit::ComputeUnitStats::globalReads
statistics::Scalar globalReads
Definition: compute_unit.hh:983
gem5::ComputeUnit::ComputeUnitStats::groupMemInsts
statistics::Formula groupMemInsts
Definition: compute_unit.hh:994
gem5::ComputeUnit::ComputeUnitStats::vpc
statistics::Formula vpc
Definition: compute_unit.hh:1061
gem5::RegisterManager::vrfPoolMgrs
std::vector< PoolManager * > vrfPoolMgrs
Definition: register_manager.hh:80
gem5::ComputeUnit::ComputeUnitStats::activeLanesPerLMemInstrDist
statistics::Distribution activeLanesPerLMemInstrDist
Definition: compute_unit.hh:1068
gem5::ComputeUnit::memPortTokens
TokenManager * memPortTokens
Definition: compute_unit.hh:504
gem5::GlobalMemPipeline::init
void init()
Definition: global_memory_pipeline.cc:57
gem5::ArmISA::i
Bitfield< 7 > i
Definition: misc_types.hh:67
gem5::ComputeUnit::numVectorSharedMemUnits
int numVectorSharedMemUnits
Definition: compute_unit.hh:227
gem5::ComputeUnit::shader
Shader * shader
Definition: compute_unit.hh:353
gem5::ComputeUnit::req_tick_latency
Tick req_tick_latency
Definition: compute_unit.hh:355
gem5::ComputeUnit::ComputeUnitStats::vectorMemWritesPerWF
statistics::Formula vectorMemWritesPerWF
Definition: compute_unit.hh:962
sim_exit.hh
gem5::HSAQueueEntry::numScalarRegs
int numScalarRegs() const
Definition: hsa_queue_entry.hh:141
gem5::isPowerOf2
static constexpr bool isPowerOf2(const T &n)
Definition: intmath.hh:98
output.hh
gem5::ComputeUnit::ComputeUnitStats::headTailLatency
statistics::Distribution headTailLatency
Definition: compute_unit.hh:1082
gem5::ComputeUnit::scalarDataPort
ScalarDataPort scalarDataPort
Definition: compute_unit.hh:849
gem5::ComputeUnit::ComputeUnitStats::threadCyclesVALU
statistics::Scalar threadCyclesVALU
Definition: compute_unit.hh:953
gem5::ComputeUnit::cu_id
int cu_id
Definition: compute_unit.hh:292
gem5::ComputeUnit::ComputeUnitStats::vectorMemReadsPerWF
statistics::Formula vectorMemReadsPerWF
Definition: compute_unit.hh:964
gem5::statistics::DistBase::sample
void sample(const U &v, int n=1)
Add a value to the distribution n times.
Definition: statistics.hh:1328
gem5::ComputeUnit::vrf
std::vector< VectorRegisterFile * > vrf
Definition: compute_unit.hh:295
gem5::ComputeUnit::ComputeUnitStats::instCyclesVMemPerSimd
statistics::Vector instCyclesVMemPerSimd
Definition: compute_unit.hh:979
wavefront.hh
gem5::exitSimLoop
void exitSimLoop(const std::string &message, int exit_code, Tick when, Tick repeat, bool serialize)
Schedule an event to exit the simulation loop (returning to Python) at the end of the current cycle (...
Definition: sim_events.cc:88
gem5::TokenRequestPort::setTokenManager
void setTokenManager(TokenManager *_tokenManager)
Specify a token manger, which will handle tracking of tokens for a TokenRequestPort/ResponseRequestPo...
Definition: token_port.cc:72
gem5::ComputeUnit::SQCPort::recvReqRetry
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
Definition: compute_unit.cc:986
gem5::ComputeUnit::ComputeUnitStats::groupReads
statistics::Scalar groupReads
Definition: compute_unit.hh:992
gem5::GPUComputeDriver::setMtype
void setMtype(RequestPtr req)
Called by the compute units right before a request is issued to ruby.
Definition: gpu_compute_driver.cc:1016
gem5::ComputeUnit::ComputeUnitStats::vpc_f64
statistics::Formula vpc_f64
Definition: compute_unit.hh:1064
gem5::ComputeUnit::injectGlobalMemFence
void injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelMemSync, RequestPtr req=nullptr)
Definition: compute_unit.cc:1229
gem5::ComputeUnit::ScalarDataPort::computeUnit
ComputeUnit * computeUnit
Definition: compute_unit.hh:599
gem5::ComputeUnit::LDSPort::sendTimingReq
virtual bool sendTimingReq(PacketPtr pkt)
attempt to send this packet, either the port is already stalled, the request is nack'd and must stall...
Definition: compute_unit.cc:2037
gem5::ComputeUnit::locMemToVrfBus
WaitClass locMemToVrfBus
Definition: compute_unit.hh:229
gem5::MemCmd
Definition: packet.hh:75
gem5::statistics::pdf
const FlagsType pdf
Print the percent of the total that this entry represents.
Definition: info.hh:62
gem5::ComputeUnit::ComputeUnitStats::kernargMemInsts
statistics::Formula kernargMemInsts
Definition: compute_unit.hh:1003
gem5::ComputeUnit::ComputeUnitStats::flatVMemInstsPerWF
statistics::Formula flatVMemInstsPerWF
Definition: compute_unit.hh:958
gem5::Packet::dataStatic
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
Definition: packet.hh:1134
gem5::Wavefront::setStatus
void setStatus(status_e newStatus)
Definition: wavefront.cc:518
gem5::LdsState::canReserve
bool canReserve(uint32_t x_size) const
can this much space be reserved for a workgroup?
Definition: lds_state.hh:478
gem5::ComputeUnit::numScalarMemUnits
int numScalarMemUnits
Definition: compute_unit.hh:235
gem5::GPUDispatcher::updateWbCounter
bool updateWbCounter(int kern_id, int val=-1)
update the counter of outstanding wb requests for the kernel kern_id: kernel id val: +1/-1,...
Definition: dispatcher.cc:266
gem5::ComputeUnit::DTLBPort::recvReqRetry
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
Definition: compute_unit.cc:1629
gem5::ComputeUnit::ITLBPort::SenderState
SenderState is information carried along with the packet throughout the TLB hierarchy.
Definition: compute_unit.hh:739
gem5::ArmISA::j
Bitfield< 24 > j
Definition: misc_types.hh:57
gem5::X86ISA::GpuTLB::TranslationState
TLB TranslationState: this currently is a somewhat bastardization of the usage of SenderState,...
Definition: tlb.hh:283
gem5::ComputeUnit
Definition: compute_unit.hh:201
gem5::ComputeUnit::ScalarDataPort::MemReqEvent::process
void process()
Definition: compute_unit.cc:1600
gem5::ComputeUnit::pageAccesses
pageDataStruct pageAccesses
Definition: compute_unit.hh:483
gem5::X86ISA::GpuTLB::TranslationState::hitLevel
int hitLevel
Definition: tlb.hh:307
gem5::HSAQueueEntry::MAX_DIM
const static int MAX_DIM
Definition: hsa_queue_entry.hh:310
gem5::ComputeUnit::ScalarDataPort::retries
std::deque< PacketPtr > retries
Definition: compute_unit.hh:596
gem5::OutputStream::stream
std::ostream * stream() const
Get the underlying output stream.
Definition: output.hh:62
gem5::ComputeUnit::ScalarDataPort::SenderState::_gpuDynInst
GPUDynInstPtr _gpuDynInst
Definition: compute_unit.hh:575
gem5::X86ISA::GpuTLB::TranslationState::ports
std::vector< ResponsePort * > ports
Definition: tlb.hh:301
gem5::ComputeUnit::ComputeUnitStats::flatLDSInsts
statistics::Scalar flatLDSInsts
Definition: compute_unit.hh:959
gem5::ComputeUnit::numScalarALUs
int numScalarALUs
Definition: compute_unit.hh:248
gem5::statistics::VectorDistribution::init
VectorDistribution & init(size_type size, Counter min, Counter max, Counter bkt)
Initialize storage and parameters for this distribution.
Definition: statistics.hh:2278
gem5::ComputeUnit::numVectorALUs
int numVectorALUs
Definition: compute_unit.hh:244
vector_register_file.hh
gem5::Packet::isRead
bool isRead() const
Definition: packet.hh:582
gem5::LocalMemPipeline::exec
void exec()
Definition: local_memory_pipeline.cc:52
gem5::ComputeUnit::startWavefront
void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, HSAQueueEntry *task, int bar_id, bool fetchContext=false)
Definition: compute_unit.cc:309
gem5::WaitClass::init
void init(ClockedObject *_clockedObject, uint64_t _numStages=0)
Definition: misc.hh:76
gem5::ComputeUnit::ComputeUnitStats::privReads
statistics::Scalar privReads
Definition: compute_unit.hh:995
gem5::ComputeUnit::functionalTLB
bool functionalTLB
Definition: compute_unit.hh:345
gem5::ComputeUnit::numAtBarrier
int numAtBarrier(int bar_id)
Definition: compute_unit.cc:668
gem5::ComputeUnit::incNumAtBarrier
void incNumAtBarrier(int bar_id)
Definition: compute_unit.cc:661
gem5::statistics::Distribution::init
Distribution & init(Counter min, Counter max, Counter bkt)
Set the parameters of this distribution.
Definition: statistics.hh:2113
gem5::MemCmd::WriteResp
@ WriteResp
Definition: packet.hh:90
gem5::ComputeUnit::ComputeUnitStats::completedWfs
statistics::Scalar completedWfs
Definition: compute_unit.hh:1077
gem5::HSAQueueEntry::numVectorRegs
int numVectorRegs() const
Definition: hsa_queue_entry.hh:135
gem5::ComputeUnit::ComputeUnitStats::numVecOpsExecuted
statistics::Scalar numVecOpsExecuted
Definition: compute_unit.hh:1038
gem5::Named::name
virtual std::string name() const
Definition: named.hh:47
gem5::WFBarrier::InvalidID
static const int InvalidID
Definition: compute_unit.hh:97
gem5::ScheduleStage::init
void init()
Definition: schedule_stage.cc:76
gem5::ComputeUnit::decMaxBarrierCnt
void decMaxBarrierCnt(int bar_id)
Definition: compute_unit.cc:689
gem5::ComputeUnit::vectorSharedMemUnit
WaitClass vectorSharedMemUnit
Definition: compute_unit.hh:233
gem5::ComputeUnit::releaseWFsFromBarrier
void releaseWFsFromBarrier(int bar_id)
Definition: compute_unit.cc:704
gem5::ComputeUnit::ComputeUnitStats::activeLanesPerGMemInstrDist
statistics::Distribution activeLanesPerGMemInstrDist
Definition: compute_unit.hh:1067
DPRINTF
#define DPRINTF(x,...)
Definition: trace.hh:186
ADD_STAT
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
Definition: group.hh:75
gem5::ComputeUnit::scalarMemUnit
WaitClass scalarMemUnit
Definition: compute_unit.hh:241
gem5::Packet
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
Definition: packet.hh:283
gem5::ArmISA::d
Bitfield< 9 > d
Definition: misc_types.hh:64
gem5::ComputeUnit::execStage
ExecStage execStage
Definition: compute_unit.hh:283
gem5::ComputeUnit::ScalarDataPort::MemReqEvent::description
const char * description() const
Return a C string describing the event.
Definition: compute_unit.cc:1594
gem5::ComputeUnit::ComputeUnitStats::vALUInsts
statistics::Scalar vALUInsts
Definition: compute_unit.hh:947
gem5::probing::Packet
ProbePointArg< PacketInfo > Packet
Packet probe point.
Definition: mem.hh:109
gem5::ComputeUnit::ComputeUnitStats::instCyclesVALU
statistics::Scalar instCyclesVALU
Definition: compute_unit.hh:951
gem5::MipsISA::p
Bitfield< 0 > p
Definition: pra_constants.hh:326
gem5::Tick
uint64_t Tick
Tick count type.
Definition: types.hh:58
gem5::Wavefront::wfSlotId
const int wfSlotId
Definition: wavefront.hh:96
gem5::RequestPtr
std::shared_ptr< Request > RequestPtr
Definition: request.hh:92
gem5::ComputeUnit::tickEvent
EventFunctionWrapper tickEvent
Definition: compute_unit.hh:288
gem5::LocalMemPipeline::isLMRespFIFOWrRdy
bool isLMRespFIFOWrRdy() const
Definition: local_memory_pipeline.hh:68
gem5::MemCmd::ReadReq
@ ReadReq
Definition: packet.hh:86
gem5::RR
@ RR
Definition: compute_unit.hh:75
gem5::MemCmd::MemSyncReq
@ MemSyncReq
Definition: packet.hh:119
process.hh
gem5::ComputeUnit::globalMemoryPipe
GlobalMemPipeline globalMemoryPipe
Definition: compute_unit.hh:284
gem5::ComputeUnit::resetRegisterPool
void resetRegisterPool()
Definition: compute_unit.cc:410
gem5::ComputeUnit::registerManager
RegisterManager * registerManager
Definition: compute_unit.hh:278
gem5::ComputeUnit::ComputeUnitStats::numInstrExecuted
statistics::Scalar numInstrExecuted
Definition: compute_unit.hh:1033
gem5::ComputeUnit::ScalarDataPort::recvTimingResp
bool recvTimingResp(PacketPtr pkt) override
Receive a timing response from the peer.
Definition: compute_unit.cc:902
gem5::HSAQueueEntry::isInvDone
bool isInvDone() const
Is invalidate done?
Definition: hsa_queue_entry.hh:354
gem5::ComputeUnit::ITLBPort::recvReqRetry
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
Definition: compute_unit.cc:1773
gem5::ComputeUnit::ScalarDTLBPort::stallPort
void stallPort()
Definition: compute_unit.hh:709
gem5::GlobalMemPipeline::isGMReqFIFOWrRdy
bool isGMReqFIFOWrRdy(uint32_t pendReqs=0) const
Definition: global_memory_pipeline.hh:95
gem5::Wavefront::S_BARRIER
@ S_BARRIER
WF is stalled at a barrier.
Definition: wavefront.hh:92
gem5::ComputeUnit::DataPort::createMemReqEvent
EventFunctionWrapper * createMemReqEvent(PacketPtr pkt)
Definition: compute_unit.cc:1555
gem5::ComputeUnit::ComputeUnitStats::vectorMemInstsPerKiloInst
statistics::Formula vectorMemInstsPerKiloInst
Definition: compute_unit.hh:972
gem5::ComputeUnit::~ComputeUnit
~ComputeUnit()
Definition: compute_unit.cc:218
scalar_register_file.hh
gpu_dyn_inst.hh
gem5::ComputeUnit::DTLBPort::SenderState::_gpuDynInst
GPUDynInstPtr _gpuDynInst
Definition: compute_unit.hh:668
gem5::HSAQueueEntry::wgSize
int wgSize(int dim) const
Definition: hsa_queue_entry.hh:121
gem5::ComputeUnit::activeWaves
int activeWaves
Definition: compute_unit.hh:941
gem5::ComputeUnit::ComputeUnitStats::numTimesWgBlockedDueVgprAlloc
statistics::Scalar numTimesWgBlockedDueVgprAlloc
Definition: compute_unit.hh:1072
gem5::RegisterManager::srfPoolMgrs
std::vector< PoolManager * > srfPoolMgrs
Definition: register_manager.hh:79
gem5::HSAQueueEntry::codeAddr
Addr codeAddr() const
Definition: hsa_queue_entry.hh:177
gem5::LdsChunk
this represents a slice of the overall LDS, intended to be associated with an individual workgroup
Definition: lds_state.hh:56
gem5::ComputeUnit::mapWaveToScalarMem
int mapWaveToScalarMem(Wavefront *w) const
Definition: compute_unit.cc:287
gpu_command_processor.hh
gem5::ComputeUnit::mapWaveToGlobalMem
int mapWaveToGlobalMem(Wavefront *w) const
Definition: compute_unit.cc:271
gem5::roundDown
static constexpr T roundDown(const T &val, const U &align)
This function is used to align addresses in memory.
Definition: intmath.hh:279
gem5::ComputeUnit::deleteFromPipeMap
void deleteFromPipeMap(Wavefront *w)
Definition: compute_unit.cc:507
gem5::ExecStage::init
void init()
Definition: exec_stage.cc:59
gem5::ComputeUnit::doFlush
void doFlush(GPUDynInstPtr gpuDynInst)
trigger flush operation in the cu
Definition: compute_unit.cc:402
gem5::ComputeUnit::DataPort::SenderState::port_index
PortID port_index
Definition: compute_unit.hh:519
gem5::ComputeUnit::init
virtual void init() override
init() is called after all C++ SimObjects have been created and all ports are connected.
Definition: compute_unit.cc:752
gem5::HSAQueueEntry::globalWgId
int globalWgId() const
Definition: hsa_queue_entry.hh:223
gem5::HSAQueueEntry::gridSize
int gridSize(int dim) const
Definition: hsa_queue_entry.hh:128
gem5::ComputeUnit::scalarALUs
std::vector< WaitClass > scalarALUs
Definition: compute_unit.hh:249
gem5::ComputeUnit::DataPort::SenderState::_gpuDynInst
GPUDynInstPtr _gpuDynInst
Definition: compute_unit.hh:518
gem5::ComputeUnit::memPort
std::vector< DataPort > memPort
The memory port for SIMD data accesses.
Definition: compute_unit.hh:845
gem5::OLDEST
@ OLDEST
Definition: compute_unit.hh:74
gem5::ComputeUnit::ComputeUnitStats::scalarMemReadsPerKiloInst
statistics::Formula scalarMemReadsPerKiloInst
Definition: compute_unit.hh:973
gem5::X86ISA::pf
Bitfield< 2 > pf
Definition: misc.hh:556
gem5::ComputeUnit::ComputeUnitStats::vectorMemReadsPerKiloInst
statistics::Formula vectorMemReadsPerKiloInst
Definition: compute_unit.hh:970
gem5::Packet::cmd
MemCmd cmd
The command field of the packet.
Definition: packet.hh:361
gem5::ComputeUnit::DTLBPort::SenderState::portIndex
PortID portIndex
Definition: compute_unit.hh:672
gem5::ComputeUnit::perLaneTLB
bool perLaneTLB
Definition: compute_unit.hh:329
gem5::ComputeUnit::lastMemUnit
int lastMemUnit() const
Definition: compute_unit.cc:246
gem5::LocalMemPipeline::isLMReqFIFOWrRdy
bool isLMReqFIFOWrRdy(uint32_t pendReqs=0) const
Definition: local_memory_pipeline.hh:74
gem5::ComputeUnit::ScalarDTLBPort::SenderState::_gpuDynInst
GPUDynInstPtr _gpuDynInst
Definition: compute_unit.hh:702
gem5::ComputeUnit::ComputeUnitStats::groupWrites
statistics::Scalar groupWrites
Definition: compute_unit.hh:993
gem5::Addr
uint64_t Addr
Address type. This will probably be moved somewhere else in the near future.
Definition: types.hh:147
gem5::ComputeUnit::ComputeUnitStats::globalWrites
statistics::Scalar globalWrites
Definition: compute_unit.hh:984
gem5::ComputeUnit::ComputeUnitStats::vALUInstsPerWF
statistics::Formula vALUInstsPerWF
Definition: compute_unit.hh:948
gem5::LdsState::increaseRefCounter
int increaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
use the dynamic wave id to create or just increase the reference count
Definition: lds_state.hh:295
tlb.hh
gem5::Packet::senderState
SenderState * senderState
This packet's sender state.
Definition: packet.hh:534
gem5::ComputeUnit::ComputeUnitStats::numTimesWgBlockedDueSgprAlloc
statistics::Scalar numTimesWgBlockedDueSgprAlloc
Definition: compute_unit.hh:1074
gem5::ComputeUnit::ComputeUnitStats::numVecOpsExecutedF16
statistics::Scalar numVecOpsExecutedF16
Definition: compute_unit.hh:1040
gem5::GpuTranslationState
X86ISA::GpuTLB::TranslationState GpuTranslationState
Definition: tlb.hh:439
gem5::ComputeUnit::barrierSlot
WFBarrier & barrierSlot(int bar_id)
Definition: compute_unit.hh:418
name
const std::string & name()
Definition: trace.cc:49
gem5::ComputeUnit::exitCallback
void exitCallback()
Definition: compute_unit.cc:1922
gem5::ComputeUnit::SQCPort::recvTimingResp
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
Definition: compute_unit.cc:979
gem5::ComputeUnit::ComputeUnitStats::privMemInsts
statistics::Formula privMemInsts
Definition: compute_unit.hh:997
gem5::ComputeUnit::mapWaveToScalarAlu
int mapWaveToScalarAlu(Wavefront *w) const
Definition: compute_unit.cc:253
gem5::GPUDynInstPtr
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition: misc.hh:49
gem5::ComputeUnit::hasDispResources
bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg)
Definition: compute_unit.cc:519
gem5::ComputeUnit::DataPort::computeUnit
ComputeUnit * computeUnit
Definition: compute_unit.hh:538
gem5::ComputeUnit::getFreeBarrierId
int getFreeBarrierId()
Definition: compute_unit.hh:425
gem5::ComputeUnit::wfSize
int wfSize() const
Definition: compute_unit.hh:394
gem5::ClockedObject
The ClockedObject class extends the SimObject with a clock and accessor functions to relate ticks to ...
Definition: clocked_object.hh:234
gem5::ComputeUnit::pipeMap
std::unordered_set< uint64_t > pipeMap
Definition: compute_unit.hh:276
gem5::ComputeUnit::LDSPort::recvReqRetry
virtual void recvReqRetry()
the bus is telling the port that there is now space so retrying stalled requests should work now this...
Definition: compute_unit.cc:2079
gem5::MemCmd::toString
const std::string & toString() const
Return the string to a cmd given by idx.
Definition: packet.hh:265
gem5::Shader::timingSim
bool timingSim
Definition: shader.hh:189
gem5::Process
Definition: process.hh:68
gem5::GPUDispatcher::notifyWgCompl
void notifyWgCompl(Wavefront *wf)
When an end program instruction detects that the last WF in a WG has completed it will call this meth...
Definition: dispatcher.cc:295
gem5::EventFunctionWrapper
Definition: eventq.hh:1115
gem5::ThreadContext::getProcessPtr
virtual Process * getProcessPtr()=0
gem5::Clocked::nextCycle
Tick nextCycle() const
Based on the clock of the object, determine the start tick of the first cycle that is at least one cy...
Definition: clocked_object.hh:213
gem5::ComputeUnit::updateInstStats
void updateInstStats(GPUDynInstPtr gpuDynInst)
Definition: compute_unit.cc:1803
gem5::ComputeUnit::ComputeUnitStats::numALUInstsExecuted
statistics::Formula numALUInstsExecuted
Definition: compute_unit.hh:1070
gem5::ComputeUnit::ComputeUnitStats::instCyclesLdsPerSimd
statistics::Vector instCyclesLdsPerSimd
Definition: compute_unit.hh:981
gem5::ComputeUnit::ComputeUnitStats::argReads
statistics::Scalar argReads
Definition: compute_unit.hh:986
gem5::ComputeUnit::ComputeUnitStats::globalMemInsts
statistics::Formula globalMemInsts
Definition: compute_unit.hh:985
gem5::ComputeUnit::ComputeUnitStats::wgBlockedDueLdsAllocation
statistics::Scalar wgBlockedDueLdsAllocation
Definition: compute_unit.hh:1029
gem5::ComputeUnit::LDSPort::recvTimingResp
virtual bool recvTimingResp(PacketPtr pkt)
get the result of packets sent to the LDS when they return
Definition: compute_unit.cc:2015
panic_if
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Definition: logging.hh:204
gem5::ComputeUnit::numVectorGlobalMemUnits
int numVectorGlobalMemUnits
Definition: compute_unit.hh:219
gem5::Wavefront::barrierId
void barrierId(int bar_id)
Definition: wavefront.cc:1414
gem5::ComputeUnit::Params
ComputeUnitParams Params
Definition: compute_unit.hh:290
gem5::Wavefront::S_RETURNING
@ S_RETURNING
Definition: wavefront.hh:68
gem5::ComputeUnit::ComputeUnitStats::ipc
statistics::Formula ipc
Definition: compute_unit.hh:1065
gem5::RegisterManager::allocateRegisters
void allocateRegisters(Wavefront *w, int vectorDemand, int scalarDemand)
Definition: register_manager.cc:122
gem5::ComputeUnit::updatePageDivergenceDist
void updatePageDivergenceDist(Addr addr)
Definition: compute_unit.cc:1911
gem5::X86ISA::GpuTLB::TranslationState::tlbEntry
TlbEntry * tlbEntry
Definition: tlb.hh:295
gem5::ComputeUnit::vectorRegsReserved
std::vector< int > vectorRegsReserved
Definition: compute_unit.hh:367
gem5::ComputeUnit::ComputeUnitStats::readonlyWrites
statistics::Scalar readonlyWrites
Definition: compute_unit.hh:999
gem5::Shader::dispatcher
GPUDispatcher & dispatcher()
Definition: shader.cc:98
gem5::ComputeUnit::ComputeUnitStats::waveLevelParallelism
statistics::Distribution waveLevelParallelism
Definition: compute_unit.hh:1005
gem5::ComputeUnit::ComputeUnitStats::scalarMemWrites
statistics::Scalar scalarMemWrites
Definition: compute_unit.hh:965
gem5::ComputeUnit::ComputeUnitStats::controlFlowDivergenceDist
statistics::Distribution controlFlowDivergenceDist
Definition: compute_unit.hh:1066
gem5::ScheduleStage::exec
void exec()
Definition: schedule_stage.cc:90
gem5::ComputeUnit::ComputeUnitStats::vectorMemWrites
statistics::Scalar vectorMemWrites
Definition: compute_unit.hh:961
gem5::ComputeUnit::insertInPipeMap
void insertInPipeMap(Wavefront *w)
Definition: compute_unit.cc:498
gem5::ComputeUnit::ScalarDTLBPort::SenderState
Definition: compute_unit.hh:699
gem5::statistics::oneline
const FlagsType oneline
Print all values on a single line.
Definition: info.hh:72
gem5::GPUDispatcher::updateInvCounter
void updateInvCounter(int kern_id, int val=-1)
update the counter of outstanding inv requests for the kernel kern_id: kernel id val: +1/-1,...
Definition: dispatcher.cc:246
gem5::ComputeUnit::mapWaveToLocalMem
int mapWaveToLocalMem(Wavefront *w) const
Definition: compute_unit.cc:279
gem5::ComputeUnit::ldsPort
LDSPort ldsPort
The port to access the Local Data Store Can be connected to a LDS object.
Definition: compute_unit.hh:834
gem5::MemCmd::ReadResp
@ ReadResp
Definition: packet.hh:87
gem5::ComputeUnit::ComputeUnitStats::flatLDSInstsPerWF
statistics::Formula flatLDSInstsPerWF
Definition: compute_unit.hh:960
gem5::WFBarrier
WF barrier slots.
Definition: compute_unit.hh:90
gem5::ComputeUnit::isDone
bool isDone() const
Definition: compute_unit.cc:1939
gem5::ComputeUnit::LDSPort::SenderState::getMemInst
GPUDynInstPtr getMemInst() const
Definition: compute_unit.hh:796
gem5::ComputeUnit::ComputeUnitStats::hitsPerTLBLevel
statistics::Vector hitsPerTLBLevel
Definition: compute_unit.hh:1014
gem5::Shader::gpuCmdProc
GPUCommandProcessor & gpuCmdProc
Definition: shader.hh:224
gem5::ComputeUnit::maxBarrierCnt
int maxBarrierCnt(int bar_id)
Definition: compute_unit.cc:675
gem5::Shader::n_wf
int n_wf
Definition: shader.hh:203
gem5::ComputeUnit::scalarRegsReserved
std::vector< int > scalarRegsReserved
Definition: compute_unit.hh:369
gem5::ComputeUnit::fillKernelState
void fillKernelState(Wavefront *w, HSAQueueEntry *task)
Definition: compute_unit.cc:295
gem5::MemCmd::WriteReq
@ WriteReq
Definition: packet.hh:89
gem5::ComputeUnit::lds
LdsState & lds
Definition: compute_unit.hh:468
gem5::ComputeUnit::DTLBPort::SenderState
SenderState is information carried along with the packet throughout the TLB hierarchy.
Definition: compute_unit.hh:665
gem5::ComputeUnit::vrfToLocalMemPipeBus
WaitClass vrfToLocalMemPipeBus
Definition: compute_unit.hh:231
gem5::statistics::Group
Statistics container.
Definition: group.hh:93
gem5::Request::INV_L1
@ INV_L1
Definition: request.hh:307
gem5::ComputeUnit::ComputeUnitStats::execRateDist
statistics::Distribution execRateDist
Definition: compute_unit.hh:1036
gem5::ComputeUnit::tlbPort
std::vector< DTLBPort > tlbPort
Definition: compute_unit.hh:847
gem5::ComputeUnit::ComputeUnitStats::numVecOpsExecutedF32
statistics::Scalar numVecOpsExecutedF32
Definition: compute_unit.hh:1042
gem5::ComputeUnit::isVectorAluIdle
bool isVectorAluIdle(uint32_t simdId) const
Definition: compute_unit.cc:1976
gem5::ComputeUnit::numScalarRegsPerSimd
int numScalarRegsPerSimd
Definition: compute_unit.hh:373
gem5::ComputeUnit::vectorALUs
std::vector< WaitClass > vectorALUs
Definition: compute_unit.hh:245
gem5::ComputeUnit::sendScalarRequest
void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt)
Definition: compute_unit.cc:1202
gem5::ComputeUnit::countPages
bool countPages
Definition: compute_unit.hh:351
gem5::ComputeUnit::freeBarrierIds
std::unordered_set< int > freeBarrierIds
A set used to easily retrieve a free barrier ID.
Definition: compute_unit.hh:932
sc_core::SC_NONE
@ SC_NONE
Definition: sc_report.hh:50
gem5::Wavefront::instructionBuffer
std::deque< GPUDynInstPtr > instructionBuffer
Definition: wavefront.hh:109
gem5::ComputeUnit::ComputeUnitStats::ComputeUnitStats
ComputeUnitStats(statistics::Group *parent, int n_wf)
Definition: compute_unit.cc:2110
gem5::RegisterManager::canAllocateSgprs
bool canAllocateSgprs(int simdId, int nWfs, int demandPerWf)
Definition: register_manager.cc:115
gem5::ComputeUnit::scalarMemToSrfBus
WaitClass scalarMemToSrfBus
Definition: compute_unit.hh:237
gem5::MipsISA::k
Bitfield< 23 > k
Definition: dt_constants.hh:81
gem5::ComputeUnit::scalarDTLBPort
ScalarDTLBPort scalarDTLBPort
Definition: compute_unit.hh:851
gem5::ComputeUnit::ComputeUnitStats::pageDivergenceDist
statistics::Distribution pageDivergenceDist
Definition: compute_unit.hh:1021
gem5::Shader::max_valu_insts
int64_t max_valu_insts
Definition: shader.hh:227
gem5::ExecStage::exec
void exec()
Definition: exec_stage.cc:152
gem5::GPUDispatcher
Definition: dispatcher.hh:62
dispatcher.hh
DPRINTFN
#define DPRINTFN(...)
Definition: trace.hh:214
gem5::statistics::DataWrap::flags
Derived & flags(Flags _flags)
Set the flags and marks this stat to print at the end of simulation.
Definition: statistics.hh:358
gem5::MipsISA::vaddr
vaddr
Definition: pra_constants.hh:278
gem5::ComputeUnit::ComputeUnitStats::argWrites
statistics::Scalar argWrites
Definition: compute_unit.hh:987
gem5::ComputeUnit::ComputeUnitStats::vpc_f32
statistics::Formula vpc_f32
Definition: compute_unit.hh:1063
gem5::HSAQueueEntry::ldsSize
int ldsSize() const
Definition: hsa_queue_entry.hh:189
gem5::Packet::getAddr
Addr getAddr() const
Definition: packet.hh:781
gem5::EventBase::CPU_Tick_Pri
static const Priority CPU_Tick_Pri
CPU ticks must come after other associated CPU events (such as writebacks).
Definition: eventq.hh:204
gem5::RegisterManager::canAllocateVgprs
bool canAllocateVgprs(int simdId, int nWfs, int demandPerWf)
Definition: register_manager.cc:109
gem5::ComputeUnit::sendToLds
bool sendToLds(GPUDynInstPtr gpuDynInst)
send a general request to the LDS; make sure to look at the return value here as your request might be...
Definition: compute_unit.cc:1995
gem5::X86ISA::PageBytes
const Addr PageBytes
Definition: page_size.hh:49
gem5::registerExitCallback
void registerExitCallback(const std::function< void()> &callback)
Register an exit callback.
Definition: core.cc:146
gem5::Wavefront::getStatus
status_e getStatus()
Definition: wavefront.hh:137
gem5::ComputeUnit::ComputeUnitStats::dynamicLMemInstrCnt
statistics::Scalar dynamicLMemInstrCnt
Definition: compute_unit.hh:1026
gem5::ComputeUnit::scalarMemoryPipe
ScalarMemPipeline scalarMemoryPipe
Definition: compute_unit.hh:286
fatal_if
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
Definition: logging.hh:226
page_table.hh
gem5
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
Definition: tlb.cc:60
gem5::ComputeUnit::DataPort::recvTimingResp
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
Definition: compute_unit.cc:801
gem5::ComputeUnit::vectorGlobalMemUnit
WaitClass vectorGlobalMemUnit
Definition: compute_unit.hh:225
gem5::MemCmd::MemSyncResp
@ MemSyncResp
Definition: packet.hh:120
gem5::ScoreboardCheckStage::exec
void exec()
Definition: scoreboard_check_stage.cc:248
gem5::ComputeUnit::ComputeUnitStats::readonlyMemInsts
statistics::Formula readonlyMemInsts
Definition: compute_unit.hh:1000
gem5::Request::FLUSH_L2
@ FLUSH_L2
Definition: request.hh:312
gem5::ComputeUnit::ScalarDataPort::recvReqRetry
void recvReqRetry() override
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
Definition: compute_unit.cc:940
gem5::ComputeUnit::ComputeUnitStats::scalarMemReads
statistics::Scalar scalarMemReads
Definition: compute_unit.hh:967
gem5::ComputeUnit::ComputeUnitStats::totalCycles
statistics::Scalar totalCycles
Definition: compute_unit.hh:1060
gem5::statistics::VectorBase::init
Derived & init(size_type size)
Set this vector to have the given size.
Definition: statistics.hh:1040
gem5::ComputeUnit::dispWorkgroup
void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg)
Definition: compute_unit.cc:420
gem5::HSAQueueEntry::dispatchId
int dispatchId() const
Definition: hsa_queue_entry.hh:153
gem5::ArmISA::stride
Bitfield< 21, 20 > stride
Definition: misc_types.hh:447
gem5::ComputeUnit::ComputeUnitStats::tlbCycles
statistics::Scalar tlbCycles
Definition: compute_unit.hh:1010
gem5::ComputeUnit::mapWaveToScalarAluGlobalIdx
int mapWaveToScalarAluGlobalIdx(Wavefront *w) const
Definition: compute_unit.cc:264
gem5::ComputeUnit::gmTokenPort
GMTokenPort gmTokenPort
Definition: compute_unit.hh:505
gem5::LdsState::reserveSpace
LdsChunk * reserveSpace(const uint32_t dispatchId, const uint32_t wgId, const uint32_t size)
assign a parent and request this amount of space be set aside for this wgid
Definition: lds_state.hh:363
gem5::Shader::total_valu_insts
int64_t total_valu_insts
Definition: shader.hh:228
gem5::ComputeUnit::DataPort::recvReqRetry
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
Definition: compute_unit.cc:952
gem5::ComputeUnit::doInvalidate
void doInvalidate(RequestPtr req, int kernId)
trigger invalidate operation in the cu
Definition: compute_unit.cc:383
gem5::ComputeUnit::ComputeUnitStats::spillReads
statistics::Scalar spillReads
Definition: compute_unit.hh:989
gem5::WaitClass::rdy
bool rdy(Cycles cycles=Cycles(0)) const
Definition: misc.hh:93
gem5::ComputeUnit::allAtBarrier
bool allAtBarrier(int bar_id)
Definition: compute_unit.cc:654
gem5::ComputeUnit::numWfsToSched
std::vector< int > numWfsToSched
Number of WFs to schedule to each SIMD.
Definition: compute_unit.hh:364
gem5::ComputeUnit::ComputeUnitStats::ldsNoFlatInstsPerWF
statistics::Formula ldsNoFlatInstsPerWF
Definition: compute_unit.hh:956
gem5::GlobalMemPipeline::exec
void exec()
Definition: global_memory_pipeline.cc:107
gem5::KernelLaunchStaticInst
Definition: gpu_static_inst.hh:325
gem5::ComputeUnit::DataPort::SenderState
Definition: compute_unit.hh:516
gem5::Packet::getSize
unsigned getSize() const
Definition: packet.hh:791
gem5::Event::scheduled
bool scheduled() const
Determine if the current event is scheduled.
Definition: eventq.hh:465
gem5::ComputeUnit::ScalarDataPort::SenderState
Definition: compute_unit.hh:567
gem5::GlobalMemPipeline::handleResponse
void handleResponse(GPUDynInstPtr gpuDynInst)
This method handles responses sent to this GM pipeline by the CU.
Definition: global_memory_pipeline.cc:301
gem5::ComputeUnit::scheduleStage
ScheduleStage scheduleStage
Definition: compute_unit.hh:282
gem5::Wavefront::dropFetch
bool dropFetch
Definition: wavefront.hh:112
gem5::Request::KERNEL
@ KERNEL
The request should be marked with KERNEL.
Definition: request.hh:183
gem5::Shader::impl_kern_end_rel
int impl_kern_end_rel
Definition: shader.hh:195
panic
#define panic(...)
This implements a cprintf based panic() function.
Definition: logging.hh:178
gem5::GPUCommandProcessor::driver
GPUComputeDriver * driver()
Definition: gpu_command_processor.cc:231
gem5::ComputeUnit::DataPort::createMemRespEvent
EventFunctionWrapper * createMemRespEvent(PacketPtr pkt)
Definition: compute_unit.cc:1563
gem5::Clocked::clockPeriod
Tick clockPeriod() const
Definition: clocked_object.hh:217
gem5::ComputeUnit::requestorId
RequestorID requestorId()
Definition: compute_unit.hh:460
gem5::X86ISA::addr
Bitfield< 3 > addr
Definition: types.hh:84
gem5::ComputeUnit::ComputeUnitStats::scalarMemInstsPerKiloInst
statistics::Formula scalarMemInstsPerKiloInst
Definition: compute_unit.hh:975
gem5::ComputeUnit::ComputeUnitStats::vectorMemWritesPerKiloInst
statistics::Formula vectorMemWritesPerKiloInst
Definition: compute_unit.hh:971
gem5::SenderState
RubyTester::SenderState SenderState
Definition: Check.cc:40
gem5::ComputeUnit::numExeUnits
int numExeUnits() const
Definition: compute_unit.cc:231
gem5::Packet::getPtr
T * getPtr()
get a pointer to the data ptr.
Definition: packet.hh:1184
gem5::ComputeUnit::glbMemToVrfBus
WaitClass glbMemToVrfBus
Definition: compute_unit.hh:221
gem5::Wavefront::simdId
const int simdId
Definition: wavefront.hh:99
gem5::MipsISA::vpc
Bitfield< 1 > vpc
Definition: mt_constants.hh:44

Generated on Tue Feb 8 2022 11:47:08 for gem5 by doxygen 1.8.17