gem5  v21.1.0.2
compute_unit.cc
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * For use for simulation and test purposes only
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright notice,
11  * this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  * this list of conditions and the following disclaimer in the documentation
15  * and/or other materials provided with the distribution.
16  *
17  * 3. Neither the name of the copyright holder nor the names of its
18  * contributors may be used to endorse or promote products derived from this
19  * software without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
35 
36 #include <limits>
37 
38 #include "arch/x86/page_size.hh"
39 #include "base/output.hh"
40 #include "debug/GPUDisp.hh"
41 #include "debug/GPUExec.hh"
42 #include "debug/GPUFetch.hh"
43 #include "debug/GPUMem.hh"
44 #include "debug/GPUPort.hh"
45 #include "debug/GPUPrefetch.hh"
46 #include "debug/GPUReg.hh"
47 #include "debug/GPURename.hh"
48 #include "debug/GPUSync.hh"
49 #include "debug/GPUTLB.hh"
55 #include "gpu-compute/shader.hh"
58 #include "gpu-compute/wavefront.hh"
59 #include "mem/page_table.hh"
60 #include "sim/process.hh"
61 #include "sim/sim_exit.hh"
62 
63 namespace gem5
64 {
65 
67  numVectorGlobalMemUnits(p.num_global_mem_pipes),
68  numVectorSharedMemUnits(p.num_shared_mem_pipes),
69  numScalarMemUnits(p.num_scalar_mem_pipes),
70  numVectorALUs(p.num_SIMDs),
71  numScalarALUs(p.num_scalar_cores),
72  vrfToCoalescerBusWidth(p.vrf_to_coalescer_bus_width),
73  coalescerToVrfBusWidth(p.coalescer_to_vrf_bus_width),
74  registerManager(p.register_manager),
75  fetchStage(p, *this),
76  scoreboardCheckStage(p, *this, scoreboardCheckToSchedule),
77  scheduleStage(p, *this, scoreboardCheckToSchedule, scheduleToExecute),
78  execStage(p, *this, scheduleToExecute),
79  globalMemoryPipe(p, *this),
80  localMemoryPipe(p, *this),
81  scalarMemoryPipe(p, *this),
82  tickEvent([this]{ exec(); }, "Compute unit tick event",
83  false, Event::CPU_Tick_Pri),
84  cu_id(p.cu_id),
85  vrf(p.vector_register_file), srf(p.scalar_register_file),
86  simdWidth(p.simd_width),
87  spBypassPipeLength(p.spbypass_pipe_length),
88  dpBypassPipeLength(p.dpbypass_pipe_length),
89  scalarPipeStages(p.scalar_pipe_length),
90  operandNetworkLength(p.operand_network_length),
91  issuePeriod(p.issue_period),
92  vrf_gm_bus_latency(p.vrf_gm_bus_latency),
93  srf_scm_bus_latency(p.srf_scm_bus_latency),
94  vrf_lm_bus_latency(p.vrf_lm_bus_latency),
95  perLaneTLB(p.perLaneTLB), prefetchDepth(p.prefetch_depth),
96  prefetchStride(p.prefetch_stride), prefetchType(p.prefetch_prev_type),
97  debugSegFault(p.debugSegFault),
98  functionalTLB(p.functionalTLB), localMemBarrier(p.localMemBarrier),
99  countPages(p.countPages),
100  req_tick_latency(p.mem_req_latency * p.clk_domain->clockPeriod()),
101  resp_tick_latency(p.mem_resp_latency * p.clk_domain->clockPeriod()),
102  _requestorId(p.system->getRequestorId(this, "ComputeUnit")),
103  lds(*p.localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
104  ldsPort(csprintf("%s-port", name()), this),
105  scalarDataPort(csprintf("%s-port", name()), this),
106  scalarDTLBPort(csprintf("%s-port", name()), this),
107  sqcPort(csprintf("%s-port", name()), this),
108  sqcTLBPort(csprintf("%s-port", name()), this),
109  _cacheLineSize(p.system->cacheLineSize()),
110  _numBarrierSlots(p.num_barrier_slots),
111  globalSeqNum(0), wavefrontSize(p.wf_size),
112  scoreboardCheckToSchedule(p),
113  scheduleToExecute(p),
114  stats(this, p.n_wf)
115 {
125  fatal_if(p.wf_size > std::numeric_limits<unsigned long long>::digits ||
126  p.wf_size <= 0,
127  "WF size is larger than the host can support");
128  fatal_if(!isPowerOf2(wavefrontSize),
129  "Wavefront size should be a power of 2");
130  // calculate how many cycles a vector load or store will need to transfer
131  // its data over the corresponding buses
132  numCyclesPerStoreTransfer =
133  (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
134  (double)vrfToCoalescerBusWidth);
135 
136  numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
137  / coalescerToVrfBusWidth;
138 
139  // Initialization: all WF slots are assumed STOPPED
140  idleWfs = p.n_wf * numVectorALUs;
141  lastVaddrWF.resize(numVectorALUs);
142  wfList.resize(numVectorALUs);
143 
144  wfBarrierSlots.resize(p.num_barrier_slots, WFBarrier());
145 
146  for (int i = 0; i < p.num_barrier_slots; ++i) {
147  freeBarrierIds.insert(i);
148  }
149 
150  for (int j = 0; j < numVectorALUs; ++j) {
151  lastVaddrWF[j].resize(p.n_wf);
152 
153  for (int i = 0; i < p.n_wf; ++i) {
154  lastVaddrWF[j][i].resize(wfSize());
155 
156  wfList[j].push_back(p.wavefronts[j * p.n_wf + i]);
157  wfList[j][i]->setParent(this);
158 
159  for (int k = 0; k < wfSize(); ++k) {
160  lastVaddrWF[j][i][k] = 0;
161  }
162  }
163  }
164 
165  lastVaddrSimd.resize(numVectorALUs);
166 
167  for (int i = 0; i < numVectorALUs; ++i) {
168  lastVaddrSimd[i].resize(wfSize(), 0);
169  }
170 
171  lastVaddrCU.resize(wfSize());
172 
173  lds.setParent(this);
174 
175  if (p.execPolicy == "OLDEST-FIRST") {
176  exec_policy = EXEC_POLICY::OLDEST;
177  } else if (p.execPolicy == "ROUND-ROBIN") {
178  exec_policy = EXEC_POLICY::RR;
179  } else {
180  fatal("Invalid WF execution policy (CU)\n");
181  }
182 
183  for (int i = 0; i < p.port_memory_port_connection_count; ++i) {
184  memPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
185  }
186 
187  for (int i = 0; i < p.port_translation_port_connection_count; ++i) {
188  tlbPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
189  }
190 
191  // Setup tokens for response ports. The number of tokens in memPortTokens
192  // is the total token count for the entire vector port (i.e., this CU).
193  memPortTokens = new TokenManager(p.max_cu_tokens);
194 
195  registerExitCallback([this]() { exitCallback(); });
196 
197  lastExecCycle.resize(numVectorALUs, 0);
198 
199  for (int i = 0; i < vrf.size(); ++i) {
200  vrf[i]->setParent(this);
201  }
202  for (int i = 0; i < srf.size(); ++i) {
203  srf[i]->setParent(this);
204  }
205  numVecRegsPerSimd = vrf[0]->numRegs();
206  numScalarRegsPerSimd = srf[0]->numRegs();
207 
208  registerManager->setParent(this);
209 
210  activeWaves = 0;
211 
212  instExecPerSimd.resize(numVectorALUs, 0);
213 
214  // Calculate the number of bits to address a cache line
215  panic_if(!isPowerOf2(_cacheLineSize),
216  "Cache line size should be a power of two.");
217  cacheLineBits = floorLog2(_cacheLineSize);
218 }
219 
221 {
222  // Delete wavefront slots
223  for (int j = 0; j < numVectorALUs; ++j) {
224  for (int i = 0; i < shader->n_wf; ++i) {
225  delete wfList[j][i];
226  }
227  lastVaddrSimd[j].clear();
228  }
229  lastVaddrCU.clear();
230 }
231 
232 int
234 {
237 }
238 
239 // index into readyList of the first memory unit
240 int
242 {
 // The first memory-unit index equals the combined ALU count, i.e. the
 // vector and scalar ALU entries are assumed to occupy the indices below
 // it -- TODO confirm against where readyList is populated.
243  return numVectorALUs + numScalarALUs;
244 }
245 
246 // index into readyList of the last memory unit
247 int
249 {
 // The last memory unit is reported as the highest valid index overall:
 // the total execution-unit count minus one.
250  return numExeUnits() - 1;
251 }
252 
253 // index into scalarALUs vector of SALU used by the wavefront
254 int
256 {
257  if (numScalarALUs == 1) {
258  return 0;
259  } else {
260  return w->simdId % numScalarALUs;
261  }
262 }
263 
264 // index into readyList of Scalar ALU unit used by wavefront
265 int
267 {
269 }
270 
271 // index into readyList of Global Memory unit used by wavefront
272 int
274 {
 // No per-wavefront state is consulted: the result is always the index
 // immediately after the vector and scalar ALU entries.
275  // TODO: FIXME if more than 1 GM pipe supported
276  return numVectorALUs + numScalarALUs;
277 }
278 
279 // index into readyList of Local Memory unit used by wavefront
280 int
282 {
283  // TODO: FIXME if more than 1 LM pipe supported
285 }
286 
287 // index into readyList of Scalar Memory unit used by wavefront
288 int
290 {
291  // TODO: FIXME if more than 1 ScM pipe supported
294 }
295 
296 void
298 {
299  w->resizeRegFiles(task->numVectorRegs(), task->numScalarRegs());
300  w->workGroupSz[0] = task->wgSize(0);
301  w->workGroupSz[1] = task->wgSize(1);
302  w->workGroupSz[2] = task->wgSize(2);
303  w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
304  w->gridSz[0] = task->gridSize(0);
305  w->gridSz[1] = task->gridSize(1);
306  w->gridSz[2] = task->gridSize(2);
307  w->computeActualWgSz(task);
308 }
309 
310 void
312  HSAQueueEntry *task, int bar_id, bool fetchContext)
313 {
314  static int _n_wave = 0;
315 
316  VectorMask init_mask;
317  init_mask.reset();
318 
319  for (int k = 0; k < wfSize(); ++k) {
320  if (k + waveId * wfSize() < w->actualWgSzTotal)
321  init_mask[k] = 1;
322  }
323 
324  w->execMask() = init_mask;
325 
326  w->kernId = task->dispatchId();
327  w->wfId = waveId;
328  w->initMask = init_mask.to_ullong();
329 
330  if (bar_id > WFBarrier::InvalidID) {
331  w->barrierId(bar_id);
332  } else {
333  assert(!w->hasBarrier());
334  }
335 
336  for (int k = 0; k < wfSize(); ++k) {
337  w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
338  w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
339  w->actualWgSz[1];
340  w->workItemId[2][k] = (k + waveId * wfSize()) /
341  (w->actualWgSz[0] * w->actualWgSz[1]);
342 
343  w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
344  w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
345  w->workItemId[0][k];
346  }
347 
348  // WG state
349  w->wgId = task->globalWgId();
350  w->dispatchId = task->dispatchId();
351  w->workGroupId[0] = w->wgId % task->numWg(0);
352  w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1);
353  w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1));
354 
355  // set the wavefront context to have a pointer to this section of the LDS
356  w->ldsChunk = ldsChunk;
357 
358  GEM5_VAR_USED int32_t refCount =
359  lds.increaseRefCounter(w->dispatchId, w->wgId);
360  DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
361  cu_id, w->wgId, refCount);
362 
363  w->instructionBuffer.clear();
364 
365  if (w->pendingFetch)
366  w->dropFetch = true;
367 
368  DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
369  "WF[%d][%d]. Ref cnt:%d\n", _n_wave, w->barrierId(), cu_id,
370  w->simdId, w->wfSlotId, refCount);
371 
372  w->initRegState(task, w->actualWgSzTotal);
373  w->start(_n_wave++, task->codeAddr());
374 
376  activeWaves++;
377 }
378 
384 void
386  GPUDynInstPtr gpuDynInst
387  = std::make_shared<GPUDynInst>(this, nullptr,
389 
390  // kern_id will be used in inv responses
391  gpuDynInst->kern_id = kernId;
392  // update contextId field
393  req->setContext(gpuDynInst->wfDynId);
394 
395  injectGlobalMemFence(gpuDynInst, true, req);
396 }
397 
403 void
405  injectGlobalMemFence(gpuDynInst, true);
406 }
407 
408 // resetting SIMD register pools
409 // I couldn't think of any other place and
410 // I think it is needed in my implementation
411 void
413 {
414  for (int i=0; i<numVectorALUs; i++)
415  {
418  }
419 }
420 
/**
 * Dispatch one workgroup of the given task onto this compute unit.
 *
 * Reserves LDS space for the workgroup, claims a barrier slot when the
 * workgroup has more than one wavefront, and then walks the wavefront
 * slots, starting a wavefront in every stopped slot whose SIMD still has
 * a non-zero entry in numWfsToSched.
 *
 * @param task queue entry describing the workgroup to dispatch
 * @param num_wfs_in_wg number of wavefronts that make up the workgroup
 */
421 void
422 ComputeUnit::dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg)
423 {
424  // If we aren't ticking, start it up!
425  if (!tickEvent.scheduled()) {
426  DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id);
428  }
429 
430  // the kernel's invalidate must have finished before any wg dispatch
431  assert(task->isInvDone());
432 
433  // reserve the LDS capacity allocated to the work group
434  // disambiguated by the dispatch ID and workgroup ID, which should be
435  // globally unique
436  LdsChunk *ldsChunk = lds.reserveSpace(task->dispatchId(),
437  task->globalWgId(),
438  task->ldsSize());
439 
 // A null chunk means the reservation failed; dispatch cannot continue.
440  panic_if(!ldsChunk, "was not able to reserve space for this WG");
441 
442  // calculate the number of 32-bit vector registers required
443  // by each work item
444  int vregDemand = task->numVectorRegs();
445  int sregDemand = task->numScalarRegs();
 // wavefronts are numbered within the workgroup in the order they start
446  int wave_id = 0;
447 
448  int barrier_id = WFBarrier::InvalidID;
449 
 // Only a workgroup with more than one wavefront claims a barrier slot;
 // otherwise barrier_id stays at InvalidID.
454  if (num_wfs_in_wg > 1) {
459  barrier_id = getFreeBarrierId();
460  auto &wf_barrier = barrierSlot(barrier_id);
 // a freshly obtained slot is expected to carry no leftover counts
461  assert(!wf_barrier.maxBarrierCnt());
462  assert(!wf_barrier.numAtBarrier());
463  wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);
464 
465  DPRINTF(GPUSync, "CU[%d] - Dispatching WG with barrier Id%d. "
466  "%d waves using this barrier.\n", cu_id, barrier_id,
467  num_wfs_in_wg);
468  }
469 
470  // Assign WFs according to numWfsToSched vector, which is computed by
471  // hasDispResources()
 // The outer loop is over slot index and the inner loop over SIMD, so
 // slot j is examined on every SIMD before slot j + 1 on any of them.
472  for (int j = 0; j < shader->n_wf; ++j) {
473  for (int i = 0; i < numVectorALUs; ++i) {
474  Wavefront *w = wfList[i][j];
475  // Check if this wavefront slot is available and there are WFs
476  // remaining to be dispatched to current SIMD:
477  // WF slot must be stopped and not waiting
478  // for a release to complete S_RETURNING
479  if (w->getStatus() == Wavefront::S_STOPPED &&
480  numWfsToSched[i] > 0) {
481  // decrement number of WFs awaiting dispatch to current SIMD
482  numWfsToSched[i] -= 1;
483 
484  fillKernelState(w, task);
485 
486  DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] "
487  "vregDemand[%d] sregDemand[%d]\n", i, j, w->wfDynId,
488  vregDemand, sregDemand);
489 
490  registerManager->allocateRegisters(w, vregDemand, sregDemand);
491 
492  startWavefront(w, wave_id, ldsChunk, task, barrier_id);
493  ++wave_id;
494  }
495  }
496  }
497 }
498 
499 void
501 {
502  panic_if(w->instructionBuffer.empty(),
503  "Instruction Buffer of WF%d can't be empty", w->wgId);
504  GPUDynInstPtr ii = w->instructionBuffer.front();
505  pipeMap.emplace(ii->seqNum());
506 }
507 
508 void
510 {
511  panic_if(w->instructionBuffer.empty(),
512  "Instruction Buffer of WF%d can't be empty", w->wgId);
513  GPUDynInstPtr ii = w->instructionBuffer.front();
514  // delete the dynamic instruction from the pipeline map
515  auto it = pipeMap.find(ii->seqNum());
516  panic_if(it == pipeMap.end(), "Pipeline Map is empty\n");
517  pipeMap.erase(it);
518 }
519 
520 bool
522 {
523  // compute true size of workgroup (after clamping to grid size)
524  int trueWgSize[HSAQueueEntry::MAX_DIM];
525  int trueWgSizeTotal = 1;
526 
527  for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
528  trueWgSize[d] = std::min(task->wgSize(d), task->gridSize(d) -
529  task->wgId(d) * task->wgSize(d));
530 
531  trueWgSizeTotal *= trueWgSize[d];
532  DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
533  }
534 
535  DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
536 
537  // calculate the number of WFs in this WG
538  int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
539  num_wfs_in_wg = numWfs;
540 
541  bool barrier_avail = true;
542 
543  if (numWfs > 1 && !freeBarrierIds.size()) {
544  barrier_avail = false;
545  }
546 
547  // calculate the number of 32-bit vector registers required by each
548  // work item of the work group
549  int vregDemandPerWI = task->numVectorRegs();
550  // calculate the number of 32-bit scalar registers required by each
551  // work item of the work group
552  int sregDemandPerWI = task->numScalarRegs();
553 
554  // check if the total number of VGPRs snd SGPRs required by all WFs
555  // of the WG fit in the VRFs of all SIMD units and the CU's SRF
556  panic_if((numWfs * vregDemandPerWI) > (numVectorALUs * numVecRegsPerSimd),
557  "WG with %d WFs and %d VGPRs per WI can not be allocated to CU "
558  "that has %d VGPRs\n",
559  numWfs, vregDemandPerWI, numVectorALUs * numVecRegsPerSimd);
560  panic_if((numWfs * sregDemandPerWI) > numScalarRegsPerSimd,
561  "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU "
562  "with %d SGPRs\n",
563  numWfs, sregDemandPerWI, numScalarRegsPerSimd);
564 
565  // number of WF slots that are not occupied
566  int freeWfSlots = 0;
567  // number of Wfs from WG that were successfully mapped to a SIMD
568  int numMappedWfs = 0;
569  numWfsToSched.clear();
570  numWfsToSched.resize(numVectorALUs, 0);
571 
572  // attempt to map WFs to the SIMDs, based on WF slot availability
573  // and register file availability
574  for (int j = 0; j < shader->n_wf; ++j) {
575  for (int i = 0; i < numVectorALUs; ++i) {
576  if (wfList[i][j]->getStatus() == Wavefront::S_STOPPED) {
577  ++freeWfSlots;
578  // check if current WF will fit onto current SIMD/VRF
579  // if all WFs have not yet been mapped to the SIMDs
580  if (numMappedWfs < numWfs &&
582  sregDemandPerWI) &&
584  vregDemandPerWI)) {
585  numWfsToSched[i]++;
586  numMappedWfs++;
587  }
588  }
589  }
590  }
591 
592  // check that the number of mapped WFs is not greater
593  // than the actual number of WFs
594  assert(numMappedWfs <= numWfs);
595 
596  bool vregAvail = true;
597  bool sregAvail = true;
598  // if a WF to SIMD mapping was not found, find the limiting resource
599  if (numMappedWfs < numWfs) {
600 
601  for (int j = 0; j < numVectorALUs; ++j) {
602  // find if there are enough free VGPRs in the SIMD's VRF
603  // to accomodate the WFs of the new WG that would be mapped
604  // to this SIMD unit
605  vregAvail &= registerManager->
606  canAllocateVgprs(j, numWfsToSched[j], vregDemandPerWI);
607  // find if there are enough free SGPRs in the SIMD's SRF
608  // to accomodate the WFs of the new WG that would be mapped
609  // to this SIMD unit
610  sregAvail &= registerManager->
611  canAllocateSgprs(j, numWfsToSched[j], sregDemandPerWI);
612  }
613  }
614 
615  DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, \
616  VGPR Availability = %d, SGPR Availability = %d\n",
617  freeWfSlots, numMappedWfs, vregAvail, sregAvail);
618 
619  if (!vregAvail) {
621  }
622 
623  if (!sregAvail) {
625  }
626 
627  // Return true if enough WF slots to submit workgroup and if there are
628  // enough VGPRs to schedule all WFs to their SIMD units
629  bool ldsAvail = lds.canReserve(task->ldsSize());
630  if (!ldsAvail) {
632  }
633 
634  if (!barrier_avail) {
636  }
637 
638  // Return true if the following are all true:
639  // (a) all WFs of the WG were mapped to free WF slots
640  // (b) there are enough VGPRs to schedule all WFs to their SIMD units
641  // (c) there are enough SGPRs on the CU to schedule all WFs
642  // (d) there is enough space in LDS to allocate for all WFs
643  bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
644  && ldsAvail && barrier_avail;
645  return can_dispatch;
646 }
647 
648 int
650 {
651  auto &wf_barrier = barrierSlot(bar_id);
652  return wf_barrier.numYetToReachBarrier();
653 }
654 
655 bool
657 {
658  auto &wf_barrier = barrierSlot(bar_id);
659  return wf_barrier.allAtBarrier();
660 }
661 
662 void
664 {
665  auto &wf_barrier = barrierSlot(bar_id);
666  wf_barrier.incNumAtBarrier();
667 }
668 
669 int
671 {
672  auto &wf_barrier = barrierSlot(bar_id);
673  return wf_barrier.numAtBarrier();
674 }
675 
676 int
678 {
679  auto &wf_barrier = barrierSlot(bar_id);
680  return wf_barrier.maxBarrierCnt();
681 }
682 
683 void
685 {
686  auto &wf_barrier = barrierSlot(bar_id);
687  wf_barrier.reset();
688 }
689 
690 void
692 {
693  auto &wf_barrier = barrierSlot(bar_id);
694  wf_barrier.decMaxBarrierCnt();
695 }
696 
697 void
699 {
700  auto &wf_barrier = barrierSlot(bar_id);
701  wf_barrier.release();
702  freeBarrierIds.insert(bar_id);
703 }
704 
705 void
707 {
708  for (int i = 0; i < numVectorALUs; ++i) {
709  for (int j = 0; j < shader->n_wf; ++j) {
710  Wavefront *wf = wfList[i][j];
711  if (wf->barrierId() == bar_id) {
712  assert(wf->getStatus() == Wavefront::S_BARRIER);
714  }
715  }
716  }
717 }
718 
719 // Execute one clock worth of work on the ComputeUnit.
720 void
722 {
723  // process reads and writes in the RFs
724  for (auto &vecRegFile : vrf) {
725  vecRegFile->exec();
726  }
727 
728  for (auto &scRegFile : srf) {
729  scRegFile->exec();
730  }
731 
732  // Execute pipeline stages in reverse order to simulate
733  // the pipeline latency
737  execStage.exec();
740  fetchStage.exec();
741 
742  stats.totalCycles++;
743 
744  // Put this CU to sleep if there is no more work to be done.
745  if (!isDone()) {
747  } else {
749  DPRINTF(GPUDisp, "CU%d: Going to sleep\n", cu_id);
750  }
751 }
752 
753 void
755 {
756  // Initialize CU Bus models and execution resources
757 
758  // Vector ALUs
759  vectorALUs.clear();
760  for (int i = 0; i < numVectorALUs; i++) {
761  vectorALUs.emplace_back(this, clockPeriod());
762  }
763 
764  // Scalar ALUs
765  scalarALUs.clear();
766  for (int i = 0; i < numScalarALUs; i++) {
767  scalarALUs.emplace_back(this, clockPeriod());
768  }
769 
770  // Vector Global Memory
772  "No support for multiple Global Memory Pipelines exists!!!");
776 
777  // Vector Local/Shared Memory
779  "No support for multiple Local Memory Pipelines exists!!!");
783 
784  // Scalar Memory
786  "No support for multiple Scalar Memory Pipelines exists!!!");
787  scalarMemUnit.init(this, clockPeriod());
790 
793 
794  fetchStage.init();
796  execStage.init();
798 
800 }
801 
802 bool
804 {
805  // Ruby has completed the memory op. Schedule the mem_resp_event at the
806  // appropriate cycle to process the timing memory response
807  // This delay represents the pipeline delay
808  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
809  PortID index = sender_state->port_index;
810  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
811  GPUDispatcher &dispatcher = computeUnit->shader->dispatcher();
812 
813  // MemSyncResp + WriteAckResp are handled completely here and we don't
814  // schedule a MemRespEvent to process the responses further
815  if (pkt->cmd == MemCmd::MemSyncResp) {
816  // This response is for 1 of the following request types:
817  // - kernel launch
818  // - kernel end
819  // - non-kernel mem sync
820 
821  // Kernel Launch
822  // wavefront was nullptr when launching kernel, so it is meaningless
823  // here (simdId=-1, wfSlotId=-1)
824  if (gpuDynInst->isKernelLaunch()) {
825  // for kernel launch, the original request must be both kernel-type
826  // and INV_L1
827  assert(pkt->req->isKernel());
828  assert(pkt->req->isInvL1());
829 
830  // one D-Cache inv is done, decrement counter
831  dispatcher.updateInvCounter(gpuDynInst->kern_id);
832 
833  delete pkt->senderState;
834  delete pkt;
835  return true;
836  }
837 
838  // retrieve wavefront from inst
839  Wavefront *w = gpuDynInst->wavefront();
840 
841  // Check if we are waiting on Kernel End Flush
842  if (w->getStatus() == Wavefront::S_RETURNING
843  && gpuDynInst->isEndOfKernel()) {
844  // for kernel end, the original request must be both kernel-type
845  // and last-level GPU cache should be flushed if it contains
846  // dirty data. This request may have been quiesced and
847  // immediately responded to if the GL2 is a write-through /
848  // read-only cache.
849  assert(pkt->req->isKernel());
850  assert(pkt->req->isGL2CacheFlush());
851 
852  // once flush done, decrement counter, and return whether all
853  // dirty writeback operations are done for the kernel
854  bool isWbDone = dispatcher.updateWbCounter(gpuDynInst->kern_id);
855 
856  // not all wbs are done for the kernel, just release pkt
857  // resources
858  if (!isWbDone) {
859  delete pkt->senderState;
860  delete pkt;
861  return true;
862  }
863 
864  // all wbs are completed for the kernel, do retirement work
865  // for the workgroup
866  DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
867  computeUnit->cu_id, w->simdId, w->wfSlotId,
868  w->wfDynId, w->wgId);
869 
870  dispatcher.notifyWgCompl(w);
871  w->setStatus(Wavefront::S_STOPPED);
872  }
873 
874  if (!pkt->req->isKernel()) {
875  w = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
876  DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing "
877  "outstanding reqs %d => %d\n", gpuDynInst->simdId,
878  gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
879  gpuDynInst->disassemble(), w->outstandingReqs,
880  w->outstandingReqs - 1);
882  }
883 
884  delete pkt->senderState;
885  delete pkt;
886  return true;
887  }
888 
889  EventFunctionWrapper *mem_resp_event =
890  computeUnit->memPort[index].createMemRespEvent(pkt);
891 
892  DPRINTF(GPUPort,
893  "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
894  computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
895  gpuDynInst->seqNum(), index, pkt->req->getPaddr());
896 
897  computeUnit->schedule(mem_resp_event,
899 
900  return true;
901 }
902 
903 bool
905 {
906  assert(!pkt->req->isKernel());
907 
908  // retrieve sender state
909  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
910  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
911 
912  assert(pkt->isRead() || pkt->isWrite());
913  assert(gpuDynInst->numScalarReqs > 0);
914 
915  gpuDynInst->numScalarReqs--;
916 
925  if (!gpuDynInst->numScalarReqs) {
926  if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
927  computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
928  gpuDynInst);
929  } else {
930  computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
931  gpuDynInst);
932  }
933  }
934 
935  delete pkt->senderState;
936  delete pkt;
937 
938  return true;
939 }
940 
941 void
943 {
944  for (const auto &pkt : retries) {
945  if (!sendTimingReq(pkt)) {
946  break;
947  } else {
948  retries.pop_front();
949  }
950  }
951 }
952 
953 void
955 {
956  int len = retries.size();
957 
958  assert(len > 0);
959 
960  for (int i = 0; i < len; ++i) {
961  PacketPtr pkt = retries.front().first;
962  GEM5_VAR_USED GPUDynInstPtr gpuDynInst = retries.front().second;
963  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
964  computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
965  pkt->req->getPaddr());
966 
970  if (!sendTimingReq(pkt)) {
971  DPRINTF(GPUMem, "failed again!\n");
972  break;
973  } else {
974  DPRINTF(GPUMem, "successful!\n");
975  retries.pop_front();
976  }
977  }
978 }
979 
980 bool
982 {
 // Hand the returned packet to the fetch stage for processing.
983  computeUnit->fetchStage.processFetchReturn(pkt);
 // The response is always reported as accepted.
984  return true;
985 }
986 
987 void
989 {
990  int len = retries.size();
991 
992  assert(len > 0);
993 
994  for (int i = 0; i < len; ++i) {
995  PacketPtr pkt = retries.front().first;
996  GEM5_VAR_USED Wavefront *wavefront = retries.front().second;
997  DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
998  computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
999  pkt->req->getPaddr());
1000  if (!sendTimingReq(pkt)) {
1001  DPRINTF(GPUFetch, "failed again!\n");
1002  break;
1003  } else {
1004  DPRINTF(GPUFetch, "successful!\n");
1005  retries.pop_front();
1006  }
1007  }
1008 }
1009 
1010 void
1012 {
1013  // There must be a way around this check to do the globalMemStart...
1014  Addr tmp_vaddr = pkt->req->getVaddr();
1015 
1016  updatePageDivergenceDist(tmp_vaddr);
1017 
1018  // set PC in request
1019  pkt->req->setPC(gpuDynInst->wavefront()->pc());
1020 
1021  pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());
1022 
1023  // figure out the type of the request to set read/write
1024  BaseMMU::Mode TLB_mode;
1025  assert(pkt->isRead() || pkt->isWrite());
1026 
1027  // only do some things if actually accessing data
1028  bool isDataAccess = pkt->isWrite() || pkt->isRead();
1029 
1030  // For dGPUs, real hardware will extract MTYPE from the PTE. Our model
1031  // uses x86 pagetables which don't have fields to track GPU MTYPEs.
1032  // Rather than hacking up the pagetable to add these bits in, we just
1033  // keep a structure local to our GPUs that are populated in our
1034  // emulated driver whenever memory is allocated. Consult that structure
1035  // here in case we need a memtype override.
1036  shader->gpuCmdProc.driver()->setMtype(pkt->req);
1037 
1038  // Check write before read for atomic operations
1039  // since atomic operations should use BaseMMU::Write
1040  if (pkt->isWrite()) {
1041  TLB_mode = BaseMMU::Write;
1042  } else if (pkt->isRead()) {
1043  TLB_mode = BaseMMU::Read;
1044  } else {
1045  fatal("pkt is not a read nor a write\n");
1046  }
1047 
1048  stats.tlbCycles -= curTick();
1049  ++stats.tlbRequests;
1050 
1051  PortID tlbPort_index = perLaneTLB ? index : 0;
1052 
1053  if (shader->timingSim) {
1054  if (debugSegFault) {
1056  Addr vaddr = pkt->req->getVaddr();
1057  unsigned size = pkt->getSize();
1058 
1059  if ((vaddr + size - 1) % 64 < vaddr % 64) {
1060  panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
1061  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
1062  }
1063 
1064  Addr paddr;
1065 
1066  if (!p->pTable->translate(vaddr, paddr)) {
1067  if (!p->fixupFault(vaddr)) {
1068  panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
1069  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1070  vaddr);
1071  }
1072  }
1073  }
1074 
1075  // This is the SenderState needed upon return
1076  pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);
1077 
1078  // This is the senderState needed by the TLB hierarchy to function
1079  X86ISA::GpuTLB::TranslationState *translation_state =
1080  new X86ISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc, false,
1081  pkt->senderState);
1082 
1083  pkt->senderState = translation_state;
1084 
1085  if (functionalTLB) {
1086  tlbPort[tlbPort_index].sendFunctional(pkt);
1087 
1088  // update the hitLevel distribution
1089  int hit_level = translation_state->hitLevel;
1090  assert(hit_level != -1);
1091  stats.hitsPerTLBLevel[hit_level]++;
1092 
1093  // New SenderState for the memory access
1094  X86ISA::GpuTLB::TranslationState *sender_state =
1095  safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
1096 
1097  delete sender_state->tlbEntry;
1098  delete sender_state->saved;
1099  delete sender_state;
1100 
1101  assert(pkt->req->hasPaddr());
1102  assert(pkt->req->hasSize());
1103 
1104  // this is necessary because the GPU TLB receives packets instead
1105  // of requests. when the translation is complete, all relevent
1106  // fields in the request will be populated, but not in the packet.
1107  // here we create the new packet so we can set the size, addr,
1108  // and proper flags.
1109  PacketPtr oldPkt = pkt;
1110  pkt = new Packet(oldPkt->req, oldPkt->cmd);
1111  if (isDataAccess) {
1112  uint8_t *tmpData = oldPkt->getPtr<uint8_t>();
1113  pkt->dataStatic(tmpData);
1114  }
1115  delete oldPkt;
1116 
1117 
1118  // New SenderState for the memory access
1119  pkt->senderState =
1120  new ComputeUnit::DataPort::SenderState(gpuDynInst, index,
1121  nullptr);
1122 
1123  gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
1124  gpuDynInst->tlbHitLevel[index] = hit_level;
1125 
1126  // translation is done. Schedule the mem_req_event at the
1127  // appropriate cycle to send the timing memory request to ruby
1128  EventFunctionWrapper *mem_req_event =
1129  memPort[index].createMemReqEvent(pkt);
1130 
1131  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
1132  "scheduled\n", cu_id, gpuDynInst->simdId,
1133  gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
1134 
1135  schedule(mem_req_event, curTick() + req_tick_latency);
1136  } else if (tlbPort[tlbPort_index].isStalled()) {
1137  assert(tlbPort[tlbPort_index].retries.size() > 0);
1138 
1139  DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
1140  "failed!\n", cu_id, gpuDynInst->simdId,
1141  gpuDynInst->wfSlotId, tmp_vaddr);
1142 
1143  tlbPort[tlbPort_index].retries.push_back(pkt);
1144  } else if (!tlbPort[tlbPort_index].sendTimingReq(pkt)) {
1145  // Stall the data port;
1146  // No more packet will be issued till
1147  // ruby indicates resources are freed by
1148  // a recvReqRetry() call back on this port.
1149  tlbPort[tlbPort_index].stallPort();
1150 
1151  DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
1152  "failed!\n", cu_id, gpuDynInst->simdId,
1153  gpuDynInst->wfSlotId, tmp_vaddr);
1154 
1155  tlbPort[tlbPort_index].retries.push_back(pkt);
1156  } else {
1157  DPRINTF(GPUTLB,
1158  "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
1159  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
1160  }
1161  } else {
1162  if (pkt->cmd == MemCmd::MemSyncReq) {
1163  gpuDynInst->resetEntireStatusVector();
1164  } else {
1165  gpuDynInst->decrementStatusVector(index);
1166  }
1167 
1168  // New SenderState for the memory access
1169  delete pkt->senderState;
1170 
1171  // Because it's atomic operation, only need TLB translation state
1172  pkt->senderState = new X86ISA::GpuTLB::TranslationState(TLB_mode,
1173  shader->gpuTc);
1174 
1175  tlbPort[tlbPort_index].sendFunctional(pkt);
1176 
1177  // the addr of the packet is not modified, so we need to create a new
1178  // packet, or otherwise the memory access will have the old virtual
1179  // address sent in the translation packet, instead of the physical
1180  // address returned by the translation.
1181  PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
1182  new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1183 
1184  // Translation is done. It is safe to send the packet to memory.
1185  memPort[0].sendFunctional(new_pkt);
1186 
1187  DPRINTF(GPUMem, "Functional sendRequest\n");
1188  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
1189  gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
1190  new_pkt->req->getPaddr());
1191 
1192  // safe_cast the senderState
1193  X86ISA::GpuTLB::TranslationState *sender_state =
1194  safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
1195 
1196  delete sender_state->tlbEntry;
1197  delete new_pkt;
1198  delete pkt->senderState;
1199  delete pkt;
1200  }
1201 }
1202 
// Sends a scalar read/write packet for address translation over
// scalarDTLBPort. The packet's senderState is wrapped in a GpuTLB
// TranslationState (constructed with the previous senderState as its last
// argument) before the timing request is attempted. If the port is already
// stalled, or the send is rejected, the packet is appended to the port's
// retry queue instead.
// NOTE(review): the embedded listing numbers skip 1204, 1211 and 1221, so
// the signature line, the right-hand side of the first senderState
// assignment and one statement of the rejected-send branch are not visible
// in this listing.
1203 void
1205 {
      // only reads and writes are expected here
1206  assert(pkt->isWrite() || pkt->isRead());
1207 
      // the translation mode follows the packet direction
1208  BaseMMU::Mode tlb_mode = pkt->isRead() ? BaseMMU::Read : BaseMMU::Write;
1209 
1210  pkt->senderState =
1212 
      // stack the translation state on top of the current sender state
1213  pkt->senderState =
1214  new X86ISA::GpuTLB::TranslationState(tlb_mode, shader->gpuTc, false,
1215  pkt->senderState);
1216 
1217  if (scalarDTLBPort.isStalled()) {
      // a stalled port must already have something queued for retry
1218  assert(scalarDTLBPort.retries.size());
1219  scalarDTLBPort.retries.push_back(pkt);
1220  } else if (!scalarDTLBPort.sendTimingReq(pkt)) {
      // NOTE(review): listing line 1221 is absent; presumably it stalls
      // the port before queueing the packet - confirm against the source
1222  scalarDTLBPort.retries.push_back(pkt);
1223  } else {
1224  DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n",
1225  tlb_mode == BaseMMU::Read ? "read" : "write",
1226  pkt->req->getVaddr());
1227  }
1228 }
1229 
// Builds a MemCmd::MemSyncReq packet for gpuDynInst and schedules its memory
// request event on memPort[0], req_tick_latency ticks from now.
//
// - req: request to reuse; when null a new Request is created here. Its
//   physical address is always forced to 0.
// - kernelMemSync: when true the request is flagged Request::KERNEL and
//   gets INV_L1 (kernel launch) or FLUSH_L2 (end of kernel) coherence
//   flags; when false the flags come from gpuDynInst->setRequestFlags().
//
// NOTE(review): the embedded listing numbers skip 1231, so the first line of
// the signature (function name and first parameter) is not visible here.
1230 void
1232  bool kernelMemSync,
1233  RequestPtr req)
1234 {
      // only instructions that target (or executed as) global memory may
      // inject a fence
1235  assert(gpuDynInst->isGlobalSeg() ||
1236  gpuDynInst->executedAs() == enums::SC_GLOBAL);
1237 
1238  if (!req) {
1239  req = std::make_shared<Request>(
1240  0, 0, 0, requestorId(), 0, gpuDynInst->wfDynId);
1241  }
1242 
1243  // all mem sync requests have Paddr == 0
1244  req->setPaddr(0);
1245 
1246  PacketPtr pkt = nullptr;
1247 
1248  if (kernelMemSync) {
1249  if (gpuDynInst->isKernelLaunch()) {
      // kernel launch: invalidate L1 (logged below as an acquire)
1250  req->setCacheCoherenceFlags(Request::INV_L1);
1251  req->setReqInstSeqNum(gpuDynInst->seqNum());
1252  req->setFlags(Request::KERNEL);
1253  pkt = new Packet(req, MemCmd::MemSyncReq);
1254  pkt->pushSenderState(
1255  new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1256 
1257  EventFunctionWrapper *mem_req_event =
1258  memPort[0].createMemReqEvent(pkt);
1259 
1260  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1261  "an acquire\n", cu_id, gpuDynInst->simdId,
1262  gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
1263 
1264  schedule(mem_req_event, curTick() + req_tick_latency);
1265  } else {
1266  // kernel end flush of GL2 cache may be quiesced by Ruby if the
1267  // GL2 is a read-only cache
1268  assert(shader->impl_kern_end_rel);
1269  assert(gpuDynInst->isEndOfKernel());
1270 
      // end of kernel: flush L2 (logged below as a release)
1271  req->setCacheCoherenceFlags(Request::FLUSH_L2);
1272  req->setReqInstSeqNum(gpuDynInst->seqNum());
1273  req->setFlags(Request::KERNEL);
1274  pkt = new Packet(req, MemCmd::MemSyncReq);
1275  pkt->pushSenderState(
1276  new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1277 
1278  EventFunctionWrapper *mem_req_event =
1279  memPort[0].createMemReqEvent(pkt);
1280 
1281  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1282  "a release\n", cu_id, gpuDynInst->simdId,
1283  gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
1284 
1285  schedule(mem_req_event, curTick() + req_tick_latency);
1286  }
1287  } else {
      // non-kernel sync: the instruction itself decides the request flags
1288  gpuDynInst->setRequestFlags(req);
1289 
1290  req->setReqInstSeqNum(gpuDynInst->seqNum());
1291 
1292  pkt = new Packet(req, MemCmd::MemSyncReq);
1293  pkt->pushSenderState(
1294  new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1295 
1296  EventFunctionWrapper *mem_req_event =
1297  memPort[0].createMemReqEvent(pkt);
1298 
1299  DPRINTF(GPUPort,
1300  "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
1301  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
1302  pkt->req->getPaddr());
1303 
1304  schedule(mem_req_event, curTick() + req_tick_latency);
1305  }
1306 }
1307 
// Processes a data memory response packet.
//
// WriteResp packets are deleted immediately (their bookkeeping is done by
// WriteCompleteResp, per the comment below). For any other response the
// lane index recorded for the packet's physical address is popped from
// gpuDynInst->memStatusVector and the instruction's status vector is
// decremented. Once allLanesZero() holds, the head-to-tail latency is
// sampled (if a head tick was recorded), the instruction is handed to
// globalMemoryPipe.handleResponse(), and in every case the sender state and
// packet are freed at the end.
//
// NOTE(review): the embedded listing numbers skip 1309, so the signature
// line is not visible in this listing.
1308 void
1310 {
1311  DataPort::SenderState *sender_state =
1312  safe_cast<DataPort::SenderState*>(pkt->senderState);
1313 
1314  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1315  ComputeUnit *compute_unit = computeUnit;
1316 
1317  assert(gpuDynInst);
1318 
1319  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
1320  compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1321  pkt->req->getPaddr(), id);
1322 
1323  Addr paddr = pkt->req->getPaddr();
1324 
1325  // mem sync resp callback must be handled already in
1326  // DataPort::recvTimingResp
1327  assert(pkt->cmd != MemCmd::MemSyncResp);
1328 
1329  // The status vector and global memory response for WriteResp packets get
1330  // handled by the WriteCompleteResp packets.
1331  if (pkt->cmd == MemCmd::WriteResp) {
      // NOTE(review): only the packet is deleted on this path, not
      // pkt->senderState - confirm whether that is intentional
1332  delete pkt;
1333  return;
1334  }
1335 
1336  // this is for read, write and atomic
1337  int index = gpuDynInst->memStatusVector[paddr].back();
1338 
1339  DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
1340  pkt->req->getPaddr(), id);
1341 
1342  gpuDynInst->memStatusVector[paddr].pop_back();
1343  gpuDynInst->pAddr = pkt->req->getPaddr();
1344 
1345  gpuDynInst->decrementStatusVector(index);
1346  DPRINTF(GPUMem, "bitvector is now %s\n", gpuDynInst->printStatusVector());
1347 
1348  if (gpuDynInst->allLanesZero()) {
      // sanity check: no per-address entry may still hold a lane index
1349  auto iter = gpuDynInst->memStatusVector.begin();
1350  auto end = gpuDynInst->memStatusVector.end();
1351 
1352  while (iter != end) {
1353  assert(iter->second.empty());
1354  ++iter;
1355  }
1356 
1357  // Calculate the difference between the arrival of the first cache
1358  // block and the last cache block to arrive if we have the time
1359  // for the first cache block.
1360  if (compute_unit->headTailMap.count(gpuDynInst)) {
1361  Tick headTick = compute_unit->headTailMap.at(gpuDynInst);
1362  compute_unit->stats.headTailLatency.sample(curTick() - headTick);
1363  compute_unit->headTailMap.erase(gpuDynInst);
1364  }
1365 
1366  gpuDynInst->memStatusVector.clear();
1367 
1368  gpuDynInst->
1369  profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue);
1370  compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
1371 
1372  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
1373  compute_unit->cu_id, gpuDynInst->simdId,
1374  gpuDynInst->wfSlotId);
1375  } else {
      // more responses outstanding: for reads, remember when the first
      // response for this instruction arrived
1376  if (pkt->isRead()) {
1377  if (!compute_unit->headTailMap.count(gpuDynInst)) {
1378  compute_unit->headTailMap
1379  .insert(std::make_pair(gpuDynInst, curTick()));
1380  }
1381  }
1382  }
1383 
1384  delete pkt->senderState;
1385  delete pkt;
1386 }
1387 
// Handles a timing response on the data TLB port.
//
// Steps, in order:
//  1. unwrap the GpuTLB TranslationState, record the TLB hit level and
//     restore the saved DTLBPort::SenderState on the packet;
//  2. record the lane (port index) under the translated address in
//     gpuDynInst->memStatusVector;
//  3. map the response command back to the matching request command;
//  4. if computeUnit->prefetchDepth is non-zero, issue functional TLB
//     prefetches for following pages;
//  5. build a new request packet that reuses the translated Request and the
//     original data pointer, and schedule its memory request event on
//     memPort[mp_index] after req_tick_latency ticks.
// Always returns true.
//
// NOTE(review): the embedded listing numbers skip 1389, 1474, 1494, 1500
// and 1518, so the signature line and parts of four statements in the
// prefetch section are not visible in this listing.
1388 bool
1390 {
1391  Addr line = pkt->req->getPaddr();
1392 
1393  DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
1394  pkt->req->getVaddr(), line);
1395 
1396  assert(pkt->senderState);
      // NOTE(review): adds the absolute current tick to the statistic;
      // presumably balanced by a subtraction when the request was issued -
      // confirm at the send site
1397  computeUnit->stats.tlbCycles += curTick();
1398 
1399  // pop off the TLB translation state
1400  X86ISA::GpuTLB::TranslationState *translation_state =
1401  safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
1402 
1403  // no PageFaults are permitted for data accesses
      // a missing tlbEntry is only logged here; execution continues below
1404  if (!translation_state->tlbEntry) {
1405  DTLBPort::SenderState *sender_state =
1406  safe_cast<DTLBPort::SenderState*>(translation_state->saved);
1407 
1408  GEM5_VAR_USED Wavefront *w =
1409  computeUnit->wfList[sender_state->_gpuDynInst->simdId]
1410  [sender_state->_gpuDynInst->wfSlotId];
1411 
1412  DPRINTFN("Wave %d couldn't tranlate vaddr %#x\n", w->wfDynId,
1413  pkt->req->getVaddr());
1414  }
1415 
1416  // update the hitLevel distribution
1417  int hit_level = translation_state->hitLevel;
1418  computeUnit->stats.hitsPerTLBLevel[hit_level]++;
1419 
1420  delete translation_state->tlbEntry;
1421  assert(!translation_state->ports.size());
1422  pkt->senderState = translation_state->saved;
1423 
1424  // for prefetch pkt
      // copied out before translation_state is deleted just below
1425  BaseMMU::Mode TLB_mode = translation_state->tlbMode;
1426 
1427  delete translation_state;
1428 
1429  // use the original sender state to know how to close this transaction
1430  DTLBPort::SenderState *sender_state =
1431  safe_cast<DTLBPort::SenderState*>(pkt->senderState);
1432 
1433  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1434  PortID mp_index = sender_state->portIndex;
1435  Addr vaddr = pkt->req->getVaddr();
      // remember which lane is waiting on this physical address
1436  gpuDynInst->memStatusVector[line].push_back(mp_index);
1437  gpuDynInst->tlbHitLevel[mp_index] = hit_level;
1438 
1439  MemCmd requestCmd;
1440 
1441  if (pkt->cmd == MemCmd::ReadResp) {
1442  requestCmd = MemCmd::ReadReq;
1443  } else if (pkt->cmd == MemCmd::WriteResp) {
1444  requestCmd = MemCmd::WriteReq;
1445  } else if (pkt->cmd == MemCmd::SwapResp) {
1446  requestCmd = MemCmd::SwapReq;
1447  } else {
1448  panic("unsupported response to request conversion %s\n",
1449  pkt->cmd.toString());
1450  }
1451 
1452  if (computeUnit->prefetchDepth) {
1453  int simdId = gpuDynInst->simdId;
1454  int wfSlotId = gpuDynInst->wfSlotId;
1455  Addr last = 0;
1456 
      // pick the previously seen vaddr at the configured granularity
1457  switch(computeUnit->prefetchType) {
1458  case enums::PF_CU:
1459  last = computeUnit->lastVaddrCU[mp_index];
1460  break;
1461  case enums::PF_PHASE:
1462  last = computeUnit->lastVaddrSimd[simdId][mp_index];
1463  break;
1464  case enums::PF_WF:
1465  last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
      // no break: falls through into default, which only breaks
1466  default:
1467  break;
1468  }
1469 
1470  DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
1471  computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
1472 
      // NOTE(review): listing line 1474 (second operand of the
      // subtraction) is absent; stride is 0 when no last vaddr exists
1473  int stride = last ? (roundDown(vaddr, X86ISA::PageBytes) -
1475  : 0;
1476 
1477  DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
1478 
      // all three history tables are updated regardless of prefetchType
1479  computeUnit->lastVaddrCU[mp_index] = vaddr;
1480  computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
1481  computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
1482 
      // PF_STRIDE overrides the computed stride with the configured one
1483  stride = (computeUnit->prefetchType == enums::PF_STRIDE) ?
1484  computeUnit->prefetchStride: stride;
1485 
1486  DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
1487  computeUnit->cu_id, simdId, wfSlotId, mp_index);
1488 
1489  DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);
1490 
1491  // Prefetch Next few pages atomically
1492  for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
      // NOTE(review): listing line 1494 (last DPRINTF argument) is absent
1493  DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
1495 
      // a zero stride ends the prefetch loop on its first iteration
1496  if (!stride)
1497  break;
1498 
      // NOTE(review): listing line 1500 (first constructor argument,
      // presumably the prefetch address) is absent
1499  RequestPtr prefetch_req = std::make_shared<Request>(
1501  sizeof(uint8_t), 0,
1502  computeUnit->requestorId(),
1503  0, 0, nullptr);
1504 
1505  PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
      // one-byte dummy buffer on the stack; the packet is deleted before
      // this iteration ends
1506  uint8_t foo = 0;
1507  prefetch_pkt->dataStatic(&foo);
1508 
1509  // Because it's atomic operation, only need TLB translation state
1510  prefetch_pkt->senderState =
1511  new X86ISA::GpuTLB::TranslationState(TLB_mode,
1512  computeUnit->shader->gpuTc, true);
1513 
1514  // Currently prefetches are zero-latency, hence the sendFunctional
1515  sendFunctional(prefetch_pkt);
1516 
      // NOTE(review): listing line 1518 (the declaration of tlb_state that
      // this cast initialises) is absent
1517  /* safe_cast the senderState */
1519  safe_cast<X86ISA::GpuTLB::TranslationState*>(
1520  prefetch_pkt->senderState);
1521 
1522 
1523  delete tlb_state->tlbEntry;
1524  delete tlb_state;
1525  delete prefetch_pkt;
1526  }
1527  }
1528 
1529  // First we must convert the response cmd back to a request cmd so that
1530  // the request can be sent through the cu's request port
1531  PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
1532  new_pkt->dataStatic(pkt->getPtr<uint8_t>());
      // gpuDynInst and mp_index were copied out of the sender state above,
      // so it is safe to free it together with the old packet
1533  delete pkt->senderState;
1534  delete pkt;
1535 
1536  // New SenderState for the memory access
1537  new_pkt->senderState =
1538  new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
1539  nullptr);
1540 
1541  // translation is done. Schedule the mem_req_event at the appropriate
1542  // cycle to send the timing memory request to ruby
1543  EventFunctionWrapper *mem_req_event =
1544  computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);
1545 
1546  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
1547  computeUnit->cu_id, gpuDynInst->simdId,
1548  gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());
1549 
1550  computeUnit->schedule(mem_req_event, curTick() +
1551  computeUnit->req_tick_latency);
1552 
1553  return true;
1554 }
1555 
// Returns a newly allocated EventFunctionWrapper whose callback invokes
// processMemReqEvent(pkt) on this object.
// NOTE(review): the embedded listing numbers skip 1556-1557, so the return
// type and signature lines are not visible in this listing. The trailing
// `true` argument presumably asks the event to delete itself after it has
// run - confirm against the EventFunctionWrapper declaration.
1558 {
1559  return new EventFunctionWrapper(
1560  [this, pkt]{ processMemReqEvent(pkt); },
1561  "ComputeUnit memory request event", true);
1562 }
1563 
// Returns a newly allocated EventFunctionWrapper whose callback invokes
// processMemRespEvent(pkt) on this object.
// NOTE(review): the embedded listing numbers skip 1564-1565, so the return
// type and signature lines are not visible in this listing. The trailing
// `true` argument presumably asks the event to delete itself after it has
// run - confirm against the EventFunctionWrapper declaration.
1566 {
1567  return new EventFunctionWrapper(
1568  [this, pkt]{ processMemRespEvent(pkt); },
1569  "ComputeUnit memory response event", true);
1570 }
1571 
// Tries to send a data memory request packet as a timing request. When the
// send is rejected, the packet is queued in `retries` together with its
// GPUDynInstPtr; either outcome is logged under the GPUPort debug flag.
// NOTE(review): the embedded listing numbers skip 1573, so the signature
// line is not visible in this listing.
1572 void
1574 {
1575  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1576  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
      // compute_unit is only read by the DPRINTFs below; GEM5_VAR_USED
      // presumably silences unused-variable warnings when tracing is
      // compiled out
1577  GEM5_VAR_USED ComputeUnit *compute_unit = computeUnit;
1578 
1579  if (!(sendTimingReq(pkt))) {
1580  retries.push_back(std::make_pair(pkt, gpuDynInst));
1581 
1582  DPRINTF(GPUPort,
1583  "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
1584  compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1585  id, pkt->req->getPaddr());
1586  } else {
1587  DPRINTF(GPUPort,
1588  "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data "
1589  "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1590  gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id,
1591  pkt->req->getPaddr());
1592  }
1593 }
1594 
// Returns the fixed human-readable name of the scalar memory request event.
// NOTE(review): the embedded listing numbers skip 1596, so the signature
// line is not visible in this listing.
1595 const char*
1597 {
1598  return "ComputeUnit scalar memory request event";
1599 }
1600 
// Tries to send the event's packet through scalarDataPort as a timing
// request. When the send is rejected, the packet is appended to the port's
// retry queue; either outcome is logged under the GPUPort debug flag.
// NOTE(review): the embedded listing numbers skip 1602, so the signature
// line is not visible in this listing.
1601 void
1603 {
1604  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1605  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
      // compute_unit is only read by the DPRINTFs below
1606  GEM5_VAR_USED ComputeUnit *compute_unit = scalarDataPort.computeUnit;
1607 
1608  if (!(scalarDataPort.sendTimingReq(pkt))) {
1609  scalarDataPort.retries.push_back(pkt);
1610 
1611  DPRINTF(GPUPort,
1612  "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
1613  compute_unit->cu_id, gpuDynInst->simdId,
1614  gpuDynInst->wfSlotId, pkt->req->getPaddr());
1615  } else {
1616  DPRINTF(GPUPort,
1617  "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data "
1618  "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1619  gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
1620  pkt->req->getPaddr());
1621  }
1622 }
1623 
1624 /*
1625  * The initial translation request could have been rejected,
1626  * if <retries> queue is not empty. Retry sending the translation
1627  * request. sendRetry() is called from the peer port whenever
1628  * a translation completes.
1629  */
// Unstalls the port and re-sends queued data-translation packets from the
// front of `retries`, at most as many as were queued on entry. The first
// rejected send stalls the port again and stops the loop, leaving that
// packet at the front of the queue.
// NOTE(review): the embedded listing numbers skip 1631, so the signature
// line is not visible in this listing.
1630 void
1632 {
1633  int len = retries.size();
1634 
1635  DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
1636  computeUnit->cu_id, len);
1637 
      // a retry is only expected while stalled with packets pending
1638  assert(len > 0);
1639  assert(isStalled());
1640  // recvReqRetry is an indication that the resource on which this
1641  // port was stalling on is freed. So, remove the stall first
1642  unstallPort();
1643 
1644  for (int i = 0; i < len; ++i) {
1645  PacketPtr pkt = retries.front();
1646  GEM5_VAR_USED Addr vaddr = pkt->req->getVaddr();
      // the format has two conversions (CU id, address), so pass both
1647  DPRINTF(GPUTLB, "CU%d: retrying D-translaton for address%#x", computeUnit->cu_id, vaddr);
1648 
1649  if (!sendTimingReq(pkt)) {
1650  // Stall port
1651  stallPort();
1652  DPRINTF(GPUTLB, ": failed again\n");
1653  break;
1654  } else {
1655  DPRINTF(GPUTLB, ": successful\n");
1656  retries.pop_front();
1657  }
1658  }
1659 }
1660 
// Handles a timing response on the scalar data TLB port.
//
// The GpuTLB TranslationState is unwrapped (a missing tlbEntry is fatal),
// the saved ScalarDTLBPort::SenderState is restored and then freed, and a
// new request packet is built from the translated Request and the original
// data pointer. That packet is sent through computeUnit->scalarDataPort, or
// queued in that port's retry list when the send is rejected.
// Always returns true.
//
// NOTE(review): the embedded listing numbers skip 1662 and 1707, so the
// signature line and the right-hand side of the req_pkt->senderState
// assignment are not visible in this listing.
1661 bool
1663 {
1664  assert(pkt->senderState);
1665 
1666  X86ISA::GpuTLB::TranslationState *translation_state =
1667  safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
1668 
1669  // Page faults are not allowed
1670  fatal_if(!translation_state->tlbEntry,
1671  "Translation of vaddr %#x failed\n", pkt->req->getVaddr());
1672 
1673  delete translation_state->tlbEntry;
1674  assert(!translation_state->ports.size());
1675 
      // restore the sender state that was saved when translation started
1676  pkt->senderState = translation_state->saved;
1677  delete translation_state;
1678 
1679  ScalarDTLBPort::SenderState *sender_state =
1680  safe_cast<ScalarDTLBPort::SenderState*>(pkt->senderState);
1681 
      // copy the instruction pointer out before freeing the sender state
1682  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1683  delete pkt->senderState;
1684 
1685  GEM5_VAR_USED Wavefront *w = gpuDynInst->wavefront();
1686 
      // NOTE(review): the message is labelled "PA" but prints the virtual
      // address followed by the physical address
1687  DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
1688  "translation: PA %#x -> %#x\n", computeUnit->cu_id, w->simdId,
1689  w->wfSlotId, w->kernId, pkt->req->getVaddr(), pkt->req->getPaddr());
1690 
1691  MemCmd mem_cmd;
1692 
      // turn the response command back into the matching request command
1693  if (pkt->cmd == MemCmd::ReadResp) {
1694  mem_cmd = MemCmd::ReadReq;
1695  } else if (pkt->cmd == MemCmd::WriteResp) {
1696  mem_cmd = MemCmd::WriteReq;
1697  } else {
1698  fatal("Scalar DTLB receieved unexpected MemCmd response %s\n",
1699  pkt->cmd.toString());
1700  }
1701 
1702  PacketPtr req_pkt = new Packet(pkt->req, mem_cmd);
1703  req_pkt->dataStatic(pkt->getPtr<uint8_t>());
1704  delete pkt;
1705 
1706  req_pkt->senderState =
1708 
1709  if (!computeUnit->scalarDataPort.sendTimingReq(req_pkt)) {
1710  computeUnit->scalarDataPort.retries.push_back(req_pkt);
1711  DPRINTF(GPUMem, "send scalar req failed for: %s\n",
1712  gpuDynInst->disassemble());
1713  } else {
1714  DPRINTF(GPUMem, "send scalar req for: %s\n",
1715  gpuDynInst->disassemble());
1716  }
1717 
1718  return true;
1719 }
1720 
// Handles a timing response on the instruction TLB port.
//
// The GpuTLB TranslationState is unwrapped and freed, then the saved
// ITLBPort::SenderState is restored to find the wavefront and freed as well.
// On a successful translation (non-null tlbEntry) the packet's command is
// reset to ReadReq and passed on to fetchStage.fetch(); otherwise the
// wavefront's pendingFetch is cleared (and dropFetch reset if it was set).
// Always returns true.
//
// NOTE(review): the embedded listing numbers skip 1722, so the signature
// line is not visible in this listing.
1721 bool
1723 {
1724  GEM5_VAR_USED Addr line = pkt->req->getPaddr();
1725  DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
1726  computeUnit->cu_id, pkt->req->getVaddr(), line);
1727 
1728  assert(pkt->senderState);
1729 
1730  // pop off the TLB translation state
1731  X86ISA::GpuTLB::TranslationState *translation_state
1732  = safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
1733 
      // record the outcome before the entry is deleted
1734  bool success = translation_state->tlbEntry != nullptr;
1735  delete translation_state->tlbEntry;
1736  assert(!translation_state->ports.size());
1737  pkt->senderState = translation_state->saved;
1738  delete translation_state;
1739 
1740  // use the original sender state to know how to close this transaction
1741  ITLBPort::SenderState *sender_state =
1742  safe_cast<ITLBPort::SenderState*>(pkt->senderState);
1743 
1744  // get the wavefront associated with this translation request
1745  Wavefront *wavefront = sender_state->wavefront;
1746  delete pkt->senderState;
1747 
1748  if (success) {
1749  // pkt is reused in fetch(), don't delete it here. However, we must
1750  // reset the command to be a request so that it can be sent through
1751  // the cu's request port
1752  assert(pkt->cmd == MemCmd::ReadResp);
1753  pkt->cmd = MemCmd::ReadReq;
1754 
1755  computeUnit->fetchStage.fetch(pkt, wavefront);
1756  } else {
      // NOTE(review): pkt is not deleted on this failure path - confirm
      // who owns it
1757  if (wavefront->dropFetch) {
1758  assert(wavefront->instructionBuffer.empty());
1759  wavefront->dropFetch = false;
1760  }
1761 
1762  wavefront->pendingFetch = 0;
1763  }
1764 
1765  return true;
1766 }
1767 
1768 /*
1769  * The initial translation request could have been rejected, if
1770  * <retries> queue is not empty. Retry sending the translation
1771  * request. sendRetry() is called from the peer port whenever
1772  * a translation completes.
1773  */
// Unstalls the port and re-sends queued instruction-translation packets from
// the front of `retries`, at most as many as were queued on entry. The first
// rejected send stalls the port again and stops the loop, leaving that
// packet at the front of the queue.
// NOTE(review): the embedded listing numbers skip 1775, so the signature
// line is not visible in this listing; `computeUnit` is the same member the
// ITLB response handler in this file reads the CU id from.
1774 void
1776 {
1777 
1778  int len = retries.size();
      // the format has two conversions (CU id, count), so pass both
1779  DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n", computeUnit->cu_id, len);
1780 
      // a retry is only expected while stalled with packets pending
1781  assert(len > 0);
1782  assert(isStalled());
1783 
1784  // recvReqRetry is an indication that the resource on which this
1785  // port was stalling on is freed. So, remove the stall first
1786  unstallPort();
1787 
1788  for (int i = 0; i < len; ++i) {
1789  PacketPtr pkt = retries.front();
1790  GEM5_VAR_USED Addr vaddr = pkt->req->getVaddr();
      // the format has two conversions (CU id, address), so pass both
1791  DPRINTF(GPUTLB, "CU%d: retrying I-translaton for address%#x", computeUnit->cu_id, vaddr);
1792 
1793  if (!sendTimingReq(pkt)) {
1794  stallPort(); // Stall port
1795  DPRINTF(GPUTLB, ": failed again\n");
1796  break;
1797  } else {
1798  DPRINTF(GPUTLB, ": successful\n");
1799  retries.pop_front();
1800  }
1801  }
1802 }
1803 
// Updates per-instruction-class statistics for gpuDynInst.
//
// Scalar instructions are classified as ALU (excluding waitcnt), load or
// store. Vector instructions are classified as ALU, FLAT (LDS or vmem),
// local memory, load or store; vector loads and stores are additionally
// counted per memory segment according to executedAs(). A load or store
// whose segment matches no listed case is fatal.
//
// NOTE(review): the embedded listing numbers skip many lines in this
// function (1805, 1810, 1812, 1814, 1818-1819, 1823-1824, 1833, 1835, 1837,
// 1864-1867, 1888, 1897-1900), so the signature and several statements -
// presumably further counter updates - are not visible in this listing.
// Branches that look empty below are incomplete, not no-ops.
1804 void
1806 {
1807  if (gpuDynInst->isScalar()) {
1808  if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
1809  stats.sALUInsts++;
1811  } else if (gpuDynInst->isLoad()) {
1813  } else if (gpuDynInst->isStore()) {
1815  }
1816  } else {
1817  if (gpuDynInst->isALU()) {
      // NOTE(review): the condition guarding this exitSimLoop call is on
      // lines absent from this listing
1820  exitSimLoop("max vALU insts");
1821  }
1822  stats.vALUInsts++;
      // NOTE(review): the left-hand side of this += is on an absent line;
      // the right-hand side is the count of the wavefront's exec mask
1825  += gpuDynInst->wavefront()->execMask().count();
1826  } else if (gpuDynInst->isFlat()) {
1827  if (gpuDynInst->isLocalMem()) {
1828  stats.flatLDSInsts++;
1829  } else {
1830  stats.flatVMemInsts++;
1831  }
1832  } else if (gpuDynInst->isLocalMem()) {
1834  } else if (gpuDynInst->isLoad()) {
1836  } else if (gpuDynInst->isStore()) {
1838  }
1839 
      // per-segment read/write counters for vector loads and stores
1840  if (gpuDynInst->isLoad()) {
1841  switch (gpuDynInst->executedAs()) {
1842  case enums::SC_SPILL:
1843  stats.spillReads++;
1844  break;
1845  case enums::SC_GLOBAL:
1846  stats.globalReads++;
1847  break;
1848  case enums::SC_GROUP:
1849  stats.groupReads++;
1850  break;
1851  case enums::SC_PRIVATE:
1852  stats.privReads++;
1853  break;
1854  case enums::SC_READONLY:
1855  stats.readonlyReads++;
1856  break;
1857  case enums::SC_KERNARG:
1858  stats.kernargReads++;
1859  break;
1860  case enums::SC_ARG:
1861  stats.argReads++;
1862  break;
1863  case enums::SC_NONE:
1868  break;
1869  default:
1870  fatal("%s has no valid segment\n", gpuDynInst->disassemble());
1871  break;
1872  }
1873  } else if (gpuDynInst->isStore()) {
1874  switch (gpuDynInst->executedAs()) {
1875  case enums::SC_SPILL:
1876  stats.spillWrites++;
1877  break;
1878  case enums::SC_GLOBAL:
1879  stats.globalWrites++;
1880  break;
1881  case enums::SC_GROUP:
1882  stats.groupWrites++;
1883  break;
1884  case enums::SC_PRIVATE:
1885  stats.privWrites++;
1886  break;
1887  case enums::SC_READONLY:
1889  break;
1890  case enums::SC_KERNARG:
1891  stats.kernargWrites++;
1892  break;
1893  case enums::SC_ARG:
1894  stats.argWrites++;
1895  break;
1896  case enums::SC_NONE:
1901  break;
1902  default:
1903  fatal("%s has no valid segment\n", gpuDynInst->disassemble());
1904  break;
1905  }
1906  }
1907  }
1908 }
1909 
// Counts one more touch of the page containing `addr`: the address is
// rounded down to an X86ISA::PageBytes boundary and the matching entry in
// pagesTouched is set to 1 on first use or incremented otherwise.
// NOTE(review): the embedded listing numbers skip 1911, so the signature
// line is not visible in this listing.
1910 void
1912 {
1913  Addr virt_page_addr = roundDown(addr, X86ISA::PageBytes);
1914 
1915  if (!pagesTouched.count(virt_page_addr))
1916  pagesTouched[virt_page_addr] = 1;
1917  else
1918  pagesTouched[virt_page_addr]++;
1919 }
1920 
// When countPages is set, writes the per-page access statistics as CSV to
// an output stream created through simout under this object's name(): one
// header row, then one row per pageAccesses entry with the page address in
// hex followed by the two counters of its value pair in decimal.
// NOTE(review): the embedded listing numbers skip 1922, so the signature
// line is not visible in this listing.
1921 void
1923 {
1924  if (countPages) {
1925  std::ostream *page_stat_file = simout.create(name().c_str())->stream();
1926 
1927  *page_stat_file << "page, wavefront accesses, workitem accesses" <<
1928  std::endl;
1929 
      // `auto iter` copies each map entry; only read access is needed
1930  for (auto iter : pageAccesses) {
1931  *page_stat_file << std::hex << iter.first << ",";
1932  *page_stat_file << std::dec << iter.second.first << ",";
1933  *page_stat_file << std::dec << iter.second.second << std::endl;
1934  }
1935  }
1936 }
1937 
// Reports whether this compute unit has no outstanding work. Returns false
// as soon as any vector ALU is not idle or any of the three register-file to
// memory-pipe buses is not ready; returns true only when every check passes.
// NOTE(review): the embedded listing numbers skip 1939 and 1958-1961, so the
// signature line and the condition of the final check are not visible in
// this listing.
1938 bool
1940 {
1941  for (int i = 0; i < numVectorALUs; ++i) {
1942  if (!isVectorAluIdle(i)) {
1943  return false;
1944  }
1945  }
1946 
1947  // TODO: FIXME if more than 1 of any memory pipe supported
1948  if (!srfToScalarMemPipeBus.rdy()) {
1949  return false;
1950  }
1951  if (!vrfToGlobalMemPipeBus.rdy()) {
1952  return false;
1953  }
1954  if (!vrfToLocalMemPipeBus.rdy()) {
1955  return false;
1956  }
1957 
      // NOTE(review): the `if` that opens this block is on absent lines
1962  return false;
1963  }
1964 
1965  return true;
1966 }
1967 
// Returns the reference counter that the LDS object (`lds`) reports for the
// given dispatch id and workgroup id; this is a pure forwarding call.
1968 int32_t
1969 ComputeUnit::getRefCounter(const uint32_t dispatchId,
1970  const uint32_t wgId) const
1971 {
1972  return lds.getRefCounter(dispatchId, wgId);
1973 }
1974 
// Returns true when every wavefront slot of the given SIMD unit is stopped,
// i.e. all wfList[simdId][0 .. shader->n_wf) report Wavefront::S_STOPPED.
// simdId must be smaller than numVectorALUs (asserted).
1975 bool
1976 ComputeUnit::isVectorAluIdle(uint32_t simdId) const
1977 {
1978  assert(simdId < numVectorALUs);
1979 
1980  for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){
      // a single wavefront that is not stopped makes the ALU busy
1981  if (wfList[simdId][i_wf]->getStatus() != Wavefront::S_STOPPED) {
1982  return false;
1983  }
1984  }
1985 
1986  return true;
1987 }
1988 
// Wraps gpuDynInst in a freshly allocated packet and sends it through
// ldsPort as a timing request. The Request is default-constructed with its
// physical address set to 0, and the instruction travels in an
// LDSPort::SenderState. Returns the result of ldsPort.sendTimingReq().
// NOTE(review): the embedded listing numbers skip 1989-1993 and 1995, so
// any original doc comment and the signature line are not visible in this
// listing.
1994 bool
1996 {
1997  // this is just a request to carry the GPUDynInstPtr
1998  // back and forth
1999  RequestPtr newRequest = std::make_shared<Request>();
2000  newRequest->setPaddr(0x0);
2001 
2002  // ReadReq is not evaluted by the LDS but the Packet ctor requires this
2003  PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq);
2004 
2005  // This is the SenderState needed upon return
2006  newPacket->senderState = new LDSPort::SenderState(gpuDynInst);
2007 
2008  return ldsPort.sendTimingReq(newPacket);
2009 }
2010 
// Handles a response packet coming back from the LDS: recovers the memory
// instruction from the packet's LDSPort::SenderState (fatal if the sender
// state has a different type), frees the sender state and the packet, and
// pushes the instruction onto the local memory pipe's response FIFO.
// Always returns true.
// NOTE(review): the embedded listing numbers skip 2011-2013 and 2015, so
// any original doc comment and the signature line are not visible in this
// listing.
2014 bool
2016 {
2017  const ComputeUnit::LDSPort::SenderState *senderState =
2018  dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState);
2019 
2020  fatal_if(!senderState, "did not get the right sort of sender state");
2021 
      // copy the instruction pointer out before the sender state is freed
2022  GPUDynInstPtr gpuDynInst = senderState->getMemInst();
2023 
2024  delete packet->senderState;
2025  delete packet;
2026 
2027  computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
2028  return true;
2029 }
2030 
// Sends a packet to the LDS with stall handling.
//
// - If the port is already stalled, the packet is queued in `retries` and
//   false is returned.
// - If RequestPort::sendTimingReq() rejects the packet, the port is stalled,
//   the packet is queued and false is returned.
// - Otherwise the packet was accepted and true is returned.
// A packet without an LDSPort::SenderState is fatal.
//
// NOTE(review): the embedded listing numbers skip 2031-2035 and 2037, so
// any original doc comment and the signature line are not visible in this
// listing.
2036 bool
2038 {
2039  ComputeUnit::LDSPort::SenderState *sender_state =
2040  dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
2041  fatal_if(!sender_state, "packet without a valid sender state");
2042 
      // gpuDynInst is only read by the DPRINTFs below
2043  GEM5_VAR_USED GPUDynInstPtr gpuDynInst = sender_state->getMemInst();
2044 
2045  if (isStalled()) {
2046  fatal_if(retries.empty(), "must have retries waiting to be stalled");
2047 
2048  retries.push(pkt);
2049 
2050  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
2051  computeUnit->cu_id, gpuDynInst->simdId,
2052  gpuDynInst->wfSlotId);
2053  return false;
2054  } else if (!RequestPort::sendTimingReq(pkt)) {
2055  // need to stall the LDS port until a recvReqRetry() is received
2056  // this indicates that there is more space
2057  stallPort();
2058  retries.push(pkt);
2059 
2060  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
2061  computeUnit->cu_id, gpuDynInst->simdId,
2062  gpuDynInst->wfSlotId, pkt->req->getPaddr());
2063  return false;
2064  } else {
2065  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
2066  computeUnit->cu_id, gpuDynInst->simdId,
2067  gpuDynInst->wfSlotId, pkt->req->getPaddr());
2068  return true;
2069  }
2070 }
2071 
// Called when the LDS can accept requests again. Unstalls the port and
// re-sends queued packets from the front of `retries` until the queue is
// empty or a send is rejected; a rejection stalls the port again and leaves
// the rejected packet at the front of the queue. Being called with an empty
// queue, or while the port is not stalled, is fatal.
// NOTE(review): the embedded listing numbers skip 2072-2077 and 2079, so
// any original doc comment and the signature line are not visible in this
// listing.
2078 void
2080 {
2081  auto queueSize = retries.size();
2082 
2083  DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
2084  computeUnit->cu_id, queueSize);
2085 
2086  fatal_if(queueSize < 1,
2087  "why was there a recvReqRetry() with no pending reqs?");
2088  fatal_if(!isStalled(),
2089  "recvReqRetry() happened when the port was not stalled");
2090 
2091  unstallPort();
2092 
2093  while (!retries.empty()) {
2094  PacketPtr packet = retries.front();
2095 
2096  DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);
2097 
2098  if (!RequestPort::sendTimingReq(packet)) {
2099  // Stall port
2100  stallPort();
2101  DPRINTF(GPUPort, ": LDS send failed again\n");
2102  break;
2103  } else {
      // logged under GPUPort like every other message in this function,
      // so one debug flag shows the whole retry sequence
2104  DPRINTF(GPUPort, ": LDS send successful\n");
2105  retries.pop();
2106  }
2107  }
2108 }
2109 
2111  int n_wf)
2112  : statistics::Group(parent),
2113  ADD_STAT(vALUInsts, "Number of vector ALU insts issued."),
2114  ADD_STAT(vALUInstsPerWF, "The avg. number of vector ALU insts issued "
2115  "per-wavefront."),
2116  ADD_STAT(sALUInsts, "Number of scalar ALU insts issued."),
2117  ADD_STAT(sALUInstsPerWF, "The avg. number of scalar ALU insts issued "
2118  "per-wavefront."),
2119  ADD_STAT(instCyclesVALU,
2120  "Number of cycles needed to execute VALU insts."),
2121  ADD_STAT(instCyclesSALU,
2122  "Number of cycles needed to execute SALU insts."),
2123  ADD_STAT(threadCyclesVALU, "Number of thread cycles used to execute "
2124  "vector ALU ops. Similar to instCyclesVALU but multiplied by "
2125  "the number of active threads."),
2126  ADD_STAT(vALUUtilization,
2127  "Percentage of active vector ALU threads in a wave."),
2128  ADD_STAT(ldsNoFlatInsts, "Number of LDS insts issued, not including FLAT"
2129  " accesses that resolve to LDS."),
2130  ADD_STAT(ldsNoFlatInstsPerWF, "The avg. number of LDS insts (not "
2131  "including FLAT accesses that resolve to LDS) per-wavefront."),
2132  ADD_STAT(flatVMemInsts,
2133  "The number of FLAT insts that resolve to vmem issued."),
2134  ADD_STAT(flatVMemInstsPerWF, "The average number of FLAT insts that "
2135  "resolve to vmem issued per-wavefront."),
2136  ADD_STAT(flatLDSInsts,
2137  "The number of FLAT insts that resolve to LDS issued."),
2138  ADD_STAT(flatLDSInstsPerWF, "The average number of FLAT insts that "
2139  "resolve to LDS issued per-wavefront."),
2140  ADD_STAT(vectorMemWrites,
2141  "Number of vector mem write insts (excluding FLAT insts)."),
2142  ADD_STAT(vectorMemWritesPerWF, "The average number of vector mem write "
2143  "insts (excluding FLAT insts) per-wavefront."),
2144  ADD_STAT(vectorMemReads,
2145  "Number of vector mem read insts (excluding FLAT insts)."),
2146  ADD_STAT(vectorMemReadsPerWF, "The avg. number of vector mem read insts "
2147  "(excluding FLAT insts) per-wavefront."),
2148  ADD_STAT(scalarMemWrites, "Number of scalar mem write insts."),
2149  ADD_STAT(scalarMemWritesPerWF,
2150  "The average number of scalar mem write insts per-wavefront."),
2151  ADD_STAT(scalarMemReads, "Number of scalar mem read insts."),
2152  ADD_STAT(scalarMemReadsPerWF,
2153  "The average number of scalar mem read insts per-wavefront."),
2154  ADD_STAT(vectorMemReadsPerKiloInst,
2155  "Number of vector mem reads per kilo-instruction"),
2156  ADD_STAT(vectorMemWritesPerKiloInst,
2157  "Number of vector mem writes per kilo-instruction"),
2158  ADD_STAT(vectorMemInstsPerKiloInst,
2159  "Number of vector mem insts per kilo-instruction"),
2160  ADD_STAT(scalarMemReadsPerKiloInst,
2161  "Number of scalar mem reads per kilo-instruction"),
2162  ADD_STAT(scalarMemWritesPerKiloInst,
2163  "Number of scalar mem writes per kilo-instruction"),
2164  ADD_STAT(scalarMemInstsPerKiloInst,
2165  "Number of scalar mem insts per kilo-instruction"),
2166  ADD_STAT(instCyclesVMemPerSimd, "Number of cycles to send address, "
2167  "command, data from VRF to vector memory unit, per SIMD"),
2168  ADD_STAT(instCyclesScMemPerSimd, "Number of cycles to send address, "
2169  "command, data from SRF to scalar memory unit, per SIMD"),
2170  ADD_STAT(instCyclesLdsPerSimd, "Number of cycles to send address, "
2171  "command, data from VRF to LDS unit, per SIMD"),
2172  ADD_STAT(globalReads, "Number of reads to the global segment"),
2173  ADD_STAT(globalWrites, "Number of writes to the global segment"),
2174  ADD_STAT(globalMemInsts,
2175  "Number of memory instructions sent to the global segment"),
2176  ADD_STAT(argReads, "Number of reads to the arg segment"),
2177  ADD_STAT(argWrites, "NUmber of writes to the arg segment"),
2178  ADD_STAT(argMemInsts,
2179  "Number of memory instructions sent to the arg segment"),
2180  ADD_STAT(spillReads, "Number of reads to the spill segment"),
2181  ADD_STAT(spillWrites, "Number of writes to the spill segment"),
2182  ADD_STAT(spillMemInsts,
2183  "Number of memory instructions sent to the spill segment"),
2184  ADD_STAT(groupReads, "Number of reads to the group segment"),
2185  ADD_STAT(groupWrites, "Number of writes to the group segment"),
2186  ADD_STAT(groupMemInsts,
2187  "Number of memory instructions sent to the group segment"),
2188  ADD_STAT(privReads, "Number of reads to the private segment"),
2189  ADD_STAT(privWrites, "Number of writes to the private segment"),
2190  ADD_STAT(privMemInsts,
2191  "Number of memory instructions sent to the private segment"),
2192  ADD_STAT(readonlyReads, "Number of reads to the readonly segment"),
2193  ADD_STAT(readonlyWrites,
2194  "Number of memory instructions sent to the readonly segment"),
2195  ADD_STAT(readonlyMemInsts,
2196  "Number of memory instructions sent to the readonly segment"),
2197  ADD_STAT(kernargReads, "Number of reads sent to the kernarg segment"),
2198  ADD_STAT(kernargWrites,
2199  "Number of memory instructions sent to the kernarg segment"),
2200  ADD_STAT(kernargMemInsts,
2201  "Number of memory instructions sent to the kernarg segment"),
2202  ADD_STAT(waveLevelParallelism,
2203  "wave level parallelism: count of active waves at wave launch"),
2204  ADD_STAT(tlbRequests, "number of uncoalesced requests"),
2205  ADD_STAT(tlbCycles,
2206  "total number of cycles for all uncoalesced requests"),
2207  ADD_STAT(tlbLatency, "Avg. translation latency for data translations"),
2208  ADD_STAT(hitsPerTLBLevel,
2209  "TLB hits distribution (0 for page table, x for Lx-TLB)"),
2210  ADD_STAT(ldsBankAccesses, "Total number of LDS bank accesses"),
2211  ADD_STAT(ldsBankConflictDist,
2212  "Number of bank conflicts per LDS memory packet"),
2213  ADD_STAT(pageDivergenceDist,
2214  "pages touched per wf (over all mem. instr.)"),
2215  ADD_STAT(dynamicGMemInstrCnt,
2216  "dynamic non-flat global memory instruction count"),
2217  ADD_STAT(dynamicFlatMemInstrCnt,
2218  "dynamic flat global memory instruction count"),
2219  ADD_STAT(dynamicLMemInstrCnt, "dynamic local memory intruction count"),
2220  ADD_STAT(wgBlockedDueBarrierAllocation,
2221  "WG dispatch was blocked due to lack of barrier resources"),
2222  ADD_STAT(wgBlockedDueLdsAllocation,
2223  "Workgroup blocked due to LDS capacity"),
2224  ADD_STAT(numInstrExecuted, "number of instructions executed"),
2225  ADD_STAT(execRateDist, "Instruction Execution Rate: Number of executed "
2226  "vector instructions per cycle"),
2227  ADD_STAT(numVecOpsExecuted,
2228  "number of vec ops executed (e.g. WF size/inst)"),
2229  ADD_STAT(numVecOpsExecutedF16,
2230  "number of f16 vec ops executed (e.g. WF size/inst)"),
2231  ADD_STAT(numVecOpsExecutedF32,
2232  "number of f32 vec ops executed (e.g. WF size/inst)"),
2233  ADD_STAT(numVecOpsExecutedF64,
2234  "number of f64 vec ops executed (e.g. WF size/inst)"),
2235  ADD_STAT(numVecOpsExecutedFMA16,
2236  "number of fma16 vec ops executed (e.g. WF size/inst)"),
2237  ADD_STAT(numVecOpsExecutedFMA32,
2238  "number of fma32 vec ops executed (e.g. WF size/inst)"),
2239  ADD_STAT(numVecOpsExecutedFMA64,
2240  "number of fma64 vec ops executed (e.g. WF size/inst)"),
2241  ADD_STAT(numVecOpsExecutedMAC16,
2242  "number of mac16 vec ops executed (e.g. WF size/inst)"),
2243  ADD_STAT(numVecOpsExecutedMAC32,
2244  "number of mac32 vec ops executed (e.g. WF size/inst)"),
2245  ADD_STAT(numVecOpsExecutedMAC64,
2246  "number of mac64 vec ops executed (e.g. WF size/inst)"),
2247  ADD_STAT(numVecOpsExecutedMAD16,
2248  "number of mad16 vec ops executed (e.g. WF size/inst)"),
2249  ADD_STAT(numVecOpsExecutedMAD32,
2250  "number of mad32 vec ops executed (e.g. WF size/inst)"),
2251  ADD_STAT(numVecOpsExecutedMAD64,
2252  "number of mad64 vec ops executed (e.g. WF size/inst)"),
2253  ADD_STAT(numVecOpsExecutedTwoOpFP,
2254  "number of two op FP vec ops executed (e.g. WF size/inst)"),
2255  ADD_STAT(totalCycles, "number of cycles the CU ran for"),
2256  ADD_STAT(vpc, "Vector Operations per cycle (this CU only)"),
2257  ADD_STAT(vpc_f16, "F16 Vector Operations per cycle (this CU only)"),
2258  ADD_STAT(vpc_f32, "F32 Vector Operations per cycle (this CU only)"),
2259  ADD_STAT(vpc_f64, "F64 Vector Operations per cycle (this CU only)"),
2260  ADD_STAT(ipc, "Instructions per cycle (this CU only)"),
2261  ADD_STAT(controlFlowDivergenceDist, "number of lanes active per "
2262  "instruction (over all instructions)"),
2263  ADD_STAT(activeLanesPerGMemInstrDist,
2264  "number of active lanes per global memory instruction"),
2265  ADD_STAT(activeLanesPerLMemInstrDist,
2266  "number of active lanes per local memory instruction"),
2267  ADD_STAT(numALUInstsExecuted,
2268  "Number of dynamic non-GM memory insts executed"),
2269  ADD_STAT(numTimesWgBlockedDueVgprAlloc, "Number of times WGs are "
2270  "blocked due to VGPR allocation per SIMD"),
2271  ADD_STAT(numTimesWgBlockedDueSgprAlloc, "Number of times WGs are "
2272  "blocked due to SGPR allocation per SIMD"),
2273  ADD_STAT(numCASOps, "number of compare and swap operations"),
2274  ADD_STAT(numFailedCASOps,
2275  "number of compare and swap operations that failed"),
2276  ADD_STAT(completedWfs, "number of completed wavefronts"),
2277  ADD_STAT(completedWGs, "number of completed workgroups"),
2278  ADD_STAT(headTailLatency, "ticks between first and last cache block "
2279  "arrival at coalescer"),
2280  ADD_STAT(instInterleave, "Measure of instruction interleaving per SIMD")
2281 {
2282  ComputeUnit *cu = static_cast<ComputeUnit*>(parent);
2283 
2287 
2288  hitsPerTLBLevel.init(4);
2289  execRateDist.init(0, 10, 2);
2290  ldsBankConflictDist.init(0, cu->wfSize(), 2);
2291 
2292  pageDivergenceDist.init(1, cu->wfSize(), 4);
2293  controlFlowDivergenceDist.init(1, cu->wfSize(), 4);
2296 
2297  headTailLatency.init(0, 1000000, 10000).flags(statistics::pdf |
2299  waveLevelParallelism.init(0, n_wf * cu->numVectorALUs, 1);
2300  instInterleave.init(cu->numVectorALUs, 0, 20, 1);
2301 
2304  vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
2312 
2321 
2329 
2331 
2332  // fixed number of TLB levels
2333  for (int i = 0; i < 4; ++i) {
2334  if (!i)
2335  hitsPerTLBLevel.subname(i,"page_table");
2336  else
2337  hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
2338  }
2339 
2345 
2348 }
2349 
2350 } // namespace gem5
gem5::ComputeUnit::ComputeUnitStats::tlbRequests
statistics::Scalar tlbRequests
Definition: compute_unit.hh:1011
gem5::ComputeUnit::ComputeUnitStats::sALUInstsPerWF
statistics::Formula sALUInstsPerWF
Definition: compute_unit.hh:952
gem5::curTick
Tick curTick()
The universal simulation clock.
Definition: cur_tick.hh:46
fatal
#define fatal(...)
This implements a cprintf based fatal() function.
Definition: logging.hh:189
gem5::PortID
int16_t PortID
Port index/ID type, and a symbolic name for an invalid port id.
Definition: types.hh:252
gem5::ComputeUnit::ComputeUnitStats::vALUUtilization
statistics::Formula vALUUtilization
Definition: compute_unit.hh:956
gem5::ComputeUnit::getAndIncSeqNum
InstSeqNum getAndIncSeqNum()
Definition: compute_unit.hh:883
gem5::GMEnqueue
@ GMEnqueue
Definition: misc.hh:58
gem5::HSAQueueEntry::numWg
int numWg(int dim) const
Definition: hsa_queue_entry.hh:237
gem5::ComputeUnit::wfList
std::vector< std::vector< Wavefront * > > wfList
Definition: compute_unit.hh:293
gem5::ComputeUnit::ComputeUnit
ComputeUnit(const Params &p)
Definition: compute_unit.cc:66
gem5::BaseMMU::Read
@ Read
Definition: mmu.hh:53
gem5::ComputeUnit::ComputeUnitStats::scalarMemReadsPerWF
statistics::Formula scalarMemReadsPerWF
Definition: compute_unit.hh:970
gem5::RequestPort::sendTimingReq
bool sendTimingReq(PacketPtr pkt)
Attempt to send a timing request to the responder port by calling its corresponding receive function.
Definition: port.hh:495
gem5::ArmISA::len
Bitfield< 18, 16 > len
Definition: misc_types.hh:444
gem5::LdsState::getRefCounter
int getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
return the current reference count for this workgroup id
Definition: lds_state.hh:332
gem5::ComputeUnit::ComputeUnitStats::instCyclesSALU
statistics::Scalar instCyclesSALU
Definition: compute_unit.hh:954
gem5::Shader::gpuTc
ThreadContext * gpuTc
Definition: shader.hh:110
simple_pool_manager.hh
gem5::Wavefront::S_RUNNING
@ S_RUNNING
Definition: wavefront.hh:72
gem5::ComputeUnit::fetchStage
FetchStage fetchStage
Definition: compute_unit.hh:282
gem5::ComputeUnit::ComputeUnitStats::instInterleave
statistics::VectorDistribution instInterleave
Definition: compute_unit.hh:1090
gem5::ComputeUnit::ComputeUnitStats::flatVMemInsts
statistics::Scalar flatVMemInsts
Definition: compute_unit.hh:959
gem5::ComputeUnit::ScalarDTLBPort::retries
std::deque< PacketPtr > retries
Definition: compute_unit.hh:714
gem5::ComputeUnit::sendRequest
void sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
Definition: compute_unit.cc:1011
gem5::ScalarMemPipeline::exec
void exec()
Definition: scalar_memory_pipeline.cc:56
gem5::MipsISA::w
Bitfield< 0 > w
Definition: pra_constants.hh:281
gem5::ComputeUnit::debugSegFault
bool debugSegFault
Definition: compute_unit.hh:343
gem5::FetchStage::exec
void exec()
Definition: fetch_stage.cc:67
shader.hh
gem5::ComputeUnit::DataPort::processMemReqEvent
void processMemReqEvent(PacketPtr pkt)
Definition: compute_unit.cc:1573
gem5::ComputeUnit::localMemoryPipe
LocalMemPipeline localMemoryPipe
Definition: compute_unit.hh:287
gem5::ComputeUnit::ComputeUnitStats::privWrites
statistics::Scalar privWrites
Definition: compute_unit.hh:998
gem5::ComputeUnit::ComputeUnitStats::kernargWrites
statistics::Scalar kernargWrites
Definition: compute_unit.hh:1004
gem5::MemCmd::SwapReq
@ SwapReq
Definition: packet.hh:115
gem5::ComputeUnit::numVecRegsPerSimd
int numVecRegsPerSimd
Definition: compute_unit.hh:373
gem5::ComputeUnit::srf
std::vector< ScalarRegisterFile * > srf
Definition: compute_unit.hh:299
gem5::ComputeUnit::ComputeUnitStats::scalarMemWritesPerWF
statistics::Formula scalarMemWritesPerWF
Definition: compute_unit.hh:968
gem5::ComputeUnit::ComputeUnitStats::argMemInsts
statistics::Formula argMemInsts
Definition: compute_unit.hh:990
gem5::ComputeUnit::ITLBPort::recvTimingResp
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
Definition: compute_unit.cc:1722
gem5::ComputeUnit::ComputeUnitStats::spillWrites
statistics::Scalar spillWrites
Definition: compute_unit.hh:992
gem5::ComputeUnit::ComputeUnitStats::spillMemInsts
statistics::Formula spillMemInsts
Definition: compute_unit.hh:993
gem5::ComputeUnit::ComputeUnitStats::scalarMemWritesPerKiloInst
statistics::Formula scalarMemWritesPerKiloInst
Definition: compute_unit.hh:976
gem5::MipsISA::index
Bitfield< 30, 0 > index
Definition: pra_constants.hh:47
gem5::ComputeUnit::ComputeUnitStats::readonlyReads
statistics::Scalar readonlyReads
Definition: compute_unit.hh:1000
gem5::ComputeUnit::LDSPort::SenderState
SenderState is information carried along with the packet, esp.
Definition: compute_unit.hh:785
gem5::ComputeUnit::ComputeUnitStats::wgBlockedDueBarrierAllocation
statistics::Scalar wgBlockedDueBarrierAllocation
Definition: compute_unit.hh:1030
gem5::ComputeUnit::DTLBPort::recvTimingResp
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
Definition: compute_unit.cc:1389
gem5::Packet::pushSenderState
void pushSenderState(SenderState *sender_state)
Push a new sender state to the packet and make the current sender state the predecessor of the new on...
Definition: packet.cc:316
gem5::ComputeUnit::ComputeUnitStats::vectorMemReads
statistics::Scalar vectorMemReads
Definition: compute_unit.hh:965
gem5::BaseMMU::Mode
Mode
Definition: mmu.hh:53
gem5::Packet::req
RequestPtr req
A pointer to the original request.
Definition: packet.hh:366
gem5::ComputeUnit::lastVaddrSimd
std::vector< std::vector< Addr > > lastVaddrSimd
Definition: compute_unit.hh:338
gem5::BaseMMU::Write
@ Write
Definition: mmu.hh:53
gem5::Wavefront
Definition: wavefront.hh:62
gem5::ComputeUnit::ScalarDTLBPort::isStalled
bool isStalled() const
Definition: compute_unit.hh:710
gem5::FetchStage::init
void init()
Definition: fetch_stage.cc:58
gem5::ComputeUnit::ComputeUnitStats::numVecOpsExecutedF64
statistics::Scalar numVecOpsExecutedF64
Definition: compute_unit.hh:1046
gem5::ComputeUnit::ComputeUnitStats::dynamicGMemInstrCnt
statistics::Scalar dynamicGMemInstrCnt
Definition: compute_unit.hh:1025
gem5::HSAQueueEntry
Definition: hsa_queue_entry.hh:61
compute_unit.hh
gem5::ComputeUnit::firstMemUnit
int firstMemUnit() const
Definition: compute_unit.cc:241
gem5::X86ISA::GpuTLB::TranslationState::saved
Packet::SenderState * saved
Definition: gpu_tlb.hh:310
gem5::ComputeUnit::pagesTouched
std::map< Addr, int > pagesTouched
Definition: compute_unit.hh:380
gpu_static_inst.hh
gem5::VectorMask
std::bitset< std::numeric_limits< unsigned long long >::digits > VectorMask
Definition: misc.hh:47
gem5::ComputeUnit::scoreboardCheckStage
ScoreboardCheckStage scoreboardCheckStage
Definition: compute_unit.hh:283
gem5::ComputeUnit::stats
gem5::ComputeUnit::ComputeUnitStats stats
gem5::ComputeUnit::headTailMap
std::unordered_map< GPUDynInstPtr, Tick > headTailMap
Definition: compute_unit.hh:939
gem5::ComputeUnit::ComputeUnitStats::vpc_f16
statistics::Formula vpc_f16
Definition: compute_unit.hh:1064
gem5::ComputeUnit::ComputeUnitStats::tlbLatency
statistics::Formula tlbLatency
Definition: compute_unit.hh:1013
gem5::simout
OutputDirectory simout
Definition: output.cc:62
gem5::ComputeUnit::DataPort::processMemRespEvent
void processMemRespEvent(PacketPtr pkt)
Definition: compute_unit.cc:1309
gem5::ComputeUnit::lastVaddrCU
std::vector< Addr > lastVaddrCU
Definition: compute_unit.hh:337
gem5::MemCmd::SwapResp
@ SwapResp
Definition: packet.hh:116
gem5::ComputeUnit::ScalarDTLBPort::recvTimingResp
bool recvTimingResp(PacketPtr pkt) override
Receive a timing response from the peer.
Definition: compute_unit.cc:1662
gem5::ComputeUnit::resp_tick_latency
Tick resp_tick_latency
Definition: compute_unit.hh:358
gem5::statistics::DataWrapVec::subname
Derived & subname(off_type index, const std::string &name)
Set the subfield name for the given index, and marks this stat to print at the end of simulation.
Definition: statistics.hh:399
gem5::ComputeUnit::ComputeUnitStats::sALUInsts
statistics::Scalar sALUInsts
Definition: compute_unit.hh:951
gem5::Packet::isWrite
bool isWrite() const
Definition: packet.hh:583
gem5::ComputeUnit::exec
void exec()
Definition: compute_unit.cc:721
gem5::X86ISA::GpuTLB::TranslationState::tlbMode
Mode tlbMode
Definition: gpu_tlb.hh:288
gem5::Wavefront::pendingFetch
bool pendingFetch
Definition: wavefront.hh:113
gem5::ComputeUnit::srfToScalarMemPipeBus
WaitClass srfToScalarMemPipeBus
Definition: compute_unit.hh:241
gem5::ComputeUnit::releaseBarrier
void releaseBarrier(int bar_id)
Definition: compute_unit.cc:698
gem5::ComputeUnit::ComputeUnitStats::instCyclesScMemPerSimd
statistics::Vector instCyclesScMemPerSimd
Definition: compute_unit.hh:982
gem5::Shader::notifyCuSleep
void notifyCuSleep()
Definition: shader.cc:519
gem5::ComputeUnit::numYetToReachBarrier
int numYetToReachBarrier(int bar_id)
Definition: compute_unit.cc:649
gem5::HSAQueueEntry::wgId
int wgId(int dim) const
Definition: hsa_queue_entry.hh:211
gem5::ComputeUnit::ComputeUnitStats::ldsBankConflictDist
statistics::Distribution ldsBankConflictDist
Definition: compute_unit.hh:1019
gem5::ComputeUnit::getRefCounter
int32_t getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
Definition: compute_unit.cc:1969
gem5::EventManager::schedule
void schedule(Event &event, Tick when)
Definition: eventq.hh:1019
gem5::ComputeUnit::ITLBPort::SenderState::wavefront
Wavefront * wavefront
Definition: compute_unit.hh:744
gem5::OutputDirectory::create
OutputStream * create(const std::string &name, bool binary=false, bool no_gz=false)
Creates a file in this directory (optionally compressed).
Definition: output.cc:210
gem5::ComputeUnit::ComputeUnitStats::kernargReads
statistics::Scalar kernargReads
Definition: compute_unit.hh:1003
gem5::csprintf
std::string csprintf(const char *format, const Args &...args)
Definition: cprintf.hh:161
gem5::Wavefront::S_STOPPED
@ S_STOPPED
Definition: wavefront.hh:68
gem5::X86ISA::PageShift
const Addr PageShift
Definition: page_size.hh:48
gem5::ComputeUnit::vrfToGlobalMemPipeBus
WaitClass vrfToGlobalMemPipeBus
Definition: compute_unit.hh:225
gem5::ComputeUnit::ComputeUnitStats::ldsNoFlatInsts
statistics::Scalar ldsNoFlatInsts
Definition: compute_unit.hh:957
gem5::ComputeUnit::resetBarrier
void resetBarrier(int bar_id)
Definition: compute_unit.cc:684
gem5::ComputeUnit::ComputeUnitStats::globalReads
statistics::Scalar globalReads
Definition: compute_unit.hh:985
gem5::ComputeUnit::ComputeUnitStats::groupMemInsts
statistics::Formula groupMemInsts
Definition: compute_unit.hh:996
gem5::ComputeUnit::ComputeUnitStats::vpc
statistics::Formula vpc
Definition: compute_unit.hh:1063
gem5::RegisterManager::vrfPoolMgrs
std::vector< PoolManager * > vrfPoolMgrs
Definition: register_manager.hh:82
gem5::ComputeUnit::ComputeUnitStats::activeLanesPerLMemInstrDist
statistics::Distribution activeLanesPerLMemInstrDist
Definition: compute_unit.hh:1070
gem5::ComputeUnit::memPortTokens
TokenManager * memPortTokens
Definition: compute_unit.hh:506
gem5::GlobalMemPipeline::init
void init()
Definition: global_memory_pipeline.cc:59
gem5::ArmISA::i
Bitfield< 7 > i
Definition: misc_types.hh:66
gem5::ComputeUnit::numVectorSharedMemUnits
int numVectorSharedMemUnits
Definition: compute_unit.hh:229
gem5::ComputeUnit::shader
Shader * shader
Definition: compute_unit.hh:355
gem5::ComputeUnit::req_tick_latency
Tick req_tick_latency
Definition: compute_unit.hh:357
gem5::ComputeUnit::ComputeUnitStats::vectorMemWritesPerWF
statistics::Formula vectorMemWritesPerWF
Definition: compute_unit.hh:964
sim_exit.hh
gem5::HSAQueueEntry::numScalarRegs
int numScalarRegs() const
Definition: hsa_queue_entry.hh:143
gem5::isPowerOf2
static constexpr bool isPowerOf2(const T &n)
Definition: intmath.hh:98
output.hh
gem5::ComputeUnit::ComputeUnitStats::headTailLatency
statistics::Distribution headTailLatency
Definition: compute_unit.hh:1084
gem5::ComputeUnit::scalarDataPort
ScalarDataPort scalarDataPort
Definition: compute_unit.hh:851
gem5::ComputeUnit::ComputeUnitStats::threadCyclesVALU
statistics::Scalar threadCyclesVALU
Definition: compute_unit.hh:955
gem5::ComputeUnit::cu_id
int cu_id
Definition: compute_unit.hh:294
gem5::ComputeUnit::ComputeUnitStats::vectorMemReadsPerWF
statistics::Formula vectorMemReadsPerWF
Definition: compute_unit.hh:966
gem5::statistics::DistBase::sample
void sample(const U &v, int n=1)
Add a value to the distribution n times.
Definition: statistics.hh:1325
gem5::ComputeUnit::vrf
std::vector< VectorRegisterFile * > vrf
Definition: compute_unit.hh:297
gem5::ComputeUnit::ComputeUnitStats::instCyclesVMemPerSimd
statistics::Vector instCyclesVMemPerSimd
Definition: compute_unit.hh:981
wavefront.hh
gem5::exitSimLoop
void exitSimLoop(const std::string &message, int exit_code, Tick when, Tick repeat, bool serialize)
Schedule an event to exit the simulation loop (returning to Python) at the end of the current cycle (...
Definition: sim_events.cc:88
gem5::TokenRequestPort::setTokenManager
void setTokenManager(TokenManager *_tokenManager)
Specify a token manager, which will handle tracking of tokens for a TokenRequestPort/ResponseRequestPo...
Definition: token_port.cc:74
gem5::ComputeUnit::SQCPort::recvReqRetry
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
Definition: compute_unit.cc:988
gem5::ComputeUnit::ComputeUnitStats::groupReads
statistics::Scalar groupReads
Definition: compute_unit.hh:994
gem5::GPUComputeDriver::setMtype
void setMtype(RequestPtr req)
Called by the compute units right before a request is issued to ruby.
Definition: gpu_compute_driver.cc:1010
gem5::ComputeUnit::ComputeUnitStats::vpc_f64
statistics::Formula vpc_f64
Definition: compute_unit.hh:1066
gem5::ComputeUnit::injectGlobalMemFence
void injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelMemSync, RequestPtr req=nullptr)
Definition: compute_unit.cc:1231
gem5::ComputeUnit::ScalarDataPort::computeUnit
ComputeUnit * computeUnit
Definition: compute_unit.hh:601
gem5::ComputeUnit::LDSPort::sendTimingReq
virtual bool sendTimingReq(PacketPtr pkt)
attempt to send this packet, either the port is already stalled, the request is nack'd and must stall...
Definition: compute_unit.cc:2037
gem5::ComputeUnit::locMemToVrfBus
WaitClass locMemToVrfBus
Definition: compute_unit.hh:231
gem5::MemCmd
Definition: packet.hh:75
gem5::Request::FLUSH_L2
@ FLUSH_L2
Definition: request.hh:310
gem5::statistics::pdf
const FlagsType pdf
Print the percent of the total that this entry represents.
Definition: info.hh:62
gem5::ComputeUnit::ComputeUnitStats::kernargMemInsts
statistics::Formula kernargMemInsts
Definition: compute_unit.hh:1005
gem5::ComputeUnit::ComputeUnitStats::flatVMemInstsPerWF
statistics::Formula flatVMemInstsPerWF
Definition: compute_unit.hh:960
gem5::Packet::dataStatic
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
Definition: packet.hh:1134
gem5::Wavefront::setStatus
void setStatus(status_e newStatus)
Definition: wavefront.cc:520
gem5::LdsState::canReserve
bool canReserve(uint32_t x_size) const
can this much space be reserved for a workgroup?
Definition: lds_state.hh:480
gem5::ComputeUnit::numScalarMemUnits
int numScalarMemUnits
Definition: compute_unit.hh:237
gem5::GPUDispatcher::updateWbCounter
bool updateWbCounter(int kern_id, int val=-1)
update the counter of outstanding wb requests for the kernel kern_id: kernel id val: +1/-1,...
Definition: dispatcher.cc:268
gem5::ComputeUnit::DTLBPort::recvReqRetry
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
Definition: compute_unit.cc:1631
gem5::ComputeUnit::ITLBPort::SenderState
SenderState is information carried along with the packet throughout the TLB hierarchy.
Definition: compute_unit.hh:741
gem5::ArmISA::j
Bitfield< 24 > j
Definition: misc_types.hh:57
gem5::X86ISA::GpuTLB::TranslationState
TLB TranslationState: this is currently somewhat of a bastardization of the usage of SenderState,...
Definition: gpu_tlb.hh:285
gem5::ComputeUnit
Definition: compute_unit.hh:203
gem5::ComputeUnit::ScalarDataPort::MemReqEvent::process
void process()
Definition: compute_unit.cc:1602
gem5::ComputeUnit::pageAccesses
pageDataStruct pageAccesses
Definition: compute_unit.hh:485
gem5::X86ISA::GpuTLB::TranslationState::hitLevel
int hitLevel
Definition: gpu_tlb.hh:309
gem5::HSAQueueEntry::MAX_DIM
const static int MAX_DIM
Definition: hsa_queue_entry.hh:312
gem5::ComputeUnit::ScalarDataPort::retries
std::deque< PacketPtr > retries
Definition: compute_unit.hh:598
gem5::OutputStream::stream
std::ostream * stream() const
Get the output underlying output stream.
Definition: output.hh:62
gem5::ComputeUnit::ScalarDataPort::SenderState::_gpuDynInst
GPUDynInstPtr _gpuDynInst
Definition: compute_unit.hh:577
gem5::X86ISA::GpuTLB::TranslationState::ports
std::vector< ResponsePort * > ports
Definition: gpu_tlb.hh:303
gem5::ComputeUnit::ComputeUnitStats::flatLDSInsts
statistics::Scalar flatLDSInsts
Definition: compute_unit.hh:961
gem5::ComputeUnit::numScalarALUs
int numScalarALUs
Definition: compute_unit.hh:250
gem5::statistics::VectorDistribution::init
VectorDistribution & init(size_type size, Counter min, Counter max, Counter bkt)
Initialize storage and parameters for this distribution.
Definition: statistics.hh:2275
gem5::ComputeUnit::numVectorALUs
int numVectorALUs
Definition: compute_unit.hh:246
vector_register_file.hh
gem5::Packet::isRead
bool isRead() const
Definition: packet.hh:582
gem5::LocalMemPipeline::exec
void exec()
Definition: local_memory_pipeline.cc:54
gem5::ComputeUnit::startWavefront
void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, HSAQueueEntry *task, int bar_id, bool fetchContext=false)
Definition: compute_unit.cc:311
gem5::WaitClass::init
void init(ClockedObject *_clockedObject, uint64_t _numStages=0)
Definition: misc.hh:78
gem5::ComputeUnit::ComputeUnitStats::privReads
statistics::Scalar privReads
Definition: compute_unit.hh:997
gem5::ComputeUnit::functionalTLB
bool functionalTLB
Definition: compute_unit.hh:347
gem5::ComputeUnit::numAtBarrier
int numAtBarrier(int bar_id)
Definition: compute_unit.cc:670
gem5::Request::INV_L1
@ INV_L1
Definition: request.hh:305
gem5::ComputeUnit::incNumAtBarrier
void incNumAtBarrier(int bar_id)
Definition: compute_unit.cc:663
gem5::statistics::Distribution::init
Distribution & init(Counter min, Counter max, Counter bkt)
Set the parameters of this distribution.
Definition: statistics.hh:2110
gem5::MemCmd::WriteResp
@ WriteResp
Definition: packet.hh:90
gem5::ComputeUnit::ComputeUnitStats::completedWfs
statistics::Scalar completedWfs
Definition: compute_unit.hh:1079
gem5::HSAQueueEntry::numVectorRegs
int numVectorRegs() const
Definition: hsa_queue_entry.hh:137
gem5::ComputeUnit::ComputeUnitStats::numVecOpsExecuted
statistics::Scalar numVecOpsExecuted
Definition: compute_unit.hh:1040
gem5::Named::name
virtual std::string name() const
Definition: named.hh:47
gem5::WFBarrier::InvalidID
static const int InvalidID
Definition: compute_unit.hh:99
gem5::ScheduleStage::init
void init()
Definition: schedule_stage.cc:78
gem5::ComputeUnit::decMaxBarrierCnt
void decMaxBarrierCnt(int bar_id)
Definition: compute_unit.cc:691
gem5::ComputeUnit::vectorSharedMemUnit
WaitClass vectorSharedMemUnit
Definition: compute_unit.hh:235
gem5::ComputeUnit::releaseWFsFromBarrier
void releaseWFsFromBarrier(int bar_id)
Definition: compute_unit.cc:706
gem5::ComputeUnit::ComputeUnitStats::activeLanesPerGMemInstrDist
statistics::Distribution activeLanesPerGMemInstrDist
Definition: compute_unit.hh:1069
DPRINTF
#define DPRINTF(x,...)
Definition: trace.hh:186
ADD_STAT
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
Definition: group.hh:75
gem5::ComputeUnit::scalarMemUnit
WaitClass scalarMemUnit
Definition: compute_unit.hh:243
gem5::Packet
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
Definition: packet.hh:283
gem5::ArmISA::d
Bitfield< 9 > d
Definition: misc_types.hh:63
gem5::ComputeUnit::execStage
ExecStage execStage
Definition: compute_unit.hh:285
gem5::ComputeUnit::ScalarDataPort::MemReqEvent::description
const char * description() const
Return a C string describing the event.
Definition: compute_unit.cc:1596
gem5::ComputeUnit::ComputeUnitStats::vALUInsts
statistics::Scalar vALUInsts
Definition: compute_unit.hh:949
gem5::probing::Packet
ProbePointArg< PacketInfo > Packet
Packet probe point.
Definition: mem.hh:109
gem5::ComputeUnit::ComputeUnitStats::instCyclesVALU
statistics::Scalar instCyclesVALU
Definition: compute_unit.hh:953
gem5::MipsISA::p
Bitfield< 0 > p
Definition: pra_constants.hh:326
gem5::Tick
uint64_t Tick
Tick count type.
Definition: types.hh:58
gem5::RequestPtr
std::shared_ptr< Request > RequestPtr
Definition: request.hh:92
gem5::ComputeUnit::tickEvent
EventFunctionWrapper tickEvent
Definition: compute_unit.hh:290
gem5::LocalMemPipeline::isLMRespFIFOWrRdy
bool isLMRespFIFOWrRdy() const
Definition: local_memory_pipeline.hh:70
gem5::MemCmd::ReadReq
@ ReadReq
Definition: packet.hh:86
gem5::RR
@ RR
Definition: compute_unit.hh:77
gem5::MemCmd::MemSyncReq
@ MemSyncReq
Definition: packet.hh:119
process.hh
gem5::ComputeUnit::globalMemoryPipe
GlobalMemPipeline globalMemoryPipe
Definition: compute_unit.hh:286
gem5::ComputeUnit::resetRegisterPool
void resetRegisterPool()
Definition: compute_unit.cc:412
gem5::ComputeUnit::registerManager
RegisterManager * registerManager
Definition: compute_unit.hh:280
gem5::ComputeUnit::ComputeUnitStats::numInstrExecuted
statistics::Scalar numInstrExecuted
Definition: compute_unit.hh:1035
gem5::ComputeUnit::ScalarDataPort::recvTimingResp
bool recvTimingResp(PacketPtr pkt) override
Receive a timing response from the peer.
Definition: compute_unit.cc:904
gem5::HSAQueueEntry::isInvDone
bool isInvDone() const
Is invalidate done?
Definition: hsa_queue_entry.hh:356
gem5::ComputeUnit::ITLBPort::recvReqRetry
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
Definition: compute_unit.cc:1775
gem5::ComputeUnit::ScalarDTLBPort::stallPort
void stallPort()
Definition: compute_unit.hh:711
gem5::GlobalMemPipeline::isGMReqFIFOWrRdy
bool isGMReqFIFOWrRdy(uint32_t pendReqs=0) const
Definition: global_memory_pipeline.hh:97
gem5::Wavefront::S_BARRIER
@ S_BARRIER
WF is stalled at a barrier.
Definition: wavefront.hh:94
gem5::ComputeUnit::DataPort::createMemReqEvent
EventFunctionWrapper * createMemReqEvent(PacketPtr pkt)
Definition: compute_unit.cc:1557
gem5::ComputeUnit::ComputeUnitStats::vectorMemInstsPerKiloInst
statistics::Formula vectorMemInstsPerKiloInst
Definition: compute_unit.hh:974
gem5::ComputeUnit::~ComputeUnit
~ComputeUnit()
Definition: compute_unit.cc:220
scalar_register_file.hh
gpu_dyn_inst.hh
page_size.hh
gem5::ComputeUnit::DTLBPort::SenderState::_gpuDynInst
GPUDynInstPtr _gpuDynInst
Definition: compute_unit.hh:670
gem5::HSAQueueEntry::wgSize
int wgSize(int dim) const
Definition: hsa_queue_entry.hh:123
gem5::ComputeUnit::activeWaves
int activeWaves
Definition: compute_unit.hh:943
gem5::ComputeUnit::ComputeUnitStats::numTimesWgBlockedDueVgprAlloc
statistics::Scalar numTimesWgBlockedDueVgprAlloc
Definition: compute_unit.hh:1074
gem5::RegisterManager::srfPoolMgrs
std::vector< PoolManager * > srfPoolMgrs
Definition: register_manager.hh:81
gem5::HSAQueueEntry::codeAddr
Addr codeAddr() const
Definition: hsa_queue_entry.hh:179
gem5::LdsChunk
this represents a slice of the overall LDS, intended to be associated with an individual workgroup
Definition: lds_state.hh:58
gem5::ComputeUnit::mapWaveToScalarMem
int mapWaveToScalarMem(Wavefront *w) const
Definition: compute_unit.cc:289
gpu_command_processor.hh
gem5::ComputeUnit::mapWaveToGlobalMem
int mapWaveToGlobalMem(Wavefront *w) const
Definition: compute_unit.cc:273
gem5::roundDown
static constexpr T roundDown(const T &val, const U &align)
This function is used to align addresses in memory.
Definition: intmath.hh:279
gem5::ComputeUnit::deleteFromPipeMap
void deleteFromPipeMap(Wavefront *w)
Definition: compute_unit.cc:509
gem5::ExecStage::init
void init()
Definition: exec_stage.cc:61
gem5::ComputeUnit::doFlush
void doFlush(GPUDynInstPtr gpuDynInst)
trigger flush operation in the cu
Definition: compute_unit.cc:404
gem5::ComputeUnit::DataPort::SenderState::port_index
PortID port_index
Definition: compute_unit.hh:521
gem5::ComputeUnit::init
virtual void init() override
init() is called after all C++ SimObjects have been created and all ports are connected.
Definition: compute_unit.cc:754
gem5::HSAQueueEntry::globalWgId
int globalWgId() const
Definition: hsa_queue_entry.hh:225
gem5::HSAQueueEntry::gridSize
int gridSize(int dim) const
Definition: hsa_queue_entry.hh:130
gem5::ComputeUnit::scalarALUs
std::vector< WaitClass > scalarALUs
Definition: compute_unit.hh:251
gem5::ComputeUnit::DataPort::SenderState::_gpuDynInst
GPUDynInstPtr _gpuDynInst
Definition: compute_unit.hh:520
gem5::ComputeUnit::memPort
std::vector< DataPort > memPort
The memory port for SIMD data accesses.
Definition: compute_unit.hh:847
gem5::OLDEST
@ OLDEST
Definition: compute_unit.hh:76
gem5::ComputeUnit::ComputeUnitStats::scalarMemReadsPerKiloInst
statistics::Formula scalarMemReadsPerKiloInst
Definition: compute_unit.hh:975
gem5::X86ISA::pf
Bitfield< 2 > pf
Definition: misc.hh:556
gem5::ComputeUnit::ComputeUnitStats::vectorMemReadsPerKiloInst
statistics::Formula vectorMemReadsPerKiloInst
Definition: compute_unit.hh:972
gem5::Packet::cmd
MemCmd cmd
The command field of the packet.
Definition: packet.hh:361
gem5::ComputeUnit::DTLBPort::SenderState::portIndex
PortID portIndex
Definition: compute_unit.hh:674
gem5::ComputeUnit::perLaneTLB
bool perLaneTLB
Definition: compute_unit.hh:331
gem5::ComputeUnit::lastMemUnit
int lastMemUnit() const
Definition: compute_unit.cc:248
gem5::LocalMemPipeline::isLMReqFIFOWrRdy
bool isLMReqFIFOWrRdy(uint32_t pendReqs=0) const
Definition: local_memory_pipeline.hh:76
gem5::ComputeUnit::ScalarDTLBPort::SenderState::_gpuDynInst
GPUDynInstPtr _gpuDynInst
Definition: compute_unit.hh:704
gem5::Request::KERNEL
@ KERNEL
The request should be marked with KERNEL.
Definition: request.hh:181
gem5::ComputeUnit::ComputeUnitStats::groupWrites
statistics::Scalar groupWrites
Definition: compute_unit.hh:995
gem5::Addr
uint64_t Addr
Address type. This will probably be moved somewhere else in the near future.
Definition: types.hh:147
gem5::ComputeUnit::ComputeUnitStats::globalWrites
statistics::Scalar globalWrites
Definition: compute_unit.hh:986
gem5::ComputeUnit::ComputeUnitStats::vALUInstsPerWF
statistics::Formula vALUInstsPerWF
Definition: compute_unit.hh:950
gem5::LdsState::increaseRefCounter
int increaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
use the dynamic wave id to create or just increase the reference count
Definition: lds_state.hh:297
gem5::Packet::senderState
SenderState * senderState
This packet's sender state.
Definition: packet.hh:534
gem5::ComputeUnit::ComputeUnitStats::numTimesWgBlockedDueSgprAlloc
statistics::Scalar numTimesWgBlockedDueSgprAlloc
Definition: compute_unit.hh:1076
gem5::ComputeUnit::ComputeUnitStats::numVecOpsExecutedF16
statistics::Scalar numVecOpsExecutedF16
Definition: compute_unit.hh:1042
gem5::ComputeUnit::barrierSlot
WFBarrier & barrierSlot(int bar_id)
Definition: compute_unit.hh:420
name
const std::string & name()
Definition: trace.cc:49
gem5::ComputeUnit::exitCallback
void exitCallback()
Definition: compute_unit.cc:1922
gem5::ComputeUnit::SQCPort::recvTimingResp
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
Definition: compute_unit.cc:981
gem5::ComputeUnit::ComputeUnitStats::privMemInsts
statistics::Formula privMemInsts
Definition: compute_unit.hh:999
gem5::ComputeUnit::mapWaveToScalarAlu
int mapWaveToScalarAlu(Wavefront *w) const
Definition: compute_unit.cc:255
gem5::GPUDynInstPtr
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition: misc.hh:51
gem5::ComputeUnit::hasDispResources
bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg)
Definition: compute_unit.cc:521
gem5::ComputeUnit::DataPort::computeUnit
ComputeUnit * computeUnit
Definition: compute_unit.hh:540
gem5::ComputeUnit::getFreeBarrierId
int getFreeBarrierId()
Definition: compute_unit.hh:427
gem5::ComputeUnit::wfSize
int wfSize() const
Definition: compute_unit.hh:396
gem5::ClockedObject
The ClockedObject class extends the SimObject with a clock and accessor functions to relate ticks to ...
Definition: clocked_object.hh:234
gem5::ComputeUnit::pipeMap
std::unordered_set< uint64_t > pipeMap
Definition: compute_unit.hh:278
gem5::ComputeUnit::LDSPort::recvReqRetry
virtual void recvReqRetry()
The bus is telling the port that there is now space, so retrying stalled requests should work now. This...
Definition: compute_unit.cc:2079
gem5::MemCmd::toString
const std::string & toString() const
Return the string to a cmd given by idx.
Definition: packet.hh:265
gem5::Shader::timingSim
bool timingSim
Definition: shader.hh:191
gem5::Process
Definition: process.hh:67
gem5::GPUDispatcher::notifyWgCompl
void notifyWgCompl(Wavefront *wf)
When an end program instruction detects that the last WF in a WG has completed it will call this meth...
Definition: dispatcher.cc:297
gem5::EventFunctionWrapper
Definition: eventq.hh:1115
gem5::ThreadContext::getProcessPtr
virtual Process * getProcessPtr()=0
gem5::Clocked::nextCycle
Tick nextCycle() const
Based on the clock of the object, determine the start tick of the first cycle that is at least one cy...
Definition: clocked_object.hh:213
gem5::ComputeUnit::updateInstStats
void updateInstStats(GPUDynInstPtr gpuDynInst)
Definition: compute_unit.cc:1805
gem5::ComputeUnit::ComputeUnitStats::numALUInstsExecuted
statistics::Formula numALUInstsExecuted
Definition: compute_unit.hh:1072
gem5::ComputeUnit::ComputeUnitStats::instCyclesLdsPerSimd
statistics::Vector instCyclesLdsPerSimd
Definition: compute_unit.hh:983
gem5::ComputeUnit::ComputeUnitStats::argReads
statistics::Scalar argReads
Definition: compute_unit.hh:988
gem5::ComputeUnit::ComputeUnitStats::globalMemInsts
statistics::Formula globalMemInsts
Definition: compute_unit.hh:987
gem5::ComputeUnit::ComputeUnitStats::wgBlockedDueLdsAllocation
statistics::Scalar wgBlockedDueLdsAllocation
Definition: compute_unit.hh:1031
gem5::ComputeUnit::LDSPort::recvTimingResp
virtual bool recvTimingResp(PacketPtr pkt)
get the result of packets sent to the LDS when they return
Definition: compute_unit.cc:2015
panic_if
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Definition: logging.hh:203
gem5::ComputeUnit::numVectorGlobalMemUnits
int numVectorGlobalMemUnits
Definition: compute_unit.hh:221
gem5::Wavefront::barrierId
void barrierId(int bar_id)
Definition: wavefront.cc:1416
gem5::ComputeUnit::Params
ComputeUnitParams Params
Definition: compute_unit.hh:292
gem5::Wavefront::S_RETURNING
@ S_RETURNING
Definition: wavefront.hh:70
gem5::ComputeUnit::ComputeUnitStats::ipc
statistics::Formula ipc
Definition: compute_unit.hh:1067
gem5::RegisterManager::allocateRegisters
void allocateRegisters(Wavefront *w, int vectorDemand, int scalarDemand)
Definition: register_manager.cc:124
gem5::ComputeUnit::updatePageDivergenceDist
void updatePageDivergenceDist(Addr addr)
Definition: compute_unit.cc:1911
gem5::X86ISA::GpuTLB::TranslationState::tlbEntry
TlbEntry * tlbEntry
Definition: gpu_tlb.hh:297
gem5::ComputeUnit::vectorRegsReserved
std::vector< int > vectorRegsReserved
Definition: compute_unit.hh:369
gem5::ComputeUnit::ComputeUnitStats::readonlyWrites
statistics::Scalar readonlyWrites
Definition: compute_unit.hh:1001
gem5::Shader::dispatcher
GPUDispatcher & dispatcher()
Definition: shader.cc:101
gem5::ComputeUnit::ComputeUnitStats::waveLevelParallelism
statistics::Distribution waveLevelParallelism
Definition: compute_unit.hh:1007
gem5::ComputeUnit::ComputeUnitStats::scalarMemWrites
statistics::Scalar scalarMemWrites
Definition: compute_unit.hh:967
gem5::ComputeUnit::ComputeUnitStats::controlFlowDivergenceDist
statistics::Distribution controlFlowDivergenceDist
Definition: compute_unit.hh:1068
gem5::ScheduleStage::exec
void exec()
Definition: schedule_stage.cc:92
gem5::ComputeUnit::ComputeUnitStats::vectorMemWrites
statistics::Scalar vectorMemWrites
Definition: compute_unit.hh:963
gem5::ComputeUnit::insertInPipeMap
void insertInPipeMap(Wavefront *w)
Definition: compute_unit.cc:500
gem5::ComputeUnit::ScalarDTLBPort::SenderState
Definition: compute_unit.hh:701
gem5::statistics::oneline
const FlagsType oneline
Print all values on a single line.
Definition: info.hh:72
gem5::GPUDispatcher::updateInvCounter
void updateInvCounter(int kern_id, int val=-1)
update the counter of outstanding inv requests for the kernel kern_id: kernel id val: +1/-1,...
Definition: dispatcher.cc:248
gem5::ComputeUnit::mapWaveToLocalMem
int mapWaveToLocalMem(Wavefront *w) const
Definition: compute_unit.cc:281
gem5::ComputeUnit::ldsPort
LDSPort ldsPort
The port to access the Local Data Store. Can be connected to an LDS object.
Definition: compute_unit.hh:836
gem5::MemCmd::ReadResp
@ ReadResp
Definition: packet.hh:87
gem5::ComputeUnit::ComputeUnitStats::flatLDSInstsPerWF
statistics::Formula flatLDSInstsPerWF
Definition: compute_unit.hh:962
gem5::WFBarrier
WF barrier slots.
Definition: compute_unit.hh:92
gem5::ComputeUnit::isDone
bool isDone() const
Definition: compute_unit.cc:1939
gem5::ComputeUnit::LDSPort::SenderState::getMemInst
GPUDynInstPtr getMemInst() const
Definition: compute_unit.hh:798
gem5::ComputeUnit::ComputeUnitStats::hitsPerTLBLevel
statistics::Vector hitsPerTLBLevel
Definition: compute_unit.hh:1016
gem5::floorLog2
static constexpr std::enable_if_t< std::is_integral< T >::value, int > floorLog2(T x)
Definition: intmath.hh:59
gem5::Shader::gpuCmdProc
GPUCommandProcessor & gpuCmdProc
Definition: shader.hh:226
gem5::ComputeUnit::maxBarrierCnt
int maxBarrierCnt(int bar_id)
Definition: compute_unit.cc:677
gem5::Shader::n_wf
int n_wf
Definition: shader.hh:205
gem5::ComputeUnit::scalarRegsReserved
std::vector< int > scalarRegsReserved
Definition: compute_unit.hh:371
gem5::ComputeUnit::fillKernelState
void fillKernelState(Wavefront *w, HSAQueueEntry *task)
Definition: compute_unit.cc:297
gem5::MemCmd::WriteReq
@ WriteReq
Definition: packet.hh:89
gem5::ComputeUnit::lds
LdsState & lds
Definition: compute_unit.hh:470
gem5::ComputeUnit::DTLBPort::SenderState
SenderState is information carried along with the packet throughout the TLB hierarchy.
Definition: compute_unit.hh:667
gem5::ComputeUnit::vrfToLocalMemPipeBus
WaitClass vrfToLocalMemPipeBus
Definition: compute_unit.hh:233
gem5::statistics::Group
Statistics container.
Definition: group.hh:93
gem5::ComputeUnit::ComputeUnitStats::execRateDist
statistics::Distribution execRateDist
Definition: compute_unit.hh:1038
gem5::ComputeUnit::tlbPort
std::vector< DTLBPort > tlbPort
Definition: compute_unit.hh:849
gem5::ComputeUnit::ComputeUnitStats::numVecOpsExecutedF32
statistics::Scalar numVecOpsExecutedF32
Definition: compute_unit.hh:1044
gem5::ComputeUnit::isVectorAluIdle
bool isVectorAluIdle(uint32_t simdId) const
Definition: compute_unit.cc:1976
gem5::ComputeUnit::numScalarRegsPerSimd
int numScalarRegsPerSimd
Definition: compute_unit.hh:375
gem5::ComputeUnit::vectorALUs
std::vector< WaitClass > vectorALUs
Definition: compute_unit.hh:247
gem5::ComputeUnit::sendScalarRequest
void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt)
Definition: compute_unit.cc:1204
gem5::ComputeUnit::countPages
bool countPages
Definition: compute_unit.hh:353
gem5::ComputeUnit::freeBarrierIds
std::unordered_set< int > freeBarrierIds
A set used to easily retrieve a free barrier ID.
Definition: compute_unit.hh:934
sc_core::SC_NONE
@ SC_NONE
Definition: sc_report.hh:50
gem5::Wavefront::instructionBuffer
std::deque< GPUDynInstPtr > instructionBuffer
Definition: wavefront.hh:111
gem5::ComputeUnit::ComputeUnitStats::ComputeUnitStats
ComputeUnitStats(statistics::Group *parent, int n_wf)
Definition: compute_unit.cc:2110
gem5::RegisterManager::canAllocateSgprs
bool canAllocateSgprs(int simdId, int nWfs, int demandPerWf)
Definition: register_manager.cc:117
gem5::ComputeUnit::scalarMemToSrfBus
WaitClass scalarMemToSrfBus
Definition: compute_unit.hh:239
gem5::MipsISA::k
Bitfield< 23 > k
Definition: dt_constants.hh:81
gem5::ComputeUnit::scalarDTLBPort
ScalarDTLBPort scalarDTLBPort
Definition: compute_unit.hh:853
gem5::ComputeUnit::ComputeUnitStats::pageDivergenceDist
statistics::Distribution pageDivergenceDist
Definition: compute_unit.hh:1023
gem5::Shader::max_valu_insts
int64_t max_valu_insts
Definition: shader.hh:229
gem5::ExecStage::exec
void exec()
Definition: exec_stage.cc:154
gem5::GPUDispatcher
Definition: dispatcher.hh:64
dispatcher.hh
DPRINTFN
#define DPRINTFN(...)
Definition: trace.hh:214
gem5::statistics::DataWrap::flags
Derived & flags(Flags _flags)
Set the flags and marks this stat to print at the end of simulation.
Definition: statistics.hh:355
gem5::MipsISA::vaddr
vaddr
Definition: pra_constants.hh:278
gem5::ComputeUnit::ComputeUnitStats::argWrites
statistics::Scalar argWrites
Definition: compute_unit.hh:989
gem5::ComputeUnit::ComputeUnitStats::vpc_f32
statistics::Formula vpc_f32
Definition: compute_unit.hh:1065
gem5::HSAQueueEntry::ldsSize
int ldsSize() const
Definition: hsa_queue_entry.hh:191
gem5::Packet::getAddr
Addr getAddr() const
Definition: packet.hh:781
gem5::EventBase::CPU_Tick_Pri
static const Priority CPU_Tick_Pri
CPU ticks must come after other associated CPU events (such as writebacks).
Definition: eventq.hh:204
gem5::RegisterManager::canAllocateVgprs
bool canAllocateVgprs(int simdId, int nWfs, int demandPerWf)
Definition: register_manager.cc:111
gem5::ComputeUnit::sendToLds
GEM5_NO_DISCARD bool sendToLds(GPUDynInstPtr gpuDynInst)
Send a general request to the LDS; make sure to look at the return value here as your request might be...
Definition: compute_unit.cc:1995
gem5::X86ISA::PageBytes
const Addr PageBytes
Definition: page_size.hh:49
gem5::registerExitCallback
void registerExitCallback(const std::function< void()> &callback)
Register an exit callback.
Definition: core.cc:146
gem5::Wavefront::getStatus
status_e getStatus()
Definition: wavefront.hh:139
gem5::ComputeUnit::ComputeUnitStats::dynamicLMemInstrCnt
statistics::Scalar dynamicLMemInstrCnt
Definition: compute_unit.hh:1028
gem5::ComputeUnit::scalarMemoryPipe
ScalarMemPipeline scalarMemoryPipe
Definition: compute_unit.hh:288
fatal_if
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
Definition: logging.hh:225
page_table.hh
gem5
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
Definition: decoder.cc:40
gem5::ComputeUnit::DataPort::recvTimingResp
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
Definition: compute_unit.cc:803
gem5::ComputeUnit::vectorGlobalMemUnit
WaitClass vectorGlobalMemUnit
Definition: compute_unit.hh:227
gem5::MemCmd::MemSyncResp
@ MemSyncResp
Definition: packet.hh:120
gem5::ScoreboardCheckStage::exec
void exec()
Definition: scoreboard_check_stage.cc:250
gem5::ComputeUnit::ComputeUnitStats::readonlyMemInsts
statistics::Formula readonlyMemInsts
Definition: compute_unit.hh:1002
gem5::ComputeUnit::ScalarDataPort::recvReqRetry
void recvReqRetry() override
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
Definition: compute_unit.cc:942
gem5::ComputeUnit::ComputeUnitStats::scalarMemReads
statistics::Scalar scalarMemReads
Definition: compute_unit.hh:969
gem5::ComputeUnit::ComputeUnitStats::totalCycles
statistics::Scalar totalCycles
Definition: compute_unit.hh:1062
gem5::statistics::VectorBase::init
Derived & init(size_type size)
Set this vector to have the given size.
Definition: statistics.hh:1037
gem5::ComputeUnit::dispWorkgroup
void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg)
Definition: compute_unit.cc:422
gem5::HSAQueueEntry::dispatchId
int dispatchId() const
Definition: hsa_queue_entry.hh:155
gem5::ArmISA::stride
Bitfield< 21, 20 > stride
Definition: misc_types.hh:446
gem5::ComputeUnit::ComputeUnitStats::tlbCycles
statistics::Scalar tlbCycles
Definition: compute_unit.hh:1012
gem5::ComputeUnit::mapWaveToScalarAluGlobalIdx
int mapWaveToScalarAluGlobalIdx(Wavefront *w) const
Definition: compute_unit.cc:266
gem5::ComputeUnit::gmTokenPort
GMTokenPort gmTokenPort
Definition: compute_unit.hh:507
gem5::LdsState::reserveSpace
LdsChunk * reserveSpace(const uint32_t dispatchId, const uint32_t wgId, const uint32_t size)
assign a parent and request this amount of space be set aside for this wgid
Definition: lds_state.hh:365
gem5::Shader::total_valu_insts
int64_t total_valu_insts
Definition: shader.hh:230
gem5::ComputeUnit::DataPort::recvReqRetry
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
Definition: compute_unit.cc:954
gem5::ComputeUnit::doInvalidate
void doInvalidate(RequestPtr req, int kernId)
trigger invalidate operation in the cu
Definition: compute_unit.cc:385
gem5::ComputeUnit::ComputeUnitStats::spillReads
statistics::Scalar spillReads
Definition: compute_unit.hh:991
gem5::WaitClass::rdy
bool rdy(Cycles cycles=Cycles(0)) const
Definition: misc.hh:95
gem5::ComputeUnit::allAtBarrier
bool allAtBarrier(int bar_id)
Definition: compute_unit.cc:656
gem5::ComputeUnit::numWfsToSched
std::vector< int > numWfsToSched
Number of WFs to schedule to each SIMD.
Definition: compute_unit.hh:366
gem5::ComputeUnit::ComputeUnitStats::ldsNoFlatInstsPerWF
statistics::Formula ldsNoFlatInstsPerWF
Definition: compute_unit.hh:958
gem5::GlobalMemPipeline::exec
void exec()
Definition: global_memory_pipeline.cc:109
gem5::KernelLaunchStaticInst
Definition: gpu_static_inst.hh:326
gem5::ComputeUnit::DataPort::SenderState
Definition: compute_unit.hh:518
gem5::Packet::getSize
unsigned getSize() const
Definition: packet.hh:791
gem5::Event::scheduled
bool scheduled() const
Determine if the current event is scheduled.
Definition: eventq.hh:465
gem5::ComputeUnit::ScalarDataPort::SenderState
Definition: compute_unit.hh:569
gem5::GlobalMemPipeline::handleResponse
void handleResponse(GPUDynInstPtr gpuDynInst)
This method handles responses sent to this GM pipeline by the CU.
Definition: global_memory_pipeline.cc:303
gem5::ComputeUnit::scheduleStage
ScheduleStage scheduleStage
Definition: compute_unit.hh:284
gem5::Wavefront::dropFetch
bool dropFetch
Definition: wavefront.hh:114
gem5::Shader::impl_kern_end_rel
int impl_kern_end_rel
Definition: shader.hh:197
panic
#define panic(...)
This implements a cprintf based panic() function.
Definition: logging.hh:177
gem5::GPUCommandProcessor::driver
GPUComputeDriver * driver()
Definition: gpu_command_processor.cc:231
gem5::ComputeUnit::DataPort::createMemRespEvent
EventFunctionWrapper * createMemRespEvent(PacketPtr pkt)
Definition: compute_unit.cc:1565
gem5::Clocked::clockPeriod
Tick clockPeriod() const
Definition: clocked_object.hh:217
gem5::ComputeUnit::requestorId
RequestorID requestorId()
Definition: compute_unit.hh:462
gem5::X86ISA::addr
Bitfield< 3 > addr
Definition: types.hh:84
gem5::ComputeUnit::ComputeUnitStats::scalarMemInstsPerKiloInst
statistics::Formula scalarMemInstsPerKiloInst
Definition: compute_unit.hh:977
gem5::ComputeUnit::ComputeUnitStats::vectorMemWritesPerKiloInst
statistics::Formula vectorMemWritesPerKiloInst
Definition: compute_unit.hh:973
gem5::SenderState
RubyTester::SenderState SenderState
Definition: Check.cc:40
gem5::ComputeUnit::numExeUnits
int numExeUnits() const
Definition: compute_unit.cc:233
gem5::Packet::getPtr
T * getPtr()
get a pointer to the data ptr.
Definition: packet.hh:1184
gem5::ComputeUnit::glbMemToVrfBus
WaitClass glbMemToVrfBus
Definition: compute_unit.hh:223
gem5::MipsISA::vpc
Bitfield< 1 > vpc
Definition: mt_constants.hh:44

Generated on Tue Sep 21 2021 12:25:23 for gem5 by doxygen 1.8.17