gem5  v22.1.0.0
compute_unit.cc
1 /*
2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from this
17  * software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include "gpu-compute/compute_unit.hh"
33 
34 #include <limits>
35 
38 #include "base/output.hh"
39 #include "debug/GPUDisp.hh"
40 #include "debug/GPUExec.hh"
41 #include "debug/GPUFetch.hh"
42 #include "debug/GPUMem.hh"
43 #include "debug/GPUPort.hh"
44 #include "debug/GPUPrefetch.hh"
45 #include "debug/GPUReg.hh"
46 #include "debug/GPURename.hh"
47 #include "debug/GPUSync.hh"
48 #include "debug/GPUTLB.hh"
54 #include "gpu-compute/shader.hh"
57 #include "gpu-compute/wavefront.hh"
58 #include "mem/page_table.hh"
59 #include "sim/process.hh"
60 #include "sim/sim_exit.hh"
61 
62 namespace gem5
63 {
64 
65 ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p),
66  numVectorGlobalMemUnits(p.num_global_mem_pipes),
67  numVectorSharedMemUnits(p.num_shared_mem_pipes),
68  numScalarMemUnits(p.num_scalar_mem_pipes),
69  numVectorALUs(p.num_SIMDs),
70  numScalarALUs(p.num_scalar_cores),
71  vrfToCoalescerBusWidth(p.vrf_to_coalescer_bus_width),
72  coalescerToVrfBusWidth(p.coalescer_to_vrf_bus_width),
73  registerManager(p.register_manager),
74  fetchStage(p, *this),
75  scoreboardCheckStage(p, *this, scoreboardCheckToSchedule),
76  scheduleStage(p, *this, scoreboardCheckToSchedule, scheduleToExecute),
77  execStage(p, *this, scheduleToExecute),
78  globalMemoryPipe(p, *this),
79  localMemoryPipe(p, *this),
80  scalarMemoryPipe(p, *this),
81  tickEvent([this]{ exec(); }, "Compute unit tick event",
82  false, Event::CPU_Tick_Pri),
83  cu_id(p.cu_id),
84  vrf(p.vector_register_file), srf(p.scalar_register_file),
85  simdWidth(p.simd_width),
86  spBypassPipeLength(p.spbypass_pipe_length),
87  dpBypassPipeLength(p.dpbypass_pipe_length),
88  scalarPipeStages(p.scalar_pipe_length),
89  operandNetworkLength(p.operand_network_length),
90  issuePeriod(p.issue_period),
91  vrf_gm_bus_latency(p.vrf_gm_bus_latency),
92  srf_scm_bus_latency(p.srf_scm_bus_latency),
93  vrf_lm_bus_latency(p.vrf_lm_bus_latency),
94  perLaneTLB(p.perLaneTLB), prefetchDepth(p.prefetch_depth),
95  prefetchStride(p.prefetch_stride), prefetchType(p.prefetch_prev_type),
96  debugSegFault(p.debugSegFault),
97  functionalTLB(p.functionalTLB), localMemBarrier(p.localMemBarrier),
98  countPages(p.countPages),
99  req_tick_latency(p.mem_req_latency * p.clk_domain->clockPeriod()),
100  resp_tick_latency(p.mem_resp_latency * p.clk_domain->clockPeriod()),
101  _requestorId(p.system->getRequestorId(this, "ComputeUnit")),
102  lds(*p.localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
103  ldsPort(csprintf("%s-port", name()), this),
104  scalarDataPort(csprintf("%s-port", name()), this),
105  scalarDTLBPort(csprintf("%s-port", name()), this),
106  sqcPort(csprintf("%s-port", name()), this),
107  sqcTLBPort(csprintf("%s-port", name()), this),
108  _cacheLineSize(p.system->cacheLineSize()),
109  _numBarrierSlots(p.num_barrier_slots),
110  globalSeqNum(0), wavefrontSize(p.wf_size),
111  scoreboardCheckToSchedule(p),
112  scheduleToExecute(p),
113  stats(this, p.n_wf)
114 {
115  // This is not currently supported and would require adding more handling
116  // for system vs. device memory requests on the functional paths, so we
117  // fatal immediately in the constructor if this configuration is seen.
118  fatal_if(functionalTLB && FullSystem,
119  "Functional TLB not supported in full-system GPU simulation");
120 
130  fatal_if(p.wf_size > std::numeric_limits<unsigned long long>::digits ||
131  p.wf_size <= 0,
132  "WF size is larger than the host can support");
133  fatal_if(!isPowerOf2(wavefrontSize),
134  "Wavefront size should be a power of 2");
135  // calculate how many cycles a vector load or store will need to transfer
136  // its data over the corresponding buses
137  numCyclesPerStoreTransfer =
138  (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
139  (double)vrfToCoalescerBusWidth);
140 
141  numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
142  / coalescerToVrfBusWidth;
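// Worked example (assuming a 64-lane wavefront and 32-byte buses): a store
// moves 64 * 4 = 256 bytes of 32-bit data, so it holds the VRF-to-coalescer
// bus for ceil(256 / 32) = 8 cycles; the load path is the same division over
// the coalescer-to-VRF bus width.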
143 
144  // Initialization: all WF slots are assumed STOPPED
145  idleWfs = p.n_wf * numVectorALUs;
146  lastVaddrWF.resize(numVectorALUs);
147  wfList.resize(numVectorALUs);
148 
149  wfBarrierSlots.resize(p.num_barrier_slots, WFBarrier());
150 
151  for (int i = 0; i < p.num_barrier_slots; ++i) {
152  freeBarrierIds.insert(i);
153  }
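// Each WFBarrier slot tracks the s_barrier state of one work-group; a free id
// is taken from freeBarrierIds when a multi-wavefront WG is dispatched
// (dispWorkgroup) and returned to the pool in releaseBarrier().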
154 
155  for (int j = 0; j < numVectorALUs; ++j) {
156  lastVaddrWF[j].resize(p.n_wf);
157 
158  for (int i = 0; i < p.n_wf; ++i) {
159  lastVaddrWF[j][i].resize(wfSize());
160 
161  wfList[j].push_back(p.wavefronts[j * p.n_wf + i]);
162  wfList[j][i]->setParent(this);
163 
164  for (int k = 0; k < wfSize(); ++k) {
165  lastVaddrWF[j][i][k] = 0;
166  }
167  }
168  }
169 
170  lastVaddrSimd.resize(numVectorALUs);
171 
172  for (int i = 0; i < numVectorALUs; ++i) {
173  lastVaddrSimd[i].resize(wfSize(), 0);
174  }
175 
176  lastVaddrCU.resize(wfSize());
177 
178  lds.setParent(this);
179 
180  if (p.execPolicy == "OLDEST-FIRST") {
181  exec_policy = EXEC_POLICY::OLDEST;
182  } else if (p.execPolicy == "ROUND-ROBIN") {
183  exec_policy = EXEC_POLICY::RR;
184  } else {
185  fatal("Invalid WF execution policy (CU)\n");
186  }
187 
188  for (int i = 0; i < p.port_memory_port_connection_count; ++i) {
189  memPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
190  }
191 
192  for (int i = 0; i < p.port_translation_port_connection_count; ++i) {
193  tlbPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
194  }
195 
196  // Setup tokens for response ports. The number of tokens in memPortTokens
197  // is the total token count for the entire vector port (i.e., this CU).
198  memPortTokens = new TokenManager(p.max_cu_tokens);
199 
200  registerExitCallback([this]() { exitCallback(); });
201 
202  lastExecCycle.resize(numVectorALUs, 0);
203 
204  for (int i = 0; i < vrf.size(); ++i) {
205  vrf[i]->setParent(this);
206  }
207  for (int i = 0; i < srf.size(); ++i) {
208  srf[i]->setParent(this);
209  }
210  numVecRegsPerSimd = vrf[0]->numRegs();
211  numScalarRegsPerSimd = srf[0]->numRegs();
212 
213  registerManager->setParent(this);
214 
215  activeWaves = 0;
216 
217  instExecPerSimd.resize(numVectorALUs, 0);
218 
219  // Calculate the number of bits to address a cache line
220  panic_if(!isPowerOf2(_cacheLineSize),
221  "Cache line size should be a power of two.");
222  cacheLineBits = floorLog2(_cacheLineSize);
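// e.g., a 64-byte cache line gives cacheLineBits = 6, i.e., bits [5:0] of an
// address select the byte within the line.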
223 }
224 
225 ComputeUnit::~ComputeUnit()
226 {
227  // Delete wavefront slots
228  for (int j = 0; j < numVectorALUs; ++j) {
229  for (int i = 0; i < shader->n_wf; ++i) {
230  delete wfList[j][i];
231  }
232  lastVaddrSimd[j].clear();
233  }
234  lastVaddrCU.clear();
235 }
236 
237 int
238 ComputeUnit::numExeUnits() const
239 {
240     return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits +
241         numVectorSharedMemUnits + numScalarMemUnits;
242 }
243 
244 // index into readyList of the first memory unit
245 int
246 ComputeUnit::firstMemUnit() const
247 {
248  return numVectorALUs + numScalarALUs;
249 }
250 
251 // index into readyList of the last memory unit
252 int
253 ComputeUnit::lastMemUnit() const
254 {
255  return numExeUnits() - 1;
256 }
257 
258 // index into scalarALUs vector of SALU used by the wavefront
259 int
260 ComputeUnit::mapWaveToScalarAlu(Wavefront *w) const
261 {
262  if (numScalarALUs == 1) {
263  return 0;
264  } else {
265  return w->simdId % numScalarALUs;
266  }
267 }
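// e.g., with 4 SIMDs and 2 scalar ALUs, wavefronts on SIMDs 0 and 2 share
// SALU 0 while those on SIMDs 1 and 3 share SALU 1.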
268 
269 // index into readyList of Scalar ALU unit used by wavefront
270 int
271 ComputeUnit::mapWaveToScalarAluGlobalIdx(Wavefront *w) const
272 {
273     return mapWaveToScalarAlu(w) + numVectorALUs;
274 }
275 
276 // index into readyList of Global Memory unit used by wavefront
277 int
278 ComputeUnit::mapWaveToGlobalMem(Wavefront *w) const
279 {
280  // TODO: FIXME if more than 1 GM pipe supported
281  return numVectorALUs + numScalarALUs;
282 }
283 
284 // index into readyList of Local Memory unit used by wavefront
285 int
286 ComputeUnit::mapWaveToLocalMem(Wavefront *w) const
287 {
288     // TODO: FIXME if more than 1 LM pipe supported
289     return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits;
290 }
291 
292 // index into readyList of Scalar Memory unit used by wavefront
293 int
294 ComputeUnit::mapWaveToScalarMem(Wavefront *w) const
295 {
296     // TODO: FIXME if more than 1 ScM pipe supported
297     return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits +
298         numVectorSharedMemUnits;
299 }
300 
301 void
302 ComputeUnit::fillKernelState(Wavefront *w, HSAQueueEntry *task)
303 {
304  w->resizeRegFiles(task->numVectorRegs(), task->numScalarRegs());
305  w->workGroupSz[0] = task->wgSize(0);
306  w->workGroupSz[1] = task->wgSize(1);
307  w->workGroupSz[2] = task->wgSize(2);
308  w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
309  w->gridSz[0] = task->gridSize(0);
310  w->gridSz[1] = task->gridSize(1);
311  w->gridSz[2] = task->gridSize(2);
312  w->computeActualWgSz(task);
313 }
314 
315 void
316 ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
317     HSAQueueEntry *task, int bar_id, bool fetchContext)
318 {
319  static int _n_wave = 0;
320 
321  VectorMask init_mask;
322  init_mask.reset();
323 
324  for (int k = 0; k < wfSize(); ++k) {
325  if (k + waveId * wfSize() < w->actualWgSzTotal)
326  init_mask[k] = 1;
327  }
328 
329  w->execMask() = init_mask;
330 
331  w->kernId = task->dispatchId();
332  w->wfId = waveId;
333  w->initMask = init_mask.to_ullong();
334 
335  if (bar_id > WFBarrier::InvalidID) {
336  w->barrierId(bar_id);
337  } else {
338  assert(!w->hasBarrier());
339  }
340 
341  for (int k = 0; k < wfSize(); ++k) {
342  w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
343  w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
344  w->actualWgSz[1];
345  w->workItemId[2][k] = (k + waveId * wfSize()) /
346  (w->actualWgSz[0] * w->actualWgSz[1]);
347 
348  w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
349  w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
350  w->workItemId[0][k];
351  }
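// Example of the mapping above: with actualWgSz = {4, 2, 1} and waveId 0,
// lane 5 gets workItemId (1, 1, 0) and workItemFlatId 5.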
352 
353  // WG state
354  w->wgId = task->globalWgId();
355  w->dispatchId = task->dispatchId();
356  w->workGroupId[0] = w->wgId % task->numWg(0);
357  w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1);
358  w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1));
359 
360  // set the wavefront context to have a pointer to this section of the LDS
361  w->ldsChunk = ldsChunk;
362 
363  [[maybe_unused]] int32_t refCount =
364  lds.increaseRefCounter(w->dispatchId, w->wgId);
365  DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
366  cu_id, w->wgId, refCount);
367 
368  w->instructionBuffer.clear();
369 
370  if (w->pendingFetch)
371  w->dropFetch = true;
372 
373  DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
374  "WF[%d][%d]. Ref cnt:%d\n", _n_wave, w->barrierId(), cu_id,
375  w->simdId, w->wfSlotId, refCount);
376 
377  w->initRegState(task, w->actualWgSzTotal);
378  w->start(_n_wave++, task->codeAddr());
379 
381  activeWaves++;
382 }
383 
389 void
390 ComputeUnit::doInvalidate(RequestPtr req, int kernId){
391     GPUDynInstPtr gpuDynInst
392         = std::make_shared<GPUDynInst>(this, nullptr,
393             new KernelLaunchStaticInst(), getAndIncSeqNum());
394 
395  // kern_id will be used in inv responses
396  gpuDynInst->kern_id = kernId;
397  // update contextId field
398  req->setContext(gpuDynInst->wfDynId);
399 
400  injectGlobalMemFence(gpuDynInst, true, req);
401 }
402 
408 void
409 ComputeUnit::doFlush(GPUDynInstPtr gpuDynInst) {
410     injectGlobalMemFence(gpuDynInst, true);
411 }
412 
413 // resetting SIMD register pools
414 // I couldn't think of any other place and
415 // I think it is needed in my implementation
416 void
417 ComputeUnit::resetRegisterPool()
418 {
419     for (int i=0; i<numVectorALUs; i++)
420     {
423     }
424 }
425 
426 void
427 ComputeUnit::dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg)
428 {
429  // If we aren't ticking, start it up!
430  if (!tickEvent.scheduled()) {
431  DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id);
432         schedule(tickEvent, nextCycle());
433     }
434 
435  // the kernel's invalidate must have finished before any wg dispatch
436  assert(task->isInvDone());
437 
438  // reserve the LDS capacity allocated to the work group
439  // disambiguated by the dispatch ID and workgroup ID, which should be
440  // globally unique
441  LdsChunk *ldsChunk = lds.reserveSpace(task->dispatchId(),
442  task->globalWgId(),
443  task->ldsSize());
444 
445  panic_if(!ldsChunk, "was not able to reserve space for this WG");
446 
447  // calculate the number of 32-bit vector registers required
448  // by each work item
449  int vregDemand = task->numVectorRegs();
450  int sregDemand = task->numScalarRegs();
451  int wave_id = 0;
452 
453  int barrier_id = WFBarrier::InvalidID;
454 
459  if (num_wfs_in_wg > 1) {
464  barrier_id = getFreeBarrierId();
465  auto &wf_barrier = barrierSlot(barrier_id);
466  assert(!wf_barrier.maxBarrierCnt());
467  assert(!wf_barrier.numAtBarrier());
468  wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);
469 
470  DPRINTF(GPUSync, "CU[%d] - Dispatching WG with barrier Id%d. "
471  "%d waves using this barrier.\n", cu_id, barrier_id,
472  num_wfs_in_wg);
473  }
474 
475  // Assign WFs according to numWfsToSched vector, which is computed by
476  // hasDispResources()
477  for (int j = 0; j < shader->n_wf; ++j) {
478  for (int i = 0; i < numVectorALUs; ++i) {
479  Wavefront *w = wfList[i][j];
480  // Check if this wavefront slot is available and there are WFs
481  // remaining to be dispatched to current SIMD:
482  // WF slot must be stopped and not waiting
483  // for a release to complete S_RETURNING
484  if (w->getStatus() == Wavefront::S_STOPPED &&
485  numWfsToSched[i] > 0) {
486  // decrement number of WFs awaiting dispatch to current SIMD
487  numWfsToSched[i] -= 1;
488 
489  fillKernelState(w, task);
490 
491  DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] "
492  "vregDemand[%d] sregDemand[%d]\n", i, j, w->wfDynId,
493  vregDemand, sregDemand);
494 
495  registerManager->allocateRegisters(w, vregDemand, sregDemand);
496 
497  startWavefront(w, wave_id, ldsChunk, task, barrier_id);
498  ++wave_id;
499  }
500  }
501  }
502 }
503 
504 void
505 ComputeUnit::insertInPipeMap(Wavefront *w)
506 {
507  panic_if(w->instructionBuffer.empty(),
508  "Instruction Buffer of WF%d can't be empty", w->wgId);
509  GPUDynInstPtr ii = w->instructionBuffer.front();
510  pipeMap.emplace(ii->seqNum());
511 }
512 
513 void
514 ComputeUnit::deleteFromPipeMap(Wavefront *w)
515 {
516  panic_if(w->instructionBuffer.empty(),
517  "Instruction Buffer of WF%d can't be empty", w->wgId);
518  GPUDynInstPtr ii = w->instructionBuffer.front();
519  // delete the dynamic instruction from the pipeline map
520  auto it = pipeMap.find(ii->seqNum());
521  panic_if(it == pipeMap.end(), "Pipeline Map is empty\n");
522  pipeMap.erase(it);
523 }
524 
525 bool
526 ComputeUnit::hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg)
527 {
528  // compute true size of workgroup (after clamping to grid size)
529  int trueWgSize[HSAQueueEntry::MAX_DIM];
530  int trueWgSizeTotal = 1;
531 
532  for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
533  trueWgSize[d] = std::min(task->wgSize(d), task->gridSize(d) -
534  task->wgId(d) * task->wgSize(d));
535 
536  trueWgSizeTotal *= trueWgSize[d];
537  DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
538  }
539 
540  DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
541 
542  // calculate the number of WFs in this WG
543  int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
544  num_wfs_in_wg = numWfs;
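// This is a ceiling division: e.g., 100 work items with a 64-wide wavefront
// need (100 + 63) / 64 = 2 wavefronts.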
545 
546  bool barrier_avail = true;
547 
548  if (numWfs > 1 && !freeBarrierIds.size()) {
549  barrier_avail = false;
550  }
551 
552  // calculate the number of 32-bit vector registers required by each
553  // work item of the work group
554  int vregDemandPerWI = task->numVectorRegs();
555  // calculate the number of 32-bit scalar registers required by each
556  // work item of the work group
557  int sregDemandPerWI = task->numScalarRegs();
558 
559     // check if the total number of VGPRs and SGPRs required by all WFs
560  // of the WG fit in the VRFs of all SIMD units and the CU's SRF
561  panic_if((numWfs * vregDemandPerWI) > (numVectorALUs * numVecRegsPerSimd),
562  "WG with %d WFs and %d VGPRs per WI can not be allocated to CU "
563  "that has %d VGPRs\n",
564  numWfs, vregDemandPerWI, numVectorALUs * numVecRegsPerSimd);
565  panic_if((numWfs * sregDemandPerWI) > numScalarRegsPerSimd,
566  "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU "
567  "with %d SGPRs\n",
568  numWfs, sregDemandPerWI, numScalarRegsPerSimd);
569 
570  // number of WF slots that are not occupied
571  int freeWfSlots = 0;
572  // number of Wfs from WG that were successfully mapped to a SIMD
573  int numMappedWfs = 0;
574  numWfsToSched.clear();
575  numWfsToSched.resize(numVectorALUs, 0);
576 
577  // attempt to map WFs to the SIMDs, based on WF slot availability
578  // and register file availability
579  for (int j = 0; j < shader->n_wf; ++j) {
580  for (int i = 0; i < numVectorALUs; ++i) {
581  if (wfList[i][j]->getStatus() == Wavefront::S_STOPPED) {
582  ++freeWfSlots;
583  // check if current WF will fit onto current SIMD/VRF
584  // if all WFs have not yet been mapped to the SIMDs
585             if (numMappedWfs < numWfs &&
586                 registerManager->canAllocateSgprs(i, numWfsToSched[i] + 1,
587                     sregDemandPerWI) &&
588                 registerManager->canAllocateVgprs(i, numWfsToSched[i] + 1,
589                     vregDemandPerWI)) {
590  numWfsToSched[i]++;
591  numMappedWfs++;
592  }
593  }
594  }
595  }
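// The mapping above is a greedy round-robin over the SIMDs: every stopped WF
// slot on SIMD i claims one of the remaining WFs, provided that SIMD's SRF
// and VRF can still hold one more WF of this WG.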
596 
597  // check that the number of mapped WFs is not greater
598  // than the actual number of WFs
599  assert(numMappedWfs <= numWfs);
600 
601  bool vregAvail = true;
602  bool sregAvail = true;
603  // if a WF to SIMD mapping was not found, find the limiting resource
604  if (numMappedWfs < numWfs) {
605 
606  for (int j = 0; j < numVectorALUs; ++j) {
607  // find if there are enough free VGPRs in the SIMD's VRF
608     // to accommodate the WFs of the new WG that would be mapped
609  // to this SIMD unit
610  vregAvail &= registerManager->
611  canAllocateVgprs(j, numWfsToSched[j], vregDemandPerWI);
612  // find if there are enough free SGPRs in the SIMD's SRF
613     // to accommodate the WFs of the new WG that would be mapped
614  // to this SIMD unit
615  sregAvail &= registerManager->
616  canAllocateSgprs(j, numWfsToSched[j], sregDemandPerWI);
617  }
618  }
619 
620  DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, \
621  VGPR Availability = %d, SGPR Availability = %d\n",
622  freeWfSlots, numMappedWfs, vregAvail, sregAvail);
623 
624  if (!vregAvail) {
626  }
627 
628  if (!sregAvail) {
630  }
631 
632  // Return true if enough WF slots to submit workgroup and if there are
633  // enough VGPRs to schedule all WFs to their SIMD units
634  bool ldsAvail = lds.canReserve(task->ldsSize());
635  if (!ldsAvail) {
637  }
638 
639  if (!barrier_avail) {
641  }
642 
643  // Return true if the following are all true:
644  // (a) all WFs of the WG were mapped to free WF slots
645  // (b) there are enough VGPRs to schedule all WFs to their SIMD units
646  // (c) there are enough SGPRs on the CU to schedule all WFs
647  // (d) there is enough space in LDS to allocate for all WFs
648  bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
649  && ldsAvail && barrier_avail;
650  return can_dispatch;
651 }
652 
653 int
654 ComputeUnit::numYetToReachBarrier(int bar_id)
655 {
656  auto &wf_barrier = barrierSlot(bar_id);
657  return wf_barrier.numYetToReachBarrier();
658 }
659 
660 bool
661 ComputeUnit::allAtBarrier(int bar_id)
662 {
663  auto &wf_barrier = barrierSlot(bar_id);
664  return wf_barrier.allAtBarrier();
665 }
666 
667 void
668 ComputeUnit::incNumAtBarrier(int bar_id)
669 {
670  auto &wf_barrier = barrierSlot(bar_id);
671  wf_barrier.incNumAtBarrier();
672 }
673 
674 int
675 ComputeUnit::numAtBarrier(int bar_id)
676 {
677  auto &wf_barrier = barrierSlot(bar_id);
678  return wf_barrier.numAtBarrier();
679 }
680 
681 int
682 ComputeUnit::maxBarrierCnt(int bar_id)
683 {
684  auto &wf_barrier = barrierSlot(bar_id);
685  return wf_barrier.maxBarrierCnt();
686 }
687 
688 void
689 ComputeUnit::resetBarrier(int bar_id)
690 {
691  auto &wf_barrier = barrierSlot(bar_id);
692  wf_barrier.reset();
693 }
694 
695 void
696 ComputeUnit::decMaxBarrierCnt(int bar_id)
697 {
698  auto &wf_barrier = barrierSlot(bar_id);
699  wf_barrier.decMaxBarrierCnt();
700 }
701 
702 void
703 ComputeUnit::releaseBarrier(int bar_id)
704 {
705  auto &wf_barrier = barrierSlot(bar_id);
706  wf_barrier.release();
707  freeBarrierIds.insert(bar_id);
708 }
709 
710 void
711 ComputeUnit::releaseWFsFromBarrier(int bar_id)
712 {
713  for (int i = 0; i < numVectorALUs; ++i) {
714  for (int j = 0; j < shader->n_wf; ++j) {
715  Wavefront *wf = wfList[i][j];
716  if (wf->barrierId() == bar_id) {
717                 assert(wf->getStatus() == Wavefront::S_BARRIER);
718                 wf->setStatus(Wavefront::S_RUNNING);
719             }
720  }
721  }
722 }
723 
724 // Execute one clock worth of work on the ComputeUnit.
725 void
726 ComputeUnit::exec()
727 {
728  // process reads and writes in the RFs
729  for (auto &vecRegFile : vrf) {
730  vecRegFile->exec();
731  }
732 
733  for (auto &scRegFile : srf) {
734  scRegFile->exec();
735  }
736 
737     // Execute pipeline stages in reverse order to simulate
738     // the pipeline latency
739     scalarMemoryPipe.exec();
740     globalMemoryPipe.exec();
741     localMemoryPipe.exec();
742     execStage.exec();
743     scheduleStage.exec();
744     scoreboardCheckStage.exec();
745     fetchStage.exec();
746 
747  stats.totalCycles++;
748 
749  // Put this CU to sleep if there is no more work to be done.
750     if (!isDone()) {
751         schedule(tickEvent, nextCycle());
752     } else {
753         shader->notifyCuSleep();
754         DPRINTF(GPUDisp, "CU%d: Going to sleep\n", cu_id);
755  }
756 }
757 
758 void
759 ComputeUnit::init()
760 {
761  // Initialize CU Bus models and execution resources
762 
763  // Vector ALUs
764  vectorALUs.clear();
765  for (int i = 0; i < numVectorALUs; i++) {
766  vectorALUs.emplace_back(this, clockPeriod());
767  }
768 
769  // Scalar ALUs
770  scalarALUs.clear();
771  for (int i = 0; i < numScalarALUs; i++) {
772  scalarALUs.emplace_back(this, clockPeriod());
773  }
774 
775     // Vector Global Memory
776     fatal_if(numVectorGlobalMemUnits > 1,
777         "No support for multiple Global Memory Pipelines exists!!!");
781 
782     // Vector Local/Shared Memory
783     fatal_if(numVectorSharedMemUnits > 1,
784         "No support for multiple Local Memory Pipelines exists!!!");
788 
789     // Scalar Memory
790     fatal_if(numScalarMemUnits > 1,
791         "No support for multiple Scalar Memory Pipelines exists!!!");
792     scalarMemUnit.init(this, clockPeriod());
795 
798 
799  fetchStage.init();
801  execStage.init();
803 
805 }
806 
807 bool
808 ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
809 {
810  return handleResponse(pkt);
811 }
812 
813 bool
814 ComputeUnit::DataPort::handleResponse(PacketPtr pkt)
815 {
816  // Ruby has completed the memory op. Schedule the mem_resp_event at the
817  // appropriate cycle to process the timing memory response
818  // This delay represents the pipeline delay
819  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
820  PortID index = sender_state->port_index;
821  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
822  GPUDispatcher &dispatcher = computeUnit->shader->dispatcher();
823 
824  // MemSyncResp + WriteAckResp are handled completely here and we don't
825  // schedule a MemRespEvent to process the responses further
826  if (pkt->cmd == MemCmd::MemSyncResp) {
827  // This response is for 1 of the following request types:
828  // - kernel launch
829  // - kernel end
830  // - non-kernel mem sync
831 
832  // Kernel Launch
833  // wavefront was nullptr when launching kernel, so it is meaningless
834  // here (simdId=-1, wfSlotId=-1)
835  if (gpuDynInst->isKernelLaunch()) {
836  // for kernel launch, the original request must be both kernel-type
837  // and INV_L1
838  assert(pkt->req->isKernel());
839  assert(pkt->req->isInvL1());
840 
841  // one D-Cache inv is done, decrement counter
842  dispatcher.updateInvCounter(gpuDynInst->kern_id);
843 
844  delete pkt->senderState;
845  delete pkt;
846  return true;
847  }
848 
849  // retrieve wavefront from inst
850  Wavefront *w = gpuDynInst->wavefront();
851 
852  // Check if we are waiting on Kernel End Flush
853  if (w->getStatus() == Wavefront::S_RETURNING
854  && gpuDynInst->isEndOfKernel()) {
855  // for kernel end, the original request must be both kernel-type
856  // and last-level GPU cache should be flushed if it contains
857  // dirty data. This request may have been quiesced and
858  // immediately responded to if the GL2 is a write-through /
859  // read-only cache.
860  assert(pkt->req->isKernel());
861  assert(pkt->req->isGL2CacheFlush());
862 
863  // once flush done, decrement counter, and return whether all
864  // dirty writeback operations are done for the kernel
865  bool isWbDone = dispatcher.updateWbCounter(gpuDynInst->kern_id);
866 
867  // not all wbs are done for the kernel, just release pkt
868  // resources
869  if (!isWbDone) {
870  delete pkt->senderState;
871  delete pkt;
872  return true;
873  }
874 
875  // all wbs are completed for the kernel, do retirement work
876  // for the workgroup
877  DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
878  computeUnit->cu_id, w->simdId, w->wfSlotId,
879  w->wfDynId, w->wgId);
880 
881  dispatcher.notifyWgCompl(w);
882  w->setStatus(Wavefront::S_STOPPED);
883  }
884 
885  if (!pkt->req->isKernel()) {
886  w = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
887  DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing "
888  "outstanding reqs %d => %d\n", gpuDynInst->simdId,
889  gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
890  gpuDynInst->disassemble(), w->outstandingReqs,
891  w->outstandingReqs - 1);
892  computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
893  }
894 
895  delete pkt->senderState;
896  delete pkt;
897  return true;
898  }
899 
900  EventFunctionWrapper *mem_resp_event =
901  computeUnit->memPort[index].createMemRespEvent(pkt);
902 
903  DPRINTF(GPUPort,
904  "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
905  computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
906  gpuDynInst->seqNum(), index, pkt->req->getPaddr());
907 
908  computeUnit->schedule(mem_resp_event,
909  curTick() + computeUnit->resp_tick_latency);
910 
911  return true;
912 }
913 
914 bool
915 ComputeUnit::ScalarDataPort::recvTimingResp(PacketPtr pkt)
916 {
917  return handleResponse(pkt);
918 }
919 
920 bool
921 ComputeUnit::ScalarDataPort::handleResponse(PacketPtr pkt)
922 {
923  assert(!pkt->req->isKernel());
924 
925  // retrieve sender state
926  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
927  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
928 
929  assert(pkt->isRead() || pkt->isWrite());
930  assert(gpuDynInst->numScalarReqs > 0);
931 
932  gpuDynInst->numScalarReqs--;
933 
942  if (!gpuDynInst->numScalarReqs) {
943  if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
944  computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
945  gpuDynInst);
946  } else {
947  computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
948  gpuDynInst);
949  }
950  }
951 
952  delete pkt->senderState;
953  delete pkt;
954 
955  return true;
956 }
957 
958 void
959 ComputeUnit::ScalarDataPort::recvReqRetry()
960 {
961  for (const auto &pkt : retries) {
962  if (!sendTimingReq(pkt)) {
963  break;
964  } else {
965  retries.pop_front();
966  }
967  }
968 }
969 
970 void
971 ComputeUnit::DataPort::recvReqRetry()
972 {
973  int len = retries.size();
974 
975  assert(len > 0);
976 
977  for (int i = 0; i < len; ++i) {
978  PacketPtr pkt = retries.front().first;
979  [[maybe_unused]] GPUDynInstPtr gpuDynInst = retries.front().second;
980  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
981  computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
982  pkt->req->getPaddr());
983 
987  if (!sendTimingReq(pkt)) {
988  DPRINTF(GPUMem, "failed again!\n");
989  break;
990  } else {
991  DPRINTF(GPUMem, "successful!\n");
992  retries.pop_front();
993  }
994  }
995 }
996 
997 bool
998 ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
999 {
1000  computeUnit->handleSQCReturn(pkt);
1001 
1002  return true;
1003 }
1004 
1005 void
1006 ComputeUnit::handleSQCReturn(PacketPtr pkt)
1007 {
1008     fetchStage.processFetchReturn(pkt);
1009 }
1010 
1011 void
1012 ComputeUnit::SQCPort::recvReqRetry()
1013 {
1014  int len = retries.size();
1015 
1016  assert(len > 0);
1017 
1018  for (int i = 0; i < len; ++i) {
1019  PacketPtr pkt = retries.front().first;
1020  [[maybe_unused]] Wavefront *wavefront = retries.front().second;
1021  DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
1022  computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
1023  pkt->req->getPaddr());
1024  if (!sendTimingReq(pkt)) {
1025  DPRINTF(GPUFetch, "failed again!\n");
1026  break;
1027  } else {
1028  DPRINTF(GPUFetch, "successful!\n");
1029  retries.pop_front();
1030  }
1031  }
1032 }
1033 
1034 void
1035 ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
1036 {
1037  // There must be a way around this check to do the globalMemStart...
1038  Addr tmp_vaddr = pkt->req->getVaddr();
1039 
1040  updatePageDivergenceDist(tmp_vaddr);
1041 
1042  // set PC in request
1043  pkt->req->setPC(gpuDynInst->wavefront()->pc());
1044 
1045  pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());
1046 
1047  // figure out the type of the request to set read/write
1048  BaseMMU::Mode TLB_mode;
1049  assert(pkt->isRead() || pkt->isWrite());
1050 
1051  // only do some things if actually accessing data
1052  bool isDataAccess = pkt->isWrite() || pkt->isRead();
1053 
1054  // For dGPUs, real hardware will extract MTYPE from the PTE. SE mode
1055  // uses x86 pagetables which don't have fields to track GPU MTYPEs.
1056  // Rather than hacking up the pagetable to add these bits in, we just
1057  // keep a structure local to our GPUs that are populated in our
1058  // emulated driver whenever memory is allocated. Consult that structure
1059  // here in case we need a memtype override.
1060  //
1061  // In full system mode these can be extracted from the PTE and assigned
1062  // after address translation takes place.
1063  if (!FullSystem) {
1064  shader->gpuCmdProc.driver()->setMtype(pkt->req);
1065  }
1066 
1067  // Check write before read for atomic operations
1068  // since atomic operations should use BaseMMU::Write
1069  if (pkt->isWrite()) {
1070  TLB_mode = BaseMMU::Write;
1071  } else if (pkt->isRead()) {
1072  TLB_mode = BaseMMU::Read;
1073  } else {
1074  fatal("pkt is not a read nor a write\n");
1075  }
1076 
1077  stats.tlbCycles -= curTick();
1078  ++stats.tlbRequests;
1079 
1080  PortID tlbPort_index = perLaneTLB ? index : 0;
1081 
1082  if (shader->timingSim) {
1083     if (!FullSystem && debugSegFault) {
1084         Process *p = shader->gpuTc->getProcessPtr();
1085         Addr vaddr = pkt->req->getVaddr();
1086  unsigned size = pkt->getSize();
1087 
1088  if ((vaddr + size - 1) % 64 < vaddr % 64) {
1089  panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
1090  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
1091  }
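// The check above catches accesses that straddle a 64-byte boundary: if the
// offset of the last byte within its 64-byte chunk is smaller than that of
// the first byte, the access wrapped into the next chunk.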
1092 
1093  Addr paddr;
1094 
1095  if (!p->pTable->translate(vaddr, paddr)) {
1096  if (!p->fixupFault(vaddr)) {
1097  panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
1098  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1099  vaddr);
1100  }
1101  }
1102  }
1103 
1104  // This is the SenderState needed upon return
1105  pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);
1106 
1107  // This is the senderState needed by the TLB hierarchy to function
1108  GpuTranslationState *translation_state =
1109  new GpuTranslationState(TLB_mode, shader->gpuTc, false,
1110  pkt->senderState);
1111 
1112  pkt->senderState = translation_state;
1113 
1114  if (functionalTLB) {
1115  tlbPort[tlbPort_index].sendFunctional(pkt);
1116 
1117  // update the hitLevel distribution
1118  int hit_level = translation_state->hitLevel;
1119  assert(hit_level != -1);
1120  stats.hitsPerTLBLevel[hit_level]++;
1121 
1122  // New SenderState for the memory access
1123  GpuTranslationState *sender_state =
1124  safe_cast<GpuTranslationState*>(pkt->senderState);
1125 
1126  delete sender_state->tlbEntry;
1127  delete sender_state->saved;
1128  delete sender_state;
1129 
1130  assert(pkt->req->hasPaddr());
1131  assert(pkt->req->hasSize());
1132 
1133  // this is necessary because the GPU TLB receives packets instead
1134     // of requests. When the translation is complete, all relevant
1135  // fields in the request will be populated, but not in the packet.
1136  // here we create the new packet so we can set the size, addr,
1137  // and proper flags.
1138  PacketPtr oldPkt = pkt;
1139  pkt = new Packet(oldPkt->req, oldPkt->cmd);
1140  if (isDataAccess) {
1141  uint8_t *tmpData = oldPkt->getPtr<uint8_t>();
1142  pkt->dataStatic(tmpData);
1143  }
1144  delete oldPkt;
1145 
1146 
1147  // New SenderState for the memory access
1148  pkt->senderState =
1149  new ComputeUnit::DataPort::SenderState(gpuDynInst, index,
1150  nullptr);
1151 
1152  gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
1153  gpuDynInst->tlbHitLevel[index] = hit_level;
1154 
1155  // translation is done. Schedule the mem_req_event at the
1156  // appropriate cycle to send the timing memory request to ruby
1157  EventFunctionWrapper *mem_req_event =
1158  memPort[index].createMemReqEvent(pkt);
1159 
1160  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
1161  "scheduled\n", cu_id, gpuDynInst->simdId,
1162  gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
1163 
1164  schedule(mem_req_event, curTick() + req_tick_latency);
1165  } else if (tlbPort[tlbPort_index].isStalled()) {
1166  assert(tlbPort[tlbPort_index].retries.size() > 0);
1167 
1168  DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
1169  "failed!\n", cu_id, gpuDynInst->simdId,
1170  gpuDynInst->wfSlotId, tmp_vaddr);
1171 
1172  tlbPort[tlbPort_index].retries.push_back(pkt);
1173  } else if (!tlbPort[tlbPort_index].sendTimingReq(pkt)) {
1174  // Stall the data port;
1175  // No more packet will be issued till
1176  // ruby indicates resources are freed by
1177  // a recvReqRetry() call back on this port.
1178  tlbPort[tlbPort_index].stallPort();
1179 
1180  DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
1181  "failed!\n", cu_id, gpuDynInst->simdId,
1182  gpuDynInst->wfSlotId, tmp_vaddr);
1183 
1184  tlbPort[tlbPort_index].retries.push_back(pkt);
1185  } else {
1186  DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x from "
1187  "instruction %s sent!\n", cu_id, gpuDynInst->simdId,
1188  gpuDynInst->wfSlotId, tmp_vaddr,
1189  gpuDynInst->disassemble().c_str());
1190  }
1191  } else {
1192  if (pkt->cmd == MemCmd::MemSyncReq) {
1193  gpuDynInst->resetEntireStatusVector();
1194  } else {
1195  gpuDynInst->decrementStatusVector(index);
1196  }
1197 
1198  // New SenderState for the memory access
1199  delete pkt->senderState;
1200 
1201  // Because it's atomic operation, only need TLB translation state
1202  pkt->senderState = new GpuTranslationState(TLB_mode,
1203  shader->gpuTc);
1204 
1205  tlbPort[tlbPort_index].sendFunctional(pkt);
1206 
1207  // the addr of the packet is not modified, so we need to create a new
1208  // packet, or otherwise the memory access will have the old virtual
1209  // address sent in the translation packet, instead of the physical
1210  // address returned by the translation.
1211  PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
1212  new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1213 
1214  // Translation is done. It is safe to send the packet to memory.
1215  memPort[0].sendFunctional(new_pkt);
1216 
1217  DPRINTF(GPUMem, "Functional sendRequest\n");
1218  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
1219  gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
1220  new_pkt->req->getPaddr());
1221 
1222  // safe_cast the senderState
1223  GpuTranslationState *sender_state =
1224  safe_cast<GpuTranslationState*>(pkt->senderState);
1225 
1226  delete sender_state->tlbEntry;
1227  delete new_pkt;
1228  delete pkt->senderState;
1229  delete pkt;
1230  }
1231 }
1232 
1233 void
1234 ComputeUnit::sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt)
1235 {
1236  assert(pkt->isWrite() || pkt->isRead());
1237 
1238  BaseMMU::Mode tlb_mode = pkt->isRead() ? BaseMMU::Read : BaseMMU::Write;
1239 
1240     pkt->senderState =
1241         new ComputeUnit::ScalarDTLBPort::SenderState(gpuDynInst);
1242 
1243     pkt->senderState =
1243  pkt->senderState =
1244  new GpuTranslationState(tlb_mode, shader->gpuTc, false,
1245  pkt->senderState);
1246 
1247  if (scalarDTLBPort.isStalled()) {
1248  assert(scalarDTLBPort.retries.size());
1249  scalarDTLBPort.retries.push_back(pkt);
1250     } else if (!scalarDTLBPort.sendTimingReq(pkt)) {
1251         scalarDTLBPort.stallPort();
1252         scalarDTLBPort.retries.push_back(pkt);
1253  } else {
1254  DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n",
1255  tlb_mode == BaseMMU::Read ? "read" : "write",
1256  pkt->req->getVaddr());
1257  }
1258 }
1259 
1260 void
1261 ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
1262     bool kernelMemSync,
1263  RequestPtr req)
1264 {
1265  assert(gpuDynInst->isGlobalSeg() ||
1266  gpuDynInst->executedAs() == enums::SC_GLOBAL);
1267 
1268  // Fences will never be issued to system memory, so we can mark the
1269  // requestor as a device memory ID here.
1270  if (!req) {
1271  req = std::make_shared<Request>(
1272  0, 0, 0, vramRequestorId(), 0, gpuDynInst->wfDynId);
1273  } else {
1274  req->requestorId(vramRequestorId());
1275  }
1276 
1277  // all mem sync requests have Paddr == 0
1278  req->setPaddr(0);
1279 
1280  PacketPtr pkt = nullptr;
1281 
1282  if (kernelMemSync) {
1283  if (gpuDynInst->isKernelLaunch()) {
1284  req->setCacheCoherenceFlags(Request::INV_L1);
1285  req->setReqInstSeqNum(gpuDynInst->seqNum());
1286  req->setFlags(Request::KERNEL);
1287  pkt = new Packet(req, MemCmd::MemSyncReq);
1288  pkt->pushSenderState(
1289  new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1290 
1291  EventFunctionWrapper *mem_req_event =
1292  memPort[0].createMemReqEvent(pkt);
1293 
1294  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1295  "an acquire\n", cu_id, gpuDynInst->simdId,
1296  gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
1297 
1298  schedule(mem_req_event, curTick() + req_tick_latency);
1299  } else {
1300  // kernel end flush of GL2 cache may be quiesced by Ruby if the
1301  // GL2 is a read-only cache
1302  assert(shader->impl_kern_end_rel);
1303  assert(gpuDynInst->isEndOfKernel());
1304 
1305  req->setCacheCoherenceFlags(Request::FLUSH_L2);
1306  req->setReqInstSeqNum(gpuDynInst->seqNum());
1307  req->setFlags(Request::KERNEL);
1308  pkt = new Packet(req, MemCmd::MemSyncReq);
1309  pkt->pushSenderState(
1310  new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1311 
1312  EventFunctionWrapper *mem_req_event =
1313  memPort[0].createMemReqEvent(pkt);
1314 
1315  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1316  "a release\n", cu_id, gpuDynInst->simdId,
1317  gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
1318 
1319  schedule(mem_req_event, curTick() + req_tick_latency);
1320  }
1321  } else {
1322  gpuDynInst->setRequestFlags(req);
1323 
1324  req->setReqInstSeqNum(gpuDynInst->seqNum());
1325 
1326  pkt = new Packet(req, MemCmd::MemSyncReq);
1327  pkt->pushSenderState(
1328  new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1329 
1330  EventFunctionWrapper *mem_req_event =
1331  memPort[0].createMemReqEvent(pkt);
1332 
1333  DPRINTF(GPUPort,
1334  "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
1335  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
1336  pkt->req->getPaddr());
1337 
1338  schedule(mem_req_event, curTick() + req_tick_latency);
1339  }
1340 }
1341 
1342 void
1343 ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
1344 {
1345  DataPort::SenderState *sender_state =
1346  safe_cast<DataPort::SenderState*>(pkt->senderState);
1347 
1348  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1349  ComputeUnit *compute_unit = computeUnit;
1350 
1351  assert(gpuDynInst);
1352 
1353  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
1354  compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1355  pkt->req->getPaddr(), id);
1356 
1357  Addr paddr = pkt->req->getPaddr();
1358 
1359  // mem sync resp callback must be handled already in
1360  // DataPort::recvTimingResp
1361  assert(pkt->cmd != MemCmd::MemSyncResp);
1362 
1363  // The status vector and global memory response for WriteResp packets get
1364  // handled by the WriteCompleteResp packets.
1365  if (pkt->cmd == MemCmd::WriteResp) {
1366  if (!FullSystem || !pkt->req->systemReq()) {
1367  delete pkt;
1368  return;
1369  }
1370  }
1371 
1372  // this is for read, write and atomic
1373  int index = gpuDynInst->memStatusVector[paddr].back();
1374 
1375  DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
1376  pkt->req->getPaddr(), id);
1377 
1378  gpuDynInst->memStatusVector[paddr].pop_back();
1379  gpuDynInst->pAddr = pkt->req->getPaddr();
1380 
1381  gpuDynInst->decrementStatusVector(index);
1382  DPRINTF(GPUMem, "bitvector is now %s\n", gpuDynInst->printStatusVector());
1383 
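// memStatusVector maps each cache-line address to the memory-port indices
// with outstanding accesses to it; decrementStatusVector() updates the
// per-lane status bitvector, and only once all lanes are zero is the
// instruction handed to the global memory pipeline below.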
1384  if (gpuDynInst->allLanesZero()) {
1385  auto iter = gpuDynInst->memStatusVector.begin();
1386  auto end = gpuDynInst->memStatusVector.end();
1387 
1388  while (iter != end) {
1389  assert(iter->second.empty());
1390  ++iter;
1391  }
1392 
1393  // Calculate the difference between the arrival of the first cache
1394  // block and the last cache block to arrive if we have the time
1395  // for the first cache block.
1396  if (compute_unit->headTailMap.count(gpuDynInst)) {
1397  Tick headTick = compute_unit->headTailMap.at(gpuDynInst);
1398  compute_unit->stats.headTailLatency.sample(curTick() - headTick);
1399  compute_unit->headTailMap.erase(gpuDynInst);
1400  }
1401 
1402  gpuDynInst->memStatusVector.clear();
1403 
1404  gpuDynInst->
1405  profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue);
1406  compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
1407 
1408  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
1409  compute_unit->cu_id, gpuDynInst->simdId,
1410  gpuDynInst->wfSlotId);
1411  } else {
1412  if (pkt->isRead()) {
1413  if (!compute_unit->headTailMap.count(gpuDynInst)) {
1414  compute_unit->headTailMap
1415  .insert(std::make_pair(gpuDynInst, curTick()));
1416  }
1417  }
1418  }
1419 
1420  delete pkt->senderState;
1421  delete pkt;
1422 }
1423 
1424 bool
1425 ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
1426 {
1427  Addr line = pkt->req->getPaddr();
1428 
1429  DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
1430  pkt->req->getVaddr(), line);
1431 
1432  assert(pkt->senderState);
1433  computeUnit->stats.tlbCycles += curTick();
1434 
1435  // pop off the TLB translation state
1436  GpuTranslationState *translation_state =
1437  safe_cast<GpuTranslationState*>(pkt->senderState);
1438 
1439  // no PageFaults are permitted for data accesses
1440  if (!translation_state->tlbEntry) {
1441  DTLBPort::SenderState *sender_state =
1442  safe_cast<DTLBPort::SenderState*>(translation_state->saved);
1443 
1444  [[maybe_unused]] Wavefront *w =
1445  computeUnit->wfList[sender_state->_gpuDynInst->simdId]
1446  [sender_state->_gpuDynInst->wfSlotId];
1447 
1448     DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
1449  pkt->req->getVaddr());
1450  }
1451 
1452  // update the hitLevel distribution
1453  int hit_level = translation_state->hitLevel;
1454  computeUnit->stats.hitsPerTLBLevel[hit_level]++;
1455 
1456  delete translation_state->tlbEntry;
1457  assert(!translation_state->ports.size());
1458  pkt->senderState = translation_state->saved;
1459 
1460  // for prefetch pkt
1461  BaseMMU::Mode TLB_mode = translation_state->tlbMode;
1462 
1463  delete translation_state;
1464 
1465  // use the original sender state to know how to close this transaction
1466  DTLBPort::SenderState *sender_state =
1467  safe_cast<DTLBPort::SenderState*>(pkt->senderState);
1468 
1469  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1470  PortID mp_index = sender_state->portIndex;
1471  Addr vaddr = pkt->req->getVaddr();
1472  gpuDynInst->memStatusVector[line].push_back(mp_index);
1473  gpuDynInst->tlbHitLevel[mp_index] = hit_level;
1474 
1475  MemCmd requestCmd;
1476 
1477  if (pkt->cmd == MemCmd::ReadResp) {
1478  requestCmd = MemCmd::ReadReq;
1479  } else if (pkt->cmd == MemCmd::WriteResp) {
1480  requestCmd = MemCmd::WriteReq;
1481  } else if (pkt->cmd == MemCmd::SwapResp) {
1482  requestCmd = MemCmd::SwapReq;
1483  } else {
1484  panic("unsupported response to request conversion %s\n",
1485  pkt->cmd.toString());
1486  }
1487 
1488  if (computeUnit->prefetchDepth) {
1489  int simdId = gpuDynInst->simdId;
1490  int wfSlotId = gpuDynInst->wfSlotId;
1491  Addr last = 0;
1492 
1493  switch(computeUnit->prefetchType) {
1494  case enums::PF_CU:
1495  last = computeUnit->lastVaddrCU[mp_index];
1496  break;
1497  case enums::PF_PHASE:
1498  last = computeUnit->lastVaddrSimd[simdId][mp_index];
1499  break;
1500  case enums::PF_WF:
1501  last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
1502  default:
1503  break;
1504  }
1505 
1506  DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
1507  computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
1508 
1509         int stride = last ? (roundDown(vaddr, X86ISA::PageBytes) -
1510             roundDown(last, X86ISA::PageBytes)) >> X86ISA::PageShift
1511             : 0;
1512 
1513  DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
1514 
1515  computeUnit->lastVaddrCU[mp_index] = vaddr;
1516  computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
1517  computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
1518 
1519  stride = (computeUnit->prefetchType == enums::PF_STRIDE) ?
1520  computeUnit->prefetchStride: stride;
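// PF_STRIDE uses the statically configured prefetchStride; the other prefetch
// types reuse the page-granularity stride just observed between this access
// and the previous one recorded for the CU/SIMD/WF.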
1521 
1522  DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
1523  computeUnit->cu_id, simdId, wfSlotId, mp_index);
1524 
1525  DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);
1526 
1527  // Prefetch Next few pages atomically
1528  for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
1529             DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
1530                 vaddr + stride * pf * X86ISA::PageBytes);
1531 
1532  if (!stride)
1533  break;
1534 
1535             RequestPtr prefetch_req = std::make_shared<Request>(
1536                 vaddr + stride * pf * X86ISA::PageBytes,
1537                 sizeof(uint8_t), 0,
1538  computeUnit->requestorId(),
1539  0, 0, nullptr);
1540 
1541  PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
1542  uint8_t foo = 0;
1543  prefetch_pkt->dataStatic(&foo);
1544 
1545  // Because it's atomic operation, only need TLB translation state
1546  prefetch_pkt->senderState =
1547  new GpuTranslationState(TLB_mode,
1548  computeUnit->shader->gpuTc, true);
1549 
1550  // Currently prefetches are zero-latency, hence the sendFunctional
1551  sendFunctional(prefetch_pkt);
1552 
1553  /* safe_cast the senderState */
1554  GpuTranslationState *tlb_state =
1555  safe_cast<GpuTranslationState*>(
1556  prefetch_pkt->senderState);
1557 
1558 
1559  delete tlb_state->tlbEntry;
1560  delete tlb_state;
1561  delete prefetch_pkt;
1562  }
1563  }
1564 
1565  // First we must convert the response cmd back to a request cmd so that
1566  // the request can be sent through the cu's request port
1567  PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
1568  new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1569  delete pkt->senderState;
1570  delete pkt;
1571 
1572  // New SenderState for the memory access
1573  new_pkt->senderState =
1574  new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
1575  nullptr);
1576 
1577  // Set VRAM ID for device requests
1578  // For now, system vmem requests use functional reads. This is not that
1579  // critical to model as the region of interest should always be accessing
1580  // device memory. System vmem requests are used by blit kernels to do
1581  // memcpys and load code objects into device memory.
1582  if (new_pkt->req->systemReq()) {
1583  // There will be multiple packets returned for the same gpuDynInst,
1584  // so first check if systemReq is not already set and if so, return
1585  // the token acquired when the dispatch list is filled as system
1586  // requests do not require a GPU coalescer token.
1587  if (!gpuDynInst->isSystemReq()) {
1588  computeUnit->getTokenManager()->recvTokens(1);
1589  gpuDynInst->setSystemReq();
1590  }
1591  } else {
1592  new_pkt->req->requestorId(computeUnit->vramRequestorId());
1593  }
1594 
1595  // translation is done. Schedule the mem_req_event at the appropriate
1596  // cycle to send the timing memory request to ruby
1597  EventFunctionWrapper *mem_req_event =
1598  computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);
1599 
1600  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
1601  computeUnit->cu_id, gpuDynInst->simdId,
1602  gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());
1603 
1604  computeUnit->schedule(mem_req_event, curTick() +
1605  computeUnit->req_tick_latency);
1606 
1607  return true;
1608 }
1609 
1610 EventFunctionWrapper*
1611 ComputeUnit::DataPort::createMemReqEvent(PacketPtr pkt)
1612 {
1613  return new EventFunctionWrapper(
1614  [this, pkt]{ processMemReqEvent(pkt); },
1615  "ComputeUnit memory request event", true);
1616 }
1617 
1618 EventFunctionWrapper*
1619 ComputeUnit::DataPort::createMemRespEvent(PacketPtr pkt)
1620 {
1621  return new EventFunctionWrapper(
1622  [this, pkt]{ processMemRespEvent(pkt); },
1623  "ComputeUnit memory response event", true);
1624 }
1625 
1626 void
1627 ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt)
1628 {
1629  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1630  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1631  [[maybe_unused]] ComputeUnit *compute_unit = computeUnit;
1632 
1633  if (pkt->req->systemReq()) {
1634  assert(compute_unit->shader->systemHub);
1635  SystemHubEvent *resp_event = new SystemHubEvent(pkt, this);
1636  compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
1637  } else if (!(sendTimingReq(pkt))) {
1638  retries.push_back(std::make_pair(pkt, gpuDynInst));
1639 
1640  DPRINTF(GPUPort,
1641  "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
1642  compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1643  id, pkt->req->getPaddr());
1644  } else {
1645  DPRINTF(GPUPort,
1646  "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data "
1647  "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1648  gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id,
1649  pkt->req->getPaddr());
1650  }
1651 }
1652 
1653 const char*
1654 ComputeUnit::ScalarDataPort::MemReqEvent::description() const
1655 {
1656  return "ComputeUnit scalar memory request event";
1657 }
1658 
1659 void
1660 ComputeUnit::ScalarDataPort::MemReqEvent::process()
1661 {
1662  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1663  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1664  [[maybe_unused]] ComputeUnit *compute_unit = scalarDataPort.computeUnit;
1665 
1666  if (pkt->req->systemReq()) {
1667  assert(compute_unit->shader->systemHub);
1668  SystemHubEvent *resp_event = new SystemHubEvent(pkt, &scalarDataPort);
1669  compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
1670  } else if (!(scalarDataPort.sendTimingReq(pkt))) {
1671  scalarDataPort.retries.push_back(pkt);
1672 
1673  DPRINTF(GPUPort,
1674  "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
1675  compute_unit->cu_id, gpuDynInst->simdId,
1676  gpuDynInst->wfSlotId, pkt->req->getPaddr());
1677  } else {
1678  DPRINTF(GPUPort,
1679  "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data "
1680  "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1681  gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
1682  pkt->req->getPaddr());
1683  }
1684 }
1685 
1686 /*
1687  * The initial translation request could have been rejected, if
1688  * <retries> queue is not empty. Retry sending the translation
1689  * request. sendRetry() is called from the peer port whenever
1690  * a translation completes.
1691  */
1692 void
1693 ComputeUnit::DTLBPort::recvReqRetry()
1694 {
1695  int len = retries.size();
1696 
1697  DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
1698  computeUnit->cu_id, len);
1699 
1700  assert(len > 0);
1701  assert(isStalled());
1702  // recvReqRetry is an indication that the resource on which this
1703  // port was stalling on is freed. So, remove the stall first
1704  unstallPort();
1705 
1706  for (int i = 0; i < len; ++i) {
1707  PacketPtr pkt = retries.front();
1708  [[maybe_unused]] Addr vaddr = pkt->req->getVaddr();
1709     DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x", computeUnit->cu_id, vaddr);
1710 
1711  if (!sendTimingReq(pkt)) {
1712  // Stall port
1713  stallPort();
1714  DPRINTF(GPUTLB, ": failed again\n");
1715  break;
1716  } else {
1717  DPRINTF(GPUTLB, ": successful\n");
1718  retries.pop_front();
1719  }
1720  }
1721 }
1722 
1723 bool
1724 ComputeUnit::ScalarDTLBPort::recvTimingResp(PacketPtr pkt)
1725 {
1726  assert(pkt->senderState);
1727 
1728  GpuTranslationState *translation_state =
1729  safe_cast<GpuTranslationState*>(pkt->senderState);
1730 
1731  // Page faults are not allowed
1732  fatal_if(!translation_state->tlbEntry,
1733  "Translation of vaddr %#x failed\n", pkt->req->getVaddr());
1734 
1735  delete translation_state->tlbEntry;
1736  assert(!translation_state->ports.size());
1737 
1738  pkt->senderState = translation_state->saved;
1739  delete translation_state;
1740 
1741  ScalarDTLBPort::SenderState *sender_state =
1742  safe_cast<ScalarDTLBPort::SenderState*>(pkt->senderState);
1743 
1744  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1745  delete pkt->senderState;
1746 
1747  [[maybe_unused]] Wavefront *w = gpuDynInst->wavefront();
1748 
1749  DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
1750  "translation: PA %#x -> %#x\n", computeUnit->cu_id, w->simdId,
1751  w->wfSlotId, w->kernId, pkt->req->getVaddr(), pkt->req->getPaddr());
1752 
1753  MemCmd mem_cmd;
1754 
1755  if (pkt->cmd == MemCmd::ReadResp) {
1756  mem_cmd = MemCmd::ReadReq;
1757  } else if (pkt->cmd == MemCmd::WriteResp) {
1758  mem_cmd = MemCmd::WriteReq;
1759  } else {
1760     fatal("Scalar DTLB received unexpected MemCmd response %s\n",
1761  pkt->cmd.toString());
1762  }
1763 
1764  PacketPtr req_pkt = new Packet(pkt->req, mem_cmd);
1765  req_pkt->dataStatic(pkt->getPtr<uint8_t>());
1766  delete pkt;
1767 
1768     req_pkt->senderState =
1769         new ComputeUnit::ScalarDataPort::SenderState(gpuDynInst);
1770 
1771  // For a system request we want to mark the GPU instruction as a system
1772  // load/store so that after the request is issued to system memory we can
1773  // return any token acquired for the request. Since tokens are returned
1774  // by the coalescer and system requests do not take that path, this needs
1775  // to be tracked.
1776  //
1777  // Device requests change the requestor ID to something in the device
1778  // memory Ruby network.
1779  if (req_pkt->req->systemReq()) {
1780  gpuDynInst->setSystemReq();
1781  } else {
1782  req_pkt->req->requestorId(computeUnit->vramRequestorId());
1783  }
1784 
1785     ComputeUnit::ScalarDataPort::MemReqEvent *scalar_mem_req_event
1786         = new ComputeUnit::ScalarDataPort::MemReqEvent
1787             (computeUnit->scalarDataPort, req_pkt);
1788  computeUnit->schedule(scalar_mem_req_event, curTick() +
1789  computeUnit->req_tick_latency);
1790 
1791  return true;
1792 }
1793 
1794 bool
1795 ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
1796 {
1797  [[maybe_unused]] Addr line = pkt->req->getPaddr();
1798  DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
1799  computeUnit->cu_id, pkt->req->getVaddr(), line);
1800 
1801  assert(pkt->senderState);
1802 
1803  // pop off the TLB translation state
1804  GpuTranslationState *translation_state
1805  = safe_cast<GpuTranslationState*>(pkt->senderState);
1806 
1807  bool success = translation_state->tlbEntry != nullptr;
1808  delete translation_state->tlbEntry;
1809  assert(!translation_state->ports.size());
1810  pkt->senderState = translation_state->saved;
1811  delete translation_state;
1812 
1813  // use the original sender state to know how to close this transaction
1814  ITLBPort::SenderState *sender_state =
1815  safe_cast<ITLBPort::SenderState*>(pkt->senderState);
1816 
1817  // get the wavefront associated with this translation request
1818  Wavefront *wavefront = sender_state->wavefront;
1819  delete pkt->senderState;
1820 
1821  if (success) {
1822  // pkt is reused in fetch(), don't delete it here. However, we must
1823  // reset the command to be a request so that it can be sent through
1824  // the cu's request port
1825  assert(pkt->cmd == MemCmd::ReadResp);
1826  pkt->cmd = MemCmd::ReadReq;
1827 
1828  computeUnit->fetchStage.fetch(pkt, wavefront);
1829  } else {
1830  if (wavefront->dropFetch) {
1831  assert(wavefront->instructionBuffer.empty());
1832  wavefront->dropFetch = false;
1833  }
1834 
1835  wavefront->pendingFetch = 0;
1836  }
1837 
1838  return true;
1839 }
1840 
1841 /*
1842  * The initial translation request could have been rejected, if
1843  * <retries> queue is not empty. Retry sending the translation
1844  * request. sendRetry() is called from the peer port whenever
1845  * a translation completes.
1846  */
1847 void
1848 ComputeUnit::ITLBPort::recvReqRetry()
1849 {
1850 
1851  int len = retries.size();
1852     DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n", computeUnit->cu_id, len);
1853 
1854  assert(len > 0);
1855  assert(isStalled());
1856 
1857  // recvReqRetry is an indication that the resource on which this
1858  // port was stalling on is freed. So, remove the stall first
1859  unstallPort();
1860 
1861  for (int i = 0; i < len; ++i) {
1862  PacketPtr pkt = retries.front();
1863  [[maybe_unused]] Addr vaddr = pkt->req->getVaddr();
1864     DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x", computeUnit->cu_id, vaddr);
1865 
1866  if (!sendTimingReq(pkt)) {
1867  stallPort(); // Stall port
1868  DPRINTF(GPUTLB, ": failed again\n");
1869  break;
1870  } else {
1871  DPRINTF(GPUTLB, ": successful\n");
1872  retries.pop_front();
1873  }
1874  }
1875 }
1876 
1877 void
1878 ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
1879 {
1880  if (gpuDynInst->isScalar()) {
1881  if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
1882             stats.sALUInsts++;
1883             stats.instCyclesSALU++;
1884         } else if (gpuDynInst->isLoad()) {
1885             stats.scalarMemReads++;
1886         } else if (gpuDynInst->isStore()) {
1887             stats.scalarMemWrites++;
1888         }
1889  } else {
1890         if (gpuDynInst->isALU()) {
1891             shader->total_valu_insts++;
1892             if (shader->total_valu_insts == shader->max_valu_insts) {
1893                 exitSimLoop("max vALU insts");
1894             }
1895             stats.vALUInsts++;
1896             stats.instCyclesVALU++;
1897             stats.threadCyclesVALU
1898                 += gpuDynInst->wavefront()->execMask().count();
1899  } else if (gpuDynInst->isFlat()) {
1900  if (gpuDynInst->isLocalMem()) {
1901  stats.flatLDSInsts++;
1902  } else {
1903  stats.flatVMemInsts++;
1904  }
1905  } else if (gpuDynInst->isFlatGlobal()) {
1906  stats.flatVMemInsts++;
1907  } else if (gpuDynInst->isLocalMem()) {
1908  stats.ldsNoFlatInsts++;
1909  } else if (gpuDynInst->isLoad()) {
1910  stats.vectorMemReads++;
1911  } else if (gpuDynInst->isStore()) {
1912  stats.vectorMemWrites++;
1913  }
1914 
1915  if (gpuDynInst->isLoad()) {
1916  switch (gpuDynInst->executedAs()) {
1917  case enums::SC_SPILL:
1918  stats.spillReads++;
1919  break;
1920  case enums::SC_GLOBAL:
1921  stats.globalReads++;
1922  break;
1923  case enums::SC_GROUP:
1924  stats.groupReads++;
1925  break;
1926  case enums::SC_PRIVATE:
1927  stats.privReads++;
1928  break;
1929  case enums::SC_READONLY:
1930  stats.readonlyReads++;
1931  break;
1932  case enums::SC_KERNARG:
1933  stats.kernargReads++;
1934  break;
1935  case enums::SC_ARG:
1936  stats.argReads++;
1937  break;
1938  case enums::SC_NONE:
1939  /**
1940  * this case can occur for flat mem insts
1941  * who execute with EXEC = 0
1942  */
1943  break;
1944  default:
1945  fatal("%s has no valid segment\n", gpuDynInst->disassemble());
1946  break;
1947  }
1948  } else if (gpuDynInst->isStore()) {
1949  switch (gpuDynInst->executedAs()) {
1950  case enums::SC_SPILL:
1951  stats.spillWrites++;
1952  break;
1953  case enums::SC_GLOBAL:
1954  stats.globalWrites++;
1955  break;
1956  case enums::SC_GROUP:
1957  stats.groupWrites++;
1958  break;
1959  case enums::SC_PRIVATE:
1960  stats.privWrites++;
1961  break;
1962  case enums::SC_READONLY:
1963  stats.readonlyWrites++;
1964  break;
1965  case enums::SC_KERNARG:
1966  stats.kernargWrites++;
1967  break;
1968  case enums::SC_ARG:
1969  stats.argWrites++;
1970  break;
1971  case enums::SC_NONE:
1972  /**
1973  * this case can occur for flat mem insts
1974  * who execute with EXEC = 0
1975  */
1976  break;
1977  default:
1978  fatal("%s has no valid segment\n", gpuDynInst->disassemble());
1979  break;
1980  }
1981  }
1982  }
1983 }
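// [Editor's note] Illustrative mapping, not part of the original source: a
// FLAT load whose address resolves to LDS increments flatLDSInsts above,
// while a FLAT load resolving to global memory increments flatVMemInsts; a
// plain (non-FLAT) vector load increments vectorMemReads and, based on
// executedAs(), exactly one segment counter such as globalReads or
// spillReads. The per-segment *MemInsts formula stats declared in
// ComputeUnitStats below aggregate the matching read and write counters.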
1984 
1985 void
1986 ComputeUnit::updatePageDivergenceDist(Addr addr)
1987 {
1988  Addr virt_page_addr = roundDown(addr, X86ISA::PageBytes);
1989 
1990  if (!pagesTouched.count(virt_page_addr))
1991  pagesTouched[virt_page_addr] = 1;
1992  else
1993  pagesTouched[virt_page_addr]++;
1994 }
1995 
1996 void
1997 ComputeUnit::exitCallback()
1998 {
1999  if (countPages) {
2000  std::ostream *page_stat_file = simout.create(name().c_str())->stream();
2001 
2002  *page_stat_file << "page, wavefront accesses, workitem accesses" <<
2003  std::endl;
2004 
2005  for (auto iter : pageAccesses) {
2006  *page_stat_file << std::hex << iter.first << ",";
2007  *page_stat_file << std::dec << iter.second.first << ",";
2008  *page_stat_file << std::dec << iter.second.second << std::endl;
2009  }
2010  }
2011 }
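// [Editor's note] Illustrative output with hypothetical values: when
// countPages is enabled, the per-CU file created above contains one CSV row
// per touched page, with the page address in hex and both counters in
// decimal, e.g.
//
//     page, wavefront accesses, workitem accesses
//     ff7f6000,12,768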
2012 
2013 bool
2014 ComputeUnit::isDone() const
2015 {
2016  for (int i = 0; i < numVectorALUs; ++i) {
2017  if (!isVectorAluIdle(i)) {
2018  return false;
2019  }
2020  }
2021 
2022  // TODO: FIXME if more than 1 of any memory pipe supported
2023  if (!srfToScalarMemPipeBus.rdy()) {
2024  return false;
2025  }
2026  if (!vrfToGlobalMemPipeBus.rdy()) {
2027  return false;
2028  }
2029  if (!vrfToLocalMemPipeBus.rdy()) {
2030  return false;
2031  }
2032 
2033  if (!globalMemoryPipe.isGMReqFIFOWrRdy()
2034  || !localMemoryPipe.isLMReqFIFOWrRdy()
2035  || !locMemToVrfBus.rdy() || !glbMemToVrfBus.rdy()
2036  || !scalarMemToSrfBus.rdy()) {
2037  return false;
2038  }
2039 
2040  return true;
2041 }
2042 
2043 int32_t
2044 ComputeUnit::getRefCounter(const uint32_t dispatchId,
2045  const uint32_t wgId) const
2046 {
2047  return lds.getRefCounter(dispatchId, wgId);
2048 }
2049 
2050 bool
2051 ComputeUnit::isVectorAluIdle(uint32_t simdId) const
2052 {
2053  assert(simdId < numVectorALUs);
2054 
2055  for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){
2056  if (wfList[simdId][i_wf]->getStatus() != Wavefront::S_STOPPED) {
2057  return false;
2058  }
2059  }
2060 
2061  return true;
2062 }
2063 
2064 /**
2065  * send a general request to the LDS
2066  * make sure to look at the return value here as your request might be
2067  * NACK'd and returning false means that you have to have some retry logic
2068  */
2069 bool
2070 ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
2071 {
2072  // this is just a request to carry the GPUDynInstPtr
2073  // back and forth
2074  RequestPtr newRequest = std::make_shared<Request>();
2075  newRequest->setPaddr(0x0);
2076 
2077  // ReadReq is not evaluated by the LDS but the Packet ctor requires this
2078  PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq);
2079 
2080  // This is the SenderState needed upon return
2081  newPacket->senderState = new LDSPort::SenderState(gpuDynInst);
2082 
2083  return ldsPort.sendTimingReq(newPacket);
2084 }
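// [Editor's note] Illustrative caller sketch, not from the original source:
// the boolean returned here mirrors LDSPort::sendTimingReq() below, i.e.
// whether the packet was accepted immediately or parked in the port's
// retries queue, so a caller can do e.g.
//
//     if (!sendToLds(gpuDynInst)) {
//         // the LDS port is stalled; the packet will be re-sent from
//         // LDSPort::recvReqRetry() once the peer signals free space
//     }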
2085 
2086 /**
2087  * Forward the VRAM requestor ID needed for device memory from shader.
2088  */
2089 RequestorID
2090 ComputeUnit::vramRequestorId()
2091 {
2092  return FullSystem ? shader->vramRequestorId() : requestorId();
2093 }
2094 
2095 /**
2096  * get the result of packets sent to the LDS when they return
2097  */
2098 bool
2099 ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
2100 {
2101  const ComputeUnit::LDSPort::SenderState *senderState =
2102  dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState);
2103 
2104  fatal_if(!senderState, "did not get the right sort of sender state");
2105 
2106  GPUDynInstPtr gpuDynInst = senderState->getMemInst();
2107 
2108  delete packet->senderState;
2109  delete packet;
2110 
2111  computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
2112  return true;
2113 }
2114 
2115 /**
2116  * attempt to send this packet, either the port is already stalled, the
2117  * request is nack'd and must stall, or the request goes through
2118  * when a request cannot be sent, add it to the retries queue
2119  */
2120 bool
2121 ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
2122 {
2123  ComputeUnit::LDSPort::SenderState *sender_state =
2124  dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
2125  fatal_if(!sender_state, "packet without a valid sender state");
2126 
2127  [[maybe_unused]] GPUDynInstPtr gpuDynInst = sender_state->getMemInst();
2128 
2129  if (isStalled()) {
2130  fatal_if(retries.empty(), "must have retries waiting to be stalled");
2131 
2132  retries.push(pkt);
2133 
2134  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
2135  computeUnit->cu_id, gpuDynInst->simdId,
2136  gpuDynInst->wfSlotId);
2137  return false;
2138  } else if (!RequestPort::sendTimingReq(pkt)) {
2139  // need to stall the LDS port until a recvReqRetry() is received
2140  // this indicates that there is more space
2141  stallPort();
2142  retries.push(pkt);
2143 
2144  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
2145  computeUnit->cu_id, gpuDynInst->simdId,
2146  gpuDynInst->wfSlotId, pkt->req->getPaddr());
2147  return false;
2148  } else {
2149  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
2150  computeUnit->cu_id, gpuDynInst->simdId,
2151  gpuDynInst->wfSlotId, pkt->req->getPaddr());
2152  return true;
2153  }
2154 }
2155 
2156 /**
2157  * the bus is telling the port that there is now space so retrying
2158  * stalled requests should work now
2159  * this allows the port to have a request be nack'd and then retried
2160  * once the receiver says there is space, rather than every cycle
2161  */
2162 void
2163 ComputeUnit::LDSPort::recvReqRetry()
2164 {
2165  auto queueSize = retries.size();
2166 
2167  DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
2168  computeUnit->cu_id, queueSize);
2169 
2170  fatal_if(queueSize < 1,
2171  "why was there a recvReqRetry() with no pending reqs?");
2172  fatal_if(!isStalled(),
2173  "recvReqRetry() happened when the port was not stalled");
2174 
2175  unstallPort();
2176 
2177  while (!retries.empty()) {
2178  PacketPtr packet = retries.front();
2179 
2180  DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);
2181 
2182  if (!RequestPort::sendTimingReq(packet)) {
2183  // Stall port
2184  stallPort();
2185  DPRINTF(GPUPort, ": LDS send failed again\n");
2186  break;
2187  } else {
2188  DPRINTF(GPUPort, ": LDS send successful\n");
2189  retries.pop();
2190  }
2191  }
2192 }
2193 
2194 ComputeUnit::ComputeUnitStats::ComputeUnitStats(statistics::Group *parent,
2195  int n_wf)
2196  : statistics::Group(parent),
2197  ADD_STAT(vALUInsts, "Number of vector ALU insts issued."),
2198  ADD_STAT(vALUInstsPerWF, "The avg. number of vector ALU insts issued "
2199  "per-wavefront."),
2200  ADD_STAT(sALUInsts, "Number of scalar ALU insts issued."),
2201  ADD_STAT(sALUInstsPerWF, "The avg. number of scalar ALU insts issued "
2202  "per-wavefront."),
2203  ADD_STAT(instCyclesVALU,
2204  "Number of cycles needed to execute VALU insts."),
2205  ADD_STAT(instCyclesSALU,
2206  "Number of cycles needed to execute SALU insts."),
2207  ADD_STAT(threadCyclesVALU, "Number of thread cycles used to execute "
2208  "vector ALU ops. Similar to instCyclesVALU but multiplied by "
2209  "the number of active threads."),
2210  ADD_STAT(vALUUtilization,
2211  "Percentage of active vector ALU threads in a wave."),
2212  ADD_STAT(ldsNoFlatInsts, "Number of LDS insts issued, not including FLAT"
2213  " accesses that resolve to LDS."),
2214  ADD_STAT(ldsNoFlatInstsPerWF, "The avg. number of LDS insts (not "
2215  "including FLAT accesses that resolve to LDS) per-wavefront."),
2216  ADD_STAT(flatVMemInsts,
2217  "The number of FLAT insts that resolve to vmem issued."),
2218  ADD_STAT(flatVMemInstsPerWF, "The average number of FLAT insts that "
2219  "resolve to vmem issued per-wavefront."),
2220  ADD_STAT(flatLDSInsts,
2221  "The number of FLAT insts that resolve to LDS issued."),
2222  ADD_STAT(flatLDSInstsPerWF, "The average number of FLAT insts that "
2223  "resolve to LDS issued per-wavefront."),
2224  ADD_STAT(vectorMemWrites,
2225  "Number of vector mem write insts (excluding FLAT insts)."),
2226  ADD_STAT(vectorMemWritesPerWF, "The average number of vector mem write "
2227  "insts (excluding FLAT insts) per-wavefront."),
2228  ADD_STAT(vectorMemReads,
2229  "Number of vector mem read insts (excluding FLAT insts)."),
2230  ADD_STAT(vectorMemReadsPerWF, "The avg. number of vector mem read insts "
2231  "(excluding FLAT insts) per-wavefront."),
2232  ADD_STAT(scalarMemWrites, "Number of scalar mem write insts."),
2233  ADD_STAT(scalarMemWritesPerWF,
2234  "The average number of scalar mem write insts per-wavefront."),
2235  ADD_STAT(scalarMemReads, "Number of scalar mem read insts."),
2236  ADD_STAT(scalarMemReadsPerWF,
2237  "The average number of scalar mem read insts per-wavefront."),
2238  ADD_STAT(vectorMemReadsPerKiloInst,
2239  "Number of vector mem reads per kilo-instruction"),
2240  ADD_STAT(vectorMemWritesPerKiloInst,
2241  "Number of vector mem writes per kilo-instruction"),
2242  ADD_STAT(vectorMemInstsPerKiloInst,
2243  "Number of vector mem insts per kilo-instruction"),
2244  ADD_STAT(scalarMemReadsPerKiloInst,
2245  "Number of scalar mem reads per kilo-instruction"),
2246  ADD_STAT(scalarMemWritesPerKiloInst,
2247  "Number of scalar mem writes per kilo-instruction"),
2248  ADD_STAT(scalarMemInstsPerKiloInst,
2249  "Number of scalar mem insts per kilo-instruction"),
2250  ADD_STAT(instCyclesVMemPerSimd, "Number of cycles to send address, "
2251  "command, data from VRF to vector memory unit, per SIMD"),
2252  ADD_STAT(instCyclesScMemPerSimd, "Number of cycles to send address, "
2253  "command, data from SRF to scalar memory unit, per SIMD"),
2254  ADD_STAT(instCyclesLdsPerSimd, "Number of cycles to send address, "
2255  "command, data from VRF to LDS unit, per SIMD"),
2256  ADD_STAT(globalReads, "Number of reads to the global segment"),
2257  ADD_STAT(globalWrites, "Number of writes to the global segment"),
2258  ADD_STAT(globalMemInsts,
2259  "Number of memory instructions sent to the global segment"),
2260  ADD_STAT(argReads, "Number of reads to the arg segment"),
2261  ADD_STAT(argWrites, "Number of writes to the arg segment"),
2262  ADD_STAT(argMemInsts,
2263  "Number of memory instructions sent to the arg segment"),
2264  ADD_STAT(spillReads, "Number of reads to the spill segment"),
2265  ADD_STAT(spillWrites, "Number of writes to the spill segment"),
2266  ADD_STAT(spillMemInsts,
2267  "Number of memory instructions sent to the spill segment"),
2268  ADD_STAT(groupReads, "Number of reads to the group segment"),
2269  ADD_STAT(groupWrites, "Number of writes to the group segment"),
2270  ADD_STAT(groupMemInsts,
2271  "Number of memory instructions sent to the group segment"),
2272  ADD_STAT(privReads, "Number of reads to the private segment"),
2273  ADD_STAT(privWrites, "Number of writes to the private segment"),
2274  ADD_STAT(privMemInsts,
2275  "Number of memory instructions sent to the private segment"),
2276  ADD_STAT(readonlyReads, "Number of reads to the readonly segment"),
2277  ADD_STAT(readonlyWrites,
2278  "Number of writes to the readonly segment"),
2279  ADD_STAT(readonlyMemInsts,
2280  "Number of memory instructions sent to the readonly segment"),
2281  ADD_STAT(kernargReads, "Number of reads sent to the kernarg segment"),
2282  ADD_STAT(kernargWrites,
2283  "Number of writes sent to the kernarg segment"),
2284  ADD_STAT(kernargMemInsts,
2285  "Number of memory instructions sent to the kernarg segment"),
2286  ADD_STAT(waveLevelParallelism,
2287  "wave level parallelism: count of active waves at wave launch"),
2288  ADD_STAT(tlbRequests, "number of uncoalesced requests"),
2289  ADD_STAT(tlbCycles,
2290  "total number of cycles for all uncoalesced requests"),
2291  ADD_STAT(tlbLatency, "Avg. translation latency for data translations"),
2292  ADD_STAT(hitsPerTLBLevel,
2293  "TLB hits distribution (0 for page table, x for Lx-TLB)"),
2294  ADD_STAT(ldsBankAccesses, "Total number of LDS bank accesses"),
2295  ADD_STAT(ldsBankConflictDist,
2296  "Number of bank conflicts per LDS memory packet"),
2297  ADD_STAT(pageDivergenceDist,
2298  "pages touched per wf (over all mem. instr.)"),
2299  ADD_STAT(dynamicGMemInstrCnt,
2300  "dynamic non-flat global memory instruction count"),
2301  ADD_STAT(dynamicFlatMemInstrCnt,
2302  "dynamic flat global memory instruction count"),
2303  ADD_STAT(dynamicLMemInstrCnt, "dynamic local memory instruction count"),
2304  ADD_STAT(wgBlockedDueBarrierAllocation,
2305  "WG dispatch was blocked due to lack of barrier resources"),
2306  ADD_STAT(wgBlockedDueLdsAllocation,
2307  "Workgroup blocked due to LDS capacity"),
2308  ADD_STAT(numInstrExecuted, "number of instructions executed"),
2309  ADD_STAT(execRateDist, "Instruction Execution Rate: Number of executed "
2310  "vector instructions per cycle"),
2311  ADD_STAT(numVecOpsExecuted,
2312  "number of vec ops executed (e.g. WF size/inst)"),
2313  ADD_STAT(numVecOpsExecutedF16,
2314  "number of f16 vec ops executed (e.g. WF size/inst)"),
2315  ADD_STAT(numVecOpsExecutedF32,
2316  "number of f32 vec ops executed (e.g. WF size/inst)"),
2317  ADD_STAT(numVecOpsExecutedF64,
2318  "number of f64 vec ops executed (e.g. WF size/inst)"),
2319  ADD_STAT(numVecOpsExecutedFMA16,
2320  "number of fma16 vec ops executed (e.g. WF size/inst)"),
2321  ADD_STAT(numVecOpsExecutedFMA32,
2322  "number of fma32 vec ops executed (e.g. WF size/inst)"),
2323  ADD_STAT(numVecOpsExecutedFMA64,
2324  "number of fma64 vec ops executed (e.g. WF size/inst)"),
2325  ADD_STAT(numVecOpsExecutedMAC16,
2326  "number of mac16 vec ops executed (e.g. WF size/inst)"),
2327  ADD_STAT(numVecOpsExecutedMAC32,
2328  "number of mac32 vec ops executed (e.g. WF size/inst)"),
2329  ADD_STAT(numVecOpsExecutedMAC64,
2330  "number of mac64 vec ops executed (e.g. WF size/inst)"),
2331  ADD_STAT(numVecOpsExecutedMAD16,
2332  "number of mad16 vec ops executed (e.g. WF size/inst)"),
2333  ADD_STAT(numVecOpsExecutedMAD32,
2334  "number of mad32 vec ops executed (e.g. WF size/inst)"),
2335  ADD_STAT(numVecOpsExecutedMAD64,
2336  "number of mad64 vec ops executed (e.g. WF size/inst)"),
2337  ADD_STAT(numVecOpsExecutedTwoOpFP,
2338  "number of two op FP vec ops executed (e.g. WF size/inst)"),
2339  ADD_STAT(totalCycles, "number of cycles the CU ran for"),
2340  ADD_STAT(vpc, "Vector Operations per cycle (this CU only)"),
2341  ADD_STAT(vpc_f16, "F16 Vector Operations per cycle (this CU only)"),
2342  ADD_STAT(vpc_f32, "F32 Vector Operations per cycle (this CU only)"),
2343  ADD_STAT(vpc_f64, "F64 Vector Operations per cycle (this CU only)"),
2344  ADD_STAT(ipc, "Instructions per cycle (this CU only)"),
2345  ADD_STAT(controlFlowDivergenceDist, "number of lanes active per "
2346  "instruction (over all instructions)"),
2347  ADD_STAT(activeLanesPerGMemInstrDist,
2348  "number of active lanes per global memory instruction"),
2349  ADD_STAT(activeLanesPerLMemInstrDist,
2350  "number of active lanes per local memory instruction"),
2351  ADD_STAT(numALUInstsExecuted,
2352  "Number of dynamic non-GM memory insts executed"),
2353  ADD_STAT(numTimesWgBlockedDueVgprAlloc, "Number of times WGs are "
2354  "blocked due to VGPR allocation per SIMD"),
2355  ADD_STAT(numTimesWgBlockedDueSgprAlloc, "Number of times WGs are "
2356  "blocked due to SGPR allocation per SIMD"),
2357  ADD_STAT(numCASOps, "number of compare and swap operations"),
2358  ADD_STAT(numFailedCASOps,
2359  "number of compare and swap operations that failed"),
2360  ADD_STAT(completedWfs, "number of completed wavefronts"),
2361  ADD_STAT(completedWGs, "number of completed workgroups"),
2362  ADD_STAT(headTailLatency, "ticks between first and last cache block "
2363  "arrival at coalescer"),
2364  ADD_STAT(instInterleave, "Measure of instruction interleaving per SIMD")
2365 {
2366  ComputeUnit *cu = static_cast<ComputeUnit*>(parent);
2367 
2368  instCyclesVMemPerSimd.init(cu->numVectorALUs);
2369  instCyclesScMemPerSimd.init(cu->numVectorALUs);
2370  instCyclesLdsPerSimd.init(cu->numVectorALUs);
2371 
2372  hitsPerTLBLevel.init(4);
2373  execRateDist.init(0, 10, 2);
2374  ldsBankConflictDist.init(0, cu->wfSize(), 2);
2375 
2376  pageDivergenceDist.init(1, cu->wfSize(), 4);
2377  controlFlowDivergenceDist.init(1, cu->wfSize(), 4);
2378  activeLanesPerGMemInstrDist.init(1, cu->wfSize(), 4);
2379  activeLanesPerLMemInstrDist.init(1, cu->wfSize(), 4);
2380 
2381  headTailLatency.init(0, 1000000, 10000).flags(statistics::pdf |
2382  statistics::oneline);
2383  waveLevelParallelism.init(0, n_wf * cu->numVectorALUs, 1);
2384  instInterleave.init(cu->numVectorALUs, 0, 20, 1);
2385 
2386  vALUInstsPerWF = vALUInsts / completedWfs;
2387  sALUInstsPerWF = sALUInsts / completedWfs;
2388  vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
2389  ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
2390  flatVMemInstsPerWF = flatVMemInsts / completedWfs;
2391  flatLDSInstsPerWF = flatLDSInsts / completedWfs;
2392  vectorMemWritesPerWF = vectorMemWrites / completedWfs;
2393  vectorMemReadsPerWF = vectorMemReads / completedWfs;
2394  scalarMemWritesPerWF = scalarMemWrites / completedWfs;
2395  scalarMemReadsPerWF = scalarMemReads / completedWfs;
2396 
2397  vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000;
2398  vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000;
2399  vectorMemInstsPerKiloInst =
2400  ((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000;
2401  scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000;
2402  scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000;
2403  scalarMemInstsPerKiloInst =
2404  ((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000;
2405 
2406  globalMemInsts = globalReads + globalWrites;
2407  argMemInsts = argReads + argWrites;
2408  spillMemInsts = spillReads + spillWrites;
2409  groupMemInsts = groupReads + groupWrites;
2410  privMemInsts = privReads + privWrites;
2411  readonlyMemInsts = readonlyReads + readonlyWrites;
2412  kernargMemInsts = kernargReads + kernargWrites;
2413 
2414  tlbLatency = tlbCycles / tlbRequests;
2415 
2416  // fixed number of TLB levels
2417  for (int i = 0; i < 4; ++i) {
2418  if (!i)
2419  hitsPerTLBLevel.subname(i,"page_table");
2420  else
2421  hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
2422  }
2423 
2424  ipc = numInstrExecuted / totalCycles;
2425  vpc = numVecOpsExecuted / totalCycles;
2426  vpc_f16 = numVecOpsExecutedF16 / totalCycles;
2427  vpc_f32 = numVecOpsExecutedF32 / totalCycles;
2428  vpc_f64 = numVecOpsExecutedF64 / totalCycles;
2429 
2430  numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
2431  dynamicLMemInstrCnt;
2432 }
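// [Editor's note] Worked example with hypothetical numbers for the
// vALUUtilization formula at line 2388 above: with 64-lane wavefronts,
// threadCyclesVALU = 3200 and instCyclesVALU = 100 give
// 3200 / (64 * 100) * 100 = 50%, i.e. on average half of the lanes were
// active per executed VALU instruction.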
2433 
2434 } // namespace gem5