gem5  v20.1.0.0
compute_unit.cc
1 /*
2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * For use for simulation and test purposes only
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright notice,
11  * this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  * this list of conditions and the following disclaimer in the documentation
15  * and/or other materials provided with the distribution.
16  *
17  * 3. Neither the name of the copyright holder nor the names of its
18  * contributors may be used to endorse or promote products derived from this
19  * software without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "gpu-compute/compute_unit.hh"
35 
36 #include <limits>
37 
38 #include "arch/x86/isa_traits.hh"
39 #include "base/output.hh"
40 #include "debug/GPUDisp.hh"
41 #include "debug/GPUExec.hh"
42 #include "debug/GPUFetch.hh"
43 #include "debug/GPUMem.hh"
44 #include "debug/GPUPort.hh"
45 #include "debug/GPUPrefetch.hh"
46 #include "debug/GPUReg.hh"
47 #include "debug/GPURename.hh"
48 #include "debug/GPUSync.hh"
49 #include "debug/GPUTLB.hh"
50 #include "gpu-compute/dispatcher.hh"
51 #include "gpu-compute/gpu_dyn_inst.hh"
52 #include "gpu-compute/gpu_static_inst.hh"
53 #include "gpu-compute/scalar_register_file.hh"
54 #include "gpu-compute/shader.hh"
55 #include "gpu-compute/simple_pool_manager.hh"
56 #include "gpu-compute/vector_register_file.hh"
57 #include "gpu-compute/wavefront.hh"
58 #include "mem/page_table.hh"
59 #include "sim/process.hh"
60 #include "sim/sim_exit.hh"
61 
62 ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p),
63     numVectorGlobalMemUnits(p->num_global_mem_pipes),
64  numVectorSharedMemUnits(p->num_shared_mem_pipes),
65  numScalarMemUnits(p->num_scalar_mem_pipes),
66  numVectorALUs(p->num_SIMDs),
67  numScalarALUs(p->num_scalar_cores),
68  vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
69  coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
70  registerManager(p->register_manager),
71  fetchStage(p, *this),
72  scoreboardCheckStage(p, *this, scoreboardCheckToSchedule),
73  scheduleStage(p, *this, scoreboardCheckToSchedule, scheduleToExecute),
74  execStage(p, *this, scheduleToExecute),
75  globalMemoryPipe(p, *this),
76  localMemoryPipe(p, *this),
77  scalarMemoryPipe(p, *this),
78  tickEvent([this]{ exec(); }, "Compute unit tick event",
79  false, Event::CPU_Tick_Pri),
80  cu_id(p->cu_id),
81  vrf(p->vector_register_file), srf(p->scalar_register_file),
82  simdWidth(p->simd_width),
83  spBypassPipeLength(p->spbypass_pipe_length),
84  dpBypassPipeLength(p->dpbypass_pipe_length),
85  scalarPipeStages(p->scalar_pipe_length),
86  operandNetworkLength(p->operand_network_length),
87  issuePeriod(p->issue_period),
88  vrf_gm_bus_latency(p->vrf_gm_bus_latency),
89  srf_scm_bus_latency(p->srf_scm_bus_latency),
90  vrf_lm_bus_latency(p->vrf_lm_bus_latency),
91  perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth),
92  prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type),
93  debugSegFault(p->debugSegFault),
94  functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier),
95  countPages(p->countPages),
96  req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
97  resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
98  _requestorId(p->system->getRequestorId(this, "ComputeUnit")),
99  lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
100  ldsPort(csprintf("%s-port", name()), this),
101  scalarDataPort(csprintf("%s-port", name()), this),
102  scalarDTLBPort(csprintf("%s-port", name()), this),
103  sqcPort(csprintf("%s-port", name()), this),
104  sqcTLBPort(csprintf("%s-port", name()), this),
105  _cacheLineSize(p->system->cacheLineSize()),
106  _numBarrierSlots(p->num_barrier_slots),
107  globalSeqNum(0), wavefrontSize(p->wf_size),
108  scoreboardCheckToSchedule(p),
109  scheduleToExecute(p)
110 {
111     /**
112      * This check is necessary because std::bitset only provides
113      * conversion to unsigned long or unsigned long long via to_ulong()
114      * or to_ullong(). There are a few places in the code where
115      * to_ullong() is used, and if wfSize is larger than a value the host
116      * can support then bitset will throw a runtime exception. Until all
117      * uses of to_ullong() are removed, wavefronts wider than the host's
118      * unsigned long long cannot be simulated, so this check is required.
119      */
120     fatal_if(p->wf_size > std::numeric_limits<unsigned long long>::digits ||
121  p->wf_size <= 0,
122  "WF size is larger than the host can support");
123  fatal_if(!isPowerOf2(wavefrontSize),
124  "Wavefront size should be a power of 2");
125  // calculate how many cycles a vector load or store will need to transfer
126  // its data over the corresponding buses
127  numCyclesPerStoreTransfer =
128  (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
129  (double)vrfToCoalescerBusWidth);
130 
131  numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
132  / coalescerToVrfBusWidth;
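    // Worked example (illustrative, not from the source): assuming
    // wfSize() == 64 and 32-byte buses in both directions, a vector access
    // moves 64 * 4 = 256 bytes, so numCyclesPerStoreTransfer =
    // ceil(256 / 32) = 8 cycles and numCyclesPerLoadTransfer = 256 / 32 = 8
    // cycles. Both bus widths are configurable ComputeUnit parameters.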
133 
134  // Initialization: all WF slots are assumed STOPPED
135  idleWfs = p->n_wf * numVectorALUs;
136  lastVaddrWF.resize(numVectorALUs);
137  wfList.resize(numVectorALUs);
138 
139  wfBarrierSlots.resize(p->num_barrier_slots, WFBarrier());
140 
141  for (int i = 0; i < p->num_barrier_slots; ++i) {
142  freeBarrierIds.insert(i);
143  }
144 
145  for (int j = 0; j < numVectorALUs; ++j) {
146  lastVaddrWF[j].resize(p->n_wf);
147 
148  for (int i = 0; i < p->n_wf; ++i) {
149  lastVaddrWF[j][i].resize(wfSize());
150 
151  wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
152  wfList[j][i]->setParent(this);
153 
154  for (int k = 0; k < wfSize(); ++k) {
155  lastVaddrWF[j][i][k] = 0;
156  }
157  }
158  }
159 
160  lastVaddrSimd.resize(numVectorALUs);
161 
162  for (int i = 0; i < numVectorALUs; ++i) {
163  lastVaddrSimd[i].resize(wfSize(), 0);
164  }
165 
166  lastVaddrCU.resize(wfSize());
167 
168  lds.setParent(this);
169 
170  if (p->execPolicy == "OLDEST-FIRST") {
171  exec_policy = EXEC_POLICY::OLDEST;
172  } else if (p->execPolicy == "ROUND-ROBIN") {
173  exec_policy = EXEC_POLICY::RR;
174  } else {
175  fatal("Invalid WF execution policy (CU)\n");
176  }
177 
178  for (int i = 0; i < p->port_memory_port_connection_count; ++i) {
179  memPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
180  }
181 
182  for (int i = 0; i < p->port_translation_port_connection_count; ++i) {
183  tlbPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
184  }
185 
186  // Setup tokens for response ports. The number of tokens in memPortTokens
187  // is the total token count for the entire vector port (i.e., this CU).
188  memPortTokens = new TokenManager(p->max_cu_tokens);
189 
190  registerExitCallback([this]() { exitCallback(); });
191 
192  lastExecCycle.resize(numVectorALUs, 0);
193 
194  for (int i = 0; i < vrf.size(); ++i) {
195  vrf[i]->setParent(this);
196  }
197  for (int i = 0; i < srf.size(); ++i) {
198  srf[i]->setParent(this);
199  }
200  numVecRegsPerSimd = vrf[0]->numRegs();
201  numScalarRegsPerSimd = srf[0]->numRegs();
202 
203  registerManager->setParent(this);
204 
205  activeWaves = 0;
206 
207  instExecPerSimd.resize(numVectorALUs, 0);
208 
209  // Calculate the number of bits to address a cache line
210  panic_if(!isPowerOf2(_cacheLineSize),
211  "Cache line size should be a power of two.");
212  cacheLineBits = floorLog2(_cacheLineSize);
213 }
214 
215 ComputeUnit::~ComputeUnit()
216 {
217  // Delete wavefront slots
218  for (int j = 0; j < numVectorALUs; ++j) {
219  for (int i = 0; i < shader->n_wf; ++i) {
220  delete wfList[j][i];
221  }
222  lastVaddrSimd[j].clear();
223  }
224  lastVaddrCU.clear();
225 }
226 
227 int
228 ComputeUnit::numExeUnits() const
229 {
230     return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits +
231         numVectorSharedMemUnits + numScalarMemUnits;
232 }
233 
234 // index into readyList of the first memory unit
235 int
236 ComputeUnit::firstMemUnit() const
237 {
238  return numVectorALUs + numScalarALUs;
239 }
240 
241 // index into readyList of the last memory unit
242 int
243 ComputeUnit::lastMemUnit() const
244 {
245  return numExeUnits() - 1;
246 }
247 
248 // index into scalarALUs vector of SALU used by the wavefront
249 int
250 ComputeUnit::mapWaveToScalarAlu(Wavefront *w) const
251 {
252  if (numScalarALUs == 1) {
253  return 0;
254  } else {
255  return w->simdId % numScalarALUs;
256  }
257 }
258 
259 // index into readyList of Scalar ALU unit used by wavefront
260 int
261 ComputeUnit::mapWaveToScalarAluGlobalIdx(Wavefront *w) const
262 {
263     return mapWaveToScalarAlu(w) + numVectorALUs;
264 }
265 
266 // index into readyList of Global Memory unit used by wavefront
267 int
268 ComputeUnit::mapWaveToGlobalMem(Wavefront *w) const
269 {
270  // TODO: FIXME if more than 1 GM pipe supported
271  return numVectorALUs + numScalarALUs;
272 }
273 
274 // index into readyList of Local Memory unit used by wavefront
275 int
276 ComputeUnit::mapWaveToLocalMem(Wavefront *w) const
277 {
278     // TODO: FIXME if more than 1 LM pipe supported
279     return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits;
280 }
281 
282 // index into readyList of Scalar Memory unit used by wavefront
283 int
284 ComputeUnit::mapWaveToScalarMem(Wavefront *w) const
285 {
286     // TODO: FIXME if more than 1 ScM pipe supported
287     return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits +
288         numVectorSharedMemUnits;
289 }
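Taken together, the map* helpers above define a fixed layout for execution-unit indices: vector ALUs first, then scalar ALUs, then one slot each for the global, local, and scalar memory pipes. A sketch of the resulting index line for a hypothetical CU with 4 SIMDs and 1 scalar ALU (illustrative sizes only):

    // exe unit index:  0..3    4     5    6    7
    //                  VALUs   SALU  GM   LM   ScM
    // numExeUnits() == 8, firstMemUnit() == 5, lastMemUnit() == 7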
290 
291 void
292 ComputeUnit::fillKernelState(Wavefront *w, HSAQueueEntry *task)
293 {
294  w->resizeRegFiles(task->numVectorRegs(), task->numScalarRegs());
295  w->workGroupSz[0] = task->wgSize(0);
296  w->workGroupSz[1] = task->wgSize(1);
297  w->workGroupSz[2] = task->wgSize(2);
298  w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
299  w->gridSz[0] = task->gridSize(0);
300  w->gridSz[1] = task->gridSize(1);
301  w->gridSz[2] = task->gridSize(2);
302  w->computeActualWgSz(task);
303 }
304 
305 void
306 ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
307                             HSAQueueEntry *task, int bar_id, bool fetchContext)
308 {
309  static int _n_wave = 0;
310 
311  VectorMask init_mask;
312  init_mask.reset();
313 
314  for (int k = 0; k < wfSize(); ++k) {
315  if (k + waveId * wfSize() < w->actualWgSzTotal)
316  init_mask[k] = 1;
317  }
318 
319  w->execMask() = init_mask;
320 
321  w->kernId = task->dispatchId();
322  w->wfId = waveId;
323  w->initMask = init_mask.to_ullong();
324 
325  if (bar_id > WFBarrier::InvalidID) {
326  w->barrierId(bar_id);
327  } else {
328  assert(!w->hasBarrier());
329  }
330 
331  for (int k = 0; k < wfSize(); ++k) {
332  w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
333  w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
334  w->actualWgSz[1];
335  w->workItemId[2][k] = (k + waveId * wfSize()) /
336  (w->actualWgSz[0] * w->actualWgSz[1]);
337 
338  w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
339  w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
340  w->workItemId[0][k];
341  }
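    // Worked example (illustrative): with actualWgSz = {16, 16, 1},
    // wfSize() == 64 and waveId == 0, lane k = 37 becomes
    //   workItemId[0] = 37 % 16        = 5
    //   workItemId[1] = (37 / 16) % 16 = 2
    //   workItemId[2] = 37 / (16 * 16) = 0
    // and workItemFlatId = 0 * 256 + 2 * 16 + 5 = 37, i.e. the flat ID
    // recovers the lane's linear position within the work-group.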
342 
343  // WG state
344  w->wgId = task->globalWgId();
345  w->dispatchId = task->dispatchId();
346  w->workGroupId[0] = w->wgId % task->numWg(0);
347  w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1);
348  w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1));
349 
350  // set the wavefront context to have a pointer to this section of the LDS
351  w->ldsChunk = ldsChunk;
352 
353  int32_t refCount M5_VAR_USED =
354  lds.increaseRefCounter(w->dispatchId, w->wgId);
355  DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
356  cu_id, w->wgId, refCount);
357 
358  w->instructionBuffer.clear();
359 
360  if (w->pendingFetch)
361  w->dropFetch = true;
362 
363  DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
364  "WF[%d][%d]. Ref cnt:%d\n", _n_wave, w->barrierId(), cu_id,
365  w->simdId, w->wfSlotId, refCount);
366 
367  w->initRegState(task, w->actualWgSzTotal);
368  w->start(_n_wave++, task->codeAddr());
369 
370     waveLevelParallelism.sample(activeWaves);
371     activeWaves++;
372 }
373 
374 /**
375  * trigger invalidate operation in the CU
376  *
377  * req: request initialized in shader, carrying the invalidate flags
378  */
379 void
380 ComputeUnit::doInvalidate(RequestPtr req, int kernId){
381     GPUDynInstPtr gpuDynInst
382         = std::make_shared<GPUDynInst>(this, nullptr,
383             new KernelLaunchStaticInst(), getAndIncSeqNum());
384 
385  // kern_id will be used in inv responses
386  gpuDynInst->kern_id = kernId;
387  // update contextId field
388  req->setContext(gpuDynInst->wfDynId);
389 
390  injectGlobalMemFence(gpuDynInst, true, req);
391 }
392 
393 /**
394  * trigger flush operation in the CU
395  *
396  * gpuDynInst: inst passed to the request
397  */
398 void
399 ComputeUnit::doFlush(GPUDynInstPtr gpuDynInst) {
400     injectGlobalMemFence(gpuDynInst, true);
401 }
402 
403 void
404 ComputeUnit::dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg)
405 {
406  // If we aren't ticking, start it up!
407  if (!tickEvent.scheduled()) {
408  DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id);
409         schedule(tickEvent, nextCycle());
410     }
411 
412  // the kernel's invalidate must have finished before any wg dispatch
413  assert(task->isInvDone());
414 
415  // reserve the LDS capacity allocated to the work group
416  // disambiguated by the dispatch ID and workgroup ID, which should be
417  // globally unique
418  LdsChunk *ldsChunk = lds.reserveSpace(task->dispatchId(),
419  task->globalWgId(),
420  task->ldsSize());
421 
422  panic_if(!ldsChunk, "was not able to reserve space for this WG");
423 
424  // calculate the number of 32-bit vector registers required
425  // by each work item
426  int vregDemand = task->numVectorRegs();
427  int sregDemand = task->numScalarRegs();
428  int wave_id = 0;
429 
430  int barrier_id = WFBarrier::InvalidID;
431 
432     /**
433      * If the WG spans multiple WFs it needs a barrier slot so its WFs
434      * can synchronize on s_barrier; a single-WF WG is dispatched with
435      * WFBarrier::InvalidID instead.
436      */
437     if (num_wfs_in_wg > 1) {
438         /**
439          * hasDispResources() has already verified that a free barrier
440          * slot is available for this WG, so this always succeeds.
441          */
442         barrier_id = getFreeBarrierId();
442  auto &wf_barrier = barrierSlot(barrier_id);
443  assert(!wf_barrier.maxBarrierCnt());
444  assert(!wf_barrier.numAtBarrier());
445  wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);
446 
447  DPRINTF(GPUSync, "CU[%d] - Dispatching WG with barrier Id%d. "
448  "%d waves using this barrier.\n", cu_id, barrier_id,
449  num_wfs_in_wg);
450  }
451 
452  // Assign WFs according to numWfsToSched vector, which is computed by
453  // hasDispResources()
454  for (int j = 0; j < shader->n_wf; ++j) {
455  for (int i = 0; i < numVectorALUs; ++i) {
456  Wavefront *w = wfList[i][j];
457  // Check if this wavefront slot is available and there are WFs
458  // remaining to be dispatched to current SIMD:
459             // WF slot must be stopped, i.e., not still waiting
460             // for a kernel-end release to complete (S_RETURNING)
461  if (w->getStatus() == Wavefront::S_STOPPED &&
462  numWfsToSched[i] > 0) {
463  // decrement number of WFs awaiting dispatch to current SIMD
464  numWfsToSched[i] -= 1;
465 
466  fillKernelState(w, task);
467 
468  DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] "
469  "vregDemand[%d] sregDemand[%d]\n", i, j, w->wfDynId,
470  vregDemand, sregDemand);
471 
472  registerManager->allocateRegisters(w, vregDemand, sregDemand);
473 
474  startWavefront(w, wave_id, ldsChunk, task, barrier_id);
475  ++wave_id;
476  }
477  }
478  }
479 }
480 
481 void
482 ComputeUnit::insertInPipeMap(Wavefront *w)
483 {
484  panic_if(w->instructionBuffer.empty(),
485  "Instruction Buffer of WF%d can't be empty", w->wgId);
486  GPUDynInstPtr ii = w->instructionBuffer.front();
487  pipeMap.emplace(ii->seqNum());
488 }
489 
490 void
491 ComputeUnit::deleteFromPipeMap(Wavefront *w)
492 {
493  panic_if(w->instructionBuffer.empty(),
494  "Instruction Buffer of WF%d can't be empty", w->wgId);
495  GPUDynInstPtr ii = w->instructionBuffer.front();
496  // delete the dynamic instruction from the pipeline map
497  auto it = pipeMap.find(ii->seqNum());
498  panic_if(it == pipeMap.end(), "Pipeline Map is empty\n");
499  pipeMap.erase(it);
500 }
501 
502 bool
503 ComputeUnit::hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg)
504 {
505  // compute true size of workgroup (after clamping to grid size)
506  int trueWgSize[HSAQueueEntry::MAX_DIM];
507  int trueWgSizeTotal = 1;
508 
509  for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
510  trueWgSize[d] = std::min(task->wgSize(d), task->gridSize(d) -
511  task->wgId(d) * task->wgSize(d));
512 
513  trueWgSizeTotal *= trueWgSize[d];
514  DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
515  }
516 
517  DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
518 
519  // calculate the number of WFs in this WG
520  int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
521  num_wfs_in_wg = numWfs;
522 
523  bool barrier_avail = true;
524 
525  if (numWfs > 1 && !freeBarrierIds.size()) {
526  barrier_avail = false;
527  }
528 
529  // calculate the number of 32-bit vector registers required by each
530  // work item of the work group
531  int vregDemandPerWI = task->numVectorRegs();
532  // calculate the number of 32-bit scalar registers required by each
533  // work item of the work group
534  int sregDemandPerWI = task->numScalarRegs();
535 
536     // check if the total number of VGPRs and SGPRs required by all WFs
537  // of the WG fit in the VRFs of all SIMD units and the CU's SRF
538  panic_if((numWfs * vregDemandPerWI) > (numVectorALUs * numVecRegsPerSimd),
539  "WG with %d WFs and %d VGPRs per WI can not be allocated to CU "
540  "that has %d VGPRs\n",
541  numWfs, vregDemandPerWI, numVectorALUs * numVecRegsPerSimd);
542  panic_if((numWfs * sregDemandPerWI) > numScalarRegsPerSimd,
543  "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU "
544  "with %d SGPRs\n",
545  numWfs, sregDemandPerWI, numScalarRegsPerSimd);
546 
547  // number of WF slots that are not occupied
548  int freeWfSlots = 0;
549  // number of Wfs from WG that were successfully mapped to a SIMD
550  int numMappedWfs = 0;
551  numWfsToSched.clear();
552  numWfsToSched.resize(numVectorALUs, 0);
553 
554  // attempt to map WFs to the SIMDs, based on WF slot availability
555  // and register file availability
556  for (int j = 0; j < shader->n_wf; ++j) {
557  for (int i = 0; i < numVectorALUs; ++i) {
558  if (wfList[i][j]->getStatus() == Wavefront::S_STOPPED) {
559  ++freeWfSlots;
560  // check if current WF will fit onto current SIMD/VRF
561  // if all WFs have not yet been mapped to the SIMDs
562                 if (numMappedWfs < numWfs &&
563                     registerManager->canAllocateSgprs(i, numWfsToSched[i] + 1,
564                                                       sregDemandPerWI) &&
565                     registerManager->canAllocateVgprs(i, numWfsToSched[i] + 1,
566                                                       vregDemandPerWI)) {
567  numWfsToSched[i]++;
568  numMappedWfs++;
569  }
570  }
571  }
572  }
573 
574  // check that the number of mapped WFs is not greater
575  // than the actual number of WFs
576  assert(numMappedWfs <= numWfs);
577 
578  bool vregAvail = true;
579  bool sregAvail = true;
580  // if a WF to SIMD mapping was not found, find the limiting resource
581  if (numMappedWfs < numWfs) {
582 
583  for (int j = 0; j < numVectorALUs; ++j) {
584  // find if there are enough free VGPRs in the SIMD's VRF
585             // to accommodate the WFs of the new WG that would be mapped
586  // to this SIMD unit
587  vregAvail &= registerManager->
588  canAllocateVgprs(j, numWfsToSched[j], vregDemandPerWI);
589  // find if there are enough free SGPRs in the SIMD's SRF
590             // to accommodate the WFs of the new WG that would be mapped
591  // to this SIMD unit
592  sregAvail &= registerManager->
593  canAllocateSgprs(j, numWfsToSched[j], sregDemandPerWI);
594  }
595  }
596 
597  DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, \
598  VGPR Availability = %d, SGPR Availability = %d\n",
599  freeWfSlots, numMappedWfs, vregAvail, sregAvail);
600 
601     if (!vregAvail) {
602         ++numTimesWgBlockedDueVgprAlloc;
603     }
604 
605     if (!sregAvail) {
606         ++numTimesWgBlockedDueSgprAlloc;
607     }
608 
609  // Return true if enough WF slots to submit workgroup and if there are
610  // enough VGPRs to schedule all WFs to their SIMD units
611  bool ldsAvail = lds.canReserve(task->ldsSize());
612     if (!ldsAvail) {
613         wgBlockedDueLdsAllocation++;
614     }
615 
616     if (!barrier_avail) {
617         numTimesWgBlockedDueBarrierAlloc++;
618     }
619 
620  // Return true if the following are all true:
621  // (a) all WFs of the WG were mapped to free WF slots
622  // (b) there are enough VGPRs to schedule all WFs to their SIMD units
623  // (c) there are enough SGPRs on the CU to schedule all WFs
624  // (d) there is enough space in LDS to allocate for all WFs
625  bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
626  && ldsAvail && barrier_avail;
627  return can_dispatch;
628 }
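hasDispResources() is the query half of a two-phase dispatch: it fills numWfsToSched as a side effect and reports, through its reference parameter, how many WFs the WG needs; dispWorkgroup() then consumes that plan. A minimal caller sketch, modeled loosely on the shader/dispatcher flow (the `cu` and `task` names here are hypothetical):

    int num_wfs_in_wg = 0;
    if (cu->hasDispResources(task, num_wfs_in_wg)) {
        cu->dispWorkgroup(task, num_wfs_in_wg);
    }
    // on failure the dispatcher holds the WG and retries later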
629 
630 int
631 ComputeUnit::numYetToReachBarrier(int bar_id)
632 {
633  auto &wf_barrier = barrierSlot(bar_id);
634  return wf_barrier.numYetToReachBarrier();
635 }
636 
637 bool
638 ComputeUnit::allAtBarrier(int bar_id)
639 {
640  auto &wf_barrier = barrierSlot(bar_id);
641  return wf_barrier.allAtBarrier();
642 }
643 
644 void
645 ComputeUnit::incNumAtBarrier(int bar_id)
646 {
647  auto &wf_barrier = barrierSlot(bar_id);
648  wf_barrier.incNumAtBarrier();
649 }
650 
651 int
652 ComputeUnit::numAtBarrier(int bar_id)
653 {
654  auto &wf_barrier = barrierSlot(bar_id);
655  return wf_barrier.numAtBarrier();
656 }
657 
658 int
659 ComputeUnit::maxBarrierCnt(int bar_id)
660 {
661  auto &wf_barrier = barrierSlot(bar_id);
662  return wf_barrier.maxBarrierCnt();
663 }
664 
665 void
666 ComputeUnit::resetBarrier(int bar_id)
667 {
668  auto &wf_barrier = barrierSlot(bar_id);
669  wf_barrier.reset();
670 }
671 
672 void
673 ComputeUnit::decMaxBarrierCnt(int bar_id)
674 {
675  auto &wf_barrier = barrierSlot(bar_id);
676  wf_barrier.decMaxBarrierCnt();
677 }
678 
679 void
680 ComputeUnit::releaseBarrier(int bar_id)
681 {
682  auto &wf_barrier = barrierSlot(bar_id);
683  wf_barrier.release();
684  freeBarrierIds.insert(bar_id);
685 }
686 
687 void
688 ComputeUnit::releaseWFsFromBarrier(int bar_id)
689 {
690  for (int i = 0; i < numVectorALUs; ++i) {
691  for (int j = 0; j < shader->n_wf; ++j) {
692  Wavefront *wf = wfList[i][j];
693  if (wf->barrierId() == bar_id) {
694  assert(wf->getStatus() == Wavefront::S_BARRIER);
695                 wf->setStatus(Wavefront::S_RUNNING);
696             }
697  }
698  }
699 }
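The barrier slot lifecycle implied by these helpers: dispWorkgroup() claims a free ID and sets the member count, each WF increments the slot when it reaches its barrier instruction, the last arrival gates the release, and releaseBarrier() recycles the ID. A rough sketch of the arrival path, assuming a ComputeUnit pointer `cu` and a valid barrier ID `id` (hypothetical names):

    cu->incNumAtBarrier(id);           // this WF has reached the barrier
    if (cu->allAtBarrier(id)) {
        cu->releaseWFsFromBarrier(id); // wake every WF in S_BARRIER
    }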
700 
701 // Execute one clock worth of work on the ComputeUnit.
702 void
703 ComputeUnit::exec()
704 {
705  // process reads and writes in the RFs
706  for (auto &vecRegFile : vrf) {
707  vecRegFile->exec();
708  }
709 
710  for (auto &scRegFile : srf) {
711  scRegFile->exec();
712  }
713 
714  // Execute pipeline stages in reverse order to simulate
715  // the pipeline latency
716     scalarMemoryPipe.exec();
717     globalMemoryPipe.exec();
718     localMemoryPipe.exec();
719     execStage.exec();
720     scheduleStage.exec();
721     scoreboardCheckStage.exec();
722     fetchStage.exec();
723 
724  totalCycles++;
725 
726  // Put this CU to sleep if there is no more work to be done.
727     if (!isDone()) {
728         schedule(tickEvent, nextCycle());
729     } else {
730         shader->notifyCuSleep();
731         DPRINTF(GPUDisp, "CU%d: Going to sleep\n", cu_id);
732  }
733 }
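Note the ordering inside exec(): the memory pipes run first and fetchStage runs last, i.e. the stages execute back-to-front. Each stage therefore sees its upstream neighbor's output from the previous cycle, which models one cycle of latency per stage even though the whole pipeline is evaluated in a single tick event. Roughly:

    // tick N   : fetchStage buffers instruction I
    // tick N+1 : scoreboardCheckStage first considers I
    // tick N+2 : scheduleStage can pick I, and so on down the pipe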
734 
735 void
736 ComputeUnit::init()
737 {
738  // Initialize CU Bus models and execution resources
739 
740  // Vector ALUs
741  vectorALUs.clear();
742  for (int i = 0; i < numVectorALUs; i++) {
743  vectorALUs.emplace_back(this, clockPeriod());
744  }
745 
746  // Scalar ALUs
747  scalarALUs.clear();
748  for (int i = 0; i < numScalarALUs; i++) {
749  scalarALUs.emplace_back(this, clockPeriod());
750  }
751 
752     // Vector Global Memory
753     fatal_if(numVectorGlobalMemUnits > 1,
754              "No support for multiple Global Memory Pipelines exists!!!");
755     vectorGlobalMemUnit.init(this, clockPeriod());
756     vrfToGlobalMemPipeBus.init(this, clockPeriod());
757     glbMemToVrfBus.init(this, clockPeriod());
758 
759     // Vector Local/Shared Memory
760     fatal_if(numVectorSharedMemUnits > 1,
761              "No support for multiple Local Memory Pipelines exists!!!");
762     vectorSharedMemUnit.init(this, clockPeriod());
763     vrfToLocalMemPipeBus.init(this, clockPeriod());
764     locMemToVrfBus.init(this, clockPeriod());
765 
766     // Scalar Memory
767     fatal_if(numScalarMemUnits > 1,
768              "No support for multiple Scalar Memory Pipelines exists!!!");
769     scalarMemUnit.init(this, clockPeriod());
770     srfToScalarMemPipeBus.init(this, clockPeriod());
771     scalarMemToSrfBus.init(this, clockPeriod());
772 
775 
776     fetchStage.init();
777     scheduleStage.init();
778     execStage.init();
779     globalMemoryPipe.init();
780 
781     gmTokenPort.setTokenManager(memPortTokens);
782 }
783 
784 bool
785 ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
786 {
787  // Ruby has completed the memory op. Schedule the mem_resp_event at the
788  // appropriate cycle to process the timing memory response
789  // This delay represents the pipeline delay
790  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
791  PortID index = sender_state->port_index;
792  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
793  GPUDispatcher &dispatcher = computeUnit->shader->dispatcher();
794 
795  // MemSyncResp + WriteAckResp are handled completely here and we don't
796  // schedule a MemRespEvent to process the responses further
797  if (pkt->cmd == MemCmd::MemSyncResp) {
798  // This response is for 1 of the following request types:
799  // - kernel launch
800  // - kernel end
801  // - non-kernel mem sync
802 
803  // Kernel Launch
804  // wavefront was nullptr when launching kernel, so it is meaningless
805  // here (simdId=-1, wfSlotId=-1)
806  if (gpuDynInst->isKernelLaunch()) {
807  // for kernel launch, the original request must be both kernel-type
808  // and acquire
809  assert(pkt->req->isKernel());
810  assert(pkt->req->isAcquire());
811 
812  // one D-Cache inv is done, decrement counter
813  dispatcher.updateInvCounter(gpuDynInst->kern_id);
814 
815  delete pkt->senderState;
816  delete pkt;
817  return true;
818  }
819 
820  // retrieve wavefront from inst
821  Wavefront *w = gpuDynInst->wavefront();
822 
823  // Check if we are waiting on Kernel End Release
824  if (w->getStatus() == Wavefront::S_RETURNING
825  && gpuDynInst->isEndOfKernel()) {
826  // for kernel end, the original request must be both kernel-type
827  // and release
828  assert(pkt->req->isKernel());
829  assert(pkt->req->isRelease());
830 
831  // one wb done, decrement counter, and return whether all wbs are
832  // done for the kernel
833  bool isWbDone = dispatcher.updateWbCounter(gpuDynInst->kern_id);
834 
835  // not all wbs are done for the kernel, just release pkt
836  // resources
837  if (!isWbDone) {
838  delete pkt->senderState;
839  delete pkt;
840  return true;
841  }
842 
843  // all wbs are completed for the kernel, do retirement work
844  // for the workgroup
845  DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
846  computeUnit->cu_id, w->simdId, w->wfSlotId,
847  w->wfDynId, w->wgId);
848 
849  dispatcher.notifyWgCompl(w);
850  w->setStatus(Wavefront::S_STOPPED);
851  }
852 
853  if (!pkt->req->isKernel()) {
854  w = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
855  DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing "
856  "outstanding reqs %d => %d\n", gpuDynInst->simdId,
857  gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
858  gpuDynInst->disassemble(), w->outstandingReqs,
859  w->outstandingReqs - 1);
860             --w->outstandingReqs;
861         }
862 
863  delete pkt->senderState;
864  delete pkt;
865  return true;
866  } else if (pkt->cmd == MemCmd::WriteCompleteResp) {
867  // this is for writeComplete callback
868  // we simply get decrement write-related wait counters
869  assert(gpuDynInst);
870  Wavefront *w M5_VAR_USED =
871  computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
872  assert(w);
873  DPRINTF(GPUExec, "WriteCompleteResp: WF[%d][%d] WV%d %s decrementing "
874  "outstanding reqs %d => %d\n", gpuDynInst->simdId,
875  gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
876  gpuDynInst->disassemble(), w->outstandingReqs,
877  w->outstandingReqs - 1);
878  if (gpuDynInst->allLanesZero()) {
879  // ask gm pipe to decrement request counters, instead of directly
880  // performing here, to avoid asynchronous counter update and
881             // instruction retirement (which may hurt waitcnt effects)
882             computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
883 
884  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: write totally complete\n",
885  computeUnit->cu_id, gpuDynInst->simdId,
886  gpuDynInst->wfSlotId);
887  }
888 
889  delete pkt->senderState;
890  delete pkt;
891 
892  return true;
893  }
894 
895  EventFunctionWrapper *mem_resp_event =
896  computeUnit->memPort[index].createMemRespEvent(pkt);
897 
898  DPRINTF(GPUPort,
899  "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
900  computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
901  gpuDynInst->seqNum(), index, pkt->req->getPaddr());
902 
903  computeUnit->schedule(mem_resp_event,
904                           curTick() + computeUnit->resp_tick_latency);
905 
906  return true;
907 }
908 
909 bool
910 ComputeUnit::ScalarDataPort::recvTimingResp(PacketPtr pkt)
911 {
912  assert(!pkt->req->isKernel());
913 
914  // retrieve sender state
915  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
916  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
917 
918  assert(pkt->isRead() || pkt->isWrite());
919  assert(gpuDynInst->numScalarReqs > 0);
920 
921  gpuDynInst->numScalarReqs--;
922 
923     /**
924      * A single scalar memory operation may be split into multiple
925      * packets; numScalarReqs counts how many are still outstanding for
926      * this gpuDynInst. Only when the final packet has returned is the
927      * operation handed to the scalar memory pipeline's response FIFO
928      * (loads and atomics to the load FIFO, stores to the store FIFO),
929      * where it completes and retires.
930      */
931     if (!gpuDynInst->numScalarReqs) {
932  if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
933  computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
934  gpuDynInst);
935  } else {
936  computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
937  gpuDynInst);
938  }
939  }
940 
941  delete pkt->senderState;
942  delete pkt;
943 
944  return true;
945 }
946 
947 void
948 ComputeUnit::ScalarDataPort::recvReqRetry()
949 {
950  for (const auto &pkt : retries) {
951  if (!sendTimingReq(pkt)) {
952  break;
953  } else {
954  retries.pop_front();
955  }
956  }
957 }
958 
959 void
960 ComputeUnit::DataPort::recvReqRetry()
961 {
962  int len = retries.size();
963 
964  assert(len > 0);
965 
966  for (int i = 0; i < len; ++i) {
967  PacketPtr pkt = retries.front().first;
968  GPUDynInstPtr gpuDynInst M5_VAR_USED = retries.front().second;
969  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
970  computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
971  pkt->req->getPaddr());
972 
973         // Currently Ruby can return false due to conflicts for the
974         // particular cache block or address, so other requests should be
975         // allowed to pass and the data port should expect retries.
976         if (!sendTimingReq(pkt)) {
977  DPRINTF(GPUMem, "failed again!\n");
978  break;
979  } else {
980  DPRINTF(GPUMem, "successful!\n");
981  retries.pop_front();
982  }
983  }
984 }
985 
986 bool
987 ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
988 {
989  computeUnit->fetchStage.processFetchReturn(pkt);
990  return true;
991 }
992 
993 void
994 ComputeUnit::SQCPort::recvReqRetry()
995 {
996  int len = retries.size();
997 
998  assert(len > 0);
999 
1000  for (int i = 0; i < len; ++i) {
1001  PacketPtr pkt = retries.front().first;
1002  Wavefront *wavefront M5_VAR_USED = retries.front().second;
1003  DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
1004  computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
1005  pkt->req->getPaddr());
1006  if (!sendTimingReq(pkt)) {
1007  DPRINTF(GPUFetch, "failed again!\n");
1008  break;
1009  } else {
1010  DPRINTF(GPUFetch, "successful!\n");
1011  retries.pop_front();
1012  }
1013  }
1014 }
1015 
1016 void
1017 ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
1018 {
1019  // There must be a way around this check to do the globalMemStart...
1020  Addr tmp_vaddr = pkt->req->getVaddr();
1021 
1022  updatePageDivergenceDist(tmp_vaddr);
1023 
1024  // set PC in request
1025  pkt->req->setPC(gpuDynInst->wavefront()->pc());
1026 
1027  pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());
1028 
1029  // figure out the type of the request to set read/write
1030  BaseTLB::Mode TLB_mode;
1031  assert(pkt->isRead() || pkt->isWrite());
1032 
1033  // only do some things if actually accessing data
1034  bool isDataAccess = pkt->isWrite() || pkt->isRead();
1035 
1036  // Check write before read for atomic operations
1037  // since atomic operations should use BaseTLB::Write
1038  if (pkt->isWrite()) {
1039  TLB_mode = BaseTLB::Write;
1040  } else if (pkt->isRead()) {
1041  TLB_mode = BaseTLB::Read;
1042  } else {
1043  fatal("pkt is not a read nor a write\n");
1044  }
1045 
1046  tlbCycles -= curTick();
1047  ++tlbRequests;
1048 
1049  PortID tlbPort_index = perLaneTLB ? index : 0;
1050 
1051  if (shader->timingSim) {
1052  if (debugSegFault) {
1053             Process *p = shader->gpuTc->getProcessPtr();
1054             Addr vaddr = pkt->req->getVaddr();
1055  unsigned size = pkt->getSize();
1056 
1057  if ((vaddr + size - 1) % 64 < vaddr % 64) {
1058  panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
1059  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
1060  }
1061 
1062  Addr paddr;
1063 
1064  if (!p->pTable->translate(vaddr, paddr)) {
1065  if (!p->fixupFault(vaddr)) {
1066  panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
1067  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1068  vaddr);
1069  }
1070  }
1071  }
1072 
1073  // This is the SenderState needed upon return
1074  pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);
1075 
1076  // This is the senderState needed by the TLB hierarchy to function
1077  TheISA::GpuTLB::TranslationState *translation_state =
1078  new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc, false,
1079  pkt->senderState);
1080 
1081  pkt->senderState = translation_state;
1082 
1083  if (functionalTLB) {
1084  tlbPort[tlbPort_index].sendFunctional(pkt);
1085 
1086  // update the hitLevel distribution
1087  int hit_level = translation_state->hitLevel;
1088  assert(hit_level != -1);
1089  hitsPerTLBLevel[hit_level]++;
1090 
1091  // New SenderState for the memory access
1092  X86ISA::GpuTLB::TranslationState *sender_state =
1093  safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
1094 
1095  delete sender_state->tlbEntry;
1096  delete sender_state->saved;
1097  delete sender_state;
1098 
1099  assert(pkt->req->hasPaddr());
1100  assert(pkt->req->hasSize());
1101 
1102  // this is necessary because the GPU TLB receives packets instead
1103     // of requests. when the translation is complete, all relevant
1104  // fields in the request will be populated, but not in the packet.
1105  // here we create the new packet so we can set the size, addr,
1106  // and proper flags.
1107  PacketPtr oldPkt = pkt;
1108  pkt = new Packet(oldPkt->req, oldPkt->cmd);
1109  if (isDataAccess) {
1110  uint8_t *tmpData = oldPkt->getPtr<uint8_t>();
1111  pkt->dataStatic(tmpData);
1112  }
1113  delete oldPkt;
1114 
1115 
1116  // New SenderState for the memory access
1117  pkt->senderState =
1118  new ComputeUnit::DataPort::SenderState(gpuDynInst, index,
1119  nullptr);
1120 
1121  gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
1122  gpuDynInst->tlbHitLevel[index] = hit_level;
1123 
1124  // translation is done. Schedule the mem_req_event at the
1125  // appropriate cycle to send the timing memory request to ruby
1126  EventFunctionWrapper *mem_req_event =
1127  memPort[index].createMemReqEvent(pkt);
1128 
1129  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
1130  "scheduled\n", cu_id, gpuDynInst->simdId,
1131  gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
1132 
1133  schedule(mem_req_event, curTick() + req_tick_latency);
1134  } else if (tlbPort[tlbPort_index].isStalled()) {
1135  assert(tlbPort[tlbPort_index].retries.size() > 0);
1136 
1137  DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
1138  "failed!\n", cu_id, gpuDynInst->simdId,
1139  gpuDynInst->wfSlotId, tmp_vaddr);
1140 
1141  tlbPort[tlbPort_index].retries.push_back(pkt);
1142  } else if (!tlbPort[tlbPort_index].sendTimingReq(pkt)) {
1143  // Stall the data port;
1144  // No more packet will be issued till
1145  // ruby indicates resources are freed by
1146  // a recvReqRetry() call back on this port.
1147  tlbPort[tlbPort_index].stallPort();
1148 
1149  DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
1150  "failed!\n", cu_id, gpuDynInst->simdId,
1151  gpuDynInst->wfSlotId, tmp_vaddr);
1152 
1153  tlbPort[tlbPort_index].retries.push_back(pkt);
1154  } else {
1155  DPRINTF(GPUTLB,
1156  "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
1157  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
1158  }
1159  } else {
1160  if (pkt->cmd == MemCmd::MemSyncReq) {
1161  gpuDynInst->resetEntireStatusVector();
1162  } else {
1163  gpuDynInst->decrementStatusVector(index);
1164  }
1165 
1166  // New SenderState for the memory access
1167  delete pkt->senderState;
1168 
1169  // Because it's atomic operation, only need TLB translation state
1170  pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode,
1171  shader->gpuTc);
1172 
1173  tlbPort[tlbPort_index].sendFunctional(pkt);
1174 
1175  // the addr of the packet is not modified, so we need to create a new
1176  // packet, or otherwise the memory access will have the old virtual
1177  // address sent in the translation packet, instead of the physical
1178  // address returned by the translation.
1179  PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
1180  new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1181 
1182  // Translation is done. It is safe to send the packet to memory.
1183  memPort[0].sendFunctional(new_pkt);
1184 
1185  DPRINTF(GPUMem, "Functional sendRequest\n");
1186  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
1187  gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
1188  new_pkt->req->getPaddr());
1189 
1190  // safe_cast the senderState
1191  TheISA::GpuTLB::TranslationState *sender_state =
1192  safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
1193 
1194  delete sender_state->tlbEntry;
1195  delete new_pkt;
1196  delete pkt->senderState;
1197  delete pkt;
1198  }
1199 }
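In timing mode, then, a packet leaves sendRequest() along one of three paths: an immediate functional TLB lookup followed by a scheduled memory request (functionalTLB), queueing behind an already-stalled TLB port, or a normal timing translation that stalls the port itself if the request is rejected. In the scheduled cases the memory request is delayed by req_tick_latency, which the constructor computes as mem_req_latency * clockPeriod(); e.g., assuming a 40-cycle mem_req_latency on a 1 GHz clock domain, that is 40 ns of added delay (illustrative numbers, both are configuration parameters).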
1200 
1201 void
1202 ComputeUnit::sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt)
1203 {
1204  assert(pkt->isWrite() || pkt->isRead());
1205 
1206  BaseTLB::Mode tlb_mode = pkt->isRead() ? BaseTLB::Read : BaseTLB::Write;
1207 
1208     pkt->senderState =
1209         new ComputeUnit::ScalarDTLBPort::SenderState(gpuDynInst);
1210 
1211  pkt->senderState =
1212  new TheISA::GpuTLB::TranslationState(tlb_mode, shader->gpuTc, false,
1213  pkt->senderState);
1214 
1215  if (scalarDTLBPort.isStalled()) {
1216  assert(scalarDTLBPort.retries.size());
1217  scalarDTLBPort.retries.push_back(pkt);
1218  } else if (!scalarDTLBPort.sendTimingReq(pkt)) {
1219         scalarDTLBPort.stallPort();
1220         scalarDTLBPort.retries.push_back(pkt);
1221  } else {
1222  DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n",
1223  tlb_mode == BaseTLB::Read ? "read" : "write",
1224  pkt->req->getVaddr());
1225  }
1226 }
1227 
1228 void
1229 ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
1230                                   bool kernelMemSync,
1231  RequestPtr req)
1232 {
1233  assert(gpuDynInst->isGlobalSeg() ||
1234  gpuDynInst->executedAs() == Enums::SC_GLOBAL);
1235 
1236  if (!req) {
1237  req = std::make_shared<Request>(
1238  0, 0, 0, requestorId(), 0, gpuDynInst->wfDynId);
1239  }
1240 
1241  // all mem sync requests have Paddr == 0
1242  req->setPaddr(0);
1243 
1244  PacketPtr pkt = nullptr;
1245 
1246  if (kernelMemSync) {
1247  if (gpuDynInst->isKernelLaunch()) {
1248  req->setCacheCoherenceFlags(Request::ACQUIRE);
1249  req->setReqInstSeqNum(gpuDynInst->seqNum());
1250  req->setFlags(Request::KERNEL);
1251  pkt = new Packet(req, MemCmd::MemSyncReq);
1252  pkt->pushSenderState(
1253  new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1254 
1255  EventFunctionWrapper *mem_req_event =
1256  memPort[0].createMemReqEvent(pkt);
1257 
1258  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1259  "an acquire\n", cu_id, gpuDynInst->simdId,
1260  gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
1261 
1262  schedule(mem_req_event, curTick() + req_tick_latency);
1263  } else {
1264  // kernel end release must be enabled
1265  assert(shader->impl_kern_end_rel);
1266  assert(gpuDynInst->isEndOfKernel());
1267 
1268  req->setCacheCoherenceFlags(Request::WB_L2);
1269  req->setReqInstSeqNum(gpuDynInst->seqNum());
1270  req->setFlags(Request::KERNEL);
1271  pkt = new Packet(req, MemCmd::MemSyncReq);
1272  pkt->pushSenderState(
1273  new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1274 
1275  EventFunctionWrapper *mem_req_event =
1276  memPort[0].createMemReqEvent(pkt);
1277 
1278  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1279  "a release\n", cu_id, gpuDynInst->simdId,
1280  gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
1281 
1282  schedule(mem_req_event, curTick() + req_tick_latency);
1283  }
1284  } else {
1285  gpuDynInst->setRequestFlags(req);
1286 
1287  req->setReqInstSeqNum(gpuDynInst->seqNum());
1288 
1289  pkt = new Packet(req, MemCmd::MemSyncReq);
1290  pkt->pushSenderState(
1291  new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1292 
1293  EventFunctionWrapper *mem_req_event =
1294  memPort[0].createMemReqEvent(pkt);
1295 
1296  DPRINTF(GPUPort,
1297  "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
1298  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
1299  pkt->req->getPaddr());
1300 
1301  schedule(mem_req_event, curTick() + req_tick_latency);
1302  }
1303 }
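All three fence flavors funnel through memPort[0] with a physical address of 0. doInvalidate() above builds the kernel-launch case (ACQUIRE plus KERNEL flags, answered by the MemSyncResp handling in DataPort::recvTimingResp), doFlush() builds the end-of-kernel release (WB_L2 plus KERNEL), and ordinary memory-sync instructions take the final branch with flags derived from the instruction itself via setRequestFlags().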
1304 
1305 void
1307 {
1308  DataPort::SenderState *sender_state =
1309  safe_cast<DataPort::SenderState*>(pkt->senderState);
1310 
1311  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1312  ComputeUnit *compute_unit = computeUnit;
1313 
1314  assert(gpuDynInst);
1315 
1316  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
1317  compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1318  pkt->req->getPaddr(), id);
1319 
1320  Addr paddr = pkt->req->getPaddr();
1321 
1322  // mem sync resp and write-complete callback must be handled already in
1323  // DataPort::recvTimingResp
1324  assert(pkt->cmd != MemCmd::MemSyncResp);
1325  assert(pkt->cmd != MemCmd::WriteCompleteResp);
1326 
1327  // this is for read, write and atomic
1328  int index = gpuDynInst->memStatusVector[paddr].back();
1329 
1330  DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
1331  pkt->req->getPaddr(), id);
1332 
1333  gpuDynInst->memStatusVector[paddr].pop_back();
1334  gpuDynInst->pAddr = pkt->req->getPaddr();
1335 
1336  gpuDynInst->decrementStatusVector(index);
1337  DPRINTF(GPUMem, "bitvector is now %s\n", gpuDynInst->printStatusVector());
1338 
1339  if (gpuDynInst->allLanesZero()) {
1340  auto iter = gpuDynInst->memStatusVector.begin();
1341  auto end = gpuDynInst->memStatusVector.end();
1342 
1343  while (iter != end) {
1344  assert(iter->second.empty());
1345  ++iter;
1346  }
1347 
1348  // Calculate the difference between the arrival of the first cache
1349  // block and the last cache block to arrive if we have the time
1350  // for the first cache block.
1351  if (compute_unit->headTailMap.count(gpuDynInst)) {
1352  Tick headTick = compute_unit->headTailMap.at(gpuDynInst);
1353  compute_unit->headTailLatency.sample(curTick() - headTick);
1354  compute_unit->headTailMap.erase(gpuDynInst);
1355  }
1356 
1357  gpuDynInst->memStatusVector.clear();
1358 
1359  // note: only handle read response here; for write, the response
1360  // is separately handled when writeComplete callback is received
1361  if (pkt->isRead()) {
1362  gpuDynInst->
1363  profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue);
1364  compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
1365 
1366  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
1367  compute_unit->cu_id, gpuDynInst->simdId,
1368  gpuDynInst->wfSlotId);
1369  }
1370  } else {
1371  if (pkt->isRead()) {
1372  if (!compute_unit->headTailMap.count(gpuDynInst)) {
1373  compute_unit->headTailMap
1374  .insert(std::make_pair(gpuDynInst, curTick()));
1375  }
1376  }
1377  }
1378 
1379  delete pkt->senderState;
1380  delete pkt;
1381 }
1382 
1383 ComputeUnit*
1384 ComputeUnitParams::create()
1385 {
1386  return new ComputeUnit(this);
1387 }
1388 
1389 bool
1390 ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
1391 {
1392  Addr line = pkt->req->getPaddr();
1393 
1394  DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
1395  pkt->req->getVaddr(), line);
1396 
1397  assert(pkt->senderState);
1398  computeUnit->tlbCycles += curTick();
1399 
1400  // pop off the TLB translation state
1401  TheISA::GpuTLB::TranslationState *translation_state =
1402  safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
1403 
1404  // no PageFaults are permitted for data accesses
1405  if (!translation_state->tlbEntry) {
1406  DTLBPort::SenderState *sender_state =
1407  safe_cast<DTLBPort::SenderState*>(translation_state->saved);
1408 
1409  Wavefront *w M5_VAR_USED =
1410  computeUnit->wfList[sender_state->_gpuDynInst->simdId]
1411  [sender_state->_gpuDynInst->wfSlotId];
1412 
1413         DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
1414  pkt->req->getVaddr());
1415  }
1416 
1417  // update the hitLevel distribution
1418  int hit_level = translation_state->hitLevel;
1419  computeUnit->hitsPerTLBLevel[hit_level]++;
1420 
1421  delete translation_state->tlbEntry;
1422  assert(!translation_state->ports.size());
1423  pkt->senderState = translation_state->saved;
1424 
1425  // for prefetch pkt
1426  BaseTLB::Mode TLB_mode = translation_state->tlbMode;
1427 
1428  delete translation_state;
1429 
1430  // use the original sender state to know how to close this transaction
1431  DTLBPort::SenderState *sender_state =
1432  safe_cast<DTLBPort::SenderState*>(pkt->senderState);
1433 
1434  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1435  PortID mp_index = sender_state->portIndex;
1436  Addr vaddr = pkt->req->getVaddr();
1437  gpuDynInst->memStatusVector[line].push_back(mp_index);
1438  gpuDynInst->tlbHitLevel[mp_index] = hit_level;
1439 
1440  MemCmd requestCmd;
1441 
1442  if (pkt->cmd == MemCmd::ReadResp) {
1443  requestCmd = MemCmd::ReadReq;
1444  } else if (pkt->cmd == MemCmd::WriteResp) {
1445  requestCmd = MemCmd::WriteReq;
1446  } else if (pkt->cmd == MemCmd::SwapResp) {
1447  requestCmd = MemCmd::SwapReq;
1448  } else {
1449  panic("unsupported response to request conversion %s\n",
1450  pkt->cmd.toString());
1451  }
1452 
1453  if (computeUnit->prefetchDepth) {
1454  int simdId = gpuDynInst->simdId;
1455  int wfSlotId = gpuDynInst->wfSlotId;
1456  Addr last = 0;
1457 
1458  switch(computeUnit->prefetchType) {
1459  case Enums::PF_CU:
1460  last = computeUnit->lastVaddrCU[mp_index];
1461  break;
1462  case Enums::PF_PHASE:
1463  last = computeUnit->lastVaddrSimd[simdId][mp_index];
1464  break;
1465  case Enums::PF_WF:
1466  last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
1467  default:
1468  break;
1469  }
1470 
1471  DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
1472  computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
1473 
1474  int stride = last ? (roundDown(vaddr, TheISA::PageBytes) -
1475                      roundDown(last, TheISA::PageBytes)) >> TheISA::PageShift
1476                      : 0;
1477 
1478  DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
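        // Illustrative arithmetic (not from the source): with 4 KiB x86
        // pages, a previous access to 0x7f0000 followed by this one at
        // 0x7f2000 yields stride = (0x7f2000 - 0x7f0000) >> 12 = 2 pages,
        // so the loop below prefetches vaddr + 2*4KiB, + 4*4KiB, and so
        // on, up to prefetchDepth pages ahead.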
1479 
1480  computeUnit->lastVaddrCU[mp_index] = vaddr;
1481  computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
1482  computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
1483 
1484  stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
1485  computeUnit->prefetchStride: stride;
1486 
1487  DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
1488  computeUnit->cu_id, simdId, wfSlotId, mp_index);
1489 
1490  DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);
1491 
1492  // Prefetch Next few pages atomically
1493  for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
1494  DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
1495                     vaddr + stride * pf * TheISA::PageBytes);
1496 
1497  if (!stride)
1498  break;
1499 
1500  RequestPtr prefetch_req = std::make_shared<Request>(
1501                 vaddr + stride * pf * TheISA::PageBytes,
1502                 sizeof(uint8_t), 0,
1503  computeUnit->requestorId(),
1504  0, 0, nullptr);
1505 
1506  PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
1507  uint8_t foo = 0;
1508  prefetch_pkt->dataStatic(&foo);
1509 
1510  // Because it's atomic operation, only need TLB translation state
1511  prefetch_pkt->senderState =
1512  new TheISA::GpuTLB::TranslationState(TLB_mode,
1513  computeUnit->shader->gpuTc, true);
1514 
1515  // Currently prefetches are zero-latency, hence the sendFunctional
1516  sendFunctional(prefetch_pkt);
1517 
1518  /* safe_cast the senderState */
1519  TheISA::GpuTLB::TranslationState *tlb_state =
1520  safe_cast<TheISA::GpuTLB::TranslationState*>(
1521  prefetch_pkt->senderState);
1522 
1523 
1524  delete tlb_state->tlbEntry;
1525  delete tlb_state;
1526  delete prefetch_pkt;
1527  }
1528  }
1529 
1530  // First we must convert the response cmd back to a request cmd so that
1531  // the request can be sent through the cu's request port
1532  PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
1533  new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1534  delete pkt->senderState;
1535  delete pkt;
1536 
1537  // New SenderState for the memory access
1538  new_pkt->senderState =
1539  new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
1540  nullptr);
1541 
1542  // translation is done. Schedule the mem_req_event at the appropriate
1543  // cycle to send the timing memory request to ruby
1544  EventFunctionWrapper *mem_req_event =
1545  computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);
1546 
1547  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
1548  computeUnit->cu_id, gpuDynInst->simdId,
1549  gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());
1550 
1551  computeUnit->schedule(mem_req_event, curTick() +
1552  computeUnit->req_tick_latency);
1553 
1554  return true;
1555 }
1556 
1557 EventFunctionWrapper*
1558 ComputeUnit::DataPort::createMemReqEvent(PacketPtr pkt)
1559 {
1560  return new EventFunctionWrapper(
1561  [this, pkt]{ processMemReqEvent(pkt); },
1562  "ComputeUnit memory request event", true);
1563 }
1564 
1565 EventFunctionWrapper*
1566 ComputeUnit::DataPort::createMemRespEvent(PacketPtr pkt)
1567 {
1568  return new EventFunctionWrapper(
1569  [this, pkt]{ processMemRespEvent(pkt); },
1570  "ComputeUnit memory response event", true);
1571 }
1572 
1573 void
1574 ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt)
1575 {
1576  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1577  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1578  ComputeUnit *compute_unit M5_VAR_USED = computeUnit;
1579 
1580  if (!(sendTimingReq(pkt))) {
1581  retries.push_back(std::make_pair(pkt, gpuDynInst));
1582 
1583  DPRINTF(GPUPort,
1584  "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
1585  compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1586  id, pkt->req->getPaddr());
1587  } else {
1588  DPRINTF(GPUPort,
1589  "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data "
1590  "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1591  gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id,
1592  pkt->req->getPaddr());
1593  }
1594 }
1595 
1596 const char*
1597 ComputeUnit::ScalarDataPort::MemReqEvent::description() const
1598 {
1599  return "ComputeUnit scalar memory request event";
1600 }
1601 
1602 void
1603 ComputeUnit::ScalarDataPort::MemReqEvent::process()
1604 {
1605  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1606  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1607  ComputeUnit *compute_unit M5_VAR_USED = scalarDataPort.computeUnit;
1608 
1609  if (!(scalarDataPort.sendTimingReq(pkt))) {
1610  scalarDataPort.retries.push_back(pkt);
1611 
1612  DPRINTF(GPUPort,
1613  "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
1614  compute_unit->cu_id, gpuDynInst->simdId,
1615  gpuDynInst->wfSlotId, pkt->req->getPaddr());
1616  } else {
1617  DPRINTF(GPUPort,
1618  "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data "
1619  "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1620  gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
1621  pkt->req->getPaddr());
1622  }
1623 }
1624 
1625 /*
1626  * The initial translation request could have been rejected,
1627  * if <retries> queue is not empty. Retry sending the translation
1628  * request. sendRetry() is called from the peer port whenever
1629  * a translation completes.
1630  */
1631 void
1632 ComputeUnit::DTLBPort::recvReqRetry()
1633 {
1634  int len = retries.size();
1635 
1636  DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
1637  computeUnit->cu_id, len);
1638 
1639  assert(len > 0);
1640  assert(isStalled());
1641  // recvReqRetry is an indication that the resource on which this
1642  // port was stalling on is freed. So, remove the stall first
1643  unstallPort();
1644 
1645  for (int i = 0; i < len; ++i) {
1646  PacketPtr pkt = retries.front();
1647  Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
1648         DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x", computeUnit->cu_id, vaddr);
1649 
1650  if (!sendTimingReq(pkt)) {
1651  // Stall port
1652  stallPort();
1653  DPRINTF(GPUTLB, ": failed again\n");
1654  break;
1655  } else {
1656  DPRINTF(GPUTLB, ": successful\n");
1657  retries.pop_front();
1658  }
1659  }
1660 }
1661 
1662 bool
1663 ComputeUnit::ScalarDTLBPort::recvTimingResp(PacketPtr pkt)
1664 {
1665  assert(pkt->senderState);
1666 
1667  TheISA::GpuTLB::TranslationState *translation_state =
1668  safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
1669 
1670  // Page faults are not allowed
1671  fatal_if(!translation_state->tlbEntry,
1672  "Translation of vaddr %#x failed\n", pkt->req->getVaddr());
1673 
1674  delete translation_state->tlbEntry;
1675  assert(!translation_state->ports.size());
1676 
1677  pkt->senderState = translation_state->saved;
1678  delete translation_state;
1679 
1680  ScalarDTLBPort::SenderState *sender_state =
1681  safe_cast<ScalarDTLBPort::SenderState*>(pkt->senderState);
1682 
1683  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1684  delete pkt->senderState;
1685 
1686  Wavefront *w M5_VAR_USED = gpuDynInst->wavefront();
1687 
1688  DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
1689         "translation: VA %#x -> PA %#x\n", computeUnit->cu_id, w->simdId,
1690  w->wfSlotId, w->kernId, pkt->req->getVaddr(), pkt->req->getPaddr());
1691 
1692  MemCmd mem_cmd;
1693 
1694  if (pkt->cmd == MemCmd::ReadResp) {
1695  mem_cmd = MemCmd::ReadReq;
1696  } else if (pkt->cmd == MemCmd::WriteResp) {
1697  mem_cmd = MemCmd::WriteReq;
1698  } else {
1699         fatal("Scalar DTLB received unexpected MemCmd response %s\n",
1700  pkt->cmd.toString());
1701  }
1702 
1703  PacketPtr req_pkt = new Packet(pkt->req, mem_cmd);
1704  req_pkt->dataStatic(pkt->getPtr<uint8_t>());
1705  delete pkt;
1706 
1707  req_pkt->senderState =
1708         new ComputeUnit::ScalarDataPort::SenderState(gpuDynInst);
1709 
1710  if (!computeUnit->scalarDataPort.sendTimingReq(req_pkt)) {
1711  computeUnit->scalarDataPort.retries.push_back(req_pkt);
1712  DPRINTF(GPUMem, "send scalar req failed for: %s\n",
1713  gpuDynInst->disassemble());
1714  } else {
1715  DPRINTF(GPUMem, "send scalar req for: %s\n",
1716  gpuDynInst->disassemble());
1717  }
1718 
1719  return true;
1720 }
1721 
1722 bool
1723 ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
1724 {
1725  Addr line M5_VAR_USED = pkt->req->getPaddr();
1726  DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
1727  computeUnit->cu_id, pkt->req->getVaddr(), line);
1728 
1729  assert(pkt->senderState);
1730 
1731  // pop off the TLB translation state
1732  TheISA::GpuTLB::TranslationState *translation_state
1733  = safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
1734 
1735  bool success = translation_state->tlbEntry != nullptr;
1736  delete translation_state->tlbEntry;
1737  assert(!translation_state->ports.size());
1738  pkt->senderState = translation_state->saved;
1739  delete translation_state;
1740 
1741  // use the original sender state to know how to close this transaction
1742  ITLBPort::SenderState *sender_state =
1743  safe_cast<ITLBPort::SenderState*>(pkt->senderState);
1744 
1745  // get the wavefront associated with this translation request
1746  Wavefront *wavefront = sender_state->wavefront;
1747  delete pkt->senderState;
1748 
1749  if (success) {
1750  // pkt is reused in fetch(), don't delete it here. However, we must
1751  // reset the command to be a request so that it can be sent through
1752  // the cu's request port
1753  assert(pkt->cmd == MemCmd::ReadResp);
1754  pkt->cmd = MemCmd::ReadReq;
1755 
1756  computeUnit->fetchStage.fetch(pkt, wavefront);
1757  } else {
1758  if (wavefront->dropFetch) {
1759  assert(wavefront->instructionBuffer.empty());
1760  wavefront->dropFetch = false;
1761  }
1762 
1763  wavefront->pendingFetch = 0;
1764  }
1765 
1766  return true;
1767 }
1768 
1769 /*
1770  * The initial translation request could have been rejected, if
1771  * <retries> queue is not empty. Retry sending the translation
1772  * request. sendRetry() is called from the peer port whenever
1773  * a translation completes.
1774  */
1775 void
1776 ComputeUnit::ITLBPort::recvReqRetry()
1777 {
1778 
1779  int len = retries.size();
1780     DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n", computeUnit->cu_id, len);
1781 
1782  assert(len > 0);
1783  assert(isStalled());
1784 
1785  // recvReqRetry is an indication that the resource on which this
1786  // port was stalling on is freed. So, remove the stall first
1787  unstallPort();
1788 
1789  for (int i = 0; i < len; ++i) {
1790  PacketPtr pkt = retries.front();
1791  Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
1792         DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x", computeUnit->cu_id, vaddr);
1793 
1794  if (!sendTimingReq(pkt)) {
1795  stallPort(); // Stall port
1796  DPRINTF(GPUTLB, ": failed again\n");
1797  break;
1798  } else {
1799  DPRINTF(GPUTLB, ": successful\n");
1800  retries.pop_front();
1801  }
1802  }
1803 }
1804 
1805 void
1806 ComputeUnit::regStats()
1807 {
1808     ClockedObject::regStats();
1809 
1810  vALUInsts
1811  .name(name() + ".valu_insts")
1812  .desc("Number of vector ALU insts issued.")
1813  ;
1814     vALUInstsPerWF
1815         .name(name() + ".valu_insts_per_wf")
1816  .desc("The avg. number of vector ALU insts issued per-wavefront.")
1817  ;
1818  sALUInsts
1819  .name(name() + ".salu_insts")
1820  .desc("Number of scalar ALU insts issued.")
1821  ;
1822     sALUInstsPerWF
1823         .name(name() + ".salu_insts_per_wf")
1824  .desc("The avg. number of scalar ALU insts issued per-wavefront.")
1825  ;
1826     instCyclesVALU
1827         .name(name() + ".inst_cycles_valu")
1828  .desc("Number of cycles needed to execute VALU insts.")
1829  ;
1830     instCyclesSALU
1831         .name(name() + ".inst_cycles_salu")
1832  .desc("Number of cycles needed to execute SALU insts.")
1833  ;
1834     threadCyclesVALU
1835         .name(name() + ".thread_cycles_valu")
1836  .desc("Number of thread cycles used to execute vector ALU ops. "
1837  "Similar to instCyclesVALU but multiplied by the number of "
1838  "active threads.")
1839  ;
1840     vALUUtilization
1841         .name(name() + ".valu_utilization")
1842  .desc("Percentage of active vector ALU threads in a wave.")
1843  ;
1844     ldsNoFlatInsts
1845         .name(name() + ".lds_no_flat_insts")
1846  .desc("Number of LDS insts issued, not including FLAT "
1847  "accesses that resolve to LDS.")
1848  ;
1849     ldsNoFlatInstsPerWF
1850         .name(name() + ".lds_no_flat_insts_per_wf")
1851  .desc("The avg. number of LDS insts (not including FLAT "
1852  "accesses that resolve to LDS) per-wavefront.")
1853  ;
1854     flatVMemInsts
1855         .name(name() + ".flat_vmem_insts")
1856  .desc("The number of FLAT insts that resolve to vmem issued.")
1857  ;
1858     flatVMemInstsPerWF
1859         .name(name() + ".flat_vmem_insts_per_wf")
1860  .desc("The average number of FLAT insts that resolve to vmem "
1861  "issued per-wavefront.")
1862  ;
1863  flatLDSInsts
1864  .name(name() + ".flat_lds_insts")
1865  .desc("The number of FLAT insts that resolve to LDS issued.")
1866  ;
1867     flatLDSInstsPerWF
1868         .name(name() + ".flat_lds_insts_per_wf")
1869  .desc("The average number of FLAT insts that resolve to LDS "
1870  "issued per-wavefront.")
1871  ;
1872     vectorMemWrites
1873         .name(name() + ".vector_mem_writes")
1874  .desc("Number of vector mem write insts (excluding FLAT insts).")
1875  ;
1876     vectorMemWritesPerWF
1877         .name(name() + ".vector_mem_writes_per_wf")
1878  .desc("The average number of vector mem write insts "
1879  "(excluding FLAT insts) per-wavefront.")
1880  ;
1881     vectorMemReads
1882         .name(name() + ".vector_mem_reads")
1883  .desc("Number of vector mem read insts (excluding FLAT insts).")
1884  ;
1885     vectorMemReadsPerWF
1886         .name(name() + ".vector_mem_reads_per_wf")
1887  .desc("The avg. number of vector mem read insts (excluding "
1888  "FLAT insts) per-wavefront.")
1889  ;
1890     scalarMemWrites
1891         .name(name() + ".scalar_mem_writes")
1892  .desc("Number of scalar mem write insts.")
1893  ;
1894     scalarMemWritesPerWF
1895         .name(name() + ".scalar_mem_writes_per_wf")
1896  .desc("The average number of scalar mem write insts per-wavefront.")
1897  ;
1898     scalarMemReads
1899         .name(name() + ".scalar_mem_reads")
1900  .desc("Number of scalar mem read insts.")
1901  ;
1902     scalarMemReadsPerWF
1903         .name(name() + ".scalar_mem_reads_per_wf")
1904  .desc("The average number of scalar mem read insts per-wavefront.")
1905  ;
1906 
1907     vALUInstsPerWF = vALUInsts / completedWfs;
1908     sALUInstsPerWF = sALUInsts / completedWfs;
1909     vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
1910     ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
1911     flatVMemInstsPerWF = flatVMemInsts / completedWfs;
1912     flatLDSInstsPerWF = flatLDSInsts / completedWfs;
1913     vectorMemWritesPerWF = vectorMemWrites / completedWfs;
1914     vectorMemReadsPerWF = vectorMemReads / completedWfs;
1915     scalarMemWritesPerWF = scalarMemWrites / completedWfs;
1916     scalarMemReadsPerWF = scalarMemReads / completedWfs;
1917 
1918  vectorMemReadsPerKiloInst
1919  .name(name() + ".vector_mem_reads_per_kilo_inst")
1920  .desc("Number of vector mem reads per kilo-instruction")
1921  ;
1922 
1923  vectorMemWritesPerKiloInst
1924  .name(name() + ".vector_mem_writes_per_kilo_inst")
1925  .desc("Number of vector mem writes per kilo-instruction")
1926  ;
1927 
1928  vectorMemInstsPerKiloInst
1929  .name(name() + ".vector_mem_insts_per_kilo_inst")
1930  .desc("Number of vector mem insts per kilo-instruction")
1931  ;
1933 
1934  scalarMemReadsPerKiloInst
1935  .name(name() + ".scalar_mem_reads_per_kilo_inst")
1936  .desc("Number of scalar mem reads per kilo-instruction")
1937  ;
1938 
1939  scalarMemWritesPerKiloInst
1940  .name(name() + ".scalar_mem_writes_per_kilo_inst")
1941  .desc("Number of scalar mem writes per kilo-instruction")
1942  ;
1943 
1944  scalarMemInstsPerKiloInst
1945  .name(name() + ".scalar_mem_insts_per_kilo_inst")
1946  .desc("Number of scalar mem insts per kilo-instruction")
1947  ;
1950 
1951  instCyclesVMemPerSimd
1952  .init(numVectorALUs)
1953  .name(name() + ".inst_cycles_vector_memory")
1954  .desc("Number of cycles to send address, command, data from VRF to "
1955  "vector memory unit, per SIMD")
1956  ;
1957 
1958  instCyclesScMemPerSimd
1959  .init(numScalarMemUnits)
1960  .name(name() + ".inst_cycles_scalar_memory")
1961  .desc("Number of cycles to send address, command, data from SRF to "
1962  "scalar memory unit, per SIMD")
1963  ;
1964 
1965  instCyclesLdsPerSimd
1966  .init(numVectorSharedMemUnits)
1967  .name(name() + ".inst_cycles_lds")
1968  .desc("Number of cycles to send address, command, data from VRF to "
1969  "LDS unit, per SIMD")
1970  ;
1971 
1972  globalReads
1973  .name(name() + ".global_mem_reads")
1974  .desc("Number of reads to the global segment")
1975  ;
1976  globalWrites
1977  .name(name() + ".global_mem_writes")
1978  .desc("Number of writes to the global segment")
1979  ;
1980  globalMemInsts
1981  .name(name() + ".global_mem_insts")
1982  .desc("Number of memory instructions sent to the global segment")
1983  ;
1985  argReads
1986  .name(name() + ".arg_reads")
1987  .desc("Number of reads to the arg segment")
1988  ;
1989  argWrites
1990  .name(name() + ".arg_writes")
1991  .desc("NUmber of writes to the arg segment")
1992  ;
1993  argMemInsts
1994  .name(name() + ".arg_mem_insts")
1995  .desc("Number of memory instructions sent to the arg segment")
1996  ;
1998  spillReads
1999  .name(name() + ".spill_reads")
2000  .desc("Number of reads to the spill segment")
2001  ;
2002  spillWrites
2003  .name(name() + ".spill_writes")
2004  .desc("Number of writes to the spill segment")
2005  ;
2006  spillMemInsts
2007  .name(name() + ".spill_mem_insts")
2008  .desc("Number of memory instructions sent to the spill segment")
2009  ;
2011  groupReads
2012  .name(name() + ".group_reads")
2013  .desc("Number of reads to the group segment")
2014  ;
2015  groupWrites
2016  .name(name() + ".group_writes")
2017  .desc("Number of writes to the group segment")
2018  ;
2019  groupMemInsts
2020  .name(name() + ".group_mem_insts")
2021  .desc("Number of memory instructions sent to the group segment")
2022  ;
2024  privReads
2025  .name(name() + ".private_reads")
2026  .desc("Number of reads to the private segment")
2027  ;
2028  privWrites
2029  .name(name() + ".private_writes")
2030  .desc("Number of writes to the private segment")
2031  ;
2032  privMemInsts
2033  .name(name() + ".private_mem_insts")
2034  .desc("Number of memory instructions sent to the private segment")
2035  ;
2036 
2037  readonlyReads
2038  .name(name() + ".readonly_reads")
2039  .desc("Number of reads to the readonly segment")
2040  ;
2041  readonlyWrites
2042  .name(name() + ".readonly_writes")
2043  .desc("Number of writes to the readonly segment")
2044  ;
2045  readonlyMemInsts
2046  .name(name() + ".readonly_mem_insts")
2047  .desc("Number of memory instructions sent to the readonly segment")
2048  ;
2050  kernargReads
2051  .name(name() + ".kernarg_reads")
2052  .desc("Number of reads sent to the kernarg segment")
2053  ;
2054  kernargWrites
2055  .name(name() + ".kernarg_writes")
2056  .desc("Number of writes to the kernarg segment")
2057  ;
2058  kernargMemInsts
2059  .name(name() + ".kernarg_mem_insts")
2060  .desc("Number of memory instructions sent to the kernarg segment")
2061  ;
2063 
2064  tlbCycles
2065  .name(name() + ".tlb_cycles")
2066  .desc("total number of cycles for all uncoalesced requests")
2067  ;
2068 
2069  tlbRequests
2070  .name(name() + ".tlb_requests")
2071  .desc("number of uncoalesced requests")
2072  ;
2073 
2074  tlbLatency
2075  .name(name() + ".avg_translation_latency")
2076  .desc("Avg. translation latency for data translations")
2077  ;
2078 
2079  tlbLatency = tlbCycles / tlbRequests;
2080 
2081  hitsPerTLBLevel
2082  .init(4)
2083  .name(name() + ".TLB_hits_distribution")
2084  .desc("TLB hits distribution (0 for page table, x for Lx-TLB")
2085  ;
2086 
2087  // fixed number of TLB levels
2088  for (int i = 0; i < 4; ++i) {
2089  if (!i)
2090  hitsPerTLBLevel.subname(i, "page_table");
2091  else
2092  hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
2093  }
2094 
2095  execRateDist
2096  .init(0, 10, 2)
2097  .name(name() + ".inst_exec_rate")
2098  .desc("Instruction Execution Rate: Number of executed vector "
2099  "instructions per cycle")
2100  ;
2101 
2102  ldsBankConflictDist
2103  .init(0, wfSize(), 2)
2104  .name(name() + ".lds_bank_conflicts")
2105  .desc("Number of bank conflicts per LDS memory packet")
2106  ;
2107 
2108  ldsBankAccesses
2109  .name(name() + ".lds_bank_access_cnt")
2110  .desc("Total number of LDS bank accesses")
2111  ;
2112 
2113  pageDivergenceDist
2114  // A wavefront can touch up to N pages per memory instruction where
2115  // N is equal to the wavefront size
2116  // The number of pages per bin can be configured (here it's 4).
2117  .init(1, wfSize(), 4)
2118  .name(name() + ".page_divergence_dist")
2119  .desc("pages touched per wf (over all mem. instr.)")
2120  ;
2121 
2122  controlFlowDivergenceDist
2123  .init(1, wfSize(), 4)
2124  .name(name() + ".warp_execution_dist")
2125  .desc("number of lanes active per instruction (over all instructions)")
2126  ;
2127 
2128  activeLanesPerGMemInstrDist
2129  .init(1, wfSize(), 4)
2130  .name(name() + ".gmem_lanes_execution_dist")
2131  .desc("number of active lanes per global memory instruction")
2132  ;
2133 
2134  activeLanesPerLMemInstrDist
2135  .init(1, wfSize(), 4)
2136  .name(name() + ".lmem_lanes_execution_dist")
2137  .desc("number of active lanes per local memory instruction")
2138  ;
2139 
2140  numInstrExecuted
2141  .name(name() + ".num_instr_executed")
2142  .desc("number of instructions executed")
2143  ;
2144 
2145  numVecOpsExecuted
2146  .name(name() + ".num_vec_ops_executed")
2147  .desc("number of vec ops executed (e.g. WF size/inst)")
2148  ;
2149 
2150  numVecOpsExecutedF16
2151  .name(name() + ".num_vec_ops_f16_executed")
2152  .desc("number of f16 vec ops executed (e.g. WF size/inst)")
2153  ;
2154 
2155  numVecOpsExecutedF32
2156  .name(name() + ".num_vec_ops_f32_executed")
2157  .desc("number of f32 vec ops executed (e.g. WF size/inst)")
2158  ;
2159 
2160  numVecOpsExecutedF64
2161  .name(name() + ".num_vec_ops_f64_executed")
2162  .desc("number of f64 vec ops executed (e.g. WF size/inst)")
2163  ;
2164 
2165  numVecOpsExecutedFMA16
2166  .name(name() + ".num_vec_ops_fma16_executed")
2167  .desc("number of fma16 vec ops executed (e.g. WF size/inst)")
2168  ;
2169 
2170  numVecOpsExecutedFMA32
2171  .name(name() + ".num_vec_ops_fma32_executed")
2172  .desc("number of fma32 vec ops executed (e.g. WF size/inst)")
2173  ;
2174 
2175  numVecOpsExecutedFMA64
2176  .name(name() + ".num_vec_ops_fma64_executed")
2177  .desc("number of fma64 vec ops executed (e.g. WF size/inst)")
2178  ;
2179 
2180  numVecOpsExecutedMAD16
2181  .name(name() + ".num_vec_ops_mad16_executed")
2182  .desc("number of mad16 vec ops executed (e.g. WF size/inst)")
2183  ;
2184 
2185  numVecOpsExecutedMAD32
2186  .name(name() + ".num_vec_ops_mad32_executed")
2187  .desc("number of mad32 vec ops executed (e.g. WF size/inst)")
2188  ;
2189 
2190  numVecOpsExecutedMAD64
2191  .name(name() + ".num_vec_ops_mad64_executed")
2192  .desc("number of mad64 vec ops executed (e.g. WF size/inst)")
2193  ;
2194 
2195  numVecOpsExecutedMAC16
2196  .name(name() + ".num_vec_ops_mac16_executed")
2197  .desc("number of mac16 vec ops executed (e.g. WF size/inst)")
2198  ;
2199 
2200  numVecOpsExecutedMAC32
2201  .name(name() + ".num_vec_ops_mac32_executed")
2202  .desc("number of mac32 vec ops executed (e.g. WF size/inst)")
2203  ;
2204 
2205  numVecOpsExecutedMAC64
2206  .name(name() + ".num_vec_ops_mac64_executed")
2207  .desc("number of mac64 vec ops executed (e.g. WF size/inst)")
2208  ;
2209 
2210  numVecOpsExecutedTwoOpFP
2211  .name(name() + ".num_vec_ops_two_op_fp_executed")
2212  .desc("number of two op FP vec ops executed (e.g. WF size/inst)")
2213  ;
2214 
2215  totalCycles
2216  .name(name() + ".num_total_cycles")
2217  .desc("number of cycles the CU ran for")
2218  ;
2219 
2220  ipc
2221  .name(name() + ".ipc")
2222  .desc("Instructions per cycle (this CU only)")
2223  ;
2224 
2225  vpc
2226  .name(name() + ".vpc")
2227  .desc("Vector Operations per cycle (this CU only)")
2228  ;
2229 
2230  vpc_f16
2231  .name(name() + ".vpc_f16")
2232  .desc("F16 Vector Operations per cycle (this CU only)")
2233  ;
2234 
2235  vpc_f32
2236  .name(name() + ".vpc_f32")
2237  .desc("F32 Vector Operations per cycle (this CU only)")
2238  ;
2239 
2240  vpc_f64
2241  .name(name() + ".vpc_f64")
2242  .desc("F64 Vector Operations per cycle (this CU only)")
2243  ;
2244 
2245  numALUInstsExecuted
2246  .name(name() + ".num_alu_insts_executed")
2247  .desc("Number of dynamic non-GM memory insts executed")
2248  ;
2249 
2250  wgBlockedDueBarrierAllocation
2251  .name(name() + ".wg_blocked_due_barrier_alloc")
2252  .desc("WG dispatch was blocked due to lack of barrier resources")
2253  ;
2254 
2255  wgBlockedDueLdsAllocation
2256  .name(name() + ".wg_blocked_due_lds_alloc")
2257  .desc("Workgroup blocked due to LDS capacity")
2258  ;
2259 
2259 
2260  ipc = numInstrExecuted / totalCycles;
2261  vpc = numVecOpsExecuted / totalCycles;
2262  vpc_f16 = numVecOpsExecutedF16 / totalCycles;
2263  vpc_f32 = numVecOpsExecutedF32 / totalCycles;
2264  vpc_f64 = numVecOpsExecutedF64 / totalCycles;
2265 
2266  numTimesWgBlockedDueVgprAlloc
2267  .name(name() + ".times_wg_blocked_due_vgpr_alloc")
2268  .desc("Number of times WGs are blocked due to VGPR allocation per "
2269  "SIMD")
2270  ;
2271 
2272  numTimesWgBlockedDueSgprAlloc
2273  .name(name() + ".times_wg_blocked_due_sgpr_alloc")
2274  .desc("Number of times WGs are blocked due to SGPR allocation per "
2275  "SIMD")
2276  ;
2277 
2278  dynamicGMemInstrCnt
2279  .name(name() + ".global_mem_instr_cnt")
2280  .desc("dynamic non-flat global memory instruction count")
2281  ;
2282 
2283  dynamicFlatMemInstrCnt
2284  .name(name() + ".flat_global_mem_instr_cnt")
2285  .desc("dynamic flat global memory instruction count")
2286  ;
2287 
2288  dynamicLMemInstrCnt
2289  .name(name() + ".local_mem_instr_cnt")
2290  .desc("dynamic local memory instruction count")
2291  ;
2292 
2293  numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
2294  dynamicLMemInstrCnt;
2295 
2296  completedWfs
2297  .name(name() + ".num_completed_wfs")
2298  .desc("number of completed wavefronts")
2299  ;
2300 
2301  completedWGs
2302  .name(name() + ".num_completed_wgs")
2303  .desc("number of completed workgroups")
2304  ;
2305 
2306  numCASOps
2307  .name(name() + ".num_CAS_ops")
2308  .desc("number of compare and swap operations")
2309  ;
2310 
2311  numFailedCASOps
2312  .name(name() + ".num_failed_CAS_ops")
2313  .desc("number of compare and swap operations that failed")
2314  ;
2315 
2316  headTailLatency
2317  .init(0, 1000000, 10000)
2318  .name(name() + ".head_tail_latency")
2319  .desc("ticks between first and last cache block arrival at coalescer")
2320  .flags(Stats::pdf | Stats::oneline)
2321  ;
2322 
2323  waveLevelParallelism
2324  .init(0, shader->n_wf * numVectorALUs, 1)
2325  .name(name() + ".wlp")
2326  .desc("wave level parallelism: count of active waves at wave launch")
2327  ;
2328 
2329  instInterleave
2330  .init(numVectorALUs, 0, 20, 1)
2331  .name(name() + ".interleaving")
2332  .desc("Measure of instruction interleaving per SIMD")
2333  ;
2334 
2335  // register stats of pipeline stages
2336  fetchStage.regStats();
2337  scoreboardCheckStage.regStats();
2338  scheduleStage.regStats();
2339  execStage.regStats();
2340 
2341  // register stats of memory pipelines
2342  globalMemoryPipe.regStats();
2343  localMemoryPipe.regStats();
2344  scalarMemoryPipe.regStats();
2345 
2346  registerManager->regStats();
2347 }
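
Most of the Stats::Formula entries registered above are evaluated lazily at stats-dump time from the raw counters, so a zero denominator (e.g. completedWfs before any wavefront retires) is only a dump-time concern. A worked example with plain integers standing in for Stats::Scalar/Formula, using the same expressions (the 64 in vALUUtilization is the wavefront width; all counter values below are assumed samples, purely illustrative):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        uint64_t vALUInsts = 12000, completedWfs = 40;
        uint64_t instCyclesVALU = 15000, threadCyclesVALU = 720000;
        uint64_t numInstrExecuted = 20000, totalCycles = 50000;

        double vALUInstsPerWF = double(vALUInsts) / completedWfs;      // 300.0
        double vALUUtilization =
            (double(threadCyclesVALU) / (64 * instCyclesVALU)) * 100;  // 75%
        double ipc = double(numInstrExecuted) / totalCycles;           // 0.40

        std::printf("%.1f %.1f%% %.2f\n",
                    vALUInstsPerWF, vALUUtilization, ipc);
        return 0;
    }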
2348 
2349 void
2350 ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
2351 {
2352  if (gpuDynInst->isScalar()) {
2353  if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
2354  sALUInsts++;
2355  instCyclesSALU++;
2356  } else if (gpuDynInst->isLoad()) {
2357  scalarMemReads++;
2358  } else if (gpuDynInst->isStore()) {
2359  scalarMemWrites++;
2360  }
2361  } else {
2362  if (gpuDynInst->isALU()) {
2363  shader->total_valu_insts++;
2364  if (shader->total_valu_insts == shader->max_valu_insts) {
2365  exitSimLoop("max vALU insts");
2366  }
2367  vALUInsts++;
2368  instCyclesVALU++;
2369  threadCyclesVALU += gpuDynInst->wavefront()->execMask().count();
2370  } else if (gpuDynInst->isFlat()) {
2371  if (gpuDynInst->isLocalMem()) {
2372  flatLDSInsts++;
2373  } else {
2374  flatVMemInsts++;
2375  }
2376  } else if (gpuDynInst->isLocalMem()) {
2377  ldsNoFlatInsts++;
2378  } else if (gpuDynInst->isLoad()) {
2379  vectorMemReads++;
2380  } else if (gpuDynInst->isStore()) {
2381  vectorMemWrites++;
2382  }
2383 
2384  if (gpuDynInst->isLoad()) {
2385  switch (gpuDynInst->executedAs()) {
2386  case Enums::SC_SPILL:
2387  spillReads++;
2388  break;
2389  case Enums::SC_GLOBAL:
2390  globalReads++;
2391  break;
2392  case Enums::SC_GROUP:
2393  groupReads++;
2394  break;
2395  case Enums::SC_PRIVATE:
2396  privReads++;
2397  break;
2398  case Enums::SC_READONLY:
2399  readonlyReads++;
2400  break;
2401  case Enums::SC_KERNARG:
2402  kernargReads++;
2403  break;
2404  case Enums::SC_ARG:
2405  argReads++;
2406  break;
2407  case Enums::SC_NONE:
2408  /**
2409  * this case can occur for flat mem insts
2410  * who execute with EXEC = 0
2411  */
2412  break;
2413  default:
2414  fatal("%s has no valid segment\n", gpuDynInst->disassemble());
2415  break;
2416  }
2417  } else if (gpuDynInst->isStore()) {
2418  switch (gpuDynInst->executedAs()) {
2419  case Enums::SC_SPILL:
2420  spillWrites++;
2421  break;
2422  case Enums::SC_GLOBAL:
2423  globalWrites++;
2424  break;
2425  case Enums::SC_GROUP:
2426  groupWrites++;
2427  break;
2428  case Enums::SC_PRIVATE:
2429  privWrites++;
2430  break;
2431  case Enums::SC_READONLY:
2432  readonlyWrites++;
2433  break;
2434  case Enums::SC_KERNARG:
2435  kernargWrites++;
2436  break;
2437  case Enums::SC_ARG:
2438  argWrites++;
2439  break;
2440  case Enums::SC_NONE:
2441  /**
2442  * this case can occur for flat mem insts
2443  * who execute with EXEC = 0
2444  */
2445  break;
2446  default:
2447  fatal("%s has no valid segment\n", gpuDynInst->disassemble());
2448  break;
2449  }
2450  }
2451  }
2452 }
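
The classification above is order-sensitive: waitcnts are excluded from the scalar-ALU count, vector ALU is tested before any memory predicate, and FLAT instructions are split by where their addresses actually resolved (LDS vs. vmem) before the plain LDS/load/store buckets are tried. A condensed, self-contained restatement of that cascade (the Inst struct and predicate names are illustrative stand-ins, not the GPUDynInst interface):

    enum class Kind { SALU, ScalarLoad, ScalarStore, VALU, FlatLDS,
                      FlatVMem, LDS, VecLoad, VecStore, Other };

    struct Inst {
        bool scalar, alu, waitcnt, load, store, flat, localMem;
    };

    Kind classify(const Inst &in)
    {
        if (in.scalar) {
            if (in.alu && !in.waitcnt) return Kind::SALU;
            if (in.load) return Kind::ScalarLoad;
            if (in.store) return Kind::ScalarStore;
            return Kind::Other;
        }
        if (in.alu) return Kind::VALU;          // before memory predicates
        if (in.flat)                            // split by resolved space
            return in.localMem ? Kind::FlatLDS : Kind::FlatVMem;
        if (in.localMem) return Kind::LDS;      // LDS, excluding FLAT
        if (in.load) return Kind::VecLoad;
        if (in.store) return Kind::VecStore;
        return Kind::Other;
    }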
2453 
2454 void
2455 ComputeUnit::updatePageDivergenceDist(Addr addr)
2456 {
2457  Addr virt_page_addr = roundDown(addr, TheISA::PageBytes);
2458 
2459  if (!pagesTouched.count(virt_page_addr))
2460  pagesTouched[virt_page_addr] = 1;
2461  else
2462  pagesTouched[virt_page_addr]++;
2463 }
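
roundDown(addr, TheISA::PageBytes) simply strips the page offset, so every lane address that falls within one page increments a single pagesTouched entry. A small self-contained check of that arithmetic, assuming x86's 4 KiB pages:

    #include <cassert>
    #include <cstdint>

    // Same semantics as gem5's roundDown for a power-of-two alignment.
    constexpr uint64_t roundDown(uint64_t val, uint64_t align)
    {
        return val - (val % align);
    }

    int main()
    {
        constexpr uint64_t PageBytes = 4096;   // assumed x86 page size
        assert(roundDown(0x10000F00ULL, PageBytes) == 0x10000000ULL);
        assert(roundDown(0x10000FFFULL, PageBytes) == 0x10000000ULL); // same page
        assert(roundDown(0x10001000ULL, PageBytes) == 0x10001000ULL); // next page
        return 0;
    }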
2464 
2465 void
2466 ComputeUnit::exitCallback()
2467 {
2468  if (countPages) {
2469  std::ostream *page_stat_file = simout.create(name().c_str())->stream();
2470 
2471  *page_stat_file << "page, wavefront accesses, workitem accesses" <<
2472  std::endl;
2473 
2474  for (auto iter : pageAccesses) {
2475  *page_stat_file << std::hex << iter.first << ",";
2476  *page_stat_file << std::dec << iter.second.first << ",";
2477  *page_stat_file << std::dec << iter.second.second << std::endl;
2478  }
2479  }
2480 }
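
When countPages is enabled, exitCallback() emits one CSV file per CU (named after the SimObject) with a row per touched page: the page base address in hex, then the wavefront-level and workitem-level access counts in decimal. A hypothetical excerpt (values invented for illustration) might look like:

    page, wavefront accesses, workitem accesses
    ff7f6000,12,768
    ff7f7000,3,192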
2481 
2482 bool
2483 ComputeUnit::isDone() const
2484 {
2485  for (int i = 0; i < numVectorALUs; ++i) {
2486  if (!isVectorAluIdle(i)) {
2487  return false;
2488  }
2489  }
2490 
2491  // TODO: FIXME if more than 1 of any memory pipe supported
2492  if (!srfToScalarMemPipeBus.rdy()) {
2493  return false;
2494  }
2495  if (!vrfToGlobalMemPipeBus.rdy()) {
2496  return false;
2497  }
2498  if (!vrfToLocalMemPipeBus.rdy()) {
2499  return false;
2500  }
2501 
2502  if (!globalMemoryPipe.isGMReqFIFOWrRdy()
2503  || !localMemoryPipe.isLMReqFIFOWrRdy()
2504  || !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy()
2505  || !glbMemToVrfBus.rdy() || !scalarMemToSrfBus.rdy()) {
2506  return false;
2507  }
2508 
2509  return true;
2510 }
2511 
2512 int32_t
2513 ComputeUnit::getRefCounter(const uint32_t dispatchId,
2514  const uint32_t wgId) const
2515 {
2516  return lds.getRefCounter(dispatchId, wgId);
2517 }
2518 
2519 bool
2520 ComputeUnit::isVectorAluIdle(uint32_t simdId) const
2521 {
2522  assert(simdId < numVectorALUs);
2523 
2524  for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){
2525  if (wfList[simdId][i_wf]->getStatus() != Wavefront::S_STOPPED) {
2526  return false;
2527  }
2528  }
2529 
2530  return true;
2531 }
2532 
2533 /**
2534  * send a general request to the LDS
2535  * make sure to look at the return value here as your request might be
2536  * NACK'd and returning false means that you have to have some retry mechanism
2537  */
2538 bool
2539 ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
2540 {
2541  // this is just a request to carry the GPUDynInstPtr
2542  // back and forth
2543  RequestPtr newRequest = std::make_shared<Request>();
2544  newRequest->setPaddr(0x0);
2545 
2546  // ReadReq is not evaluated by the LDS but the Packet ctor requires this
2547  PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq);
2548 
2549  // This is the SenderState needed upon return
2550  newPacket->senderState = new LDSPort::SenderState(gpuDynInst);
2551 
2552  return ldsPort.sendTimingReq(newPacket);
2553 }
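
Because sendToLds() is declared warn_unused_result, callers are expected to branch on the outcome rather than fire and forget; a false return means the packet could not go out and some retry mechanism must hold onto it. A hedged caller sketch with stand-in types (Request, deferred), not the actual pipeline code:

    #include <queue>

    struct Request {};

    [[nodiscard]] bool sendToLds(const Request &);  // may be NACK'd

    std::queue<Request> deferred;   // caller-side retry mechanism (assumed)

    void issue(const Request &req)
    {
        if (!sendToLds(req)) {
            deferred.push(req);     // NACK'd: keep it for a retry pass
        }
    }

    // Trivial stub so the sketch is self-contained.
    bool sendToLds(const Request &) { return true; }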
2554 
2555 /**
2556  * get the result of packets sent to the LDS when they return
2557  */
2558 bool
2559 ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
2560 {
2561  const ComputeUnit::LDSPort::SenderState *senderState =
2562  dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState);
2563 
2564  fatal_if(!senderState, "did not get the right sort of sender state");
2565 
2566  GPUDynInstPtr gpuDynInst = senderState->getMemInst();
2567 
2568  delete packet->senderState;
2569  delete packet;
2570 
2571  computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
2572  return true;
2573 }
2574 
2575 /**
2576  * attempt to send this packet, either the port is already stalled, the
2577  * request is nack'd and must stall or the request goes through
2578  * when a request cannot be sent, add it to the retries queue
2579  */
2580 bool
2581 ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
2582 {
2583  ComputeUnit::LDSPort::SenderState *sender_state =
2584  dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
2585  fatal_if(!sender_state, "packet without a valid sender state");
2586 
2587  GPUDynInstPtr gpuDynInst M5_VAR_USED = sender_state->getMemInst();
2588 
2589  if (isStalled()) {
2590  fatal_if(retries.empty(), "must have retries waiting to be stalled");
2591 
2592  retries.push(pkt);
2593 
2594  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
2595  computeUnit->cu_id, gpuDynInst->simdId,
2596  gpuDynInst->wfSlotId);
2597  return false;
2598  } else if (!RequestPort::sendTimingReq(pkt)) {
2599  // need to stall the LDS port until a recvReqRetry() is received
2600  // this indicates that there is more space
2601  stallPort();
2602  retries.push(pkt);
2603 
2604  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
2605  computeUnit->cu_id, gpuDynInst->simdId,
2606  gpuDynInst->wfSlotId, pkt->req->getPaddr());
2607  return false;
2608  } else {
2609  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
2610  computeUnit->cu_id, gpuDynInst->simdId,
2611  gpuDynInst->wfSlotId, pkt->req->getPaddr());
2612  return true;
2613  }
2614 }
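
The send path above distinguishes three outcomes: the port is already stalled (queue the packet behind earlier failures and report failure), the peer NACKs a fresh send (stall the port and queue the packet), or the send succeeds. A standalone sketch of that three-way decision, with hypothetical names (StallingPort, peerAccepts) rather than gem5's RequestPort API:

    #include <queue>

    struct Pkt {};

    struct StallingPort
    {
        std::queue<Pkt *> retries;
        bool stalled = false;

        // Stub for the underlying timing send; false models a NACK.
        bool peerAccepts(Pkt *) { return true; }

        bool send(Pkt *pkt)
        {
            if (stalled) {          // keep ordering behind earlier failures
                retries.push(pkt);
                return false;
            }
            if (!peerAccepts(pkt)) {
                stalled = true;     // first NACK stalls the port
                retries.push(pkt);
                return false;
            }
            return true;            // accepted by the peer
        }
    };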
2615 
2616 /**
2617  * the bus is telling the port that there is now space so retrying stalled
2618  * requests should work now
2619  * this allows the port to have a request be nack'd and then have the receiver
2620  * say when there is space, rather than simply retrying the send every cycle
2621  */
2622 void
2623 ComputeUnit::LDSPort::recvReqRetry()
2624 {
2625  auto queueSize = retries.size();
2626 
2627  DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
2628  computeUnit->cu_id, queueSize);
2629 
2630  fatal_if(queueSize < 1,
2631  "why was there a recvReqRetry() with no pending reqs?");
2632  fatal_if(!isStalled(),
2633  "recvReqRetry() happened when the port was not stalled");
2634 
2635  unstallPort();
2636 
2637  while (!retries.empty()) {
2638  PacketPtr packet = retries.front();
2639 
2640  DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);
2641 
2642  if (!RequestPort::sendTimingReq(packet)) {
2643  // Stall port
2644  stallPort();
2645  DPRINTF(GPUPort, ": LDS send failed again\n");
2646  break;
2647  } else {
2648  DPRINTF(GPUPort, ": LDS send successful\n");
2649  retries.pop();
2650  }
2651  }
2652 }