gem5  v21.0.0.0
compute_unit.cc
1 /*
2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * For use for simulation and test purposes only
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright notice,
11  * this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  * this list of conditions and the following disclaimer in the documentation
15  * and/or other materials provided with the distribution.
16  *
17  * 3. Neither the name of the copyright holder nor the names of its
18  * contributors may be used to endorse or promote products derived from this
19  * software without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "gpu-compute/compute_unit.hh"
35 
36 #include <limits>
37 
38 #include "arch/x86/isa_traits.hh"
39 #include "base/output.hh"
40 #include "debug/GPUDisp.hh"
41 #include "debug/GPUExec.hh"
42 #include "debug/GPUFetch.hh"
43 #include "debug/GPUMem.hh"
44 #include "debug/GPUPort.hh"
45 #include "debug/GPUPrefetch.hh"
46 #include "debug/GPUReg.hh"
47 #include "debug/GPURename.hh"
48 #include "debug/GPUSync.hh"
49 #include "debug/GPUTLB.hh"
50 #include "gpu-compute/dispatcher.hh"
51 #include "gpu-compute/gpu_dyn_inst.hh"
52 #include "gpu-compute/gpu_static_inst.hh"
53 #include "gpu-compute/scalar_register_file.hh"
54 #include "gpu-compute/shader.hh"
55 #include "gpu-compute/simple_pool_manager.hh"
56 #include "gpu-compute/vector_register_file.hh"
57 #include "gpu-compute/wavefront.hh"
58 #include "mem/page_table.hh"
59 #include "sim/process.hh"
60 #include "sim/sim_exit.hh"
61 
62 ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p),
63  numVectorGlobalMemUnits(p.num_global_mem_pipes),
64  numVectorSharedMemUnits(p.num_shared_mem_pipes),
65  numScalarMemUnits(p.num_scalar_mem_pipes),
66  numVectorALUs(p.num_SIMDs),
67  numScalarALUs(p.num_scalar_cores),
68  vrfToCoalescerBusWidth(p.vrf_to_coalescer_bus_width),
69  coalescerToVrfBusWidth(p.coalescer_to_vrf_bus_width),
70  registerManager(p.register_manager),
71  fetchStage(p, *this),
72  scoreboardCheckStage(p, *this, scoreboardCheckToSchedule),
73  scheduleStage(p, *this, scoreboardCheckToSchedule, scheduleToExecute),
74  execStage(p, *this, scheduleToExecute),
75  globalMemoryPipe(p, *this),
76  localMemoryPipe(p, *this),
77  scalarMemoryPipe(p, *this),
78  tickEvent([this]{ exec(); }, "Compute unit tick event",
79  false, Event::CPU_Tick_Pri),
80  cu_id(p.cu_id),
81  vrf(p.vector_register_file), srf(p.scalar_register_file),
82  simdWidth(p.simd_width),
83  spBypassPipeLength(p.spbypass_pipe_length),
84  dpBypassPipeLength(p.dpbypass_pipe_length),
85  scalarPipeStages(p.scalar_pipe_length),
86  operandNetworkLength(p.operand_network_length),
87  issuePeriod(p.issue_period),
88  vrf_gm_bus_latency(p.vrf_gm_bus_latency),
89  srf_scm_bus_latency(p.srf_scm_bus_latency),
90  vrf_lm_bus_latency(p.vrf_lm_bus_latency),
91  perLaneTLB(p.perLaneTLB), prefetchDepth(p.prefetch_depth),
92  prefetchStride(p.prefetch_stride), prefetchType(p.prefetch_prev_type),
93  debugSegFault(p.debugSegFault),
94  functionalTLB(p.functionalTLB), localMemBarrier(p.localMemBarrier),
95  countPages(p.countPages),
96  req_tick_latency(p.mem_req_latency * p.clk_domain->clockPeriod()),
97  resp_tick_latency(p.mem_resp_latency * p.clk_domain->clockPeriod()),
98  _requestorId(p.system->getRequestorId(this, "ComputeUnit")),
99  lds(*p.localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
100  ldsPort(csprintf("%s-port", name()), this),
101  scalarDataPort(csprintf("%s-port", name()), this),
102  scalarDTLBPort(csprintf("%s-port", name()), this),
103  sqcPort(csprintf("%s-port", name()), this),
104  sqcTLBPort(csprintf("%s-port", name()), this),
105  _cacheLineSize(p.system->cacheLineSize()),
106  _numBarrierSlots(p.num_barrier_slots),
107  globalSeqNum(0), wavefrontSize(p.wf_size),
108  scoreboardCheckToSchedule(p),
109  scheduleToExecute(p),
110  stats(this, p.n_wf)
111 {
121  fatal_if(p.wf_size > std::numeric_limits<unsigned long long>::digits ||
122  p.wf_size <= 0,
123  "WF size is larger than the host can support");
124  fatal_if(!isPowerOf2(wavefrontSize),
125  "Wavefront size should be a power of 2");
126  // calculate how many cycles a vector load or store will need to transfer
127  // its data over the corresponding buses
128  numCyclesPerStoreTransfer =
129  (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
130  (double)vrfToCoalescerBusWidth);
131 
132  numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
133  / coalescerToVrfBusWidth;
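    // Worked example (assuming the common configuration of a 64-lane
    // wavefront and 32-byte buses; both widths are parameters, so actual
    // values vary): a store moves 64 * 4 = 256 bytes over the
    // VRF-to-coalescer bus, i.e. ceil(256 / 32) = 8 cycles, and the load
    // path divides the same way over the coalescer-to-VRF bus.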
134 
135  // Initialization: all WF slots are assumed STOPPED
136  idleWfs = p.n_wf * numVectorALUs;
137  lastVaddrWF.resize(numVectorALUs);
138  wfList.resize(numVectorALUs);
139 
140  wfBarrierSlots.resize(p.num_barrier_slots, WFBarrier());
141 
142  for (int i = 0; i < p.num_barrier_slots; ++i) {
143  freeBarrierIds.insert(i);
144  }
145 
146  for (int j = 0; j < numVectorALUs; ++j) {
147  lastVaddrWF[j].resize(p.n_wf);
148 
149  for (int i = 0; i < p.n_wf; ++i) {
150  lastVaddrWF[j][i].resize(wfSize());
151 
152  wfList[j].push_back(p.wavefronts[j * p.n_wf + i]);
153  wfList[j][i]->setParent(this);
154 
155  for (int k = 0; k < wfSize(); ++k) {
156  lastVaddrWF[j][i][k] = 0;
157  }
158  }
159  }
160 
161  lastVaddrSimd.resize(numVectorALUs);
162 
163  for (int i = 0; i < numVectorALUs; ++i) {
164  lastVaddrSimd[i].resize(wfSize(), 0);
165  }
166 
167  lastVaddrCU.resize(wfSize());
168 
169  lds.setParent(this);
170 
171  if (p.execPolicy == "OLDEST-FIRST") {
172  exec_policy = EXEC_POLICY::OLDEST;
173  } else if (p.execPolicy == "ROUND-ROBIN") {
174  exec_policy = EXEC_POLICY::RR;
175  } else {
176  fatal("Invalid WF execution policy (CU)\n");
177  }
178 
179  for (int i = 0; i < p.port_memory_port_connection_count; ++i) {
180  memPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
181  }
182 
183  for (int i = 0; i < p.port_translation_port_connection_count; ++i) {
184  tlbPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
185  }
186 
187  // Setup tokens for response ports. The number of tokens in memPortTokens
188  // is the total token count for the entire vector port (i.e., this CU).
189  memPortTokens = new TokenManager(p.max_cu_tokens);
190 
191  registerExitCallback([this]() { exitCallback(); });
192 
193  lastExecCycle.resize(numVectorALUs, 0);
194 
195  for (int i = 0; i < vrf.size(); ++i) {
196  vrf[i]->setParent(this);
197  }
198  for (int i = 0; i < srf.size(); ++i) {
199  srf[i]->setParent(this);
200  }
201  numVecRegsPerSimd = vrf[0]->numRegs();
202  numScalarRegsPerSimd = srf[0]->numRegs();
203 
204  registerManager->setParent(this);
205 
206  activeWaves = 0;
207 
208  instExecPerSimd.resize(numVectorALUs, 0);
209 
210  // Calculate the number of bits to address a cache line
211  panic_if(!isPowerOf2(_cacheLineSize),
212  "Cache line size should be a power of two.");
213  cacheLineBits = floorLog2(_cacheLineSize);
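    // e.g., with 64-byte cache lines, cacheLineBits = floorLog2(64) = 6,
    // the number of offset bits within a line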
214 }
215 
216 ComputeUnit::~ComputeUnit()
217 {
218  // Delete wavefront slots
219  for (int j = 0; j < numVectorALUs; ++j) {
220  for (int i = 0; i < shader->n_wf; ++i) {
221  delete wfList[j][i];
222  }
223  lastVaddrSimd[j].clear();
224  }
225  lastVaddrCU.clear();
226 }
227 
228 int
229 ComputeUnit::numExeUnits() const
230 {
231  return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits +
232  numVectorSharedMemUnits + numScalarMemUnits;
233 }
234 
235 // index into readyList of the first memory unit
236 int
237 ComputeUnit::firstMemUnit() const
238 {
239  return numVectorALUs + numScalarALUs;
240 }
241 
242 // index into readyList of the last memory unit
243 int
244 ComputeUnit::lastMemUnit() const
245 {
246  return numExeUnits() - 1;
247 }
248 
249 // index into scalarALUs vector of SALU used by the wavefront
250 int
251 ComputeUnit::mapWaveToScalarAlu(Wavefront *w) const
252 {
253  if (numScalarALUs == 1) {
254  return 0;
255  } else {
256  return w->simdId % numScalarALUs;
257  }
258 }
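    // e.g., with 4 scalar ALUs, wavefronts on SIMDs 1 and 5 both map to
    // SALU 1; the modulo simply round-robins SIMDs over the available SALUs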
259 
260 // index into readyList of Scalar ALU unit used by wavefront
261 int
262 ComputeUnit::mapWaveToScalarAluGlobalIdx(Wavefront *w) const
263 {
264  return mapWaveToScalarAlu(w) + numVectorALUs;
265 }
266 
267 // index into readyList of Global Memory unit used by wavefront
268 int
269 ComputeUnit::mapWaveToGlobalMem(Wavefront *w) const
270 {
271  // TODO: FIXME if more than 1 GM pipe supported
272  return numVectorALUs + numScalarALUs;
273 }
274 
275 // index into readyList of Local Memory unit used by wavefront
276 int
277 ComputeUnit::mapWaveToLocalMem(Wavefront *w) const
278 {
279  // TODO: FIXME if more than 1 LM pipe supported
280  return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits;
281 }
282 
283 // index into readyList of Scalar Memory unit used by wavefront
284 int
285 ComputeUnit::mapWaveToScalarMem(Wavefront *w) const
286 {
287  // TODO: FIXME if more than 1 ScM pipe supported
288  return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits +
289  numVectorSharedMemUnits;
290 }
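    // Taken together, the mapping functions above imply this readyList
    // layout (assuming a single pipe per memory type, as the FIXMEs note):
    // indices [0, numVectorALUs) are vector ALUs, the next numScalarALUs
    // entries are scalar ALUs, followed by one slot each for the global,
    // local (LDS), and scalar memory pipelines.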
291 
292 void
293 ComputeUnit::fillKernelState(Wavefront *w, HSAQueueEntry *task)
294 {
295  w->resizeRegFiles(task->numVectorRegs(), task->numScalarRegs());
296  w->workGroupSz[0] = task->wgSize(0);
297  w->workGroupSz[1] = task->wgSize(1);
298  w->workGroupSz[2] = task->wgSize(2);
299  w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
300  w->gridSz[0] = task->gridSize(0);
301  w->gridSz[1] = task->gridSize(1);
302  w->gridSz[2] = task->gridSize(2);
303  w->computeActualWgSz(task);
304 }
305 
306 void
307 ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
308  HSAQueueEntry *task, int bar_id, bool fetchContext)
309 {
310  static int _n_wave = 0;
311 
312  VectorMask init_mask;
313  init_mask.reset();
314 
315  for (int k = 0; k < wfSize(); ++k) {
316  if (k + waveId * wfSize() < w->actualWgSzTotal)
317  init_mask[k] = 1;
318  }
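    // e.g., a 100-work-item WG with wfSize() == 64 gives wave 0 a full
    // execution mask and wave 1 a mask with only lanes 0-35 set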
319 
320  w->execMask() = init_mask;
321 
322  w->kernId = task->dispatchId();
323  w->wfId = waveId;
324  w->initMask = init_mask.to_ullong();
325 
326  if (bar_id > WFBarrier::InvalidID) {
327  w->barrierId(bar_id);
328  } else {
329  assert(!w->hasBarrier());
330  }
331 
332  for (int k = 0; k < wfSize(); ++k) {
333  w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
334  w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
335  w->actualWgSz[1];
336  w->workItemId[2][k] = (k + waveId * wfSize()) /
337  (w->actualWgSz[0] * w->actualWgSz[1]);
338 
339  w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
340  w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
341  w->workItemId[0][k];
342  }
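    // Worked example (assuming wfSize() == 64 and actualWgSz = {16, 4, 4}):
    // lane k = 17 of wave 0 gets workItemId = (17 % 16, (17 / 16) % 4,
    // 17 / 64) = (1, 1, 0) and workItemFlatId = 0*64 + 1*16 + 1 = 17,
    // i.e. the flat ID recovers the linear position within the WG.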
343 
344  // WG state
345  w->wgId = task->globalWgId();
346  w->dispatchId = task->dispatchId();
347  w->workGroupId[0] = w->wgId % task->numWg(0);
348  w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1);
349  w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1));
350 
351  // set the wavefront context to have a pointer to this section of the LDS
352  w->ldsChunk = ldsChunk;
353 
354  M5_VAR_USED int32_t refCount =
355  lds.increaseRefCounter(w->dispatchId, w->wgId);
356  DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
357  cu_id, w->wgId, refCount);
358 
359  w->instructionBuffer.clear();
360 
361  if (w->pendingFetch)
362  w->dropFetch = true;
363 
364  DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
365  "WF[%d][%d]. Ref cnt:%d\n", _n_wave, w->barrierId(), cu_id,
366  w->simdId, w->wfSlotId, refCount);
367 
368  w->initRegState(task, w->actualWgSzTotal);
369  w->start(_n_wave++, task->codeAddr());
370 
372  activeWaves++;
373 }
374 
380 void
381 ComputeUnit::doInvalidate(RequestPtr req, int kernId){
382  GPUDynInstPtr gpuDynInst
383  = std::make_shared<GPUDynInst>(this, nullptr,
384  new KernelLaunchStaticInst(), getAndIncSeqNum());
385 
386  // kern_id will be used in inv responses
387  gpuDynInst->kern_id = kernId;
388  // update contextId field
389  req->setContext(gpuDynInst->wfDynId);
390 
391  injectGlobalMemFence(gpuDynInst, true, req);
392 }
393 
399 void
400 ComputeUnit::doFlush(GPUDynInstPtr gpuDynInst) {
401  injectGlobalMemFence(gpuDynInst, true);
402 }
403 
404 // resetting SIMD register pools
405 // I couldn't think of any other place and
406 // I think it is needed in my implementation
407 void
408 ComputeUnit::resetRegisterPool()
409 {
410  for (int i=0; i<numVectorALUs; i++)
411  {
412  registerManager->vrfPoolMgrs[i]->resetRegion(numVecRegsPerSimd);
413  registerManager->srfPoolMgrs[i]->resetRegion(numScalarRegsPerSimd);
414  }
415 }
416 
417 void
418 ComputeUnit::dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg)
419 {
420  // If we aren't ticking, start it up!
421  if (!tickEvent.scheduled()) {
422  DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id);
423  schedule(tickEvent, nextCycle());
424  }
425 
426  // the kernel's invalidate must have finished before any wg dispatch
427  assert(task->isInvDone());
428 
429  // reserve the LDS capacity allocated to the work group
430  // disambiguated by the dispatch ID and workgroup ID, which should be
431  // globally unique
432  LdsChunk *ldsChunk = lds.reserveSpace(task->dispatchId(),
433  task->globalWgId(),
434  task->ldsSize());
435 
436  panic_if(!ldsChunk, "was not able to reserve space for this WG");
437 
438  // calculate the number of 32-bit vector registers required
439  // by each work item
440  int vregDemand = task->numVectorRegs();
441  int sregDemand = task->numScalarRegs();
442  int wave_id = 0;
443 
444  int barrier_id = WFBarrier::InvalidID;
445 
450  if (num_wfs_in_wg > 1) {
455  barrier_id = getFreeBarrierId();
456  auto &wf_barrier = barrierSlot(barrier_id);
457  assert(!wf_barrier.maxBarrierCnt());
458  assert(!wf_barrier.numAtBarrier());
459  wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);
460 
461  DPRINTF(GPUSync, "CU[%d] - Dispatching WG with barrier Id%d. "
462  "%d waves using this barrier.\n", cu_id, barrier_id,
463  num_wfs_in_wg);
464  }
465 
466  // Assign WFs according to numWfsToSched vector, which is computed by
467  // hasDispResources()
468  for (int j = 0; j < shader->n_wf; ++j) {
469  for (int i = 0; i < numVectorALUs; ++i) {
470  Wavefront *w = wfList[i][j];
471  // Check if this wavefront slot is available and there are WFs
472  // remaining to be dispatched to current SIMD:
473  // WF slot must be stopped and not waiting
474  // for a release to complete S_RETURNING
475  if (w->getStatus() == Wavefront::S_STOPPED &&
476  numWfsToSched[i] > 0) {
477  // decrement number of WFs awaiting dispatch to current SIMD
478  numWfsToSched[i] -= 1;
479 
480  fillKernelState(w, task);
481 
482  DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] "
483  "vregDemand[%d] sregDemand[%d]\n", i, j, w->wfDynId,
484  vregDemand, sregDemand);
485 
486  registerManager->allocateRegisters(w, vregDemand, sregDemand);
487 
488  startWavefront(w, wave_id, ldsChunk, task, barrier_id);
489  ++wave_id;
490  }
491  }
492  }
493 }
494 
495 void
496 ComputeUnit::insertInPipeMap(Wavefront *w)
497 {
498  panic_if(w->instructionBuffer.empty(),
499  "Instruction Buffer of WF%d can't be empty", w->wgId);
500  GPUDynInstPtr ii = w->instructionBuffer.front();
501  pipeMap.emplace(ii->seqNum());
502 }
503 
504 void
505 ComputeUnit::deleteFromPipeMap(Wavefront *w)
506 {
507  panic_if(w->instructionBuffer.empty(),
508  "Instruction Buffer of WF%d can't be empty", w->wgId);
509  GPUDynInstPtr ii = w->instructionBuffer.front();
510  // delete the dynamic instruction from the pipeline map
511  auto it = pipeMap.find(ii->seqNum());
512  panic_if(it == pipeMap.end(), "Pipeline Map is empty\n");
513  pipeMap.erase(it);
514 }
515 
516 bool
517 ComputeUnit::hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg)
518 {
519  // compute true size of workgroup (after clamping to grid size)
520  int trueWgSize[HSAQueueEntry::MAX_DIM];
521  int trueWgSizeTotal = 1;
522 
523  for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
524  trueWgSize[d] = std::min(task->wgSize(d), task->gridSize(d) -
525  task->wgId(d) * task->wgSize(d));
526 
527  trueWgSizeTotal *= trueWgSize[d];
528  DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
529  }
530 
531  DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
532 
533  // calculate the number of WFs in this WG
534  int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
535  num_wfs_in_wg = numWfs;
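    // the expression above is a ceiling divide, e.g. a 200-work-item WG
    // with wfSize() == 64 needs (200 + 63) / 64 = 4 wavefronts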
536 
537  bool barrier_avail = true;
538 
539  if (numWfs > 1 && !freeBarrierIds.size()) {
540  barrier_avail = false;
541  }
542 
543  // calculate the number of 32-bit vector registers required by each
544  // work item of the work group
545  int vregDemandPerWI = task->numVectorRegs();
546  // calculate the number of 32-bit scalar registers required by each
547  // work item of the work group
548  int sregDemandPerWI = task->numScalarRegs();
549 
550  // check if the total number of VGPRs and SGPRs required by all WFs
551  // of the WG fit in the VRFs of all SIMD units and the CU's SRF
552  panic_if((numWfs * vregDemandPerWI) > (numVectorALUs * numVecRegsPerSimd),
553  "WG with %d WFs and %d VGPRs per WI can not be allocated to CU "
554  "that has %d VGPRs\n",
555  numWfs, vregDemandPerWI, numVectorALUs * numVecRegsPerSimd);
556  panic_if((numWfs * sregDemandPerWI) > numScalarRegsPerSimd,
557  "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU "
558  "with %d SGPRs\n",
559  numWfs, sregDemandPerWI, numScalarRegsPerSimd);
560 
561  // number of WF slots that are not occupied
562  int freeWfSlots = 0;
563  // number of Wfs from WG that were successfully mapped to a SIMD
564  int numMappedWfs = 0;
565  numWfsToSched.clear();
566  numWfsToSched.resize(numVectorALUs, 0);
567 
568  // attempt to map WFs to the SIMDs, based on WF slot availability
569  // and register file availability
570  for (int j = 0; j < shader->n_wf; ++j) {
571  for (int i = 0; i < numVectorALUs; ++i) {
572  if (wfList[i][j]->getStatus() == Wavefront::S_STOPPED) {
573  ++freeWfSlots;
574  // check if current WF will fit onto current SIMD/VRF
575  // if all WFs have not yet been mapped to the SIMDs
576  if (numMappedWfs < numWfs &&
578  sregDemandPerWI) &&
580  vregDemandPerWI)) {
581  numWfsToSched[i]++;
582  numMappedWfs++;
583  }
584  }
585  }
586  }
587 
588  // check that the number of mapped WFs is not greater
589  // than the actual number of WFs
590  assert(numMappedWfs <= numWfs);
591 
592  bool vregAvail = true;
593  bool sregAvail = true;
594  // if a WF to SIMD mapping was not found, find the limiting resource
595  if (numMappedWfs < numWfs) {
596 
597  for (int j = 0; j < numVectorALUs; ++j) {
598  // find if there are enough free VGPRs in the SIMD's VRF
599  // to accommodate the WFs of the new WG that would be mapped
600  // to this SIMD unit
601  vregAvail &= registerManager->
602  canAllocateVgprs(j, numWfsToSched[j], vregDemandPerWI);
603  // find if there are enough free SGPRs in the SIMD's SRF
604  // to accommodate the WFs of the new WG that would be mapped
605  // to this SIMD unit
606  sregAvail &= registerManager->
607  canAllocateSgprs(j, numWfsToSched[j], sregDemandPerWI);
608  }
609  }
610 
611  DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, \
612  VGPR Availability = %d, SGPR Availability = %d\n",
613  freeWfSlots, numMappedWfs, vregAvail, sregAvail);
614 
615  if (!vregAvail) {
616  ++stats.numTimesWgBlockedDueVgprAlloc;
617  }
618 
619  if (!sregAvail) {
620  ++stats.numTimesWgBlockedDueSgprAlloc;
621  }
622 
623  // Return true if enough WF slots to submit workgroup and if there are
624  // enough VGPRs to schedule all WFs to their SIMD units
625  bool ldsAvail = lds.canReserve(task->ldsSize());
626  if (!ldsAvail) {
627  return false;
628  }
629 
630  if (!barrier_avail) {
631  return false;
632  }
633 
634  // Return true if the following are all true:
635  // (a) all WFs of the WG were mapped to free WF slots
636  // (b) there are enough VGPRs to schedule all WFs to their SIMD units
637  // (c) there are enough SGPRs on the CU to schedule all WFs
638  // (d) there is enough space in LDS to allocate for all WFs
639  bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
640  && ldsAvail && barrier_avail;
641  return can_dispatch;
642 }
643 
644 int
645 ComputeUnit::numYetToReachBarrier(int bar_id)
646 {
647  auto &wf_barrier = barrierSlot(bar_id);
648  return wf_barrier.numYetToReachBarrier();
649 }
650 
651 bool
652 ComputeUnit::allAtBarrier(int bar_id)
653 {
654  auto &wf_barrier = barrierSlot(bar_id);
655  return wf_barrier.allAtBarrier();
656 }
657 
658 void
659 ComputeUnit::incNumAtBarrier(int bar_id)
660 {
661  auto &wf_barrier = barrierSlot(bar_id);
662  wf_barrier.incNumAtBarrier();
663 }
664 
665 int
666 ComputeUnit::numAtBarrier(int bar_id)
667 {
668  auto &wf_barrier = barrierSlot(bar_id);
669  return wf_barrier.numAtBarrier();
670 }
671 
672 int
673 ComputeUnit::maxBarrierCnt(int bar_id)
674 {
675  auto &wf_barrier = barrierSlot(bar_id);
676  return wf_barrier.maxBarrierCnt();
677 }
678 
679 void
680 ComputeUnit::resetBarrier(int bar_id)
681 {
682  auto &wf_barrier = barrierSlot(bar_id);
683  wf_barrier.reset();
684 }
685 
686 void
687 ComputeUnit::decMaxBarrierCnt(int bar_id)
688 {
689  auto &wf_barrier = barrierSlot(bar_id);
690  wf_barrier.decMaxBarrierCnt();
691 }
692 
693 void
694 ComputeUnit::releaseBarrier(int bar_id)
695 {
696  auto &wf_barrier = barrierSlot(bar_id);
697  wf_barrier.release();
698  freeBarrierIds.insert(bar_id);
699 }
700 
701 void
702 ComputeUnit::releaseWFsFromBarrier(int bar_id)
703 {
704  for (int i = 0; i < numVectorALUs; ++i) {
705  for (int j = 0; j < shader->n_wf; ++j) {
706  Wavefront *wf = wfList[i][j];
707  if (wf->barrierId() == bar_id) {
708  assert(wf->getStatus() == Wavefront::S_BARRIER);
709  wf->setStatus(Wavefront::S_RUNNING);
710  }
711  }
712  }
713 }
714 
715 // Execute one clock worth of work on the ComputeUnit.
716 void
717 ComputeUnit::exec()
718 {
719  // process reads and writes in the RFs
720  for (auto &vecRegFile : vrf) {
721  vecRegFile->exec();
722  }
723 
724  for (auto &scRegFile : srf) {
725  scRegFile->exec();
726  }
727 
728  // Execute pipeline stages in reverse order to simulate
729  // the pipeline latency
730  scalarMemoryPipe.exec();
731  globalMemoryPipe.exec();
732  localMemoryPipe.exec();
733  execStage.exec();
734  scheduleStage.exec();
735  scoreboardCheckStage.exec();
736  fetchStage.exec();
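    // Note the ordering: consumers run before producers (memory pipes
    // before execute, execute before schedule and fetch), so an
    // instruction advances at most one stage per tick rather than flowing
    // through the whole pipeline in a single call to exec().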
737 
738  stats.totalCycles++;
739 
740  // Put this CU to sleep if there is no more work to be done.
741  if (!isDone()) {
742  schedule(tickEvent, nextCycle());
743  } else {
744  shader->notifyCuSleep();
745  DPRINTF(GPUDisp, "CU%d: Going to sleep\n", cu_id);
746  }
747 }
748 
749 void
750 ComputeUnit::init()
751 {
752  // Initialize CU Bus models and execution resources
753 
754  // Vector ALUs
755  vectorALUs.clear();
756  for (int i = 0; i < numVectorALUs; i++) {
757  vectorALUs.emplace_back(this, clockPeriod());
758  }
759 
760  // Scalar ALUs
761  scalarALUs.clear();
762  for (int i = 0; i < numScalarALUs; i++) {
763  scalarALUs.emplace_back(this, clockPeriod());
764  }
765 
766  // Vector Global Memory
767  fatal_if(numVectorGlobalMemUnits > 1,
768  "No support for multiple Global Memory Pipelines exists!!!");
769  vectorGlobalMemUnit.init(this, clockPeriod());
770  vrfToGlobalMemPipeBus.init(this, clockPeriod());
771  glbMemToVrfBus.init(this, clockPeriod());
772 
773  // Vector Local/Shared Memory
774  fatal_if(numVectorSharedMemUnits > 1,
775  "No support for multiple Local Memory Pipelines exists!!!");
776  vectorSharedMemUnit.init(this, clockPeriod());
777  vrfToLocalMemPipeBus.init(this, clockPeriod());
778  locMemToVrfBus.init(this, clockPeriod());
779 
780  // Scalar Memory
781  fatal_if(numScalarMemUnits > 1,
782  "No support for multiple Scalar Memory Pipelines exists!!!");
783  scalarMemUnit.init(this, clockPeriod());
784  srfToScalarMemPipeBus.init(this, clockPeriod());
785  scalarMemToSrfBus.init(this, clockPeriod());
786 
787  vectorRegsReserved.resize(numVectorALUs, 0);
788  scalarRegsReserved.resize(numVectorALUs, 0);
789 
790  fetchStage.init();
791  scheduleStage.init();
792  execStage.init();
793  globalMemoryPipe.init();
794 
795  gmTokenPort.setTokenManager(memPortTokens);
796 }
797 
798 bool
799 ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
800 {
801  // Ruby has completed the memory op. Schedule the mem_resp_event at the
802  // appropriate cycle to process the timing memory response
803  // This delay represents the pipeline delay
804  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
805  PortID index = sender_state->port_index;
806  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
807  GPUDispatcher &dispatcher = computeUnit->shader->dispatcher();
808 
809  // MemSyncResp + WriteAckResp are handled completely here and we don't
810  // schedule a MemRespEvent to process the responses further
811  if (pkt->cmd == MemCmd::MemSyncResp) {
812  // This response is for 1 of the following request types:
813  // - kernel launch
814  // - kernel end
815  // - non-kernel mem sync
816 
817  // Kernel Launch
818  // wavefront was nullptr when launching kernel, so it is meaningless
819  // here (simdId=-1, wfSlotId=-1)
820  if (gpuDynInst->isKernelLaunch()) {
821  // for kernel launch, the original request must be both kernel-type
822  // and INV_L1
823  assert(pkt->req->isKernel());
824  assert(pkt->req->isInvL1());
825 
826  // one D-Cache inv is done, decrement counter
827  dispatcher.updateInvCounter(gpuDynInst->kern_id);
828 
829  delete pkt->senderState;
830  delete pkt;
831  return true;
832  }
833 
834  // retrieve wavefront from inst
835  Wavefront *w = gpuDynInst->wavefront();
836 
837  // Check if we are waiting on Kernel End Flush
838  if (w->getStatus() == Wavefront::S_RETURNING
839  && gpuDynInst->isEndOfKernel()) {
840  // for kernel end, the original request must be both kernel-type
841  // and last-level GPU cache should be flushed if it contains
842  // dirty data. This request may have been quiesced and
843  // immediately responded to if the GL2 is a write-through /
844  // read-only cache.
845  assert(pkt->req->isKernel());
846  assert(pkt->req->isGL2CacheFlush());
847 
848  // once flush done, decrement counter, and return whether all
849  // dirty writeback operations are done for the kernel
850  bool isWbDone = dispatcher.updateWbCounter(gpuDynInst->kern_id);
851 
852  // not all wbs are done for the kernel, just release pkt
853  // resources
854  if (!isWbDone) {
855  delete pkt->senderState;
856  delete pkt;
857  return true;
858  }
859 
860  // all wbs are completed for the kernel, do retirement work
861  // for the workgroup
862  DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
863  computeUnit->cu_id, w->simdId, w->wfSlotId,
864  w->wfDynId, w->wgId);
865 
866  dispatcher.notifyWgCompl(w);
867  w->setStatus(Wavefront::S_STOPPED);
868  }
869 
870  if (!pkt->req->isKernel()) {
871  w = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
872  DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing "
873  "outstanding reqs %d => %d\n", gpuDynInst->simdId,
874  gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
875  gpuDynInst->disassemble(), w->outstandingReqs,
876  w->outstandingReqs - 1);
877  computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
878  }
879 
880  delete pkt->senderState;
881  delete pkt;
882  return true;
883  }
884 
885  EventFunctionWrapper *mem_resp_event =
886  computeUnit->memPort[index].createMemRespEvent(pkt);
887 
888  DPRINTF(GPUPort,
889  "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
890  computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
891  gpuDynInst->seqNum(), index, pkt->req->getPaddr());
892 
893  computeUnit->schedule(mem_resp_event,
894  curTick() + computeUnit->resp_tick_latency);
895 
896  return true;
897 }
898 
899 bool
900 ComputeUnit::ScalarDataPort::recvTimingResp(PacketPtr pkt)
901 {
902  assert(!pkt->req->isKernel());
903 
904  // retrieve sender state
905  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
906  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
907 
908  assert(pkt->isRead() || pkt->isWrite());
909  assert(gpuDynInst->numScalarReqs > 0);
910 
911  gpuDynInst->numScalarReqs--;
912 
921  if (!gpuDynInst->numScalarReqs) {
922  if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
923  computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
924  gpuDynInst);
925  } else {
926  computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
927  gpuDynInst);
928  }
929  }
930 
931  delete pkt->senderState;
932  delete pkt;
933 
934  return true;
935 }
936 
937 void
938 ComputeUnit::ScalarDataPort::recvReqRetry()
939 {
940  while (!retries.empty()) {
941  PacketPtr pkt = retries.front();
942  if (!sendTimingReq(pkt)) {
943  break;
944  }
945  retries.pop_front();
946  }
947 }
948 
949 void
950 ComputeUnit::DataPort::recvReqRetry()
951 {
952  int len = retries.size();
953 
954  assert(len > 0);
955 
956  for (int i = 0; i < len; ++i) {
957  PacketPtr pkt = retries.front().first;
958  M5_VAR_USED GPUDynInstPtr gpuDynInst = retries.front().second;
959  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
960  computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
961  pkt->req->getPaddr());
962 
966  if (!sendTimingReq(pkt)) {
967  DPRINTF(GPUMem, "failed again!\n");
968  break;
969  } else {
970  DPRINTF(GPUMem, "successful!\n");
971  retries.pop_front();
972  }
973  }
974 }
975 
976 bool
977 ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
978 {
979  computeUnit->fetchStage.processFetchReturn(pkt);
980  return true;
981 }
982 
983 void
984 ComputeUnit::SQCPort::recvReqRetry()
985 {
986  int len = retries.size();
987 
988  assert(len > 0);
989 
990  for (int i = 0; i < len; ++i) {
991  PacketPtr pkt = retries.front().first;
992  M5_VAR_USED Wavefront *wavefront = retries.front().second;
993  DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
994  computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
995  pkt->req->getPaddr());
996  if (!sendTimingReq(pkt)) {
997  DPRINTF(GPUFetch, "failed again!\n");
998  break;
999  } else {
1000  DPRINTF(GPUFetch, "successful!\n");
1001  retries.pop_front();
1002  }
1003  }
1004 }
1005 
1006 void
1007 ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
1008 {
1009  // There must be a way around this check to do the globalMemStart...
1010  Addr tmp_vaddr = pkt->req->getVaddr();
1011 
1012  updatePageDivergenceDist(tmp_vaddr);
1013 
1014  // set PC in request
1015  pkt->req->setPC(gpuDynInst->wavefront()->pc());
1016 
1017  pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());
1018 
1019  // figure out the type of the request to set read/write
1020  BaseTLB::Mode TLB_mode;
1021  assert(pkt->isRead() || pkt->isWrite());
1022 
1023  // only do some things if actually accessing data
1024  bool isDataAccess = pkt->isWrite() || pkt->isRead();
1025 
1026  // Check write before read for atomic operations
1027  // since atomic operations should use BaseTLB::Write
1028  if (pkt->isWrite()) {
1029  TLB_mode = BaseTLB::Write;
1030  } else if (pkt->isRead()) {
1031  TLB_mode = BaseTLB::Read;
1032  } else {
1033  fatal("pkt is neither a read nor a write\n");
1034  }
1035 
1036  stats.tlbCycles -= curTick();
1037  ++stats.tlbRequests;
1038 
1039  PortID tlbPort_index = perLaneTLB ? index : 0;
1040 
1041  if (shader->timingSim) {
1042  if (debugSegFault) {
1043  Process *p = shader->gpuTc->getProcessPtr();
1044  Addr vaddr = pkt->req->getVaddr();
1045  unsigned size = pkt->getSize();
1046 
1047  if ((vaddr + size - 1) % 64 < vaddr % 64) {
1048  panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
1049  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
1050  }
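    // The check above flags accesses that cross a 64-byte boundary:
    // e.g., vaddr = 60 with size = 8 gives (60 + 8 - 1) % 64 = 3, which
    // is less than 60 % 64 = 60, so the access wraps past the line end.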
1051 
1052  Addr paddr;
1053 
1054  if (!p->pTable->translate(vaddr, paddr)) {
1055  if (!p->fixupFault(vaddr)) {
1056  panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
1057  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1058  vaddr);
1059  }
1060  }
1061  }
1062 
1063  // This is the SenderState needed upon return
1064  pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);
1065 
1066  // This is the senderState needed by the TLB hierarchy to function
1067  X86ISA::GpuTLB::TranslationState *translation_state =
1068  new X86ISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc, false,
1069  pkt->senderState);
1070 
1071  pkt->senderState = translation_state;
1072 
1073  if (functionalTLB) {
1074  tlbPort[tlbPort_index].sendFunctional(pkt);
1075 
1076  // update the hitLevel distribution
1077  int hit_level = translation_state->hitLevel;
1078  assert(hit_level != -1);
1079  stats.hitsPerTLBLevel[hit_level]++;
1080 
1081  // New SenderState for the memory access
1082  X86ISA::GpuTLB::TranslationState *sender_state =
1083  safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
1084 
1085  delete sender_state->tlbEntry;
1086  delete sender_state->saved;
1087  delete sender_state;
1088 
1089  assert(pkt->req->hasPaddr());
1090  assert(pkt->req->hasSize());
1091 
1092  // this is necessary because the GPU TLB receives packets instead
1093  // of requests. when the translation is complete, all relevant
1094  // fields in the request will be populated, but not in the packet.
1095  // here we create the new packet so we can set the size, addr,
1096  // and proper flags.
1097  PacketPtr oldPkt = pkt;
1098  pkt = new Packet(oldPkt->req, oldPkt->cmd);
1099  if (isDataAccess) {
1100  uint8_t *tmpData = oldPkt->getPtr<uint8_t>();
1101  pkt->dataStatic(tmpData);
1102  }
1103  delete oldPkt;
1104 
1105 
1106  // New SenderState for the memory access
1107  pkt->senderState =
1108  new ComputeUnit::DataPort::SenderState(gpuDynInst, index,
1109  nullptr);
1110 
1111  gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
1112  gpuDynInst->tlbHitLevel[index] = hit_level;
1113 
1114  // translation is done. Schedule the mem_req_event at the
1115  // appropriate cycle to send the timing memory request to ruby
1116  EventFunctionWrapper *mem_req_event =
1117  memPort[index].createMemReqEvent(pkt);
1118 
1119  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
1120  "scheduled\n", cu_id, gpuDynInst->simdId,
1121  gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
1122 
1123  schedule(mem_req_event, curTick() + req_tick_latency);
1124  } else if (tlbPort[tlbPort_index].isStalled()) {
1125  assert(tlbPort[tlbPort_index].retries.size() > 0);
1126 
1127  DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
1128  "failed!\n", cu_id, gpuDynInst->simdId,
1129  gpuDynInst->wfSlotId, tmp_vaddr);
1130 
1131  tlbPort[tlbPort_index].retries.push_back(pkt);
1132  } else if (!tlbPort[tlbPort_index].sendTimingReq(pkt)) {
1133  // Stall the data port;
1134  // No more packets will be issued until
1135  // Ruby indicates resources are freed by
1136  // a recvReqRetry() callback on this port.
1137  tlbPort[tlbPort_index].stallPort();
1138 
1139  DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
1140  "failed!\n", cu_id, gpuDynInst->simdId,
1141  gpuDynInst->wfSlotId, tmp_vaddr);
1142 
1143  tlbPort[tlbPort_index].retries.push_back(pkt);
1144  } else {
1145  DPRINTF(GPUTLB,
1146  "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
1147  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
1148  }
1149  } else {
1150  if (pkt->cmd == MemCmd::MemSyncReq) {
1151  gpuDynInst->resetEntireStatusVector();
1152  } else {
1153  gpuDynInst->decrementStatusVector(index);
1154  }
1155 
1156  // New SenderState for the memory access
1157  delete pkt->senderState;
1158 
1159  // Because it's an atomic operation, only need TLB translation state
1160  pkt->senderState = new X86ISA::GpuTLB::TranslationState(TLB_mode,
1161  shader->gpuTc);
1162 
1163  tlbPort[tlbPort_index].sendFunctional(pkt);
1164 
1165  // the addr of the packet is not modified, so we need to create a new
1166  // packet; otherwise the memory access would be sent with the old
1167  // virtual address from the translation packet instead of the
1168  // physical address returned by the translation.
1169  PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
1170  new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1171 
1172  // Translation is done. It is safe to send the packet to memory.
1173  memPort[0].sendFunctional(new_pkt);
1174 
1175  DPRINTF(GPUMem, "Functional sendRequest\n");
1176  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
1177  gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
1178  new_pkt->req->getPaddr());
1179 
1180  // safe_cast the senderState
1181  X86ISA::GpuTLB::TranslationState *sender_state =
1182  safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
1183 
1184  delete sender_state->tlbEntry;
1185  delete new_pkt;
1186  delete pkt->senderState;
1187  delete pkt;
1188  }
1189 }
1190 
1191 void
1192 ComputeUnit::sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt)
1193 {
1194  assert(pkt->isWrite() || pkt->isRead());
1195 
1196  BaseTLB::Mode tlb_mode = pkt->isRead() ? BaseTLB::Read : BaseTLB::Write;
1197 
1198  pkt->senderState =
1199  new ComputeUnit::ScalarDTLBPort::SenderState(gpuDynInst);
1200 
1201  pkt->senderState =
1202  new X86ISA::GpuTLB::TranslationState(tlb_mode, shader->gpuTc, false,
1203  pkt->senderState);
1204 
1205  if (scalarDTLBPort.isStalled()) {
1206  assert(scalarDTLBPort.retries.size());
1207  scalarDTLBPort.retries.push_back(pkt);
1208  } else if (!scalarDTLBPort.sendTimingReq(pkt)) {
1209  scalarDTLBPort.stallPort();
1210  scalarDTLBPort.retries.push_back(pkt);
1211  } else {
1212  DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n",
1213  tlb_mode == BaseTLB::Read ? "read" : "write",
1214  pkt->req->getVaddr());
1215  }
1216 }
1217 
1218 void
1219 ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
1220  bool kernelMemSync,
1221  RequestPtr req)
1222 {
1223  assert(gpuDynInst->isGlobalSeg() ||
1224  gpuDynInst->executedAs() == Enums::SC_GLOBAL);
1225 
1226  if (!req) {
1227  req = std::make_shared<Request>(
1228  0, 0, 0, requestorId(), 0, gpuDynInst->wfDynId);
1229  }
1230 
1231  // all mem sync requests have Paddr == 0
1232  req->setPaddr(0);
1233 
1234  PacketPtr pkt = nullptr;
1235 
1236  if (kernelMemSync) {
1237  if (gpuDynInst->isKernelLaunch()) {
1238  req->setCacheCoherenceFlags(Request::INV_L1);
1239  req->setReqInstSeqNum(gpuDynInst->seqNum());
1240  req->setFlags(Request::KERNEL);
1241  pkt = new Packet(req, MemCmd::MemSyncReq);
1242  pkt->pushSenderState(
1243  new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1244 
1245  EventFunctionWrapper *mem_req_event =
1246  memPort[0].createMemReqEvent(pkt);
1247 
1248  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1249  "an acquire\n", cu_id, gpuDynInst->simdId,
1250  gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
1251 
1252  schedule(mem_req_event, curTick() + req_tick_latency);
1253  } else {
1254  // kernel end flush of GL2 cache may be quiesced by Ruby if the
1255  // GL2 is a read-only cache
1256  assert(shader->impl_kern_end_rel);
1257  assert(gpuDynInst->isEndOfKernel());
1258 
1259  req->setCacheCoherenceFlags(Request::FLUSH_L2);
1260  req->setReqInstSeqNum(gpuDynInst->seqNum());
1261  req->setFlags(Request::KERNEL);
1262  pkt = new Packet(req, MemCmd::MemSyncReq);
1263  pkt->pushSenderState(
1264  new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1265 
1266  EventFunctionWrapper *mem_req_event =
1267  memPort[0].createMemReqEvent(pkt);
1268 
1269  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1270  "a release\n", cu_id, gpuDynInst->simdId,
1271  gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
1272 
1273  schedule(mem_req_event, curTick() + req_tick_latency);
1274  }
1275  } else {
1276  gpuDynInst->setRequestFlags(req);
1277 
1278  req->setReqInstSeqNum(gpuDynInst->seqNum());
1279 
1280  pkt = new Packet(req, MemCmd::MemSyncReq);
1281  pkt->pushSenderState(
1282  new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1283 
1284  EventFunctionWrapper *mem_req_event =
1285  memPort[0].createMemReqEvent(pkt);
1286 
1287  DPRINTF(GPUPort,
1288  "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
1289  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
1290  pkt->req->getPaddr());
1291 
1292  schedule(mem_req_event, curTick() + req_tick_latency);
1293  }
1294 }
1295 
1296 void
1297 ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
1298 {
1299  DataPort::SenderState *sender_state =
1300  safe_cast<DataPort::SenderState*>(pkt->senderState);
1301 
1302  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1303  ComputeUnit *compute_unit = computeUnit;
1304 
1305  assert(gpuDynInst);
1306 
1307  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
1308  compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1309  pkt->req->getPaddr(), id);
1310 
1311  Addr paddr = pkt->req->getPaddr();
1312 
1313  // mem sync resp callback must be handled already in
1314  // DataPort::recvTimingResp
1315  assert(pkt->cmd != MemCmd::MemSyncResp);
1316 
1317  // The status vector and global memory response for WriteResp packets get
1318  // handled by the WriteCompleteResp packets.
1319  if (pkt->cmd == MemCmd::WriteResp) {
1320  delete pkt;
1321  return;
1322  }
1323 
1324  // this is for read, write and atomic
1325  int index = gpuDynInst->memStatusVector[paddr].back();
1326 
1327  DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
1328  pkt->req->getPaddr(), id);
1329 
1330  gpuDynInst->memStatusVector[paddr].pop_back();
1331  gpuDynInst->pAddr = pkt->req->getPaddr();
1332 
1333  gpuDynInst->decrementStatusVector(index);
1334  DPRINTF(GPUMem, "bitvector is now %s\n", gpuDynInst->printStatusVector());
1335 
1336  if (gpuDynInst->allLanesZero()) {
1337  auto iter = gpuDynInst->memStatusVector.begin();
1338  auto end = gpuDynInst->memStatusVector.end();
1339 
1340  while (iter != end) {
1341  assert(iter->second.empty());
1342  ++iter;
1343  }
1344 
1345  // Calculate the difference between the arrival of the first cache
1346  // block and the last cache block to arrive if we have the time
1347  // for the first cache block.
1348  if (compute_unit->headTailMap.count(gpuDynInst)) {
1349  Tick headTick = compute_unit->headTailMap.at(gpuDynInst);
1350  compute_unit->stats.headTailLatency.sample(curTick() - headTick);
1351  compute_unit->headTailMap.erase(gpuDynInst);
1352  }
1353 
1354  gpuDynInst->memStatusVector.clear();
1355 
1356  gpuDynInst->
1357  profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue);
1358  compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
1359 
1360  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
1361  compute_unit->cu_id, gpuDynInst->simdId,
1362  gpuDynInst->wfSlotId);
1363  } else {
1364  if (pkt->isRead()) {
1365  if (!compute_unit->headTailMap.count(gpuDynInst)) {
1366  compute_unit->headTailMap
1367  .insert(std::make_pair(gpuDynInst, curTick()));
1368  }
1369  }
1370  }
1371 
1372  delete pkt->senderState;
1373  delete pkt;
1374 }
1375 
1376 bool
1377 ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
1378 {
1379  Addr line = pkt->req->getPaddr();
1380 
1381  DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
1382  pkt->req->getVaddr(), line);
1383 
1384  assert(pkt->senderState);
1385  computeUnit->stats.tlbCycles += curTick();
1386 
1387  // pop off the TLB translation state
1388  X86ISA::GpuTLB::TranslationState *translation_state =
1389  safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
1390 
1391  // no PageFaults are permitted for data accesses
1392  if (!translation_state->tlbEntry) {
1393  DTLBPort::SenderState *sender_state =
1394  safe_cast<DTLBPort::SenderState*>(translation_state->saved);
1395 
1396  M5_VAR_USED Wavefront *w =
1397  computeUnit->wfList[sender_state->_gpuDynInst->simdId]
1398  [sender_state->_gpuDynInst->wfSlotId];
1399 
1400  DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
1401  pkt->req->getVaddr());
1402  }
1403 
1404  // update the hitLevel distribution
1405  int hit_level = translation_state->hitLevel;
1406  computeUnit->stats.hitsPerTLBLevel[hit_level]++;
1407 
1408  delete translation_state->tlbEntry;
1409  assert(!translation_state->ports.size());
1410  pkt->senderState = translation_state->saved;
1411 
1412  // for prefetch pkt
1413  BaseTLB::Mode TLB_mode = translation_state->tlbMode;
1414 
1415  delete translation_state;
1416 
1417  // use the original sender state to know how to close this transaction
1418  DTLBPort::SenderState *sender_state =
1419  safe_cast<DTLBPort::SenderState*>(pkt->senderState);
1420 
1421  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1422  PortID mp_index = sender_state->portIndex;
1423  Addr vaddr = pkt->req->getVaddr();
1424  gpuDynInst->memStatusVector[line].push_back(mp_index);
1425  gpuDynInst->tlbHitLevel[mp_index] = hit_level;
1426 
1427  MemCmd requestCmd;
1428 
1429  if (pkt->cmd == MemCmd::ReadResp) {
1430  requestCmd = MemCmd::ReadReq;
1431  } else if (pkt->cmd == MemCmd::WriteResp) {
1432  requestCmd = MemCmd::WriteReq;
1433  } else if (pkt->cmd == MemCmd::SwapResp) {
1434  requestCmd = MemCmd::SwapReq;
1435  } else {
1436  panic("unsupported response to request conversion %s\n",
1437  pkt->cmd.toString());
1438  }
1439 
1440  if (computeUnit->prefetchDepth) {
1441  int simdId = gpuDynInst->simdId;
1442  int wfSlotId = gpuDynInst->wfSlotId;
1443  Addr last = 0;
1444 
1445  switch(computeUnit->prefetchType) {
1446  case Enums::PF_CU:
1447  last = computeUnit->lastVaddrCU[mp_index];
1448  break;
1449  case Enums::PF_PHASE:
1450  last = computeUnit->lastVaddrSimd[simdId][mp_index];
1451  break;
1452  case Enums::PF_WF:
1453  last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
1454  default:
1455  break;
1456  }
1457 
1458  DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
1459  computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
1460 
1461  int stride = last ? (roundDown(vaddr, X86ISA::PageBytes) -
1462  roundDown(last, X86ISA::PageBytes)) >> X86ISA::PageShift
1463  : 0;
1464 
1465  DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
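    // e.g., last = 0x10000 and vaddr = 0x13000 with 4 KiB pages gives a
    // stride of +3 pages; the loop below would then touch pages at
    // vaddr + 3, +6, +9, ... pages ahead, up to prefetchDepth prefetches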
1466 
1467  computeUnit->lastVaddrCU[mp_index] = vaddr;
1468  computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
1469  computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
1470 
1471  stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
1472  computeUnit->prefetchStride: stride;
1473 
1474  DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
1475  computeUnit->cu_id, simdId, wfSlotId, mp_index);
1476 
1477  DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);
1478 
1479  // Prefetch Next few pages atomically
1480  for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
1481  DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
1482  vaddr + stride * pf * X86ISA::PageBytes);
1483 
1484  if (!stride)
1485  break;
1486 
1487  RequestPtr prefetch_req = std::make_shared<Request>(
1488  vaddr + stride * pf * X86ISA::PageBytes,
1489  sizeof(uint8_t), 0,
1490  computeUnit->requestorId(),
1491  0, 0, nullptr);
1492 
1493  PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
1494  uint8_t foo = 0;
1495  prefetch_pkt->dataStatic(&foo);
1496 
1497  // Because it's an atomic operation, only need TLB translation state
1498  prefetch_pkt->senderState =
1499  new X86ISA::GpuTLB::TranslationState(TLB_mode,
1500  computeUnit->shader->gpuTc, true);
1501 
1502  // Currently prefetches are zero-latency, hence the sendFunctional
1503  sendFunctional(prefetch_pkt);
1504 
1505  /* safe_cast the senderState */
1506  X86ISA::GpuTLB::TranslationState *tlb_state =
1507  safe_cast<X86ISA::GpuTLB::TranslationState*>(
1508  prefetch_pkt->senderState);
1509 
1510 
1511  delete tlb_state->tlbEntry;
1512  delete tlb_state;
1513  delete prefetch_pkt;
1514  }
1515  }
1516 
1517  // First we must convert the response cmd back to a request cmd so that
1518  // the request can be sent through the cu's request port
1519  PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
1520  new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1521  delete pkt->senderState;
1522  delete pkt;
1523 
1524  // New SenderState for the memory access
1525  new_pkt->senderState =
1526  new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
1527  nullptr);
1528 
1529  // translation is done. Schedule the mem_req_event at the appropriate
1530  // cycle to send the timing memory request to ruby
1531  EventFunctionWrapper *mem_req_event =
1532  computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);
1533 
1534  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
1535  computeUnit->cu_id, gpuDynInst->simdId,
1536  gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());
1537 
1538  computeUnit->schedule(mem_req_event, curTick() +
1539  computeUnit->req_tick_latency);
1540 
1541  return true;
1542 }
1543 
1544 EventFunctionWrapper*
1545 ComputeUnit::DataPort::createMemReqEvent(PacketPtr pkt)
1546 {
1547  return new EventFunctionWrapper(
1548  [this, pkt]{ processMemReqEvent(pkt); },
1549  "ComputeUnit memory request event", true);
1550 }
1551 
1552 EventFunctionWrapper*
1553 ComputeUnit::DataPort::createMemRespEvent(PacketPtr pkt)
1554 {
1555  return new EventFunctionWrapper(
1556  [this, pkt]{ processMemRespEvent(pkt); },
1557  "ComputeUnit memory response event", true);
1558 }
1559 
1560 void
1561 ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt)
1562 {
1563  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1564  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1565  M5_VAR_USED ComputeUnit *compute_unit = computeUnit;
1566 
1567  if (!(sendTimingReq(pkt))) {
1568  retries.push_back(std::make_pair(pkt, gpuDynInst));
1569 
1570  DPRINTF(GPUPort,
1571  "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
1572  compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1573  id, pkt->req->getPaddr());
1574  } else {
1575  DPRINTF(GPUPort,
1576  "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data "
1577  "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1578  gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id,
1579  pkt->req->getPaddr());
1580  }
1581 }
1582 
1583 const char*
1584 ComputeUnit::ScalarDataPort::MemReqEvent::description() const
1585 {
1586  return "ComputeUnit scalar memory request event";
1587 }
1588 
1589 void
1590 ComputeUnit::ScalarDataPort::MemReqEvent::process()
1591 {
1592  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1593  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1594  M5_VAR_USED ComputeUnit *compute_unit = scalarDataPort.computeUnit;
1595 
1596  if (!(scalarDataPort.sendTimingReq(pkt))) {
1597  scalarDataPort.retries.push_back(pkt);
1598 
1599  DPRINTF(GPUPort,
1600  "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
1601  compute_unit->cu_id, gpuDynInst->simdId,
1602  gpuDynInst->wfSlotId, pkt->req->getPaddr());
1603  } else {
1604  DPRINTF(GPUPort,
1605  "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data "
1606  "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1607  gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
1608  pkt->req->getPaddr());
1609  }
1610 }
1611 
1612 /*
1613  * The initial translation request could have been rejected, if
1614  * <retries> queue is not empty. Retry sending the translation
1615  * request. sendRetry() is called from the peer port whenever
1616  * a translation completes.
1617  */
1618 void
1619 ComputeUnit::DTLBPort::recvReqRetry()
1620 {
1621  int len = retries.size();
1622 
1623  DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
1624  computeUnit->cu_id, len);
1625 
1626  assert(len > 0);
1627  assert(isStalled());
1628  // recvReqRetry is an indication that the resource on which this
1629  // port was stalling on is freed. So, remove the stall first
1630  unstallPort();
1631 
1632  for (int i = 0; i < len; ++i) {
1633  PacketPtr pkt = retries.front();
1634  M5_VAR_USED Addr vaddr = pkt->req->getVaddr();
1635  DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x",
1636  computeUnit->cu_id, vaddr);
1636 
1637  if (!sendTimingReq(pkt)) {
1638  // Stall port
1639  stallPort();
1640  DPRINTF(GPUTLB, ": failed again\n");
1641  break;
1642  } else {
1643  DPRINTF(GPUTLB, ": successful\n");
1644  retries.pop_front();
1645  }
1646  }
1647 }
1648 
1649 bool
1650 ComputeUnit::ScalarDTLBPort::recvTimingResp(PacketPtr pkt)
1651 {
1652  assert(pkt->senderState);
1653 
1654  X86ISA::GpuTLB::TranslationState *translation_state =
1655  safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
1656 
1657  // Page faults are not allowed
1658  fatal_if(!translation_state->tlbEntry,
1659  "Translation of vaddr %#x failed\n", pkt->req->getVaddr());
1660 
1661  delete translation_state->tlbEntry;
1662  assert(!translation_state->ports.size());
1663 
1664  pkt->senderState = translation_state->saved;
1665  delete translation_state;
1666 
1667  ScalarDTLBPort::SenderState *sender_state =
1668  safe_cast<ScalarDTLBPort::SenderState*>(pkt->senderState);
1669 
1670  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1671  delete pkt->senderState;
1672 
1673  M5_VAR_USED Wavefront *w = gpuDynInst->wavefront();
1674 
1675  DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
1676  "translation: VA %#x -> PA %#x\n", computeUnit->cu_id, w->simdId,
1677  w->wfSlotId, w->kernId, pkt->req->getVaddr(), pkt->req->getPaddr());
1678 
1679  MemCmd mem_cmd;
1680 
1681  if (pkt->cmd == MemCmd::ReadResp) {
1682  mem_cmd = MemCmd::ReadReq;
1683  } else if (pkt->cmd == MemCmd::WriteResp) {
1684  mem_cmd = MemCmd::WriteReq;
1685  } else {
1686  fatal("Scalar DTLB received unexpected MemCmd response %s\n",
1687  pkt->cmd.toString());
1688  }
1689 
1690  PacketPtr req_pkt = new Packet(pkt->req, mem_cmd);
1691  req_pkt->dataStatic(pkt->getPtr<uint8_t>());
1692  delete pkt;
1693 
1694  req_pkt->senderState =
1695  new ComputeUnit::ScalarDataPort::SenderState(gpuDynInst);
1696 
1697  if (!computeUnit->scalarDataPort.sendTimingReq(req_pkt)) {
1698  computeUnit->scalarDataPort.retries.push_back(req_pkt);
1699  DPRINTF(GPUMem, "send scalar req failed for: %s\n",
1700  gpuDynInst->disassemble());
1701  } else {
1702  DPRINTF(GPUMem, "send scalar req for: %s\n",
1703  gpuDynInst->disassemble());
1704  }
1705 
1706  return true;
1707 }
1708 
1709 bool
1710 ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
1711 {
1712  M5_VAR_USED Addr line = pkt->req->getPaddr();
1713  DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
1714  computeUnit->cu_id, pkt->req->getVaddr(), line);
1715 
1716  assert(pkt->senderState);
1717 
1718  // pop off the TLB translation state
1719  X86ISA::GpuTLB::TranslationState *translation_state
1720  = safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
1721 
1722  bool success = translation_state->tlbEntry != nullptr;
1723  delete translation_state->tlbEntry;
1724  assert(!translation_state->ports.size());
1725  pkt->senderState = translation_state->saved;
1726  delete translation_state;
1727 
1728  // use the original sender state to know how to close this transaction
1729  ITLBPort::SenderState *sender_state =
1730  safe_cast<ITLBPort::SenderState*>(pkt->senderState);
1731 
1732  // get the wavefront associated with this translation request
1733  Wavefront *wavefront = sender_state->wavefront;
1734  delete pkt->senderState;
1735 
1736  if (success) {
1737  // pkt is reused in fetch(), don't delete it here. However, we must
1738  // reset the command to be a request so that it can be sent through
1739  // the cu's request port
1740  assert(pkt->cmd == MemCmd::ReadResp);
1741  pkt->cmd = MemCmd::ReadReq;
1742 
1743  computeUnit->fetchStage.fetch(pkt, wavefront);
1744  } else {
1745  if (wavefront->dropFetch) {
1746  assert(wavefront->instructionBuffer.empty());
1747  wavefront->dropFetch = false;
1748  }
1749 
1750  wavefront->pendingFetch = 0;
1751  }
1752 
1753  return true;
1754 }
1755 
1756 /*
1757  * The initial translation request could have been rejected, if
1758  * <retries> queue is not empty. Retry sending the translation
1759  * request. sendRetry() is called from the peer port whenever
1760  * a translation completes.
1761  */
1762 void
1763 ComputeUnit::ITLBPort::recvReqRetry()
1764 {
1765 
1766  int len = retries.size();
1767  DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n",
1768  computeUnit->cu_id, len);
1768 
1769  assert(len > 0);
1770  assert(isStalled());
1771 
1772  // recvReqRetry is an indication that the resource on which this
1773  // port was stalling on is freed. So, remove the stall first
1774  unstallPort();
1775 
1776  for (int i = 0; i < len; ++i) {
1777  PacketPtr pkt = retries.front();
1778  M5_VAR_USED Addr vaddr = pkt->req->getVaddr();
1779  DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x",
1780  computeUnit->cu_id, vaddr);
1780 
1781  if (!sendTimingReq(pkt)) {
1782  stallPort(); // Stall port
1783  DPRINTF(GPUTLB, ": failed again\n");
1784  break;
1785  } else {
1786  DPRINTF(GPUTLB, ": successful\n");
1787  retries.pop_front();
1788  }
1789  }
1790 }
1791 
1792 void
1793 ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
1794 {
1795  if (gpuDynInst->isScalar()) {
1796  if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
1797  stats.sALUInsts++;
1798  stats.instCyclesSALU++;
1799  } else if (gpuDynInst->isLoad()) {
1800  stats.scalarMemReads++;
1801  } else if (gpuDynInst->isStore()) {
1802  stats.scalarMemWrites++;
1803  }
1804  } else {
1805  if (gpuDynInst->isALU()) {
1806  shader->total_valu_insts++;
1807  if (shader->total_valu_insts == shader->max_valu_insts) {
1808  exitSimLoop("max vALU insts");
1809  }
1810  stats.vALUInsts++;
1811  stats.instCyclesVALU++;
1812  stats.threadCyclesVALU
1813  += gpuDynInst->wavefront()->execMask().count();
1814  } else if (gpuDynInst->isFlat()) {
1815  if (gpuDynInst->isLocalMem()) {
1816  stats.flatLDSInsts++;
1817  } else {
1818  stats.flatVMemInsts++;
1819  }
1820  } else if (gpuDynInst->isLocalMem()) {
1821  stats.ldsNoFlatInsts++;
1822  } else if (gpuDynInst->isLoad()) {
1823  stats.vectorMemReads++;
1824  } else if (gpuDynInst->isStore()) {
1825  stats.vectorMemWrites++;
1826  }
1827 
1828  if (gpuDynInst->isLoad()) {
1829  switch (gpuDynInst->executedAs()) {
1830  case Enums::SC_SPILL:
1831  stats.spillReads++;
1832  break;
1833  case Enums::SC_GLOBAL:
1834  stats.globalReads++;
1835  break;
1836  case Enums::SC_GROUP:
1837  stats.groupReads++;
1838  break;
1839  case Enums::SC_PRIVATE:
1840  stats.privReads++;
1841  break;
1842  case Enums::SC_READONLY:
1843  stats.readonlyReads++;
1844  break;
1845  case Enums::SC_KERNARG:
1846  stats.kernargReads++;
1847  break;
1848  case Enums::SC_ARG:
1849  stats.argReads++;
1850  break;
1851  case Enums::SC_NONE:
1852  /**
1853  * this case can occur for flat mem insts
1854  * executed with EXEC = 0
1855  */
1856  break;
1857  default:
1858  fatal("%s has no valid segment\n", gpuDynInst->disassemble());
1859  break;
1860  }
1861  } else if (gpuDynInst->isStore()) {
1862  switch (gpuDynInst->executedAs()) {
1863  case Enums::SC_SPILL:
1864  stats.spillWrites++;
1865  break;
1866  case Enums::SC_GLOBAL:
1867  stats.globalWrites++;
1868  break;
1869  case Enums::SC_GROUP:
1870  stats.groupWrites++;
1871  break;
1872  case Enums::SC_PRIVATE:
1873  stats.privWrites++;
1874  break;
1875  case Enums::SC_READONLY:
1876  stats.readonlyWrites++;
1877  break;
1878  case Enums::SC_KERNARG:
1879  stats.kernargWrites++;
1880  break;
1881  case Enums::SC_ARG:
1882  stats.argWrites++;
1883  break;
1884  case Enums::SC_NONE:
1885  /**
1886  * this case can occur for flat mem insts
1887  * executed with EXEC = 0
1888  */
1889  break;
1890  default:
1891  fatal("%s has no valid segment\n", gpuDynInst->disassemble());
1892  break;
1893  }
1894  }
1895  }
1896 }
1897 
1898 void
1899 ComputeUnit::updatePageDivergenceDist(Addr addr)
1900 {
1901  Addr virt_page_addr = roundDown(addr, X86ISA::PageBytes);
1902 
1903  if (!pagesTouched.count(virt_page_addr))
1904  pagesTouched[virt_page_addr] = 1;
1905  else
1906  pagesTouched[virt_page_addr]++;
1907 }
1908 
1909 void
1910 ComputeUnit::exitCallback()
1911 {
1912  if (countPages) {
1913  std::ostream *page_stat_file = simout.create(name().c_str())->stream();
1914 
1915  *page_stat_file << "page, wavefront accesses, workitem accesses" <<
1916  std::endl;
1917 
1918  for (auto iter : pageAccesses) {
1919  *page_stat_file << std::hex << iter.first << ",";
1920  *page_stat_file << std::dec << iter.second.first << ",";
1921  *page_stat_file << std::dec << iter.second.second << std::endl;
1922  }
1923  }
1924 }
1925 
1926 bool
1927 ComputeUnit::isDone() const
1928 {
1929  for (int i = 0; i < numVectorALUs; ++i) {
1930  if (!isVectorAluIdle(i)) {
1931  return false;
1932  }
1933  }
1934 
1935  // TODO: FIXME if more than 1 of any memory pipe supported
1936  if (!srfToScalarMemPipeBus.rdy()) {
1937  return false;
1938  }
1939  if (!vrfToGlobalMemPipeBus.rdy()) {
1940  return false;
1941  }
1942  if (!vrfToLocalMemPipeBus.rdy()) {
1943  return false;
1944  }
1945 
1946  if (!globalMemoryPipe.isGMReqFIFOWrRdy()
1947  || !localMemoryPipe.isLMReqFIFOWrRdy()
1948  || !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy()
1949  || !glbMemToVrfBus.rdy() || !scalarMemToSrfBus.rdy()) {
1950  return false;
1951  }
1952 
1953  return true;
1954 }
1955 
1956 int32_t
1957 ComputeUnit::getRefCounter(const uint32_t dispatchId,
1958  const uint32_t wgId) const
1959 {
1960  return lds.getRefCounter(dispatchId, wgId);
1961 }
1962 
1963 bool
1964 ComputeUnit::isVectorAluIdle(uint32_t simdId) const
1965 {
1966  assert(simdId < numVectorALUs);
1967 
1968  for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){
1969  if (wfList[simdId][i_wf]->getStatus() != Wavefront::S_STOPPED) {
1970  return false;
1971  }
1972  }
1973 
1974  return true;
1975 }
1976 
1976 
1977 /**
1978  * send a general request to the LDS
1979  * make sure to look at the return value here as your request might be
1980  * NACK'd and returning false means that you have to have some backup plan
1981  */
1982 bool
1983 ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
1984 {
1985  // this is just a request to carry the GPUDynInstPtr
1986  // back and forth
1987  RequestPtr newRequest = std::make_shared<Request>();
1988  newRequest->setPaddr(0x0);
1989 
1990  // ReadReq is not evaluated by the LDS but the Packet ctor requires this
1991  PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq);
1992 
1993  // This is the SenderState needed upon return
1994  newPacket->senderState = new LDSPort::SenderState(gpuDynInst);
1995 
1996  return ldsPort.sendTimingReq(newPacket);
1997 }
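// Callers must honor the return value: false means the LDS port NACK'd the
// request and the caller needs a backup plan, along the lines of this
// hypothetical caller:
//
//     if (!sendToLds(gpuDynInst)) {
//         // could not send: leave the instruction queued and retry later
//     }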
1998 
1999 /**
2000  * get the result of packets sent to the LDS when they return
2001  */
2002 bool
2003 ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
2004 {
2005  const ComputeUnit::LDSPort::SenderState *senderState =
2006  dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState);
2007 
2008  fatal_if(!senderState, "did not get the right sort of sender state");
2009 
2010  GPUDynInstPtr gpuDynInst = senderState->getMemInst();
2011 
2012  delete packet->senderState;
2013  delete packet;
2014 
2015  computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
2016  return true;
2017 }
2018 
2019 /**
2020  * attempt to send this packet, either the port is already stalled, the
2021  * request is nack'd and must stall or the request goes through
2022  * when a request cannot be sent, add it to the retries queue
2023  */
2024 bool
2025 ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
2026 {
2027  ComputeUnit::LDSPort::SenderState *sender_state =
2028  dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
2029  fatal_if(!sender_state, "packet without a valid sender state");
2030 
2031  M5_VAR_USED GPUDynInstPtr gpuDynInst = sender_state->getMemInst();
2032 
2033  if (isStalled()) {
2034  fatal_if(retries.empty(), "must have retries waiting to be stalled");
2035 
2036  retries.push(pkt);
2037 
2038  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
2039  computeUnit->cu_id, gpuDynInst->simdId,
2040  gpuDynInst->wfSlotId);
2041  return false;
2042  } else if (!RequestPort::sendTimingReq(pkt)) {
2043  // need to stall the LDS port until a recvReqRetry() is received
2044  // this indicates that there is more space
2045  stallPort();
2046  retries.push(pkt);
2047 
2048  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
2049  computeUnit->cu_id, gpuDynInst->simdId,
2050  gpuDynInst->wfSlotId, pkt->req->getPaddr());
2051  return false;
2052  } else {
2053  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
2054  computeUnit->cu_id, gpuDynInst->simdId,
2055  gpuDynInst->wfSlotId, pkt->req->getPaddr());
2056  return true;
2057  }
2058 }
2059 
2060 /**
2061  * the bus is telling the port that there is now space so retrying stalled
2062  * requests should work now
2063  * this allows the port to have a request be nack'd and then have the
2064  * receiver say when there is space, rather than retrying the send every cycle
2065  */
2066 void
2067 ComputeUnit::LDSPort::recvReqRetry()
2068 {
2069  auto queueSize = retries.size();
2070 
2071  DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
2072  computeUnit->cu_id, queueSize);
2073 
2074  fatal_if(queueSize < 1,
2075  "why was there a recvReqRetry() with no pending reqs?");
2076  fatal_if(!isStalled(),
2077  "recvReqRetry() happened when the port was not stalled");
2078 
2079  unstallPort();
2080 
2081  while (!retries.empty()) {
2082  PacketPtr packet = retries.front();
2083 
2084  DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);
2085 
2086  if (!RequestPort::sendTimingReq(packet)) {
2087  // Stall port
2088  stallPort();
2089  DPRINTF(GPUPort, ": LDS send failed again\n");
2090  break;
2091  } else {
2092  DPRINTF(GPUPort, ": LDS send successful\n");
2093  retries.pop();
2094  }
2095  }
2096 }
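// Taken together, sendTimingReq()/recvReqRetry() implement a simple
// stall-and-replay protocol: a NACK'd packet is queued and the port
// stalled; on retry the queue is replayed in FIFO order, and the port
// re-stalls as soon as a packet is rejected again.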
2097 
2098 ComputeUnit::ComputeUnitStats::ComputeUnitStats(Stats::Group *parent, int n_wf)
2099  : Stats::Group(parent),
2100  ADD_STAT(vALUInsts, "Number of vector ALU insts issued."),
2101  ADD_STAT(vALUInstsPerWF, "The avg. number of vector ALU insts issued "
2102  "per-wavefront."),
2103  ADD_STAT(sALUInsts, "Number of scalar ALU insts issued."),
2104  ADD_STAT(sALUInstsPerWF, "The avg. number of scalar ALU insts issued "
2105  "per-wavefront."),
2106  ADD_STAT(instCyclesVALU,
2107  "Number of cycles needed to execute VALU insts."),
2108  ADD_STAT(instCyclesSALU,
2109  "Number of cycles needed to execute SALU insts."),
2110  ADD_STAT(threadCyclesVALU, "Number of thread cycles used to execute "
2111  "vector ALU ops. Similar to instCyclesVALU but multiplied by "
2112  "the number of active threads."),
2113  ADD_STAT(vALUUtilization,
2114  "Percentage of active vector ALU threads in a wave."),
2115  ADD_STAT(ldsNoFlatInsts, "Number of LDS insts issued, not including FLAT"
2116  " accesses that resolve to LDS."),
2117  ADD_STAT(ldsNoFlatInstsPerWF, "The avg. number of LDS insts (not "
2118  "including FLAT accesses that resolve to LDS) per-wavefront."),
2119  ADD_STAT(flatVMemInsts,
2120  "The number of FLAT insts that resolve to vmem issued."),
2121  ADD_STAT(flatVMemInstsPerWF, "The average number of FLAT insts that "
2122  "resolve to vmem issued per-wavefront."),
2123  ADD_STAT(flatLDSInsts,
2124  "The number of FLAT insts that resolve to LDS issued."),
2125  ADD_STAT(flatLDSInstsPerWF, "The average number of FLAT insts that "
2126  "resolve to LDS issued per-wavefront."),
2127  ADD_STAT(vectorMemWrites,
2128  "Number of vector mem write insts (excluding FLAT insts)."),
2129  ADD_STAT(vectorMemWritesPerWF, "The average number of vector mem write "
2130  "insts (excluding FLAT insts) per-wavefront."),
2131  ADD_STAT(vectorMemReads,
2132  "Number of vector mem read insts (excluding FLAT insts)."),
2133  ADD_STAT(vectorMemReadsPerWF, "The avg. number of vector mem read insts "
2134  "(excluding FLAT insts) per-wavefront."),
2135  ADD_STAT(scalarMemWrites, "Number of scalar mem write insts."),
2136  ADD_STAT(scalarMemWritesPerWF,
2137  "The average number of scalar mem write insts per-wavefront."),
2138  ADD_STAT(scalarMemReads, "Number of scalar mem read insts."),
2139  ADD_STAT(scalarMemReadsPerWF,
2140  "The average number of scalar mem read insts per-wavefront."),
2141  ADD_STAT(vectorMemReadsPerKiloInst,
2142  "Number of vector mem reads per kilo-instruction"),
2143  ADD_STAT(vectorMemWritesPerKiloInst,
2144  "Number of vector mem writes per kilo-instruction"),
2145  ADD_STAT(vectorMemInstsPerKiloInst,
2146  "Number of vector mem insts per kilo-instruction"),
2147  ADD_STAT(scalarMemReadsPerKiloInst,
2148  "Number of scalar mem reads per kilo-instruction"),
2149  ADD_STAT(scalarMemWritesPerKiloInst,
2150  "Number of scalar mem writes per kilo-instruction"),
2151  ADD_STAT(scalarMemInstsPerKiloInst,
2152  "Number of scalar mem insts per kilo-instruction"),
2153  ADD_STAT(instCyclesVMemPerSimd, "Number of cycles to send address, "
2154  "command, data from VRF to vector memory unit, per SIMD"),
2155  ADD_STAT(instCyclesScMemPerSimd, "Number of cycles to send address, "
2156  "command, data from SRF to scalar memory unit, per SIMD"),
2157  ADD_STAT(instCyclesLdsPerSimd, "Number of cycles to send address, "
2158  "command, data from VRF to LDS unit, per SIMD"),
2159  ADD_STAT(globalReads, "Number of reads to the global segment"),
2160  ADD_STAT(globalWrites, "Number of writes to the global segment"),
2161  ADD_STAT(globalMemInsts,
2162  "Number of memory instructions sent to the global segment"),
2163  ADD_STAT(argReads, "Number of reads to the arg segment"),
2164  ADD_STAT(argWrites, "Number of writes to the arg segment"),
2165  ADD_STAT(argMemInsts,
2166  "Number of memory instructions sent to the arg segment"),
2167  ADD_STAT(spillReads, "Number of reads to the spill segment"),
2168  ADD_STAT(spillWrites, "Number of writes to the spill segment"),
2169  ADD_STAT(spillMemInsts,
2170  "Number of memory instructions sent to the spill segment"),
2171  ADD_STAT(groupReads, "Number of reads to the group segment"),
2172  ADD_STAT(groupWrites, "Number of writes to the group segment"),
2173  ADD_STAT(groupMemInsts,
2174  "Number of memory instructions sent to the group segment"),
2175  ADD_STAT(privReads, "Number of reads to the private segment"),
2176  ADD_STAT(privWrites, "Number of writes to the private segment"),
2177  ADD_STAT(privMemInsts,
2178  "Number of memory instructions sent to the private segment"),
2179  ADD_STAT(readonlyReads, "Number of reads to the readonly segment"),
2180  ADD_STAT(readonlyWrites,
2181  "Number of writes to the readonly segment"),
2182  ADD_STAT(readonlyMemInsts,
2183  "Number of memory instructions sent to the readonly segment"),
2184  ADD_STAT(kernargReads, "Number of reads to the kernarg segment"),
2185  ADD_STAT(kernargWrites,
2186  "Number of writes to the kernarg segment"),
2187  ADD_STAT(kernargMemInsts,
2188  "Number of memory instructions sent to the kernarg segment"),
2189  ADD_STAT(waveLevelParallelism,
2190  "wave level parallelism: count of active waves at wave launch"),
2191  ADD_STAT(tlbRequests, "number of uncoalesced requests"),
2192  ADD_STAT(tlbCycles,
2193  "total number of cycles for all uncoalesced requests"),
2194  ADD_STAT(tlbLatency, "Avg. translation latency for data translations"),
2195  ADD_STAT(hitsPerTLBLevel,
2196  "TLB hits distribution (0 for page table, x for Lx-TLB)"),
2197  ADD_STAT(ldsBankAccesses, "Total number of LDS bank accesses"),
2198  ADD_STAT(ldsBankConflictDist,
2199  "Number of bank conflicts per LDS memory packet"),
2200  ADD_STAT(pageDivergenceDist,
2201  "pages touched per wf (over all mem. instr.)"),
2202  ADD_STAT(dynamicGMemInstrCnt,
2203  "dynamic non-flat global memory instruction count"),
2204  ADD_STAT(dynamicFlatMemInstrCnt,
2205  "dynamic flat global memory instruction count"),
2206  ADD_STAT(dynamicLMemInstrCnt, "dynamic local memory instruction count"),
2207  ADD_STAT(wgBlockedDueBarrierAllocation,
2208  "WG dispatch was blocked due to lack of barrier resources"),
2209  ADD_STAT(wgBlockedDueLdsAllocation,
2210  "Workgroup blocked due to LDS capacity"),
2211  ADD_STAT(numInstrExecuted, "number of instructions executed"),
2212  ADD_STAT(execRateDist, "Instruction Execution Rate: Number of executed "
2213  "vector instructions per cycle"),
2214  ADD_STAT(numVecOpsExecuted,
2215  "number of vec ops executed (e.g. WF size/inst)"),
2216  ADD_STAT(numVecOpsExecutedF16,
2217  "number of f16 vec ops executed (e.g. WF size/inst)"),
2218  ADD_STAT(numVecOpsExecutedF32,
2219  "number of f32 vec ops executed (e.g. WF size/inst)"),
2220  ADD_STAT(numVecOpsExecutedF64,
2221  "number of f64 vec ops executed (e.g. WF size/inst)"),
2222  ADD_STAT(numVecOpsExecutedFMA16,
2223  "number of fma16 vec ops executed (e.g. WF size/inst)"),
2224  ADD_STAT(numVecOpsExecutedFMA32,
2225  "number of fma32 vec ops executed (e.g. WF size/inst)"),
2226  ADD_STAT(numVecOpsExecutedFMA64,
2227  "number of fma64 vec ops executed (e.g. WF size/inst)"),
2228  ADD_STAT(numVecOpsExecutedMAC16,
2229  "number of mac16 vec ops executed (e.g. WF size/inst)"),
2230  ADD_STAT(numVecOpsExecutedMAC32,
2231  "number of mac32 vec ops executed (e.g. WF size/inst)"),
2232  ADD_STAT(numVecOpsExecutedMAC64,
2233  "number of mac64 vec ops executed (e.g. WF size/inst)"),
2234  ADD_STAT(numVecOpsExecutedMAD16,
2235  "number of mad16 vec ops executed (e.g. WF size/inst)"),
2236  ADD_STAT(numVecOpsExecutedMAD32,
2237  "number of mad32 vec ops executed (e.g. WF size/inst)"),
2238  ADD_STAT(numVecOpsExecutedMAD64,
2239  "number of mad64 vec ops executed (e.g. WF size/inst)"),
2240  ADD_STAT(numVecOpsExecutedTwoOpFP,
2241  "number of two op FP vec ops executed (e.g. WF size/inst)"),
2242  ADD_STAT(totalCycles, "number of cycles the CU ran for"),
2243  ADD_STAT(vpc, "Vector Operations per cycle (this CU only)"),
2244  ADD_STAT(vpc_f16, "F16 Vector Operations per cycle (this CU only)"),
2245  ADD_STAT(vpc_f32, "F32 Vector Operations per cycle (this CU only)"),
2246  ADD_STAT(vpc_f64, "F64 Vector Operations per cycle (this CU only)"),
2247  ADD_STAT(ipc, "Instructions per cycle (this CU only)"),
2248  ADD_STAT(controlFlowDivergenceDist, "number of lanes active per "
2249  "instruction (over all instructions)"),
2250  ADD_STAT(activeLanesPerGMemInstrDist,
2251  "number of active lanes per global memory instruction"),
2252  ADD_STAT(activeLanesPerLMemInstrDist,
2253  "number of active lanes per local memory instruction"),
2254  ADD_STAT(numALUInstsExecuted,
2255  "Number of dynamic non-GM memory insts executed"),
2256  ADD_STAT(numTimesWgBlockedDueVgprAlloc, "Number of times WGs are "
2257  "blocked due to VGPR allocation per SIMD"),
2258  ADD_STAT(numTimesWgBlockedDueSgprAlloc, "Number of times WGs are "
2259  "blocked due to SGPR allocation per SIMD"),
2260  ADD_STAT(numCASOps, "number of compare and swap operations"),
2261  ADD_STAT(numFailedCASOps,
2262  "number of compare and swap operations that failed"),
2263  ADD_STAT(completedWfs, "number of completed wavefronts"),
2264  ADD_STAT(completedWGs, "number of completed workgroups"),
2265  ADD_STAT(headTailLatency, "ticks between first and last cache block "
2266  "arrival at coalescer"),
2267  ADD_STAT(instInterleave, "Measure of instruction interleaving per SIMD")
2268 {
2269  ComputeUnit *cu = static_cast<ComputeUnit*>(parent);
2270 
2271  instCyclesVMemPerSimd.init(cu->numVectorALUs);
2272  instCyclesScMemPerSimd.init(cu->numVectorALUs);
2273  instCyclesLdsPerSimd.init(cu->numVectorALUs);
2274 
2275  hitsPerTLBLevel.init(4);
2276  execRateDist.init(0, 10, 2);
2277  ldsBankConflictDist.init(0, cu->wfSize(), 2);
2278 
2279  pageDivergenceDist.init(1, cu->wfSize(), 4);
2280  controlFlowDivergenceDist.init(1, cu->wfSize(), 4);
2281  activeLanesPerGMemInstrDist.init(1, cu->wfSize(), 4);
2282  activeLanesPerLMemInstrDist.init(1, cu->wfSize(), 4);
2283 
2284  headTailLatency.init(0, 1000000, 10000).flags(Stats::pdf | Stats::oneline);
2285  waveLevelParallelism.init(0, n_wf * cu->numVectorALUs, 1);
2286  instInterleave.init(cu->numVectorALUs, 0, 20, 1);
2287 
2288  vALUInstsPerWF = vALUInsts / completedWfs;
2289  sALUInstsPerWF = sALUInsts / completedWfs;
2290  vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
2291  ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
2292  flatVMemInstsPerWF = flatVMemInsts / completedWfs;
2293  flatLDSInstsPerWF = flatLDSInsts / completedWfs;
2294  vectorMemWritesPerWF = vectorMemWrites / completedWfs;
2295  vectorMemReadsPerWF = vectorMemReads / completedWfs;
2296  scalarMemWritesPerWF = scalarMemWrites / completedWfs;
2297  scalarMemReadsPerWF = scalarMemReads / completedWfs;
2298 
2299  vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000;
2300  vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000;
2301  vectorMemInstsPerKiloInst =
2302  ((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000;
2303  scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000;
2304  scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000;
2305  scalarMemInstsPerKiloInst =
2306  ((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000;
2307 
2308  vpc = numVecOpsExecuted / totalCycles;
2309  vpc_f16 = numVecOpsExecutedF16 / totalCycles;
2310  vpc_f32 = numVecOpsExecutedF32 / totalCycles;
2311  vpc_f64 = numVecOpsExecutedF64 / totalCycles;
2312  ipc = numInstrExecuted / totalCycles;
2313  numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
2314  dynamicLMemInstrCnt;
2315 
2316  tlbLatency = tlbCycles / tlbRequests;
2317 
2318  // fixed number of TLB levels
2319  for (int i = 0; i < 4; ++i) {
2320  if (!i)
2321  hitsPerTLBLevel.subname(i,"page_table");
2322  else
2323  hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
2324  }
2325 
2331 
2334 }
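// Note: the Stats::Formula members bound in this constructor (e.g.
// vALUInstsPerWF = vALUInsts / completedWfs) are not computed here; gem5
// evaluates formula stats lazily when statistics are dumped, so they
// reflect the final counter values.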

Generated on Tue Mar 23 2021 19:41:27 for gem5 by doxygen 1.8.17