gem5  v19.0.0.0
compute_unit.cc
1 /*
2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * For use for simulation and test purposes only
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright notice,
11  * this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  * this list of conditions and the following disclaimer in the documentation
15  * and/or other materials provided with the distribution.
16  *
17  * 3. Neither the name of the copyright holder nor the names of its
18  * contributors may be used to endorse or promote products derived from this
19  * software without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  *
33  * Authors: John Kalamatianos,
34  * Anthony Gutierrez
35  */
36 
37 #include "gpu-compute/compute_unit.hh"
38 
39 #include <limits>
40 
41 #include "base/output.hh"
42 #include "debug/GPUDisp.hh"
43 #include "debug/GPUExec.hh"
44 #include "debug/GPUFetch.hh"
45 #include "debug/GPUMem.hh"
46 #include "debug/GPUPort.hh"
47 #include "debug/GPUPrefetch.hh"
48 #include "debug/GPUSync.hh"
49 #include "debug/GPUTLB.hh"
50 #include "gpu-compute/dispatcher.hh"
51 #include "gpu-compute/gpu_dyn_inst.hh"
52 #include "gpu-compute/gpu_static_inst.hh"
53 #include "gpu-compute/ndrange.hh"
54 #include "gpu-compute/shader.hh"
55 #include "gpu-compute/simple_pool_manager.hh"
56 #include "gpu-compute/vector_register_file.hh"
57 #include "gpu-compute/wavefront.hh"
58 #include "mem/page_table.hh"
59 #include "sim/process.hh"
60 
61 ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), fetchStage(p),
62  scoreboardCheckStage(p), scheduleStage(p), execStage(p),
63  globalMemoryPipe(p), localMemoryPipe(p), rrNextMemID(0), rrNextALUWp(0),
64  cu_id(p->cu_id), vrf(p->vector_register_file), numSIMDs(p->num_SIMDs),
65  spBypassPipeLength(p->spbypass_pipe_length),
66  dpBypassPipeLength(p->dpbypass_pipe_length),
67  issuePeriod(p->issue_period),
68  numGlbMemUnits(p->num_global_mem_pipes),
69  numLocMemUnits(p->num_shared_mem_pipes),
70  perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth),
71  prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type),
72  xact_cas_mode(p->xactCasMode), debugSegFault(p->debugSegFault),
73  functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier),
74  countPages(p->countPages), barrier_id(0),
75  vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
76  coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
77  req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
78  resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
79  _masterId(p->system->getMasterId(this, "ComputeUnit")),
80  lds(*p->localDataStore), _cacheLineSize(p->system->cacheLineSize()),
81  globalSeqNum(0), wavefrontSize(p->wfSize),
82  kernelLaunchInst(new KernelLaunchStaticInst())
83 {
93  fatal_if(p->wfSize > std::numeric_limits<unsigned long long>::digits ||
94  p->wfSize <= 0,
95  "WF size is larger than the host can support");
 96  fatal_if(!isPowerOf2(p->wfSize),
 97  "Wavefront size should be a power of 2");
98  // calculate how many cycles a vector load or store will need to transfer
99  // its data over the corresponding buses
 100  numCyclesPerStoreTransfer =
 101  (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
102  (double)vrfToCoalescerBusWidth);
103 
 104  numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
 105  / coalescerToVrfBusWidth;
 106 
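 // Example: with a 64-lane wavefront each vector access moves
 // 64 * sizeof(uint32_t) = 256 bytes, so a 32-byte coalescer bus (one
 // plausible configuration) needs ceil(256/32) = 8 cycles per store
 // transfer; the load path is computed the same way.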
107  lastVaddrWF.resize(numSIMDs);
108  wfList.resize(numSIMDs);
109 
110  for (int j = 0; j < numSIMDs; ++j) {
111  lastVaddrWF[j].resize(p->n_wf);
112 
113  for (int i = 0; i < p->n_wf; ++i) {
114  lastVaddrWF[j][i].resize(wfSize());
115 
116  wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
117  wfList[j][i]->setParent(this);
118 
119  for (int k = 0; k < wfSize(); ++k) {
120  lastVaddrWF[j][i][k] = 0;
121  }
122  }
123  }
124 
125  lastVaddrSimd.resize(numSIMDs);
126 
127  for (int i = 0; i < numSIMDs; ++i) {
128  lastVaddrSimd[i].resize(wfSize(), 0);
129  }
130 
131  lastVaddrCU.resize(wfSize());
132 
133  lds.setParent(this);
134 
 135  if (p->execPolicy == "OLDEST-FIRST") {
 136  exec_policy = EXEC_POLICY::OLDEST;
 137  } else if (p->execPolicy == "ROUND-ROBIN") {
 138  exec_policy = EXEC_POLICY::RR;
 139  } else {
140  fatal("Invalid WF execution policy (CU)\n");
141  }
142 
143  memPort.resize(wfSize());
144 
145  // resize the tlbPort vectorArray
146  int tlbPort_width = perLaneTLB ? wfSize() : 1;
147  tlbPort.resize(tlbPort_width);
148 
 149  cuExitCallback = new CUExitCallback(this);
 150  registerExitCallback(cuExitCallback);
 151 
152  xactCasLoadMap.clear();
153  lastExecCycle.resize(numSIMDs, 0);
154 
155  for (int i = 0; i < vrf.size(); ++i) {
156  vrf[i]->setParent(this);
157  }
158 
159  numVecRegsPerSimd = vrf[0]->numRegs();
160 }
 161 
 162 ComputeUnit::~ComputeUnit()
 163 {
164  // Delete wavefront slots
165  for (int j = 0; j < numSIMDs; ++j) {
166  for (int i = 0; i < shader->n_wf; ++i) {
167  delete wfList[j][i];
168  }
169  lastVaddrSimd[j].clear();
170  }
171  lastVaddrCU.clear();
172  readyList.clear();
173  waveStatusList.clear();
174  dispatchList.clear();
175  vectorAluInstAvail.clear();
176  delete cuExitCallback;
177  delete ldsPort;
178 }
179 
180 void
 181 ComputeUnit::fillKernelState(Wavefront *w, NDRange *ndr)
 182 {
183  w->resizeRegFiles(ndr->q.cRegCount, ndr->q.sRegCount, ndr->q.dRegCount);
184 
185  w->workGroupSz[0] = ndr->q.wgSize[0];
186  w->workGroupSz[1] = ndr->q.wgSize[1];
187  w->workGroupSz[2] = ndr->q.wgSize[2];
188  w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
189  w->gridSz[0] = ndr->q.gdSize[0];
190  w->gridSz[1] = ndr->q.gdSize[1];
191  w->gridSz[2] = ndr->q.gdSize[2];
192  w->kernelArgs = ndr->q.args;
193  w->privSizePerItem = ndr->q.privMemPerItem;
 194  w->spillSizePerItem = ndr->q.spillMemPerItem;
 195  w->roBase = ndr->q.roMemStart;
196  w->roSize = ndr->q.roMemTotal;
197  w->computeActualWgSz(ndr);
198 }
199 
 200 void
 201 ComputeUnit::updateEvents()
 202 {
203  if (!timestampVec.empty()) {
204  uint32_t vecSize = timestampVec.size();
205  uint32_t i = 0;
206  while (i < vecSize) {
207  if (timestampVec[i] <= shader->tick_cnt) {
 208  std::pair<uint32_t, uint32_t> regInfo = regIdxVec[i];
 209  vrf[regInfo.first]->markReg(regInfo.second, sizeof(uint32_t),
210  statusVec[i]);
211  timestampVec.erase(timestampVec.begin() + i);
212  regIdxVec.erase(regIdxVec.begin() + i);
213  statusVec.erase(statusVec.begin() + i);
214  --vecSize;
215  --i;
216  }
217  ++i;
218  }
219  }
220 
 221  for (int i = 0; i < numSIMDs; ++i) {
222  vrf[i]->updateEvents();
223  }
224 }
225 
226 
227 void
 228 ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
 229  NDRange *ndr)
230 {
231  static int _n_wave = 0;
232 
233  VectorMask init_mask;
234  init_mask.reset();
235 
236  for (int k = 0; k < wfSize(); ++k) {
237  if (k + waveId * wfSize() < w->actualWgSzTotal)
238  init_mask[k] = 1;
239  }
240 
241  w->kernId = ndr->dispatchId;
242  w->wfId = waveId;
243  w->initMask = init_mask.to_ullong();
244 
245  for (int k = 0; k < wfSize(); ++k) {
246  w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
247  w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
248  w->actualWgSz[1];
249  w->workItemId[2][k] = (k + waveId * wfSize()) /
250  (w->actualWgSz[0] * w->actualWgSz[1]);
251 
252  w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
253  w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
254  w->workItemId[0][k];
255  }
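 // The flat ID linearizes the 3D work-item ID in x-major order:
 // flat = x + y*Dx + z*Dx*Dy. E.g., in an 8x8x1 workgroup the item
 // at (3,2,0) gets flat ID 3 + 2*8 = 19.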
256 
257  w->barrierSlots = divCeil(w->actualWgSzTotal, wfSize());
258 
259  w->barCnt.resize(wfSize(), 0);
260 
261  w->maxBarCnt = 0;
262  w->oldBarrierCnt = 0;
263  w->barrierCnt = 0;
264 
265  w->privBase = ndr->q.privMemStart;
266  ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
267 
268  w->spillBase = ndr->q.spillMemStart;
269  ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
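 // Each wavefront carves its private/spill region out of the queue's
 // memory pool: the start pointers advance by (bytes per work-item *
 // wavefront size), so the next wavefront gets a disjoint region.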
270 
271  w->pushToReconvergenceStack(0, UINT32_MAX, init_mask.to_ulong());
272 
273  // WG state
274  w->wgId = ndr->globalWgId;
275  w->dispatchId = ndr->dispatchId;
276  w->workGroupId[0] = w->wgId % ndr->numWg[0];
277  w->workGroupId[1] = (w->wgId / ndr->numWg[0]) % ndr->numWg[1];
278  w->workGroupId[2] = w->wgId / (ndr->numWg[0] * ndr->numWg[1]);
279 
280  w->barrierId = barrier_id;
281  w->stalledAtBarrier = false;
282 
283  // set the wavefront context to have a pointer to this section of the LDS
284  w->ldsChunk = ldsChunk;
285 
286  int32_t refCount M5_VAR_USED =
 287  lds.increaseRefCounter(w->dispatchId, w->wgId);
 288  DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
289  cu_id, w->wgId, refCount);
290 
291  w->instructionBuffer.clear();
292 
293  if (w->pendingFetch)
294  w->dropFetch = true;
295 
 296  // is this the last wavefront in the workgroup?
 297  // if so, set the spillWidth to be the remaining work-items
 298  // so that the vector access is correct
299  if ((waveId + 1) * wfSize() >= w->actualWgSzTotal) {
300  w->spillWidth = w->actualWgSzTotal - (waveId * wfSize());
301  } else {
302  w->spillWidth = wfSize();
303  }
304 
305  DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
306  "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
307 
308  w->start(++_n_wave, ndr->q.code_ptr);
309 }
310 
311 void
 312 ComputeUnit::StartWorkgroup(NDRange *ndr)
 313 {
314  // reserve the LDS capacity allocated to the work group
315  // disambiguated by the dispatch ID and workgroup ID, which should be
316  // globally unique
317  LdsChunk *ldsChunk = lds.reserveSpace(ndr->dispatchId, ndr->globalWgId,
318  ndr->q.ldsSize);
319 
320  // Send L1 cache acquire
321  // isKernel + isAcquire = Kernel Begin
 322  if (shader->impl_kern_boundary_sync) {
 323  GPUDynInstPtr gpuDynInst =
324  std::make_shared<GPUDynInst>(this, nullptr, kernelLaunchInst,
325  getAndIncSeqNum());
326 
327  gpuDynInst->useContinuation = false;
328  injectGlobalMemFence(gpuDynInst, true);
329  }
330 
331  // calculate the number of 32-bit vector registers required by wavefront
332  int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
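 // Each 64-bit (double) register occupies two 32-bit VGPR slots, hence
 // the 2x weight. E.g., 8 single regs and 4 double regs per work-item
 // demand 8 + 2*4 = 16 VGPRs.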
333  int wave_id = 0;
334 
335  // Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time
336  for (int m = 0; m < shader->n_wf * numSIMDs; ++m) {
337  Wavefront *w = wfList[m % numSIMDs][m / numSIMDs];
338  // Check if this wavefront slot is available:
339  // It must be stopped and not waiting
340  // for a release to complete S_RETURNING
341  if (w->status == Wavefront::S_STOPPED) {
342  fillKernelState(w, ndr);
343  // if we have scheduled all work items then stop
344  // scheduling wavefronts
345  if (wave_id * wfSize() >= w->actualWgSzTotal)
346  break;
347 
348  // reserve vector registers for the scheduled wavefront
 349  assert(vectorRegsReserved[m % numSIMDs] <= numVecRegsPerSimd);
 350  uint32_t normSize = 0;
351 
352  w->startVgprIndex = vrf[m % numSIMDs]->manager->
353  allocateRegion(vregDemand, &normSize);
354 
 355  w->reservedVectorRegs = normSize;
 356  vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs;
 357 
358  startWavefront(w, wave_id, ldsChunk, ndr);
359  ++wave_id;
360  }
361  }
362  ++barrier_id;
363 }
364 
365 int
 366 ComputeUnit::ReadyWorkgroup(NDRange *ndr)
 367 {
368  // Get true size of workgroup (after clamping to grid size)
369  int trueWgSize[3];
370  int trueWgSizeTotal = 1;
371 
372  for (int d = 0; d < 3; ++d) {
373  trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] -
374  ndr->wgId[d] * ndr->q.wgSize[d]);
375 
376  trueWgSizeTotal *= trueWgSize[d];
377  DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
378  }
379 
380  DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
381 
382  // calculate the number of 32-bit vector registers required by each
383  // work item of the work group
384  int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
385  bool vregAvail = true;
386  int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
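 // Round up to whole wavefronts: e.g., 100 work-items with a 64-wide
 // wavefront need (100 + 63) / 64 = 2 wavefront slots.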
387  int freeWfSlots = 0;
388  // check if the total number of VGPRs required by all WFs of the WG
389  // fit in the VRFs of all SIMD units
390  assert((numWfs * vregDemandPerWI) <= (numSIMDs * numVecRegsPerSimd));
391  int numMappedWfs = 0;
392  std::vector<int> numWfsPerSimd;
393  numWfsPerSimd.resize(numSIMDs, 0);
394  // find how many free WF slots we have across all SIMDs
395  for (int j = 0; j < shader->n_wf; ++j) {
396  for (int i = 0; i < numSIMDs; ++i) {
397  if (wfList[i][j]->status == Wavefront::S_STOPPED) {
398  // count the number of free WF slots
399  ++freeWfSlots;
400  if (numMappedWfs < numWfs) {
401  // count the WFs to be assigned per SIMD
402  numWfsPerSimd[i]++;
403  }
404  numMappedWfs++;
405  }
406  }
407  }
408 
409  // if there are enough free WF slots then find if there are enough
410  // free VGPRs per SIMD based on the WF->SIMD mapping
411  if (freeWfSlots >= numWfs) {
412  for (int j = 0; j < numSIMDs; ++j) {
413  // find if there are enough free VGPR regions in the SIMD's VRF
414  // to accommodate the WFs of the new WG that would be mapped to
415  // this SIMD unit
416  vregAvail = vrf[j]->manager->canAllocate(numWfsPerSimd[j],
417  vregDemandPerWI);
418 
419  // stop searching if there is at least one SIMD
420  // whose VRF does not have enough free VGPR pools.
421  // This is because a WG is scheduled only if ALL
422  // of its WFs can be scheduled
423  if (!vregAvail)
424  break;
425  }
426  }
427 
428  DPRINTF(GPUDisp, "Free WF slots = %d, VGPR Availability = %d\n",
429  freeWfSlots, vregAvail);
430 
431  if (!vregAvail) {
 432  ++numTimesWgBlockedDueVgprAlloc;
 433  }
434 
435  // Return true if enough WF slots to submit workgroup and if there are
436  // enough VGPRs to schedule all WFs to their SIMD units
437  if (!lds.canReserve(ndr->q.ldsSize)) {
 438  wgBlockedDueLdsAllocation++;
 439  }
440 
441  // Return true if (a) there are enough free WF slots to submit
 442  // workgroup and (b) if there are enough VGPRs to schedule all WFs to their
443  // SIMD units and (c) if there is enough space in LDS
444  return freeWfSlots >= numWfs && vregAvail && lds.canReserve(ndr->q.ldsSize);
445 }
446 
447 int
448 ComputeUnit::AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots)
449 {
450  DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id);
451  int ccnt = 0;
452 
453  for (int i_simd = 0; i_simd < numSIMDs; ++i_simd) {
454  for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
455  Wavefront *w = wfList[i_simd][i_wf];
456 
457  if (w->status == Wavefront::S_RUNNING) {
458  DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf);
459 
460  DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n",
461  w->barrierId, _barrier_id);
462 
463  DPRINTF(GPUSync, "wf->barrier_cnt %d, bcnt = %d\n",
464  w->barrierCnt, bcnt);
465  }
466 
467  if (w->status == Wavefront::S_RUNNING &&
468  w->barrierId == _barrier_id && w->barrierCnt == bcnt &&
469  !w->outstandingReqs) {
470  ++ccnt;
471 
472  DPRINTF(GPUSync, "WF[%d][%d] at barrier, increment ccnt to "
473  "%d\n", i_simd, i_wf, ccnt);
474  }
475  }
476  }
477 
478  DPRINTF(GPUSync, "CU%d: returning allAtBarrier ccnt = %d, bslots = %d\n",
479  cu_id, ccnt, bslots);
480 
481  return ccnt == bslots;
482 }
483 
484 // Check if the current wavefront is blocked on additional resources.
485 bool
486 ComputeUnit::cedeSIMD(int simdId, int wfSlotId)
487 {
488  bool cede = false;
489 
490  // If --xact-cas-mode option is enabled in run.py, then xact_cas_ld
491  // magic instructions will impact the scheduling of wavefronts
492  if (xact_cas_mode) {
493  /*
494  * When a wavefront calls xact_cas_ld, it adds itself to a per address
495  * queue. All per address queues are managed by the xactCasLoadMap.
496  *
497  * A wavefront is not blocked if: it is not in ANY per address queue or
498  * if it is at the head of a per address queue.
499  */
500  for (auto itMap : xactCasLoadMap) {
501  std::list<waveIdentifier> curWaveIDQueue = itMap.second.waveIDQueue;
502 
503  if (!curWaveIDQueue.empty()) {
504  for (auto it : curWaveIDQueue) {
505  waveIdentifier cur_wave = it;
506 
507  if (cur_wave.simdId == simdId &&
508  cur_wave.wfSlotId == wfSlotId) {
509  // 2 possibilities
510  // 1: this WF has a green light
511  // 2: another WF has a green light
512  waveIdentifier owner_wave = curWaveIDQueue.front();
513 
514  if (owner_wave.simdId != cur_wave.simdId ||
515  owner_wave.wfSlotId != cur_wave.wfSlotId) {
516  // possibility 2
517  cede = true;
518  break;
519  } else {
520  // possibility 1
521  break;
522  }
523  }
524  }
525  }
526  }
527  }
528 
529  return cede;
530 }
531 
532 // Execute one clock worth of work on the ComputeUnit.
533 void
 534 ComputeUnit::exec()
 535 {
 536  updateEvents();
 537  // Execute pipeline stages in reverse order to simulate
 538  // the pipeline latency
 539  globalMemoryPipe.exec();
 540  localMemoryPipe.exec();
 541  execStage.exec();
 542  scheduleStage.exec();
 543  scoreboardCheckStage.exec();
 544  fetchStage.exec();
545 
546  totalCycles++;
547 }
548 
549 void
 550 ComputeUnit::init()
 551 {
 552  // Initialize CU Bus models
 553  glbMemToVrfBus.init(&shader->tick_cnt, shader->ticks(1));
 554  locMemToVrfBus.init(&shader->tick_cnt, shader->ticks(1));
 555  nextGlbMemBus = 0;
 556  nextLocMemBus = 0;
 557  fatal_if(numGlbMemUnits > 1,
 558  "No support for multiple Global Memory Pipelines exists!!!");
 559  vrfToGlobalMemPipeBus.resize(numGlbMemUnits);
 560  for (int j = 0; j < numGlbMemUnits; ++j) {
 561  vrfToGlobalMemPipeBus[j] = WaitClass();
 562  vrfToGlobalMemPipeBus[j].init(&shader->tick_cnt, shader->ticks(1));
 563  }
 564 
 565  fatal_if(numLocMemUnits > 1,
 566  "No support for multiple Local Memory Pipelines exists!!!");
 567  vrfToLocalMemPipeBus.resize(numLocMemUnits);
 568  for (int j = 0; j < numLocMemUnits; ++j) {
 569  vrfToLocalMemPipeBus[j] = WaitClass();
 570  vrfToLocalMemPipeBus[j].init(&shader->tick_cnt, shader->ticks(1));
 571  }
572  vectorRegsReserved.resize(numSIMDs, 0);
573  aluPipe.resize(numSIMDs);
574  wfWait.resize(numSIMDs + numLocMemUnits + numGlbMemUnits);
575 
576  for (int i = 0; i < numSIMDs + numLocMemUnits + numGlbMemUnits; ++i) {
577  wfWait[i] = WaitClass();
578  wfWait[i].init(&shader->tick_cnt, shader->ticks(1));
579  }
580 
581  for (int i = 0; i < numSIMDs; ++i) {
582  aluPipe[i] = WaitClass();
583  aluPipe[i].init(&shader->tick_cnt, shader->ticks(1));
584  }
585 
586  // Setup space for call args
587  for (int j = 0; j < numSIMDs; ++j) {
588  for (int i = 0; i < shader->n_wf; ++i) {
589  wfList[j][i]->initCallArgMem(shader->funcargs_size, wavefrontSize);
590  }
591  }
592 
593  // Initializing pipeline resources
594  readyList.resize(numSIMDs + numGlbMemUnits + numLocMemUnits);
595  waveStatusList.resize(numSIMDs);
596 
597  for (int j = 0; j < numSIMDs; ++j) {
598  for (int i = 0; i < shader->n_wf; ++i) {
599  waveStatusList[j].push_back(
600  std::make_pair(wfList[j][i], BLOCKED));
601  }
602  }
603 
604  for (int j = 0; j < (numSIMDs + numGlbMemUnits + numLocMemUnits); ++j) {
605  dispatchList.push_back(std::make_pair((Wavefront*)nullptr, EMPTY));
606  }
607 
608  fetchStage.init(this);
 609  scoreboardCheckStage.init(this);
 610  scheduleStage.init(this);
611  execStage.init(this);
612  globalMemoryPipe.init(this);
613  localMemoryPipe.init(this);
614  // initialize state for statistics calculation
615  vectorAluInstAvail.resize(numSIMDs, false);
616  shrMemInstAvail = 0;
617  glbMemInstAvail = 0;
618 }
619 
620 bool
 621 ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
 622 {
623  // Ruby has completed the memory op. Schedule the mem_resp_event at the
624  // appropriate cycle to process the timing memory response
625  // This delay represents the pipeline delay
626  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
627  int index = sender_state->port_index;
628  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
629 
630  // Is the packet returned a Kernel End or Barrier
631  if (pkt->req->isKernel() && pkt->req->isRelease()) {
632  Wavefront *w =
633  computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
634 
635  // Check if we are waiting on Kernel End Release
636  if (w->status == Wavefront::S_RETURNING) {
637  DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG id completed %d\n",
638  computeUnit->cu_id, w->simdId, w->wfSlotId,
639  w->wfDynId, w->kernId);
640 
641  computeUnit->shader->dispatcher->notifyWgCompl(w);
 642  w->status = Wavefront::S_STOPPED;
 643  } else {
644  w->outstandingReqs--;
645  }
646 
647  DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrier_cnt = %d\n",
648  computeUnit->cu_id, gpuDynInst->simdId,
649  gpuDynInst->wfSlotId, w->barrierCnt);
650 
651  if (gpuDynInst->useContinuation) {
652  assert(!gpuDynInst->isNoScope());
653  gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
654  gpuDynInst);
655  }
656 
657  delete pkt->senderState;
658  delete pkt;
659  return true;
660  } else if (pkt->req->isKernel() && pkt->req->isAcquire()) {
661  if (gpuDynInst->useContinuation) {
662  assert(!gpuDynInst->isNoScope());
663  gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
664  gpuDynInst);
665  }
666 
667  delete pkt->senderState;
668  delete pkt;
669  return true;
670  }
671 
672  EventFunctionWrapper *mem_resp_event =
673  computeUnit->memPort[index]->createMemRespEvent(pkt);
674 
675  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x received!\n",
676  computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
677  index, pkt->req->getPaddr());
678 
679  computeUnit->schedule(mem_resp_event,
680  curTick() + computeUnit->resp_tick_latency);
681  return true;
682 }
683 
684 void
 685 ComputeUnit::DataPort::recvReqRetry()
 686 {
687  int len = retries.size();
688 
689  assert(len > 0);
690 
691  for (int i = 0; i < len; ++i) {
692  PacketPtr pkt = retries.front().first;
693  GPUDynInstPtr gpuDynInst M5_VAR_USED = retries.front().second;
694  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
695  computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
696  pkt->req->getPaddr());
697 
701  if (!sendTimingReq(pkt)) {
702  DPRINTF(GPUMem, "failed again!\n");
703  break;
704  } else {
705  DPRINTF(GPUMem, "successful!\n");
706  retries.pop_front();
707  }
708  }
709 }
710 
711 bool
 712 ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
 713 {
714  computeUnit->fetchStage.processFetchReturn(pkt);
715 
716  return true;
717 }
718 
719 void
 720 ComputeUnit::SQCPort::recvReqRetry()
 721 {
722  int len = retries.size();
723 
724  assert(len > 0);
725 
726  for (int i = 0; i < len; ++i) {
727  PacketPtr pkt = retries.front().first;
728  Wavefront *wavefront M5_VAR_USED = retries.front().second;
729  DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
730  computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
731  pkt->req->getPaddr());
732  if (!sendTimingReq(pkt)) {
733  DPRINTF(GPUFetch, "failed again!\n");
734  break;
735  } else {
736  DPRINTF(GPUFetch, "successful!\n");
737  retries.pop_front();
738  }
739  }
740 }
741 
742 void
 743 ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
 744 {
745  // There must be a way around this check to do the globalMemStart...
746  Addr tmp_vaddr = pkt->req->getVaddr();
747 
748  updatePageDivergenceDist(tmp_vaddr);
749 
750  // set PC in request
751  pkt->req->setPC(gpuDynInst->wavefront()->pc());
752 
753  pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());
754 
755  // figure out the type of the request to set read/write
756  BaseTLB::Mode TLB_mode;
757  assert(pkt->isRead() || pkt->isWrite());
758 
759  // Check write before read for atomic operations
760  // since atomic operations should use BaseTLB::Write
761  if (pkt->isWrite()){
762  TLB_mode = BaseTLB::Write;
763  } else if (pkt->isRead()) {
764  TLB_mode = BaseTLB::Read;
765  } else {
766  fatal("pkt is not a read nor a write\n");
767  }
768 
769  tlbCycles -= curTick();
770  ++tlbRequests;
771 
772  int tlbPort_index = perLaneTLB ? index : 0;
773 
774  if (shader->timingSim) {
775  if (debugSegFault) {
 776  Process *p = shader->gpuTc->getProcessPtr();
 777  Addr vaddr = pkt->req->getVaddr();
778  unsigned size = pkt->getSize();
779 
780  if ((vaddr + size - 1) % 64 < vaddr % 64) {
781  panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
782  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
783  }
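 // The access is treated as unaligned when it crosses a 64-byte
 // boundary, i.e. the last byte's offset within the block is smaller
 // than the first byte's. E.g., a 16-byte access at offset 56 wraps
 // into the next block (56 + 15 -> offset 7) and triggers the panic.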
784 
785  Addr paddr;
786 
787  if (!p->pTable->translate(vaddr, paddr)) {
788  if (!p->fixupStackFault(vaddr)) {
789  panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
790  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
791  vaddr);
792  }
793  }
794  }
795 
796  // This is the SenderState needed upon return
797  pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);
798 
799  // This is the senderState needed by the TLB hierarchy to function
800  TheISA::GpuTLB::TranslationState *translation_state =
801  new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc, false,
802  pkt->senderState);
803 
804  pkt->senderState = translation_state;
805 
806  if (functionalTLB) {
807  tlbPort[tlbPort_index]->sendFunctional(pkt);
808 
809  // update the hitLevel distribution
810  int hit_level = translation_state->hitLevel;
811  assert(hit_level != -1);
812  hitsPerTLBLevel[hit_level]++;
813 
814  // New SenderState for the memory access
815  X86ISA::GpuTLB::TranslationState *sender_state =
 816  safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
 817 
818  delete sender_state->tlbEntry;
819  delete sender_state->saved;
820  delete sender_state;
821 
822  assert(pkt->req->hasPaddr());
823  assert(pkt->req->hasSize());
824 
825  uint8_t *tmpData = pkt->getPtr<uint8_t>();
826 
827  // this is necessary because the GPU TLB receives packets instead
 828  // of requests. When the translation is complete, all relevant
829  // fields in the request will be populated, but not in the packet.
830  // here we create the new packet so we can set the size, addr,
831  // and proper flags.
832  PacketPtr oldPkt = pkt;
833  pkt = new Packet(oldPkt->req, oldPkt->cmd);
834  delete oldPkt;
835  pkt->dataStatic(tmpData);
836 
837 
838  // New SenderState for the memory access
839  pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst,
840  index, nullptr);
841 
842  gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
843  gpuDynInst->tlbHitLevel[index] = hit_level;
844 
845 
846  // translation is done. Schedule the mem_req_event at the
847  // appropriate cycle to send the timing memory request to ruby
848  EventFunctionWrapper *mem_req_event =
849  memPort[index]->createMemReqEvent(pkt);
850 
851  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
852  "scheduled\n", cu_id, gpuDynInst->simdId,
853  gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
854 
855  schedule(mem_req_event, curTick() + req_tick_latency);
856  } else if (tlbPort[tlbPort_index]->isStalled()) {
857  assert(tlbPort[tlbPort_index]->retries.size() > 0);
858 
859  DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
860  "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
861  tmp_vaddr);
862 
863  tlbPort[tlbPort_index]->retries.push_back(pkt);
864  } else if (!tlbPort[tlbPort_index]->sendTimingReq(pkt)) {
865  // Stall the data port;
866  // No more packet will be issued till
867  // ruby indicates resources are freed by
868  // a recvReqRetry() call back on this port.
869  tlbPort[tlbPort_index]->stallPort();
870 
871  DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
872  "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
873  tmp_vaddr);
874 
875  tlbPort[tlbPort_index]->retries.push_back(pkt);
876  } else {
877  DPRINTF(GPUTLB,
878  "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
879  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
880  }
881  } else {
882  if (pkt->cmd == MemCmd::MemFenceReq) {
883  gpuDynInst->statusBitVector = VectorMask(0);
884  } else {
885  gpuDynInst->statusBitVector &= (~(1ll << index));
886  }
887 
888  // New SenderState for the memory access
889  delete pkt->senderState;
890 
891  // Because it's atomic operation, only need TLB translation state
892  pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode,
893  shader->gpuTc);
894 
895  tlbPort[tlbPort_index]->sendFunctional(pkt);
896 
897  // the addr of the packet is not modified, so we need to create a new
898  // packet, or otherwise the memory access will have the old virtual
899  // address sent in the translation packet, instead of the physical
900  // address returned by the translation.
901  PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
902  new_pkt->dataStatic(pkt->getPtr<uint8_t>());
903 
904  // Translation is done. It is safe to send the packet to memory.
905  memPort[0]->sendFunctional(new_pkt);
906 
907  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
908  gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
909  new_pkt->req->getPaddr());
910 
911  // safe_cast the senderState
912  TheISA::GpuTLB::TranslationState *sender_state =
913  safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
914 
915  delete sender_state->tlbEntry;
916  delete new_pkt;
917  delete pkt->senderState;
918  delete pkt;
919  }
920 }
921 
922 void
 923 ComputeUnit::sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
 924 {
925  EventFunctionWrapper *mem_req_event =
926  memPort[index]->createMemReqEvent(pkt);
927 
928 
929  // New SenderState for the memory access
930  pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, index,
931  nullptr);
932 
933  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
934  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
935  pkt->req->getPaddr());
936 
937  schedule(mem_req_event, curTick() + req_tick_latency);
938 }
939 
940 void
941 ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch,
942  RequestPtr req)
943 {
944  assert(gpuDynInst->isGlobalSeg());
945 
946  if (!req) {
947  req = std::make_shared<Request>(
948  0, 0, 0, 0, masterId(), 0, gpuDynInst->wfDynId);
949  }
950  req->setPaddr(0);
951  if (kernelLaunch) {
952  req->setFlags(Request::KERNEL);
953  }
954 
955  // for non-kernel MemFence operations, memorder flags are set depending
956  // on which type of request is currently being sent, so this
957  // should be set by the caller (e.g. if an inst has acq-rel
 958  // semantics, it will send one acquire req and one release req)
959  gpuDynInst->setRequestFlags(req, kernelLaunch);
960 
961  // a mem fence must correspond to an acquire/release request
962  assert(req->isAcquire() || req->isRelease());
963 
964  // create packet
965  PacketPtr pkt = new Packet(req, MemCmd::MemFenceReq);
966 
967  // set packet's sender state
968  pkt->senderState =
969  new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr);
970 
971  // send the packet
972  sendSyncRequest(gpuDynInst, 0, pkt);
973 }
974 
975 void
 976 ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
 977 {
 978  DataPort::SenderState *sender_state =
 979  safe_cast<DataPort::SenderState*>(pkt->senderState);
 980 
981  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
982  ComputeUnit *compute_unit = computeUnit;
983 
984  assert(gpuDynInst);
985 
986  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
987  compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
988  pkt->req->getPaddr(), index);
989 
990  Addr paddr = pkt->req->getPaddr();
991 
992  if (pkt->cmd != MemCmd::MemFenceResp) {
993  int index = gpuDynInst->memStatusVector[paddr].back();
994 
995  DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
996  pkt->req->getPaddr(), index);
997 
998  gpuDynInst->memStatusVector[paddr].pop_back();
999  gpuDynInst->pAddr = pkt->req->getPaddr();
1000 
1001  if (pkt->isRead() || pkt->isWrite()) {
1002 
1003  if (gpuDynInst->n_reg <= MAX_REGS_FOR_NON_VEC_MEM_INST) {
1004  gpuDynInst->statusBitVector &= (~(1ULL << index));
1005  } else {
1006  assert(gpuDynInst->statusVector[index] > 0);
1007  gpuDynInst->statusVector[index]--;
1008 
1009  if (!gpuDynInst->statusVector[index])
1010  gpuDynInst->statusBitVector &= (~(1ULL << index));
1011  }
1012 
1013  DPRINTF(GPUMem, "bitvector is now %#x\n",
1014  gpuDynInst->statusBitVector);
1015 
1016  if (gpuDynInst->statusBitVector == VectorMask(0)) {
1017  auto iter = gpuDynInst->memStatusVector.begin();
1018  auto end = gpuDynInst->memStatusVector.end();
1019 
1020  while (iter != end) {
1021  assert(iter->second.empty());
1022  ++iter;
1023  }
1024 
1025  gpuDynInst->memStatusVector.clear();
1026 
1027  if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
1028  gpuDynInst->statusVector.clear();
1029 
1030  compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
1031 
1032  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
1033  compute_unit->cu_id, gpuDynInst->simdId,
1034  gpuDynInst->wfSlotId);
1035 
1036  // after clearing the status vectors,
1037  // see if there is a continuation to perform
1038  // the continuation may generate more work for
1039  // this memory request
1040  if (gpuDynInst->useContinuation) {
1041  assert(!gpuDynInst->isNoScope());
1042  gpuDynInst->execContinuation(
1043  gpuDynInst->staticInstruction(),
1044  gpuDynInst);
1045  }
1046  }
1047  }
1048  } else {
1049  gpuDynInst->statusBitVector = VectorMask(0);
1050 
1051  if (gpuDynInst->useContinuation) {
1052  assert(!gpuDynInst->isNoScope());
1053  gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
1054  gpuDynInst);
1055  }
1056  }
1057 
1058  delete pkt->senderState;
1059  delete pkt;
1060 }
1061 
1062 ComputeUnit*
1063 ComputeUnitParams::create()
1064 {
1065  return new ComputeUnit(this);
1066 }
1067 
1068 bool
 1069 ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
 1070 {
1071  Addr line = pkt->req->getPaddr();
1072 
1073  DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
1074  pkt->req->getVaddr(), line);
1075 
1076  assert(pkt->senderState);
1077  computeUnit->tlbCycles += curTick();
1078 
1079  // pop off the TLB translation state
1080  TheISA::GpuTLB::TranslationState *translation_state =
1081  safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
1082 
1083  // no PageFaults are permitted for data accesses
1084  if (!translation_state->tlbEntry) {
1085  DTLBPort::SenderState *sender_state =
1086  safe_cast<DTLBPort::SenderState*>(translation_state->saved);
1087 
 1088  Wavefront *w M5_VAR_USED =
 1089  computeUnit->wfList[sender_state->_gpuDynInst->simdId]
1090  [sender_state->_gpuDynInst->wfSlotId];
1091 
 1092  DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
1093  pkt->req->getVaddr());
1094  }
1095 
1096  // update the hitLevel distribution
1097  int hit_level = translation_state->hitLevel;
1098  computeUnit->hitsPerTLBLevel[hit_level]++;
1099 
1100  delete translation_state->tlbEntry;
1101  assert(!translation_state->ports.size());
1102  pkt->senderState = translation_state->saved;
1103 
1104  // for prefetch pkt
1105  BaseTLB::Mode TLB_mode = translation_state->tlbMode;
1106 
1107  delete translation_state;
1108 
1109  // use the original sender state to know how to close this transaction
1110  DTLBPort::SenderState *sender_state =
 1111  safe_cast<DTLBPort::SenderState*>(pkt->senderState);
 1112 
1113  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1114  int mp_index = sender_state->portIndex;
1115  Addr vaddr = pkt->req->getVaddr();
1116  gpuDynInst->memStatusVector[line].push_back(mp_index);
1117  gpuDynInst->tlbHitLevel[mp_index] = hit_level;
1118 
1119  MemCmd requestCmd;
1120 
1121  if (pkt->cmd == MemCmd::ReadResp) {
1122  requestCmd = MemCmd::ReadReq;
1123  } else if (pkt->cmd == MemCmd::WriteResp) {
1124  requestCmd = MemCmd::WriteReq;
1125  } else if (pkt->cmd == MemCmd::SwapResp) {
1126  requestCmd = MemCmd::SwapReq;
1127  } else {
1128  panic("unsupported response to request conversion %s\n",
1129  pkt->cmd.toString());
1130  }
1131 
1132  if (computeUnit->prefetchDepth) {
1133  int simdId = gpuDynInst->simdId;
1134  int wfSlotId = gpuDynInst->wfSlotId;
1135  Addr last = 0;
1136 
1137  switch(computeUnit->prefetchType) {
1138  case Enums::PF_CU:
1139  last = computeUnit->lastVaddrCU[mp_index];
1140  break;
1141  case Enums::PF_PHASE:
1142  last = computeUnit->lastVaddrSimd[simdId][mp_index];
1143  break;
1144  case Enums::PF_WF:
1145  last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
1146  default:
1147  break;
1148  }
1149 
1150  DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
1151  computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
1152 
1153  int stride = last ? (roundDown(vaddr, TheISA::PageBytes) -
 1154  roundDown(last, TheISA::PageBytes)) >> TheISA::PageShift
 1155  : 0;
1156 
1157  DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
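 // The stride is measured in whole pages between the previous and the
 // current translated address. E.g., with 4KB pages, two accesses one
 // page apart give stride 1, so the loop below prefetches translations
 // for the next prefetchDepth pages in that direction.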
1158 
1159  computeUnit->lastVaddrCU[mp_index] = vaddr;
1160  computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
1161  computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
1162 
1163  stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
1164  computeUnit->prefetchStride: stride;
1165 
1166  DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
1167  computeUnit->cu_id, simdId, wfSlotId, mp_index);
1168 
1169  DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);
1170 
1171  // Prefetch Next few pages atomically
1172  for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
1173  DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
1174  vaddr+stride*pf*TheISA::PageBytes);
1175 
1176  if (!stride)
1177  break;
1178 
1179  RequestPtr prefetch_req = std::make_shared<Request>(
1180  0, vaddr + stride * pf * TheISA::PageBytes,
1181  sizeof(uint8_t), 0,
1182  computeUnit->masterId(),
1183  0, 0, nullptr);
1184 
1185  PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
1186  uint8_t foo = 0;
1187  prefetch_pkt->dataStatic(&foo);
1188 
1189  // Because it's atomic operation, only need TLB translation state
1190  prefetch_pkt->senderState =
1191  new TheISA::GpuTLB::TranslationState(TLB_mode,
1192  computeUnit->shader->gpuTc,
1193  true);
1194 
1195  // Currently prefetches are zero-latency, hence the sendFunctional
1196  sendFunctional(prefetch_pkt);
1197 
1198  /* safe_cast the senderState */
1199  TheISA::GpuTLB::TranslationState *tlb_state =
1200  safe_cast<TheISA::GpuTLB::TranslationState*>(
1201  prefetch_pkt->senderState);
1202 
1203 
1204  delete tlb_state->tlbEntry;
1205  delete tlb_state;
1206  delete prefetch_pkt;
1207  }
1208  }
1209 
1210  // First we must convert the response cmd back to a request cmd so that
1211  // the request can be sent through the cu's master port
1212  PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
1213  new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1214  delete pkt->senderState;
1215  delete pkt;
1216 
1217  // New SenderState for the memory access
1218  new_pkt->senderState =
1219  new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
1220  nullptr);
1221 
1222  // translation is done. Schedule the mem_req_event at the appropriate
1223  // cycle to send the timing memory request to ruby
1224  EventFunctionWrapper *mem_req_event =
1225  computeUnit->memPort[mp_index]->createMemReqEvent(new_pkt);
1226 
1227  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
1228  computeUnit->cu_id, gpuDynInst->simdId,
1229  gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());
1230 
1231  computeUnit->schedule(mem_req_event, curTick() +
1232  computeUnit->req_tick_latency);
1233 
1234  return true;
1235 }
1236 
 1237 EventFunctionWrapper*
 1238 ComputeUnit::DataPort::createMemReqEvent(PacketPtr pkt)
 1239 {
1240  return new EventFunctionWrapper(
1241  [this, pkt]{ processMemReqEvent(pkt); },
1242  "ComputeUnit memory request event", true);
1243 }
1244 
 1245 EventFunctionWrapper*
 1246 ComputeUnit::DataPort::createMemRespEvent(PacketPtr pkt)
 1247 {
1248  return new EventFunctionWrapper(
1249  [this, pkt]{ processMemRespEvent(pkt); },
1250  "ComputeUnit memory response event", true);
1251 }
1252 
1253 void
 1254 ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt)
 1255 {
1256  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1257  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1258  ComputeUnit *compute_unit M5_VAR_USED = computeUnit;
1259 
1260  if (!(sendTimingReq(pkt))) {
1261  retries.push_back(std::make_pair(pkt, gpuDynInst));
1262 
1263  DPRINTF(GPUPort,
1264  "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
1265  compute_unit->cu_id, gpuDynInst->simdId,
1266  gpuDynInst->wfSlotId, index,
1267  pkt->req->getPaddr());
1268  } else {
1269  DPRINTF(GPUPort,
1270  "CU%d: WF[%d][%d]: index %d, addr %#x data req sent!\n",
1271  compute_unit->cu_id, gpuDynInst->simdId,
1272  gpuDynInst->wfSlotId, index,
1273  pkt->req->getPaddr());
1274  }
1275 }
1276 
1277 /*
1278  * The initial translation request could have been rejected,
 1279  * if <retries> queue is not empty. Retry sending the translation
1280  * request. sendRetry() is called from the peer port whenever
1281  * a translation completes.
1282  */
1283 void
 1284 ComputeUnit::DTLBPort::recvReqRetry()
 1285 {
1286  int len = retries.size();
1287 
1288  DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
1289  computeUnit->cu_id, len);
1290 
1291  assert(len > 0);
1292  assert(isStalled());
1293  // recvReqRetry is an indication that the resource on which this
1294  // port was stalling on is freed. So, remove the stall first
1295  unstallPort();
1296 
1297  for (int i = 0; i < len; ++i) {
1298  PacketPtr pkt = retries.front();
1299  Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
 1300  DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x",
 1301  computeUnit->cu_id, vaddr);
1301 
1302  if (!sendTimingReq(pkt)) {
1303  // Stall port
1304  stallPort();
1305  DPRINTF(GPUTLB, ": failed again\n");
1306  break;
1307  } else {
1308  DPRINTF(GPUTLB, ": successful\n");
1309  retries.pop_front();
1310  }
1311  }
1312 }
1313 
1314 bool
 1315 ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
 1316 {
1317  Addr line M5_VAR_USED = pkt->req->getPaddr();
1318  DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
1319  computeUnit->cu_id, pkt->req->getVaddr(), line);
1320 
1321  assert(pkt->senderState);
1322 
1323  // pop off the TLB translation state
1324  TheISA::GpuTLB::TranslationState *translation_state =
1325  safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
1326 
1327  bool success = translation_state->tlbEntry != nullptr;
1328  delete translation_state->tlbEntry;
1329  assert(!translation_state->ports.size());
1330  pkt->senderState = translation_state->saved;
1331  delete translation_state;
1332 
1333  // use the original sender state to know how to close this transaction
1334  ITLBPort::SenderState *sender_state =
 1335  safe_cast<ITLBPort::SenderState*>(pkt->senderState);
 1336 
1337  // get the wavefront associated with this translation request
1338  Wavefront *wavefront = sender_state->wavefront;
1339  delete pkt->senderState;
1340 
1341  if (success) {
1342  // pkt is reused in fetch(), don't delete it here. However, we must
1343  // reset the command to be a request so that it can be sent through
1344  // the cu's master port
1345  assert(pkt->cmd == MemCmd::ReadResp);
1346  pkt->cmd = MemCmd::ReadReq;
1347 
1348  computeUnit->fetchStage.fetch(pkt, wavefront);
1349  } else {
1350  if (wavefront->dropFetch) {
1351  assert(wavefront->instructionBuffer.empty());
1352  wavefront->dropFetch = false;
1353  }
1354 
1355  wavefront->pendingFetch = 0;
1356  }
1357 
1358  return true;
1359 }
1360 
1361 /*
1362  * The initial translation request could have been rejected, if
1363  * <retries> queue is not empty. Retry sending the translation
1364  * request. sendRetry() is called from the peer port whenever
1365  * a translation completes.
1366  */
1367 void
 1368 ComputeUnit::ITLBPort::recvReqRetry()
 1369 {
1370 
1371  int len = retries.size();
 1372  DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n",
 1373  computeUnit->cu_id, len);
1373 
1374  assert(len > 0);
1375  assert(isStalled());
1376 
1377  // recvReqRetry is an indication that the resource on which this
1378  // port was stalling on is freed. So, remove the stall first
1379  unstallPort();
1380 
1381  for (int i = 0; i < len; ++i) {
1382  PacketPtr pkt = retries.front();
1383  Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
 1384  DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x",
 1385  computeUnit->cu_id, vaddr);
1385 
1386  if (!sendTimingReq(pkt)) {
1387  stallPort(); // Stall port
1388  DPRINTF(GPUTLB, ": failed again\n");
1389  break;
1390  } else {
1391  DPRINTF(GPUTLB, ": successful\n");
1392  retries.pop_front();
1393  }
1394  }
1395 }
1396 
1397 void
 1398 ComputeUnit::regStats()
 1399 {
 1400  ClockedObject::regStats();
 1401 
1402  vALUInsts
1403  .name(name() + ".valu_insts")
1404  .desc("Number of vector ALU insts issued.")
1405  ;
 1406  vALUInstsPerWF
 1407  .name(name() + ".valu_insts_per_wf")
1408  .desc("The avg. number of vector ALU insts issued per-wavefront.")
1409  ;
1410  sALUInsts
1411  .name(name() + ".salu_insts")
1412  .desc("Number of scalar ALU insts issued.")
1413  ;
 1414  sALUInstsPerWF
 1415  .name(name() + ".salu_insts_per_wf")
1416  .desc("The avg. number of scalar ALU insts issued per-wavefront.")
1417  ;
 1418  instCyclesVALU
 1419  .name(name() + ".inst_cycles_valu")
1420  .desc("Number of cycles needed to execute VALU insts.")
1421  ;
 1422  instCyclesSALU
 1423  .name(name() + ".inst_cycles_salu")
1424  .desc("Number of cycles needed to execute SALU insts.")
1425  ;
 1426  threadCyclesVALU
 1427  .name(name() + ".thread_cycles_valu")
1428  .desc("Number of thread cycles used to execute vector ALU ops. "
1429  "Similar to instCyclesVALU but multiplied by the number of "
1430  "active threads.")
1431  ;
 1432  vALUUtilization
 1433  .name(name() + ".valu_utilization")
1434  .desc("Percentage of active vector ALU threads in a wave.")
1435  ;
 1436  ldsNoFlatInsts
 1437  .name(name() + ".lds_no_flat_insts")
1438  .desc("Number of LDS insts issued, not including FLAT "
1439  "accesses that resolve to LDS.")
1440  ;
 1441  ldsNoFlatInstsPerWF
 1442  .name(name() + ".lds_no_flat_insts_per_wf")
1443  .desc("The avg. number of LDS insts (not including FLAT "
1444  "accesses that resolve to LDS) per-wavefront.")
1445  ;
 1446  flatVMemInsts
 1447  .name(name() + ".flat_vmem_insts")
1448  .desc("The number of FLAT insts that resolve to vmem issued.")
1449  ;
 1450  flatVMemInstsPerWF
 1451  .name(name() + ".flat_vmem_insts_per_wf")
1452  .desc("The average number of FLAT insts that resolve to vmem "
1453  "issued per-wavefront.")
1454  ;
1455  flatLDSInsts
1456  .name(name() + ".flat_lds_insts")
1457  .desc("The number of FLAT insts that resolve to LDS issued.")
1458  ;
 1459  flatLDSInstsPerWF
 1460  .name(name() + ".flat_lds_insts_per_wf")
1461  .desc("The average number of FLAT insts that resolve to LDS "
1462  "issued per-wavefront.")
1463  ;
 1464  vectorMemWrites
 1465  .name(name() + ".vector_mem_writes")
1466  .desc("Number of vector mem write insts (excluding FLAT insts).")
1467  ;
 1468  vectorMemWritesPerWF
 1469  .name(name() + ".vector_mem_writes_per_wf")
1470  .desc("The average number of vector mem write insts "
1471  "(excluding FLAT insts) per-wavefront.")
1472  ;
 1473  vectorMemReads
 1474  .name(name() + ".vector_mem_reads")
1475  .desc("Number of vector mem read insts (excluding FLAT insts).")
1476  ;
 1477  vectorMemReadsPerWF
 1478  .name(name() + ".vector_mem_reads_per_wf")
1479  .desc("The avg. number of vector mem read insts (excluding "
1480  "FLAT insts) per-wavefront.")
1481  ;
 1482  scalarMemWrites
 1483  .name(name() + ".scalar_mem_writes")
1484  .desc("Number of scalar mem write insts.")
1485  ;
 1486  scalarMemWritesPerWF
 1487  .name(name() + ".scalar_mem_writes_per_wf")
1488  .desc("The average number of scalar mem write insts per-wavefront.")
1489  ;
 1490  scalarMemReads
 1491  .name(name() + ".scalar_mem_reads")
1492  .desc("Number of scalar mem read insts.")
1493  ;
 1494  scalarMemReadsPerWF
 1495  .name(name() + ".scalar_mem_reads_per_wf")
1496  .desc("The average number of scalar mem read insts per-wavefront.")
1497  ;
1498 
 1499  vALUInstsPerWF = vALUInsts / completedWfs;
 1500  sALUInstsPerWF = sALUInsts / completedWfs;
 1501  vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
 1502  ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
 1503  flatVMemInstsPerWF = flatVMemInsts / completedWfs;
 1504  flatLDSInstsPerWF = flatLDSInsts / completedWfs;
 1505  vectorMemWritesPerWF = vectorMemWrites / completedWfs;
 1506  vectorMemReadsPerWF = vectorMemReads / completedWfs;
 1507  scalarMemWritesPerWF = scalarMemWrites / completedWfs;
 1508  scalarMemReadsPerWF = scalarMemReads / completedWfs;
 1509 
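 // The per-WF formulas above are simple ratios over completed wavefronts;
 // vALUUtilization assumes a 64-lane wavefront, e.g. a VALU inst with 32
 // active lanes contributes 32 thread-cycles out of 64, i.e. 50%.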
1510  tlbCycles
1511  .name(name() + ".tlb_cycles")
1512  .desc("total number of cycles for all uncoalesced requests")
1513  ;
1514 
1515  tlbRequests
1516  .name(name() + ".tlb_requests")
1517  .desc("number of uncoalesced requests")
1518  ;
1519 
1520  tlbLatency
1521  .name(name() + ".avg_translation_latency")
1522  .desc("Avg. translation latency for data translations")
 1523  ;
 1524 
 1525  tlbLatency = tlbCycles / tlbRequests;
1526 
 1527  hitsPerTLBLevel
 1528  .init(4)
1529  .name(name() + ".TLB_hits_distribution")
 1530  .desc("TLB hits distribution (0 for page table, x for Lx-TLB)")
1531  ;
1532 
1533  // fixed number of TLB levels
1534  for (int i = 0; i < 4; ++i) {
1535  if (!i)
1536  hitsPerTLBLevel.subname(i,"page_table");
1537  else
1538  hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
1539  }
1540 
1541  execRateDist
1542  .init(0, 10, 2)
1543  .name(name() + ".inst_exec_rate")
1544  .desc("Instruction Execution Rate: Number of executed vector "
1545  "instructions per cycle")
1546  ;
1547 
 1548  ldsBankConflictDist
 1549  .init(0, wfSize(), 2)
1550  .name(name() + ".lds_bank_conflicts")
1551  .desc("Number of bank conflicts per LDS memory packet")
1552  ;
1553 
 1554  ldsBankAccesses
 1555  .name(name() + ".lds_bank_access_cnt")
1556  .desc("Total number of LDS bank accesses")
1557  ;
1558 
 1559  pageDivergenceDist
 1560  // A wavefront can touch up to N pages per memory instruction where
1561  // N is equal to the wavefront size
1562  // The number of pages per bin can be configured (here it's 4).
1563  .init(1, wfSize(), 4)
1564  .name(name() + ".page_divergence_dist")
1565  .desc("pages touched per wf (over all mem. instr.)")
1566  ;
1567 
 1568  controlFlowDivergenceDist
 1569  .init(1, wfSize(), 4)
1570  .name(name() + ".warp_execution_dist")
 1571  .desc("number of lanes active per instruction (over all instructions)")
1572  ;
1573 
 1574  activeLanesPerGMemInstrDist
 1575  .init(1, wfSize(), 4)
1576  .name(name() + ".gmem_lanes_execution_dist")
1577  .desc("number of active lanes per global memory instruction")
1578  ;
1579 
 1580  activeLanesPerLMemInstrDist
 1581  .init(1, wfSize(), 4)
1582  .name(name() + ".lmem_lanes_execution_dist")
1583  .desc("number of active lanes per local memory instruction")
1584  ;
1585 
 1586  numInstrExecuted
 1587  .name(name() + ".num_instr_executed")
1588  .desc("number of instructions executed")
1589  ;
1590 
 1591  numVecOpsExecuted
 1592  .name(name() + ".num_vec_ops_executed")
1593  .desc("number of vec ops executed (e.g. WF size/inst)")
1594  ;
1595 
1596  totalCycles
1597  .name(name() + ".num_total_cycles")
1598  .desc("number of cycles the CU ran for")
1599  ;
1600 
1601  ipc
1602  .name(name() + ".ipc")
1603  .desc("Instructions per cycle (this CU only)")
1604  ;
1605 
1606  vpc
1607  .name(name() + ".vpc")
1608  .desc("Vector Operations per cycle (this CU only)")
1609  ;
1610 
 1611  numALUInstsExecuted
 1612  .name(name() + ".num_alu_insts_executed")
1613  .desc("Number of dynamic non-GM memory insts executed")
1614  ;
1615 
 1616  wgBlockedDueLdsAllocation
 1617  .name(name() + ".wg_blocked_due_lds_alloc")
1618  .desc("Workgroup blocked due to LDS capacity")
 1619  ;
 1620 
 1621  ipc = numInstrExecuted / totalCycles;
 1622  vpc = numVecOpsExecuted / totalCycles;
1623 
 1624  numTimesWgBlockedDueVgprAlloc
 1625  .name(name() + ".times_wg_blocked_due_vgpr_alloc")
1626  .desc("Number of times WGs are blocked due to VGPR allocation per SIMD")
1627  ;
1628 
 1629  dynamicGMemInstrCnt
 1630  .name(name() + ".global_mem_instr_cnt")
1631  .desc("dynamic global memory instructions count")
1632  ;
1633 
 1634  dynamicLMemInstrCnt
 1635  .name(name() + ".local_mem_instr_cnt")
 1636  .desc("dynamic local memory instruction count")
1637  ;
1638 
 1639  numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
 1640  dynamicLMemInstrCnt;
 1641 
1642  completedWfs
1643  .name(name() + ".num_completed_wfs")
1644  .desc("number of completed wavefronts")
1645  ;
1646 
1647  numCASOps
1648  .name(name() + ".num_CAS_ops")
1649  .desc("number of compare and swap operations")
1650  ;
1651 
 1652  numFailedCASOps
 1653  .name(name() + ".num_failed_CAS_ops")
1654  .desc("number of compare and swap operations that failed")
1655  ;
1656 
1657  // register stats of pipeline stages
 1658  fetchStage.regStats();
 1659  scoreboardCheckStage.regStats();
 1660  scheduleStage.regStats();
 1661  execStage.regStats();
1662 
 1663  // register stats of memory pipeline
 1664  globalMemoryPipe.regStats();
 1665  localMemoryPipe.regStats();
 1666 }
1667 
1668 void
 1669 ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
 1670 {
1671  if (gpuDynInst->isScalar()) {
1672  if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
1673  sALUInsts++;
1674  instCyclesSALU++;
1675  } else if (gpuDynInst->isLoad()) {
1676  scalarMemReads++;
1677  } else if (gpuDynInst->isStore()) {
1678  scalarMemWrites++;
1679  }
1680  } else {
1681  if (gpuDynInst->isALU()) {
1682  vALUInsts++;
1683  instCyclesVALU++;
1684  threadCyclesVALU += gpuDynInst->wavefront()->execMask().count();
1685  } else if (gpuDynInst->isFlat()) {
1686  if (gpuDynInst->isLocalMem()) {
1687  flatLDSInsts++;
1688  } else {
1689  flatVMemInsts++;
1690  }
1691  } else if (gpuDynInst->isLocalMem()) {
1692  ldsNoFlatInsts++;
1693  } else if (gpuDynInst->isLoad()) {
1694  vectorMemReads++;
1695  } else if (gpuDynInst->isStore()) {
1696  vectorMemWrites++;
1697  }
1698  }
1699 }
1700 
1701 void
 1702 ComputeUnit::updatePageDivergenceDist(Addr addr)
 1703 {
1704  Addr virt_page_addr = roundDown(addr, TheISA::PageBytes);
1705 
1706  if (!pagesTouched.count(virt_page_addr))
1707  pagesTouched[virt_page_addr] = 1;
1708  else
1709  pagesTouched[virt_page_addr]++;
1710 }
1711 
1712 void
 1713 ComputeUnit::CUExitCallback::process()
 1714 {
1715  if (computeUnit->countPages) {
1716  std::ostream *page_stat_file =
1717  simout.create(computeUnit->name().c_str())->stream();
1718 
1719  *page_stat_file << "page, wavefront accesses, workitem accesses" <<
1720  std::endl;
1721 
1722  for (auto iter : computeUnit->pageAccesses) {
1723  *page_stat_file << std::hex << iter.first << ",";
1724  *page_stat_file << std::dec << iter.second.first << ",";
1725  *page_stat_file << std::dec << iter.second.second << std::endl;
1726  }
1727  }
1728  }
1729 
1730 bool
 1731 ComputeUnit::isDone() const
 1732 {
1733  for (int i = 0; i < numSIMDs; ++i) {
1734  if (!isSimdDone(i)) {
1735  return false;
1736  }
1737  }
1738 
1739  bool glbMemBusRdy = true;
1740  for (int j = 0; j < numGlbMemUnits; ++j) {
1741  glbMemBusRdy &= vrfToGlobalMemPipeBus[j].rdy();
1742  }
1743  bool locMemBusRdy = true;
1744  for (int j = 0; j < numLocMemUnits; ++j) {
1745  locMemBusRdy &= vrfToLocalMemPipeBus[j].rdy();
1746  }
1747 
 1748  if (!globalMemoryPipe.isGMLdRespFIFOWrRdy() ||
 1749  !globalMemoryPipe.isGMStRespFIFOWrRdy() ||
 1750  !globalMemoryPipe.isGMReqFIFOWrRdy() ||
 1751  !localMemoryPipe.isLMReqFIFOWrRdy() || !localMemoryPipe.isLMRespFIFOWrRdy() ||
 1752  !glbMemToVrfBus.rdy() || !locMemBusRdy || !glbMemBusRdy) {
1753  return false;
1754  }
1755 
1756  return true;
1757 }
1758 
1759 int32_t
1760 ComputeUnit::getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
1761 {
1762  return lds.getRefCounter(dispatchId, wgId);
1763 }
1764 
1765 bool
1766 ComputeUnit::isSimdDone(uint32_t simdId) const
1767 {
1768  assert(simdId < numSIMDs);
1769 
1770  for (int i=0; i < numGlbMemUnits; ++i) {
1771  if (!vrfToGlobalMemPipeBus[i].rdy())
1772  return false;
1773  }
1774  for (int i=0; i < numLocMemUnits; ++i) {
1775  if (!vrfToLocalMemPipeBus[i].rdy())
1776  return false;
1777  }
1778  if (!aluPipe[simdId].rdy()) {
1779  return false;
1780  }
1781 
1782  for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){
1783  if (wfList[simdId][i_wf]->status != Wavefront::S_STOPPED) {
1784  return false;
1785  }
1786  }
1787 
1788  return true;
1789 }
1790 
1796 bool
 1797 ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
 1798 {
1799  // this is just a request to carry the GPUDynInstPtr
1800  // back and forth
1801  RequestPtr newRequest = std::make_shared<Request>();
1802  newRequest->setPaddr(0x0);
1803 
1804  // ReadReq is not evaluted by the LDS but the Packet ctor requires this
1805  PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq);
1806 
1807  // This is the SenderState needed upon return
1808  newPacket->senderState = new LDSPort::SenderState(gpuDynInst);
1809 
1810  return ldsPort->sendTimingReq(newPacket);
1811 }
1812 
1816 bool
 1817 ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
 1818 {
1819  const ComputeUnit::LDSPort::SenderState *senderState =
1820  dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState);
1821 
1822  fatal_if(!senderState, "did not get the right sort of sender state");
1823 
1824  GPUDynInstPtr gpuDynInst = senderState->getMemInst();
1825 
1826  delete packet->senderState;
1827  delete packet;
1828 
1829  computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
1830  return true;
1831 }
1832 
1838 bool
 1839 ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
 1840 {
1841  ComputeUnit::LDSPort::SenderState *sender_state =
1842  dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
1843  fatal_if(!sender_state, "packet without a valid sender state");
1844 
1845  GPUDynInstPtr gpuDynInst M5_VAR_USED = sender_state->getMemInst();
1846 
1847  if (isStalled()) {
1848  fatal_if(retries.empty(), "must have retries waiting to be stalled");
1849 
1850  retries.push(pkt);
1851 
1852  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
1853  computeUnit->cu_id, gpuDynInst->simdId,
1854  gpuDynInst->wfSlotId);
1855  return false;
1856  } else if (!MasterPort::sendTimingReq(pkt)) {
1857  // need to stall the LDS port until a recvReqRetry() is received
1858  // this indicates that there is more space
1859  stallPort();
1860  retries.push(pkt);
1861 
1862  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
1863  computeUnit->cu_id, gpuDynInst->simdId,
1864  gpuDynInst->wfSlotId, pkt->req->getPaddr());
1865  return false;
1866  } else {
1867  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
1868  computeUnit->cu_id, gpuDynInst->simdId,
1869  gpuDynInst->wfSlotId, pkt->req->getPaddr());
1870  return true;
1871  }
1872 }
1873 
1880 void
 1881 ComputeUnit::LDSPort::recvReqRetry()
 1882 {
1883  auto queueSize = retries.size();
1884 
1885  DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
1886  computeUnit->cu_id, queueSize);
1887 
1888  fatal_if(queueSize < 1,
1889  "why was there a recvReqRetry() with no pending reqs?");
1890  fatal_if(!isStalled(),
1891  "recvReqRetry() happened when the port was not stalled");
1892 
1893  unstallPort();
1894 
1895  while (!retries.empty()) {
1896  PacketPtr packet = retries.front();
1897 
1898  DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);
1899 
1900  if (!MasterPort::sendTimingReq(packet)) {
1901  // Stall port
1902  stallPort();
1903  DPRINTF(GPUPort, ": LDS send failed again\n");
1904  break;
1905  } else {
 1906  DPRINTF(GPUPort, ": LDS send successful\n");
1907  retries.pop();
1908  }
1909  }
1910 }
