gem5  v20.0.0.0
compute_unit.cc
1 /*
2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * For use for simulation and test purposes only
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright notice,
11  * this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  * this list of conditions and the following disclaimer in the documentation
15  * and/or other materials provided with the distribution.
16  *
17  * 3. Neither the name of the copyright holder nor the names of its
18  * contributors may be used to endorse or promote products derived from this
19  * software without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "gpu-compute/compute_unit.hh"
35 
36 #include <limits>
37 
38 #include "base/output.hh"
39 #include "debug/GPUDisp.hh"
40 #include "debug/GPUExec.hh"
41 #include "debug/GPUFetch.hh"
42 #include "debug/GPUMem.hh"
43 #include "debug/GPUPort.hh"
44 #include "debug/GPUPrefetch.hh"
45 #include "debug/GPUSync.hh"
46 #include "debug/GPUTLB.hh"
47 #include "gpu-compute/dispatcher.hh"
48 #include "gpu-compute/gpu_dyn_inst.hh"
49 #include "gpu-compute/gpu_static_inst.hh"
50 #include "gpu-compute/ndrange.hh"
51 #include "gpu-compute/shader.hh"
52 #include "gpu-compute/simple_pool_manager.hh"
53 #include "gpu-compute/vector_register_file.hh"
54 #include "gpu-compute/wavefront.hh"
55 #include "mem/page_table.hh"
56 #include "sim/process.hh"
57 
58 ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), fetchStage(p),
59  scoreboardCheckStage(p), scheduleStage(p), execStage(p),
60  globalMemoryPipe(p), localMemoryPipe(p), rrNextMemID(0), rrNextALUWp(0),
61  cu_id(p->cu_id), vrf(p->vector_register_file), numSIMDs(p->num_SIMDs),
62  spBypassPipeLength(p->spbypass_pipe_length),
63  dpBypassPipeLength(p->dpbypass_pipe_length),
64  issuePeriod(p->issue_period),
65  numGlbMemUnits(p->num_global_mem_pipes),
66  numLocMemUnits(p->num_shared_mem_pipes),
67  perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth),
68  prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type),
69  xact_cas_mode(p->xactCasMode), debugSegFault(p->debugSegFault),
70  functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier),
71  countPages(p->countPages), barrier_id(0),
72  vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
73  coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
74  req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
75  resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
76  _masterId(p->system->getMasterId(this, "ComputeUnit")),
77  lds(*p->localDataStore), _cacheLineSize(p->system->cacheLineSize()),
78  globalSeqNum(0), wavefrontSize(p->wfSize),
79  kernelLaunchInst(new KernelLaunchStaticInst())
80 {
90  fatal_if(p->wfSize > std::numeric_limits<unsigned long long>::digits ||
91  p->wfSize <= 0,
92  "WF size is larger than the host can support");
93  fatal_if(!isPowerOf2(p->wfSize),
94  "Wavefront size should be a power of 2");
95  // calculate how many cycles a vector load or store will need to transfer
96  // its data over the corresponding buses
97  numCyclesPerStoreTransfer =
98  (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
99  (double)vrfToCoalescerBusWidth);
100 
101  numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
102  / coalescerToVrfBusWidth;
103 
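 // Illustrative example of the two transfer-cycle formulas above (bus widths
 // are assumed here, not taken from this file): with a 64-lane wavefront
 // moving 4-byte elements over a 32-byte VRF-to-coalescer bus, a store needs
 // ceil(64 * 4 / 32) = 8 bus cycles; over a 32-byte coalescer-to-VRF bus a
 // load likewise needs 256 / 32 = 8 cycles.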
104  lastVaddrWF.resize(numSIMDs);
105  wfList.resize(numSIMDs);
106 
107  for (int j = 0; j < numSIMDs; ++j) {
108  lastVaddrWF[j].resize(p->n_wf);
109 
110  for (int i = 0; i < p->n_wf; ++i) {
111  lastVaddrWF[j][i].resize(wfSize());
112 
113  wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
114  wfList[j][i]->setParent(this);
115 
116  for (int k = 0; k < wfSize(); ++k) {
117  lastVaddrWF[j][i][k] = 0;
118  }
119  }
120  }
121 
122  lastVaddrSimd.resize(numSIMDs);
123 
124  for (int i = 0; i < numSIMDs; ++i) {
125  lastVaddrSimd[i].resize(wfSize(), 0);
126  }
127 
128  lastVaddrCU.resize(wfSize());
129 
130  lds.setParent(this);
131 
132  if (p->execPolicy == "OLDEST-FIRST") {
133  exec_policy = EXEC_POLICY::OLDEST;
134  } else if (p->execPolicy == "ROUND-ROBIN") {
135  exec_policy = EXEC_POLICY::RR;
136  } else {
137  fatal("Invalid WF execution policy (CU)\n");
138  }
139 
140  memPort.resize(wfSize());
141 
142  // resize the tlbPort vectorArray
143  int tlbPort_width = perLaneTLB ? wfSize() : 1;
144  tlbPort.resize(tlbPort_width);
145 
146  cuExitCallback = new CUExitCallback(this);
147  registerExitCallback(cuExitCallback);
148 
149  xactCasLoadMap.clear();
150  lastExecCycle.resize(numSIMDs, 0);
151 
152  for (int i = 0; i < vrf.size(); ++i) {
153  vrf[i]->setParent(this);
154  }
155 
156  numVecRegsPerSimd = vrf[0]->numRegs();
157 }
158 
159 ComputeUnit::~ComputeUnit()
160 {
161  // Delete wavefront slots
162  for (int j = 0; j < numSIMDs; ++j) {
163  for (int i = 0; i < shader->n_wf; ++i) {
164  delete wfList[j][i];
165  }
166  lastVaddrSimd[j].clear();
167  }
168  lastVaddrCU.clear();
169  readyList.clear();
170  waveStatusList.clear();
171  dispatchList.clear();
172  vectorAluInstAvail.clear();
173  delete cuExitCallback;
174  delete ldsPort;
175 }
176 
177 void
178 ComputeUnit::fillKernelState(Wavefront *w, NDRange *ndr)
179 {
180  w->resizeRegFiles(ndr->q.cRegCount, ndr->q.sRegCount, ndr->q.dRegCount);
181 
182  w->workGroupSz[0] = ndr->q.wgSize[0];
183  w->workGroupSz[1] = ndr->q.wgSize[1];
184  w->workGroupSz[2] = ndr->q.wgSize[2];
185  w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
186  w->gridSz[0] = ndr->q.gdSize[0];
187  w->gridSz[1] = ndr->q.gdSize[1];
188  w->gridSz[2] = ndr->q.gdSize[2];
189  w->kernelArgs = ndr->q.args;
190  w->privSizePerItem = ndr->q.privMemPerItem;
191  w->spillSizePerItem = ndr->q.spillMemPerItem;
192  w->roBase = ndr->q.roMemStart;
193  w->roSize = ndr->q.roMemTotal;
194  w->computeActualWgSz(ndr);
195 }
196 
197 void
198 ComputeUnit::updateEvents() {
199 
200  if (!timestampVec.empty()) {
201  uint32_t vecSize = timestampVec.size();
202  uint32_t i = 0;
203  while (i < vecSize) {
204  if (timestampVec[i] <= shader->tick_cnt) {
205  std::pair<uint32_t, uint32_t> regInfo = regIdxVec[i];
206  vrf[regInfo.first]->markReg(regInfo.second, sizeof(uint32_t),
207  statusVec[i]);
208  timestampVec.erase(timestampVec.begin() + i);
209  regIdxVec.erase(regIdxVec.begin() + i);
210  statusVec.erase(statusVec.begin() + i);
211  --vecSize;
212  --i;
213  }
214  ++i;
215  }
216  }
217 
218  for (int i = 0; i< numSIMDs; ++i) {
219  vrf[i]->updateEvents();
220  }
221 }
222 
223 
224 void
225 ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
226  NDRange *ndr)
227 {
228  static int _n_wave = 0;
229 
230  VectorMask init_mask;
231  init_mask.reset();
232 
233  for (int k = 0; k < wfSize(); ++k) {
234  if (k + waveId * wfSize() < w->actualWgSzTotal)
235  init_mask[k] = 1;
236  }
237 
238  w->kernId = ndr->dispatchId;
239  w->wfId = waveId;
240  w->initMask = init_mask.to_ullong();
241 
242  for (int k = 0; k < wfSize(); ++k) {
243  w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
244  w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
245  w->actualWgSz[1];
246  w->workItemId[2][k] = (k + waveId * wfSize()) /
247  (w->actualWgSz[0] * w->actualWgSz[1]);
248 
249  w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
250  w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
251  w->workItemId[0][k];
252  }
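 // Illustrative mapping for the loop above, assuming an 8x4x2 actual
 // workgroup and 64-wide waves: lane k = 13 of wave 0 gets
 // workItemId = (13 % 8, (13 / 8) % 4, 13 / 32) = (5, 1, 0) and
 // workItemFlatId = 0*32 + 1*8 + 5 = 13, i.e. the flat ID equals
 // k + waveId * wfSize() for in-range lanes.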
253 
254  w->barrierSlots = divCeil(w->actualWgSzTotal, wfSize());
255 
256  w->barCnt.resize(wfSize(), 0);
257 
258  w->maxBarCnt = 0;
259  w->oldBarrierCnt = 0;
260  w->barrierCnt = 0;
261 
262  w->privBase = ndr->q.privMemStart;
263  ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
264 
265  w->spillBase = ndr->q.spillMemStart;
266  ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
267 
268  w->pushToReconvergenceStack(0, UINT32_MAX, init_mask.to_ulong());
269 
270  // WG state
271  w->wgId = ndr->globalWgId;
272  w->dispatchId = ndr->dispatchId;
273  w->workGroupId[0] = w->wgId % ndr->numWg[0];
274  w->workGroupId[1] = (w->wgId / ndr->numWg[0]) % ndr->numWg[1];
275  w->workGroupId[2] = w->wgId / (ndr->numWg[0] * ndr->numWg[1]);
276 
277  w->barrierId = barrier_id;
278  w->stalledAtBarrier = false;
279 
280  // set the wavefront context to have a pointer to this section of the LDS
281  w->ldsChunk = ldsChunk;
282 
283  int32_t refCount M5_VAR_USED =
284  lds.increaseRefCounter(w->dispatchId, w->wgId);
285  DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
286  cu_id, w->wgId, refCount);
287 
288  w->instructionBuffer.clear();
289 
290  if (w->pendingFetch)
291  w->dropFetch = true;
292 
293  // is this the last wavefront in the workgroup
294  // if so, set the spillWidth to the remaining work-items
295  // so that the vector access is correct
296  if ((waveId + 1) * wfSize() >= w->actualWgSzTotal) {
297  w->spillWidth = w->actualWgSzTotal - (waveId * wfSize());
298  } else {
299  w->spillWidth = wfSize();
300  }
301 
302  DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
303  "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
304 
305  w->start(++_n_wave, ndr->q.code_ptr);
306 }
307 
308 void
309 ComputeUnit::StartWorkgroup(NDRange *ndr)
310 {
311  // reserve the LDS capacity allocated to the work group
312  // disambiguated by the dispatch ID and workgroup ID, which should be
313  // globally unique
314  LdsChunk *ldsChunk = lds.reserveSpace(ndr->dispatchId, ndr->globalWgId,
315  ndr->q.ldsSize);
316 
317  // Send L1 cache acquire
318  // isKernel + isAcquire = Kernel Begin
319  if (shader->impl_kern_boundary_sync) {
320  GPUDynInstPtr gpuDynInst =
321  std::make_shared<GPUDynInst>(this, nullptr, kernelLaunchInst,
322  getAndIncSeqNum());
323 
324  gpuDynInst->useContinuation = false;
325  injectGlobalMemFence(gpuDynInst, true);
326  }
327 
328  // calculate the number of 32-bit vector registers required by wavefront
329  int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
330  int wave_id = 0;
331 
332  // Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time
333  for (int m = 0; m < shader->n_wf * numSIMDs; ++m) {
334  Wavefront *w = wfList[m % numSIMDs][m / numSIMDs];
335  // Check if this wavefront slot is available:
336  // It must be stopped and not waiting
337  // for a release to complete S_RETURNING
338  if (w->status == Wavefront::S_STOPPED) {
339  fillKernelState(w, ndr);
340  // if we have scheduled all work items then stop
341  // scheduling wavefronts
342  if (wave_id * wfSize() >= w->actualWgSzTotal)
343  break;
344 
345  // reserve vector registers for the scheduled wavefront
346  assert(vectorRegsReserved[m % numSIMDs] <= numVecRegsPerSimd);
347  uint32_t normSize = 0;
348 
349  w->startVgprIndex = vrf[m % numSIMDs]->manager->
350  allocateRegion(vregDemand, &normSize);
351 
352  w->reservedVectorRegs = normSize;
353  vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs;
354 
355  startWavefront(w, wave_id, ldsChunk, ndr);
356  ++wave_id;
357  }
358  }
359  ++barrier_id;
360 }
361 
362 int
363 ComputeUnit::ReadyWorkgroup(NDRange *ndr)
364 {
365  // Get true size of workgroup (after clamping to grid size)
366  int trueWgSize[3];
367  int trueWgSizeTotal = 1;
368 
369  for (int d = 0; d < 3; ++d) {
370  trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] -
371  ndr->wgId[d] * ndr->q.wgSize[d]);
372 
373  trueWgSizeTotal *= trueWgSize[d];
374  DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
375  }
376 
377  DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
378 
379  // calculate the number of 32-bit vector registers required by each
380  // work item of the work group
381  int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
382  bool vregAvail = true;
383  int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
384  int freeWfSlots = 0;
385  // check if the total number of VGPRs required by all WFs of the WG
386  // fit in the VRFs of all SIMD units
387  assert((numWfs * vregDemandPerWI) <= (numSIMDs * numVecRegsPerSimd));
388  int numMappedWfs = 0;
389  std::vector<int> numWfsPerSimd;
390  numWfsPerSimd.resize(numSIMDs, 0);
391  // find how many free WF slots we have across all SIMDs
392  for (int j = 0; j < shader->n_wf; ++j) {
393  for (int i = 0; i < numSIMDs; ++i) {
394  if (wfList[i][j]->status == Wavefront::S_STOPPED) {
395  // count the number of free WF slots
396  ++freeWfSlots;
397  if (numMappedWfs < numWfs) {
398  // count the WFs to be assigned per SIMD
399  numWfsPerSimd[i]++;
400  }
401  numMappedWfs++;
402  }
403  }
404  }
405 
406  // if there are enough free WF slots then find if there are enough
407  // free VGPRs per SIMD based on the WF->SIMD mapping
408  if (freeWfSlots >= numWfs) {
409  for (int j = 0; j < numSIMDs; ++j) {
410  // find if there are enough free VGPR regions in the SIMD's VRF
411  // to accommodate the WFs of the new WG that would be mapped to
412  // this SIMD unit
413  vregAvail = vrf[j]->manager->canAllocate(numWfsPerSimd[j],
414  vregDemandPerWI);
415 
416  // stop searching if there is at least one SIMD
417  // whose VRF does not have enough free VGPR pools.
418  // This is because a WG is scheduled only if ALL
419  // of its WFs can be scheduled
420  if (!vregAvail)
421  break;
422  }
423  }
424 
425  DPRINTF(GPUDisp, "Free WF slots = %d, VGPR Availability = %d\n",
426  freeWfSlots, vregAvail);
427 
428  if (!vregAvail) {
429  ++numTimesWgBlockedDueVgprAlloc;
430  }
431 
432  // Return true if enough WF slots to submit workgroup and if there are
433  // enough VGPRs to schedule all WFs to their SIMD units
434  if (!lds.canReserve(ndr->q.ldsSize)) {
435  wgBlockedDueLdsAllocation++;
436  }
437 
438  // Return true if (a) there are enough free WF slots to submit
439  // workgroup and (b) if there are enough VGPRs to schedule all WFs to their
440  // SIMD units and (c) if there is enough space in LDS
441  return freeWfSlots >= numWfs && vregAvail && lds.canReserve(ndr->q.ldsSize);
442 }
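// Illustrative admission check (assumed numbers): a 16x16x1 workgroup with
// 64-wide waves needs numWfs = ceil(256 / 64) = 4 free wave slots; if each
// work-item needs 8 VGPRs, the WG is only admitted once those slots exist,
// each mapped SIMD's VRF can allocate the corresponding 8-register regions,
// and lds.canReserve() accepts the requested LDS size.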
443 
444 int
445 ComputeUnit::AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots)
446 {
447  DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id);
448  int ccnt = 0;
449 
450  for (int i_simd = 0; i_simd < numSIMDs; ++i_simd) {
451  for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
452  Wavefront *w = wfList[i_simd][i_wf];
453 
454  if (w->status == Wavefront::S_RUNNING) {
455  DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf);
456 
457  DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n",
458  w->barrierId, _barrier_id);
459 
460  DPRINTF(GPUSync, "wf->barrier_cnt %d, bcnt = %d\n",
461  w->barrierCnt, bcnt);
462  }
463 
464  if (w->status == Wavefront::S_RUNNING &&
465  w->barrierId == _barrier_id && w->barrierCnt == bcnt &&
466  !w->outstandingReqs) {
467  ++ccnt;
468 
469  DPRINTF(GPUSync, "WF[%d][%d] at barrier, increment ccnt to "
470  "%d\n", i_simd, i_wf, ccnt);
471  }
472  }
473  }
474 
475  DPRINTF(GPUSync, "CU%d: returning allAtBarrier ccnt = %d, bslots = %d\n",
476  cu_id, ccnt, bslots);
477 
478  return ccnt == bslots;
479 }
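// Illustrative use of AllAtBarrier() (assumed sizes): a 200 work-item
// workgroup with 64-wide waves sets barrierSlots = divCeil(200, 64) = 4, so
// the barrier only completes once all four of its waves report the same
// barrierCnt with no outstanding memory requests, making ccnt == bslots.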
480 
481 // Check if the current wavefront is blocked on additional resources.
482 bool
483 ComputeUnit::cedeSIMD(int simdId, int wfSlotId)
484 {
485  bool cede = false;
486 
487  // If --xact-cas-mode option is enabled in run.py, then xact_cas_ld
488  // magic instructions will impact the scheduling of wavefronts
489  if (xact_cas_mode) {
490  /*
491  * When a wavefront calls xact_cas_ld, it adds itself to a per address
492  * queue. All per address queues are managed by the xactCasLoadMap.
493  *
494  * A wavefront is not blocked if: it is not in ANY per address queue or
495  * if it is at the head of a per address queue.
496  */
497  for (auto itMap : xactCasLoadMap) {
498  std::list<waveIdentifier> curWaveIDQueue = itMap.second.waveIDQueue;
499 
500  if (!curWaveIDQueue.empty()) {
501  for (auto it : curWaveIDQueue) {
502  waveIdentifier cur_wave = it;
503 
504  if (cur_wave.simdId == simdId &&
505  cur_wave.wfSlotId == wfSlotId) {
506  // 2 possibilities
507  // 1: this WF has a green light
508  // 2: another WF has a green light
509  waveIdentifier owner_wave = curWaveIDQueue.front();
510 
511  if (owner_wave.simdId != cur_wave.simdId ||
512  owner_wave.wfSlotId != cur_wave.wfSlotId) {
513  // possibility 2
514  cede = true;
515  break;
516  } else {
517  // possibility 1
518  break;
519  }
520  }
521  }
522  }
523  }
524  }
525 
526  return cede;
527 }
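// Illustrative scenario for the queue walk above (hypothetical wave IDs): if
// WF (simd 0, slot 1) and WF (simd 1, slot 0) both executed xact_cas_ld to
// the same address, only the wave at the front of that address's queue keeps
// its green light; cedeSIMD() returns true for the other until it reaches the
// head of the queue.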
528 
529 // Execute one clock worth of work on the ComputeUnit.
530 void
531 ComputeUnit::exec()
532 {
533  updateEvents();
534  // Execute pipeline stages in reverse order to simulate
535  // the pipeline latency
536  globalMemoryPipe.exec();
537  localMemoryPipe.exec();
538  execStage.exec();
539  scheduleStage.exec();
540  scoreboardCheckStage.exec();
541  fetchStage.exec();
542 
543  totalCycles++;
544 }
545 
546 void
547 ComputeUnit::init()
548 {
549  // Initialize CU Bus models
550  glbMemToVrfBus.init(&shader->tick_cnt, shader->ticks(1));
551  locMemToVrfBus.init(&shader->tick_cnt, shader->ticks(1));
552  nextGlbMemBus = 0;
553  nextLocMemBus = 0;
554  fatal_if(numGlbMemUnits > 1,
555  "No support for multiple Global Memory Pipelines exists!!!");
556  vrfToGlobalMemPipeBus.resize(numGlbMemUnits);
557  for (int j = 0; j < numGlbMemUnits; ++j) {
558  vrfToGlobalMemPipeBus[j] = WaitClass();
559  vrfToGlobalMemPipeBus[j].init(&shader->tick_cnt, shader->ticks(1));
560  }
561 
562  fatal_if(numLocMemUnits > 1,
563  "No support for multiple Local Memory Pipelines exists!!!");
564  vrfToLocalMemPipeBus.resize(numLocMemUnits);
565  for (int j = 0; j < numLocMemUnits; ++j) {
566  vrfToLocalMemPipeBus[j] = WaitClass();
567  vrfToLocalMemPipeBus[j].init(&shader->tick_cnt, shader->ticks(1));
568  }
569  vectorRegsReserved.resize(numSIMDs, 0);
570  aluPipe.resize(numSIMDs);
571  wfWait.resize(numSIMDs + numLocMemUnits + numGlbMemUnits);
572 
573  for (int i = 0; i < numSIMDs + numLocMemUnits + numGlbMemUnits; ++i) {
574  wfWait[i] = WaitClass();
575  wfWait[i].init(&shader->tick_cnt, shader->ticks(1));
576  }
577 
578  for (int i = 0; i < numSIMDs; ++i) {
579  aluPipe[i] = WaitClass();
580  aluPipe[i].init(&shader->tick_cnt, shader->ticks(1));
581  }
582 
583  // Setup space for call args
584  for (int j = 0; j < numSIMDs; ++j) {
585  for (int i = 0; i < shader->n_wf; ++i) {
586  wfList[j][i]->initCallArgMem(shader->funcargs_size, wavefrontSize);
587  }
588  }
589 
590  // Initializing pipeline resources
591  readyList.resize(numSIMDs + numGlbMemUnits + numLocMemUnits);
592  waveStatusList.resize(numSIMDs);
593 
594  for (int j = 0; j < numSIMDs; ++j) {
595  for (int i = 0; i < shader->n_wf; ++i) {
596  waveStatusList[j].push_back(
597  std::make_pair(wfList[j][i], BLOCKED));
598  }
599  }
600 
601  for (int j = 0; j < (numSIMDs + numGlbMemUnits + numLocMemUnits); ++j) {
602  dispatchList.push_back(std::make_pair((Wavefront*)nullptr, EMPTY));
603  }
604 
605  fetchStage.init(this);
606  scoreboardCheckStage.init(this);
607  scheduleStage.init(this);
608  execStage.init(this);
609  globalMemoryPipe.init(this);
610  localMemoryPipe.init(this);
611  // initialize state for statistics calculation
612  vectorAluInstAvail.resize(numSIMDs, false);
613  shrMemInstAvail = 0;
614  glbMemInstAvail = 0;
615 }
616 
617 bool
618 ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
619 {
620  // Ruby has completed the memory op. Schedule the mem_resp_event at the
621  // appropriate cycle to process the timing memory response
622  // This delay represents the pipeline delay
623  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
624  int index = sender_state->port_index;
625  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
626 
627  // Is the packet returned a Kernel End or Barrier
628  if (pkt->req->isKernel() && pkt->req->isRelease()) {
629  Wavefront *w =
630  computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
631 
632  // Check if we are waiting on Kernel End Release
633  if (w->status == Wavefront::S_RETURNING) {
634  DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG id completed %d\n",
635  computeUnit->cu_id, w->simdId, w->wfSlotId,
636  w->wfDynId, w->kernId);
637 
638  computeUnit->shader->dispatcher->notifyWgCompl(w);
639  w->status = Wavefront::S_STOPPED;
640  } else {
641  w->outstandingReqs--;
642  }
643 
644  DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrier_cnt = %d\n",
645  computeUnit->cu_id, gpuDynInst->simdId,
646  gpuDynInst->wfSlotId, w->barrierCnt);
647 
648  if (gpuDynInst->useContinuation) {
649  assert(!gpuDynInst->isNoScope());
650  gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
651  gpuDynInst);
652  }
653 
654  delete pkt->senderState;
655  delete pkt;
656  return true;
657  } else if (pkt->req->isKernel() && pkt->req->isAcquire()) {
658  if (gpuDynInst->useContinuation) {
659  assert(!gpuDynInst->isNoScope());
660  gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
661  gpuDynInst);
662  }
663 
664  delete pkt->senderState;
665  delete pkt;
666  return true;
667  }
668 
669  EventFunctionWrapper *mem_resp_event =
670  computeUnit->memPort[index]->createMemRespEvent(pkt);
671 
672  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x received!\n",
673  computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
674  index, pkt->req->getPaddr());
675 
676  computeUnit->schedule(mem_resp_event,
677  curTick() + computeUnit->resp_tick_latency);
678  return true;
679 }
680 
681 void
682 ComputeUnit::DataPort::recvReqRetry()
683 {
684  int len = retries.size();
685 
686  assert(len > 0);
687 
688  for (int i = 0; i < len; ++i) {
689  PacketPtr pkt = retries.front().first;
690  GPUDynInstPtr gpuDynInst M5_VAR_USED = retries.front().second;
691  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
692  computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
693  pkt->req->getPaddr());
694 
698  if (!sendTimingReq(pkt)) {
699  DPRINTF(GPUMem, "failed again!\n");
700  break;
701  } else {
702  DPRINTF(GPUMem, "successful!\n");
703  retries.pop_front();
704  }
705  }
706 }
707 
708 bool
709 ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
710 {
711  computeUnit->fetchStage.processFetchReturn(pkt);
712 
713  return true;
714 }
715 
716 void
717 ComputeUnit::SQCPort::recvReqRetry()
718 {
719  int len = retries.size();
720 
721  assert(len > 0);
722 
723  for (int i = 0; i < len; ++i) {
724  PacketPtr pkt = retries.front().first;
725  Wavefront *wavefront M5_VAR_USED = retries.front().second;
726  DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
727  computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
728  pkt->req->getPaddr());
729  if (!sendTimingReq(pkt)) {
730  DPRINTF(GPUFetch, "failed again!\n");
731  break;
732  } else {
733  DPRINTF(GPUFetch, "successful!\n");
734  retries.pop_front();
735  }
736  }
737 }
738 
739 void
740 ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
741 {
742  // There must be a way around this check to do the globalMemStart...
743  Addr tmp_vaddr = pkt->req->getVaddr();
744 
745  updatePageDivergenceDist(tmp_vaddr);
746 
747  // set PC in request
748  pkt->req->setPC(gpuDynInst->wavefront()->pc());
749 
750  pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());
751 
752  // figure out the type of the request to set read/write
753  BaseTLB::Mode TLB_mode;
754  assert(pkt->isRead() || pkt->isWrite());
755 
756  // Check write before read for atomic operations
757  // since atomic operations should use BaseTLB::Write
758  if (pkt->isWrite()){
759  TLB_mode = BaseTLB::Write;
760  } else if (pkt->isRead()) {
761  TLB_mode = BaseTLB::Read;
762  } else {
763  fatal("pkt is not a read nor a write\n");
764  }
765 
766  tlbCycles -= curTick();
767  ++tlbRequests;
768 
769  int tlbPort_index = perLaneTLB ? index : 0;
770 
771  if (shader->timingSim) {
772  if (debugSegFault) {
773  Process *p = shader->gpuTc->getProcessPtr();
774  Addr vaddr = pkt->req->getVaddr();
775  unsigned size = pkt->getSize();
776 
777  if ((vaddr + size - 1) % 64 < vaddr % 64) {
778  panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
779  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
780  }
781 
782  Addr paddr;
783 
784  if (!p->pTable->translate(vaddr, paddr)) {
785  if (!p->fixupFault(vaddr)) {
786  panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
787  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
788  vaddr);
789  }
790  }
791  }
792 
793  // This is the SenderState needed upon return
794  pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);
795 
796  // This is the senderState needed by the TLB hierarchy to function
797  TheISA::GpuTLB::TranslationState *translation_state =
798  new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc, false,
799  pkt->senderState);
800 
801  pkt->senderState = translation_state;
802 
803  if (functionalTLB) {
804  tlbPort[tlbPort_index]->sendFunctional(pkt);
805 
806  // update the hitLevel distribution
807  int hit_level = translation_state->hitLevel;
808  assert(hit_level != -1);
809  hitsPerTLBLevel[hit_level]++;
810 
811  // New SenderState for the memory access
812  X86ISA::GpuTLB::TranslationState *sender_state =
813  safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
814 
815  delete sender_state->tlbEntry;
816  delete sender_state->saved;
817  delete sender_state;
818 
819  assert(pkt->req->hasPaddr());
820  assert(pkt->req->hasSize());
821 
822  uint8_t *tmpData = pkt->getPtr<uint8_t>();
823 
824  // this is necessary because the GPU TLB receives packets instead
825  // of requests. when the translation is complete, all relevent
826  // fields in the request will be populated, but not in the packet.
827  // here we create the new packet so we can set the size, addr,
828  // and proper flags.
829  PacketPtr oldPkt = pkt;
830  pkt = new Packet(oldPkt->req, oldPkt->cmd);
831  delete oldPkt;
832  pkt->dataStatic(tmpData);
833 
834 
835  // New SenderState for the memory access
836  pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst,
837  index, nullptr);
838 
839  gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
840  gpuDynInst->tlbHitLevel[index] = hit_level;
841 
842 
843  // translation is done. Schedule the mem_req_event at the
844  // appropriate cycle to send the timing memory request to ruby
845  EventFunctionWrapper *mem_req_event =
846  memPort[index]->createMemReqEvent(pkt);
847 
848  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
849  "scheduled\n", cu_id, gpuDynInst->simdId,
850  gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
851 
852  schedule(mem_req_event, curTick() + req_tick_latency);
853  } else if (tlbPort[tlbPort_index]->isStalled()) {
854  assert(tlbPort[tlbPort_index]->retries.size() > 0);
855 
856  DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
857  "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
858  tmp_vaddr);
859 
860  tlbPort[tlbPort_index]->retries.push_back(pkt);
861  } else if (!tlbPort[tlbPort_index]->sendTimingReq(pkt)) {
862  // Stall the data port;
863  // No more packet will be issued till
864  // ruby indicates resources are freed by
865  // a recvReqRetry() call back on this port.
866  tlbPort[tlbPort_index]->stallPort();
867 
868  DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
869  "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
870  tmp_vaddr);
871 
872  tlbPort[tlbPort_index]->retries.push_back(pkt);
873  } else {
874  DPRINTF(GPUTLB,
875  "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
876  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
877  }
878  } else {
879  if (pkt->cmd == MemCmd::MemFenceReq) {
880  gpuDynInst->statusBitVector = VectorMask(0);
881  } else {
882  gpuDynInst->statusBitVector &= (~(1ll << index));
883  }
884 
885  // New SenderState for the memory access
886  delete pkt->senderState;
887 
888  // Because it's atomic operation, only need TLB translation state
889  pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode,
890  shader->gpuTc);
891 
892  tlbPort[tlbPort_index]->sendFunctional(pkt);
893 
894  // the addr of the packet is not modified, so we need to create a new
895  // packet, or otherwise the memory access will have the old virtual
896  // address sent in the translation packet, instead of the physical
897  // address returned by the translation.
898  PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
899  new_pkt->dataStatic(pkt->getPtr<uint8_t>());
900 
901  // Translation is done. It is safe to send the packet to memory.
902  memPort[0]->sendFunctional(new_pkt);
903 
904  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
905  gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
906  new_pkt->req->getPaddr());
907 
908  // safe_cast the senderState
909  TheISA::GpuTLB::TranslationState *sender_state =
910  safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
911 
912  delete sender_state->tlbEntry;
913  delete new_pkt;
914  delete pkt->senderState;
915  delete pkt;
916  }
917 }
918 
919 void
920 ComputeUnit::sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
921 {
922  EventFunctionWrapper *mem_req_event =
923  memPort[index]->createMemReqEvent(pkt);
924 
925 
926  // New SenderState for the memory access
927  pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, index,
928  nullptr);
929 
930  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
931  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
932  pkt->req->getPaddr());
933 
934  schedule(mem_req_event, curTick() + req_tick_latency);
935 }
936 
937 void
938 ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch,
939  RequestPtr req)
940 {
941  assert(gpuDynInst->isGlobalSeg());
942 
943  if (!req) {
944  req = std::make_shared<Request>(
945  0, 0, 0, masterId(), 0, gpuDynInst->wfDynId);
946  }
947  req->setPaddr(0);
948  if (kernelLaunch) {
949  req->setFlags(Request::KERNEL);
950  }
951 
952  // for non-kernel MemFence operations, memorder flags are set depending
953  // on which type of request is currently being sent, so this
954  // should be set by the caller (e.g. if an inst has acq-rel
955  // semantics, it will send one acquire req an one release req)
956  gpuDynInst->setRequestFlags(req, kernelLaunch);
957 
958  // a mem fence must correspond to an acquire/release request
959  assert(req->isAcquire() || req->isRelease());
960 
961  // create packet
962  PacketPtr pkt = new Packet(req, MemCmd::MemFenceReq);
963 
964  // set packet's sender state
965  pkt->senderState =
966  new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr);
967 
968  // send the packet
969  sendSyncRequest(gpuDynInst, 0, pkt);
970 }
971 
972 void
973 ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
974 {
975  DataPort::SenderState *sender_state =
976  safe_cast<DataPort::SenderState*>(pkt->senderState);
977 
978  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
979  ComputeUnit *compute_unit = computeUnit;
980 
981  assert(gpuDynInst);
982 
983  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
984  compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
985  pkt->req->getPaddr(), index);
986 
987  Addr paddr = pkt->req->getPaddr();
988 
989  if (pkt->cmd != MemCmd::MemFenceResp) {
990  int index = gpuDynInst->memStatusVector[paddr].back();
991 
992  DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
993  pkt->req->getPaddr(), index);
994 
995  gpuDynInst->memStatusVector[paddr].pop_back();
996  gpuDynInst->pAddr = pkt->req->getPaddr();
997 
998  if (pkt->isRead() || pkt->isWrite()) {
999 
1000  if (gpuDynInst->n_reg <= MAX_REGS_FOR_NON_VEC_MEM_INST) {
1001  gpuDynInst->statusBitVector &= (~(1ULL << index));
1002  } else {
1003  assert(gpuDynInst->statusVector[index] > 0);
1004  gpuDynInst->statusVector[index]--;
1005 
1006  if (!gpuDynInst->statusVector[index])
1007  gpuDynInst->statusBitVector &= (~(1ULL << index));
1008  }
1009 
1010  DPRINTF(GPUMem, "bitvector is now %#x\n",
1011  gpuDynInst->statusBitVector);
1012 
1013  if (gpuDynInst->statusBitVector == VectorMask(0)) {
1014  auto iter = gpuDynInst->memStatusVector.begin();
1015  auto end = gpuDynInst->memStatusVector.end();
1016 
1017  while (iter != end) {
1018  assert(iter->second.empty());
1019  ++iter;
1020  }
1021 
1022  gpuDynInst->memStatusVector.clear();
1023 
1024  if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
1025  gpuDynInst->statusVector.clear();
1026 
1027  compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
1028 
1029  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
1030  compute_unit->cu_id, gpuDynInst->simdId,
1031  gpuDynInst->wfSlotId);
1032 
1033  // after clearing the status vectors,
1034  // see if there is a continuation to perform
1035  // the continuation may generate more work for
1036  // this memory request
1037  if (gpuDynInst->useContinuation) {
1038  assert(!gpuDynInst->isNoScope());
1039  gpuDynInst->execContinuation(
1040  gpuDynInst->staticInstruction(),
1041  gpuDynInst);
1042  }
1043  }
1044  }
1045  } else {
1046  gpuDynInst->statusBitVector = VectorMask(0);
1047 
1048  if (gpuDynInst->useContinuation) {
1049  assert(!gpuDynInst->isNoScope());
1050  gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
1051  gpuDynInst);
1052  }
1053  }
1054 
1055  delete pkt->senderState;
1056  delete pkt;
1057 }
1058 
1059 ComputeUnit*
1060 ComputeUnitParams::create()
1061 {
1062  return new ComputeUnit(this);
1063 }
1064 
1065 bool
1066 ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
1067 {
1068  Addr line = pkt->req->getPaddr();
1069 
1070  DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
1071  pkt->req->getVaddr(), line);
1072 
1073  assert(pkt->senderState);
1074  computeUnit->tlbCycles += curTick();
1075 
1076  // pop off the TLB translation state
1077  TheISA::GpuTLB::TranslationState *translation_state =
1078  safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
1079 
1080  // no PageFaults are permitted for data accesses
1081  if (!translation_state->tlbEntry) {
1082  DTLBPort::SenderState *sender_state =
1083  safe_cast<DTLBPort::SenderState*>(translation_state->saved);
1084 
1085  Wavefront *w M5_VAR_USED =
1086  computeUnit->wfList[sender_state->_gpuDynInst->simdId]
1087  [sender_state->_gpuDynInst->wfSlotId];
1088 
1089  DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
1090  pkt->req->getVaddr());
1091  }
1092 
1093  // update the hitLevel distribution
1094  int hit_level = translation_state->hitLevel;
1095  computeUnit->hitsPerTLBLevel[hit_level]++;
1096 
1097  delete translation_state->tlbEntry;
1098  assert(!translation_state->ports.size());
1099  pkt->senderState = translation_state->saved;
1100 
1101  // for prefetch pkt
1102  BaseTLB::Mode TLB_mode = translation_state->tlbMode;
1103 
1104  delete translation_state;
1105 
1106  // use the original sender state to know how to close this transaction
1107  DTLBPort::SenderState *sender_state =
1108  safe_cast<DTLBPort::SenderState*>(pkt->senderState);
1109 
1110  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1111  int mp_index = sender_state->portIndex;
1112  Addr vaddr = pkt->req->getVaddr();
1113  gpuDynInst->memStatusVector[line].push_back(mp_index);
1114  gpuDynInst->tlbHitLevel[mp_index] = hit_level;
1115 
1116  MemCmd requestCmd;
1117 
1118  if (pkt->cmd == MemCmd::ReadResp) {
1119  requestCmd = MemCmd::ReadReq;
1120  } else if (pkt->cmd == MemCmd::WriteResp) {
1121  requestCmd = MemCmd::WriteReq;
1122  } else if (pkt->cmd == MemCmd::SwapResp) {
1123  requestCmd = MemCmd::SwapReq;
1124  } else {
1125  panic("unsupported response to request conversion %s\n",
1126  pkt->cmd.toString());
1127  }
1128 
1129  if (computeUnit->prefetchDepth) {
1130  int simdId = gpuDynInst->simdId;
1131  int wfSlotId = gpuDynInst->wfSlotId;
1132  Addr last = 0;
1133 
1134  switch(computeUnit->prefetchType) {
1135  case Enums::PF_CU:
1136  last = computeUnit->lastVaddrCU[mp_index];
1137  break;
1138  case Enums::PF_PHASE:
1139  last = computeUnit->lastVaddrSimd[simdId][mp_index];
1140  break;
1141  case Enums::PF_WF:
1142  last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
1143  default:
1144  break;
1145  }
1146 
1147  DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
1148  computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
1149 
1150  int stride = last ? (roundDown(vaddr, TheISA::PageBytes) -
1151  roundDown(last, TheISA::PageBytes)) >> TheISA::PageShift
1152  : 0;
1153 
1154  DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
1155 
1156  computeUnit->lastVaddrCU[mp_index] = vaddr;
1157  computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
1158  computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
1159 
1160  stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
1161  computeUnit->prefetchStride: stride;
1162 
1163  DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
1164  computeUnit->cu_id, simdId, wfSlotId, mp_index);
1165 
1166  DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);
1167 
1168  // Prefetch Next few pages atomically
1169  for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
1170  DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
1171  vaddr+stride*pf*TheISA::PageBytes);
1172 
1173  if (!stride)
1174  break;
1175 
1176  RequestPtr prefetch_req = std::make_shared<Request>(
1177  vaddr + stride * pf * TheISA::PageBytes,
1178  sizeof(uint8_t), 0,
1179  computeUnit->masterId(),
1180  0, 0, nullptr);
1181 
1182  PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
1183  uint8_t foo = 0;
1184  prefetch_pkt->dataStatic(&foo);
1185 
1186  // Because it's atomic operation, only need TLB translation state
1187  prefetch_pkt->senderState =
1188  new TheISA::GpuTLB::TranslationState(TLB_mode,
1189  computeUnit->shader->gpuTc,
1190  true);
1191 
1192  // Currently prefetches are zero-latency, hence the sendFunctional
1193  sendFunctional(prefetch_pkt);
1194 
1195  /* safe_cast the senderState */
1196  TheISA::GpuTLB::TranslationState *tlb_state =
1197  safe_cast<TheISA::GpuTLB::TranslationState*>(
1198  prefetch_pkt->senderState);
1199 
1200 
1201  delete tlb_state->tlbEntry;
1202  delete tlb_state;
1203  delete prefetch_pkt;
1204  }
1205  }
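 // Illustrative stride arithmetic for the prefetch loop above (hypothetical
 // addresses): if the previous access fell on page 0x7f0000 and this vaddr on
 // page 0x7f2000 with 4 KiB pages, stride = (0x7f2000 - 0x7f0000) >> 12 = 2,
 // so a prefetchDepth of 2 functionally translates pages 0x7f4000 and
 // 0x7f6000 ahead of time.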
1206 
1207  // First we must convert the response cmd back to a request cmd so that
1208  // the request can be sent through the cu's master port
1209  PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
1210  new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1211  delete pkt->senderState;
1212  delete pkt;
1213 
1214  // New SenderState for the memory access
1215  new_pkt->senderState =
1216  new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
1217  nullptr);
1218 
1219  // translation is done. Schedule the mem_req_event at the appropriate
1220  // cycle to send the timing memory request to ruby
1221  EventFunctionWrapper *mem_req_event =
1222  computeUnit->memPort[mp_index]->createMemReqEvent(new_pkt);
1223 
1224  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
1225  computeUnit->cu_id, gpuDynInst->simdId,
1226  gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());
1227 
1228  computeUnit->schedule(mem_req_event, curTick() +
1229  computeUnit->req_tick_latency);
1230 
1231  return true;
1232 }
1233 
1234 EventFunctionWrapper*
1235 ComputeUnit::DataPort::createMemReqEvent(PacketPtr pkt)
1236 {
1237  return new EventFunctionWrapper(
1238  [this, pkt]{ processMemReqEvent(pkt); },
1239  "ComputeUnit memory request event", true);
1240 }
1241 
1242 EventFunctionWrapper*
1243 ComputeUnit::DataPort::createMemRespEvent(PacketPtr pkt)
1244 {
1245  return new EventFunctionWrapper(
1246  [this, pkt]{ processMemRespEvent(pkt); },
1247  "ComputeUnit memory response event", true);
1248 }
1249 
1250 void
1251 ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt)
1252 {
1253  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1254  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1255  ComputeUnit *compute_unit M5_VAR_USED = computeUnit;
1256 
1257  if (!(sendTimingReq(pkt))) {
1258  retries.push_back(std::make_pair(pkt, gpuDynInst));
1259 
1260  DPRINTF(GPUPort,
1261  "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
1262  compute_unit->cu_id, gpuDynInst->simdId,
1263  gpuDynInst->wfSlotId, index,
1264  pkt->req->getPaddr());
1265  } else {
1266  DPRINTF(GPUPort,
1267  "CU%d: WF[%d][%d]: index %d, addr %#x data req sent!\n",
1268  compute_unit->cu_id, gpuDynInst->simdId,
1269  gpuDynInst->wfSlotId, index,
1270  pkt->req->getPaddr());
1271  }
1272 }
1273 
1274 /*
1275  * The initial translation request could have been rejected,
1276  * if <retries> queue is not empty. Retry sending the translation
1277  * request. sendRetry() is called from the peer port whenever
1278  * a translation completes.
1279  */
1280 void
1281 ComputeUnit::DTLBPort::recvReqRetry()
1282 {
1283  int len = retries.size();
1284 
1285  DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
1286  computeUnit->cu_id, len);
1287 
1288  assert(len > 0);
1289  assert(isStalled());
1290  // recvReqRetry is an indication that the resource on which this
1291  // port was stalling on is freed. So, remove the stall first
1292  unstallPort();
1293 
1294  for (int i = 0; i < len; ++i) {
1295  PacketPtr pkt = retries.front();
1296  Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
1297  DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x", computeUnit->cu_id, vaddr);
1298 
1299  if (!sendTimingReq(pkt)) {
1300  // Stall port
1301  stallPort();
1302  DPRINTF(GPUTLB, ": failed again\n");
1303  break;
1304  } else {
1305  DPRINTF(GPUTLB, ": successful\n");
1306  retries.pop_front();
1307  }
1308  }
1309 }
1310 
1311 bool
1312 ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
1313 {
1314  Addr line M5_VAR_USED = pkt->req->getPaddr();
1315  DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
1316  computeUnit->cu_id, pkt->req->getVaddr(), line);
1317 
1318  assert(pkt->senderState);
1319 
1320  // pop off the TLB translation state
1321  TheISA::GpuTLB::TranslationState *translation_state =
1322  safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
1323 
1324  bool success = translation_state->tlbEntry != nullptr;
1325  delete translation_state->tlbEntry;
1326  assert(!translation_state->ports.size());
1327  pkt->senderState = translation_state->saved;
1328  delete translation_state;
1329 
1330  // use the original sender state to know how to close this transaction
1331  ITLBPort::SenderState *sender_state =
1332  safe_cast<ITLBPort::SenderState*>(pkt->senderState);
1333 
1334  // get the wavefront associated with this translation request
1335  Wavefront *wavefront = sender_state->wavefront;
1336  delete pkt->senderState;
1337 
1338  if (success) {
1339  // pkt is reused in fetch(), don't delete it here. However, we must
1340  // reset the command to be a request so that it can be sent through
1341  // the cu's master port
1342  assert(pkt->cmd == MemCmd::ReadResp);
1343  pkt->cmd = MemCmd::ReadReq;
1344 
1345  computeUnit->fetchStage.fetch(pkt, wavefront);
1346  } else {
1347  if (wavefront->dropFetch) {
1348  assert(wavefront->instructionBuffer.empty());
1349  wavefront->dropFetch = false;
1350  }
1351 
1352  wavefront->pendingFetch = 0;
1353  }
1354 
1355  return true;
1356 }
1357 
1358 /*
1359  * The initial translation request could have been rejected, if
1360  * <retries> queue is not empty. Retry sending the translation
1361  * request. sendRetry() is called from the peer port whenever
1362  * a translation completes.
1363  */
1364 void
1365 ComputeUnit::ITLBPort::recvReqRetry()
1366 {
1367 
1368  int len = retries.size();
1369  DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n", computeUnit->cu_id, len);
1370 
1371  assert(len > 0);
1372  assert(isStalled());
1373 
1374  // recvReqRetry is an indication that the resource on which this
1375  // port was stalling on is freed. So, remove the stall first
1376  unstallPort();
1377 
1378  for (int i = 0; i < len; ++i) {
1379  PacketPtr pkt = retries.front();
1380  Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
1381  DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x", computeUnit->cu_id, vaddr);
1382 
1383  if (!sendTimingReq(pkt)) {
1384  stallPort(); // Stall port
1385  DPRINTF(GPUTLB, ": failed again\n");
1386  break;
1387  } else {
1388  DPRINTF(GPUTLB, ": successful\n");
1389  retries.pop_front();
1390  }
1391  }
1392 }
1393 
1394 void
1395 ComputeUnit::regStats()
1396 {
1397  ClockedObject::regStats();
1398 
1399  vALUInsts
1400  .name(name() + ".valu_insts")
1401  .desc("Number of vector ALU insts issued.")
1402  ;
1403  vALUInstsPerWF
1404  .name(name() + ".valu_insts_per_wf")
1405  .desc("The avg. number of vector ALU insts issued per-wavefront.")
1406  ;
1407  sALUInsts
1408  .name(name() + ".salu_insts")
1409  .desc("Number of scalar ALU insts issued.")
1410  ;
1411  sALUInstsPerWF
1412  .name(name() + ".salu_insts_per_wf")
1413  .desc("The avg. number of scalar ALU insts issued per-wavefront.")
1414  ;
1415  instCyclesVALU
1416  .name(name() + ".inst_cycles_valu")
1417  .desc("Number of cycles needed to execute VALU insts.")
1418  ;
1419  instCyclesSALU
1420  .name(name() + ".inst_cycles_salu")
1421  .desc("Number of cycles needed to execute SALU insts.")
1422  ;
1423  threadCyclesVALU
1424  .name(name() + ".thread_cycles_valu")
1425  .desc("Number of thread cycles used to execute vector ALU ops. "
1426  "Similar to instCyclesVALU but multiplied by the number of "
1427  "active threads.")
1428  ;
1429  vALUUtilization
1430  .name(name() + ".valu_utilization")
1431  .desc("Percentage of active vector ALU threads in a wave.")
1432  ;
1433  ldsNoFlatInsts
1434  .name(name() + ".lds_no_flat_insts")
1435  .desc("Number of LDS insts issued, not including FLAT "
1436  "accesses that resolve to LDS.")
1437  ;
1438  ldsNoFlatInstsPerWF
1439  .name(name() + ".lds_no_flat_insts_per_wf")
1440  .desc("The avg. number of LDS insts (not including FLAT "
1441  "accesses that resolve to LDS) per-wavefront.")
1442  ;
1443  flatVMemInsts
1444  .name(name() + ".flat_vmem_insts")
1445  .desc("The number of FLAT insts that resolve to vmem issued.")
1446  ;
1447  flatVMemInstsPerWF
1448  .name(name() + ".flat_vmem_insts_per_wf")
1449  .desc("The average number of FLAT insts that resolve to vmem "
1450  "issued per-wavefront.")
1451  ;
1452  flatLDSInsts
1453  .name(name() + ".flat_lds_insts")
1454  .desc("The number of FLAT insts that resolve to LDS issued.")
1455  ;
1456  flatLDSInstsPerWF
1457  .name(name() + ".flat_lds_insts_per_wf")
1458  .desc("The average number of FLAT insts that resolve to LDS "
1459  "issued per-wavefront.")
1460  ;
1461  vectorMemWrites
1462  .name(name() + ".vector_mem_writes")
1463  .desc("Number of vector mem write insts (excluding FLAT insts).")
1464  ;
1465  vectorMemWritesPerWF
1466  .name(name() + ".vector_mem_writes_per_wf")
1467  .desc("The average number of vector mem write insts "
1468  "(excluding FLAT insts) per-wavefront.")
1469  ;
1470  vectorMemReads
1471  .name(name() + ".vector_mem_reads")
1472  .desc("Number of vector mem read insts (excluding FLAT insts).")
1473  ;
1474  vectorMemReadsPerWF
1475  .name(name() + ".vector_mem_reads_per_wf")
1476  .desc("The avg. number of vector mem read insts (excluding "
1477  "FLAT insts) per-wavefront.")
1478  ;
1479  scalarMemWrites
1480  .name(name() + ".scalar_mem_writes")
1481  .desc("Number of scalar mem write insts.")
1482  ;
1483  scalarMemWritesPerWF
1484  .name(name() + ".scalar_mem_writes_per_wf")
1485  .desc("The average number of scalar mem write insts per-wavefront.")
1486  ;
1487  scalarMemReads
1488  .name(name() + ".scalar_mem_reads")
1489  .desc("Number of scalar mem read insts.")
1490  ;
1491  scalarMemReadsPerWF
1492  .name(name() + ".scalar_mem_reads_per_wf")
1493  .desc("The average number of scalar mem read insts per-wavefront.")
1494  ;
1495 
1496  vALUInstsPerWF = vALUInsts / completedWfs;
1497  sALUInstsPerWF = sALUInsts / completedWfs;
1498  vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
1499  ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
1500  flatVMemInstsPerWF = flatVMemInsts / completedWfs;
1501  flatLDSInstsPerWF = flatLDSInsts / completedWfs;
1502  vectorMemWritesPerWF = vectorMemWrites / completedWfs;
1503  vectorMemReadsPerWF = vectorMemReads / completedWfs;
1504  scalarMemWritesPerWF = scalarMemWrites / completedWfs;
1505  scalarMemReadsPerWF = scalarMemReads / completedWfs;
1506 
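 // Illustrative reading of the vALUUtilization formula above (assumed
 // counts): 100 VALU instructions with an average of 48 of 64 lanes active
 // give threadCyclesVALU = 4800 and a utilization of
 // (4800 / (64 * 100)) * 100 = 75%.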
1507  tlbCycles
1508  .name(name() + ".tlb_cycles")
1509  .desc("total number of cycles for all uncoalesced requests")
1510  ;
1511 
1512  tlbRequests
1513  .name(name() + ".tlb_requests")
1514  .desc("number of uncoalesced requests")
1515  ;
1516 
1517  tlbLatency
1518  .name(name() + ".avg_translation_latency")
1519  .desc("Avg. translation latency for data translations")
1520  ;
1521 
1522  tlbLatency = tlbCycles / tlbRequests;
1523 
1524  hitsPerTLBLevel
1525  .init(4)
1526  .name(name() + ".TLB_hits_distribution")
1527  .desc("TLB hits distribution (0 for page table, x for Lx-TLB")
1528  ;
1529 
1530  // fixed number of TLB levels
1531  for (int i = 0; i < 4; ++i) {
1532  if (!i)
1533  hitsPerTLBLevel.subname(i,"page_table");
1534  else
1535  hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
1536  }
1537 
1538  execRateDist
1539  .init(0, 10, 2)
1540  .name(name() + ".inst_exec_rate")
1541  .desc("Instruction Execution Rate: Number of executed vector "
1542  "instructions per cycle")
1543  ;
1544 
1545  ldsBankConflictDist
1546  .init(0, wfSize(), 2)
1547  .name(name() + ".lds_bank_conflicts")
1548  .desc("Number of bank conflicts per LDS memory packet")
1549  ;
1550 
1551  ldsBankAccesses
1552  .name(name() + ".lds_bank_access_cnt")
1553  .desc("Total number of LDS bank accesses")
1554  ;
1555 
1556  pageDivergenceDist
1557  // A wavefront can touch up to N pages per memory instruction where
1558  // N is equal to the wavefront size
1559  // The number of pages per bin can be configured (here it's 4).
1560  .init(1, wfSize(), 4)
1561  .name(name() + ".page_divergence_dist")
1562  .desc("pages touched per wf (over all mem. instr.)")
1563  ;
1564 
1565  controlFlowDivergenceDist
1566  .init(1, wfSize(), 4)
1567  .name(name() + ".warp_execution_dist")
1568  .desc("number of lanes active per instruction (over all instructions)")
1569  ;
1570 
1571  activeLanesPerGMemInstrDist
1572  .init(1, wfSize(), 4)
1573  .name(name() + ".gmem_lanes_execution_dist")
1574  .desc("number of active lanes per global memory instruction")
1575  ;
1576 
1577  activeLanesPerLMemInstrDist
1578  .init(1, wfSize(), 4)
1579  .name(name() + ".lmem_lanes_execution_dist")
1580  .desc("number of active lanes per local memory instruction")
1581  ;
1582 
1583  numInstrExecuted
1584  .name(name() + ".num_instr_executed")
1585  .desc("number of instructions executed")
1586  ;
1587 
1588  numVecOpsExecuted
1589  .name(name() + ".num_vec_ops_executed")
1590  .desc("number of vec ops executed (e.g. WF size/inst)")
1591  ;
1592 
1593  totalCycles
1594  .name(name() + ".num_total_cycles")
1595  .desc("number of cycles the CU ran for")
1596  ;
1597 
1598  ipc
1599  .name(name() + ".ipc")
1600  .desc("Instructions per cycle (this CU only)")
1601  ;
1602 
1603  vpc
1604  .name(name() + ".vpc")
1605  .desc("Vector Operations per cycle (this CU only)")
1606  ;
1607 
1608  numALUInstsExecuted
1609  .name(name() + ".num_alu_insts_executed")
1610  .desc("Number of dynamic non-GM memory insts executed")
1611  ;
1612 
1613  wgBlockedDueLdsAllocation
1614  .name(name() + ".wg_blocked_due_lds_alloc")
1615  .desc("Workgroup blocked due to LDS capacity")
1616  ;
1617 
1617 
1618  ipc = numInstrExecuted / totalCycles;
1619  vpc = numVecOpsExecuted / totalCycles;
1620 
1621  numTimesWgBlockedDueVgprAlloc
1622  .name(name() + ".times_wg_blocked_due_vgpr_alloc")
1623  .desc("Number of times WGs are blocked due to VGPR allocation per SIMD")
1624  ;
1625 
1626  dynamicGMemInstrCnt
1627  .name(name() + ".global_mem_instr_cnt")
1628  .desc("dynamic global memory instructions count")
1629  ;
1630 
1631  dynamicLMemInstrCnt
1632  .name(name() + ".local_mem_instr_cnt")
1633  .desc("dynamic local memory instruction count")
1634  ;
1635 
1636  numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
1637  dynamicLMemInstrCnt;
1638 
1639  completedWfs
1640  .name(name() + ".num_completed_wfs")
1641  .desc("number of completed wavefronts")
1642  ;
1643 
1644  numCASOps
1645  .name(name() + ".num_CAS_ops")
1646  .desc("number of compare and swap operations")
1647  ;
1648 
1649  numFailedCASOps
1650  .name(name() + ".num_failed_CAS_ops")
1651  .desc("number of compare and swap operations that failed")
1652  ;
1653 
1654  // register stats of pipeline stages
1655  fetchStage.regStats();
1656  scoreboardCheckStage.regStats();
1657  scheduleStage.regStats();
1658  execStage.regStats();
1659 
1660  // register stats of memory pipeline
1661  globalMemoryPipe.regStats();
1662  localMemoryPipe.regStats();
1663 }
1664 
1665 void
1666 ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
1667 {
1668  if (gpuDynInst->isScalar()) {
1669  if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
1670  sALUInsts++;
1671  instCyclesSALU++;
1672  } else if (gpuDynInst->isLoad()) {
1673  scalarMemReads++;
1674  } else if (gpuDynInst->isStore()) {
1675  scalarMemWrites++;
1676  }
1677  } else {
1678  if (gpuDynInst->isALU()) {
1679  vALUInsts++;
1680  instCyclesVALU++;
1681  threadCyclesVALU += gpuDynInst->wavefront()->execMask().count();
1682  } else if (gpuDynInst->isFlat()) {
1683  if (gpuDynInst->isLocalMem()) {
1684  flatLDSInsts++;
1685  } else {
1686  flatVMemInsts++;
1687  }
1688  } else if (gpuDynInst->isLocalMem()) {
1689  ldsNoFlatInsts++;
1690  } else if (gpuDynInst->isLoad()) {
1691  vectorMemReads++;
1692  } else if (gpuDynInst->isStore()) {
1693  vectorMemWrites++;
1694  }
1695  }
1696 }
1697 
1698 void
1699 ComputeUnit::updatePageDivergenceDist(Addr addr)
1700 {
1701  Addr virt_page_addr = roundDown(addr, TheISA::PageBytes);
1702 
1703  if (!pagesTouched.count(virt_page_addr))
1704  pagesTouched[virt_page_addr] = 1;
1705  else
1706  pagesTouched[virt_page_addr]++;
1707 }
1708 
1709 void
1710 ComputeUnit::CUExitCallback::process()
1711 {
1712  if (computeUnit->countPages) {
1713  std::ostream *page_stat_file =
1714  simout.create(computeUnit->name().c_str())->stream();
1715 
1716  *page_stat_file << "page, wavefront accesses, workitem accesses" <<
1717  std::endl;
1718 
1719  for (auto iter : computeUnit->pageAccesses) {
1720  *page_stat_file << std::hex << iter.first << ",";
1721  *page_stat_file << std::dec << iter.second.first << ",";
1722  *page_stat_file << std::dec << iter.second.second << std::endl;
1723  }
1724  }
1725  }
1726 
1727 bool
1728 ComputeUnit::isDone() const
1729 {
1730  for (int i = 0; i < numSIMDs; ++i) {
1731  if (!isSimdDone(i)) {
1732  return false;
1733  }
1734  }
1735 
1736  bool glbMemBusRdy = true;
1737  for (int j = 0; j < numGlbMemUnits; ++j) {
1738  glbMemBusRdy &= vrfToGlobalMemPipeBus[j].rdy();
1739  }
1740  bool locMemBusRdy = true;
1741  for (int j = 0; j < numLocMemUnits; ++j) {
1742  locMemBusRdy &= vrfToLocalMemPipeBus[j].rdy();
1743  }
1744 
1745  if (!globalMemoryPipe.isGMLdRespFIFOWrRdy() ||
1746  !globalMemoryPipe.isGMStRespFIFOWrRdy() ||
1747  !globalMemoryPipe.isGMReqFIFOWrRdy() || !localMemoryPipe.isLMReqFIFOWrRdy()
1748  || !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy() ||
1749  !glbMemToVrfBus.rdy() || !locMemBusRdy || !glbMemBusRdy) {
1750  return false;
1751  }
1752 
1753  return true;
1754 }
1755 
1756 int32_t
1757 ComputeUnit::getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
1758 {
1759  return lds.getRefCounter(dispatchId, wgId);
1760 }
1761 
1762 bool
1763 ComputeUnit::isSimdDone(uint32_t simdId) const
1764 {
1765  assert(simdId < numSIMDs);
1766 
1767  for (int i=0; i < numGlbMemUnits; ++i) {
1768  if (!vrfToGlobalMemPipeBus[i].rdy())
1769  return false;
1770  }
1771  for (int i=0; i < numLocMemUnits; ++i) {
1772  if (!vrfToLocalMemPipeBus[i].rdy())
1773  return false;
1774  }
1775  if (!aluPipe[simdId].rdy()) {
1776  return false;
1777  }
1778 
1779  for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){
1780  if (wfList[simdId][i_wf]->status != Wavefront::S_STOPPED) {
1781  return false;
1782  }
1783  }
1784 
1785  return true;
1786 }
1787 
1793 bool
1794 ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
1795 {
1796  // this is just a request to carry the GPUDynInstPtr
1797  // back and forth
1798  RequestPtr newRequest = std::make_shared<Request>();
1799  newRequest->setPaddr(0x0);
1800 
1801  // ReadReq is not evaluted by the LDS but the Packet ctor requires this
1802  PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq);
1803 
1804  // This is the SenderState needed upon return
1805  newPacket->senderState = new LDSPort::SenderState(gpuDynInst);
1806 
1807  return ldsPort->sendTimingReq(newPacket);
1808 }
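// Hypothetical usage sketch for sendToLds() (the caller name is assumed, not
// taken from this file): a local-memory pipeline stage would issue
//     if (!computeUnit->sendToLds(gpuDynInst)) { /* NACK'd, retry later */ }
// and then receive the instruction back via LDSPort::recvTimingResp() once
// the LDS has processed it.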
1809 
1813 bool
1814 ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
1815 {
1816  const ComputeUnit::LDSPort::SenderState *senderState =
1817  dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState);
1818 
1819  fatal_if(!senderState, "did not get the right sort of sender state");
1820 
1821  GPUDynInstPtr gpuDynInst = senderState->getMemInst();
1822 
1823  delete packet->senderState;
1824  delete packet;
1825 
1826  computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
1827  return true;
1828 }
1829 
1835 bool
1836 ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
1837 {
1838  ComputeUnit::LDSPort::SenderState *sender_state =
1839  dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
1840  fatal_if(!sender_state, "packet without a valid sender state");
1841 
1842  GPUDynInstPtr gpuDynInst M5_VAR_USED = sender_state->getMemInst();
1843 
1844  if (isStalled()) {
1845  fatal_if(retries.empty(), "must have retries waiting to be stalled");
1846 
1847  retries.push(pkt);
1848 
1849  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
1850  computeUnit->cu_id, gpuDynInst->simdId,
1851  gpuDynInst->wfSlotId);
1852  return false;
1853  } else if (!MasterPort::sendTimingReq(pkt)) {
1854  // need to stall the LDS port until a recvReqRetry() is received
1855  // this indicates that there is more space
1856  stallPort();
1857  retries.push(pkt);
1858 
1859  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
1860  computeUnit->cu_id, gpuDynInst->simdId,
1861  gpuDynInst->wfSlotId, pkt->req->getPaddr());
1862  return false;
1863  } else {
1864  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
1865  computeUnit->cu_id, gpuDynInst->simdId,
1866  gpuDynInst->wfSlotId, pkt->req->getPaddr());
1867  return true;
1868  }
1869 }
1870 
1877 void
1878 ComputeUnit::LDSPort::recvReqRetry()
1879 {
1880  auto queueSize = retries.size();
1881 
1882  DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
1883  computeUnit->cu_id, queueSize);
1884 
1885  fatal_if(queueSize < 1,
1886  "why was there a recvReqRetry() with no pending reqs?");
1887  fatal_if(!isStalled(),
1888  "recvReqRetry() happened when the port was not stalled");
1889 
1890  unstallPort();
1891 
1892  while (!retries.empty()) {
1893  PacketPtr packet = retries.front();
1894 
1895  DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);
1896 
1897  if (!MasterPort::sendTimingReq(packet)) {
1898  // Stall port
1899  stallPort();
1900  DPRINTF(GPUPort, ": LDS send failed again\n");
1901  break;
1902  } else {
1903  DPRINTF(GPUPort, ": LDS send successful\n");
1904  retries.pop();
1905  }
1906  }
1907 }

Generated on Thu May 28 2020 16:21:33 for gem5 by doxygen 1.8.13