gem5  v20.0.0.2
compute_unit.cc
1 /*
2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * For use for simulation and test purposes only
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright notice,
11  * this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  * this list of conditions and the following disclaimer in the documentation
15  * and/or other materials provided with the distribution.
16  *
17  * 3. Neither the name of the copyright holder nor the names of its
18  * contributors may be used to endorse or promote products derived from this
19  * software without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "gpu-compute/compute_unit.hh"
35 
36 #include <limits>
37 
38 #include "base/output.hh"
39 #include "debug/GPUDisp.hh"
40 #include "debug/GPUExec.hh"
41 #include "debug/GPUFetch.hh"
42 #include "debug/GPUMem.hh"
43 #include "debug/GPUPort.hh"
44 #include "debug/GPUPrefetch.hh"
45 #include "debug/GPUSync.hh"
46 #include "debug/GPUTLB.hh"
47 #include "gpu-compute/dispatcher.hh"
48 #include "gpu-compute/gpu_dyn_inst.hh"
49 #include "gpu-compute/gpu_static_inst.hh"
50 #include "gpu-compute/ndrange.hh"
51 #include "gpu-compute/shader.hh"
52 #include "gpu-compute/simple_pool_manager.hh"
53 #include "gpu-compute/vector_register_file.hh"
54 #include "gpu-compute/wavefront.hh"
55 #include "mem/page_table.hh"
56 #include "sim/process.hh"
57 
58 ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), fetchStage(p),
59  scoreboardCheckStage(p), scheduleStage(p), execStage(p),
60  globalMemoryPipe(p), localMemoryPipe(p), rrNextMemID(0), rrNextALUWp(0),
61  cu_id(p->cu_id), vrf(p->vector_register_file), numSIMDs(p->num_SIMDs),
62  spBypassPipeLength(p->spbypass_pipe_length),
63  dpBypassPipeLength(p->dpbypass_pipe_length),
64  issuePeriod(p->issue_period),
65  numGlbMemUnits(p->num_global_mem_pipes),
66  numLocMemUnits(p->num_shared_mem_pipes),
67  perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth),
68  prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type),
69  xact_cas_mode(p->xactCasMode), debugSegFault(p->debugSegFault),
70  functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier),
71  countPages(p->countPages), barrier_id(0),
72  vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
73  coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
74  req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
75  resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
76  _masterId(p->system->getMasterId(this, "ComputeUnit")),
77  lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
78  _cacheLineSize(p->system->cacheLineSize()), globalSeqNum(0),
79  wavefrontSize(p->wfSize), kernelLaunchInst(new KernelLaunchStaticInst())
80 {
90  fatal_if(p->wfSize > std::numeric_limits<unsigned long long>::digits ||
91  p->wfSize <= 0,
92  "WF size is larger than the host can support");
93  fatal_if(!isPowerOf2(wfSize()),
94  "Wavefront size should be a power of 2");
95  // calculate how many cycles a vector load or store will need to transfer
96  // its data over the corresponding buses
97  numCyclesPerStoreTransfer =
98  (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
99  (double)vrfToCoalescerBusWidth);
100 
101  numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
102  / coalescerToVrfBusWidth;
103 
104  lastVaddrWF.resize(numSIMDs);
105  wfList.resize(numSIMDs);
106 
107  for (int j = 0; j < numSIMDs; ++j) {
108  lastVaddrWF[j].resize(p->n_wf);
109 
110  for (int i = 0; i < p->n_wf; ++i) {
111  lastVaddrWF[j][i].resize(wfSize());
112 
113  wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
114  wfList[j][i]->setParent(this);
115 
116  for (int k = 0; k < wfSize(); ++k) {
117  lastVaddrWF[j][i][k] = 0;
118  }
119  }
120  }
121 
122  lastVaddrSimd.resize(numSIMDs);
123 
124  for (int i = 0; i < numSIMDs; ++i) {
125  lastVaddrSimd[i].resize(wfSize(), 0);
126  }
127 
128  lastVaddrCU.resize(wfSize());
129 
130  lds.setParent(this);
131 
132  if (p->execPolicy == "OLDEST-FIRST") {
133  exec_policy = EXEC_POLICY::OLDEST;
134  } else if (p->execPolicy == "ROUND-ROBIN") {
135  exec_policy = EXEC_POLICY::RR;
136  } else {
137  fatal("Invalid WF execution policy (CU)\n");
138  }
139 
140  memPort.resize(wfSize());
141 
142  // Setup tokens for slave ports. The number of tokens in memSlaveTokens
143  // is the total token count for the entire vector port (i.e., this CU).
144  memPortTokens = new TokenManager(p->max_cu_tokens);
145 
146  // resize the tlbPort vectorArray
147  int tlbPort_width = perLaneTLB ? wfSize() : 1;
148  tlbPort.resize(tlbPort_width);
149 
150  cuExitCallback = new CUExitCallback(this);
151  registerExitCallback(cuExitCallback);
152 
153  xactCasLoadMap.clear();
154  lastExecCycle.resize(numSIMDs, 0);
155 
156  for (int i = 0; i < vrf.size(); ++i) {
157  vrf[i]->setParent(this);
158  }
159 
160  numVecRegsPerSimd = vrf[0]->numRegs();
161 }
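/*
 * [Editor's note: illustrative arithmetic, not part of the original file.]
 * The two transfer-cycle counts computed above follow directly from the
 * wavefront size and the configured bus widths. Assuming the common case of
 * wfSize() == 64 lanes, 4-byte (uint32_t) operands, and 32-byte buses:
 *
 *     bytes moved per vector access = 64 * 4         = 256
 *     numCyclesPerStoreTransfer     = ceil(256 / 32) = 8 cycles
 *     numCyclesPerLoadTransfer      = 256 / 32       = 8 cycles
 *
 * i.e. each vector load/store occupies its VRF<->coalescer bus for 8 cycles
 * before the next memory instruction can begin its transfer.
 */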
162 
163 ComputeUnit::~ComputeUnit()
164 {
165  // Delete wavefront slots
166  for (int j = 0; j < numSIMDs; ++j) {
167  for (int i = 0; i < shader->n_wf; ++i) {
168  delete wfList[j][i];
169  }
170  lastVaddrSimd[j].clear();
171  }
172  lastVaddrCU.clear();
173  readyList.clear();
174  waveStatusList.clear();
175  dispatchList.clear();
176  vectorAluInstAvail.clear();
177  delete cuExitCallback;
178  delete ldsPort;
179 }
180 
181 void
182 ComputeUnit::fillKernelState(Wavefront *w, NDRange *ndr)
183 {
184  w->resizeRegFiles(ndr->q.cRegCount, ndr->q.sRegCount, ndr->q.dRegCount);
185 
186  w->workGroupSz[0] = ndr->q.wgSize[0];
187  w->workGroupSz[1] = ndr->q.wgSize[1];
188  w->workGroupSz[2] = ndr->q.wgSize[2];
189  w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
190  w->gridSz[0] = ndr->q.gdSize[0];
191  w->gridSz[1] = ndr->q.gdSize[1];
192  w->gridSz[2] = ndr->q.gdSize[2];
193  w->kernelArgs = ndr->q.args;
194  w->privSizePerItem = ndr->q.privMemPerItem;
195  w->spillSizePerItem = ndr->q.spillMemPerItem;
196  w->roBase = ndr->q.roMemStart;
197  w->roSize = ndr->q.roMemTotal;
198  w->computeActualWgSz(ndr);
199 }
200 
201 void
202 ComputeUnit::updateEvents() {
203 
204  if (!timestampVec.empty()) {
205  uint32_t vecSize = timestampVec.size();
206  uint32_t i = 0;
207  while (i < vecSize) {
208  if (timestampVec[i] <= shader->tick_cnt) {
209  std::pair<uint32_t, uint32_t> regInfo = regIdxVec[i];
210  vrf[regInfo.first]->markReg(regInfo.second, sizeof(uint32_t),
211  statusVec[i]);
212  timestampVec.erase(timestampVec.begin() + i);
213  regIdxVec.erase(regIdxVec.begin() + i);
214  statusVec.erase(statusVec.begin() + i);
215  --vecSize;
216  --i;
217  }
218  ++i;
219  }
220  }
221 
222  for (int i = 0; i< numSIMDs; ++i) {
223  vrf[i]->updateEvents();
224  }
225 }
226 
227 
228 void
229 ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
230  NDRange *ndr)
231 {
232  static int _n_wave = 0;
233 
234  VectorMask init_mask;
235  init_mask.reset();
236 
237  for (int k = 0; k < wfSize(); ++k) {
238  if (k + waveId * wfSize() < w->actualWgSzTotal)
239  init_mask[k] = 1;
240  }
241 
242  w->kernId = ndr->dispatchId;
243  w->wfId = waveId;
244  w->initMask = init_mask.to_ullong();
245 
246  for (int k = 0; k < wfSize(); ++k) {
247  w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
248  w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
249  w->actualWgSz[1];
250  w->workItemId[2][k] = (k + waveId * wfSize()) /
251  (w->actualWgSz[0] * w->actualWgSz[1]);
252 
253  w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
254  w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
255  w->workItemId[0][k];
256  }
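         /*
          * [Editor's note: worked example, not part of the original file.]
          * The loop above linearizes the lane into the work-group and then
          * decomposes it into x/y/z work-item IDs. E.g. with wfSize() == 64,
          * actualWgSz == {16, 4, 2}, waveId == 1 and lane k == 6:
          *
          *     idx            = 6 + 1 * 64         = 70
          *     workItemId[0]  = 70 % 16            = 6
          *     workItemId[1]  = (70 / 16) % 4      = 0
          *     workItemId[2]  = 70 / (16 * 4)      = 1
          *     workItemFlatId = 1*16*4 + 0*16 + 6  = 70  (== idx, as expected)
          */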
257 
258  w->barrierSlots = divCeil(w->actualWgSzTotal, wfSize());
259 
260  w->barCnt.resize(wfSize(), 0);
261 
262  w->maxBarCnt = 0;
263  w->oldBarrierCnt = 0;
264  w->barrierCnt = 0;
265 
266  w->privBase = ndr->q.privMemStart;
267  ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
268 
269  w->spillBase = ndr->q.spillMemStart;
270  ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
271 
272  w->pushToReconvergenceStack(0, UINT32_MAX, init_mask.to_ulong());
273 
274  // WG state
275  w->wgId = ndr->globalWgId;
276  w->dispatchId = ndr->dispatchId;
277  w->workGroupId[0] = w->wgId % ndr->numWg[0];
278  w->workGroupId[1] = (w->wgId / ndr->numWg[0]) % ndr->numWg[1];
279  w->workGroupId[2] = w->wgId / (ndr->numWg[0] * ndr->numWg[1]);
280 
281  w->barrierId = barrier_id;
282  w->stalledAtBarrier = false;
283 
284  // set the wavefront context to have a pointer to this section of the LDS
285  w->ldsChunk = ldsChunk;
286 
287  int32_t refCount M5_VAR_USED =
288  lds.increaseRefCounter(w->dispatchId, w->wgId);
289  DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
290  cu_id, w->wgId, refCount);
291 
292  w->instructionBuffer.clear();
293 
294  if (w->pendingFetch)
295  w->dropFetch = true;
296 
297  // is this the last wavefront in the workgroup
298  // if set the spillWidth to be the remaining work-items
299  // so that the vector access is correct
300  if ((waveId + 1) * wfSize() >= w->actualWgSzTotal) {
301  w->spillWidth = w->actualWgSzTotal - (waveId * wfSize());
302  } else {
303  w->spillWidth = wfSize();
304  }
305 
306  DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
307  "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
308 
309  w->start(++_n_wave, ndr->q.code_ptr);
310 }
311 
312 void
313 ComputeUnit::StartWorkgroup(NDRange *ndr)
314 {
315  // reserve the LDS capacity allocated to the work group
316  // disambiguated by the dispatch ID and workgroup ID, which should be
317  // globally unique
318  LdsChunk *ldsChunk = lds.reserveSpace(ndr->dispatchId, ndr->globalWgId,
319  ndr->q.ldsSize);
320 
321  // Send L1 cache acquire
322  // isKernel + isAcquire = Kernel Begin
323  if (shader->impl_kern_boundary_sync) {
324  GPUDynInstPtr gpuDynInst =
325  std::make_shared<GPUDynInst>(this, nullptr, kernelLaunchInst,
326  getAndIncSeqNum());
327 
328  gpuDynInst->useContinuation = false;
329  injectGlobalMemFence(gpuDynInst, true);
330  }
331 
332  // calculate the number of 32-bit vector registers required by wavefront
333  int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
334  int wave_id = 0;
335 
336  // Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time
337  for (int m = 0; m < shader->n_wf * numSIMDs; ++m) {
338  Wavefront *w = wfList[m % numSIMDs][m / numSIMDs];
339  // Check if this wavefront slot is available:
340  // It must be stopped and not waiting
341  // for a release to complete S_RETURNING
342  if (w->status == Wavefront::S_STOPPED) {
343  fillKernelState(w, ndr);
344  // if we have scheduled all work items then stop
345  // scheduling wavefronts
346  if (wave_id * wfSize() >= w->actualWgSzTotal)
347  break;
348 
349  // reserve vector registers for the scheduled wavefront
350  assert(vectorRegsReserved[m % numSIMDs] <= numVecRegsPerSimd);
351  uint32_t normSize = 0;
352 
353  w->startVgprIndex = vrf[m % numSIMDs]->manager->
354  allocateRegion(vregDemand, &normSize);
355 
356  w->reservedVectorRegs = normSize;
357  vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs;
358 
359  startWavefront(w, wave_id, ldsChunk, ndr);
360  ++wave_id;
361  }
362  }
363  ++barrier_id;
364 }
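/*
 * [Editor's note: illustrative sketch, not part of the original file.]
 * The dispatch loop above interleaves SIMD units, so consecutive wavefronts
 * of a work-group land on different SIMDs: iteration m selects
 * wfList[m % numSIMDs][m / numSIMDs]. With numSIMDs == 4, the first eight
 * candidate slots are visited as
 *
 *     m:    0      1      2      3      4      5      6      7
 *     slot: [0][0] [1][0] [2][0] [3][0] [0][1] [1][1] [2][1] [3][1]
 *
 * which spreads a work-group's wavefronts one-per-SIMD before reusing a SIMD.
 */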
365 
366 int
367 ComputeUnit::ReadyWorkgroup(NDRange *ndr)
368 {
369  // Get true size of workgroup (after clamping to grid size)
370  int trueWgSize[3];
371  int trueWgSizeTotal = 1;
372 
373  for (int d = 0; d < 3; ++d) {
374  trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] -
375  ndr->wgId[d] * ndr->q.wgSize[d]);
376 
377  trueWgSizeTotal *= trueWgSize[d];
378  DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
379  }
380 
381  DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
382 
383  // calculate the number of 32-bit vector registers required by each
384  // work item of the work group
385  int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
386  bool vregAvail = true;
387  int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
388  int freeWfSlots = 0;
389  // check if the total number of VGPRs required by all WFs of the WG
390  // fit in the VRFs of all SIMD units
391  assert((numWfs * vregDemandPerWI) <= (numSIMDs * numVecRegsPerSimd));
392  int numMappedWfs = 0;
393  std::vector<int> numWfsPerSimd;
394  numWfsPerSimd.resize(numSIMDs, 0);
395  // find how many free WF slots we have across all SIMDs
396  for (int j = 0; j < shader->n_wf; ++j) {
397  for (int i = 0; i < numSIMDs; ++i) {
398  if (wfList[i][j]->status == Wavefront::S_STOPPED) {
399  // count the number of free WF slots
400  ++freeWfSlots;
401  if (numMappedWfs < numWfs) {
402  // count the WFs to be assigned per SIMD
403  numWfsPerSimd[i]++;
404  }
405  numMappedWfs++;
406  }
407  }
408  }
409 
410  // if there are enough free WF slots then find if there are enough
411  // free VGPRs per SIMD based on the WF->SIMD mapping
412  if (freeWfSlots >= numWfs) {
413  for (int j = 0; j < numSIMDs; ++j) {
414  // find if there are enough free VGPR regions in the SIMD's VRF
415  // to accommodate the WFs of the new WG that would be mapped to
416  // this SIMD unit
417  vregAvail = vrf[j]->manager->canAllocate(numWfsPerSimd[j],
418  vregDemandPerWI);
419 
420  // stop searching if there is at least one SIMD
421  // whose VRF does not have enough free VGPR pools.
422  // This is because a WG is scheduled only if ALL
423  // of its WFs can be scheduled
424  if (!vregAvail)
425  break;
426  }
427  }
428 
429  DPRINTF(GPUDisp, "Free WF slots = %d, VGPR Availability = %d\n",
430  freeWfSlots, vregAvail);
431 
432  if (!vregAvail) {
433  ++numTimesWgBlockedDueVgprAlloc;
434  }
435 
436  // Return true if enough WF slots to submit workgroup and if there are
437  // enough VGPRs to schedule all WFs to their SIMD units
438  if (!lds.canReserve(ndr->q.ldsSize)) {
439  wgBlockedDueLdsAllocation++;
440  }
441 
442  // Return true if (a) there are enough free WF slots to submit
443  // workgroup and (b) if there are enough VGPRs to schedule all WFs to their
444  // SIMD units and (c) if there is enough space in LDS
445  return freeWfSlots >= numWfs && vregAvail && lds.canReserve(ndr->q.ldsSize);
446 }
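/*
 * [Editor's note: illustrative sketch, not part of the original file.]
 * ReadyWorkgroup() effectively evaluates three independent admission
 * conditions; a work-group is dispatched only when all of them hold:
 *
 *     bool enough_slots = freeWfSlots >= numWfs;           // stopped WF slots
 *     bool enough_vgprs = vregAvail;                       // per-SIMD VRF space
 *     bool enough_lds   = lds.canReserve(ndr->q.ldsSize);  // LDS capacity
 *     return enough_slots && enough_vgprs && enough_lds;
 *
 * The two stat counters bumped above record whether the VGPR or the LDS
 * condition caused a rejection.
 */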
447 
448 int
449 ComputeUnit::AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots)
450 {
451  DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id);
452  int ccnt = 0;
453 
454  for (int i_simd = 0; i_simd < numSIMDs; ++i_simd) {
455  for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
456  Wavefront *w = wfList[i_simd][i_wf];
457 
458  if (w->status == Wavefront::S_RUNNING) {
459  DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf);
460 
461  DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n",
462  w->barrierId, _barrier_id);
463 
464  DPRINTF(GPUSync, "wf->barrier_cnt %d, bcnt = %d\n",
465  w->barrierCnt, bcnt);
466  }
467 
468  if (w->status == Wavefront::S_RUNNING &&
469  w->barrierId == _barrier_id && w->barrierCnt == bcnt &&
470  !w->outstandingReqs) {
471  ++ccnt;
472 
473  DPRINTF(GPUSync, "WF[%d][%d] at barrier, increment ccnt to "
474  "%d\n", i_simd, i_wf, ccnt);
475  }
476  }
477  }
478 
479  DPRINTF(GPUSync, "CU%d: returning allAtBarrier ccnt = %d, bslots = %d\n",
480  cu_id, ccnt, bslots);
481 
482  return ccnt == bslots;
483 }
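/*
 * [Editor's note: illustrative sketch, not part of the original file.]
 * A wavefront is counted as "at the barrier" only if it is running, belongs
 * to the barrier group being queried, has reached the same barrier instance
 * (barrierCnt == bcnt), and has no outstanding memory requests. The barrier
 * releases when every wavefront of the work-group has arrived, i.e. when the
 * count equals the number of barrier slots set in startWavefront():
 *
 *     bslots == divCeil(actualWgSzTotal, wfSize())
 *     return ccnt == bslots;
 */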
484 
485 // Check if the current wavefront is blocked on additional resources.
486 bool
487 ComputeUnit::cedeSIMD(int simdId, int wfSlotId)
488 {
489  bool cede = false;
490 
491  // If --xact-cas-mode option is enabled in run.py, then xact_cas_ld
492  // magic instructions will impact the scheduling of wavefronts
493  if (xact_cas_mode) {
494  /*
495  * When a wavefront calls xact_cas_ld, it adds itself to a per address
496  * queue. All per address queues are managed by the xactCasLoadMap.
497  *
498  * A wavefront is not blocked if: it is not in ANY per address queue or
499  * if it is at the head of a per address queue.
500  */
501  for (auto itMap : xactCasLoadMap) {
502  std::list<waveIdentifier> curWaveIDQueue = itMap.second.waveIDQueue;
503 
504  if (!curWaveIDQueue.empty()) {
505  for (auto it : curWaveIDQueue) {
506  waveIdentifier cur_wave = it;
507 
508  if (cur_wave.simdId == simdId &&
509  cur_wave.wfSlotId == wfSlotId) {
510  // 2 possibilities
511  // 1: this WF has a green light
512  // 2: another WF has a green light
513  waveIdentifier owner_wave = curWaveIDQueue.front();
514 
515  if (owner_wave.simdId != cur_wave.simdId ||
516  owner_wave.wfSlotId != cur_wave.wfSlotId) {
517  // possibility 2
518  cede = true;
519  break;
520  } else {
521  // possibility 1
522  break;
523  }
524  }
525  }
526  }
527  }
528  }
529 
530  return cede;
531 }
532 
533 // Execute one clock worth of work on the ComputeUnit.
534 void
535 ComputeUnit::exec()
536 {
537  updateEvents();
538  // Execute pipeline stages in reverse order to simulate
539  // the pipeline latency
540  globalMemoryPipe.exec();
541  localMemoryPipe.exec();
542  execStage.exec();
543  scheduleStage.exec();
544  scoreboardCheckStage.exec();
545  fetchStage.exec();
546 
547  totalCycles++;
548 }
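/*
 * [Editor's note: illustrative remark, not part of the original file.]
 * The stages are invoked back-to-front (memory pipes, execute, schedule,
 * scoreboard check, fetch) so that work produced by an earlier pipeline
 * stage in this cycle is not consumed by a later stage until the next call
 * to exec(). Calling the stages in program order instead would let an
 * instruction fetched this cycle flow through the whole pipeline in a single
 * tick, hiding the modeled pipeline latency.
 */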
549 
550 void
551 ComputeUnit::init()
552 {
553  // Initialize CU Bus models
554  glbMemToVrfBus.init(&shader->tick_cnt, shader->ticks(1));
555  locMemToVrfBus.init(&shader->tick_cnt, shader->ticks(1));
556  nextGlbMemBus = 0;
557  nextLocMemBus = 0;
558  fatal_if(numGlbMemUnits > 1,
559  "No support for multiple Global Memory Pipelines exists!!!");
560  vrfToGlobalMemPipeBus.resize(numGlbMemUnits);
561  for (int j = 0; j < numGlbMemUnits; ++j) {
562  vrfToGlobalMemPipeBus[j] = WaitClass();
563  vrfToGlobalMemPipeBus[j].init(&shader->tick_cnt, shader->ticks(1));
564  }
565 
566  fatal_if(numLocMemUnits > 1,
567  "No support for multiple Local Memory Pipelines exists!!!");
568  vrfToLocalMemPipeBus.resize(numLocMemUnits);
569  for (int j = 0; j < numLocMemUnits; ++j) {
570  vrfToLocalMemPipeBus[j] = WaitClass();
571  vrfToLocalMemPipeBus[j].init(&shader->tick_cnt, shader->ticks(1));
572  }
573  vectorRegsReserved.resize(numSIMDs, 0);
574  aluPipe.resize(numSIMDs);
575  wfWait.resize(numSIMDs + numLocMemUnits + numGlbMemUnits);
576 
577  for (int i = 0; i < numSIMDs + numLocMemUnits + numGlbMemUnits; ++i) {
578  wfWait[i] = WaitClass();
579  wfWait[i].init(&shader->tick_cnt, shader->ticks(1));
580  }
581 
582  for (int i = 0; i < numSIMDs; ++i) {
583  aluPipe[i] = WaitClass();
584  aluPipe[i].init(&shader->tick_cnt, shader->ticks(1));
585  }
586 
587  // Setup space for call args
588  for (int j = 0; j < numSIMDs; ++j) {
589  for (int i = 0; i < shader->n_wf; ++i) {
590  wfList[j][i]->initCallArgMem(shader->funcargs_size, wavefrontSize);
591  }
592  }
593 
594  // Initializing pipeline resources
595  readyList.resize(numSIMDs + numGlbMemUnits + numLocMemUnits);
596  waveStatusList.resize(numSIMDs);
597 
598  for (int j = 0; j < numSIMDs; ++j) {
599  for (int i = 0; i < shader->n_wf; ++i) {
600  waveStatusList[j].push_back(
601  std::make_pair(wfList[j][i], BLOCKED));
602  }
603  }
604 
605  for (int j = 0; j < (numSIMDs + numGlbMemUnits + numLocMemUnits); ++j) {
606  dispatchList.push_back(std::make_pair((Wavefront*)nullptr, EMPTY));
607  }
608 
609  fetchStage.init(this);
610  scoreboardCheckStage.init(this);
611  scheduleStage.init(this);
612  execStage.init(this);
613  globalMemoryPipe.init(this);
614  localMemoryPipe.init(this);
615  // initialize state for statistics calculation
616  vectorAluInstAvail.resize(numSIMDs, false);
617  shrMemInstAvail = 0;
618  glbMemInstAvail = 0;
619 
620  gmTokenPort.setTokenManager(memPortTokens);
621 }
622 
623 bool
624 ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
625 {
626  // Ruby has completed the memory op. Schedule the mem_resp_event at the
627  // appropriate cycle to process the timing memory response
628  // This delay represents the pipeline delay
629  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
630  int index = sender_state->port_index;
631  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
632 
633  // Is the packet returned a Kernel End or Barrier
634  if (pkt->req->isKernel() && pkt->req->isRelease()) {
635  Wavefront *w =
636  computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
637 
638  // Check if we are waiting on Kernel End Release
639  if (w->status == Wavefront::S_RETURNING) {
640  DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG id completed %d\n",
641  computeUnit->cu_id, w->simdId, w->wfSlotId,
642  w->wfDynId, w->kernId);
643 
644  computeUnit->shader->dispatcher->notifyWgCompl(w);
645  w->status = Wavefront::S_STOPPED;
646  } else {
647  w->outstandingReqs--;
648  }
649 
650  DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrier_cnt = %d\n",
651  computeUnit->cu_id, gpuDynInst->simdId,
652  gpuDynInst->wfSlotId, w->barrierCnt);
653 
654  if (gpuDynInst->useContinuation) {
655  assert(!gpuDynInst->isNoScope());
656  gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
657  gpuDynInst);
658  }
659 
660  delete pkt->senderState;
661  delete pkt;
662  return true;
663  } else if (pkt->req->isKernel() && pkt->req->isAcquire()) {
664  if (gpuDynInst->useContinuation) {
665  assert(!gpuDynInst->isNoScope());
666  gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
667  gpuDynInst);
668  }
669 
670  delete pkt->senderState;
671  delete pkt;
672  return true;
673  }
674 
675  EventFunctionWrapper *mem_resp_event =
676  computeUnit->memPort[index]->createMemRespEvent(pkt);
677 
678  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x received!\n",
679  computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
680  index, pkt->req->getPaddr());
681 
682  computeUnit->schedule(mem_resp_event,
683  curTick() + computeUnit->resp_tick_latency);
684  return true;
685 }
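/*
 * [Editor's note: illustrative sketch, not part of the original file.]
 * Every packet issued by the CU carries a DataPort::SenderState so that the
 * response handler can recover which dynamic instruction and which lane
 * (port index) the response belongs to, roughly:
 *
 *     pkt->senderState = new DataPort::SenderState(gpuDynInst, index, nullptr);
 *     ...
 *     // on the way back:
 *     SenderState *ss = safe_cast<SenderState*>(pkt->senderState);
 *     GPUDynInstPtr inst = ss->_gpuDynInst;
 *     int lane = ss->port_index;
 *
 * Kernel begin/end fences are filtered out here; ordinary loads and stores
 * are handed to a memory-response event scheduled resp_tick_latency ticks in
 * the future to model the return-path latency.
 */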
686 
687 void
688 ComputeUnit::DataPort::recvReqRetry()
689 {
690  int len = retries.size();
691 
692  assert(len > 0);
693 
694  for (int i = 0; i < len; ++i) {
695  PacketPtr pkt = retries.front().first;
696  GPUDynInstPtr gpuDynInst M5_VAR_USED = retries.front().second;
697  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
698  computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
699  pkt->req->getPaddr());
700 
704  if (!sendTimingReq(pkt)) {
705  DPRINTF(GPUMem, "failed again!\n");
706  break;
707  } else {
708  DPRINTF(GPUMem, "successful!\n");
709  retries.pop_front();
710  }
711  }
712 }
713 
714 bool
715 ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
716 {
717  computeUnit->fetchStage.processFetchReturn(pkt);
718 
719  return true;
720 }
721 
722 void
723 ComputeUnit::SQCPort::recvReqRetry()
724 {
725  int len = retries.size();
726 
727  assert(len > 0);
728 
729  for (int i = 0; i < len; ++i) {
730  PacketPtr pkt = retries.front().first;
731  Wavefront *wavefront M5_VAR_USED = retries.front().second;
732  DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
733  computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
734  pkt->req->getPaddr());
735  if (!sendTimingReq(pkt)) {
736  DPRINTF(GPUFetch, "failed again!\n");
737  break;
738  } else {
739  DPRINTF(GPUFetch, "successful!\n");
740  retries.pop_front();
741  }
742  }
743 }
744 
745 void
746 ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
747 {
748  // There must be a way around this check to do the globalMemStart...
749  Addr tmp_vaddr = pkt->req->getVaddr();
750 
751  updatePageDivergenceDist(tmp_vaddr);
752 
753  // set PC in request
754  pkt->req->setPC(gpuDynInst->wavefront()->pc());
755 
756  pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());
757 
758  // figure out the type of the request to set read/write
759  BaseTLB::Mode TLB_mode;
760  assert(pkt->isRead() || pkt->isWrite());
761 
762  // Check write before read for atomic operations
763  // since atomic operations should use BaseTLB::Write
764  if (pkt->isWrite()){
765  TLB_mode = BaseTLB::Write;
766  } else if (pkt->isRead()) {
767  TLB_mode = BaseTLB::Read;
768  } else {
769  fatal("pkt is not a read nor a write\n");
770  }
771 
772  tlbCycles -= curTick();
773  ++tlbRequests;
774 
775  int tlbPort_index = perLaneTLB ? index : 0;
776 
777  if (shader->timingSim) {
778  if (debugSegFault) {
779  Process *p = shader->gpuTc->getProcessPtr();
780  Addr vaddr = pkt->req->getVaddr();
781  unsigned size = pkt->getSize();
782 
783  if ((vaddr + size - 1) % 64 < vaddr % 64) {
784  panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
785  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
786  }
787 
788  Addr paddr;
789 
790  if (!p->pTable->translate(vaddr, paddr)) {
791  if (!p->fixupFault(vaddr)) {
792  panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
793  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
794  vaddr);
795  }
796  }
797  }
798 
799  // This is the SenderState needed upon return
800  pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);
801 
802  // This is the senderState needed by the TLB hierarchy to function
803  TheISA::GpuTLB::TranslationState *translation_state =
804  new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc, false,
805  pkt->senderState);
806 
807  pkt->senderState = translation_state;
808 
809  if (functionalTLB) {
810  tlbPort[tlbPort_index]->sendFunctional(pkt);
811 
812  // update the hitLevel distribution
813  int hit_level = translation_state->hitLevel;
814  assert(hit_level != -1);
815  hitsPerTLBLevel[hit_level]++;
816 
817  // New SenderState for the memory access
818  X86ISA::GpuTLB::TranslationState *sender_state =
819  safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
820 
821  delete sender_state->tlbEntry;
822  delete sender_state->saved;
823  delete sender_state;
824 
825  assert(pkt->req->hasPaddr());
826  assert(pkt->req->hasSize());
827 
828  uint8_t *tmpData = pkt->getPtr<uint8_t>();
829 
830  // this is necessary because the GPU TLB receives packets instead
831  // of requests. when the translation is complete, all relevent
832  // fields in the request will be populated, but not in the packet.
833  // here we create the new packet so we can set the size, addr,
834  // and proper flags.
835  PacketPtr oldPkt = pkt;
836  pkt = new Packet(oldPkt->req, oldPkt->cmd);
837  delete oldPkt;
838  pkt->dataStatic(tmpData);
839 
840 
841  // New SenderState for the memory access
842  pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst,
843  index, nullptr);
844 
845  gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
846  gpuDynInst->tlbHitLevel[index] = hit_level;
847 
848 
849  // translation is done. Schedule the mem_req_event at the
850  // appropriate cycle to send the timing memory request to ruby
851  EventFunctionWrapper *mem_req_event =
852  memPort[index]->createMemReqEvent(pkt);
853 
854  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
855  "scheduled\n", cu_id, gpuDynInst->simdId,
856  gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
857 
858  schedule(mem_req_event, curTick() + req_tick_latency);
859  } else if (tlbPort[tlbPort_index]->isStalled()) {
860  assert(tlbPort[tlbPort_index]->retries.size() > 0);
861 
862  DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
863  "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
864  tmp_vaddr);
865 
866  tlbPort[tlbPort_index]->retries.push_back(pkt);
867  } else if (!tlbPort[tlbPort_index]->sendTimingReq(pkt)) {
868  // Stall the data port;
869  // No more packet will be issued till
870  // ruby indicates resources are freed by
871  // a recvReqRetry() call back on this port.
872  tlbPort[tlbPort_index]->stallPort();
873 
874  DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
875  "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
876  tmp_vaddr);
877 
878  tlbPort[tlbPort_index]->retries.push_back(pkt);
879  } else {
880  DPRINTF(GPUTLB,
881  "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
882  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
883  }
884  } else {
885  if (pkt->cmd == MemCmd::MemFenceReq) {
886  gpuDynInst->statusBitVector = VectorMask(0);
887  } else {
888  gpuDynInst->statusBitVector &= (~(1ll << index));
889  }
890 
891  // New SenderState for the memory access
892  delete pkt->senderState;
893 
894  // Because it's atomic operation, only need TLB translation state
895  pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode,
896  shader->gpuTc);
897 
898  tlbPort[tlbPort_index]->sendFunctional(pkt);
899 
900  // the addr of the packet is not modified, so we need to create a new
901  // packet, or otherwise the memory access will have the old virtual
902  // address sent in the translation packet, instead of the physical
903  // address returned by the translation.
904  PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
905  new_pkt->dataStatic(pkt->getPtr<uint8_t>());
906 
907  // Translation is done. It is safe to send the packet to memory.
908  memPort[0]->sendFunctional(new_pkt);
909 
910  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
911  gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
912  new_pkt->req->getPaddr());
913 
914  // safe_cast the senderState
915  TheISA::GpuTLB::TranslationState *sender_state =
916  safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
917 
918  delete sender_state->tlbEntry;
919  delete new_pkt;
920  delete pkt->senderState;
921  delete pkt;
922  }
923 }
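/*
 * [Editor's note: illustrative summary, not part of the original file.]
 * sendRequest() takes one of three paths for address translation:
 *
 *   1. timing sim + functionalTLB: the TLB is probed with sendFunctional(),
 *      the packet is rebuilt around the translated request, and the memory
 *      request event is scheduled req_tick_latency ticks in the future;
 *   2. timing sim + timing TLB: the packet is sent to the per-lane (or
 *      shared) DTLBPort with sendTimingReq(); if that port is stalled or the
 *      send fails, the packet is queued on the port's retries list;
 *   3. functional (non-timing) sim: translation and the memory access are
 *      both performed immediately with sendFunctional().
 *
 * In the timing cases the eventual memory request is issued from a
 * createMemReqEvent() callback rather than directly from this function.
 */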
924 
925 void
926 ComputeUnit::sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
927 {
928  EventFunctionWrapper *mem_req_event =
929  memPort[index]->createMemReqEvent(pkt);
930 
931 
932  // New SenderState for the memory access
933  pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, index,
934  nullptr);
935 
936  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
937  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
938  pkt->req->getPaddr());
939 
940  schedule(mem_req_event, curTick() + req_tick_latency);
941 }
942 
943 void
944 ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch,
945  RequestPtr req)
946 {
947  assert(gpuDynInst->isGlobalSeg());
948 
949  if (!req) {
950  req = std::make_shared<Request>(
951  0, 0, 0, masterId(), 0, gpuDynInst->wfDynId);
952  }
953  req->setPaddr(0);
954  if (kernelLaunch) {
955  req->setFlags(Request::KERNEL);
956  }
957 
958  // for non-kernel MemFence operations, memorder flags are set depending
959  // on which type of request is currently being sent, so this
960  // should be set by the caller (e.g. if an inst has acq-rel
961  // semantics, it will send one acquire req an one release req)
962  gpuDynInst->setRequestFlags(req, kernelLaunch);
963 
964  // a mem fence must correspond to an acquire/release request
965  assert(req->isAcquire() || req->isRelease());
966 
967  // create packet
968  PacketPtr pkt = new Packet(req, MemCmd::MemFenceReq);
969 
970  // set packet's sender state
971  pkt->senderState =
972  new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr);
973 
974  // send the packet
975  sendSyncRequest(gpuDynInst, 0, pkt);
976 }
977 
978 void
979 ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
980 {
981  DataPort::SenderState *sender_state =
982  safe_cast<DataPort::SenderState*>(pkt->senderState);
983 
984  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
985  ComputeUnit *compute_unit = computeUnit;
986 
987  assert(gpuDynInst);
988 
989  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
990  compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
991  pkt->req->getPaddr(), index);
992 
993  Addr paddr = pkt->req->getPaddr();
994 
995  if (pkt->cmd != MemCmd::MemFenceResp) {
996  int index = gpuDynInst->memStatusVector[paddr].back();
997 
998  DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
999  pkt->req->getPaddr(), index);
1000 
1001  gpuDynInst->memStatusVector[paddr].pop_back();
1002  gpuDynInst->pAddr = pkt->req->getPaddr();
1003 
1004  if (pkt->isRead() || pkt->isWrite()) {
1005 
1006  if (gpuDynInst->n_reg <= MAX_REGS_FOR_NON_VEC_MEM_INST) {
1007  gpuDynInst->statusBitVector &= (~(1ULL << index));
1008  } else {
1009  assert(gpuDynInst->statusVector[index] > 0);
1010  gpuDynInst->statusVector[index]--;
1011 
1012  if (!gpuDynInst->statusVector[index])
1013  gpuDynInst->statusBitVector &= (~(1ULL << index));
1014  }
1015 
1016  DPRINTF(GPUMem, "bitvector is now %#x\n",
1017  gpuDynInst->statusBitVector);
1018 
1019  if (gpuDynInst->statusBitVector == VectorMask(0)) {
1020  auto iter = gpuDynInst->memStatusVector.begin();
1021  auto end = gpuDynInst->memStatusVector.end();
1022 
1023  while (iter != end) {
1024  assert(iter->second.empty());
1025  ++iter;
1026  }
1027 
1028  gpuDynInst->memStatusVector.clear();
1029 
1030  if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
1031  gpuDynInst->statusVector.clear();
1032 
1033  compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
1034 
1035  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
1036  compute_unit->cu_id, gpuDynInst->simdId,
1037  gpuDynInst->wfSlotId);
1038 
1039  // after clearing the status vectors,
1040  // see if there is a continuation to perform
1041  // the continuation may generate more work for
1042  // this memory request
1043  if (gpuDynInst->useContinuation) {
1044  assert(!gpuDynInst->isNoScope());
1045  gpuDynInst->execContinuation(
1046  gpuDynInst->staticInstruction(),
1047  gpuDynInst);
1048  }
1049  }
1050  }
1051  } else {
1052  gpuDynInst->statusBitVector = VectorMask(0);
1053 
1054  if (gpuDynInst->useContinuation) {
1055  assert(!gpuDynInst->isNoScope());
1056  gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
1057  gpuDynInst);
1058  }
1059  }
1060 
1061  delete pkt->senderState;
1062  delete pkt;
1063 }
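/*
 * [Editor's note: illustrative sketch, not part of the original file.]
 * Completion tracking for a vector memory instruction: each outstanding lane
 * owns one bit of statusBitVector (or one counter slot in statusVector when
 * the access needs more registers than MAX_REGS_FOR_NON_VEC_MEM_INST). As
 * responses arrive the corresponding bit is cleared, e.g. for a 4-lane
 * instruction:
 *
 *     statusBitVector: 0b1111 -> 0b1101 -> 0b0101 -> 0b0001 -> 0b0000
 *
 * Only when the vector reaches zero is the instruction handed back to
 * globalMemoryPipe.handleResponse() and any pending continuation executed.
 */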
1064 
1065 ComputeUnit*
1066 ComputeUnitParams::create()
1067 {
1068  return new ComputeUnit(this);
1069 }
1070 
1071 bool
1072 ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
1073 {
1074  Addr line = pkt->req->getPaddr();
1075 
1076  DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
1077  pkt->req->getVaddr(), line);
1078 
1079  assert(pkt->senderState);
1080  computeUnit->tlbCycles += curTick();
1081 
1082  // pop off the TLB translation state
1083  TheISA::GpuTLB::TranslationState *translation_state =
1084  safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
1085 
1086  // no PageFaults are permitted for data accesses
1087  if (!translation_state->tlbEntry) {
1088  DTLBPort::SenderState *sender_state =
1089  safe_cast<DTLBPort::SenderState*>(translation_state->saved);
1090 
1091  Wavefront *w M5_VAR_USED =
1092  computeUnit->wfList[sender_state->_gpuDynInst->simdId]
1093  [sender_state->_gpuDynInst->wfSlotId];
1094 
1095  DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
1096  pkt->req->getVaddr());
1097  }
1098 
1099  // update the hitLevel distribution
1100  int hit_level = translation_state->hitLevel;
1101  computeUnit->hitsPerTLBLevel[hit_level]++;
1102 
1103  delete translation_state->tlbEntry;
1104  assert(!translation_state->ports.size());
1105  pkt->senderState = translation_state->saved;
1106 
1107  // for prefetch pkt
1108  BaseTLB::Mode TLB_mode = translation_state->tlbMode;
1109 
1110  delete translation_state;
1111 
1112  // use the original sender state to know how to close this transaction
1113  DTLBPort::SenderState *sender_state =
1114  safe_cast<DTLBPort::SenderState*>(pkt->senderState);
1115 
1116  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1117  int mp_index = sender_state->portIndex;
1118  Addr vaddr = pkt->req->getVaddr();
1119  gpuDynInst->memStatusVector[line].push_back(mp_index);
1120  gpuDynInst->tlbHitLevel[mp_index] = hit_level;
1121 
1122  MemCmd requestCmd;
1123 
1124  if (pkt->cmd == MemCmd::ReadResp) {
1125  requestCmd = MemCmd::ReadReq;
1126  } else if (pkt->cmd == MemCmd::WriteResp) {
1127  requestCmd = MemCmd::WriteReq;
1128  } else if (pkt->cmd == MemCmd::SwapResp) {
1129  requestCmd = MemCmd::SwapReq;
1130  } else {
1131  panic("unsupported response to request conversion %s\n",
1132  pkt->cmd.toString());
1133  }
1134 
1135  if (computeUnit->prefetchDepth) {
1136  int simdId = gpuDynInst->simdId;
1137  int wfSlotId = gpuDynInst->wfSlotId;
1138  Addr last = 0;
1139 
1140  switch(computeUnit->prefetchType) {
1141  case Enums::PF_CU:
1142  last = computeUnit->lastVaddrCU[mp_index];
1143  break;
1144  case Enums::PF_PHASE:
1145  last = computeUnit->lastVaddrSimd[simdId][mp_index];
1146  break;
1147  case Enums::PF_WF:
1148  last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
1149  default:
1150  break;
1151  }
1152 
1153  DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
1154  computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
1155 
1156  int stride = last ? (roundDown(vaddr, TheISA::PageBytes) -
1157  roundDown(last, TheISA::PageBytes)) >> TheISA::PageShift
1158  : 0;
1159 
1160  DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
1161 
1162  computeUnit->lastVaddrCU[mp_index] = vaddr;
1163  computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
1164  computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
1165 
1166  stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
1167  computeUnit->prefetchStride: stride;
1168 
1169  DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
1170  computeUnit->cu_id, simdId, wfSlotId, mp_index);
1171 
1172  DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);
1173 
1174  // Prefetch Next few pages atomically
1175  for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
1176  DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
1177  vaddr+stride*pf*TheISA::PageBytes);
1178 
1179  if (!stride)
1180  break;
1181 
1182  RequestPtr prefetch_req = std::make_shared<Request>(
1183  vaddr + stride * pf * TheISA::PageBytes,
1184  sizeof(uint8_t), 0,
1185  computeUnit->masterId(),
1186  0, 0, nullptr);
1187 
1188  PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
1189  uint8_t foo = 0;
1190  prefetch_pkt->dataStatic(&foo);
1191 
1192  // Because it's atomic operation, only need TLB translation state
1193  prefetch_pkt->senderState =
1194  new TheISA::GpuTLB::TranslationState(TLB_mode,
1195  computeUnit->shader->gpuTc,
1196  true);
1197 
1198  // Currently prefetches are zero-latency, hence the sendFunctional
1199  sendFunctional(prefetch_pkt);
1200 
1201  /* safe_cast the senderState */
1202  TheISA::GpuTLB::TranslationState *tlb_state =
1203  safe_cast<TheISA::GpuTLB::TranslationState*>(
1204  prefetch_pkt->senderState);
1205 
1206 
1207  delete tlb_state->tlbEntry;
1208  delete tlb_state;
1209  delete prefetch_pkt;
1210  }
1211  }
1212 
1213  // First we must convert the response cmd back to a request cmd so that
1214  // the request can be sent through the cu's master port
1215  PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
1216  new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1217  delete pkt->senderState;
1218  delete pkt;
1219 
1220  // New SenderState for the memory access
1221  new_pkt->senderState =
1222  new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
1223  nullptr);
1224 
1225  // translation is done. Schedule the mem_req_event at the appropriate
1226  // cycle to send the timing memory request to ruby
1227  EventFunctionWrapper *mem_req_event =
1228  computeUnit->memPort[mp_index]->createMemReqEvent(new_pkt);
1229 
1230  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
1231  computeUnit->cu_id, gpuDynInst->simdId,
1232  gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());
1233 
1234  computeUnit->schedule(mem_req_event, curTick() +
1235  computeUnit->req_tick_latency);
1236 
1237  return true;
1238 }
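/*
 * [Editor's note: illustrative arithmetic, not part of the original file.]
 * The prefetch stride above is measured in pages: the previously recorded
 * virtual address (tracked per CU, per SIMD, or per WF depending on
 * prefetchType) is compared with the current one. E.g. with 4 KiB pages:
 *
 *     last = 0x12000, vaddr = 0x15000
 *     stride = (0x15000 >> 12) - (0x12000 >> 12) = 3 pages
 *
 * prefetchDepth translations are then issued functionally (zero latency) at
 * vaddr + stride * pf * PageBytes for pf = 1..prefetchDepth, purely to warm
 * the TLB; the prefetch packets and their data are discarded afterwards.
 */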
1239 
1240 EventFunctionWrapper*
1241 ComputeUnit::DataPort::createMemReqEvent(PacketPtr pkt)
1242 {
1243  return new EventFunctionWrapper(
1244  [this, pkt]{ processMemReqEvent(pkt); },
1245  "ComputeUnit memory request event", true);
1246 }
1247 
1248 EventFunctionWrapper*
1249 ComputeUnit::DataPort::createMemRespEvent(PacketPtr pkt)
1250 {
1251  return new EventFunctionWrapper(
1252  [this, pkt]{ processMemRespEvent(pkt); },
1253  "ComputeUnit memory response event", true);
1254 }
1255 
1256 void
1257 ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt)
1258 {
1259  SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1260  GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1261  ComputeUnit *compute_unit M5_VAR_USED = computeUnit;
1262 
1263  if (!(sendTimingReq(pkt))) {
1264  retries.push_back(std::make_pair(pkt, gpuDynInst));
1265 
1266  DPRINTF(GPUPort,
1267  "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
1268  compute_unit->cu_id, gpuDynInst->simdId,
1269  gpuDynInst->wfSlotId, index,
1270  pkt->req->getPaddr());
1271  } else {
1272  DPRINTF(GPUPort,
1273  "CU%d: WF[%d][%d]: index %d, addr %#x data req sent!\n",
1274  compute_unit->cu_id, gpuDynInst->simdId,
1275  gpuDynInst->wfSlotId, index,
1276  pkt->req->getPaddr());
1277  }
1278 }
1279 
1280 /*
1281  * The initial translation request could have been rejected, if
1282  * <retries> queue is not empty. Retry sending the translation
1283  * request. sendRetry() is called from the peer port whenever
1284  * a translation completes.
1285  */
1286 void
1287 ComputeUnit::DTLBPort::recvReqRetry()
1288 {
1289  int len = retries.size();
1290 
1291  DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
1292  computeUnit->cu_id, len);
1293 
1294  assert(len > 0);
1295  assert(isStalled());
1296  // recvReqRetry is an indication that the resource on which this
1297  // port was stalling on is freed. So, remove the stall first
1298  unstallPort();
1299 
1300  for (int i = 0; i < len; ++i) {
1301  PacketPtr pkt = retries.front();
1302  Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
1303  DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x", computeUnit->cu_id, vaddr);
1304 
1305  if (!sendTimingReq(pkt)) {
1306  // Stall port
1307  stallPort();
1308  DPRINTF(GPUTLB, ": failed again\n");
1309  break;
1310  } else {
1311  DPRINTF(GPUTLB, ": successful\n");
1312  retries.pop_front();
1313  }
1314  }
1315 }
1316 
1317 bool
1318 ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
1319 {
1320  Addr line M5_VAR_USED = pkt->req->getPaddr();
1321  DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
1322  computeUnit->cu_id, pkt->req->getVaddr(), line);
1323 
1324  assert(pkt->senderState);
1325 
1326  // pop off the TLB translation state
1327  TheISA::GpuTLB::TranslationState *translation_state =
1328  safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
1329 
1330  bool success = translation_state->tlbEntry != nullptr;
1331  delete translation_state->tlbEntry;
1332  assert(!translation_state->ports.size());
1333  pkt->senderState = translation_state->saved;
1334  delete translation_state;
1335 
1336  // use the original sender state to know how to close this transaction
1337  ITLBPort::SenderState *sender_state =
1338  safe_cast<ITLBPort::SenderState*>(pkt->senderState);
1339 
1340  // get the wavefront associated with this translation request
1341  Wavefront *wavefront = sender_state->wavefront;
1342  delete pkt->senderState;
1343 
1344  if (success) {
1345  // pkt is reused in fetch(), don't delete it here. However, we must
1346  // reset the command to be a request so that it can be sent through
1347  // the cu's master port
1348  assert(pkt->cmd == MemCmd::ReadResp);
1349  pkt->cmd = MemCmd::ReadReq;
1350 
1351  computeUnit->fetchStage.fetch(pkt, wavefront);
1352  } else {
1353  if (wavefront->dropFetch) {
1354  assert(wavefront->instructionBuffer.empty());
1355  wavefront->dropFetch = false;
1356  }
1357 
1358  wavefront->pendingFetch = 0;
1359  }
1360 
1361  return true;
1362 }
1363 
1364 /*
1365  * The initial translation request could have been rejected, if
1366  * <retries> queue is not empty. Retry sending the translation
1367  * request. sendRetry() is called from the peer port whenever
1368  * a translation completes.
1369  */
1370 void
1371 ComputeUnit::ITLBPort::recvReqRetry()
1372 {
1373 
1374  int len = retries.size();
1375  DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n", computeUnit->cu_id, len);
1376 
1377  assert(len > 0);
1378  assert(isStalled());
1379 
1380  // recvReqRetry is an indication that the resource on which this
1381  // port was stalling on is freed. So, remove the stall first
1382  unstallPort();
1383 
1384  for (int i = 0; i < len; ++i) {
1385  PacketPtr pkt = retries.front();
1386  Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
1387  DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x", computeUnit->cu_id, vaddr);
1388 
1389  if (!sendTimingReq(pkt)) {
1390  stallPort(); // Stall port
1391  DPRINTF(GPUTLB, ": failed again\n");
1392  break;
1393  } else {
1394  DPRINTF(GPUTLB, ": successful\n");
1395  retries.pop_front();
1396  }
1397  }
1398 }
1399 
1400 void
1401 ComputeUnit::regStats()
1402 {
1403  ClockedObject::regStats();
1404 
1405  vALUInsts
1406  .name(name() + ".valu_insts")
1407  .desc("Number of vector ALU insts issued.")
1408  ;
1409  vALUInstsPerWF
1410  .name(name() + ".valu_insts_per_wf")
1411  .desc("The avg. number of vector ALU insts issued per-wavefront.")
1412  ;
1413  sALUInsts
1414  .name(name() + ".salu_insts")
1415  .desc("Number of scalar ALU insts issued.")
1416  ;
1417  sALUInstsPerWF
1418  .name(name() + ".salu_insts_per_wf")
1419  .desc("The avg. number of scalar ALU insts issued per-wavefront.")
1420  ;
1421  instCyclesVALU
1422  .name(name() + ".inst_cycles_valu")
1423  .desc("Number of cycles needed to execute VALU insts.")
1424  ;
1425  instCyclesSALU
1426  .name(name() + ".inst_cycles_salu")
1427  .desc("Number of cycles needed to execute SALU insts.")
1428  ;
1429  threadCyclesVALU
1430  .name(name() + ".thread_cycles_valu")
1431  .desc("Number of thread cycles used to execute vector ALU ops. "
1432  "Similar to instCyclesVALU but multiplied by the number of "
1433  "active threads.")
1434  ;
1435  vALUUtilization
1436  .name(name() + ".valu_utilization")
1437  .desc("Percentage of active vector ALU threads in a wave.")
1438  ;
1439  ldsNoFlatInsts
1440  .name(name() + ".lds_no_flat_insts")
1441  .desc("Number of LDS insts issued, not including FLAT "
1442  "accesses that resolve to LDS.")
1443  ;
1444  ldsNoFlatInstsPerWF
1445  .name(name() + ".lds_no_flat_insts_per_wf")
1446  .desc("The avg. number of LDS insts (not including FLAT "
1447  "accesses that resolve to LDS) per-wavefront.")
1448  ;
1449  flatVMemInsts
1450  .name(name() + ".flat_vmem_insts")
1451  .desc("The number of FLAT insts that resolve to vmem issued.")
1452  ;
1453  flatVMemInstsPerWF
1454  .name(name() + ".flat_vmem_insts_per_wf")
1455  .desc("The average number of FLAT insts that resolve to vmem "
1456  "issued per-wavefront.")
1457  ;
1458  flatLDSInsts
1459  .name(name() + ".flat_lds_insts")
1460  .desc("The number of FLAT insts that resolve to LDS issued.")
1461  ;
1462  flatLDSInstsPerWF
1463  .name(name() + ".flat_lds_insts_per_wf")
1464  .desc("The average number of FLAT insts that resolve to LDS "
1465  "issued per-wavefront.")
1466  ;
1467  vectorMemWrites
1468  .name(name() + ".vector_mem_writes")
1469  .desc("Number of vector mem write insts (excluding FLAT insts).")
1470  ;
1471  vectorMemWritesPerWF
1472  .name(name() + ".vector_mem_writes_per_wf")
1473  .desc("The average number of vector mem write insts "
1474  "(excluding FLAT insts) per-wavefront.")
1475  ;
1476  vectorMemReads
1477  .name(name() + ".vector_mem_reads")
1478  .desc("Number of vector mem read insts (excluding FLAT insts).")
1479  ;
1480  vectorMemReadsPerWF
1481  .name(name() + ".vector_mem_reads_per_wf")
1482  .desc("The avg. number of vector mem read insts (excluding "
1483  "FLAT insts) per-wavefront.")
1484  ;
1485  scalarMemWrites
1486  .name(name() + ".scalar_mem_writes")
1487  .desc("Number of scalar mem write insts.")
1488  ;
1489  scalarMemWritesPerWF
1490  .name(name() + ".scalar_mem_writes_per_wf")
1491  .desc("The average number of scalar mem write insts per-wavefront.")
1492  ;
1493  scalarMemReads
1494  .name(name() + ".scalar_mem_reads")
1495  .desc("Number of scalar mem read insts.")
1496  ;
1497  scalarMemReadsPerWF
1498  .name(name() + ".scalar_mem_reads_per_wf")
1499  .desc("The average number of scalar mem read insts per-wavefront.")
1500  ;
1501 
1502  vALUInstsPerWF = vALUInsts / completedWfs;
1503  sALUInstsPerWF = sALUInsts / completedWfs;
1504  vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
1505  ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
1506  flatVMemInstsPerWF = flatVMemInsts / completedWfs;
1507  flatLDSInstsPerWF = flatLDSInsts / completedWfs;
1508  vectorMemWritesPerWF = vectorMemWrites / completedWfs;
1509  vectorMemReadsPerWF = vectorMemReads / completedWfs;
1510  scalarMemWritesPerWF = scalarMemWrites / completedWfs;
1511  scalarMemReadsPerWF = scalarMemReads / completedWfs;
1512 
1513  tlbCycles
1514  .name(name() + ".tlb_cycles")
1515  .desc("total number of cycles for all uncoalesced requests")
1516  ;
1517 
1518  tlbRequests
1519  .name(name() + ".tlb_requests")
1520  .desc("number of uncoalesced requests")
1521  ;
1522 
1523  tlbLatency
1524  .name(name() + ".avg_translation_latency")
1525  .desc("Avg. translation latency for data translations")
1526  ;
1527 
1528  tlbLatency = tlbCycles / tlbRequests;
1529 
1530  hitsPerTLBLevel
1531  .init(4)
1532  .name(name() + ".TLB_hits_distribution")
1533  .desc("TLB hits distribution (0 for page table, x for Lx-TLB)")
1534  ;
1535 
1536  // fixed number of TLB levels
1537  for (int i = 0; i < 4; ++i) {
1538  if (!i)
1539  hitsPerTLBLevel.subname(i,"page_table");
1540  else
1541  hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
1542  }
1543 
1544  execRateDist
1545  .init(0, 10, 2)
1546  .name(name() + ".inst_exec_rate")
1547  .desc("Instruction Execution Rate: Number of executed vector "
1548  "instructions per cycle")
1549  ;
1550 
1551  ldsBankConflictDist
1552  .init(0, wfSize(), 2)
1553  .name(name() + ".lds_bank_conflicts")
1554  .desc("Number of bank conflicts per LDS memory packet")
1555  ;
1556 
1557  ldsBankAccesses
1558  .name(name() + ".lds_bank_access_cnt")
1559  .desc("Total number of LDS bank accesses")
1560  ;
1561 
1562  pageDivergenceDist
1563  // A wavefront can touch up to N pages per memory instruction where
1564  // N is equal to the wavefront size
1565  // The number of pages per bin can be configured (here it's 4).
1566  .init(1, wfSize(), 4)
1567  .name(name() + ".page_divergence_dist")
1568  .desc("pages touched per wf (over all mem. instr.)")
1569  ;
1570 
1571  controlFlowDivergenceDist
1572  .init(1, wfSize(), 4)
1573  .name(name() + ".warp_execution_dist")
1574  .desc("number of lanes active per instruction (over all instructions)")
1575  ;
1576 
1577  activeLanesPerGMemInstrDist
1578  .init(1, wfSize(), 4)
1579  .name(name() + ".gmem_lanes_execution_dist")
1580  .desc("number of active lanes per global memory instruction")
1581  ;
1582 
1583  activeLanesPerLMemInstrDist
1584  .init(1, wfSize(), 4)
1585  .name(name() + ".lmem_lanes_execution_dist")
1586  .desc("number of active lanes per local memory instruction")
1587  ;
1588 
1589  numInstrExecuted
1590  .name(name() + ".num_instr_executed")
1591  .desc("number of instructions executed")
1592  ;
1593 
1594  numVecOpsExecuted
1595  .name(name() + ".num_vec_ops_executed")
1596  .desc("number of vec ops executed (e.g. WF size/inst)")
1597  ;
1598 
1599  totalCycles
1600  .name(name() + ".num_total_cycles")
1601  .desc("number of cycles the CU ran for")
1602  ;
1603 
1604  ipc
1605  .name(name() + ".ipc")
1606  .desc("Instructions per cycle (this CU only)")
1607  ;
1608 
1609  vpc
1610  .name(name() + ".vpc")
1611  .desc("Vector Operations per cycle (this CU only)")
1612  ;
1613 
1614  numALUInstsExecuted
1615  .name(name() + ".num_alu_insts_executed")
1616  .desc("Number of dynamic non-GM memory insts executed")
1617  ;
1618 
1619  wgBlockedDueLdsAllocation
1620  .name(name() + ".wg_blocked_due_lds_alloc")
1621  .desc("Workgroup blocked due to LDS capacity")
1622  ;
1623 
1623 
1624  ipc = numInstrExecuted / totalCycles;
1625  vpc = numVecOpsExecuted / totalCycles;
1626 
1627  numTimesWgBlockedDueVgprAlloc
1628  .name(name() + ".times_wg_blocked_due_vgpr_alloc")
1629  .desc("Number of times WGs are blocked due to VGPR allocation per SIMD")
1630  ;
1631 
1632  dynamicGMemInstrCnt
1633  .name(name() + ".global_mem_instr_cnt")
1634  .desc("dynamic global memory instructions count")
1635  ;
1636 
1637  dynamicLMemInstrCnt
1638  .name(name() + ".local_mem_instr_cnt")
1639  .desc("dynamic local memory instruction count")
1640  ;
1641 
1641 
1642  numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
1643  dynamicLMemInstrCnt;
1644 
1645  completedWfs
1646  .name(name() + ".num_completed_wfs")
1647  .desc("number of completed wavefronts")
1648  ;
1649 
1650  numCASOps
1651  .name(name() + ".num_CAS_ops")
1652  .desc("number of compare and swap operations")
1653  ;
1654 
1655  numFailedCASOps
1656  .name(name() + ".num_failed_CAS_ops")
1657  .desc("number of compare and swap operations that failed")
1658  ;
1659 
1660  // register stats of pipeline stages
1661  fetchStage.regStats();
1662  scoreboardCheckStage.regStats();
1663  scheduleStage.regStats();
1664  execStage.regStats();
1665 
1666  // register stats of memory pipeline
1667  globalMemoryPipe.regStats();
1668  localMemoryPipe.regStats();
1669 }
1670 
1671 void
1672 ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
1673 {
1674  if (gpuDynInst->isScalar()) {
1675  if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
1676  sALUInsts++;
1677  instCyclesSALU++;
1678  } else if (gpuDynInst->isLoad()) {
1679  scalarMemReads++;
1680  } else if (gpuDynInst->isStore()) {
1681  scalarMemWrites++;
1682  }
1683  } else {
1684  if (gpuDynInst->isALU()) {
1685  vALUInsts++;
1686  instCyclesVALU++;
1687  threadCyclesVALU += gpuDynInst->wavefront()->execMask().count();
1688  } else if (gpuDynInst->isFlat()) {
1689  if (gpuDynInst->isLocalMem()) {
1690  flatLDSInsts++;
1691  } else {
1692  flatVMemInsts++;
1693  }
1694  } else if (gpuDynInst->isLocalMem()) {
1695  ldsNoFlatInsts++;
1696  } else if (gpuDynInst->isLoad()) {
1697  vectorMemReads++;
1698  } else if (gpuDynInst->isStore()) {
1699  vectorMemWrites++;
1700  }
1701  }
1702 }
1703 
1704 void
1705 ComputeUnit::updatePageDivergenceDist(Addr addr)
1706 {
1707  Addr virt_page_addr = roundDown(addr, TheISA::PageBytes);
1708 
1709  if (!pagesTouched.count(virt_page_addr))
1710  pagesTouched[virt_page_addr] = 1;
1711  else
1712  pagesTouched[virt_page_addr]++;
1713 }
1714 
1715 void
1716 ComputeUnit::CUExitCallback::process()
1717 {
1718  if (computeUnit->countPages) {
1719  std::ostream *page_stat_file =
1720  simout.create(computeUnit->name().c_str())->stream();
1721 
1722  *page_stat_file << "page, wavefront accesses, workitem accesses" <<
1723  std::endl;
1724 
1725  for (auto iter : computeUnit->pageAccesses) {
1726  *page_stat_file << std::hex << iter.first << ",";
1727  *page_stat_file << std::dec << iter.second.first << ",";
1728  *page_stat_file << std::dec << iter.second.second << std::endl;
1729  }
1730  }
1731  }
1732 
1733 bool
1734 ComputeUnit::isDone() const
1735 {
1736  for (int i = 0; i < numSIMDs; ++i) {
1737  if (!isSimdDone(i)) {
1738  return false;
1739  }
1740  }
1741 
1742  bool glbMemBusRdy = true;
1743  for (int j = 0; j < numGlbMemUnits; ++j) {
1744  glbMemBusRdy &= vrfToGlobalMemPipeBus[j].rdy();
1745  }
1746  bool locMemBusRdy = true;
1747  for (int j = 0; j < numLocMemUnits; ++j) {
1748  locMemBusRdy &= vrfToLocalMemPipeBus[j].rdy();
1749  }
1750 
1755  !glbMemToVrfBus.rdy() || !locMemBusRdy || !glbMemBusRdy) {
1756  return false;
1757  }
1758 
1759  return true;
1760 }
1761 
1762 int32_t
1763 ComputeUnit::getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
1764 {
1765  return lds.getRefCounter(dispatchId, wgId);
1766 }
1767 
1768 bool
1769 ComputeUnit::isSimdDone(uint32_t simdId) const
1770 {
1771  assert(simdId < numSIMDs);
1772 
1773  for (int i=0; i < numGlbMemUnits; ++i) {
1774  if (!vrfToGlobalMemPipeBus[i].rdy())
1775  return false;
1776  }
1777  for (int i=0; i < numLocMemUnits; ++i) {
1778  if (!vrfToLocalMemPipeBus[i].rdy())
1779  return false;
1780  }
1781  if (!aluPipe[simdId].rdy()) {
1782  return false;
1783  }
1784 
1785  for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){
1786  if (wfList[simdId][i_wf]->status != Wavefront::S_STOPPED) {
1787  return false;
1788  }
1789  }
1790 
1791  return true;
1792 }
1793 
1799 bool
1800 ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
1801 {
1802  // this is just a request to carry the GPUDynInstPtr
1803  // back and forth
1804  RequestPtr newRequest = std::make_shared<Request>();
1805  newRequest->setPaddr(0x0);
1806 
1807  // ReadReq is not evaluted by the LDS but the Packet ctor requires this
1808  PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq);
1809 
1810  // This is the SenderState needed upon return
1811  newPacket->senderState = new LDSPort::SenderState(gpuDynInst);
1812 
1813  return ldsPort->sendTimingReq(newPacket);
1814 }
1815 
1819 bool
1820 ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
1821 {
1822  const ComputeUnit::LDSPort::SenderState *senderState =
1823  dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState);
1824 
1825  fatal_if(!senderState, "did not get the right sort of sender state");
1826 
1827  GPUDynInstPtr gpuDynInst = senderState->getMemInst();
1828 
1829  delete packet->senderState;
1830  delete packet;
1831 
1832  computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
1833  return true;
1834 }
1835 
1841 bool
1842 ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
1843 {
1844  ComputeUnit::LDSPort::SenderState *sender_state =
1845  dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
1846  fatal_if(!sender_state, "packet without a valid sender state");
1847 
1848  GPUDynInstPtr gpuDynInst M5_VAR_USED = sender_state->getMemInst();
1849 
1850  if (isStalled()) {
1851  fatal_if(retries.empty(), "must have retries waiting to be stalled");
1852 
1853  retries.push(pkt);
1854 
1855  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
1856  computeUnit->cu_id, gpuDynInst->simdId,
1857  gpuDynInst->wfSlotId);
1858  return false;
1859  } else if (!MasterPort::sendTimingReq(pkt)) {
1860  // need to stall the LDS port until a recvReqRetry() is received
1861  // this indicates that there is more space
1862  stallPort();
1863  retries.push(pkt);
1864 
1865  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
1866  computeUnit->cu_id, gpuDynInst->simdId,
1867  gpuDynInst->wfSlotId, pkt->req->getPaddr());
1868  return false;
1869  } else {
1870  DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
1871  computeUnit->cu_id, gpuDynInst->simdId,
1872  gpuDynInst->wfSlotId, pkt->req->getPaddr());
1873  return true;
1874  }
1875 }
1876 
1883 void
1884 ComputeUnit::LDSPort::recvReqRetry()
1885 {
1886  auto queueSize = retries.size();
1887 
1888  DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
1889  computeUnit->cu_id, queueSize);
1890 
1891  fatal_if(queueSize < 1,
1892  "why was there a recvReqRetry() with no pending reqs?");
1893  fatal_if(!isStalled(),
1894  "recvReqRetry() happened when the port was not stalled");
1895 
1896  unstallPort();
1897 
1898  while (!retries.empty()) {
1899  PacketPtr packet = retries.front();
1900 
1901  DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);
1902 
1903  if (!MasterPort::sendTimingReq(packet)) {
1904  // Stall port
1905  stallPort();
1906  DPRINTF(GPUPort, ": LDS send failed again\n");
1907  break;
1908  } else {
1909  DPRINTF(GPUPort, ": LDS send successful\n");
1910  retries.pop();
1911  }
1912  }
1913 }

Generated on Mon Jun 8 2020 15:45:11 for gem5 by doxygen 1.8.13