gem5 v23.0.0.0
compute_unit.cc
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
33
34#include <limits>
35
38#include "base/output.hh"
39#include "debug/GPUDisp.hh"
40#include "debug/GPUExec.hh"
41#include "debug/GPUFetch.hh"
42#include "debug/GPUMem.hh"
43#include "debug/GPUPort.hh"
44#include "debug/GPUPrefetch.hh"
45#include "debug/GPUReg.hh"
46#include "debug/GPURename.hh"
47#include "debug/GPUSync.hh"
48#include "debug/GPUTLB.hh"
54#include "gpu-compute/shader.hh"
58#include "mem/page_table.hh"
59#include "sim/process.hh"
60#include "sim/sim_exit.hh"
61
62namespace gem5
63{
64
65ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p),
66 numVectorGlobalMemUnits(p.num_global_mem_pipes),
67 numVectorSharedMemUnits(p.num_shared_mem_pipes),
68 numScalarMemUnits(p.num_scalar_mem_pipes),
69 numVectorALUs(p.num_SIMDs),
70 numScalarALUs(p.num_scalar_cores),
71 vrfToCoalescerBusWidth(p.vrf_to_coalescer_bus_width),
72 coalescerToVrfBusWidth(p.coalescer_to_vrf_bus_width),
73 registerManager(p.register_manager),
74 fetchStage(p, *this),
75 scoreboardCheckStage(p, *this, scoreboardCheckToSchedule),
76 scheduleStage(p, *this, scoreboardCheckToSchedule, scheduleToExecute),
77 execStage(p, *this, scheduleToExecute),
78 globalMemoryPipe(p, *this),
79 localMemoryPipe(p, *this),
80 scalarMemoryPipe(p, *this),
81 tickEvent([this]{ exec(); }, "Compute unit tick event",
82 false, Event::CPU_Tick_Pri),
83 cu_id(p.cu_id),
84 vrf(p.vector_register_file), srf(p.scalar_register_file),
85 simdWidth(p.simd_width),
86 spBypassPipeLength(p.spbypass_pipe_length),
87 dpBypassPipeLength(p.dpbypass_pipe_length),
88 scalarPipeStages(p.scalar_pipe_length),
89 operandNetworkLength(p.operand_network_length),
90 issuePeriod(p.issue_period),
91 vrf_gm_bus_latency(p.vrf_gm_bus_latency),
92 srf_scm_bus_latency(p.srf_scm_bus_latency),
93 vrf_lm_bus_latency(p.vrf_lm_bus_latency),
94 perLaneTLB(p.perLaneTLB), prefetchDepth(p.prefetch_depth),
95 prefetchStride(p.prefetch_stride), prefetchType(p.prefetch_prev_type),
96 debugSegFault(p.debugSegFault),
97 functionalTLB(p.functionalTLB), localMemBarrier(p.localMemBarrier),
98 countPages(p.countPages),
99 req_tick_latency(p.mem_req_latency * p.clk_domain->clockPeriod()),
100 resp_tick_latency(p.mem_resp_latency * p.clk_domain->clockPeriod()),
101 scalar_req_tick_latency(
102 p.scalar_mem_req_latency * p.clk_domain->clockPeriod()),
103 scalar_resp_tick_latency(
104 p.scalar_mem_resp_latency * p.clk_domain->clockPeriod()),
105 _requestorId(p.system->getRequestorId(this, "ComputeUnit")),
106 lds(*p.localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
107 ldsPort(csprintf("%s-port", name()), this),
108 scalarDataPort(csprintf("%s-port", name()), this),
109 scalarDTLBPort(csprintf("%s-port", name()), this),
110 sqcPort(csprintf("%s-port", name()), this),
111 sqcTLBPort(csprintf("%s-port", name()), this),
112 _cacheLineSize(p.system->cacheLineSize()),
113 _numBarrierSlots(p.num_barrier_slots),
114 globalSeqNum(0), wavefrontSize(p.wf_size),
115 scoreboardCheckToSchedule(p),
116 scheduleToExecute(p),
117 stats(this, p.n_wf)
118{
119 // This is not currently supported and would require adding more handling
120 // for system vs. device memory requests on the functional paths, so we
121 // fatal immediately in the constructor if this configuration is seen.
122 fatal_if(functionalTLB && FullSystem,
123 "Functional TLB not supported in full-system GPU simulation");
124
134 fatal_if(p.wf_size > std::numeric_limits<unsigned long long>::digits ||
135 p.wf_size <= 0,
136 "WF size is larger than the host can support");
137 fatal_if(!isPowerOf2(wavefrontSize),
138 "Wavefront size should be a power of 2");
139 // calculate how many cycles a vector load or store will need to transfer
140 // its data over the corresponding buses
141 numCyclesPerStoreTransfer =
142 (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
143 (double)vrfToCoalescerBusWidth);
144
145 numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
146 / coalescerToVrfBusWidth;
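    // Illustrative arithmetic (the widths here are assumptions, not
    // defaults): a 64-lane wavefront of 4-byte registers moves 256 bytes,
    // so a 32-byte bus needs ceil(256/32) = 8 cycles for a store; the load
    // calculation directly above uses plain integer division instead of ceil.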
147
148 // Initialization: all WF slots are assumed STOPPED
149 idleWfs = p.n_wf * numVectorALUs;
150 lastVaddrWF.resize(numVectorALUs);
151 wfList.resize(numVectorALUs);
152
153 wfBarrierSlots.resize(p.num_barrier_slots, WFBarrier());
154
155 for (int i = 0; i < p.num_barrier_slots; ++i) {
156 freeBarrierIds.insert(i);
157 }
158
159 for (int j = 0; j < numVectorALUs; ++j) {
160 lastVaddrWF[j].resize(p.n_wf);
161
162 for (int i = 0; i < p.n_wf; ++i) {
163 lastVaddrWF[j][i].resize(wfSize());
164
165 wfList[j].push_back(p.wavefronts[j * p.n_wf + i]);
166 wfList[j][i]->setParent(this);
167
168 for (int k = 0; k < wfSize(); ++k) {
169 lastVaddrWF[j][i][k] = 0;
170 }
171 }
172 }
173
174 lastVaddrSimd.resize(numVectorALUs);
175
176 for (int i = 0; i < numVectorALUs; ++i) {
177 lastVaddrSimd[i].resize(wfSize(), 0);
178 }
179
180 lastVaddrCU.resize(wfSize());
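    // lastVaddrCU, lastVaddrSimd and lastVaddrWF record the most recently
    // seen virtual address per memory-port lane at CU, SIMD and wavefront
    // granularity; the prefetch logic in DTLBPort::recvTimingResp() reads
    // them to derive an access stride.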
181
182 lds.setParent(this);
183
184 if (p.execPolicy == "OLDEST-FIRST") {
185 exec_policy = EXEC_POLICY::OLDEST;
186 } else if (p.execPolicy == "ROUND-ROBIN") {
187 exec_policy = EXEC_POLICY::RR;
188 } else {
189 fatal("Invalid WF execution policy (CU)\n");
190 }
191
192 for (int i = 0; i < p.port_memory_port_connection_count; ++i) {
193 memPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
194 }
195
196 for (int i = 0; i < p.port_translation_port_connection_count; ++i) {
197 tlbPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
198 }
199
200 // Setup tokens for response ports. The number of tokens in memPortTokens
201 // is the total token count for the entire vector port (i.e., this CU).
202 memPortTokens = new TokenManager(p.max_cu_tokens);
203
204 registerExitCallback([this]() { exitCallback(); });
205
206 lastExecCycle.resize(numVectorALUs, 0);
207
208 for (int i = 0; i < vrf.size(); ++i) {
209 vrf[i]->setParent(this);
210 }
211 for (int i = 0; i < srf.size(); ++i) {
212 srf[i]->setParent(this);
213 }
214 numVecRegsPerSimd = vrf[0]->numRegs();
215 numScalarRegsPerSimd = srf[0]->numRegs();
216
217 registerManager->setParent(this);
218
219 activeWaves = 0;
220
221 instExecPerSimd.resize(numVectorALUs, 0);
222
223 // Calculate the number of bits to address a cache line
224 panic_if(!isPowerOf2(_cacheLineSize),
225 "Cache line size should be a power of two.");
226 cacheLineBits = floorLog2(_cacheLineSize);
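    // e.g., a 64-byte cache line (an illustrative value; the real size
    // comes from the system parameter) gives cacheLineBits = 6, i.e. the
    // low 6 address bits select the byte within the line.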
227}
228
229ComputeUnit::~ComputeUnit()
230{
231 // Delete wavefront slots
232 for (int j = 0; j < numVectorALUs; ++j) {
233 for (int i = 0; i < shader->n_wf; ++i) {
234 delete wfList[j][i];
235 }
236 lastVaddrSimd[j].clear();
237 }
238 lastVaddrCU.clear();
239}
240
241int
243{
246}
247
248// index into readyList of the first memory unit
249int
251{
253}
254
255// index into readyList of the last memory unit
256int
258{
259 return numExeUnits() - 1;
260}
261
262// index into scalarALUs vector of SALU used by the wavefront
263int
265{
266 if (numScalarALUs == 1) {
267 return 0;
268 } else {
269 return w->simdId % numScalarALUs;
270 }
271}
272
273// index into readyList of Scalar ALU unit used by wavefront
274int
276{
278}
279
280// index into readyList of Global Memory unit used by wavefront
281int
283{
284 // TODO: FIXME if more than 1 GM pipe supported
286}
287
288// index into readyList of Local Memory unit used by wavefront
289int
291{
292 // TODO: FIXME if more than 1 LM pipe supported
294}
295
296// index into readyList of Scalar Memory unit used by wavefront
297int
299{
300 // TODO: FIXME if more than 1 ScM pipe supported
303}
304
305void
307{
308 w->resizeRegFiles(task->numVectorRegs(), task->numScalarRegs());
309 w->workGroupSz[0] = task->wgSize(0);
310 w->workGroupSz[1] = task->wgSize(1);
311 w->workGroupSz[2] = task->wgSize(2);
312 w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
313 w->gridSz[0] = task->gridSize(0);
314 w->gridSz[1] = task->gridSize(1);
315 w->gridSz[2] = task->gridSize(2);
316 w->computeActualWgSz(task);
317}
318
319void
321 HSAQueueEntry *task, int bar_id, bool fetchContext)
322{
323 static int _n_wave = 0;
324
325 VectorMask init_mask;
326 init_mask.reset();
327
328 for (int k = 0; k < wfSize(); ++k) {
329 if (k + waveId * wfSize() < w->actualWgSzTotal)
330 init_mask[k] = 1;
331 }
332
333 w->execMask() = init_mask;
334
335 w->kernId = task->dispatchId();
336 w->wfId = waveId;
337 w->initMask = init_mask.to_ullong();
338
339 if (bar_id > WFBarrier::InvalidID) {
340 w->barrierId(bar_id);
341 } else {
342 assert(!w->hasBarrier());
343 }
344
345 for (int k = 0; k < wfSize(); ++k) {
346 w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
347 w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
348 w->actualWgSz[1];
349 w->workItemId[2][k] = (k + waveId * wfSize()) /
350 (w->actualWgSz[0] * w->actualWgSz[1]);
351
352 w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
353 w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
354 w->workItemId[0][k];
355 }
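    // The flat ID linearizes the 3-D work-item coordinates as
    // z*(X*Y) + y*X + x. For example, assuming actualWgSz = {16, 4, 2},
    // the item at (x=3, y=2, z=1) gets flat ID 1*64 + 2*16 + 3 = 99.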
356
357 // WG state
358 w->wgId = task->globalWgId();
359 w->dispatchId = task->dispatchId();
360 w->workGroupId[0] = w->wgId % task->numWg(0);
361 w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1);
362 w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1));
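    // This undoes the usual linearization wgId = x + y*numWg(0) +
    // z*numWg(0)*numWg(1), recovering the 3-D workgroup coordinates from
    // the flat workgroup ID.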
363
364 // set the wavefront context to have a pointer to this section of the LDS
365 w->ldsChunk = ldsChunk;
366
367 [[maybe_unused]] int32_t refCount =
368 lds.increaseRefCounter(w->dispatchId, w->wgId);
369 DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
370 cu_id, w->wgId, refCount);
371
372 w->instructionBuffer.clear();
373
374 if (w->pendingFetch)
375 w->dropFetch = true;
376
377 DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
378 "WF[%d][%d]. Ref cnt:%d\n", _n_wave, w->barrierId(), cu_id,
379 w->simdId, w->wfSlotId, refCount);
380
381 w->initRegState(task, w->actualWgSzTotal);
382 w->start(_n_wave++, task->codeAddr());
383
385 activeWaves++;
386}
387
393void
395 GPUDynInstPtr gpuDynInst
396 = std::make_shared<GPUDynInst>(this, nullptr,
398
399 // kern_id will be used in inv responses
400 gpuDynInst->kern_id = kernId;
401 // update contextId field
402 req->setContext(gpuDynInst->wfDynId);
403
404 injectGlobalMemFence(gpuDynInst, true, req);
405}
406
412void
414 injectGlobalMemFence(gpuDynInst, true);
415}
416
417// resetting the SIMD register pools
418// done here because no better place was found for it, and the
419// current implementation requires the reset
420void
422{
423 for (int i=0; i<numVectorALUs; i++)
424 {
427 }
428}
429
430void
432{
433 // If we aren't ticking, start it up!
434 if (!tickEvent.scheduled()) {
435 DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id);
437 }
438
439 // the kernel's invalidate must have finished before any wg dispatch
440 assert(task->isInvDone());
441
442 // reserve the LDS capacity allocated to the work group
443 // disambiguated by the dispatch ID and workgroup ID, which should be
444 // globally unique
445 LdsChunk *ldsChunk = lds.reserveSpace(task->dispatchId(),
446 task->globalWgId(),
447 task->ldsSize());
448
449 panic_if(!ldsChunk, "was not able to reserve space for this WG");
450
451 // calculate the number of 32-bit vector registers required
452 // by each work item
453 int vregDemand = task->numVectorRegs();
454 int sregDemand = task->numScalarRegs();
455 int wave_id = 0;
456
457 int barrier_id = WFBarrier::InvalidID;
458
463 if (num_wfs_in_wg > 1) {
468 barrier_id = getFreeBarrierId();
469 auto &wf_barrier = barrierSlot(barrier_id);
470 assert(!wf_barrier.maxBarrierCnt());
471 assert(!wf_barrier.numAtBarrier());
472 wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);
473
474 DPRINTF(GPUSync, "CU[%d] - Dispatching WG with barrier Id%d. "
475 "%d waves using this barrier.\n", cu_id, barrier_id,
476 num_wfs_in_wg);
477 }
478
479 // Assign WFs according to numWfsToSched vector, which is computed by
480 // hasDispResources()
481 for (int j = 0; j < shader->n_wf; ++j) {
482 for (int i = 0; i < numVectorALUs; ++i) {
483 Wavefront *w = wfList[i][j];
484 // Check if this wavefront slot is available and there are WFs
485 // remaining to be dispatched to current SIMD:
486 // WF slot must be stopped and not waiting
487 // for a release to complete S_RETURNING
488 if (w->getStatus() == Wavefront::S_STOPPED &&
489 numWfsToSched[i] > 0) {
490 // decrement number of WFs awaiting dispatch to current SIMD
491 numWfsToSched[i] -= 1;
492
493 fillKernelState(w, task);
494
495 DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] "
496 "vregDemand[%d] sregDemand[%d]\n", i, j, w->wfDynId,
497 vregDemand, sregDemand);
498
499 registerManager->allocateRegisters(w, vregDemand, sregDemand);
500
501 startWavefront(w, wave_id, ldsChunk, task, barrier_id);
502 ++wave_id;
503 }
504 }
505 }
506}
507
508void
510{
511 panic_if(w->instructionBuffer.empty(),
512 "Instruction Buffer of WF%d can't be empty", w->wgId);
513 GPUDynInstPtr ii = w->instructionBuffer.front();
514 pipeMap.emplace(ii->seqNum());
515}
516
517void
519{
520 panic_if(w->instructionBuffer.empty(),
521 "Instruction Buffer of WF%d can't be empty", w->wgId);
522 GPUDynInstPtr ii = w->instructionBuffer.front();
523 // delete the dynamic instruction from the pipeline map
524 auto it = pipeMap.find(ii->seqNum());
525 panic_if(it == pipeMap.end(), "Pipeline Map is empty\n");
526 pipeMap.erase(it);
527}
528
529bool
531{
532 // compute true size of workgroup (after clamping to grid size)
533 int trueWgSize[HSAQueueEntry::MAX_DIM];
534 int trueWgSizeTotal = 1;
535
536 for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
537 trueWgSize[d] = std::min(task->wgSize(d), task->gridSize(d) -
538 task->wgId(d) * task->wgSize(d));
539
540 trueWgSizeTotal *= trueWgSize[d];
541 DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
542 }
543
544 DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
545
546 // calculate the number of WFs in this WG
547 int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
548 num_wfs_in_wg = numWfs;
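    // ceiling division: e.g., a 100 work-item workgroup with a 64-wide
    // wavefront (example sizes only) needs (100 + 63) / 64 = 2 wavefronts.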
549
550 bool barrier_avail = true;
551
552 if (numWfs > 1 && !freeBarrierIds.size()) {
553 barrier_avail = false;
554 }
555
556 // calculate the number of 32-bit vector registers required by each
557 // work item of the work group
558 int vregDemandPerWI = task->numVectorRegs();
559 // calculate the number of 32-bit scalar registers required by each
560 // work item of the work group
561 int sregDemandPerWI = task->numScalarRegs();
562
563 // check if the total number of VGPRs and SGPRs required by all WFs
564 // of the WG fit in the VRFs of all SIMD units and the CU's SRF
565 panic_if((numWfs * vregDemandPerWI) > (numVectorALUs * numVecRegsPerSimd),
566 "WG with %d WFs and %d VGPRs per WI can not be allocated to CU "
567 "that has %d VGPRs\n",
568 numWfs, vregDemandPerWI, numVectorALUs * numVecRegsPerSimd);
569 panic_if((numWfs * sregDemandPerWI) > numScalarRegsPerSimd,
570 "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU "
571 "with %d SGPRs\n",
572 numWfs, sregDemandPerWI, numScalarRegsPerSimd);
573
574 // number of WF slots that are not occupied
575 int freeWfSlots = 0;
576 // number of Wfs from WG that were successfully mapped to a SIMD
577 int numMappedWfs = 0;
578 numWfsToSched.clear();
579 numWfsToSched.resize(numVectorALUs, 0);
580
581 // attempt to map WFs to the SIMDs, based on WF slot availability
582 // and register file availability
583 for (int j = 0; j < shader->n_wf; ++j) {
584 for (int i = 0; i < numVectorALUs; ++i) {
585 if (wfList[i][j]->getStatus() == Wavefront::S_STOPPED) {
586 ++freeWfSlots;
587 // check if current WF will fit onto current SIMD/VRF
588 // if all WFs have not yet been mapped to the SIMDs
589 if (numMappedWfs < numWfs &&
591 sregDemandPerWI) &&
593 vregDemandPerWI)) {
594 numWfsToSched[i]++;
595 numMappedWfs++;
596 }
597 }
598 }
599 }
600
601 // check that the number of mapped WFs is not greater
602 // than the actual number of WFs
603 assert(numMappedWfs <= numWfs);
604
605 bool vregAvail = true;
606 bool sregAvail = true;
607 // if a WF to SIMD mapping was not found, find the limiting resource
608 if (numMappedWfs < numWfs) {
609
610 for (int j = 0; j < numVectorALUs; ++j) {
611 // find if there are enough free VGPRs in the SIMD's VRF
612 // to accommodate the WFs of the new WG that would be mapped
613 // to this SIMD unit
614 vregAvail &= registerManager->
615 canAllocateVgprs(j, numWfsToSched[j], vregDemandPerWI);
616 // find if there are enough free SGPRs in the SIMD's SRF
617 // to accommodate the WFs of the new WG that would be mapped
618 // to this SIMD unit
619 sregAvail &= registerManager->
620 canAllocateSgprs(j, numWfsToSched[j], sregDemandPerWI);
621 }
622 }
623
624 DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, \
625 VGPR Availability = %d, SGPR Availability = %d\n",
626 freeWfSlots, numMappedWfs, vregAvail, sregAvail);
627
628 if (!vregAvail) {
630 }
631
632 if (!sregAvail) {
634 }
635
636 // Return true if enough WF slots to submit workgroup and if there are
637 // enough VGPRs to schedule all WFs to their SIMD units
638 bool ldsAvail = lds.canReserve(task->ldsSize());
639 if (!ldsAvail) {
641 }
642
643 if (!barrier_avail) {
645 }
646
647 // Return true if the following are all true:
648 // (a) all WFs of the WG were mapped to free WF slots
649 // (b) there are enough VGPRs to schedule all WFs to their SIMD units
650 // (c) there are enough SGPRs on the CU to schedule all WFs
651 // (d) there is enough space in LDS to allocate for all WFs
652 bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
653 && ldsAvail && barrier_avail;
654 return can_dispatch;
655}
656
657int
659{
660 auto &wf_barrier = barrierSlot(bar_id);
661 return wf_barrier.numYetToReachBarrier();
662}
663
664bool
666{
667 auto &wf_barrier = barrierSlot(bar_id);
668 return wf_barrier.allAtBarrier();
669}
670
671void
673{
674 auto &wf_barrier = barrierSlot(bar_id);
675 wf_barrier.incNumAtBarrier();
676}
677
678int
680{
681 auto &wf_barrier = barrierSlot(bar_id);
682 return wf_barrier.numAtBarrier();
683}
684
685int
687{
688 auto &wf_barrier = barrierSlot(bar_id);
689 return wf_barrier.maxBarrierCnt();
690}
691
692void
694{
695 auto &wf_barrier = barrierSlot(bar_id);
696 wf_barrier.reset();
697}
698
699void
701{
702 auto &wf_barrier = barrierSlot(bar_id);
703 wf_barrier.decMaxBarrierCnt();
704}
705
706void
708{
709 auto &wf_barrier = barrierSlot(bar_id);
710 wf_barrier.release();
711 freeBarrierIds.insert(bar_id);
712}
713
714void
716{
717 for (int i = 0; i < numVectorALUs; ++i) {
718 for (int j = 0; j < shader->n_wf; ++j) {
719 Wavefront *wf = wfList[i][j];
720 if (wf->barrierId() == bar_id) {
721 assert(wf->getStatus() == Wavefront::S_BARRIER);
723 }
724 }
725 }
726}
727
728// Execute one clock worth of work on the ComputeUnit.
729void
731{
732 // process reads and writes in the RFs
733 for (auto &vecRegFile : vrf) {
734 vecRegFile->exec();
735 }
736
737 for (auto &scRegFile : srf) {
738 scRegFile->exec();
739 }
740
741 // Execute pipeline stages in reverse order to simulate
742 // the pipeline latency
746 execStage.exec();
750
752
753 // Put this CU to sleep if there is no more work to be done.
754 if (!isDone()) {
756 } else {
758 DPRINTF(GPUDisp, "CU%d: Going to sleep\n", cu_id);
759 }
760}
761
762void
764{
765 // Initialize CU Bus models and execution resources
766
767 // Vector ALUs
768 vectorALUs.clear();
769 for (int i = 0; i < numVectorALUs; i++) {
770 vectorALUs.emplace_back(this, clockPeriod());
771 }
772
773 // Scalar ALUs
774 scalarALUs.clear();
775 for (int i = 0; i < numScalarALUs; i++) {
776 scalarALUs.emplace_back(this, clockPeriod());
777 }
778
779 // Vector Global Memory
781 "No support for multiple Global Memory Pipelines exists!!!");
785
786 // Vector Local/Shared Memory
788 "No support for multiple Local Memory Pipelines exists!!!");
792
793 // Scalar Memory
795 "No support for multiple Scalar Memory Pipelines exists!!!");
799
802
805 execStage.init();
807
809}
810
811bool
813{
814 return handleResponse(pkt);
815}
816
817bool
819{
820 // Ruby has completed the memory op. Schedule the mem_resp_event at the
821 // appropriate cycle to process the timing memory response
822 // This delay represents the pipeline delay
823 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
824 PortID index = sender_state->port_index;
825 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
826 GPUDispatcher &dispatcher = computeUnit->shader->dispatcher();
827
828 // MemSyncResp + WriteAckResp are handled completely here and we don't
829 // schedule a MemRespEvent to process the responses further
830 if (pkt->cmd == MemCmd::MemSyncResp) {
831 // This response is for 1 of the following request types:
832 // - kernel launch
833 // - kernel end
834 // - non-kernel mem sync
835
836 // Kernel Launch
837 // wavefront was nullptr when launching kernel, so it is meaningless
838 // here (simdId=-1, wfSlotId=-1)
839 if (gpuDynInst->isKernelLaunch()) {
840 // for kernel launch, the original request must be both kernel-type
841 // and INV_L1
842 assert(pkt->req->isKernel());
843 assert(pkt->req->isInvL1());
844
845 // one D-Cache inv is done, decrement counter
846 dispatcher.updateInvCounter(gpuDynInst->kern_id);
847
848 delete pkt->senderState;
849 delete pkt;
850 return true;
851 }
852
853 // retrieve wavefront from inst
854 Wavefront *w = gpuDynInst->wavefront();
855
856 // Check if we are waiting on Kernel End Flush
857 if (w->getStatus() == Wavefront::S_RETURNING
858 && gpuDynInst->isEndOfKernel()) {
859 // for kernel end, the original request must be both kernel-type
860 // and last-level GPU cache should be flushed if it contains
861 // dirty data. This request may have been quiesced and
862 // immediately responded to if the GL2 is a write-through /
863 // read-only cache.
864 assert(pkt->req->isKernel());
865 assert(pkt->req->isGL2CacheFlush());
866
867 // once flush done, decrement counter, and return whether all
868 // dirty writeback operations are done for the kernel
869 bool isWbDone = dispatcher.updateWbCounter(gpuDynInst->kern_id);
870
871 // not all wbs are done for the kernel, just release pkt
872 // resources
873 if (!isWbDone) {
874 delete pkt->senderState;
875 delete pkt;
876 return true;
877 }
878
879 // all wbs are completed for the kernel, do retirement work
880 // for the workgroup
881 DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
882 computeUnit->cu_id, w->simdId, w->wfSlotId,
883 w->wfDynId, w->wgId);
884
885 dispatcher.notifyWgCompl(w);
886 w->setStatus(Wavefront::S_STOPPED);
887 }
888
889 if (!pkt->req->isKernel()) {
890 w = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
891 DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing "
892 "outstanding reqs %d => %d\n", gpuDynInst->simdId,
893 gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
894 gpuDynInst->disassemble(), w->outstandingReqs,
895 w->outstandingReqs - 1);
896 computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
897 }
898
899 delete pkt->senderState;
900 delete pkt;
901 return true;
902 }
903
904 EventFunctionWrapper *mem_resp_event =
905 computeUnit->memPort[index].createMemRespEvent(pkt);
906
907 DPRINTF(GPUPort,
908 "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
909 computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
910 gpuDynInst->seqNum(), index, pkt->req->getPaddr());
911
912 computeUnit->schedule(mem_resp_event,
913 curTick() + computeUnit->resp_tick_latency);
914
915 return true;
916}
917
918bool
920{
921 return handleResponse(pkt);
922}
923
924bool
926{
927 assert(!pkt->req->isKernel());
928
929 // retrieve sender state
930 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
931 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
932
933 assert(pkt->isRead() || pkt->isWrite());
934 assert(gpuDynInst->numScalarReqs > 0);
935
936 gpuDynInst->numScalarReqs--;
937
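    // once every outstanding scalar request of this instruction has
    // returned, hand the instruction to the scalar memory pipeline's load
    // or store response FIFO so it can complete.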
946 if (!gpuDynInst->numScalarReqs) {
947 if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
948 computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
949 gpuDynInst);
950 } else {
951 computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
952 gpuDynInst);
953 }
954 }
955
956 delete pkt->senderState;
957 delete pkt;
958
959 return true;
960}
961
962void
964{
965 for (const auto &pkt : retries) {
966 if (!sendTimingReq(pkt)) {
967 break;
968 } else {
969 retries.pop_front();
970 }
971 }
972}
973
974void
976{
977 int len = retries.size();
978
979 assert(len > 0);
980
981 for (int i = 0; i < len; ++i) {
982 PacketPtr pkt = retries.front().first;
983 [[maybe_unused]] GPUDynInstPtr gpuDynInst = retries.front().second;
984 DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
985 computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
986 pkt->req->getPaddr());
987
991 if (!sendTimingReq(pkt)) {
992 DPRINTF(GPUMem, "failed again!\n");
993 break;
994 } else {
995 DPRINTF(GPUMem, "successful!\n");
996 retries.pop_front();
997 }
998 }
999}
1000
1001bool
1003{
1004 computeUnit->handleSQCReturn(pkt);
1005
1006 return true;
1007}
1008
1009void
1011{
1013}
1014
1015void
1017{
1018 int len = retries.size();
1019
1020 assert(len > 0);
1021
1022 for (int i = 0; i < len; ++i) {
1023 PacketPtr pkt = retries.front().first;
1024 [[maybe_unused]] Wavefront *wavefront = retries.front().second;
1025 DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
1026 computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
1027 pkt->req->getPaddr());
1028 if (!sendTimingReq(pkt)) {
1029 DPRINTF(GPUFetch, "failed again!\n");
1030 break;
1031 } else {
1032 DPRINTF(GPUFetch, "successful!\n");
1033 retries.pop_front();
1034 }
1035 }
1036}
1037
1038void
1040{
1041 // There must be a way around this check to do the globalMemStart...
1042 Addr tmp_vaddr = pkt->req->getVaddr();
1043
1044 updatePageDivergenceDist(tmp_vaddr);
1045
1046 // set PC in request
1047 pkt->req->setPC(gpuDynInst->wavefront()->pc());
1048
1049 pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());
1050
1051 // figure out the type of the request to set read/write
1052 BaseMMU::Mode TLB_mode;
1053 assert(pkt->isRead() || pkt->isWrite());
1054
1055 // only do some things if actually accessing data
1056 bool isDataAccess = pkt->isWrite() || pkt->isRead();
1057
1058 // For dGPUs, real hardware will extract MTYPE from the PTE. SE mode
1059 // uses x86 pagetables which don't have fields to track GPU MTYPEs.
1060 // Rather than hacking up the pagetable to add these bits in, we just
1061 // keep a structure local to our GPUs that are populated in our
1062 // emulated driver whenever memory is allocated. Consult that structure
1063 // here in case we need a memtype override.
1064 //
1065 // In full system mode these can be extracted from the PTE and assigned
1066 // after address translation takes place.
1067 if (!FullSystem) {
1069 }
1070
1071 // Check write before read for atomic operations
1072 // since atomic operations should use BaseMMU::Write
1073 if (pkt->isWrite()) {
1074 TLB_mode = BaseMMU::Write;
1075 } else if (pkt->isRead()) {
1076 TLB_mode = BaseMMU::Read;
1077 } else {
1078 fatal("pkt is neither a read nor a write\n");
1079 }
1080
1081 if (!functionalTLB) {
1082 stats.tlbCycles -= curTick();
1083 }
1085
1086 PortID tlbPort_index = perLaneTLB ? index : 0;
1087
1088 if (shader->timingSim) {
1089 if (!FullSystem && debugSegFault) {
1090 Process *p = shader->gpuTc->getProcessPtr();
1091 Addr vaddr = pkt->req->getVaddr();
1092 unsigned size = pkt->getSize();
1093
1094 if ((vaddr + size - 1) % 64 < vaddr % 64) {
1095 panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
1096 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
1097 }
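    // the check above flags any access that straddles a 64-byte boundary:
    // e.g., vaddr 60 with size 8 ends at byte 67, so (vaddr+size-1)%64 == 3
    // is smaller than vaddr%64 == 60 and the panic fires.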
1098
1099 Addr paddr;
1100
1101 if (!p->pTable->translate(vaddr, paddr)) {
1102 if (!p->fixupFault(vaddr)) {
1103 panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
1104 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1105 vaddr);
1106 }
1107 }
1108 }
1109
1110 // This is the SenderState needed upon return
1111 pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);
1112
1113 // This is the senderState needed by the TLB hierarchy to function
1114 GpuTranslationState *translation_state =
1115 new GpuTranslationState(TLB_mode, shader->gpuTc, false,
1116 pkt->senderState);
1117
1118 pkt->senderState = translation_state;
1119
1120 if (functionalTLB) {
1121 tlbPort[tlbPort_index].sendFunctional(pkt);
1122
1123 // update the hitLevel distribution
1124 int hit_level = translation_state->hitLevel;
1125 assert(hit_level != -1);
1126 stats.hitsPerTLBLevel[hit_level]++;
1127
1128 // New SenderState for the memory access
1129 GpuTranslationState *sender_state =
1130 safe_cast<GpuTranslationState*>(pkt->senderState);
1131
1132 delete sender_state->tlbEntry;
1133 delete sender_state->saved;
1134 delete sender_state;
1135
1136 assert(pkt->req->hasPaddr());
1137 assert(pkt->req->hasSize());
1138
1139 // this is necessary because the GPU TLB receives packets instead
1140 // of requests. When the translation is complete, all relevant
1141 // fields in the request will be populated, but not in the packet.
1142 // here we create the new packet so we can set the size, addr,
1143 // and proper flags.
1144 PacketPtr oldPkt = pkt;
1145 pkt = new Packet(oldPkt->req, oldPkt->cmd);
1146 if (isDataAccess) {
1147 uint8_t *tmpData = oldPkt->getPtr<uint8_t>();
1148 pkt->dataStatic(tmpData);
1149 }
1150 delete oldPkt;
1151
1152
1153 // New SenderState for the memory access
1154 pkt->senderState =
1156 nullptr);
1157
1158 gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
1159 gpuDynInst->tlbHitLevel[index] = hit_level;
1160
1161 // translation is done. Schedule the mem_req_event at the
1162 // appropriate cycle to send the timing memory request to ruby
1163 EventFunctionWrapper *mem_req_event =
1164 memPort[index].createMemReqEvent(pkt);
1165
1166 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
1167 "scheduled\n", cu_id, gpuDynInst->simdId,
1168 gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
1169
1170 schedule(mem_req_event, curTick() + req_tick_latency);
1171 } else if (tlbPort[tlbPort_index].isStalled()) {
1172 assert(tlbPort[tlbPort_index].retries.size() > 0);
1173
1174 DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
1175 "failed!\n", cu_id, gpuDynInst->simdId,
1176 gpuDynInst->wfSlotId, tmp_vaddr);
1177
1178 tlbPort[tlbPort_index].retries.push_back(pkt);
1179 } else if (!tlbPort[tlbPort_index].sendTimingReq(pkt)) {
1180 // Stall the data port;
1181 // no more packets will be issued until
1182 // Ruby indicates resources are freed by
1183 // a recvReqRetry() callback on this port.
1184 tlbPort[tlbPort_index].stallPort();
1185
1186 DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
1187 "failed!\n", cu_id, gpuDynInst->simdId,
1188 gpuDynInst->wfSlotId, tmp_vaddr);
1189
1190 tlbPort[tlbPort_index].retries.push_back(pkt);
1191 } else {
1192 DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x from "
1193 "instruction %s sent!\n", cu_id, gpuDynInst->simdId,
1194 gpuDynInst->wfSlotId, tmp_vaddr,
1195 gpuDynInst->disassemble().c_str());
1196 }
1197 } else {
1198 if (pkt->cmd == MemCmd::MemSyncReq) {
1199 gpuDynInst->resetEntireStatusVector();
1200 } else {
1201 gpuDynInst->decrementStatusVector(index);
1202 }
1203
1204 // New SenderState for the memory access
1205 delete pkt->senderState;
1206
1207 // Because it's atomic operation, only need TLB translation state
1208 pkt->senderState = new GpuTranslationState(TLB_mode,
1209 shader->gpuTc);
1210
1211 tlbPort[tlbPort_index].sendFunctional(pkt);
1212
1213 // the packet's address is not modified by translation, so we need to
1214 // create a new packet; otherwise the memory access would use the old
1215 // virtual address sent in the translation packet instead of the
1216 // physical address returned by the translation.
1217 PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
1218 new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1219
1220 // Translation is done. It is safe to send the packet to memory.
1221 memPort[0].sendFunctional(new_pkt);
1222
1223 DPRINTF(GPUMem, "Functional sendRequest\n");
1224 DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
1225 gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
1226 new_pkt->req->getPaddr());
1227
1228 // safe_cast the senderState
1229 GpuTranslationState *sender_state =
1230 safe_cast<GpuTranslationState*>(pkt->senderState);
1231
1232 delete sender_state->tlbEntry;
1233 delete new_pkt;
1234 delete pkt->senderState;
1235 delete pkt;
1236 }
1237}
1238
1239void
1241{
1242 assert(pkt->isWrite() || pkt->isRead());
1243
1244 BaseMMU::Mode tlb_mode = pkt->isRead() ? BaseMMU::Read : BaseMMU::Write;
1245
1246 pkt->senderState =
1248
1249 pkt->senderState =
1250 new GpuTranslationState(tlb_mode, shader->gpuTc, false,
1251 pkt->senderState);
1252
1253 if (scalarDTLBPort.isStalled()) {
1254 assert(scalarDTLBPort.retries.size());
1255 scalarDTLBPort.retries.push_back(pkt);
1256 } else if (!scalarDTLBPort.sendTimingReq(pkt)) {
1258 scalarDTLBPort.retries.push_back(pkt);
1259 } else {
1260 DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n",
1261 tlb_mode == BaseMMU::Read ? "read" : "write",
1262 pkt->req->getVaddr());
1263 }
1264}
1265
1266void
1268 bool kernelMemSync,
1269 RequestPtr req)
1270{
1271 assert(gpuDynInst->isGlobalSeg() ||
1272 gpuDynInst->executedAs() == enums::SC_GLOBAL);
1273
1274 // Fences will never be issued to system memory, so we can mark the
1275 // requestor as a device memory ID here.
1276 if (!req) {
1277 req = std::make_shared<Request>(
1278 0, 0, 0, vramRequestorId(), 0, gpuDynInst->wfDynId);
1279 } else {
1280 req->requestorId(vramRequestorId());
1281 }
1282
1283 // all mem sync requests have Paddr == 0
1284 req->setPaddr(0);
1285
1286 PacketPtr pkt = nullptr;
1287
1288 if (kernelMemSync) {
1289 if (gpuDynInst->isKernelLaunch()) {
1290 req->setCacheCoherenceFlags(Request::INV_L1);
1291 req->setReqInstSeqNum(gpuDynInst->seqNum());
1292 req->setFlags(Request::KERNEL);
1293 pkt = new Packet(req, MemCmd::MemSyncReq);
1294 pkt->pushSenderState(
1295 new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1296
1297 EventFunctionWrapper *mem_req_event =
1298 memPort[0].createMemReqEvent(pkt);
1299
1300 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1301 "an acquire\n", cu_id, gpuDynInst->simdId,
1302 gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
1303
1304 schedule(mem_req_event, curTick() + req_tick_latency);
1305 } else {
1306 // kernel end flush of GL2 cache may be quiesced by Ruby if the
1307 // GL2 is a read-only cache
1308 assert(shader->impl_kern_end_rel);
1309 assert(gpuDynInst->isEndOfKernel());
1310
1311 req->setCacheCoherenceFlags(Request::FLUSH_L2);
1312 req->setReqInstSeqNum(gpuDynInst->seqNum());
1313 req->setFlags(Request::KERNEL);
1314 pkt = new Packet(req, MemCmd::MemSyncReq);
1315 pkt->pushSenderState(
1316 new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1317
1318 EventFunctionWrapper *mem_req_event =
1319 memPort[0].createMemReqEvent(pkt);
1320
1321 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1322 "a release\n", cu_id, gpuDynInst->simdId,
1323 gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
1324
1325 schedule(mem_req_event, curTick() + req_tick_latency);
1326 }
1327 } else {
1328 gpuDynInst->setRequestFlags(req);
1329
1330 req->setReqInstSeqNum(gpuDynInst->seqNum());
1331
1332 pkt = new Packet(req, MemCmd::MemSyncReq);
1333 pkt->pushSenderState(
1334 new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1335
1336 EventFunctionWrapper *mem_req_event =
1337 memPort[0].createMemReqEvent(pkt);
1338
1339 DPRINTF(GPUPort,
1340 "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
1341 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
1342 pkt->req->getPaddr());
1343
1344 schedule(mem_req_event, curTick() + req_tick_latency);
1345 }
1346}
1347
1348void
1350{
1351 DataPort::SenderState *sender_state =
1352 safe_cast<DataPort::SenderState*>(pkt->senderState);
1353
1354 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1355 ComputeUnit *compute_unit = computeUnit;
1356
1357 assert(gpuDynInst);
1358
1359 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
1360 compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1361 pkt->req->getPaddr(), id);
1362
1363 Addr paddr = pkt->req->getPaddr();
1364
1365 // mem sync resp callback must be handled already in
1366 // DataPort::recvTimingResp
1367 assert(pkt->cmd != MemCmd::MemSyncResp);
1368
1369 // The status vector and global memory response for WriteResp packets get
1370 // handled by the WriteCompleteResp packets.
1371 if (pkt->cmd == MemCmd::WriteResp) {
1372 if (!FullSystem || !pkt->req->systemReq()) {
1373 delete pkt;
1374 return;
1375 }
1376 }
1377
1378 // this is for read, write and atomic
1379 int index = gpuDynInst->memStatusVector[paddr].back();
1380
1381 DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
1382 pkt->req->getPaddr(), id);
1383
1384 gpuDynInst->memStatusVector[paddr].pop_back();
1385 gpuDynInst->pAddr = pkt->req->getPaddr();
1386
1387 gpuDynInst->decrementStatusVector(index);
1388 DPRINTF(GPUMem, "bitvector is now %s\n", gpuDynInst->printStatusVector());
1389
1390 if (gpuDynInst->allLanesZero()) {
1391 auto iter = gpuDynInst->memStatusVector.begin();
1392 auto end = gpuDynInst->memStatusVector.end();
1393
1394 while (iter != end) {
1395 assert(iter->second.empty());
1396 ++iter;
1397 }
1398
1399 // Calculate the difference between the arrival of the first cache
1400 // block and the last cache block to arrive if we have the time
1401 // for the first cache block.
1402 if (compute_unit->headTailMap.count(gpuDynInst)) {
1403 Tick headTick = compute_unit->headTailMap.at(gpuDynInst);
1404 compute_unit->stats.headTailLatency.sample(curTick() - headTick);
1405 compute_unit->headTailMap.erase(gpuDynInst);
1406 }
1407
1408 gpuDynInst->memStatusVector.clear();
1409
1410 gpuDynInst->
1411 profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue);
1412 compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
1413
1414 DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
1415 compute_unit->cu_id, gpuDynInst->simdId,
1416 gpuDynInst->wfSlotId);
1417 } else {
1418 if (pkt->isRead()) {
1419 if (!compute_unit->headTailMap.count(gpuDynInst)) {
1420 compute_unit->headTailMap
1421 .insert(std::make_pair(gpuDynInst, curTick()));
1422 }
1423 }
1424 }
1425
1426 delete pkt->senderState;
1427 delete pkt;
1428}
1429
1430bool
1432{
1433 Addr line = pkt->req->getPaddr();
1434
1435 DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
1436 pkt->req->getVaddr(), line);
1437
1438 assert(pkt->senderState);
1439 computeUnit->stats.tlbCycles += curTick();
1440
1441 // pop off the TLB translation state
1442 GpuTranslationState *translation_state =
1443 safe_cast<GpuTranslationState*>(pkt->senderState);
1444
1445 // no PageFaults are permitted for data accesses
1446 if (!translation_state->tlbEntry) {
1447 DTLBPort::SenderState *sender_state =
1448 safe_cast<DTLBPort::SenderState*>(translation_state->saved);
1449
1450 [[maybe_unused]] Wavefront *w =
1451 computeUnit->wfList[sender_state->_gpuDynInst->simdId]
1452 [sender_state->_gpuDynInst->wfSlotId];
1453
1454 DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
1455 pkt->req->getVaddr());
1456 }
1457
1458 // update the hitLevel distribution
1459 int hit_level = translation_state->hitLevel;
1460 computeUnit->stats.hitsPerTLBLevel[hit_level]++;
1461
1462 delete translation_state->tlbEntry;
1463 assert(!translation_state->ports.size());
1464 pkt->senderState = translation_state->saved;
1465
1466 // for prefetch pkt
1467 BaseMMU::Mode TLB_mode = translation_state->tlbMode;
1468
1469 delete translation_state;
1470
1471 // use the original sender state to know how to close this transaction
1472 DTLBPort::SenderState *sender_state =
1473 safe_cast<DTLBPort::SenderState*>(pkt->senderState);
1474
1475 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1476 PortID mp_index = sender_state->portIndex;
1477 Addr vaddr = pkt->req->getVaddr();
1478 gpuDynInst->memStatusVector[line].push_back(mp_index);
1479 gpuDynInst->tlbHitLevel[mp_index] = hit_level;
1480
1481 MemCmd requestCmd;
1482
1483 if (pkt->cmd == MemCmd::ReadResp) {
1484 requestCmd = MemCmd::ReadReq;
1485 } else if (pkt->cmd == MemCmd::WriteResp) {
1486 requestCmd = MemCmd::WriteReq;
1487 } else if (pkt->cmd == MemCmd::SwapResp) {
1488 requestCmd = MemCmd::SwapReq;
1489 } else {
1490 panic("unsupported response to request conversion %s\n",
1491 pkt->cmd.toString());
1492 }
1493
1494 if (computeUnit->prefetchDepth) {
1495 int simdId = gpuDynInst->simdId;
1496 int wfSlotId = gpuDynInst->wfSlotId;
1497 Addr last = 0;
1498
1499 switch(computeUnit->prefetchType) {
1500 case enums::PF_CU:
1501 last = computeUnit->lastVaddrCU[mp_index];
1502 break;
1503 case enums::PF_PHASE:
1504 last = computeUnit->lastVaddrSimd[simdId][mp_index];
1505 break;
1506 case enums::PF_WF:
1507 last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
1508 default:
1509 break;
1510 }
1511
1512 DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
1513 computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
1514
1515 int stride = last ? (roundDown(vaddr, X86ISA::PageBytes) -
1516 roundDown(last, X86ISA::PageBytes)) >> X86ISA::PageShift
1517 : 0;
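        // the stride is expressed in page-sized units (x86 4 KiB pages):
        // the distance between the page of the current vaddr and the page
        // of the previous access, or 0 when there is no history.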
1518
1519 DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
1520
1521 computeUnit->lastVaddrCU[mp_index] = vaddr;
1522 computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
1523 computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
1524
1525 stride = (computeUnit->prefetchType == enums::PF_STRIDE) ?
1526 computeUnit->prefetchStride: stride;
1527
1528 DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
1529 computeUnit->cu_id, simdId, wfSlotId, mp_index);
1530
1531 DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);
1532
1533 // Prefetch Next few pages atomically
1534 for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
1535 DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
1536 vaddr + stride * pf * X86ISA::PageBytes);
1537
1538 if (!stride)
1539 break;
1540
1541 RequestPtr prefetch_req = std::make_shared<Request>(
1542 vaddr + stride * pf * X86ISA::PageBytes,
1543 sizeof(uint8_t), 0,
1544 computeUnit->requestorId(),
1545 0, 0, nullptr);
1546
1547 PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
1548 uint8_t foo = 0;
1549 prefetch_pkt->dataStatic(&foo);
1550
1551 // Because it's atomic operation, only need TLB translation state
1552 prefetch_pkt->senderState =
1553 new GpuTranslationState(TLB_mode,
1554 computeUnit->shader->gpuTc, true);
1555
1556 // Currently prefetches are zero-latency, hence the sendFunctional
1557 sendFunctional(prefetch_pkt);
1558
1559 /* safe_cast the senderState */
1560 GpuTranslationState *tlb_state =
1561 safe_cast<GpuTranslationState*>(
1562 prefetch_pkt->senderState);
1563
1564
1565 delete tlb_state->tlbEntry;
1566 delete tlb_state;
1567 delete prefetch_pkt;
1568 }
1569 }
1570
1571 // First we must convert the response cmd back to a request cmd so that
1572 // the request can be sent through the cu's request port
1573 PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
1574 new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1575 delete pkt->senderState;
1576 delete pkt;
1577
1578 // New SenderState for the memory access
1579 new_pkt->senderState =
1580 new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
1581 nullptr);
1582
1583 // Set VRAM ID for device requests
1584 // For now, system vmem requests use functional reads. This is not that
1585 // critical to model as the region of interest should always be accessing
1586 // device memory. System vmem requests are used by blit kernels to do
1587 // memcpys and load code objects into device memory.
1588 if (new_pkt->req->systemReq()) {
1589 // There will be multiple packets returned for the same gpuDynInst,
1590 // so first check if systemReq is not already set and if so, return
1591 // the token acquired when the dispatch list is filled as system
1592 // requests do not require a GPU coalescer token.
1593 if (!gpuDynInst->isSystemReq()) {
1594 computeUnit->getTokenManager()->recvTokens(1);
1595 gpuDynInst->setSystemReq();
1596 }
1597 } else {
1598 new_pkt->req->requestorId(computeUnit->vramRequestorId());
1599 }
1600
1601 // translation is done. Schedule the mem_req_event at the appropriate
1602 // cycle to send the timing memory request to ruby
1603 EventFunctionWrapper *mem_req_event =
1604 computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);
1605
1606 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
1607 computeUnit->cu_id, gpuDynInst->simdId,
1608 gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());
1609
1610 computeUnit->schedule(mem_req_event, curTick() +
1611 computeUnit->req_tick_latency);
1612
1613 return true;
1614}
1615
1618{
1619 return new EventFunctionWrapper(
1620 [this, pkt]{ processMemReqEvent(pkt); },
1621 "ComputeUnit memory request event", true);
1622}
1623
1626{
1627 return new EventFunctionWrapper(
1628 [this, pkt]{ processMemRespEvent(pkt); },
1629 "ComputeUnit memory response event", true);
1630}
1631
1632void
1634{
1635 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1636 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1637 [[maybe_unused]] ComputeUnit *compute_unit = computeUnit;
1638
1639 if (pkt->req->systemReq()) {
1640 assert(compute_unit->shader->systemHub);
1641 SystemHubEvent *resp_event = new SystemHubEvent(pkt, this);
1642 compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
1643 } else if (!(sendTimingReq(pkt))) {
1644 retries.push_back(std::make_pair(pkt, gpuDynInst));
1645
1646 DPRINTF(GPUPort,
1647 "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
1648 compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1649 id, pkt->req->getPaddr());
1650 } else {
1651 DPRINTF(GPUPort,
1652 "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data "
1653 "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1654 gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id,
1655 pkt->req->getPaddr());
1656 }
1657}
1658
1659const char*
1661{
1662 return "ComputeUnit scalar memory request event";
1663}
1664
1665void
1667{
1668 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1669 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1670 [[maybe_unused]] ComputeUnit *compute_unit = scalarDataPort.computeUnit;
1671
1672 if (pkt->req->systemReq()) {
1673 assert(compute_unit->shader->systemHub);
1674 SystemHubEvent *resp_event = new SystemHubEvent(pkt, &scalarDataPort);
1675 compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
1676 } else if (!(scalarDataPort.sendTimingReq(pkt))) {
1677 scalarDataPort.retries.push_back(pkt);
1678
1679 DPRINTF(GPUPort,
1680 "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
1681 compute_unit->cu_id, gpuDynInst->simdId,
1682 gpuDynInst->wfSlotId, pkt->req->getPaddr());
1683 } else {
1684 DPRINTF(GPUPort,
1685 "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data "
1686 "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1687 gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
1688 pkt->req->getPaddr());
1689 }
1690}
1691
1692/*
1693 * The initial translation request could have been rejected,
1694 * if <retries> queue is not empty. Retry sending the translation
1695 * request. sendRetry() is called from the peer port whenever
1696 * a translation completes.
1697 */
1698void
1700{
1701 int len = retries.size();
1702
1703 DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
1704 computeUnit->cu_id, len);
1705
1706 assert(len > 0);
1707 assert(isStalled());
1708 // recvReqRetry is an indication that the resource this port was
1709 // stalling on has been freed. So, remove the stall first
1710 unstallPort();
1711
1712 for (int i = 0; i < len; ++i) {
1713 PacketPtr pkt = retries.front();
1714 [[maybe_unused]] Addr vaddr = pkt->req->getVaddr();
1715 DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x", computeUnit->cu_id, vaddr);
1716
1717 if (!sendTimingReq(pkt)) {
1718 // Stall port
1719 stallPort();
1720 DPRINTF(GPUTLB, ": failed again\n");
1721 break;
1722 } else {
1723 DPRINTF(GPUTLB, ": successful\n");
1724 retries.pop_front();
1725 }
1726 }
1727}
1728
1729bool
1731{
1732 assert(pkt->senderState);
1733
1734 GpuTranslationState *translation_state =
1735 safe_cast<GpuTranslationState*>(pkt->senderState);
1736
1737 // Page faults are not allowed
1738 fatal_if(!translation_state->tlbEntry,
1739 "Translation of vaddr %#x failed\n", pkt->req->getVaddr());
1740
1741 delete translation_state->tlbEntry;
1742 assert(!translation_state->ports.size());
1743
1744 pkt->senderState = translation_state->saved;
1745 delete translation_state;
1746
1747 ScalarDTLBPort::SenderState *sender_state =
1748 safe_cast<ScalarDTLBPort::SenderState*>(pkt->senderState);
1749
1750 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1751 delete pkt->senderState;
1752
1753 [[maybe_unused]] Wavefront *w = gpuDynInst->wavefront();
1754
1755 DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
1756 "translation: PA %#x -> %#x\n", computeUnit->cu_id, w->simdId,
1757 w->wfSlotId, w->kernId, pkt->req->getVaddr(), pkt->req->getPaddr());
1758
1759 MemCmd mem_cmd;
1760
1761 if (pkt->cmd == MemCmd::ReadResp) {
1762 mem_cmd = MemCmd::ReadReq;
1763 } else if (pkt->cmd == MemCmd::WriteResp) {
1764 mem_cmd = MemCmd::WriteReq;
1765 } else {
1766 fatal("Scalar DTLB received unexpected MemCmd response %s\n",
1767 pkt->cmd.toString());
1768 }
1769
1770 PacketPtr req_pkt = new Packet(pkt->req, mem_cmd);
1771 req_pkt->dataStatic(pkt->getPtr<uint8_t>());
1772 delete pkt;
1773
1774 req_pkt->senderState =
1776
1777 // For a system request we want to mark the GPU instruction as a system
1778 // load/store so that after the request is issued to system memory we can
1779 // return any token acquired for the request. Since tokens are returned
1780 // by the coalescer and system requests do not take that path, this needs
1781 // to be tracked.
1782 //
1783 // Device requests change the requestor ID to something in the device
1784 // memory Ruby network.
1785 if (req_pkt->req->systemReq()) {
1786 gpuDynInst->setSystemReq();
1787 } else {
1788 req_pkt->req->requestorId(computeUnit->vramRequestorId());
1789 }
1790
1791 ComputeUnit::ScalarDataPort::MemReqEvent *scalar_mem_req_event
1793 (computeUnit->scalarDataPort, req_pkt);
1794 computeUnit->schedule(scalar_mem_req_event, curTick() +
1795 computeUnit->scalar_req_tick_latency);
1796
1797 return true;
1798}
1799
1800bool
1802{
1803 [[maybe_unused]] Addr line = pkt->req->getPaddr();
1804 DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
1805 computeUnit->cu_id, pkt->req->getVaddr(), line);
1806
1807 assert(pkt->senderState);
1808
1809 // pop off the TLB translation state
1810 GpuTranslationState *translation_state
1811 = safe_cast<GpuTranslationState*>(pkt->senderState);
1812
1813 bool success = translation_state->tlbEntry != nullptr;
1814 delete translation_state->tlbEntry;
1815 assert(!translation_state->ports.size());
1816 pkt->senderState = translation_state->saved;
1817 delete translation_state;
1818
1819 // use the original sender state to know how to close this transaction
1820 ITLBPort::SenderState *sender_state =
1821 safe_cast<ITLBPort::SenderState*>(pkt->senderState);
1822
1823 // get the wavefront associated with this translation request
1824 Wavefront *wavefront = sender_state->wavefront;
1825 delete pkt->senderState;
1826
1827 if (success) {
1828 // pkt is reused in fetch(), don't delete it here. However, we must
1829 // reset the command to be a request so that it can be sent through
1830 // the cu's request port
1831 assert(pkt->cmd == MemCmd::ReadResp);
1832 pkt->cmd = MemCmd::ReadReq;
1833
1834 computeUnit->fetchStage.fetch(pkt, wavefront);
1835 } else {
1836 if (wavefront->dropFetch) {
1837 assert(wavefront->instructionBuffer.empty());
1838 wavefront->dropFetch = false;
1839 }
1840
1841 wavefront->pendingFetch = 0;
1842 }
1843
1844 return true;
1845}
1846
1847/*
1848 * The initial translation request could have been rejected, if
1849 * <retries> queue is not empty. Retry sending the translation
1850 * request. sendRetry() is called from the peer port whenever
1851 * a translation completes.
1852 */
1853void
1855{
1856
1857 int len = retries.size();
1858 DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n", computeUnit->cu_id, len);
1859
1860 assert(len > 0);
1861 assert(isStalled());
1862
1863 // recvReqRetry is an indication that the resource this port was
1864 // stalling on has been freed. So, remove the stall first
1865 unstallPort();
1866
1867 for (int i = 0; i < len; ++i) {
1868 PacketPtr pkt = retries.front();
1869 [[maybe_unused]] Addr vaddr = pkt->req->getVaddr();
1870 DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x", computeUnit->cu_id, vaddr);
1871
1872 if (!sendTimingReq(pkt)) {
1873 stallPort(); // Stall port
1874 DPRINTF(GPUTLB, ": failed again\n");
1875 break;
1876 } else {
1877 DPRINTF(GPUTLB, ": successful\n");
1878 retries.pop_front();
1879 }
1880 }
1881}
1882
1883void
1885{
1886 if (gpuDynInst->isScalar()) {
1887 if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
1888 stats.sALUInsts++;
1890 } else if (gpuDynInst->isLoad()) {
1892 } else if (gpuDynInst->isStore()) {
1894 }
1895 } else {
1896 if (gpuDynInst->isALU()) {
1899 exitSimLoop("max vALU insts");
1900 }
1901 stats.vALUInsts++;
1904 += gpuDynInst->wavefront()->execMask().count();
1905 } else if (gpuDynInst->isFlat()) {
1906 if (gpuDynInst->isLocalMem()) {
1908 } else {
1910 }
1911 } else if (gpuDynInst->isFlatGlobal()) {
1913 } else if (gpuDynInst->isLocalMem()) {
1915 } else if (gpuDynInst->isLoad()) {
1917 } else if (gpuDynInst->isStore()) {
1919 }
1920
1921 if (gpuDynInst->isLoad()) {
1922 switch (gpuDynInst->executedAs()) {
1923 case enums::SC_SPILL:
1924 stats.spillReads++;
1925 break;
1926 case enums::SC_GLOBAL:
1928 break;
1929 case enums::SC_GROUP:
1930 stats.groupReads++;
1931 break;
1932 case enums::SC_PRIVATE:
1933 stats.privReads++;
1934 break;
1935 case enums::SC_READONLY:
1937 break;
1938 case enums::SC_KERNARG:
1940 break;
1941 case enums::SC_ARG:
1942 stats.argReads++;
1943 break;
1944 case enums::SC_NONE:
1949 break;
1950 default:
1951 fatal("%s has no valid segment\n", gpuDynInst->disassemble());
1952 break;
1953 }
1954 } else if (gpuDynInst->isStore()) {
1955 switch (gpuDynInst->executedAs()) {
1956 case enums::SC_SPILL:
1958 break;
1959 case enums::SC_GLOBAL:
1961 break;
1962 case enums::SC_GROUP:
1964 break;
1965 case enums::SC_PRIVATE:
1966 stats.privWrites++;
1967 break;
1968 case enums::SC_READONLY:
1970 break;
1971 case enums::SC_KERNARG:
1973 break;
1974 case enums::SC_ARG:
1975 stats.argWrites++;
1976 break;
1977 case enums::SC_NONE:
1982 break;
1983 default:
1984 fatal("%s has no valid segment\n", gpuDynInst->disassemble());
1985 break;
1986 }
1987 }
1988 }
1989}
1990
1991void
1993{
1994 Addr virt_page_addr = roundDown(addr, X86ISA::PageBytes);
1995
1996 if (!pagesTouched.count(virt_page_addr))
1997 pagesTouched[virt_page_addr] = 1;
1998 else
1999 pagesTouched[virt_page_addr]++;
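    // pagesTouched counts how many of the current instruction's accesses
    // fall on each 4 KiB page; presumably this feeds the per-page access
    // table that exitCallback() writes out when countPages is set.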
2000}
2001
2002void
2004{
2005 if (countPages) {
2006 std::ostream *page_stat_file = simout.create(name().c_str())->stream();
2007
2008 *page_stat_file << "page, wavefront accesses, workitem accesses" <<
2009 std::endl;
2010
2011 for (auto iter : pageAccesses) {
2012 *page_stat_file << std::hex << iter.first << ",";
2013 *page_stat_file << std::dec << iter.second.first << ",";
2014 *page_stat_file << std::dec << iter.second.second << std::endl;
2015 }
2016 }
2017}
2018
2019bool
2020ComputeUnit::isDone() const
2021{
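 // The CU is treated as done only when every vector ALU is idle and the
 // register-file-to-memory-pipeline buses below report ready.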
2022 for (int i = 0; i < numVectorALUs; ++i) {
2023 if (!isVectorAluIdle(i)) {
2024 return false;
2025 }
2026 }
2027
2028 // TODO: FIXME if more than 1 of any memory pipe supported
2029 if (!srfToScalarMemPipeBus.rdy()) {
2030 return false;
2031 }
2032 if (!vrfToGlobalMemPipeBus.rdy()) {
2033 return false;
2034 }
2035 if (!vrfToLocalMemPipeBus.rdy()) {
2036 return false;
2037 }
2038
2043 return false;
2044 }
2045
2046 return true;
2047}
2048
2049int32_t
2050ComputeUnit::getRefCounter(const uint32_t dispatchId,
2051 const uint32_t wgId) const
2052{
2053 return lds.getRefCounter(dispatchId, wgId);
2054}
2055
2056bool
2057ComputeUnit::isVectorAluIdle(uint32_t simdId) const
2058{
2059 assert(simdId < numVectorALUs);
2060
2061 for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){
2062 if (wfList[simdId][i_wf]->getStatus() != Wavefront::S_STOPPED) {
2063 return false;
2064 }
2065 }
2066
2067 return true;
2068}
2069
2075bool
2076ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
2077{
2078 // this is just a request to carry the GPUDynInstPtr
2079 // back and forth
2080 RequestPtr newRequest = std::make_shared<Request>();
2081 newRequest->setPaddr(0x0);
2082
2083 // ReadReq is not evaluated by the LDS but the Packet ctor requires this
2084 PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq);
2085
2086 // This is the SenderState needed upon return
2087 newPacket->senderState = new LDSPort::SenderState(gpuDynInst);
2088
2089 return ldsPort.sendTimingReq(newPacket);
2090}
2091
2092/**
2093 * Forward the VRAM requestor ID needed for device memory from shader.
2094 */
2095RequestorID
2096ComputeUnit::vramRequestorId()
2097{
2098 return shader->vramRequestorId();
2099}
2100
2101/**
2102 * get the result of packets sent to the LDS when they return
2103 */
2104bool
2105ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
2106{
2107 const ComputeUnit::LDSPort::SenderState *senderState =
2108 dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState);
2109
2110 fatal_if(!senderState, "did not get the right sort of sender state");
2111
2112 GPUDynInstPtr gpuDynInst = senderState->getMemInst();
2113
2114 delete packet->senderState;
2115 delete packet;
2116
2117 computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
2118 return true;
2119}
2120
2126bool
2127ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
2128{
2129 ComputeUnit::LDSPort::SenderState *sender_state =
2130 dynamic_cast<ComputeUnit::LDSPort::SenderState *>(pkt->senderState);
2131 fatal_if(!sender_state, "packet without a valid sender state");
2132
2133 [[maybe_unused]] GPUDynInstPtr gpuDynInst = sender_state->getMemInst();
2134
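 // Three outcomes: if the port is already stalled, just queue the packet;
 // if the send is rejected, stall the port and queue it; otherwise the
 // request went out successfully.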
2135 if (isStalled()) {
2136 fatal_if(retries.empty(), "must have retries waiting to be stalled");
2137
2138 retries.push(pkt);
2139
2140 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
2141 computeUnit->cu_id, gpuDynInst->simdId,
2142 gpuDynInst->wfSlotId);
2143 return false;
2144 } else if (!RequestPort::sendTimingReq(pkt)) {
2145 // need to stall the LDS port until a recvReqRetry() is received
2146 // this indicates that there is more space
2147 stallPort();
2148 retries.push(pkt);
2149
2150 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
2151 computeUnit->cu_id, gpuDynInst->simdId,
2152 gpuDynInst->wfSlotId, pkt->req->getPaddr());
2153 return false;
2154 } else {
2155 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
2156 computeUnit->cu_id, gpuDynInst->simdId,
2157 gpuDynInst->wfSlotId, pkt->req->getPaddr());
2158 return true;
2159 }
2160}
2161
2168void
2169ComputeUnit::LDSPort::recvReqRetry()
2170{
2171 auto queueSize = retries.size();
2172
2173 DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
2174 computeUnit->cu_id, queueSize);
2175
2176 fatal_if(queueSize < 1,
2177 "why was there a recvReqRetry() with no pending reqs?");
2178 fatal_if(!isStalled(),
2179 "recvReqRetry() happened when the port was not stalled");
2180
2181 unstallPort();
2182
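 // Drain the retry queue in order; if a send is rejected again, re-stall
 // and wait for the next retry notification from the peer.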
2183 while (!retries.empty()) {
2184 PacketPtr packet = retries.front();
2185
2186 DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);
2187
2188 if (!RequestPort::sendTimingReq(packet)) {
2189 // Stall port
2190 stallPort();
2191 DPRINTF(GPUPort, ": LDS send failed again\n");
2192 break;
2193 } else {
2194 DPRINTF(GPUPort, ": LDS send successful\n");
2195 retries.pop();
2196 }
2197 }
2198}
2199
2200ComputeUnit::ComputeUnitStats::ComputeUnitStats(statistics::Group *parent,
2201 int n_wf)
2202 : statistics::Group(parent),
2203 ADD_STAT(vALUInsts, "Number of vector ALU insts issued."),
2204 ADD_STAT(vALUInstsPerWF, "The avg. number of vector ALU insts issued "
2205 "per-wavefront."),
2206 ADD_STAT(sALUInsts, "Number of scalar ALU insts issued."),
2207 ADD_STAT(sALUInstsPerWF, "The avg. number of scalar ALU insts issued "
2208 "per-wavefront."),
2209 ADD_STAT(instCyclesVALU,
2210 "Number of cycles needed to execute VALU insts."),
2211 ADD_STAT(instCyclesSALU,
2212 "Number of cycles needed to execute SALU insts."),
2213 ADD_STAT(threadCyclesVALU, "Number of thread cycles used to execute "
2214 "vector ALU ops. Similar to instCyclesVALU but multiplied by "
2215 "the number of active threads."),
2216 ADD_STAT(vALUUtilization,
2217 "Percentage of active vector ALU threads in a wave."),
2218 ADD_STAT(ldsNoFlatInsts, "Number of LDS insts issued, not including FLAT"
2219 " accesses that resolve to LDS."),
2220 ADD_STAT(ldsNoFlatInstsPerWF, "The avg. number of LDS insts (not "
2221 "including FLAT accesses that resolve to LDS) per-wavefront."),
2222 ADD_STAT(flatVMemInsts,
2223 "The number of FLAT insts that resolve to vmem issued."),
2224 ADD_STAT(flatVMemInstsPerWF, "The average number of FLAT insts that "
2225 "resolve to vmem issued per-wavefront."),
2226 ADD_STAT(flatLDSInsts,
2227 "The number of FLAT insts that resolve to LDS issued."),
2228 ADD_STAT(flatLDSInstsPerWF, "The average number of FLAT insts that "
2229 "resolve to LDS issued per-wavefront."),
2230 ADD_STAT(vectorMemWrites,
2231 "Number of vector mem write insts (excluding FLAT insts)."),
2232 ADD_STAT(vectorMemWritesPerWF, "The average number of vector mem write "
2233 "insts (excluding FLAT insts) per-wavefront."),
2234 ADD_STAT(vectorMemReads,
2235 "Number of vector mem read insts (excluding FLAT insts)."),
2236 ADD_STAT(vectorMemReadsPerWF, "The avg. number of vector mem read insts "
2237 "(excluding FLAT insts) per-wavefront."),
2238 ADD_STAT(scalarMemWrites, "Number of scalar mem write insts."),
2239 ADD_STAT(scalarMemWritesPerWF,
2240 "The average number of scalar mem write insts per-wavefront."),
2241 ADD_STAT(scalarMemReads, "Number of scalar mem read insts."),
2242 ADD_STAT(scalarMemReadsPerWF,
2243 "The average number of scalar mem read insts per-wavefront."),
2244 ADD_STAT(vectorMemReadsPerKiloInst,
2245 "Number of vector mem reads per kilo-instruction"),
2246 ADD_STAT(vectorMemWritesPerKiloInst,
2247 "Number of vector mem writes per kilo-instruction"),
2248 ADD_STAT(vectorMemInstsPerKiloInst,
2249 "Number of vector mem insts per kilo-instruction"),
2250 ADD_STAT(scalarMemReadsPerKiloInst,
2251 "Number of scalar mem reads per kilo-instruction"),
2252 ADD_STAT(scalarMemWritesPerKiloInst,
2253 "Number of scalar mem writes per kilo-instruction"),
2254 ADD_STAT(scalarMemInstsPerKiloInst,
2255 "Number of scalar mem insts per kilo-instruction"),
2256 ADD_STAT(instCyclesVMemPerSimd, "Number of cycles to send address, "
2257 "command, data from VRF to vector memory unit, per SIMD"),
2258 ADD_STAT(instCyclesScMemPerSimd, "Number of cycles to send address, "
2259 "command, data from SRF to scalar memory unit, per SIMD"),
2260 ADD_STAT(instCyclesLdsPerSimd, "Number of cycles to send address, "
2261 "command, data from VRF to LDS unit, per SIMD"),
2262 ADD_STAT(globalReads, "Number of reads to the global segment"),
2263 ADD_STAT(globalWrites, "Number of writes to the global segment"),
2264 ADD_STAT(globalMemInsts,
2265 "Number of memory instructions sent to the global segment"),
2266 ADD_STAT(argReads, "Number of reads to the arg segment"),
2267 ADD_STAT(argWrites, "Number of writes to the arg segment"),
2268 ADD_STAT(argMemInsts,
2269 "Number of memory instructions sent to the arg segment"),
2270 ADD_STAT(spillReads, "Number of reads to the spill segment"),
2271 ADD_STAT(spillWrites, "Number of writes to the spill segment"),
2272 ADD_STAT(spillMemInsts,
2273 "Number of memory instructions sent to the spill segment"),
2274 ADD_STAT(groupReads, "Number of reads to the group segment"),
2275 ADD_STAT(groupWrites, "Number of writes to the group segment"),
2276 ADD_STAT(groupMemInsts,
2277 "Number of memory instructions sent to the group segment"),
2278 ADD_STAT(privReads, "Number of reads to the private segment"),
2279 ADD_STAT(privWrites, "Number of writes to the private segment"),
2280 ADD_STAT(privMemInsts,
2281 "Number of memory instructions sent to the private segment"),
2282 ADD_STAT(readonlyReads, "Number of reads to the readonly segment"),
2283 ADD_STAT(readonlyWrites,
2284 "Number of writes to the readonly segment"),
2285 ADD_STAT(readonlyMemInsts,
2286 "Number of memory instructions sent to the readonly segment"),
2287 ADD_STAT(kernargReads, "Number of reads sent to the kernarg segment"),
2288 ADD_STAT(kernargWrites,
2289 "Number of writes to the kernarg segment"),
2290 ADD_STAT(kernargMemInsts,
2291 "Number of memory instructions sent to the kernarg segment"),
2292 ADD_STAT(waveLevelParallelism,
2293 "wave level parallelism: count of active waves at wave launch"),
2294 ADD_STAT(tlbRequests, "number of uncoalesced requests"),
2295 ADD_STAT(tlbCycles,
2296 "total number of cycles for all uncoalesced requests"),
2297 ADD_STAT(tlbLatency, "Avg. translation latency for data translations"),
2298 ADD_STAT(hitsPerTLBLevel,
2299 "TLB hits distribution (0 for page table, x for Lx-TLB)"),
2300 ADD_STAT(ldsBankAccesses, "Total number of LDS bank accesses"),
2301 ADD_STAT(ldsBankConflictDist,
2302 "Number of bank conflicts per LDS memory packet"),
2303 ADD_STAT(pageDivergenceDist,
2304 "pages touched per wf (over all mem. instr.)"),
2305 ADD_STAT(dynamicGMemInstrCnt,
2306 "dynamic non-flat global memory instruction count"),
2307 ADD_STAT(dynamicFlatMemInstrCnt,
2308 "dynamic flat global memory instruction count"),
2309 ADD_STAT(dynamicLMemInstrCnt, "dynamic local memory instruction count"),
2310 ADD_STAT(wgBlockedDueBarrierAllocation,
2311 "WG dispatch was blocked due to lack of barrier resources"),
2312 ADD_STAT(wgBlockedDueLdsAllocation,
2313 "Workgroup blocked due to LDS capacity"),
2314 ADD_STAT(numInstrExecuted, "number of instructions executed"),
2315 ADD_STAT(execRateDist, "Instruction Execution Rate: Number of executed "
2316 "vector instructions per cycle"),
2317 ADD_STAT(numVecOpsExecuted,
2318 "number of vec ops executed (e.g. WF size/inst)"),
2319 ADD_STAT(numVecOpsExecutedF16,
2320 "number of f16 vec ops executed (e.g. WF size/inst)"),
2321 ADD_STAT(numVecOpsExecutedF32,
2322 "number of f32 vec ops executed (e.g. WF size/inst)"),
2323 ADD_STAT(numVecOpsExecutedF64,
2324 "number of f64 vec ops executed (e.g. WF size/inst)"),
2325 ADD_STAT(numVecOpsExecutedFMA16,
2326 "number of fma16 vec ops executed (e.g. WF size/inst)"),
2327 ADD_STAT(numVecOpsExecutedFMA32,
2328 "number of fma32 vec ops executed (e.g. WF size/inst)"),
2329 ADD_STAT(numVecOpsExecutedFMA64,
2330 "number of fma64 vec ops executed (e.g. WF size/inst)"),
2331 ADD_STAT(numVecOpsExecutedMAC16,
2332 "number of mac16 vec ops executed (e.g. WF size/inst)"),
2333 ADD_STAT(numVecOpsExecutedMAC32,
2334 "number of mac32 vec ops executed (e.g. WF size/inst)"),
2335 ADD_STAT(numVecOpsExecutedMAC64,
2336 "number of mac64 vec ops executed (e.g. WF size/inst)"),
2337 ADD_STAT(numVecOpsExecutedMAD16,
2338 "number of mad16 vec ops executed (e.g. WF size/inst)"),
2339 ADD_STAT(numVecOpsExecutedMAD32,
2340 "number of mad32 vec ops executed (e.g. WF size/inst)"),
2341 ADD_STAT(numVecOpsExecutedMAD64,
2342 "number of mad64 vec ops executed (e.g. WF size/inst)"),
2343 ADD_STAT(numVecOpsExecutedTwoOpFP,
2344 "number of two op FP vec ops executed (e.g. WF size/inst)"),
2345 ADD_STAT(totalCycles, "number of cycles the CU ran for"),
2346 ADD_STAT(vpc, "Vector Operations per cycle (this CU only)"),
2347 ADD_STAT(vpc_f16, "F16 Vector Operations per cycle (this CU only)"),
2348 ADD_STAT(vpc_f32, "F32 Vector Operations per cycle (this CU only)"),
2349 ADD_STAT(vpc_f64, "F64 Vector Operations per cycle (this CU only)"),
2350 ADD_STAT(ipc, "Instructions per cycle (this CU only)"),
2351 ADD_STAT(controlFlowDivergenceDist, "number of lanes active per "
2352 "instruction (over all instructions)"),
2353 ADD_STAT(activeLanesPerGMemInstrDist,
2354 "number of active lanes per global memory instruction"),
2355 ADD_STAT(activeLanesPerLMemInstrDist,
2356 "number of active lanes per local memory instruction"),
2357 ADD_STAT(numALUInstsExecuted,
2358 "Number of dynamic non-GM memory insts executed"),
2359 ADD_STAT(numTimesWgBlockedDueVgprAlloc, "Number of times WGs are "
2360 "blocked due to VGPR allocation per SIMD"),
2361 ADD_STAT(numTimesWgBlockedDueSgprAlloc, "Number of times WGs are "
2362 "blocked due to SGPR allocation per SIMD"),
2363 ADD_STAT(numCASOps, "number of compare and swap operations"),
2364 ADD_STAT(numFailedCASOps,
2365 "number of compare and swap operations that failed"),
2366 ADD_STAT(completedWfs, "number of completed wavefronts"),
2367 ADD_STAT(completedWGs, "number of completed workgroups"),
2368 ADD_STAT(headTailLatency, "ticks between first and last cache block "
2369 "arrival at coalescer"),
2370 ADD_STAT(instInterleave, "Measure of instruction interleaving per SIMD")
2371{
2372 ComputeUnit *cu = static_cast<ComputeUnit*>(parent);
2373
2377
2379 execRateDist.init(0, 10, 2);
2380 ldsBankConflictDist.init(0, cu->wfSize(), 2);
2381
2382 pageDivergenceDist.init(1, cu->wfSize(), 4);
2386
2387 headTailLatency.init(0, 1000000, 10000).flags(statistics::pdf |
2388 statistics::oneline);
2389 waveLevelParallelism.init(0, n_wf * cu->numVectorALUs, 1);
2390 instInterleave.init(cu->numVectorALUs, 0, 20, 1);
2391
2402
2411
2419
2421
2422 // fixed number of TLB levels
2423 for (int i = 0; i < 4; ++i) {
2424 if (!i)
2425 hitsPerTLBLevel.subname(i,"page_table");
2426 else
2427 hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
2428 }
2429
2435
2438}
2439
2440} // namespace gem5