gem5 v24.1.0.1
compute_unit.cc
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
33
34#include <limits>
35
38#include "base/output.hh"
39#include "debug/GPUDisp.hh"
40#include "debug/GPUExec.hh"
41#include "debug/GPUFetch.hh"
42#include "debug/GPUMem.hh"
43#include "debug/GPUPort.hh"
44#include "debug/GPUPrefetch.hh"
45#include "debug/GPUReg.hh"
46#include "debug/GPURename.hh"
47#include "debug/GPUSync.hh"
48#include "debug/GPUTLB.hh"
55#include "gpu-compute/shader.hh"
59#include "mem/page_table.hh"
60#include "sim/process.hh"
61#include "sim/sim_exit.hh"
62
63namespace gem5
64{
65
67 numVectorGlobalMemUnits(p.num_global_mem_pipes),
68 numVectorSharedMemUnits(p.num_shared_mem_pipes),
69 numScalarMemUnits(p.num_scalar_mem_pipes),
70 numVectorALUs(p.num_SIMDs),
71 numScalarALUs(p.num_scalar_cores),
72 vrfToCoalescerBusWidth(p.vrf_to_coalescer_bus_width),
73 coalescerToVrfBusWidth(p.coalescer_to_vrf_bus_width),
74 registerManager(p.register_manager),
75 fetchStage(p, *this),
76 scoreboardCheckStage(p, *this, scoreboardCheckToSchedule),
77 scheduleStage(p, *this, scoreboardCheckToSchedule, scheduleToExecute),
78 execStage(p, *this, scheduleToExecute),
79 globalMemoryPipe(p, *this),
80 localMemoryPipe(p, *this),
81 scalarMemoryPipe(p, *this),
82 tickEvent([this]{ exec(); }, "Compute unit tick event",
83 false, Event::CPU_Tick_Pri),
84 cu_id(p.cu_id),
85 vrf(p.vector_register_file), srf(p.scalar_register_file),
86 rfc(p.register_file_cache),
87 simdWidth(p.simd_width),
88 spBypassPipeLength(p.spbypass_pipe_length),
89 dpBypassPipeLength(p.dpbypass_pipe_length),
90 rfcPipeLength(p.rfc_pipe_length),
91 scalarPipeStages(p.scalar_pipe_length),
92 operandNetworkLength(p.operand_network_length),
93 issuePeriod(p.issue_period),
94 vrf_gm_bus_latency(p.vrf_gm_bus_latency),
95 srf_scm_bus_latency(p.srf_scm_bus_latency),
96 vrf_lm_bus_latency(p.vrf_lm_bus_latency),
97 perLaneTLB(p.perLaneTLB), prefetchDepth(p.prefetch_depth),
98 prefetchStride(p.prefetch_stride), prefetchType(p.prefetch_prev_type),
99 debugSegFault(p.debugSegFault),
100 functionalTLB(p.functionalTLB), localMemBarrier(p.localMemBarrier),
101 countPages(p.countPages),
102 req_tick_latency(p.mem_req_latency * p.clk_domain->clockPeriod()),
103 resp_tick_latency(p.mem_resp_latency * p.clk_domain->clockPeriod()),
104 scalar_req_tick_latency(
105 p.scalar_mem_req_latency * p.clk_domain->clockPeriod()),
106 scalar_resp_tick_latency(
107 p.scalar_mem_resp_latency * p.clk_domain->clockPeriod()),
108 memtime_latency(p.memtime_latency * p.clk_domain->clockPeriod()),
109 _requestorId(p.system->getRequestorId(this, "ComputeUnit")),
110 lds(*p.localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
111 ldsPort(csprintf("%s-port", name()), this),
112 scalarDataPort(csprintf("%s-port", name()), this),
113 scalarDTLBPort(csprintf("%s-port", name()), this),
114 sqcPort(csprintf("%s-port", name()), this),
115 sqcTLBPort(csprintf("%s-port", name()), this),
116 _cacheLineSize(p.system->cacheLineSize()),
117 _numBarrierSlots(p.num_barrier_slots),
118 globalSeqNum(0), wavefrontSize(p.wf_size),
119 scoreboardCheckToSchedule(p),
120 scheduleToExecute(p),
121 stats(this, p.n_wf)
122{
123 // This is not currently supported and would require adding more handling
124 // for system vs. device memory requests on the functional paths, so we
125 // fatal immediately in the constructor if this configuration is seen.
126 fatal_if(functionalTLB && FullSystem,
127 "Functional TLB not supported in full-system GPU simulation");
128
138 fatal_if(p.wf_size > std::numeric_limits<unsigned long long>::digits ||
139 p.wf_size <= 0,
140 "WF size must be positive and no larger than the host can support");
141 fatal_if(!isPowerOf2(wavefrontSize),
142 "Wavefront size should be a power of 2");
143 // calculate how many cycles a vector load or store will need to transfer
144 // its data over the corresponding buses
145 numCyclesPerStoreTransfer =
146 (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
147 (double)vrfToCoalescerBusWidth);
148
149 numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
150 / coalescerToVrfBusWidth;
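    // Worked example with illustrative parameters: 64 lanes of 4-byte
    // registers move 256 bytes per vector access, so a 32-byte
    // vrf_to_coalescer_bus_width gives ceil(256 / 32) = 8 cycles per store,
    // and an equally wide coalescer_to_vrf_bus_width gives 256 / 32 = 8
    // cycles per load.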
151
152 // Initialization: all WF slots are assumed STOPPED
153 idleWfs = p.n_wf * numVectorALUs;
154 lastVaddrWF.resize(numVectorALUs);
155 wfList.resize(numVectorALUs);
156
157 wfBarrierSlots.resize(p.num_barrier_slots, WFBarrier());
158
159 for (int i = 0; i < p.num_barrier_slots; ++i) {
160 freeBarrierIds.insert(i);
161 }
162
163 for (int j = 0; j < numVectorALUs; ++j) {
164 lastVaddrWF[j].resize(p.n_wf);
165
166 for (int i = 0; i < p.n_wf; ++i) {
167 lastVaddrWF[j][i].resize(wfSize());
168
169 wfList[j].push_back(p.wavefronts[j * p.n_wf + i]);
170 wfList[j][i]->setParent(this);
171
172 for (int k = 0; k < wfSize(); ++k) {
173 lastVaddrWF[j][i][k] = 0;
174 }
175 }
176 }
177
178 lastVaddrSimd.resize(numVectorALUs);
179
180 for (int i = 0; i < numVectorALUs; ++i) {
181 lastVaddrSimd[i].resize(wfSize(), 0);
182 }
183
184 lastVaddrCU.resize(wfSize());
185
186 lds.setParent(this);
187
188 if (p.execPolicy == "OLDEST-FIRST") {
189 exec_policy = EXEC_POLICY::OLDEST;
190 } else if (p.execPolicy == "ROUND-ROBIN") {
191 exec_policy = EXEC_POLICY::RR;
192 } else {
193 fatal("Invalid WF execution policy (CU)\n");
194 }
195
196 for (int i = 0; i < p.port_memory_port_connection_count; ++i) {
197 memPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
198 }
199
200 for (int i = 0; i < p.port_translation_port_connection_count; ++i) {
201 tlbPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
202 }
203
204 // Setup tokens for response ports. The number of tokens in memPortTokens
205 // is the total token count for the entire vector port (i.e., this CU).
206 memPortTokens = new TokenManager(p.max_cu_tokens);
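    // A token is taken when a vector memory instruction is placed on the
    // dispatch list and is normally handed back by the GPU coalescer, which
    // bounds how many vector requests this CU can have in flight. System
    // requests bypass the coalescer, so their token is returned explicitly
    // (see the recvTokens(1) call in DTLBPort::recvTimingResp below).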
207
208 registerExitCallback([this]() { exitCallback(); });
209
210 lastExecCycle.resize(numVectorALUs, 0);
211
212 for (int i = 0; i < vrf.size(); ++i) {
213 vrf[i]->setParent(this);
214 rfc[i]->setParent(this);
215 }
216 for (int i = 0; i < srf.size(); ++i) {
217 srf[i]->setParent(this);
218 }
219 numVecRegsPerSimd = vrf[0]->numRegs();
220 numScalarRegsPerSimd = srf[0]->numRegs();
221
222 registerManager->setParent(this);
223
224 activeWaves = 0;
225
226 instExecPerSimd.resize(numVectorALUs, 0);
227
228 // Calculate the number of bits to address a cache line
229 panic_if(!isPowerOf2(_cacheLineSize),
230 "Cache line size should be a power of two.");
231 cacheLineBits = floorLog2(_cacheLineSize);
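    // e.g., a 64-byte cache line yields cacheLineBits = floorLog2(64) = 6:
    // the low 6 bits of an address select the byte within the line.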
232}
233
235{
236 // Delete wavefront slots
237 for (int j = 0; j < numVectorALUs; ++j) {
238 for (int i = 0; i < shader->n_wf; ++i) {
239 delete wfList[j][i];
240 }
241 lastVaddrSimd[j].clear();
242 }
243 lastVaddrCU.clear();
244}
245
246int
252
253// index into readyList of the first memory unit
254int
259
260// index into readyList of the last memory unit
261int
263{
264 return numExeUnits() - 1;
265}
266
267// index into scalarALUs vector of SALU used by the wavefront
268int
270{
271 if (numScalarALUs == 1) {
272 return 0;
273 } else {
274 return w->simdId % numScalarALUs;
275 }
276}
277
278// index into readyList of Scalar ALU unit used by wavefront
279int
284
285// index into readyList of Global Memory unit used by wavefront
286int
288{
289 // TODO: FIXME if more than 1 GM pipe supported
291}
292
293// index into readyList of Local Memory unit used by wavefront
294int
296{
297 // TODO: FIXME if more than 1 LM pipe supported
299}
300
301// index into readyList of Scalar Memory unit used by wavefront
302int
304{
305 // TODO: FIXME if more than 1 ScM pipe supported
308}
309
310void
312{
313 w->resizeRegFiles(task->numVectorRegs(), task->numScalarRegs());
314 w->workGroupSz[0] = task->wgSize(0);
315 w->workGroupSz[1] = task->wgSize(1);
316 w->workGroupSz[2] = task->wgSize(2);
317 w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
318 w->gridSz[0] = task->gridSize(0);
319 w->gridSz[1] = task->gridSize(1);
320 w->gridSz[2] = task->gridSize(2);
321 w->computeActualWgSz(task);
322}
323
324void
326 HSAQueueEntry *task, int bar_id, bool fetchContext)
327{
328 static int _n_wave = 0;
329
330 VectorMask init_mask;
331 init_mask.reset();
332
333 for (int k = 0; k < wfSize(); ++k) {
334 if (k + waveId * wfSize() < w->actualWgSzTotal)
335 init_mask[k] = 1;
336 }
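    // Example: with actualWgSzTotal = 100 work-items and 64-lane wavefronts,
    // wave 0 gets a full mask while wave 1 enables only lanes 0-35; the
    // trailing lanes stay inactive for the lifetime of the wavefront.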
337
338 w->execMask() = init_mask;
339
340 w->kernId = task->dispatchId();
341 w->wfId = waveId;
342 w->initMask = init_mask.to_ullong();
343
344 if (bar_id > WFBarrier::InvalidID) {
345 w->barrierId(bar_id);
346 } else {
347 assert(!w->hasBarrier());
348 }
349
350 for (int k = 0; k < wfSize(); ++k) {
351 w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
352 w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
353 w->actualWgSz[1];
354 w->workItemId[2][k] = (k + waveId * wfSize()) /
355 (w->actualWgSz[0] * w->actualWgSz[1]);
356
357 w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
358 w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
359 w->workItemId[0][k];
360 }
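    // Example: with actualWgSz = {8, 4, 2}, lane k = 13 of wave 0 gets
    // workItemId = (13 % 8, (13 / 8) % 4, 13 / 32) = (5, 1, 0) and a flat
    // id of 0*32 + 1*8 + 5 = 13, i.e., the x-fastest linearization.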
361
362 // WG state
363 w->wgId = task->globalWgId();
364 w->dispatchId = task->dispatchId();
365 w->workGroupId[0] = w->wgId % task->numWg(0);
366 w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1);
367 w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1));
368
369 // set the wavefront context to have a pointer to this section of the LDS
370 w->ldsChunk = ldsChunk;
371
372 [[maybe_unused]] int32_t refCount =
373 lds.increaseRefCounter(w->dispatchId, w->wgId);
374 DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
375 cu_id, w->wgId, refCount);
376
377 w->instructionBuffer.clear();
378
379 if (w->pendingFetch)
380 w->dropFetch = true;
381
382 DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
383 "WF[%d][%d]. Ref cnt:%d\n", _n_wave, w->barrierId(), cu_id,
384 w->simdId, w->wfSlotId, refCount);
385
386 w->initRegState(task, w->actualWgSzTotal);
387 w->start(_n_wave++, task->codeAddr());
388
390 activeWaves++;
391
392 panic_if(w->wrGmReqsInPipe, "GM write counter for wavefront non-zero\n");
393 panic_if(w->rdGmReqsInPipe, "GM read counter for wavefront non-zero\n");
394 panic_if(w->wrLmReqsInPipe, "LM write counter for wavefront non-zero\n");
395 panic_if(w->rdLmReqsInPipe, "LM read counter for wavefront non-zero\n");
396 panic_if(w->outstandingReqs,
397 "Outstanding reqs counter for wavefront non-zero\n");
398}
399
405void
407 GPUDynInstPtr gpuDynInst
408 = std::make_shared<GPUDynInst>(this, nullptr,
410
411 // kern_id will be used in inv responses
412 gpuDynInst->kern_id = kernId;
413
414 injectGlobalMemFence(gpuDynInst, true, req);
415}
416
422void
424 injectGlobalMemFence(gpuDynInst, true);
425}
426
432void
434 GPUDynInstPtr gpuDynInst
435 = std::make_shared<GPUDynInst>(this, nullptr,
437
438 // kern_id will be used in inv responses
439 gpuDynInst->kern_id = kernId;
440
441 gpuDynInst->staticInstruction()->setFlag(GPUStaticInst::Scalar);
442 scalarMemoryPipe.injectScalarMemFence(gpuDynInst, true, req);
443}
444
445// Reset the SIMD register pools. There is no obviously better place
446// to do this in the current design, and the implementation relies on
447// the pools being reset here.
448void
450{
451 for (int i=0; i<numVectorALUs; i++)
452 {
455 }
456}
457
458void
460{
461 // If we aren't ticking, start it up!
462 if (!tickEvent.scheduled()) {
463 DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id);
465 }
466
467 // the kernel's invalidate must have finished before any wg dispatch
468 assert(task->isInvDone());
469
470 // reserve the LDS capacity allocated to the work group
471 // disambiguated by the dispatch ID and workgroup ID, which should be
472 // globally unique
473 LdsChunk *ldsChunk = lds.reserveSpace(task->dispatchId(),
474 task->globalWgId(),
475 task->ldsSize());
476
477 panic_if(!ldsChunk, "was not able to reserve space for this WG");
478
479 // calculate the number of 32-bit vector registers required
480 // by each work item
481 int vregDemand = task->numVectorRegs();
482 int sregDemand = task->numScalarRegs();
483 int wave_id = 0;
484
485 int barrier_id = WFBarrier::InvalidID;
486
491 if (num_wfs_in_wg > 1) {
496 barrier_id = getFreeBarrierId();
497 auto &wf_barrier = barrierSlot(barrier_id);
498 assert(!wf_barrier.maxBarrierCnt());
499 assert(!wf_barrier.numAtBarrier());
500 wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);
501
502 DPRINTF(GPUSync, "CU[%d] - Dispatching WG with barrier Id%d. "
503 "%d waves using this barrier.\n", cu_id, barrier_id,
504 num_wfs_in_wg);
505 }
506
507 // Assign WFs according to numWfsToSched vector, which is computed by
508 // hasDispResources()
509 for (int j = 0; j < shader->n_wf; ++j) {
510 for (int i = 0; i < numVectorALUs; ++i) {
511 Wavefront *w = wfList[i][j];
512 // Check if this wavefront slot is available and there are WFs
513 // remaining to be dispatched to current SIMD:
514 // WF slot must be stopped and not waiting
515 // for a release to complete S_RETURNING
516 if (w->getStatus() == Wavefront::S_STOPPED &&
517 numWfsToSched[i] > 0) {
518 // decrement number of WFs awaiting dispatch to current SIMD
519 numWfsToSched[i] -= 1;
520
521 fillKernelState(w, task);
522
523 DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] "
524 "vregDemand[%d] sregDemand[%d]\n", i, j, w->wfDynId,
525 vregDemand, sregDemand);
526
527 registerManager->allocateRegisters(w, vregDemand, sregDemand);
528
529 startWavefront(w, wave_id, ldsChunk, task, barrier_id);
530 ++wave_id;
531 }
532 }
533 }
534}
535
536void
538{
539 panic_if(w->instructionBuffer.empty(),
540 "Instruction Buffer of WF%d can't be empty", w->wgId);
541 GPUDynInstPtr ii = w->instructionBuffer.front();
542 pipeMap.emplace(ii->seqNum());
543}
544
545void
547{
548 panic_if(w->instructionBuffer.empty(),
549 "Instruction Buffer of WF%d can't be empty", w->wgId);
550 GPUDynInstPtr ii = w->instructionBuffer.front();
551 // delete the dynamic instruction from the pipeline map
552 auto it = pipeMap.find(ii->seqNum());
553 panic_if(it == pipeMap.end(), "Pipeline Map is empty\n");
554 pipeMap.erase(it);
555}
556
557bool
559{
560 // compute true size of workgroup (after clamping to grid size)
561 int trueWgSize[HSAQueueEntry::MAX_DIM];
562 int trueWgSizeTotal = 1;
563
564 for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
565 trueWgSize[d] = std::min(task->wgSize(d), task->gridSize(d) -
566 task->wgId(d) * task->wgSize(d));
567
568 trueWgSizeTotal *= trueWgSize[d];
569 DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
570 }
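    // Example: a 1D grid of 100 work-items with a work-group size of 64
    // yields trueWgSize = 64 for the first work-group and
    // min(64, 100 - 64) = 36 for the second, so the trailing, partially
    // filled work-group is sized correctly.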
571
572 DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
573
574 // calculate the number of WFs in this WG
575 int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
576 num_wfs_in_wg = numWfs;
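    // Ceiling division: the 36-item work-group from the example above still
    // needs ceil(36 / 64) = 1 wavefront, while a full 100-item group needs 2.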
577
578 bool barrier_avail = true;
579
580 if (numWfs > 1 && !freeBarrierIds.size()) {
581 barrier_avail = false;
582 }
583
584 // calculate the number of 32-bit vector registers required by each
585 // work item of the work group
586 int vregDemandPerWI = task->numVectorRegs();
587 // calculate the number of 32-bit scalar registers required by each
588 // work item of the work group
589 int sregDemandPerWI = task->numScalarRegs();
590
591 // check if the total number of VGPRs and SGPRs required by all WFs
592 // of the WG fits in the VRFs of all SIMD units and the CU's SRF
593 panic_if((numWfs * vregDemandPerWI) > (numVectorALUs * numVecRegsPerSimd),
594 "WG with %d WFs and %d VGPRs per WI cannot be allocated to CU "
595 "that has %d VGPRs\n",
596 numWfs, vregDemandPerWI, numVectorALUs * numVecRegsPerSimd);
597 panic_if((numWfs * sregDemandPerWI) > numScalarRegsPerSimd,
598 "WG with %d WFs and %d SGPRs per WI cannot be scheduled to CU "
599 "with %d SGPRs\n",
600 numWfs, sregDemandPerWI, numScalarRegsPerSimd);
601
602 // number of WF slots that are not occupied
603 int freeWfSlots = 0;
604 // number of Wfs from WG that were successfully mapped to a SIMD
605 int numMappedWfs = 0;
606 numWfsToSched.clear();
607 numWfsToSched.resize(numVectorALUs, 0);
608
609 // attempt to map WFs to the SIMDs, based on WF slot availability
610 // and register file availability
611 for (int j = 0; j < shader->n_wf; ++j) {
612 for (int i = 0; i < numVectorALUs; ++i) {
613 if (wfList[i][j]->getStatus() == Wavefront::S_STOPPED) {
614 ++freeWfSlots;
615 // check if current WF will fit onto current SIMD/VRF
616 // if all WFs have not yet been mapped to the SIMDs
617 if (numMappedWfs < numWfs &&
619 sregDemandPerWI) &&
621 vregDemandPerWI)) {
622 numWfsToSched[i]++;
623 numMappedWfs++;
624 }
625 }
626 }
627 }
628
629 // check that the number of mapped WFs is not greater
630 // than the actual number of WFs
631 assert(numMappedWfs <= numWfs);
632
633 bool vregAvail = true;
634 bool sregAvail = true;
635 // if a WF to SIMD mapping was not found, find the limiting resource
636 if (numMappedWfs < numWfs) {
637
638 for (int j = 0; j < numVectorALUs; ++j) {
639 // find if there are enough free VGPRs in the SIMD's VRF
640 // to accommodate the WFs of the new WG that would be mapped
641 // to this SIMD unit
642 vregAvail &= registerManager->
643 canAllocateVgprs(j, numWfsToSched[j], vregDemandPerWI);
644 // find if there are enough free SGPRs in the SIMD's SRF
645 // to accommodate the WFs of the new WG that would be mapped
646 // to this SIMD unit
647 sregAvail &= registerManager->
648 canAllocateSgprs(j, numWfsToSched[j], sregDemandPerWI);
649 }
650 }
651
652 DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, "
653 "VGPR Availability = %d, SGPR Availability = %d\n",
654 freeWfSlots, numMappedWfs, vregAvail, sregAvail);
655
656 if (!vregAvail) {
658 }
659
660 if (!sregAvail) {
662 }
663
664 // Return true if enough WF slots to submit workgroup and if there are
665 // enough VGPRs to schedule all WFs to their SIMD units
666 bool ldsAvail = lds.canReserve(task->ldsSize());
667 if (!ldsAvail) {
669 }
670
671 if (!barrier_avail) {
673 }
674
675 // Return true if the following are all true:
676 // (a) all WFs of the WG were mapped to free WF slots
677 // (b) there are enough VGPRs to schedule all WFs to their SIMD units
678 // (c) there are enough SGPRs on the CU to schedule all WFs
679 // (d) there is enough space in LDS to allocate for all WFs
680 bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
681 && ldsAvail && barrier_avail;
682 return can_dispatch;
683}
684
685int
687{
688 auto &wf_barrier = barrierSlot(bar_id);
689 return wf_barrier.numYetToReachBarrier();
690}
691
692bool
694{
695 auto &wf_barrier = barrierSlot(bar_id);
696 return wf_barrier.allAtBarrier();
697}
698
699void
701{
702 auto &wf_barrier = barrierSlot(bar_id);
703 wf_barrier.incNumAtBarrier();
704}
705
706int
708{
709 auto &wf_barrier = barrierSlot(bar_id);
710 return wf_barrier.numAtBarrier();
711}
712
713int
715{
716 auto &wf_barrier = barrierSlot(bar_id);
717 return wf_barrier.maxBarrierCnt();
718}
719
720void
722{
723 auto &wf_barrier = barrierSlot(bar_id);
724 wf_barrier.reset();
725}
726
727void
729{
730 auto &wf_barrier = barrierSlot(bar_id);
731 wf_barrier.decMaxBarrierCnt();
732}
733
734void
736{
737 auto &wf_barrier = barrierSlot(bar_id);
738 wf_barrier.release();
739 freeBarrierIds.insert(bar_id);
740}
741
742void
744{
745 for (int i = 0; i < numVectorALUs; ++i) {
746 for (int j = 0; j < shader->n_wf; ++j) {
747 Wavefront *wf = wfList[i][j];
748 if (wf->barrierId() == bar_id) {
749 assert(wf->getStatus() == Wavefront::S_BARRIER);
751 }
752 }
753 }
754}
755
756// Execute one clock worth of work on the ComputeUnit.
757void
759{
760 // process reads and writes in the RFs
761 for (auto &vecRegFile : vrf) {
762 vecRegFile->exec();
763 }
764
765 for (auto &scRegFile : srf) {
766 scRegFile->exec();
767 }
768
769 // Execute pipeline stages in reverse order to simulate
770 // the pipeline latency
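    // Running the stages back to front within a single tick lets each stage
    // consume the inter-stage state its upstream neighbor produced on the
    // previous cycle, which models one cycle of latency per stage without
    // needing double-buffered pipeline registers.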
774 execStage.exec();
778
780
781 // Put this CU to sleep if there is no more work to be done.
782 if (!isDone()) {
784 } else {
786 DPRINTF(GPUDisp, "CU%d: Going to sleep\n", cu_id);
787 }
788}
789
790void
792{
793 // Initialize CU Bus models and execution resources
794
795 // Vector ALUs
796 vectorALUs.clear();
797 for (int i = 0; i < numVectorALUs; i++) {
798 vectorALUs.emplace_back(this, clockPeriod());
799 }
800
801 // Scalar ALUs
802 scalarALUs.clear();
803 for (int i = 0; i < numScalarALUs; i++) {
804 scalarALUs.emplace_back(this, clockPeriod());
805 }
806
807 // Vector Global Memory
809 "No support for multiple Global Memory Pipelines exists!!!");
813
814 // Vector Local/Shared Memory
816 "No support for multiple Local Memory Pipelines exists!!!");
820
821 // Scalar Memory
823 "No support for multiple Scalar Memory Pipelines exists!!!");
827
830
833 execStage.init();
835
837}
838
839bool
844
845bool
847{
848 // Ruby has completed the memory op. Schedule the mem_resp_event at the
849 // appropriate cycle to process the timing memory response
850 // This delay represents the pipeline delay
851 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
852 PortID index = sender_state->port_index;
853 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
854 GPUDispatcher &dispatcher = computeUnit->shader->dispatcher();
855
856 // MemSyncResp + WriteAckResp are handled completely here and we don't
857 // schedule a MemRespEvent to process the responses further
858 if (pkt->cmd == MemCmd::MemSyncResp) {
859 // This response is for 1 of the following request types:
860 // - kernel launch
861 // - kernel end
862 // - non-kernel mem sync
863
864 // Non-kernel mem sync not from an instruction
865 if (!gpuDynInst) {
866 // If there is no dynamic instruction, a CU must be present.
867 ComputeUnit *cu = sender_state->computeUnit;
868 assert(cu != nullptr);
869
870 if (pkt->req->isInvL2()) {
872 assert(cu->shader->getNumOutstandingInvL2s() >= 0);
873 } else {
874 panic("Unknown MemSyncResp not from an instruction");
875 }
876
877 // Cleanup and return, no other response events needed.
878 delete pkt->senderState;
879 delete pkt;
880 return true;
881 }
882
883 // Kernel Launch
884 // wavefront was nullptr when launching kernel, so it is meaningless
885 // here (simdId=-1, wfSlotId=-1)
886 if (gpuDynInst->isKernelLaunch()) {
887 // for kernel launch, the original request must be both kernel-type
888 // and INV_L1
889 assert(pkt->req->isKernel());
890 assert(pkt->req->isInvL1());
891
892 // one D-Cache inv is done, decrement counter
893 dispatcher.updateInvCounter(gpuDynInst->kern_id);
894
895 delete pkt->senderState;
896 delete pkt;
897 return true;
898 }
899
900 // retrieve wavefront from inst
901 Wavefront *w = gpuDynInst->wavefront();
902
903 // Check if we are waiting on Kernel End Flush
904 if (w->getStatus() == Wavefront::S_RETURNING
905 && gpuDynInst->isEndOfKernel()) {
906 // for kernel end, the original request must be both kernel-type
907 // and last-level GPU cache should be flushed if it contains
908 // dirty data. This request may have been quiesced and
909 // immediately responded to if the GL2 is a write-through /
910 // read-only cache.
911 assert(pkt->req->isKernel());
912 assert(pkt->req->isGL2CacheFlush());
913
914 // once flush done, decrement counter, and return whether all
915 // dirty writeback operations are done for the kernel
916 bool isWbDone = dispatcher.updateWbCounter(gpuDynInst->kern_id);
917
918 // not all wbs are done for the kernel, just release pkt
919 // resources
920 if (!isWbDone) {
921 delete pkt->senderState;
922 delete pkt;
923 return true;
924 }
925
926 // all wbs are completed for the kernel, do retirement work
927 // for the workgroup
928 DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
929 computeUnit->cu_id, w->simdId, w->wfSlotId,
930 w->wfDynId, w->wgId);
931
932 dispatcher.notifyWgCompl(w);
933 w->setStatus(Wavefront::S_STOPPED);
934 }
935
936 if (!pkt->req->isKernel()) {
937 w = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
938 DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing "
939 "outstanding reqs %d => %d\n", gpuDynInst->simdId,
940 gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
941 gpuDynInst->disassemble(), w->outstandingReqs,
942 w->outstandingReqs - 1);
943 computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
944 }
945
946 delete pkt->senderState;
947 delete pkt;
948 return true;
949 }
950
951 EventFunctionWrapper *mem_resp_event =
952 computeUnit->memPort[index].createMemRespEvent(pkt);
953
954 DPRINTF(GPUPort,
955 "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
956 computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
957 gpuDynInst->seqNum(), index, pkt->req->getPaddr());
958
959 computeUnit->schedule(mem_resp_event,
960 curTick() + computeUnit->resp_tick_latency);
961
962 return true;
963}
964
965bool
967{
968 return handleResponse(pkt);
969}
970
971bool
973{
974 // From scalar cache invalidate that was issued at kernel start.
975 if (pkt->req->isKernel()) {
976 delete pkt->senderState;
977 delete pkt;
978
979 return true;
980 }
981
982 assert(!pkt->req->isKernel());
983
984 // retrieve sender state
985 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
986 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
987
988 assert(pkt->isRead() || pkt->isWrite());
989 assert(gpuDynInst->numScalarReqs > 0);
990
991 gpuDynInst->numScalarReqs--;
992
1001 if (!gpuDynInst->numScalarReqs) {
1002 if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
1003 computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
1004 gpuDynInst);
1005 } else {
1006 computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
1007 gpuDynInst);
1008 }
1009 }
1010
1011 delete pkt->senderState;
1012 delete pkt;
1013
1014 return true;
1015}
1016
1017void
1019{
1020 for (const auto &pkt : retries) {
1021 if (!sendTimingReq(pkt)) {
1022 break;
1023 } else {
1024 retries.pop_front();
1025 }
1026 }
1027}
1028
1029void
1031{
1032 int len = retries.size();
1033
1034 assert(len > 0);
1035
1036 for (int i = 0; i < len; ++i) {
1037 PacketPtr pkt = retries.front().first;
1038 [[maybe_unused]] GPUDynInstPtr gpuDynInst = retries.front().second;
1039 DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
1040 computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1041 pkt->req->getPaddr());
1042
1046 if (!sendTimingReq(pkt)) {
1047 DPRINTF(GPUMem, "failed again!\n");
1048 break;
1049 } else {
1050 DPRINTF(GPUMem, "successful!\n");
1051 retries.pop_front();
1052 }
1053 }
1054}
1055
1056bool
1058{
1059 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1064 if (sender_state->wavefront != nullptr) {
1065 RequestPtr req = pkt->req;
1066 // If the sender state's isKernDispatch is set, then the request came
1067 // from the gpu command processor. The request fetches information
1068 // that will be used in the kernel dispatch process. It should be
1069 // handled in the gpu command processor. If the flag isn't set,
1070 // then the request is an instruction fetch and can be handled in
1071 // the compute unit
1072 if (sender_state->isKernDispatch) {
1073 computeUnit->shader->gpuCmdProc.completeTimingRead();
1074 } else {
1075 computeUnit->handleSQCReturn(pkt);
1076 }
1077 } else {
1078 delete pkt->senderState;
1079 delete pkt;
1080 }
1081
1082 return true;
1083}
1084
1085void
1090
1091void
1093{
1094 int len = retries.size();
1095
1096 assert(len > 0);
1097
1098 for (int i = 0; i < len; ++i) {
1099 PacketPtr pkt = retries.front().first;
1100 [[maybe_unused]] Wavefront *wavefront = retries.front().second;
1101 DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
1102 computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
1103 pkt->req->getPaddr());
1104 if (!sendTimingReq(pkt)) {
1105 DPRINTF(GPUFetch, "failed again!\n");
1106 break;
1107 } else {
1108 DPRINTF(GPUFetch, "successful!\n");
1109 retries.pop_front();
1110 }
1111 }
1112}
1113
1114const char*
1116{
1117 return "ComputeUnit SQC memory request event";
1118}
1119
1120void
1122{
1123 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1124 [[maybe_unused]] ComputeUnit *compute_unit = sqcPort.computeUnit;
1125
1126 assert(!pkt->req->systemReq());
1127
1128 if (!(sqcPort.sendTimingReq(pkt))) {
1130 (pkt, sender_state->wavefront));
1131 }
1132}
1133
1134void
1136{
1137 // There must be a way around this check to do the globalMemStart...
1138 Addr tmp_vaddr = pkt->req->getVaddr();
1139
1140 updatePageDivergenceDist(tmp_vaddr);
1141
1142 // set PC in request
1143 pkt->req->setPC(gpuDynInst->wavefront()->pc());
1144
1145 pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());
1146
1147 // figure out the type of the request to set read/write
1148 BaseMMU::Mode TLB_mode;
1149 assert(pkt->isRead() || pkt->isWrite());
1150
1151 // only do some things if actually accessing data
1152 bool isDataAccess = pkt->isWrite() || pkt->isRead();
1153
1154 // For dGPUs, real hardware will extract MTYPE from the PTE. SE mode
1155 // uses x86 pagetables which don't have fields to track GPU MTYPEs.
1156 // Rather than hacking up the pagetable to add these bits in, we just
1157 // keep a structure local to our GPUs that are populated in our
1158 // emulated driver whenever memory is allocated. Consult that structure
1159 // here in case we need a memtype override.
1160 //
1161 // In full system mode these can be extracted from the PTE and assigned
1162 // after address translation takes place.
1163 if (!FullSystem) {
1165 }
1166
1167 // Check write before read for atomic operations
1168 // since atomic operations should use BaseMMU::Write
1169 if (pkt->isWrite()) {
1170 TLB_mode = BaseMMU::Write;
1171 } else if (pkt->isRead()) {
1172 TLB_mode = BaseMMU::Read;
1173 } else {
1174 fatal("pkt is not a read nor a write\n");
1175 }
1176
1177 if (!functionalTLB) {
1178 stats.tlbCycles -= curTick();
1179 }
1181
1182 PortID tlbPort_index = perLaneTLB ? index : 0;
1183
1184 if (shader->timingSim) {
1185 if (!FullSystem && debugSegFault) {
1187 Addr vaddr = pkt->req->getVaddr();
1188 unsigned size = pkt->getSize();
1189
1190 if ((vaddr + size - 1) % 64 < vaddr % 64) {
1191 panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
1192 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
1193 }
1194
1195 Addr paddr;
1196
1197 if (!p->pTable->translate(vaddr, paddr)) {
1198 if (!p->fixupFault(vaddr)) {
1199 panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
1200 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1201 vaddr);
1202 }
1203 }
1204 }
1205
1206 // This is the SenderState needed upon return
1207 pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);
1208
1209 // This is the senderState needed by the TLB hierarchy to function
1210 GpuTranslationState *translation_state =
1211 new GpuTranslationState(TLB_mode, shader->gpuTc, false,
1212 pkt->senderState);
1213
1214 pkt->senderState = translation_state;
1215
1216 if (functionalTLB) {
1217 tlbPort[tlbPort_index].sendFunctional(pkt);
1218
1219 // update the hitLevel distribution
1220 int hit_level = translation_state->hitLevel;
1221 assert(hit_level != -1);
1222 stats.hitsPerTLBLevel[hit_level]++;
1223
1224 // New SenderState for the memory access
1225 GpuTranslationState *sender_state =
1226 safe_cast<GpuTranslationState*>(pkt->senderState);
1227
1228 delete sender_state->tlbEntry;
1229 delete sender_state->saved;
1230 delete sender_state;
1231
1232 assert(pkt->req->hasPaddr());
1233 assert(pkt->req->hasSize());
1234
1235 // this is necessary because the GPU TLB receives packets instead
1236 // of requests. When the translation is complete, all relevant
1237 // fields in the request will be populated, but not in the packet.
1238 // here we create the new packet so we can set the size, addr,
1239 // and proper flags.
1240 PacketPtr oldPkt = pkt;
1241 pkt = new Packet(oldPkt->req, oldPkt->cmd);
1242 if (isDataAccess) {
1243 uint8_t *tmpData = oldPkt->getPtr<uint8_t>();
1244 pkt->dataStatic(tmpData);
1245 }
1246 delete oldPkt;
1247
1248
1249 // New SenderState for the memory access
1250 pkt->senderState =
1252 nullptr);
1253
1254 gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
1255 gpuDynInst->tlbHitLevel[index] = hit_level;
1256
1257 // translation is done. Schedule the mem_req_event at the
1258 // appropriate cycle to send the timing memory request to ruby
1259 EventFunctionWrapper *mem_req_event =
1260 memPort[index].createMemReqEvent(pkt);
1261
1262 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
1263 "scheduled\n", cu_id, gpuDynInst->simdId,
1264 gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
1265
1266 schedule(mem_req_event, curTick() + req_tick_latency);
1267 } else if (tlbPort[tlbPort_index].isStalled()) {
1268 assert(tlbPort[tlbPort_index].retries.size() > 0);
1269
1270 DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
1271 "failed!\n", cu_id, gpuDynInst->simdId,
1272 gpuDynInst->wfSlotId, tmp_vaddr);
1273
1274 tlbPort[tlbPort_index].retries.push_back(pkt);
1275 } else if (!tlbPort[tlbPort_index].sendTimingReq(pkt)) {
1276 // Stall the data port;
1277 // No more packets will be issued until
1278 // Ruby indicates resources are freed by
1279 // a recvReqRetry() callback on this port.
1280 tlbPort[tlbPort_index].stallPort();
1281
1282 DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
1283 "failed!\n", cu_id, gpuDynInst->simdId,
1284 gpuDynInst->wfSlotId, tmp_vaddr);
1285
1286 tlbPort[tlbPort_index].retries.push_back(pkt);
1287 } else {
1288 DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x from "
1289 "instruction %s sent!\n", cu_id, gpuDynInst->simdId,
1290 gpuDynInst->wfSlotId, tmp_vaddr,
1291 gpuDynInst->disassemble().c_str());
1292 }
1293 } else {
1294 if (pkt->cmd == MemCmd::MemSyncReq) {
1295 gpuDynInst->resetEntireStatusVector();
1296 } else {
1297 gpuDynInst->decrementStatusVector(index);
1298 }
1299
1300 // New SenderState for the memory access
1301 delete pkt->senderState;
1302
1303 // Because it's atomic operation, only need TLB translation state
1304 pkt->senderState = new GpuTranslationState(TLB_mode,
1305 shader->gpuTc);
1306
1307 tlbPort[tlbPort_index].sendFunctional(pkt);
1308
1309 // the addr of the packet is not modified, so we need to create a new
1310 // packet; otherwise the memory access would use the old virtual
1311 // address sent in the translation packet, instead of the physical
1312 // address returned by the translation.
1313 PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
1314 new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1315
1316 // Translation is done. It is safe to send the packet to memory.
1317 memPort[0].sendFunctional(new_pkt);
1318
1319 DPRINTF(GPUMem, "Functional sendRequest\n");
1320 DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
1321 gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
1322 new_pkt->req->getPaddr());
1323
1324 // safe_cast the senderState
1325 GpuTranslationState *sender_state =
1326 safe_cast<GpuTranslationState*>(pkt->senderState);
1327
1328 delete sender_state->tlbEntry;
1329 delete new_pkt;
1330 delete pkt->senderState;
1331 delete pkt;
1332 }
1333}
1334
1335void
1337{
1338 assert(pkt->isWrite() || pkt->isRead());
1339
1340 BaseMMU::Mode tlb_mode = pkt->isRead() ? BaseMMU::Read : BaseMMU::Write;
1341
1342 pkt->senderState =
1344
1345 pkt->senderState =
1346 new GpuTranslationState(tlb_mode, shader->gpuTc, false,
1347 pkt->senderState);
1348
1349 if (scalarDTLBPort.isStalled()) {
1350 assert(scalarDTLBPort.retries.size());
1351 scalarDTLBPort.retries.push_back(pkt);
1352 } else if (!scalarDTLBPort.sendTimingReq(pkt)) {
1354 scalarDTLBPort.retries.push_back(pkt);
1355 } else {
1356 DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n",
1357 tlb_mode == BaseMMU::Read ? "read" : "write",
1358 pkt->req->getVaddr());
1359 }
1360}
1361
1362void
1364 bool kernelMemSync,
1365 RequestPtr req)
1366{
1367 assert(gpuDynInst->isGlobalSeg() ||
1368 gpuDynInst->executedAs() == enums::SC_GLOBAL);
1369
1370 // Fences will never be issued to system memory, so we can mark the
1371 // requestor as a device memory ID here.
1372 if (!req) {
1373 req = std::make_shared<Request>(
1374 0, 0, 0, vramRequestorId(), 0, gpuDynInst->wfDynId);
1375 } else {
1376 req->requestorId(vramRequestorId());
1377 }
1378
1379 // all mem sync requests have Paddr == 0
1380 req->setPaddr(0);
1381
1382 PacketPtr pkt = nullptr;
1383
1384 if (kernelMemSync) {
1385 if (gpuDynInst->isKernelLaunch()) {
1386 req->setCacheCoherenceFlags(Request::INV_L1);
1387 req->setReqInstSeqNum(gpuDynInst->seqNum());
1388 req->setFlags(Request::KERNEL);
1389 pkt = new Packet(req, MemCmd::MemSyncReq);
1390 pkt->pushSenderState(
1391 new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1392
1393 EventFunctionWrapper *mem_req_event =
1394 memPort[0].createMemReqEvent(pkt);
1395
1396 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1397 "an acquire\n", cu_id, gpuDynInst->simdId,
1398 gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
1399
1400 schedule(mem_req_event, curTick() + req_tick_latency);
1401 } else {
1402 // kernel end flush of GL2 cache may be quiesced by Ruby if the
1403 // GL2 is a read-only cache
1404 assert(shader->impl_kern_end_rel);
1405 assert(gpuDynInst->isEndOfKernel());
1406
1407 req->setCacheCoherenceFlags(Request::FLUSH_L2);
1408 req->setReqInstSeqNum(gpuDynInst->seqNum());
1409 req->setFlags(Request::KERNEL);
1410 pkt = new Packet(req, MemCmd::MemSyncReq);
1411 pkt->pushSenderState(
1412 new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1413
1414 EventFunctionWrapper *mem_req_event =
1415 memPort[0].createMemReqEvent(pkt);
1416
1417 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1418 "a release\n", cu_id, gpuDynInst->simdId,
1419 gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
1420
1421 schedule(mem_req_event, curTick() + req_tick_latency);
1422 }
1423 } else {
1424 gpuDynInst->setRequestFlags(req);
1425
1426 req->setReqInstSeqNum(gpuDynInst->seqNum());
1427
1428 pkt = new Packet(req, MemCmd::MemSyncReq);
1429 pkt->pushSenderState(
1430 new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1431
1432 EventFunctionWrapper *mem_req_event =
1433 memPort[0].createMemReqEvent(pkt);
1434
1435 DPRINTF(GPUPort,
1436 "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
1437 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
1438 pkt->req->getPaddr());
1439
1440 schedule(mem_req_event, curTick() + req_tick_latency);
1441 }
1442}
1443
1444void
1446{
1447 auto req = std::make_shared<Request>(paddr, 64, 0, vramRequestorId());
1448 req->setCacheCoherenceFlags(Request::GL2_CACHE_INV);
1449
1450 auto pkt = new Packet(req, MemCmd::MemSyncReq);
1451 pkt->pushSenderState(
1452 new ComputeUnit::DataPort::SenderState(this, 0, nullptr));
1453
1454 EventFunctionWrapper *mem_req_event = memPort[0].createMemReqEvent(pkt);
1455
1456 schedule(mem_req_event, curTick() + req_tick_latency);
1457
1459}
1460
1461void
1463{
1464 DataPort::SenderState *sender_state =
1465 safe_cast<DataPort::SenderState*>(pkt->senderState);
1466
1467 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1468 ComputeUnit *compute_unit = computeUnit;
1469
1470 assert(gpuDynInst);
1471
1472 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
1473 compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1474 pkt->req->getPaddr(), id);
1475
1476 Addr paddr = pkt->req->getPaddr();
1477
1478 // mem sync resp callback must be handled already in
1479 // DataPort::recvTimingResp
1480 assert(pkt->cmd != MemCmd::MemSyncResp);
1481
1482 // The status vector and global memory response for WriteResp packets get
1483 // handled by the WriteCompleteResp packets.
1484 if (pkt->cmd == MemCmd::WriteResp) {
1485 if (!FullSystem || !pkt->req->systemReq()) {
1486 delete pkt;
1487 return;
1488 }
1489 }
1490
1491 // this is for read, write and atomic
1492 int index = gpuDynInst->memStatusVector[paddr].back();
1493
1494 DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
1495 pkt->req->getPaddr(), id);
1496
1497 gpuDynInst->memStatusVector[paddr].pop_back();
1498 gpuDynInst->pAddr = pkt->req->getPaddr();
1499
1500 gpuDynInst->decrementStatusVector(index);
1501 DPRINTF(GPUMem, "bitvector is now %s\n", gpuDynInst->printStatusVector());
1502
1503 if (gpuDynInst->allLanesZero()) {
1504 auto iter = gpuDynInst->memStatusVector.begin();
1505 auto end = gpuDynInst->memStatusVector.end();
1506
1507 while (iter != end) {
1508 assert(iter->second.empty());
1509 ++iter;
1510 }
1511
1512 // Calculate the difference between the arrival of the first cache
1513 // block and the last cache block to arrive, provided we recorded
1514 // the arrival time of the first cache block.
1515 if (compute_unit->headTailMap.count(gpuDynInst)) {
1516 Tick headTick = compute_unit->headTailMap.at(gpuDynInst);
1517 compute_unit->stats.headTailLatency.sample(curTick() - headTick);
1518 compute_unit->headTailMap.erase(gpuDynInst);
1519 }
1520
1521 gpuDynInst->memStatusVector.clear();
1522
1523 gpuDynInst->
1524 profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue);
1525 compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
1526
1527 DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
1528 compute_unit->cu_id, gpuDynInst->simdId,
1529 gpuDynInst->wfSlotId);
1530 } else {
1531 if (pkt->isRead()) {
1532 if (!compute_unit->headTailMap.count(gpuDynInst)) {
1533 compute_unit->headTailMap
1534 .insert(std::make_pair(gpuDynInst, curTick()));
1535 }
1536 }
1537 }
1538
1539 delete pkt->senderState;
1540 delete pkt;
1541}
1542
1543bool
1545{
1546 Addr line = pkt->req->getPaddr();
1547
1548 DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
1549 pkt->req->getVaddr(), line);
1550
1551 assert(pkt->senderState);
1552 computeUnit->stats.tlbCycles += curTick();
1553
1554 // pop off the TLB translation state
1555 GpuTranslationState *translation_state =
1556 safe_cast<GpuTranslationState*>(pkt->senderState);
1557
1558 // no PageFaults are permitted for data accesses
1559 if (!translation_state->tlbEntry) {
1560 DTLBPort::SenderState *sender_state =
1561 safe_cast<DTLBPort::SenderState*>(translation_state->saved);
1562
1563 [[maybe_unused]] Wavefront *w =
1564 computeUnit->wfList[sender_state->_gpuDynInst->simdId]
1565 [sender_state->_gpuDynInst->wfSlotId];
1566
1567 DPRINTFN("Wave %d couldn't tranlate vaddr %#x\n", w->wfDynId,
1568 pkt->req->getVaddr());
1569 }
1570
1571 // update the hitLevel distribution
1572 int hit_level = translation_state->hitLevel;
1573 computeUnit->stats.hitsPerTLBLevel[hit_level]++;
1574
1575 delete translation_state->tlbEntry;
1576 assert(!translation_state->ports.size());
1577 pkt->senderState = translation_state->saved;
1578
1579 // for prefetch pkt
1580 BaseMMU::Mode TLB_mode = translation_state->tlbMode;
1581
1582 delete translation_state;
1583
1584 // use the original sender state to know how to close this transaction
1585 DTLBPort::SenderState *sender_state =
1586 safe_cast<DTLBPort::SenderState*>(pkt->senderState);
1587
1588 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1589 PortID mp_index = sender_state->portIndex;
1590 Addr vaddr = pkt->req->getVaddr();
1591 gpuDynInst->memStatusVector[line].push_back(mp_index);
1592 gpuDynInst->tlbHitLevel[mp_index] = hit_level;
1593
1594 MemCmd requestCmd;
1595
1596 if (pkt->cmd == MemCmd::ReadResp) {
1597 requestCmd = MemCmd::ReadReq;
1598 } else if (pkt->cmd == MemCmd::WriteResp) {
1599 requestCmd = MemCmd::WriteReq;
1600 } else if (pkt->cmd == MemCmd::SwapResp) {
1601 requestCmd = MemCmd::SwapReq;
1602 } else {
1603 panic("unsupported response to request conversion %s\n",
1604 pkt->cmd.toString());
1605 }
1606
1607 if (computeUnit->prefetchDepth) {
1608 int simdId = gpuDynInst->simdId;
1609 int wfSlotId = gpuDynInst->wfSlotId;
1610 Addr last = 0;
1611
1612 switch(computeUnit->prefetchType) {
1613 case enums::PF_CU:
1614 last = computeUnit->lastVaddrCU[mp_index];
1615 break;
1616 case enums::PF_PHASE:
1617 last = computeUnit->lastVaddrSimd[simdId][mp_index];
1618 break;
1619 case enums::PF_WF:
1620 last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
1621 default:
1622 break;
1623 }
1624
1625 DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
1626 computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
1627
1628 int stride = last ? (roundDown(vaddr, X86ISA::PageBytes) -
1630 : 0;
1631
1632 DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
1633
1634 computeUnit->lastVaddrCU[mp_index] = vaddr;
1635 computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
1636 computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
1637
1638 stride = (computeUnit->prefetchType == enums::PF_STRIDE) ?
1639 computeUnit->prefetchStride: stride;
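    // Example: with 4 KiB x86 pages, if the previous access by this lane
    // touched the page at 0x7f000000 and this one touches the page at
    // 0x7f002000, the computed stride is 2 pages; the loop below then
    // prefetches prefetchDepth pages ahead at that stride (or at the fixed
    // prefetchStride when PF_STRIDE is selected), using zero-latency
    // functional translations.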
1640
1641 DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
1642 computeUnit->cu_id, simdId, wfSlotId, mp_index);
1643
1644 DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);
1645
1646 // Prefetch Next few pages atomically
1647 for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
1648 DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
1650
1651 if (!stride)
1652 break;
1653
1654 RequestPtr prefetch_req = std::make_shared<Request>(
1656 sizeof(uint8_t), 0,
1657 computeUnit->requestorId(),
1658 0, 0, nullptr);
1659
1660 PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
1661 uint8_t foo = 0;
1662 prefetch_pkt->dataStatic(&foo);
1663
1664 // Because it's atomic operation, only need TLB translation state
1665 prefetch_pkt->senderState =
1666 new GpuTranslationState(TLB_mode,
1667 computeUnit->shader->gpuTc, true);
1668
1669 // Currently prefetches are zero-latency, hence the sendFunctional
1670 sendFunctional(prefetch_pkt);
1671
1672 /* safe_cast the senderState */
1673 GpuTranslationState *tlb_state =
1674 safe_cast<GpuTranslationState*>(
1675 prefetch_pkt->senderState);
1676
1677
1678 delete tlb_state->tlbEntry;
1679 delete tlb_state;
1680 delete prefetch_pkt;
1681 }
1682 }
1683
1684 // First we must convert the response cmd back to a request cmd so that
1685 // the request can be sent through the cu's request port
1686 PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
1687 new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1688 delete pkt->senderState;
1689 delete pkt;
1690
1691 // New SenderState for the memory access
1692 new_pkt->senderState =
1693 new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
1694 nullptr);
1695
1696 // Set VRAM ID for device requests
1697 // For now, system vmem requests use functional reads. This is not that
1698 // critical to model as the region of interest should always be accessing
1699 // device memory. System vmem requests are used by blit kernels to do
1700 // memcpys and load code objects into device memory.
1701 if (new_pkt->req->systemReq()) {
1702 // There will be multiple packets returned for the same gpuDynInst,
1703 // so first check if systemReq is not already set and if so, return
1704 // the token acquired when the dispatch list is filled as system
1705 // requests do not require a GPU coalescer token.
1706 if (!gpuDynInst->isSystemReq()) {
1707 computeUnit->getTokenManager()->recvTokens(1);
1708 gpuDynInst->setSystemReq();
1709 }
1710 } else {
1711 new_pkt->req->requestorId(computeUnit->vramRequestorId());
1712 }
1713
1714 // translation is done. Schedule the mem_req_event at the appropriate
1715 // cycle to send the timing memory request to ruby
1716 EventFunctionWrapper *mem_req_event =
1717 computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);
1718
1719 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
1720 computeUnit->cu_id, gpuDynInst->simdId,
1721 gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());
1722
1723 computeUnit->schedule(mem_req_event, curTick() +
1724 computeUnit->req_tick_latency);
1725
1726 return true;
1727}
1728
1731{
1732 return new EventFunctionWrapper(
1733 [this, pkt]{ processMemReqEvent(pkt); },
1734 "ComputeUnit memory request event", true);
1735}
1736
1739{
1740 return new EventFunctionWrapper(
1741 [this, pkt]{ processMemRespEvent(pkt); },
1742 "ComputeUnit memory response event", true);
1743}
1744
1745void
1747{
1748 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1749 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1750 [[maybe_unused]] ComputeUnit *compute_unit = computeUnit;
1751
1752 if (pkt->req->systemReq()) {
1753 assert(compute_unit->shader->systemHub);
1754 SystemHubEvent *resp_event = new SystemHubEvent(pkt, this);
1755 compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
1756 } else if (!(sendTimingReq(pkt))) {
1757 retries.emplace_back(pkt, gpuDynInst);
1758
1759 if (gpuDynInst) {
1760 DPRINTF(GPUPort,
1761 "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
1762 compute_unit->cu_id, gpuDynInst->simdId,
1763 gpuDynInst->wfSlotId, id, pkt->req->getPaddr());
1764 }
1765 } else {
1766 if (gpuDynInst) {
1767 DPRINTF(GPUPort,
1768 "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data"
1769 " req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1770 gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id,
1771 pkt->req->getPaddr());
1772 }
1773 }
1774}
1775
1776const char*
1778{
1779 return "ComputeUnit scalar memory request event";
1780}
1781
1782void
1784{
1785 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1786 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1787 [[maybe_unused]] ComputeUnit *compute_unit = scalarDataPort.computeUnit;
1788
1789 if (pkt->req->systemReq()) {
1790 assert(compute_unit->shader->systemHub);
1791 SystemHubEvent *resp_event = new SystemHubEvent(pkt, &scalarDataPort);
1792 compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
1793 } else if (!(scalarDataPort.sendTimingReq(pkt))) {
1794 scalarDataPort.retries.emplace_back(pkt);
1795
1796 DPRINTF(GPUPort,
1797 "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
1798 compute_unit->cu_id, gpuDynInst->simdId,
1799 gpuDynInst->wfSlotId, pkt->req->getPaddr());
1800 } else {
1801 DPRINTF(GPUPort,
1802 "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data "
1803 "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1804 gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
1805 pkt->req->getPaddr());
1806 }
1807}
1808
1809/*
1810 * The initial translation request could have been rejected, if
1811 * <retries> queue is not empty. Retry sending the translation
1812 * request. sendRetry() is called from the peer port whenever
1813 * a translation completes.
1814 */
1815void
1817{
1818 int len = retries.size();
1819
1820 DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
1821 computeUnit->cu_id, len);
1822
1823 assert(len > 0);
1824 assert(isStalled());
1825 // recvReqRetry is an indication that the resource on which this
1826 // port was stalling on is freed. So, remove the stall first
1827 unstallPort();
1828
1829 for (int i = 0; i < len; ++i) {
1830 PacketPtr pkt = retries.front();
1831 [[maybe_unused]] Addr vaddr = pkt->req->getVaddr();
1832 DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x", computeUnit->cu_id, vaddr);
1833
1834 if (!sendTimingReq(pkt)) {
1835 // Stall port
1836 stallPort();
1837 DPRINTF(GPUTLB, ": failed again\n");
1838 break;
1839 } else {
1840 DPRINTF(GPUTLB, ": successful\n");
1841 retries.pop_front();
1842 }
1843 }
1844}
1845
1846bool
1848{
1849 assert(pkt->senderState);
1850
1851 GpuTranslationState *translation_state =
1852 safe_cast<GpuTranslationState*>(pkt->senderState);
1853
1854 // Page faults are not allowed
1855 fatal_if(!translation_state->tlbEntry,
1856 "Translation of vaddr %#x failed\n", pkt->req->getVaddr());
1857
1858 delete translation_state->tlbEntry;
1859 assert(!translation_state->ports.size());
1860
1861 pkt->senderState = translation_state->saved;
1862 delete translation_state;
1863
1864 ScalarDTLBPort::SenderState *sender_state =
1865 safe_cast<ScalarDTLBPort::SenderState*>(pkt->senderState);
1866
1867 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1868 delete pkt->senderState;
1869
1870 [[maybe_unused]] Wavefront *w = gpuDynInst->wavefront();
1871
1872 DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
1873 "translation: PA %#x -> %#x\n", computeUnit->cu_id, w->simdId,
1874 w->wfSlotId, w->kernId, pkt->req->getVaddr(), pkt->req->getPaddr());
1875
1876 MemCmd mem_cmd;
1877
1878 if (pkt->cmd == MemCmd::ReadResp) {
1879 mem_cmd = MemCmd::ReadReq;
1880 } else if (pkt->cmd == MemCmd::WriteResp) {
1881 mem_cmd = MemCmd::WriteReq;
1882 } else {
1883 fatal("Scalar DTLB receieved unexpected MemCmd response %s\n",
1884 pkt->cmd.toString());
1885 }
1886
1887 PacketPtr req_pkt = new Packet(pkt->req, mem_cmd);
1888 req_pkt->dataStatic(pkt->getPtr<uint8_t>());
1889 delete pkt;
1890
1891 req_pkt->senderState =
1893
1894 // For a system request we want to mark the GPU instruction as a system
1895 // load/store so that after the request is issued to system memory we can
1896 // return any token acquired for the request. Since tokens are returned
1897 // by the coalescer and system requests do not take that path, this needs
1898 // to be tracked.
1899 //
1900 // Device requests change the requestor ID to something in the device
1901 // memory Ruby network.
1902 if (req_pkt->req->systemReq()) {
1903 gpuDynInst->setSystemReq();
1904 } else {
1905 req_pkt->req->requestorId(computeUnit->vramRequestorId());
1906 }
1907
1908 ComputeUnit::ScalarDataPort::MemReqEvent *scalar_mem_req_event
1910 (computeUnit->scalarDataPort, req_pkt);
1911 computeUnit->schedule(scalar_mem_req_event, curTick() +
1912 computeUnit->scalar_req_tick_latency);
1913
1914 return true;
1915}
1916
1917bool
1919{
1920 [[maybe_unused]] Addr line = pkt->req->getPaddr();
1921 DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
1922 computeUnit->cu_id, pkt->req->getVaddr(), line);
1923
1924 assert(pkt->senderState);
1925
1926 // pop off the TLB translation state
1927 GpuTranslationState *translation_state
1928 = safe_cast<GpuTranslationState*>(pkt->senderState);
1929
1930 bool success = translation_state->tlbEntry != nullptr;
1931 delete translation_state->tlbEntry;
1932 assert(!translation_state->ports.size());
1933 pkt->senderState = translation_state->saved;
1934 delete translation_state;
1935
1936 // use the original sender state to know how to close this transaction
1937 ITLBPort::SenderState *sender_state =
1938 safe_cast<ITLBPort::SenderState*>(pkt->senderState);
1939
1940 // get the wavefront associated with this translation request
1941 Wavefront *wavefront = sender_state->wavefront;
1942 delete pkt->senderState;
1943
1944 if (success) {
1945 // pkt is reused in fetch(), don't delete it here. However, we must
1946 // reset the command to be a request so that it can be sent through
1947 // the cu's request port
1948 assert(pkt->cmd == MemCmd::ReadResp);
1949 pkt->cmd = MemCmd::ReadReq;
1950
1951 computeUnit->fetchStage.fetch(pkt, wavefront);
1952 } else {
1953 if (wavefront->dropFetch) {
1954 assert(wavefront->instructionBuffer.empty());
1955 wavefront->dropFetch = false;
1956 }
1957
1958 wavefront->pendingFetch = 0;
1959 }
1960
1961 return true;
1962}
1963
1964/*
1965 * The initial translation request could have been rejected, if
1966 * <retries> queue is not empty. Retry sending the translation
1967 * request. sendRetry() is called from the peer port whenever
1968 * a translation completes.
1969 */
1970void
1972{
1973
1974 int len = retries.size();
1975 DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n", computeUnit->cu_id, len);
1976
1977 assert(len > 0);
1978 assert(isStalled());
1979
1980 // recvReqRetry is an indication that the resource on which this
1981 // port was stalling is now free. So, remove the stall first
1982 unstallPort();
1983
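 // Replay the queued translation requests in order; stop and re-stall
 // the port at the first request the ITLB still cannot accept.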
1984 for (int i = 0; i < len; ++i) {
1985 PacketPtr pkt = retries.front();
1986 [[maybe_unused]] Addr vaddr = pkt->req->getVaddr();
1987 DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x", computeUnit->cu_id, vaddr);
1988
1989 if (!sendTimingReq(pkt)) {
1990 stallPort(); // Stall port
1991 DPRINTF(GPUTLB, ": failed again\n");
1992 break;
1993 } else {
1994 DPRINTF(GPUTLB, ": successful\n");
1995 retries.pop_front();
1996 }
1997 }
1998}
1999
2000void
2001ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
2002{
2003 if (gpuDynInst->isScalar()) {
2004 if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
2005 stats.sALUInsts++;
2006 stats.instCyclesSALU++;
2007 } else if (gpuDynInst->isLoad()) {
2008 stats.scalarMemReads++;
2009 } else if (gpuDynInst->isStore()) {
2010 stats.scalarMemWrites++;
2011 }
2012 } else {
2013 if (gpuDynInst->isALU()) {
2014 shader->total_valu_insts++;
2015 if (shader->total_valu_insts == shader->max_valu_insts) {
2016 exitSimLoop("max vALU insts");
2017 }
2018 stats.vALUInsts++;
2019 stats.instCyclesVALU++;
2020 stats.threadCyclesVALU
2021 += gpuDynInst->wavefront()->execMask().count();
2022 } else if (gpuDynInst->isFlat()) {
2023 if (gpuDynInst->isLocalMem()) {
2024 stats.flatLDSInsts++;
2025 } else {
2026 stats.flatVMemInsts++;
2027 }
2028 } else if (gpuDynInst->isFlatGlobal()) {
2029 stats.flatVMemInsts++;
2030 } else if (gpuDynInst->isFlatScratch()) {
2031 stats.flatVMemInsts++;
2032 } else if (gpuDynInst->isLocalMem()) {
2033 stats.ldsNoFlatInsts++;
2034 } else if (gpuDynInst->isLoad()) {
2035 stats.vectorMemReads++;
2036 } else if (gpuDynInst->isStore()) {
2037 stats.vectorMemWrites++;
2038 }
2039
2040 if (gpuDynInst->isLoad()) {
2041 switch (gpuDynInst->executedAs()) {
2042 case enums::SC_SPILL:
2043 stats.spillReads++;
2044 break;
2045 case enums::SC_GLOBAL:
2046 stats.globalReads++;
2047 break;
2048 case enums::SC_GROUP:
2049 stats.groupReads++;
2050 break;
2051 case enums::SC_PRIVATE:
2052 stats.privReads++;
2053 break;
2054 case enums::SC_READONLY:
2055 stats.readonlyReads++;
2056 break;
2057 case enums::SC_KERNARG:
2058 stats.kernargReads++;
2059 break;
2060 case enums::SC_ARG:
2061 stats.argReads++;
2062 break;
2063 case enums::SC_NONE:
2068 break;
2069 default:
2070 fatal("%s has no valid segment\n", gpuDynInst->disassemble());
2071 break;
2072 }
2073 } else if (gpuDynInst->isStore()) {
2074 switch (gpuDynInst->executedAs()) {
2075 case enums::SC_SPILL:
2076 stats.spillWrites++;
2077 break;
2078 case enums::SC_GLOBAL:
2079 stats.globalWrites++;
2080 break;
2081 case enums::SC_GROUP:
2082 stats.groupWrites++;
2083 break;
2084 case enums::SC_PRIVATE:
2085 stats.privWrites++;
2086 break;
2087 case enums::SC_READONLY:
2088 stats.readonlyWrites++;
2089 break;
2090 case enums::SC_KERNARG:
2091 stats.kernargWrites++;
2092 break;
2093 case enums::SC_ARG:
2094 stats.argWrites++;
2095 break;
2096 case enums::SC_NONE:
2101 break;
2102 default:
2103 fatal("%s has no valid segment\n", gpuDynInst->disassemble());
2104 break;
2105 }
2106 }
2107 }
2108}
2109
2110void
2111ComputeUnit::updatePageDivergenceDist(Addr addr)
2112{
2113 Addr virt_page_addr = roundDown(addr, X86ISA::PageBytes);
2114
2115 if (!pagesTouched.count(virt_page_addr))
2116 pagesTouched[virt_page_addr] = 1;
2117 else
2118 pagesTouched[virt_page_addr]++;
2119}
2120
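// When page counting is enabled, write the per-page wavefront and
// work-item access counts collected in pageAccesses to an output file
// named after this compute unit.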
2121void
2122ComputeUnit::exitCallback()
2123{
2124 if (countPages) {
2125 std::ostream *page_stat_file = simout.create(name().c_str())->stream();
2126
2127 *page_stat_file << "page, wavefront accesses, workitem accesses" <<
2128 std::endl;
2129
2130 for (auto iter : pageAccesses) {
2131 *page_stat_file << std::hex << iter.first << ",";
2132 *page_stat_file << std::dec << iter.second.first << ",";
2133 *page_stat_file << std::dec << iter.second.second << std::endl;
2134 }
2135 }
2136}
2137
2138bool
2139ComputeUnit::isDone() const
2140{
2141 for (int i = 0; i < numVectorALUs; ++i) {
2142 if (!isVectorAluIdle(i)) {
2143 return false;
2144 }
2145 }
2146
2147 // TODO: FIXME if more than 1 of any memory pipe supported
2148 if (!srfToScalarMemPipeBus.rdy()) {
2149 return false;
2150 }
2151 if (!vrfToGlobalMemPipeBus.rdy()) {
2152 return false;
2153 }
2154 if (!vrfToLocalMemPipeBus.rdy()) {
2155 return false;
2156 }
2157
2162 return false;
2163 }
2164
2165 return true;
2166}
2167
2168int32_t
2169ComputeUnit::getRefCounter(const uint32_t dispatchId,
2170 const uint32_t wgId) const
2171{
2172 return lds.getRefCounter(dispatchId, wgId);
2173}
2174
2175bool
2176ComputeUnit::isVectorAluIdle(uint32_t simdId) const
2177{
2178 assert(simdId < numVectorALUs);
2179
2180 for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){
2181 if (wfList[simdId][i_wf]->getStatus() != Wavefront::S_STOPPED) {
2182 return false;
2183 }
2184 }
2185
2186 return true;
2187}
2188
2194bool
2195ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
2196{
2197 // this is just a request to carry the GPUDynInstPtr
2198 // back and forth
2199 RequestPtr newRequest = std::make_shared<Request>();
2200 newRequest->setPaddr(0x0);
2201
2202 // ReadReq is not evaluated by the LDS but the Packet ctor requires this
2203 PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq);
2204
2205 // This is the SenderState needed upon return
2206 newPacket->senderState = new LDSPort::SenderState(gpuDynInst);
2207
2208 return ldsPort.sendTimingReq(newPacket);
2209}
2210
2219
2223bool
2224ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
2225{
2226 const ComputeUnit::LDSPort::SenderState *senderState =
2227 dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState);
2228
2229 fatal_if(!senderState, "did not get the right sort of sender state");
2230
2231 GPUDynInstPtr gpuDynInst = senderState->getMemInst();
2232
2233 delete packet->senderState;
2234 delete packet;
2235
2236 computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
2237 return true;
2238}
2239
2245bool
2246ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
2247{
2248 ComputeUnit::LDSPort::SenderState *sender_state =
2249 dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
2250 fatal_if(!sender_state, "packet without a valid sender state");
2251
2252 [[maybe_unused]] GPUDynInstPtr gpuDynInst = sender_state->getMemInst();
2253
2254 if (isStalled()) {
2255 fatal_if(retries.empty(), "must have retries waiting to be stalled");
2256
2257 retries.push(pkt);
2258
2259 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
2260 computeUnit->cu_id, gpuDynInst->simdId,
2261 gpuDynInst->wfSlotId);
2262 return false;
2263 } else if (!RequestPort::sendTimingReq(pkt)) {
2264 // need to stall the LDS port until a recvReqRetry() is received
2265 // this indicates that there is more space
2266 stallPort();
2267 retries.push(pkt);
2268
2269 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
2270 computeUnit->cu_id, gpuDynInst->simdId,
2271 gpuDynInst->wfSlotId, pkt->req->getPaddr());
2272 return false;
2273 } else {
2274 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
2275 computeUnit->cu_id, gpuDynInst->simdId,
2276 gpuDynInst->wfSlotId, pkt->req->getPaddr());
2277 return true;
2278 }
2279}
2280
2287void
2288ComputeUnit::LDSPort::recvReqRetry()
2289{
2290 auto queueSize = retries.size();
2291
2292 DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
2293 computeUnit->cu_id, queueSize);
2294
2295 fatal_if(queueSize < 1,
2296 "why was there a recvReqRetry() with no pending reqs?");
2297 fatal_if(!isStalled(),
2298 "recvReqRetry() happened when the port was not stalled");
2299
2300 unstallPort();
2301
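 // Replay the queued LDS packets in FIFO order; if the peer rejects one
 // again, re-stall the port and wait for the next retry.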
2302 while (!retries.empty()) {
2303 PacketPtr packet = retries.front();
2304
2305 DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);
2306
2307 if (!RequestPort::sendTimingReq(packet)) {
2308 // Stall port
2309 stallPort();
2310 DPRINTF(GPUPort, ": LDS send failed again\n");
2311 break;
2312 } else {
2313 DPRINTF(GPUPort, ": LDS send successful\n");
2314 retries.pop();
2315 }
2316 }
2317}
2318
2319ComputeUnit::ComputeUnitStats::ComputeUnitStats(statistics::Group *parent,
2320 int n_wf)
2321 : statistics::Group(parent),
2322 ADD_STAT(vALUInsts, "Number of vector ALU insts issued."),
2323 ADD_STAT(vALUInstsPerWF, "The avg. number of vector ALU insts issued "
2324 "per-wavefront."),
2325 ADD_STAT(sALUInsts, "Number of scalar ALU insts issued."),
2326 ADD_STAT(sALUInstsPerWF, "The avg. number of scalar ALU insts issued "
2327 "per-wavefront."),
2328 ADD_STAT(instCyclesVALU,
2329 "Number of cycles needed to execute VALU insts."),
2330 ADD_STAT(instCyclesSALU,
2331 "Number of cycles needed to execute SALU insts."),
2332 ADD_STAT(threadCyclesVALU, "Number of thread cycles used to execute "
2333 "vector ALU ops. Similar to instCyclesVALU but multiplied by "
2334 "the number of active threads."),
2335 ADD_STAT(vALUUtilization,
2336 "Percentage of active vector ALU threads in a wave."),
2337 ADD_STAT(ldsNoFlatInsts, "Number of LDS insts issued, not including FLAT"
2338 " accesses that resolve to LDS."),
2339 ADD_STAT(ldsNoFlatInstsPerWF, "The avg. number of LDS insts (not "
2340 "including FLAT accesses that resolve to LDS) per-wavefront."),
2341 ADD_STAT(flatVMemInsts,
2342 "The number of FLAT insts that resolve to vmem issued."),
2343 ADD_STAT(flatVMemInstsPerWF, "The average number of FLAT insts that "
2344 "resolve to vmem issued per-wavefront."),
2345 ADD_STAT(flatLDSInsts,
2346 "The number of FLAT insts that resolve to LDS issued."),
2347 ADD_STAT(flatLDSInstsPerWF, "The average number of FLAT insts that "
2348 "resolve to LDS issued per-wavefront."),
2349 ADD_STAT(vectorMemWrites,
2350 "Number of vector mem write insts (excluding FLAT insts)."),
2351 ADD_STAT(vectorMemWritesPerWF, "The average number of vector mem write "
2352 "insts (excluding FLAT insts) per-wavefront."),
2353 ADD_STAT(vectorMemReads,
2354 "Number of vector mem read insts (excluding FLAT insts)."),
2355 ADD_STAT(vectorMemReadsPerWF, "The avg. number of vector mem read insts "
2356 "(excluding FLAT insts) per-wavefront."),
2357 ADD_STAT(scalarMemWrites, "Number of scalar mem write insts."),
2358 ADD_STAT(scalarMemWritesPerWF,
2359 "The average number of scalar mem write insts per-wavefront."),
2360 ADD_STAT(scalarMemReads, "Number of scalar mem read insts."),
2361 ADD_STAT(scalarMemReadsPerWF,
2362 "The average number of scalar mem read insts per-wavefront."),
2363 ADD_STAT(vectorMemReadsPerKiloInst,
2364 "Number of vector mem reads per kilo-instruction"),
2365 ADD_STAT(vectorMemWritesPerKiloInst,
2366 "Number of vector mem writes per kilo-instruction"),
2367 ADD_STAT(vectorMemInstsPerKiloInst,
2368 "Number of vector mem insts per kilo-instruction"),
2369 ADD_STAT(scalarMemReadsPerKiloInst,
2370 "Number of scalar mem reads per kilo-instruction"),
2371 ADD_STAT(scalarMemWritesPerKiloInst,
2372 "Number of scalar mem writes per kilo-instruction"),
2373 ADD_STAT(scalarMemInstsPerKiloInst,
2374 "Number of scalar mem insts per kilo-instruction"),
2375 ADD_STAT(instCyclesVMemPerSimd, "Number of cycles to send address, "
2376 "command, data from VRF to vector memory unit, per SIMD"),
2377 ADD_STAT(instCyclesScMemPerSimd, "Number of cycles to send address, "
2378 "command, data from SRF to scalar memory unit, per SIMD"),
2379 ADD_STAT(instCyclesLdsPerSimd, "Number of cycles to send address, "
2380 "command, data from VRF to LDS unit, per SIMD"),
2381 ADD_STAT(globalReads, "Number of reads to the global segment"),
2382 ADD_STAT(globalWrites, "Number of writes to the global segment"),
2383 ADD_STAT(globalMemInsts,
2384 "Number of memory instructions sent to the global segment"),
2385 ADD_STAT(argReads, "Number of reads to the arg segment"),
2386 ADD_STAT(argWrites, "Number of writes to the arg segment"),
2387 ADD_STAT(argMemInsts,
2388 "Number of memory instructions sent to the arg segment"),
2389 ADD_STAT(spillReads, "Number of reads to the spill segment"),
2390 ADD_STAT(spillWrites, "Number of writes to the spill segment"),
2391 ADD_STAT(spillMemInsts,
2392 "Number of memory instructions sent to the spill segment"),
2393 ADD_STAT(groupReads, "Number of reads to the group segment"),
2394 ADD_STAT(groupWrites, "Number of writes to the group segment"),
2395 ADD_STAT(groupMemInsts,
2396 "Number of memory instructions sent to the group segment"),
2397 ADD_STAT(privReads, "Number of reads to the private segment"),
2398 ADD_STAT(privWrites, "Number of writes to the private segment"),
2399 ADD_STAT(privMemInsts,
2400 "Number of memory instructions sent to the private segment"),
2401 ADD_STAT(readonlyReads, "Number of reads to the readonly segment"),
2402 ADD_STAT(readonlyWrites,
2403 "Number of writes to the readonly segment"),
2404 ADD_STAT(readonlyMemInsts,
2405 "Number of memory instructions sent to the readonly segment"),
2406 ADD_STAT(kernargReads, "Number of reads sent to the kernarg segment"),
2407 ADD_STAT(kernargWrites,
2408 "Number of writes to the kernarg segment"),
2409 ADD_STAT(kernargMemInsts,
2410 "Number of memory instructions sent to the kernarg segment"),
2411 ADD_STAT(waveLevelParallelism,
2412 "wave level parallelism: count of active waves at wave launch"),
2413 ADD_STAT(tlbRequests, "number of uncoalesced requests"),
2414 ADD_STAT(tlbCycles,
2415 "total number of cycles for all uncoalesced requests"),
2416 ADD_STAT(tlbLatency, "Avg. translation latency for data translations"),
2417 ADD_STAT(hitsPerTLBLevel,
2418 "TLB hits distribution (0 for page table, x for Lx-TLB)"),
2419 ADD_STAT(ldsBankAccesses, "Total number of LDS bank accesses"),
2420 ADD_STAT(ldsBankConflictDist,
2421 "Number of bank conflicts per LDS memory packet"),
2422 ADD_STAT(pageDivergenceDist,
2423 "pages touched per wf (over all mem. instr.)"),
2424 ADD_STAT(dynamicGMemInstrCnt,
2425 "dynamic non-flat global memory instruction count"),
2426 ADD_STAT(dynamicFlatMemInstrCnt,
2427 "dynamic flat global memory instruction count"),
2428 ADD_STAT(dynamicLMemInstrCnt, "dynamic local memory instruction count"),
2429 ADD_STAT(wgBlockedDueBarrierAllocation,
2430 "WG dispatch was blocked due to lack of barrier resources"),
2431 ADD_STAT(wgBlockedDueLdsAllocation,
2432 "Workgroup blocked due to LDS capacity"),
2433 ADD_STAT(numInstrExecuted, "number of instructions executed"),
2434 ADD_STAT(execRateDist, "Instruction Execution Rate: Number of executed "
2435 "vector instructions per cycle"),
2436 ADD_STAT(numVecOpsExecuted,
2437 "number of vec ops executed (e.g. WF size/inst)"),
2438 ADD_STAT(numVecOpsExecutedF16,
2439 "number of f16 vec ops executed (e.g. WF size/inst)"),
2440 ADD_STAT(numVecOpsExecutedF32,
2441 "number of f32 vec ops executed (e.g. WF size/inst)"),
2442 ADD_STAT(numVecOpsExecutedF64,
2443 "number of f64 vec ops executed (e.g. WF size/inst)"),
2444 ADD_STAT(numVecOpsExecutedFMA16,
2445 "number of fma16 vec ops executed (e.g. WF size/inst)"),
2446 ADD_STAT(numVecOpsExecutedFMA32,
2447 "number of fma32 vec ops executed (e.g. WF size/inst)"),
2448 ADD_STAT(numVecOpsExecutedFMA64,
2449 "number of fma64 vec ops executed (e.g. WF size/inst)"),
2450 ADD_STAT(numVecOpsExecutedMAC16,
2451 "number of mac16 vec ops executed (e.g. WF size/inst)"),
2452 ADD_STAT(numVecOpsExecutedMAC32,
2453 "number of mac32 vec ops executed (e.g. WF size/inst)"),
2454 ADD_STAT(numVecOpsExecutedMAC64,
2455 "number of mac64 vec ops executed (e.g. WF size/inst)"),
2456 ADD_STAT(numVecOpsExecutedMAD16,
2457 "number of mad16 vec ops executed (e.g. WF size/inst)"),
2458 ADD_STAT(numVecOpsExecutedMAD32,
2459 "number of mad32 vec ops executed (e.g. WF size/inst)"),
2460 ADD_STAT(numVecOpsExecutedMAD64,
2461 "number of mad64 vec ops executed (e.g. WF size/inst)"),
2462 ADD_STAT(numVecOpsExecutedMFMA,
2463 "number of mfma vec ops executed (e.g. WF size/inst)"),
2464 ADD_STAT(numVecOpsExecutedMFMAI8,
2465 "number of i8 mfma vec ops executed (e.g. WF size/inst)"),
2466 ADD_STAT(numVecOpsExecutedMFMAF16,
2467 "number of f16 mfma vec ops executed (e.g. WF size/inst)"),
2468 ADD_STAT(numVecOpsExecutedMFMAF32,
2469 "number of f32 mfma vec ops executed (e.g. WF size/inst)"),
2470 ADD_STAT(numVecOpsExecutedMFMAF64,
2471 "number of f64 mfma vec ops executed (e.g. WF size/inst)"),
2472 ADD_STAT(numVecOpsExecutedTwoOpFP,
2473 "number of two op FP vec ops executed (e.g. WF size/inst)"),
2474 ADD_STAT(totalCycles, "number of cycles the CU ran for"),
2475 ADD_STAT(vpc, "Vector Operations per cycle (this CU only)"),
2476 ADD_STAT(vpc_f16, "F16 Vector Operations per cycle (this CU only)"),
2477 ADD_STAT(vpc_f32, "F32 Vector Operations per cycle (this CU only)"),
2478 ADD_STAT(vpc_f64, "F64 Vector Operations per cycle (this CU only)"),
2479 ADD_STAT(ipc, "Instructions per cycle (this CU only)"),
2480 ADD_STAT(controlFlowDivergenceDist, "number of lanes active per "
2481 "instruction (over all instructions)"),
2482 ADD_STAT(activeLanesPerGMemInstrDist,
2483 "number of active lanes per global memory instruction"),
2484 ADD_STAT(activeLanesPerLMemInstrDist,
2485 "number of active lanes per local memory instruction"),
2486 ADD_STAT(numALUInstsExecuted,
2487 "Number of dynamic non-GM memory insts executed"),
2488 ADD_STAT(numTimesWgBlockedDueVgprAlloc, "Number of times WGs are "
2489 "blocked due to VGPR allocation per SIMD"),
2490 ADD_STAT(numTimesWgBlockedDueSgprAlloc, "Number of times WGs are "
2491 "blocked due to SGPR allocation per SIMD"),
2492 ADD_STAT(numCASOps, "number of compare and swap operations"),
2493 ADD_STAT(numFailedCASOps,
2494 "number of compare and swap operations that failed"),
2495 ADD_STAT(completedWfs, "number of completed wavefronts"),
2496 ADD_STAT(completedWGs, "number of completed workgroups"),
2497 ADD_STAT(headTailLatency, "ticks between first and last cache block "
2498 "arrival at coalescer"),
2499 ADD_STAT(instInterleave, "Measure of instruction interleaving per SIMD")
2500{
2501 ComputeUnit *cu = static_cast<ComputeUnit*>(parent);
2502
2503 instCyclesVMemPerSimd.init(cu->numVectorALUs);
2504 instCyclesScMemPerSimd.init(cu->numVectorALUs);
2505 instCyclesLdsPerSimd.init(cu->numVectorALUs);
2506
2507 hitsPerTLBLevel.init(4);
2508 execRateDist.init(0, 10-1, 2);
2509 ldsBankConflictDist.init(0, cu->wfSize()-1, 2);
2510
2511 pageDivergenceDist.init(1, cu->wfSize(), 4);
2512 controlFlowDivergenceDist.init(1, cu->wfSize(), 4);
2513 activeLanesPerGMemInstrDist.init(1, cu->wfSize(), 4);
2514 activeLanesPerLMemInstrDist.init(1, cu->wfSize(), 4);
2515
2516 headTailLatency.init(0, 1000000-1, 10000).flags(statistics::pdf |
2517 statistics::oneline);
2518 waveLevelParallelism.init(0, n_wf * cu->numVectorALUs, 1);
2519 instInterleave.init(cu->numVectorALUs, 0, 20, 1);
2520
2531
2540
2548
2550
2551 // fixed number of TLB levels
2552 for (int i = 0; i < 4; ++i) {
2553 if (!i)
2554 hitsPerTLBLevel.subname(i,"page_table");
2555 else
2556 hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
2557 }
2558
2564
2567}
2568
2569} // namespace gem5