gem5 v24.0.0.0
compute_unit.cc
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
33
34#include <limits>
35
38#include "base/output.hh"
39#include "debug/GPUDisp.hh"
40#include "debug/GPUExec.hh"
41#include "debug/GPUFetch.hh"
42#include "debug/GPUMem.hh"
43#include "debug/GPUPort.hh"
44#include "debug/GPUPrefetch.hh"
45#include "debug/GPUReg.hh"
46#include "debug/GPURename.hh"
47#include "debug/GPUSync.hh"
48#include "debug/GPUTLB.hh"
55#include "gpu-compute/shader.hh"
59#include "mem/page_table.hh"
60#include "sim/process.hh"
61#include "sim/sim_exit.hh"
62
63namespace gem5
64{
65
67 numVectorGlobalMemUnits(p.num_global_mem_pipes),
68 numVectorSharedMemUnits(p.num_shared_mem_pipes),
69 numScalarMemUnits(p.num_scalar_mem_pipes),
70 numVectorALUs(p.num_SIMDs),
71 numScalarALUs(p.num_scalar_cores),
72 vrfToCoalescerBusWidth(p.vrf_to_coalescer_bus_width),
73 coalescerToVrfBusWidth(p.coalescer_to_vrf_bus_width),
74 registerManager(p.register_manager),
75 fetchStage(p, *this),
76 scoreboardCheckStage(p, *this, scoreboardCheckToSchedule),
77 scheduleStage(p, *this, scoreboardCheckToSchedule, scheduleToExecute),
78 execStage(p, *this, scheduleToExecute),
79 globalMemoryPipe(p, *this),
80 localMemoryPipe(p, *this),
81 scalarMemoryPipe(p, *this),
82 tickEvent([this]{ exec(); }, "Compute unit tick event",
83 false, Event::CPU_Tick_Pri),
84 cu_id(p.cu_id),
85 vrf(p.vector_register_file), srf(p.scalar_register_file),
86 rfc(p.register_file_cache),
87 simdWidth(p.simd_width),
88 spBypassPipeLength(p.spbypass_pipe_length),
89 dpBypassPipeLength(p.dpbypass_pipe_length),
90 rfcPipeLength(p.rfc_pipe_length),
91 scalarPipeStages(p.scalar_pipe_length),
92 operandNetworkLength(p.operand_network_length),
93 issuePeriod(p.issue_period),
94 vrf_gm_bus_latency(p.vrf_gm_bus_latency),
95 srf_scm_bus_latency(p.srf_scm_bus_latency),
96 vrf_lm_bus_latency(p.vrf_lm_bus_latency),
97 perLaneTLB(p.perLaneTLB), prefetchDepth(p.prefetch_depth),
98 prefetchStride(p.prefetch_stride), prefetchType(p.prefetch_prev_type),
99 debugSegFault(p.debugSegFault),
100 functionalTLB(p.functionalTLB), localMemBarrier(p.localMemBarrier),
101 countPages(p.countPages),
102 req_tick_latency(p.mem_req_latency * p.clk_domain->clockPeriod()),
103 resp_tick_latency(p.mem_resp_latency * p.clk_domain->clockPeriod()),
104 scalar_req_tick_latency(
105 p.scalar_mem_req_latency * p.clk_domain->clockPeriod()),
106 scalar_resp_tick_latency(
107 p.scalar_mem_resp_latency * p.clk_domain->clockPeriod()),
108 _requestorId(p.system->getRequestorId(this, "ComputeUnit")),
109 lds(*p.localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
110 ldsPort(csprintf("%s-port", name()), this),
111 scalarDataPort(csprintf("%s-port", name()), this),
112 scalarDTLBPort(csprintf("%s-port", name()), this),
113 sqcPort(csprintf("%s-port", name()), this),
114 sqcTLBPort(csprintf("%s-port", name()), this),
115 _cacheLineSize(p.system->cacheLineSize()),
116 _numBarrierSlots(p.num_barrier_slots),
117 globalSeqNum(0), wavefrontSize(p.wf_size),
118 scoreboardCheckToSchedule(p),
119 scheduleToExecute(p),
120 stats(this, p.n_wf)
121{
122 // This is not currently supported and would require adding more handling
123 // for system vs. device memory requests on the functional paths, so we
124 // fatal immediately in the constructor if this configuration is seen.
125 fatal_if(functionalTLB && FullSystem,
126 "Functional TLB not supported in full-system GPU simulation");
127
137 fatal_if(p.wf_size > std::numeric_limits<unsigned long long>::digits ||
138 p.wf_size <= 0,
139 "WF size must be positive and no larger than the host can support");
140 fatal_if(!isPowerOf2(wavefrontSize),
141 "Wavefront size should be a power of 2");
142 // calculate how many cycles a vector load or store will need to transfer
143 // its data over the corresponding buses
144 numCyclesPerStoreTransfer =
145 (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
146 (double)vrfToCoalescerBusWidth);
147
148 numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
149 / coalescerToVrfBusWidth;
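    // Worked example (illustrative only; the real values come from the
    // Python configuration): assuming a 64-lane wavefront moving one 32-bit
    // value per lane over 32-byte buses,
    //   numCyclesPerStoreTransfer = ceil(64 * 4 / 32.0) = 8 cycles
    //   numCyclesPerLoadTransfer  = (64 * 4) / 32       = 8 cycles
    // The store path uses ceil() so that bus widths which do not evenly
    // divide the wavefront's data still round up to a whole cycle.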
150
151 // Initialization: all WF slots are assumed STOPPED
152 idleWfs = p.n_wf * numVectorALUs;
153 lastVaddrWF.resize(numVectorALUs);
154 wfList.resize(numVectorALUs);
155
156 wfBarrierSlots.resize(p.num_barrier_slots, WFBarrier());
157
158 for (int i = 0; i < p.num_barrier_slots; ++i) {
159 freeBarrierIds.insert(i);
160 }
161
162 for (int j = 0; j < numVectorALUs; ++j) {
163 lastVaddrWF[j].resize(p.n_wf);
164
165 for (int i = 0; i < p.n_wf; ++i) {
166 lastVaddrWF[j][i].resize(wfSize());
167
168 wfList[j].push_back(p.wavefronts[j * p.n_wf + i]);
169 wfList[j][i]->setParent(this);
170
171 for (int k = 0; k < wfSize(); ++k) {
172 lastVaddrWF[j][i][k] = 0;
173 }
174 }
175 }
176
177 lastVaddrSimd.resize(numVectorALUs);
178
179 for (int i = 0; i < numVectorALUs; ++i) {
180 lastVaddrSimd[i].resize(wfSize(), 0);
181 }
182
183 lastVaddrCU.resize(wfSize());
184
185 lds.setParent(this);
186
187 if (p.execPolicy == "OLDEST-FIRST") {
188 exec_policy = EXEC_POLICY::OLDEST;
189 } else if (p.execPolicy == "ROUND-ROBIN") {
190 exec_policy = EXEC_POLICY::RR;
191 } else {
192 fatal("Invalid WF execution policy (CU)\n");
193 }
194
195 for (int i = 0; i < p.port_memory_port_connection_count; ++i) {
196 memPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
197 }
198
199 for (int i = 0; i < p.port_translation_port_connection_count; ++i) {
200 tlbPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
201 }
202
203 // Setup tokens for response ports. The number of tokens in memPortTokens
204 // is the total token count for the entire vector port (i.e., this CU).
205 memPortTokens = new TokenManager(p.max_cu_tokens);
206
207 registerExitCallback([this]() { exitCallback(); });
208
209 lastExecCycle.resize(numVectorALUs, 0);
210
211 for (int i = 0; i < vrf.size(); ++i) {
212 vrf[i]->setParent(this);
213 rfc[i]->setParent(this);
214 }
215 for (int i = 0; i < srf.size(); ++i) {
216 srf[i]->setParent(this);
217 }
218 numVecRegsPerSimd = vrf[0]->numRegs();
219 numScalarRegsPerSimd = srf[0]->numRegs();
220
221 registerManager->setParent(this);
222
223 activeWaves = 0;
224
225 instExecPerSimd.resize(numVectorALUs, 0);
226
227 // Calculate the number of bits to address a cache line
228 panic_if(!isPowerOf2(_cacheLineSize),
229 "Cache line size should be a power of two.");
230 cacheLineBits = floorLog2(_cacheLineSize);
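    // For example, assuming the typical 64-byte cache line:
    //   cacheLineBits = floorLog2(64) = 6,
    // i.e., the low 6 bits of an address select a byte within the line.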
231}
232
234{
235 // Delete wavefront slots
236 for (int j = 0; j < numVectorALUs; ++j) {
237 for (int i = 0; i < shader->n_wf; ++i) {
238 delete wfList[j][i];
239 }
240 lastVaddrSimd[j].clear();
241 }
242 lastVaddrCU.clear();
243}
244
245int
251
252// index into readyList of the first memory unit
253int
258
259// index into readyList of the last memory unit
260int
262{
263 return numExeUnits() - 1;
264}
265
266// index into scalarALUs vector of SALU used by the wavefront
267int
269{
270 if (numScalarALUs == 1) {
271 return 0;
272 } else {
273 return w->simdId % numScalarALUs;
274 }
275}
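// Illustrative mapping (assuming 4 SIMDs and 2 scalar ALUs, which are
// configuration-dependent values): since the index is w->simdId %
// numScalarALUs, wavefronts on SIMDs 0 and 2 share SALU 0 while wavefronts
// on SIMDs 1 and 3 share SALU 1.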
276
277// index into readyList of Scalar ALU unit used by wavefront
278int
283
284// index into readyList of Global Memory unit used by wavefront
285int
287{
288 // TODO: FIXME if more than 1 GM pipe supported
290}
291
292// index into readyList of Local Memory unit used by wavefront
293int
295{
296 // TODO: FIXME if more than 1 LM pipe supported
298}
299
300// index into readyList of Scalar Memory unit used by wavefront
301int
303{
304 // TODO: FIXME if more than 1 ScM pipe supported
307}
308
309void
311{
312 w->resizeRegFiles(task->numVectorRegs(), task->numScalarRegs());
313 w->workGroupSz[0] = task->wgSize(0);
314 w->workGroupSz[1] = task->wgSize(1);
315 w->workGroupSz[2] = task->wgSize(2);
316 w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
317 w->gridSz[0] = task->gridSize(0);
318 w->gridSz[1] = task->gridSize(1);
319 w->gridSz[2] = task->gridSize(2);
320 w->computeActualWgSz(task);
321}
322
323void
325 HSAQueueEntry *task, int bar_id, bool fetchContext)
326{
327 static int _n_wave = 0;
328
329 VectorMask init_mask;
330 init_mask.reset();
331
332 for (int k = 0; k < wfSize(); ++k) {
333 if (k + waveId * wfSize() < w->actualWgSzTotal)
334 init_mask[k] = 1;
335 }
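    // Example (illustrative, assuming wfSize() == 64): for a work-group with
    // actualWgSzTotal == 100, wave 0 enables all 64 lanes, while wave 1
    // enables only lanes 0..35 (64 + 36 == 100); the remaining lanes stay
    // masked off for the lifetime of the wavefront.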
336
337 w->execMask() = init_mask;
338
339 w->kernId = task->dispatchId();
340 w->wfId = waveId;
341 w->initMask = init_mask.to_ullong();
342
343 if (bar_id > WFBarrier::InvalidID) {
344 w->barrierId(bar_id);
345 } else {
346 assert(!w->hasBarrier());
347 }
348
349 for (int k = 0; k < wfSize(); ++k) {
350 w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
351 w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
352 w->actualWgSz[1];
353 w->workItemId[2][k] = (k + waveId * wfSize()) /
354 (w->actualWgSz[0] * w->actualWgSz[1]);
355
356 w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
357 w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
358 w->workItemId[0][k];
359 }
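    // Worked example (illustrative): with actualWgSz == {16, 4, 2},
    // wfSize() == 64 and waveId == 1, lane k == 0 is the work-group's
    // 65th work-item, so
    //   workItemId[0]  = 64 % 16             = 0
    //   workItemId[1]  = (64 / 16) % 4       = 0
    //   workItemId[2]  = 64 / (16 * 4)       = 1
    //   workItemFlatId = 1*16*4 + 0*16 + 0   = 64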
360
361 // WG state
362 w->wgId = task->globalWgId();
363 w->dispatchId = task->dispatchId();
364 w->workGroupId[0] = w->wgId % task->numWg(0);
365 w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1);
366 w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1));
367
368 // set the wavefront context to have a pointer to this section of the LDS
369 w->ldsChunk = ldsChunk;
370
371 [[maybe_unused]] int32_t refCount =
372 lds.increaseRefCounter(w->dispatchId, w->wgId);
373 DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
374 cu_id, w->wgId, refCount);
375
376 w->instructionBuffer.clear();
377
378 if (w->pendingFetch)
379 w->dropFetch = true;
380
381 DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
382 "WF[%d][%d]. Ref cnt:%d\n", _n_wave, w->barrierId(), cu_id,
383 w->simdId, w->wfSlotId, refCount);
384
385 w->initRegState(task, w->actualWgSzTotal);
386 w->start(_n_wave++, task->codeAddr());
387
389 activeWaves++;
390
391 panic_if(w->wrGmReqsInPipe, "GM write counter for wavefront non-zero\n");
392 panic_if(w->rdGmReqsInPipe, "GM read counter for wavefront non-zero\n");
393 panic_if(w->wrLmReqsInPipe, "LM write counter for wavefront non-zero\n");
 394 panic_if(w->rdLmReqsInPipe, "LM read counter for wavefront non-zero\n");
395 panic_if(w->outstandingReqs,
396 "Outstanding reqs counter for wavefront non-zero\n");
397}
398
404void
406 GPUDynInstPtr gpuDynInst
407 = std::make_shared<GPUDynInst>(this, nullptr,
409
410 // kern_id will be used in inv responses
411 gpuDynInst->kern_id = kernId;
412 // update contextId field
413 req->setContext(gpuDynInst->wfDynId);
414
415 injectGlobalMemFence(gpuDynInst, true, req);
416}
417
423void
425 injectGlobalMemFence(gpuDynInst, true);
426}
427
433void
435 GPUDynInstPtr gpuDynInst
436 = std::make_shared<GPUDynInst>(this, nullptr,
438
439 // kern_id will be used in inv responses
440 gpuDynInst->kern_id = kernId;
441 // update contextId field
442 req->setContext(gpuDynInst->wfDynId);
443
444 gpuDynInst->staticInstruction()->setFlag(GPUStaticInst::Scalar);
445 scalarMemoryPipe.injectScalarMemFence(gpuDynInst, true, req);
446}
447
 448// Reset the SIMD register pools. There is no better place to do this,
 449// and it is required by this implementation.
 450
451void
453{
454 for (int i=0; i<numVectorALUs; i++)
455 {
458 }
459}
460
461void
463{
464 // If we aren't ticking, start it up!
465 if (!tickEvent.scheduled()) {
466 DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id);
468 }
469
470 // the kernel's invalidate must have finished before any wg dispatch
471 assert(task->isInvDone());
472
473 // reserve the LDS capacity allocated to the work group
474 // disambiguated by the dispatch ID and workgroup ID, which should be
475 // globally unique
476 LdsChunk *ldsChunk = lds.reserveSpace(task->dispatchId(),
477 task->globalWgId(),
478 task->ldsSize());
479
480 panic_if(!ldsChunk, "was not able to reserve space for this WG");
481
482 // calculate the number of 32-bit vector registers required
483 // by each work item
484 int vregDemand = task->numVectorRegs();
485 int sregDemand = task->numScalarRegs();
486 int wave_id = 0;
487
488 int barrier_id = WFBarrier::InvalidID;
489
494 if (num_wfs_in_wg > 1) {
499 barrier_id = getFreeBarrierId();
500 auto &wf_barrier = barrierSlot(barrier_id);
501 assert(!wf_barrier.maxBarrierCnt());
502 assert(!wf_barrier.numAtBarrier());
503 wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);
504
505 DPRINTF(GPUSync, "CU[%d] - Dispatching WG with barrier Id%d. "
506 "%d waves using this barrier.\n", cu_id, barrier_id,
507 num_wfs_in_wg);
508 }
509
510 // Assign WFs according to numWfsToSched vector, which is computed by
511 // hasDispResources()
512 for (int j = 0; j < shader->n_wf; ++j) {
513 for (int i = 0; i < numVectorALUs; ++i) {
514 Wavefront *w = wfList[i][j];
515 // Check if this wavefront slot is available and there are WFs
516 // remaining to be dispatched to current SIMD:
517 // WF slot must be stopped and not waiting
 518 // for a release to complete (i.e., not in S_RETURNING)
519 if (w->getStatus() == Wavefront::S_STOPPED &&
520 numWfsToSched[i] > 0) {
521 // decrement number of WFs awaiting dispatch to current SIMD
522 numWfsToSched[i] -= 1;
523
524 fillKernelState(w, task);
525
526 DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] "
527 "vregDemand[%d] sregDemand[%d]\n", i, j, w->wfDynId,
528 vregDemand, sregDemand);
529
530 registerManager->allocateRegisters(w, vregDemand, sregDemand);
531
532 startWavefront(w, wave_id, ldsChunk, task, barrier_id);
533 ++wave_id;
534 }
535 }
536 }
537}
538
539void
541{
542 panic_if(w->instructionBuffer.empty(),
543 "Instruction Buffer of WF%d can't be empty", w->wgId);
544 GPUDynInstPtr ii = w->instructionBuffer.front();
545 pipeMap.emplace(ii->seqNum());
546}
547
548void
550{
551 panic_if(w->instructionBuffer.empty(),
552 "Instruction Buffer of WF%d can't be empty", w->wgId);
553 GPUDynInstPtr ii = w->instructionBuffer.front();
554 // delete the dynamic instruction from the pipeline map
555 auto it = pipeMap.find(ii->seqNum());
 556 panic_if(it == pipeMap.end(), "Instruction not found in pipeline map\n");
557 pipeMap.erase(it);
558}
559
560bool
562{
563 // compute true size of workgroup (after clamping to grid size)
564 int trueWgSize[HSAQueueEntry::MAX_DIM];
565 int trueWgSizeTotal = 1;
566
567 for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
568 trueWgSize[d] = std::min(task->wgSize(d), task->gridSize(d) -
569 task->wgId(d) * task->wgSize(d));
570
571 trueWgSizeTotal *= trueWgSize[d];
572 DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
573 }
574
575 DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
576
577 // calculate the number of WFs in this WG
578 int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
579 num_wfs_in_wg = numWfs;
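    // Example of the ceiling division above (illustrative, assuming
    // wfSize() == 64): a work-group of 100 work-items needs
    //   numWfs = (100 + 64 - 1) / 64 = 2
    // wavefronts, the second of which is only partially populated.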
580
581 bool barrier_avail = true;
582
583 if (numWfs > 1 && !freeBarrierIds.size()) {
584 barrier_avail = false;
585 }
586
587 // calculate the number of 32-bit vector registers required by each
588 // work item of the work group
589 int vregDemandPerWI = task->numVectorRegs();
590 // calculate the number of 32-bit scalar registers required by each
591 // work item of the work group
592 int sregDemandPerWI = task->numScalarRegs();
593
 594 // check if the total number of VGPRs and SGPRs required by all WFs
595 // of the WG fit in the VRFs of all SIMD units and the CU's SRF
596 panic_if((numWfs * vregDemandPerWI) > (numVectorALUs * numVecRegsPerSimd),
597 "WG with %d WFs and %d VGPRs per WI can not be allocated to CU "
598 "that has %d VGPRs\n",
599 numWfs, vregDemandPerWI, numVectorALUs * numVecRegsPerSimd);
600 panic_if((numWfs * sregDemandPerWI) > numScalarRegsPerSimd,
601 "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU "
602 "with %d SGPRs\n",
603 numWfs, sregDemandPerWI, numScalarRegsPerSimd);
604
605 // number of WF slots that are not occupied
606 int freeWfSlots = 0;
607 // number of Wfs from WG that were successfully mapped to a SIMD
608 int numMappedWfs = 0;
609 numWfsToSched.clear();
610 numWfsToSched.resize(numVectorALUs, 0);
611
612 // attempt to map WFs to the SIMDs, based on WF slot availability
613 // and register file availability
614 for (int j = 0; j < shader->n_wf; ++j) {
615 for (int i = 0; i < numVectorALUs; ++i) {
616 if (wfList[i][j]->getStatus() == Wavefront::S_STOPPED) {
617 ++freeWfSlots;
618 // check if current WF will fit onto current SIMD/VRF
619 // if all WFs have not yet been mapped to the SIMDs
620 if (numMappedWfs < numWfs &&
622 sregDemandPerWI) &&
624 vregDemandPerWI)) {
625 numWfsToSched[i]++;
626 numMappedWfs++;
627 }
628 }
629 }
630 }
631
632 // check that the number of mapped WFs is not greater
633 // than the actual number of WFs
634 assert(numMappedWfs <= numWfs);
635
636 bool vregAvail = true;
637 bool sregAvail = true;
638 // if a WF to SIMD mapping was not found, find the limiting resource
639 if (numMappedWfs < numWfs) {
640
641 for (int j = 0; j < numVectorALUs; ++j) {
642 // find if there are enough free VGPRs in the SIMD's VRF
 643 // to accommodate the WFs of the new WG that would be mapped
644 // to this SIMD unit
645 vregAvail &= registerManager->
646 canAllocateVgprs(j, numWfsToSched[j], vregDemandPerWI);
647 // find if there are enough free SGPRs in the SIMD's SRF
 648 // to accommodate the WFs of the new WG that would be mapped
649 // to this SIMD unit
650 sregAvail &= registerManager->
651 canAllocateSgprs(j, numWfsToSched[j], sregDemandPerWI);
652 }
653 }
654
 655 DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, "
 656 "VGPR Availability = %d, SGPR Availability = %d\n",
 657 freeWfSlots, numMappedWfs, vregAvail, sregAvail);
658
659 if (!vregAvail) {
661 }
662
663 if (!sregAvail) {
665 }
666
667 // Return true if enough WF slots to submit workgroup and if there are
668 // enough VGPRs to schedule all WFs to their SIMD units
669 bool ldsAvail = lds.canReserve(task->ldsSize());
670 if (!ldsAvail) {
672 }
673
674 if (!barrier_avail) {
676 }
677
678 // Return true if the following are all true:
679 // (a) all WFs of the WG were mapped to free WF slots
680 // (b) there are enough VGPRs to schedule all WFs to their SIMD units
681 // (c) there are enough SGPRs on the CU to schedule all WFs
682 // (d) there is enough space in LDS to allocate for all WFs
683 bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
684 && ldsAvail && barrier_avail;
685 return can_dispatch;
686}
687
688int
690{
691 auto &wf_barrier = barrierSlot(bar_id);
692 return wf_barrier.numYetToReachBarrier();
693}
694
695bool
697{
698 auto &wf_barrier = barrierSlot(bar_id);
699 return wf_barrier.allAtBarrier();
700}
701
702void
704{
705 auto &wf_barrier = barrierSlot(bar_id);
706 wf_barrier.incNumAtBarrier();
707}
708
709int
711{
712 auto &wf_barrier = barrierSlot(bar_id);
713 return wf_barrier.numAtBarrier();
714}
715
716int
718{
719 auto &wf_barrier = barrierSlot(bar_id);
720 return wf_barrier.maxBarrierCnt();
721}
722
723void
725{
726 auto &wf_barrier = barrierSlot(bar_id);
727 wf_barrier.reset();
728}
729
730void
732{
733 auto &wf_barrier = barrierSlot(bar_id);
734 wf_barrier.decMaxBarrierCnt();
735}
736
737void
739{
740 auto &wf_barrier = barrierSlot(bar_id);
741 wf_barrier.release();
742 freeBarrierIds.insert(bar_id);
743}
744
745void
747{
748 for (int i = 0; i < numVectorALUs; ++i) {
749 for (int j = 0; j < shader->n_wf; ++j) {
750 Wavefront *wf = wfList[i][j];
751 if (wf->barrierId() == bar_id) {
752 assert(wf->getStatus() == Wavefront::S_BARRIER);
754 }
755 }
756 }
757}
758
759// Execute one clock worth of work on the ComputeUnit.
760void
762{
763 // process reads and writes in the RFs
764 for (auto &vecRegFile : vrf) {
765 vecRegFile->exec();
766 }
767
768 for (auto &scRegFile : srf) {
769 scRegFile->exec();
770 }
771
772 // Execute pipeline stages in reverse order to simulate
773 // the pipeline latency
777 execStage.exec();
781
783
784 // Put this CU to sleep if there is no more work to be done.
785 if (!isDone()) {
787 } else {
789 DPRINTF(GPUDisp, "CU%d: Going to sleep\n", cu_id);
790 }
791}
792
793void
795{
796 // Initialize CU Bus models and execution resources
797
798 // Vector ALUs
799 vectorALUs.clear();
800 for (int i = 0; i < numVectorALUs; i++) {
801 vectorALUs.emplace_back(this, clockPeriod());
802 }
803
804 // Scalar ALUs
805 scalarALUs.clear();
806 for (int i = 0; i < numScalarALUs; i++) {
807 scalarALUs.emplace_back(this, clockPeriod());
808 }
809
810 // Vector Global Memory
812 "No support for multiple Global Memory Pipelines exists!!!");
816
817 // Vector Local/Shared Memory
819 "No support for multiple Local Memory Pipelines exists!!!");
823
824 // Scalar Memory
826 "No support for multiple Scalar Memory Pipelines exists!!!");
830
833
836 execStage.init();
838
840}
841
842bool
847
848bool
850{
851 // Ruby has completed the memory op. Schedule the mem_resp_event at the
852 // appropriate cycle to process the timing memory response
853 // This delay represents the pipeline delay
854 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
855 PortID index = sender_state->port_index;
856 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
857 GPUDispatcher &dispatcher = computeUnit->shader->dispatcher();
858
859 // MemSyncResp + WriteAckResp are handled completely here and we don't
860 // schedule a MemRespEvent to process the responses further
861 if (pkt->cmd == MemCmd::MemSyncResp) {
862 // This response is for 1 of the following request types:
863 // - kernel launch
864 // - kernel end
865 // - non-kernel mem sync
866
867 // Non-kernel mem sync not from an instruction
868 if (!gpuDynInst) {
869 // If there is no dynamic instruction, a CU must be present.
870 ComputeUnit *cu = sender_state->computeUnit;
871 assert(cu != nullptr);
872
873 if (pkt->req->isInvL2()) {
875 assert(cu->shader->getNumOutstandingInvL2s() >= 0);
876 } else {
877 panic("Unknown MemSyncResp not from an instruction");
878 }
879
880 // Cleanup and return, no other response events needed.
881 delete pkt->senderState;
882 delete pkt;
883 return true;
884 }
885
886 // Kernel Launch
887 // wavefront was nullptr when launching kernel, so it is meaningless
888 // here (simdId=-1, wfSlotId=-1)
889 if (gpuDynInst->isKernelLaunch()) {
890 // for kernel launch, the original request must be both kernel-type
891 // and INV_L1
892 assert(pkt->req->isKernel());
893 assert(pkt->req->isInvL1());
894
895 // one D-Cache inv is done, decrement counter
896 dispatcher.updateInvCounter(gpuDynInst->kern_id);
897
898 delete pkt->senderState;
899 delete pkt;
900 return true;
901 }
902
903 // retrieve wavefront from inst
904 Wavefront *w = gpuDynInst->wavefront();
905
906 // Check if we are waiting on Kernel End Flush
907 if (w->getStatus() == Wavefront::S_RETURNING
908 && gpuDynInst->isEndOfKernel()) {
909 // for kernel end, the original request must be both kernel-type
910 // and last-level GPU cache should be flushed if it contains
911 // dirty data. This request may have been quiesced and
912 // immediately responded to if the GL2 is a write-through /
913 // read-only cache.
914 assert(pkt->req->isKernel());
915 assert(pkt->req->isGL2CacheFlush());
916
917 // once flush done, decrement counter, and return whether all
918 // dirty writeback operations are done for the kernel
919 bool isWbDone = dispatcher.updateWbCounter(gpuDynInst->kern_id);
920
921 // not all wbs are done for the kernel, just release pkt
922 // resources
923 if (!isWbDone) {
924 delete pkt->senderState;
925 delete pkt;
926 return true;
927 }
928
929 // all wbs are completed for the kernel, do retirement work
930 // for the workgroup
931 DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
932 computeUnit->cu_id, w->simdId, w->wfSlotId,
933 w->wfDynId, w->wgId);
934
935 dispatcher.notifyWgCompl(w);
936 w->setStatus(Wavefront::S_STOPPED);
937 }
938
939 if (!pkt->req->isKernel()) {
940 w = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
941 DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing "
942 "outstanding reqs %d => %d\n", gpuDynInst->simdId,
943 gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
944 gpuDynInst->disassemble(), w->outstandingReqs,
945 w->outstandingReqs - 1);
946 computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
947 }
948
949 delete pkt->senderState;
950 delete pkt;
951 return true;
952 }
953
954 EventFunctionWrapper *mem_resp_event =
955 computeUnit->memPort[index].createMemRespEvent(pkt);
956
957 DPRINTF(GPUPort,
958 "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
959 computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
960 gpuDynInst->seqNum(), index, pkt->req->getPaddr());
961
962 computeUnit->schedule(mem_resp_event,
963 curTick() + computeUnit->resp_tick_latency);
964
965 return true;
966}
967
968bool
970{
971 return handleResponse(pkt);
972}
973
974bool
976{
977 // From scalar cache invalidate that was issued at kernel start.
978 if (pkt->req->isKernel()) {
979 delete pkt->senderState;
980 delete pkt;
981
982 return true;
983 }
984
985 assert(!pkt->req->isKernel());
986
987 // retrieve sender state
988 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
989 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
990
991 assert(pkt->isRead() || pkt->isWrite());
992 assert(gpuDynInst->numScalarReqs > 0);
993
994 gpuDynInst->numScalarReqs--;
995
1004 if (!gpuDynInst->numScalarReqs) {
1005 if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
1006 computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
1007 gpuDynInst);
1008 } else {
1009 computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
1010 gpuDynInst);
1011 }
1012 }
1013
1014 delete pkt->senderState;
1015 delete pkt;
1016
1017 return true;
1018}
1019
1020void
1022{
1023 for (const auto &pkt : retries) {
1024 if (!sendTimingReq(pkt)) {
1025 break;
1026 } else {
1027 retries.pop_front();
1028 }
1029 }
1030}
1031
1032void
1034{
1035 int len = retries.size();
1036
1037 assert(len > 0);
1038
1039 for (int i = 0; i < len; ++i) {
1040 PacketPtr pkt = retries.front().first;
1041 [[maybe_unused]] GPUDynInstPtr gpuDynInst = retries.front().second;
1042 DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
1043 computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1044 pkt->req->getPaddr());
1045
1049 if (!sendTimingReq(pkt)) {
1050 DPRINTF(GPUMem, "failed again!\n");
1051 break;
1052 } else {
1053 DPRINTF(GPUMem, "successful!\n");
1054 retries.pop_front();
1055 }
1056 }
1057}
1058
1059bool
1061{
1062 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1067 if (sender_state->wavefront != nullptr) {
1068 computeUnit->handleSQCReturn(pkt);
1069 } else {
1070 delete pkt->senderState;
1071 delete pkt;
1072 }
1073
1074 return true;
1075}
1076
1077void
1082
1083void
1085{
1086 int len = retries.size();
1087
1088 assert(len > 0);
1089
1090 for (int i = 0; i < len; ++i) {
1091 PacketPtr pkt = retries.front().first;
1092 [[maybe_unused]] Wavefront *wavefront = retries.front().second;
1093 DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
1094 computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
1095 pkt->req->getPaddr());
1096 if (!sendTimingReq(pkt)) {
1097 DPRINTF(GPUFetch, "failed again!\n");
1098 break;
1099 } else {
1100 DPRINTF(GPUFetch, "successful!\n");
1101 retries.pop_front();
1102 }
1103 }
1104}
1105
1106const char*
1108{
1109 return "ComputeUnit SQC memory request event";
1110}
1111
1112void
1114{
1115 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1116 [[maybe_unused]] ComputeUnit *compute_unit = sqcPort.computeUnit;
1117
1118 assert(!pkt->req->systemReq());
1119
1120 if (!(sqcPort.sendTimingReq(pkt))) {
1122 (pkt, sender_state->wavefront));
1123 }
1124}
1125
1126void
1128{
1129 // There must be a way around this check to do the globalMemStart...
1130 Addr tmp_vaddr = pkt->req->getVaddr();
1131
1132 updatePageDivergenceDist(tmp_vaddr);
1133
1134 // set PC in request
1135 pkt->req->setPC(gpuDynInst->wavefront()->pc());
1136
1137 pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());
1138
1139 // figure out the type of the request to set read/write
1140 BaseMMU::Mode TLB_mode;
1141 assert(pkt->isRead() || pkt->isWrite());
1142
1143 // only do some things if actually accessing data
1144 bool isDataAccess = pkt->isWrite() || pkt->isRead();
1145
1146 // For dGPUs, real hardware will extract MTYPE from the PTE. SE mode
1147 // uses x86 pagetables which don't have fields to track GPU MTYPEs.
1148 // Rather than hacking up the pagetable to add these bits in, we just
1149 // keep a structure local to our GPUs that are populated in our
1150 // emulated driver whenever memory is allocated. Consult that structure
1151 // here in case we need a memtype override.
1152 //
1153 // In full system mode these can be extracted from the PTE and assigned
1154 // after address translation takes place.
1155 if (!FullSystem) {
1157 }
1158
1159 // Check write before read for atomic operations
1160 // since atomic operations should use BaseMMU::Write
1161 if (pkt->isWrite()) {
1162 TLB_mode = BaseMMU::Write;
1163 } else if (pkt->isRead()) {
1164 TLB_mode = BaseMMU::Read;
1165 } else {
1166 fatal("pkt is not a read nor a write\n");
1167 }
1168
1169 if (!functionalTLB) {
1170 stats.tlbCycles -= curTick();
1171 }
1173
1174 PortID tlbPort_index = perLaneTLB ? index : 0;
1175
1176 if (shader->timingSim) {
1177 if (!FullSystem && debugSegFault) {
1179 Addr vaddr = pkt->req->getVaddr();
1180 unsigned size = pkt->getSize();
1181
1182 if ((vaddr + size - 1) % 64 < vaddr % 64) {
1183 panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
1184 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
1185 }
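            // How the check above works (illustrative): with vaddr == 60
            // (0x3C) and size == 8, (60 + 8 - 1) % 64 == 3 is smaller than
            // 60 % 64 == 60, so the access spills past a 64-byte boundary
            // and the panic fires; an access contained in one 64-byte block
            // never makes the end offset smaller than the start offset.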
1186
1187 Addr paddr;
1188
1189 if (!p->pTable->translate(vaddr, paddr)) {
1190 if (!p->fixupFault(vaddr)) {
1191 panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
1192 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1193 vaddr);
1194 }
1195 }
1196 }
1197
1198 // This is the SenderState needed upon return
1199 pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);
1200
1201 // This is the senderState needed by the TLB hierarchy to function
1202 GpuTranslationState *translation_state =
1203 new GpuTranslationState(TLB_mode, shader->gpuTc, false,
1204 pkt->senderState);
1205
1206 pkt->senderState = translation_state;
1207
1208 if (functionalTLB) {
1209 tlbPort[tlbPort_index].sendFunctional(pkt);
1210
1211 // update the hitLevel distribution
1212 int hit_level = translation_state->hitLevel;
1213 assert(hit_level != -1);
1214 stats.hitsPerTLBLevel[hit_level]++;
1215
1216 // New SenderState for the memory access
1217 GpuTranslationState *sender_state =
1219
1220 delete sender_state->tlbEntry;
1221 delete sender_state->saved;
1222 delete sender_state;
1223
1224 assert(pkt->req->hasPaddr());
1225 assert(pkt->req->hasSize());
1226
1227 // this is necessary because the GPU TLB receives packets instead
 1228 // of requests. When the translation is complete, all relevant
1229 // fields in the request will be populated, but not in the packet.
1230 // here we create the new packet so we can set the size, addr,
1231 // and proper flags.
1232 PacketPtr oldPkt = pkt;
1233 pkt = new Packet(oldPkt->req, oldPkt->cmd);
1234 if (isDataAccess) {
1235 uint8_t *tmpData = oldPkt->getPtr<uint8_t>();
1236 pkt->dataStatic(tmpData);
1237 }
1238 delete oldPkt;
1239
1240
1241 // New SenderState for the memory access
1242 pkt->senderState =
1244 nullptr);
1245
1246 gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
1247 gpuDynInst->tlbHitLevel[index] = hit_level;
1248
1249 // translation is done. Schedule the mem_req_event at the
1250 // appropriate cycle to send the timing memory request to ruby
1251 EventFunctionWrapper *mem_req_event =
1252 memPort[index].createMemReqEvent(pkt);
1253
1254 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
1255 "scheduled\n", cu_id, gpuDynInst->simdId,
1256 gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
1257
1258 schedule(mem_req_event, curTick() + req_tick_latency);
1259 } else if (tlbPort[tlbPort_index].isStalled()) {
1260 assert(tlbPort[tlbPort_index].retries.size() > 0);
1261
1262 DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
1263 "failed!\n", cu_id, gpuDynInst->simdId,
1264 gpuDynInst->wfSlotId, tmp_vaddr);
1265
1266 tlbPort[tlbPort_index].retries.push_back(pkt);
1267 } else if (!tlbPort[tlbPort_index].sendTimingReq(pkt)) {
1268 // Stall the data port;
1269 // No more packet will be issued till
1270 // ruby indicates resources are freed by
1271 // a recvReqRetry() call back on this port.
1272 tlbPort[tlbPort_index].stallPort();
1273
1274 DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
1275 "failed!\n", cu_id, gpuDynInst->simdId,
1276 gpuDynInst->wfSlotId, tmp_vaddr);
1277
1278 tlbPort[tlbPort_index].retries.push_back(pkt);
1279 } else {
1280 DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x from "
1281 "instruction %s sent!\n", cu_id, gpuDynInst->simdId,
1282 gpuDynInst->wfSlotId, tmp_vaddr,
1283 gpuDynInst->disassemble().c_str());
1284 }
1285 } else {
1286 if (pkt->cmd == MemCmd::MemSyncReq) {
1287 gpuDynInst->resetEntireStatusVector();
1288 } else {
1289 gpuDynInst->decrementStatusVector(index);
1290 }
1291
1292 // New SenderState for the memory access
1293 delete pkt->senderState;
1294
1295 // Because it's atomic operation, only need TLB translation state
1296 pkt->senderState = new GpuTranslationState(TLB_mode,
1297 shader->gpuTc);
1298
1299 tlbPort[tlbPort_index].sendFunctional(pkt);
1300
1301 // the addr of the packet is not modified, so we need to create a new
 1302 // packet; otherwise the memory access would use the old virtual
1303 // address sent in the translation packet, instead of the physical
1304 // address returned by the translation.
1305 PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
1306 new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1307
1308 // Translation is done. It is safe to send the packet to memory.
1309 memPort[0].sendFunctional(new_pkt);
1310
1311 DPRINTF(GPUMem, "Functional sendRequest\n");
1312 DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
1313 gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
1314 new_pkt->req->getPaddr());
1315
1316 // safe_cast the senderState
1317 GpuTranslationState *sender_state =
1319
1320 delete sender_state->tlbEntry;
1321 delete new_pkt;
1322 delete pkt->senderState;
1323 delete pkt;
1324 }
1325}
1326
1327void
1329{
1330 assert(pkt->isWrite() || pkt->isRead());
1331
1332 BaseMMU::Mode tlb_mode = pkt->isRead() ? BaseMMU::Read : BaseMMU::Write;
1333
1334 pkt->senderState =
1336
1337 pkt->senderState =
1338 new GpuTranslationState(tlb_mode, shader->gpuTc, false,
1339 pkt->senderState);
1340
1341 if (scalarDTLBPort.isStalled()) {
1342 assert(scalarDTLBPort.retries.size());
1343 scalarDTLBPort.retries.push_back(pkt);
1344 } else if (!scalarDTLBPort.sendTimingReq(pkt)) {
1346 scalarDTLBPort.retries.push_back(pkt);
1347 } else {
1348 DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n",
1349 tlb_mode == BaseMMU::Read ? "read" : "write",
1350 pkt->req->getVaddr());
1351 }
1352}
1353
1354void
1356 bool kernelMemSync,
1357 RequestPtr req)
1358{
1359 assert(gpuDynInst->isGlobalSeg() ||
1360 gpuDynInst->executedAs() == enums::SC_GLOBAL);
1361
1362 // Fences will never be issued to system memory, so we can mark the
1363 // requestor as a device memory ID here.
1364 if (!req) {
1365 req = std::make_shared<Request>(
1366 0, 0, 0, vramRequestorId(), 0, gpuDynInst->wfDynId);
1367 } else {
1368 req->requestorId(vramRequestorId());
1369 }
1370
1371 // all mem sync requests have Paddr == 0
1372 req->setPaddr(0);
1373
1374 PacketPtr pkt = nullptr;
1375
1376 if (kernelMemSync) {
1377 if (gpuDynInst->isKernelLaunch()) {
1378 req->setCacheCoherenceFlags(Request::INV_L1);
1379 req->setReqInstSeqNum(gpuDynInst->seqNum());
1380 req->setFlags(Request::KERNEL);
1381 pkt = new Packet(req, MemCmd::MemSyncReq);
1382 pkt->pushSenderState(
1383 new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1384
1385 EventFunctionWrapper *mem_req_event =
1386 memPort[0].createMemReqEvent(pkt);
1387
1388 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1389 "an acquire\n", cu_id, gpuDynInst->simdId,
1390 gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
1391
1392 schedule(mem_req_event, curTick() + req_tick_latency);
1393 } else {
1394 // kernel end flush of GL2 cache may be quiesced by Ruby if the
1395 // GL2 is a read-only cache
1396 assert(shader->impl_kern_end_rel);
1397 assert(gpuDynInst->isEndOfKernel());
1398
1399 req->setCacheCoherenceFlags(Request::FLUSH_L2);
1400 req->setReqInstSeqNum(gpuDynInst->seqNum());
1401 req->setFlags(Request::KERNEL);
1402 pkt = new Packet(req, MemCmd::MemSyncReq);
1403 pkt->pushSenderState(
1404 new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1405
1406 EventFunctionWrapper *mem_req_event =
1407 memPort[0].createMemReqEvent(pkt);
1408
1409 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1410 "a release\n", cu_id, gpuDynInst->simdId,
1411 gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
1412
1413 schedule(mem_req_event, curTick() + req_tick_latency);
1414 }
1415 } else {
1416 gpuDynInst->setRequestFlags(req);
1417
1418 req->setReqInstSeqNum(gpuDynInst->seqNum());
1419
1420 pkt = new Packet(req, MemCmd::MemSyncReq);
1421 pkt->pushSenderState(
1422 new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1423
1424 EventFunctionWrapper *mem_req_event =
1425 memPort[0].createMemReqEvent(pkt);
1426
1427 DPRINTF(GPUPort,
1428 "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
1429 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
1430 pkt->req->getPaddr());
1431
1432 schedule(mem_req_event, curTick() + req_tick_latency);
1433 }
1434}
1435
1436void
1438{
1439 auto req = std::make_shared<Request>(paddr, 64, 0, vramRequestorId());
1440 req->setCacheCoherenceFlags(Request::GL2_CACHE_INV);
1441
1442 auto pkt = new Packet(req, MemCmd::MemSyncReq);
1443 pkt->pushSenderState(
1444 new ComputeUnit::DataPort::SenderState(this, 0, nullptr));
1445
1446 EventFunctionWrapper *mem_req_event = memPort[0].createMemReqEvent(pkt);
1447
1448 schedule(mem_req_event, curTick() + req_tick_latency);
1449
1451}
1452
1453void
1455{
1456 DataPort::SenderState *sender_state =
1458
1459 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1460 ComputeUnit *compute_unit = computeUnit;
1461
1462 assert(gpuDynInst);
1463
1464 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
1465 compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1466 pkt->req->getPaddr(), id);
1467
1468 Addr paddr = pkt->req->getPaddr();
1469
1470 // mem sync resp callback must be handled already in
1471 // DataPort::recvTimingResp
1472 assert(pkt->cmd != MemCmd::MemSyncResp);
1473
1474 // The status vector and global memory response for WriteResp packets get
1475 // handled by the WriteCompleteResp packets.
1476 if (pkt->cmd == MemCmd::WriteResp) {
1477 if (!FullSystem || !pkt->req->systemReq()) {
1478 delete pkt;
1479 return;
1480 }
1481 }
1482
1483 // this is for read, write and atomic
1484 int index = gpuDynInst->memStatusVector[paddr].back();
1485
1486 DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
1487 pkt->req->getPaddr(), id);
1488
1489 gpuDynInst->memStatusVector[paddr].pop_back();
1490 gpuDynInst->pAddr = pkt->req->getPaddr();
1491
1492 gpuDynInst->decrementStatusVector(index);
1493 DPRINTF(GPUMem, "bitvector is now %s\n", gpuDynInst->printStatusVector());
1494
1495 if (gpuDynInst->allLanesZero()) {
1496 auto iter = gpuDynInst->memStatusVector.begin();
1497 auto end = gpuDynInst->memStatusVector.end();
1498
1499 while (iter != end) {
1500 assert(iter->second.empty());
1501 ++iter;
1502 }
1503
1504 // Calculate the difference between the arrival of the first cache
1505 // block and the last cache block to arrive if we have the time
1506 // for the first cache block.
1507 if (compute_unit->headTailMap.count(gpuDynInst)) {
1508 Tick headTick = compute_unit->headTailMap.at(gpuDynInst);
1509 compute_unit->stats.headTailLatency.sample(curTick() - headTick);
1510 compute_unit->headTailMap.erase(gpuDynInst);
1511 }
1512
1513 gpuDynInst->memStatusVector.clear();
1514
1515 gpuDynInst->
1516 profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue);
1517 compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
1518
1519 DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
1520 compute_unit->cu_id, gpuDynInst->simdId,
1521 gpuDynInst->wfSlotId);
1522 } else {
1523 if (pkt->isRead()) {
1524 if (!compute_unit->headTailMap.count(gpuDynInst)) {
1525 compute_unit->headTailMap
1526 .insert(std::make_pair(gpuDynInst, curTick()));
1527 }
1528 }
1529 }
1530
1531 delete pkt->senderState;
1532 delete pkt;
1533}
1534
1535bool
1537{
1538 Addr line = pkt->req->getPaddr();
1539
1540 DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
1541 pkt->req->getVaddr(), line);
1542
1543 assert(pkt->senderState);
1544 computeUnit->stats.tlbCycles += curTick();
1545
1546 // pop off the TLB translation state
1547 GpuTranslationState *translation_state =
1549
1550 // no PageFaults are permitted for data accesses
1551 if (!translation_state->tlbEntry) {
1552 DTLBPort::SenderState *sender_state =
1553 safe_cast<DTLBPort::SenderState*>(translation_state->saved);
1554
1555 [[maybe_unused]] Wavefront *w =
1556 computeUnit->wfList[sender_state->_gpuDynInst->simdId]
1557 [sender_state->_gpuDynInst->wfSlotId];
1558
 1559 DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
1560 pkt->req->getVaddr());
1561 }
1562
1563 // update the hitLevel distribution
1564 int hit_level = translation_state->hitLevel;
1565 computeUnit->stats.hitsPerTLBLevel[hit_level]++;
1566
1567 delete translation_state->tlbEntry;
1568 assert(!translation_state->ports.size());
1569 pkt->senderState = translation_state->saved;
1570
1571 // for prefetch pkt
1572 BaseMMU::Mode TLB_mode = translation_state->tlbMode;
1573
1574 delete translation_state;
1575
1576 // use the original sender state to know how to close this transaction
1577 DTLBPort::SenderState *sender_state =
1579
1580 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1581 PortID mp_index = sender_state->portIndex;
1582 Addr vaddr = pkt->req->getVaddr();
1583 gpuDynInst->memStatusVector[line].push_back(mp_index);
1584 gpuDynInst->tlbHitLevel[mp_index] = hit_level;
1585
1586 MemCmd requestCmd;
1587
1588 if (pkt->cmd == MemCmd::ReadResp) {
1589 requestCmd = MemCmd::ReadReq;
1590 } else if (pkt->cmd == MemCmd::WriteResp) {
1591 requestCmd = MemCmd::WriteReq;
1592 } else if (pkt->cmd == MemCmd::SwapResp) {
1593 requestCmd = MemCmd::SwapReq;
1594 } else {
1595 panic("unsupported response to request conversion %s\n",
1596 pkt->cmd.toString());
1597 }
1598
1599 if (computeUnit->prefetchDepth) {
1600 int simdId = gpuDynInst->simdId;
1601 int wfSlotId = gpuDynInst->wfSlotId;
1602 Addr last = 0;
1603
1604 switch(computeUnit->prefetchType) {
1605 case enums::PF_CU:
1606 last = computeUnit->lastVaddrCU[mp_index];
1607 break;
1608 case enums::PF_PHASE:
1609 last = computeUnit->lastVaddrSimd[simdId][mp_index];
1610 break;
1611 case enums::PF_WF:
1612 last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
1613 default:
1614 break;
1615 }
1616
1617 DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
1618 computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
1619
1620 int stride = last ? (roundDown(vaddr, X86ISA::PageBytes) -
1622 : 0;
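            // The stride above is measured in whole pages. Illustrative
            // example (assuming 4 KiB x86 pages): if the previous access
            // touched page 0x10000 and this one touches page 0x12000, the
            // stride is (0x12000 - 0x10000) / 4096 = 2 pages, and the
            // prefetch loop below walks forward in multiples of that stride.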
1623
1624 DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
1625
1626 computeUnit->lastVaddrCU[mp_index] = vaddr;
1627 computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
1628 computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
1629
1630 stride = (computeUnit->prefetchType == enums::PF_STRIDE) ?
1631 computeUnit->prefetchStride: stride;
1632
1633 DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
1634 computeUnit->cu_id, simdId, wfSlotId, mp_index);
1635
1636 DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);
1637
1638 // Prefetch Next few pages atomically
1639 for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
1640 DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
1642
1643 if (!stride)
1644 break;
1645
1646 RequestPtr prefetch_req = std::make_shared<Request>(
1648 sizeof(uint8_t), 0,
1649 computeUnit->requestorId(),
1650 0, 0, nullptr);
1651
1652 PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
1653 uint8_t foo = 0;
1654 prefetch_pkt->dataStatic(&foo);
1655
1656 // Because it's atomic operation, only need TLB translation state
1657 prefetch_pkt->senderState =
1658 new GpuTranslationState(TLB_mode,
1659 computeUnit->shader->gpuTc, true);
1660
1661 // Currently prefetches are zero-latency, hence the sendFunctional
1662 sendFunctional(prefetch_pkt);
1663
1664 /* safe_cast the senderState */
1665 GpuTranslationState *tlb_state =
1667 prefetch_pkt->senderState);
1668
1669
1670 delete tlb_state->tlbEntry;
1671 delete tlb_state;
1672 delete prefetch_pkt;
1673 }
1674 }
1675
1676 // First we must convert the response cmd back to a request cmd so that
1677 // the request can be sent through the cu's request port
1678 PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
1679 new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1680 delete pkt->senderState;
1681 delete pkt;
1682
1683 // New SenderState for the memory access
1684 new_pkt->senderState =
1685 new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
1686 nullptr);
1687
1688 // Set VRAM ID for device requests
1689 // For now, system vmem requests use functional reads. This is not that
1690 // critical to model as the region of interest should always be accessing
1691 // device memory. System vmem requests are used by blit kernels to do
1692 // memcpys and load code objects into device memory.
1693 if (new_pkt->req->systemReq()) {
1694 // There will be multiple packets returned for the same gpuDynInst,
1695 // so first check if systemReq is not already set and if so, return
1696 // the token acquired when the dispatch list is filled as system
1697 // requests do not require a GPU coalescer token.
1698 if (!gpuDynInst->isSystemReq()) {
1699 computeUnit->getTokenManager()->recvTokens(1);
1700 gpuDynInst->setSystemReq();
1701 }
1702 } else {
1703 new_pkt->req->requestorId(computeUnit->vramRequestorId());
1704 }
1705
1706 // translation is done. Schedule the mem_req_event at the appropriate
1707 // cycle to send the timing memory request to ruby
1708 EventFunctionWrapper *mem_req_event =
1709 computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);
1710
1711 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
1712 computeUnit->cu_id, gpuDynInst->simdId,
1713 gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());
1714
1715 computeUnit->schedule(mem_req_event, curTick() +
1716 computeUnit->req_tick_latency);
1717
1718 return true;
1719}
1720
1723{
1724 return new EventFunctionWrapper(
1725 [this, pkt]{ processMemReqEvent(pkt); },
1726 "ComputeUnit memory request event", true);
1727}
1728
1731{
1732 return new EventFunctionWrapper(
1733 [this, pkt]{ processMemRespEvent(pkt); },
1734 "ComputeUnit memory response event", true);
1735}
1736
1737void
1739{
1740 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1741 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1742 [[maybe_unused]] ComputeUnit *compute_unit = computeUnit;
1743
1744 if (pkt->req->systemReq()) {
1745 assert(compute_unit->shader->systemHub);
1746 SystemHubEvent *resp_event = new SystemHubEvent(pkt, this);
1747 compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
1748 } else if (!(sendTimingReq(pkt))) {
1749 retries.emplace_back(pkt, gpuDynInst);
1750
1751 if (gpuDynInst) {
1752 DPRINTF(GPUPort,
1753 "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
1754 compute_unit->cu_id, gpuDynInst->simdId,
1755 gpuDynInst->wfSlotId, id, pkt->req->getPaddr());
1756 }
1757 } else {
1758 if (gpuDynInst) {
1759 DPRINTF(GPUPort,
1760 "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data"
1761 " req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1762 gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id,
1763 pkt->req->getPaddr());
1764 }
1765 }
1766}
1767
1768const char*
1770{
1771 return "ComputeUnit scalar memory request event";
1772}
1773
1774void
1776{
1777 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1778 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1779 [[maybe_unused]] ComputeUnit *compute_unit = scalarDataPort.computeUnit;
1780
1781 if (pkt->req->systemReq()) {
1782 assert(compute_unit->shader->systemHub);
1783 SystemHubEvent *resp_event = new SystemHubEvent(pkt, &scalarDataPort);
1784 compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
1785 } else if (!(scalarDataPort.sendTimingReq(pkt))) {
1786 scalarDataPort.retries.emplace_back(pkt);
1787
1788 DPRINTF(GPUPort,
1789 "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
1790 compute_unit->cu_id, gpuDynInst->simdId,
1791 gpuDynInst->wfSlotId, pkt->req->getPaddr());
1792 } else {
1793 DPRINTF(GPUPort,
1794 "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data "
1795 "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1796 gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
1797 pkt->req->getPaddr());
1798 }
1799}
1800
1801/*
 1802 * The initial translation request could have been rejected, if
 1803 * <retries> queue is not empty. Retry sending the translation
 1804 * request. sendRetry() is called from the peer port whenever
1805 * a translation completes.
1806 */
1807void
1809{
1810 int len = retries.size();
1811
1812 DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
1813 computeUnit->cu_id, len);
1814
1815 assert(len > 0);
1816 assert(isStalled());
1817 // recvReqRetry is an indication that the resource on which this
1818 // port was stalling on is freed. So, remove the stall first
1819 unstallPort();
1820
1821 for (int i = 0; i < len; ++i) {
1822 PacketPtr pkt = retries.front();
1823 [[maybe_unused]] Addr vaddr = pkt->req->getVaddr();
 1824 DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x", computeUnit->cu_id, vaddr);
1825
1826 if (!sendTimingReq(pkt)) {
1827 // Stall port
1828 stallPort();
1829 DPRINTF(GPUTLB, ": failed again\n");
1830 break;
1831 } else {
1832 DPRINTF(GPUTLB, ": successful\n");
1833 retries.pop_front();
1834 }
1835 }
1836}
1837
1838bool
1840{
1841 assert(pkt->senderState);
1842
1843 GpuTranslationState *translation_state =
1845
1846 // Page faults are not allowed
1847 fatal_if(!translation_state->tlbEntry,
1848 "Translation of vaddr %#x failed\n", pkt->req->getVaddr());
1849
1850 delete translation_state->tlbEntry;
1851 assert(!translation_state->ports.size());
1852
1853 pkt->senderState = translation_state->saved;
1854 delete translation_state;
1855
1856 ScalarDTLBPort::SenderState *sender_state =
1858
1859 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1860 delete pkt->senderState;
1861
1862 [[maybe_unused]] Wavefront *w = gpuDynInst->wavefront();
1863
1864 DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
1865 "translation: PA %#x -> %#x\n", computeUnit->cu_id, w->simdId,
1866 w->wfSlotId, w->kernId, pkt->req->getVaddr(), pkt->req->getPaddr());
1867
1868 MemCmd mem_cmd;
1869
1870 if (pkt->cmd == MemCmd::ReadResp) {
1871 mem_cmd = MemCmd::ReadReq;
1872 } else if (pkt->cmd == MemCmd::WriteResp) {
1873 mem_cmd = MemCmd::WriteReq;
1874 } else {
 1875 fatal("Scalar DTLB received unexpected MemCmd response %s\n",
1876 pkt->cmd.toString());
1877 }
1878
1879 PacketPtr req_pkt = new Packet(pkt->req, mem_cmd);
1880 req_pkt->dataStatic(pkt->getPtr<uint8_t>());
1881 delete pkt;
1882
1883 req_pkt->senderState =
1885
1886 // For a system request we want to mark the GPU instruction as a system
1887 // load/store so that after the request is issued to system memory we can
1888 // return any token acquired for the request. Since tokens are returned
1889 // by the coalescer and system requests do not take that path, this needs
1890 // to be tracked.
1891 //
1892 // Device requests change the requestor ID to something in the device
1893 // memory Ruby network.
1894 if (req_pkt->req->systemReq()) {
1895 gpuDynInst->setSystemReq();
1896 } else {
1897 req_pkt->req->requestorId(computeUnit->vramRequestorId());
1898 }
1899
1900 ComputeUnit::ScalarDataPort::MemReqEvent *scalar_mem_req_event
1902 (computeUnit->scalarDataPort, req_pkt);
1903 computeUnit->schedule(scalar_mem_req_event, curTick() +
1904 computeUnit->scalar_req_tick_latency);
1905
1906 return true;
1907}
1908
1909bool
1911{
1912 [[maybe_unused]] Addr line = pkt->req->getPaddr();
1913 DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
1914 computeUnit->cu_id, pkt->req->getVaddr(), line);
1915
1916 assert(pkt->senderState);
1917
1918 // pop off the TLB translation state
1919 GpuTranslationState *translation_state
1921
1922 bool success = translation_state->tlbEntry != nullptr;
1923 delete translation_state->tlbEntry;
1924 assert(!translation_state->ports.size());
1925 pkt->senderState = translation_state->saved;
1926 delete translation_state;
1927
1928 // use the original sender state to know how to close this transaction
1929 ITLBPort::SenderState *sender_state =
1931
1932 // get the wavefront associated with this translation request
1933 Wavefront *wavefront = sender_state->wavefront;
1934 delete pkt->senderState;
1935
1936 if (success) {
1937 // pkt is reused in fetch(), don't delete it here. However, we must
1938 // reset the command to be a request so that it can be sent through
1939 // the cu's request port
1940 assert(pkt->cmd == MemCmd::ReadResp);
1941 pkt->cmd = MemCmd::ReadReq;
1942
1943 computeUnit->fetchStage.fetch(pkt, wavefront);
1944 } else {
1945 if (wavefront->dropFetch) {
1946 assert(wavefront->instructionBuffer.empty());
1947 wavefront->dropFetch = false;
1948 }
1949
1950 wavefront->pendingFetch = 0;
1951 }
1952
1953 return true;
1954}
1955
1956/*
1957 * The initial translation request could have been rejected, if
1958 * <retries> queue is not empty. Retry sending the translation
1959 * request. sendRetry() is called from the peer port whenever
1960 * a translation completes.
1961 */
1962void
1964{
1965
1966 int len = retries.size();
 1967 DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n", computeUnit->cu_id, len);
1968
1969 assert(len > 0);
1970 assert(isStalled());
1971
1972 // recvReqRetry is an indication that the resource on which this
1973 // port was stalling on is freed. So, remove the stall first
1974 unstallPort();
1975
1976 for (int i = 0; i < len; ++i) {
1977 PacketPtr pkt = retries.front();
1978 [[maybe_unused]] Addr vaddr = pkt->req->getVaddr();
 1979 DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x", computeUnit->cu_id, vaddr);
1980
1981 if (!sendTimingReq(pkt)) {
1982 stallPort(); // Stall port
1983 DPRINTF(GPUTLB, ": failed again\n");
1984 break;
1985 } else {
1986 DPRINTF(GPUTLB, ": successful\n");
1987 retries.pop_front();
1988 }
1989 }
1990}
1991
1992void
1994{
1995 if (gpuDynInst->isScalar()) {
1996 if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
1997 stats.sALUInsts++;
1999 } else if (gpuDynInst->isLoad()) {
2001 } else if (gpuDynInst->isStore()) {
2003 }
2004 } else {
2005 if (gpuDynInst->isALU()) {
2006 shader->total_valu_insts++;
2007 if (shader->total_valu_insts == shader->max_valu_insts) {
2008 exitSimLoop("max vALU insts");
2009 }
2010 stats.vALUInsts++;
2011 stats.instCyclesVALU++;
2012 stats.threadCyclesVALU
2013 += gpuDynInst->wavefront()->execMask().count();
2014 } else if (gpuDynInst->isFlat()) {
2015 if (gpuDynInst->isLocalMem()) {
2016 stats.flatLDSInsts++;
2017 } else {
2018 stats.flatVMemInsts++;
2019 }
2020 } else if (gpuDynInst->isFlatGlobal()) {
2021 stats.flatVMemInsts++;
2022 } else if (gpuDynInst->isFlatScratch()) {
2023 stats.flatVMemInsts++;
2024 } else if (gpuDynInst->isLocalMem()) {
2025 stats.ldsNoFlatInsts++;
2026 } else if (gpuDynInst->isLoad()) {
2027 stats.vectorMemReads++;
2028 } else if (gpuDynInst->isStore()) {
2029 stats.vectorMemWrites++;
2030 }
2031
2032 if (gpuDynInst->isLoad()) {
2033 switch (gpuDynInst->executedAs()) {
2034 case enums::SC_SPILL:
2035 stats.spillReads++;
2036 break;
2037 case enums::SC_GLOBAL:
2038 stats.globalReads++;
2039 break;
2040 case enums::SC_GROUP:
2041 stats.groupReads++;
2042 break;
2043 case enums::SC_PRIVATE:
2044 stats.privReads++;
2045 break;
2046 case enums::SC_READONLY:
2047 stats.readonlyReads++;
2048 break;
2049 case enums::SC_KERNARG:
2050 stats.kernargReads++;
2051 break;
2052 case enums::SC_ARG:
2053 stats.argReads++;
2054 break;
2055 case enums::SC_NONE:
2056 /**
2057 * this case can occur for flat mem insts
2058 * who execute with EXEC = 0
2059 */
2060 break;
2061 default:
2062 fatal("%s has no valid segment\n", gpuDynInst->disassemble());
2063 break;
2064 }
2065 } else if (gpuDynInst->isStore()) {
2066 switch (gpuDynInst->executedAs()) {
2067 case enums::SC_SPILL:
2068 stats.spillWrites++;
2069 break;
2070 case enums::SC_GLOBAL:
2071 stats.globalWrites++;
2072 break;
2073 case enums::SC_GROUP:
2074 stats.groupWrites++;
2075 break;
2076 case enums::SC_PRIVATE:
2077 stats.privWrites++;
2078 break;
2079 case enums::SC_READONLY:
2080 stats.readonlyWrites++;
2081 break;
2082 case enums::SC_KERNARG:
2083 stats.kernargWrites++;
2084 break;
2085 case enums::SC_ARG:
2086 stats.argWrites++;
2087 break;
2088 case enums::SC_NONE:
2089 /**
2090 * this case can occur for flat mem insts
2091 * who execute with EXEC = 0
2092 */
2093 break;
2094 default:
2095 fatal("%s has no valid segment\n", gpuDynInst->disassemble());
2096 break;
2097 }
2098 }
2099 }
2100}
2101
2102 void
2103 ComputeUnit::updatePageDivergenceDist(Addr addr)
2104 {
2105 Addr virt_page_addr = roundDown(addr, X86ISA::PageBytes);
2106
2107 if (!pagesTouched.count(virt_page_addr))
2108 pagesTouched[virt_page_addr] = 1;
2109 else
2110 pagesTouched[virt_page_addr]++;
2111}
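// --- Editor's illustrative sketch (not part of compute_unit.cc) ---
// updatePageDivergenceDist() rounds each address down to its page base
// (X86ISA::PageBytes, i.e. 4 KiB here) and counts accesses per page; the
// number of distinct keys accumulated over one memory instruction is what
// feeds the pageDivergenceDist histogram. A minimal standalone version,
// assuming 4 KiB pages:
#include <cstdint>
#include <map>

namespace example {

using Addr = std::uint64_t;
constexpr Addr kPageBytes = 4096;            // assumption: x86 4 KiB pages

inline Addr pageBase(Addr a) { return a - (a % kPageBytes); }  // roundDown()

inline void
touchPage(std::map<Addr, int> &pages_touched, Addr a)
{
    ++pages_touched[pageBase(a)];            // operator[] default-inits to 0
}
// e.g. lane addresses 0x1000, 0x1ff8 and 0x2010 touch two distinct pages.

} // namespace example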
2112
2113 void
2114 ComputeUnit::exitCallback()
2115 {
2116 if (countPages) {
2117 std::ostream *page_stat_file = simout.create(name().c_str())->stream();
2118
2119 *page_stat_file << "page, wavefront accesses, workitem accesses" <<
2120 std::endl;
2121
2122 for (auto iter : pageAccesses) {
2123 *page_stat_file << std::hex << iter.first << ",";
2124 *page_stat_file << std::dec << iter.second.first << ",";
2125 *page_stat_file << std::dec << iter.second.second << std::endl;
2126 }
2127 }
2128}
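// --- Editor's illustrative sketch (not part of compute_unit.cc) ---
// The exit callback above emits one CSV row per touched page: the page
// address in hex followed by the wavefront-level and workitem-level access
// counts kept in pageAccesses. A standalone rendering of the same layout,
// assuming the per-page record is a pair<int, int>:
#include <cstdint>
#include <iostream>
#include <map>
#include <utility>

namespace example {

using Addr = std::uint64_t;
using PageData = std::map<Addr, std::pair<int, int>>;

inline void
dumpPageStats(std::ostream &os, const PageData &page_accesses)
{
    os << "page, wavefront accesses, workitem accesses\n";
    for (const auto &[page, counts] : page_accesses) {
        os << std::hex << page << ","
           << std::dec << counts.first << ","
           << counts.second << "\n";
    }
}

} // namespace example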
2129
2130 bool
2131 ComputeUnit::isDone() const
2132 {
2133 for (int i = 0; i < numVectorALUs; ++i) {
2134 if (!isVectorAluIdle(i)) {
2135 return false;
2136 }
2137 }
2138
2139 // TODO: FIXME if more than 1 of any memory pipe supported
2140 if (!srfToScalarMemPipeBus.rdy()) {
2141 return false;
2142 }
2143 if (!vrfToGlobalMemPipeBus.rdy()) {
2144 return false;
2145 }
2146 if (!vrfToLocalMemPipeBus.rdy()) {
2147 return false;
2148 }
2149
2150 if (!globalMemoryPipe.isGMReqFIFOWrRdy()
2151 || !localMemoryPipe.isLMReqFIFOWrRdy()
2152 || !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy()
2153 || !glbMemToVrfBus.rdy() || !scalarMemToSrfBus.rdy()) {
2154 return false;
2155 }
2156
2157 return true;
2158}
2159
2160int32_t
2161ComputeUnit::getRefCounter(const uint32_t dispatchId,
2162 const uint32_t wgId) const
2163{
2164 return lds.getRefCounter(dispatchId, wgId);
2165}
2166
2167bool
2168ComputeUnit::isVectorAluIdle(uint32_t simdId) const
2169{
2170 assert(simdId < numVectorALUs);
2171
2172 for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){
2173 if (wfList[simdId][i_wf]->getStatus() != Wavefront::S_STOPPED) {
2174 return false;
2175 }
2176 }
2177
2178 return true;
2179}
2180
2186 bool
2187 ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
2188 {
2189 // this is just a request to carry the GPUDynInstPtr
2190 // back and forth
2191 RequestPtr newRequest = std::make_shared<Request>();
2192 newRequest->setPaddr(0x0);
2193
2194 // ReadReq is not evaluated by the LDS but the Packet ctor requires this
2195 PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq);
2196
2197 // This is the SenderState needed upon return
2198 newPacket->senderState = new LDSPort::SenderState(gpuDynInst);
2199
2200 return ldsPort.sendTimingReq(newPacket);
2201}
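// --- Editor's illustrative sketch (not part of compute_unit.cc) ---
// sendToLds() shows a common gem5 idiom: the packet carries no useful
// payload (a dummy ReadReq to address 0); it only ferries the dynamic
// instruction via a SenderState that the LDS never inspects and that the
// response handler casts back out. A stripped-down version of that round
// trip with hypothetical stand-in types:
#include <cassert>
#include <memory>
#include <utility>

namespace example {

struct Inst { int wf_slot = 0; };            // stands in for GPUDynInstPtr

struct Packet {
    struct SenderState { virtual ~SenderState() = default; };
    SenderState *sender_state = nullptr;     // opaque to the responder
};

struct Carrier : Packet::SenderState {
    std::shared_ptr<Inst> inst;
    explicit Carrier(std::shared_ptr<Inst> i) : inst(std::move(i)) {}
};

// requester side: park the instruction on the packet before sending
inline void attach(Packet &pkt, std::shared_ptr<Inst> inst)
{
    pkt.sender_state = new Carrier(std::move(inst));
}

// response side: recover the instruction and clean up the carrier
inline std::shared_ptr<Inst> detach(Packet &pkt)
{
    auto *c = dynamic_cast<Carrier *>(pkt.sender_state);
    assert(c && "did not get the right sort of sender state");
    auto inst = c->inst;
    delete pkt.sender_state;
    pkt.sender_state = nullptr;
    return inst;
}

} // namespace example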
2202
2211
2212 /**
2213 * get the result of packets sent to the LDS when they return
2214 */
2215 bool
2216 ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
2217 {
2218 const ComputeUnit::LDSPort::SenderState *senderState =
2219 dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState);
2220
2221 fatal_if(!senderState, "did not get the right sort of sender state");
2222
2223 GPUDynInstPtr gpuDynInst = senderState->getMemInst();
2224
2225 delete packet->senderState;
2226 delete packet;
2227
2228 computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
2229 return true;
2230}
2231
2237 bool
2238 ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
2239 {
2240 ComputeUnit::LDSPort::SenderState *sender_state =
2241 dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
2242 fatal_if(!sender_state, "packet without a valid sender state");
2243
2244 [[maybe_unused]] GPUDynInstPtr gpuDynInst = sender_state->getMemInst();
2245
2246 if (isStalled()) {
2247 fatal_if(retries.empty(), "must have retries waiting to be stalled");
2248
2249 retries.push(pkt);
2250
2251 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
2252 computeUnit->cu_id, gpuDynInst->simdId,
2253 gpuDynInst->wfSlotId);
2254 return false;
2255 } else if (!RequestPort::sendTimingReq(pkt)) {
2256 // need to stall the LDS port until a recvReqRetry() is received
2257 // this indicates that there is more space
2258 stallPort();
2259 retries.push(pkt);
2260
2261 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
2262 computeUnit->cu_id, gpuDynInst->simdId,
2263 gpuDynInst->wfSlotId, pkt->req->getPaddr());
2264 return false;
2265 } else {
2266 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
2267 computeUnit->cu_id, gpuDynInst->simdId,
2268 gpuDynInst->wfSlotId, pkt->req->getPaddr());
2269 return true;
2270 }
2271}
2272
2279 void
2280 ComputeUnit::LDSPort::recvReqRetry()
2281 {
2282 auto queueSize = retries.size();
2283
2284 DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
2285 computeUnit->cu_id, queueSize);
2286
2287 fatal_if(queueSize < 1,
2288 "why was there a recvReqRetry() with no pending reqs?");
2289 fatal_if(!isStalled(),
2290 "recvReqRetry() happened when the port was not stalled");
2291
2292 unstallPort();
2293
2294 while (!retries.empty()) {
2295 PacketPtr packet = retries.front();
2296
2297 DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);
2298
2299 if (!RequestPort::sendTimingReq(packet)) {
2300 // Stall port
2301 stallPort();
2302 DPRINTF(GPUPort, ": LDS send failed again\n");
2303 break;
2304 } else {
2305 DPRINTF(GPUPort, ": LDS send successful\n");
2306 retries.pop();
2307 }
2308 }
2309}
2310
2311 ComputeUnit::ComputeUnitStats::ComputeUnitStats(statistics::Group *parent,
2312 int n_wf)
2313 : statistics::Group(parent),
2314 ADD_STAT(vALUInsts, "Number of vector ALU insts issued."),
2315 ADD_STAT(vALUInstsPerWF, "The avg. number of vector ALU insts issued "
2316 "per-wavefront."),
2317 ADD_STAT(sALUInsts, "Number of scalar ALU insts issued."),
2318 ADD_STAT(sALUInstsPerWF, "The avg. number of scalar ALU insts issued "
2319 "per-wavefront."),
2320 ADD_STAT(instCyclesVALU,
2321 "Number of cycles needed to execute VALU insts."),
2322 ADD_STAT(instCyclesSALU,
2323 "Number of cycles needed to execute SALU insts."),
2324 ADD_STAT(threadCyclesVALU, "Number of thread cycles used to execute "
2325 "vector ALU ops. Similar to instCyclesVALU but multiplied by "
2326 "the number of active threads."),
2327 ADD_STAT(vALUUtilization,
2328 "Percentage of active vector ALU threads in a wave."),
2329 ADD_STAT(ldsNoFlatInsts, "Number of LDS insts issued, not including FLAT"
2330 " accesses that resolve to LDS."),
2331 ADD_STAT(ldsNoFlatInstsPerWF, "The avg. number of LDS insts (not "
2332 "including FLAT accesses that resolve to LDS) per-wavefront."),
2333 ADD_STAT(flatVMemInsts,
2334 "The number of FLAT insts that resolve to vmem issued."),
2335 ADD_STAT(flatVMemInstsPerWF, "The average number of FLAT insts that "
2336 "resolve to vmem issued per-wavefront."),
2337 ADD_STAT(flatLDSInsts,
2338 "The number of FLAT insts that resolve to LDS issued."),
2339 ADD_STAT(flatLDSInstsPerWF, "The average number of FLAT insts that "
2340 "resolve to LDS issued per-wavefront."),
2341 ADD_STAT(vectorMemWrites,
2342 "Number of vector mem write insts (excluding FLAT insts)."),
2343 ADD_STAT(vectorMemWritesPerWF, "The average number of vector mem write "
2344 "insts (excluding FLAT insts) per-wavefront."),
2345 ADD_STAT(vectorMemReads,
2346 "Number of vector mem read insts (excluding FLAT insts)."),
2347 ADD_STAT(vectorMemReadsPerWF, "The avg. number of vector mem read insts "
2348 "(excluding FLAT insts) per-wavefront."),
2349 ADD_STAT(scalarMemWrites, "Number of scalar mem write insts."),
2350 ADD_STAT(scalarMemWritesPerWF,
2351 "The average number of scalar mem write insts per-wavefront."),
2352 ADD_STAT(scalarMemReads, "Number of scalar mem read insts."),
2353 ADD_STAT(scalarMemReadsPerWF,
2354 "The average number of scalar mem read insts per-wavefront."),
2355 ADD_STAT(vectorMemReadsPerKiloInst,
2356 "Number of vector mem reads per kilo-instruction"),
2357 ADD_STAT(vectorMemWritesPerKiloInst,
2358 "Number of vector mem writes per kilo-instruction"),
2359 ADD_STAT(vectorMemInstsPerKiloInst,
2360 "Number of vector mem insts per kilo-instruction"),
2361 ADD_STAT(scalarMemReadsPerKiloInst,
2362 "Number of scalar mem reads per kilo-instruction"),
2363 ADD_STAT(scalarMemWritesPerKiloInst,
2364 "Number of scalar mem writes per kilo-instruction"),
2365 ADD_STAT(scalarMemInstsPerKiloInst,
2366 "Number of scalar mem insts per kilo-instruction"),
2367 ADD_STAT(instCyclesVMemPerSimd, "Number of cycles to send address, "
2368 "command, data from VRF to vector memory unit, per SIMD"),
2369 ADD_STAT(instCyclesScMemPerSimd, "Number of cycles to send address, "
2370 "command, data from SRF to scalar memory unit, per SIMD"),
2371 ADD_STAT(instCyclesLdsPerSimd, "Number of cycles to send address, "
2372 "command, data from VRF to LDS unit, per SIMD"),
2373 ADD_STAT(globalReads, "Number of reads to the global segment"),
2374 ADD_STAT(globalWrites, "Number of writes to the global segment"),
2375 ADD_STAT(globalMemInsts,
2376 "Number of memory instructions sent to the global segment"),
2377 ADD_STAT(argReads, "Number of reads to the arg segment"),
2378 ADD_STAT(argWrites, "Number of writes to the arg segment"),
2379 ADD_STAT(argMemInsts,
2380 "Number of memory instructions sent to the arg segment"),
2381 ADD_STAT(spillReads, "Number of reads to the spill segment"),
2382 ADD_STAT(spillWrites, "Number of writes to the spill segment"),
2383 ADD_STAT(spillMemInsts,
2384 "Number of memory instructions sent to the spill segment"),
2385 ADD_STAT(groupReads, "Number of reads to the group segment"),
2386 ADD_STAT(groupWrites, "Number of writes to the group segment"),
2387 ADD_STAT(groupMemInsts,
2388 "Number of memory instructions sent to the group segment"),
2389 ADD_STAT(privReads, "Number of reads to the private segment"),
2390 ADD_STAT(privWrites, "Number of writes to the private segment"),
2391 ADD_STAT(privMemInsts,
2392 "Number of memory instructions sent to the private segment"),
2393 ADD_STAT(readonlyReads, "Number of reads to the readonly segment"),
2394 ADD_STAT(readonlyWrites,
2395 "Number of writes to the readonly segment"),
2396 ADD_STAT(readonlyMemInsts,
2397 "Number of memory instructions sent to the readonly segment"),
2398 ADD_STAT(kernargReads, "Number of reads sent to the kernarg segment"),
2399 ADD_STAT(kernargWrites,
2400 "Number of writes to the kernarg segment"),
2401 ADD_STAT(kernargMemInsts,
2402 "Number of memory instructions sent to the kernarg segment"),
2403 ADD_STAT(waveLevelParallelism,
2404 "wave level parallelism: count of active waves at wave launch"),
2405 ADD_STAT(tlbRequests, "number of uncoalesced requests"),
2406 ADD_STAT(tlbCycles,
2407 "total number of cycles for all uncoalesced requests"),
2408 ADD_STAT(tlbLatency, "Avg. translation latency for data translations"),
2409 ADD_STAT(hitsPerTLBLevel,
2410 "TLB hits distribution (0 for page table, x for Lx-TLB)"),
2411 ADD_STAT(ldsBankAccesses, "Total number of LDS bank accesses"),
2412 ADD_STAT(ldsBankConflictDist,
2413 "Number of bank conflicts per LDS memory packet"),
2414 ADD_STAT(pageDivergenceDist,
2415 "pages touched per wf (over all mem. instr.)"),
2416 ADD_STAT(dynamicGMemInstrCnt,
2417 "dynamic non-flat global memory instruction count"),
2418 ADD_STAT(dynamicFlatMemInstrCnt,
2419 "dynamic flat global memory instruction count"),
2420 ADD_STAT(dynamicLMemInstrCnt, "dynamic local memory instruction count"),
2421 ADD_STAT(wgBlockedDueBarrierAllocation,
2422 "WG dispatch was blocked due to lack of barrier resources"),
2423 ADD_STAT(wgBlockedDueLdsAllocation,
2424 "Workgroup blocked due to LDS capacity"),
2425 ADD_STAT(numInstrExecuted, "number of instructions executed"),
2426 ADD_STAT(execRateDist, "Instruction Execution Rate: Number of executed "
2427 "vector instructions per cycle"),
2428 ADD_STAT(numVecOpsExecuted,
2429 "number of vec ops executed (e.g. WF size/inst)"),
2430 ADD_STAT(numVecOpsExecutedF16,
2431 "number of f16 vec ops executed (e.g. WF size/inst)"),
2432 ADD_STAT(numVecOpsExecutedF32,
2433 "number of f32 vec ops executed (e.g. WF size/inst)"),
2434 ADD_STAT(numVecOpsExecutedF64,
2435 "number of f64 vec ops executed (e.g. WF size/inst)"),
2436 ADD_STAT(numVecOpsExecutedFMA16,
2437 "number of fma16 vec ops executed (e.g. WF size/inst)"),
2438 ADD_STAT(numVecOpsExecutedFMA32,
2439 "number of fma32 vec ops executed (e.g. WF size/inst)"),
2440 ADD_STAT(numVecOpsExecutedFMA64,
2441 "number of fma64 vec ops executed (e.g. WF size/inst)"),
2442 ADD_STAT(numVecOpsExecutedMAC16,
2443 "number of mac16 vec ops executed (e.g. WF size/inst)"),
2444 ADD_STAT(numVecOpsExecutedMAC32,
2445 "number of mac32 vec ops executed (e.g. WF size/inst)"),
2446 ADD_STAT(numVecOpsExecutedMAC64,
2447 "number of mac64 vec ops executed (e.g. WF size/inst)"),
2448 ADD_STAT(numVecOpsExecutedMAD16,
2449 "number of mad16 vec ops executed (e.g. WF size/inst)"),
2450 ADD_STAT(numVecOpsExecutedMAD32,
2451 "number of mad32 vec ops executed (e.g. WF size/inst)"),
2452 ADD_STAT(numVecOpsExecutedMAD64,
2453 "number of mad64 vec ops executed (e.g. WF size/inst)"),
2454 ADD_STAT(numVecOpsExecutedMFMA,
2455 "number of mfma vec ops executed (e.g. WF size/inst)"),
2456 ADD_STAT(numVecOpsExecutedMFMAI8,
2457 "number of i8 mfma vec ops executed (e.g. WF size/inst)"),
2458 ADD_STAT(numVecOpsExecutedMFMAF16,
2459 "number of f16 mfma vec ops executed (e.g. WF size/inst)"),
2460 ADD_STAT(numVecOpsExecutedMFMAF32,
2461 "number of f32 mfma vec ops executed (e.g. WF size/inst)"),
2462 ADD_STAT(numVecOpsExecutedMFMAF64,
2463 "number of f64 mfma vec ops executed (e.g. WF size/inst)"),
2464 ADD_STAT(numVecOpsExecutedTwoOpFP,
2465 "number of two op FP vec ops executed (e.g. WF size/inst)"),
2466 ADD_STAT(totalCycles, "number of cycles the CU ran for"),
2467 ADD_STAT(vpc, "Vector Operations per cycle (this CU only)"),
2468 ADD_STAT(vpc_f16, "F16 Vector Operations per cycle (this CU only)"),
2469 ADD_STAT(vpc_f32, "F32 Vector Operations per cycle (this CU only)"),
2470 ADD_STAT(vpc_f64, "F64 Vector Operations per cycle (this CU only)"),
2471 ADD_STAT(ipc, "Instructions per cycle (this CU only)"),
2472 ADD_STAT(controlFlowDivergenceDist, "number of lanes active per "
2473 "instruction (over all instructions)"),
2474 ADD_STAT(activeLanesPerGMemInstrDist,
2475 "number of active lanes per global memory instruction"),
2476 ADD_STAT(activeLanesPerLMemInstrDist,
2477 "number of active lanes per local memory instruction"),
2478 ADD_STAT(numALUInstsExecuted,
2479 "Number of dynamic non-GM memory insts executed"),
2480 ADD_STAT(numTimesWgBlockedDueVgprAlloc, "Number of times WGs are "
2481 "blocked due to VGPR allocation per SIMD"),
2482 ADD_STAT(numTimesWgBlockedDueSgprAlloc, "Number of times WGs are "
2483 "blocked due to SGPR allocation per SIMD"),
2484 ADD_STAT(numCASOps, "number of compare and swap operations"),
2485 ADD_STAT(numFailedCASOps,
2486 "number of compare and swap operations that failed"),
2487 ADD_STAT(completedWfs, "number of completed wavefronts"),
2488 ADD_STAT(completedWGs, "number of completed workgroups"),
2489 ADD_STAT(headTailLatency, "ticks between first and last cache block "
2490 "arrival at coalescer"),
2491 ADD_STAT(instInterleave, "Measure of instruction interleaving per SIMD")
2492{
2493 ComputeUnit *cu = static_cast<ComputeUnit*>(parent);
2494
2498
2500 execRateDist.init(0, 10-1, 2);
2501 ldsBankConflictDist.init(0, cu->wfSize()-1, 2);
2502
2503 pageDivergenceDist.init(1, cu->wfSize(), 4);
2507
2508 headTailLatency.init(0, 1000000-1, 10000).flags(statistics::pdf |
2509 statistics::oneline);
2510 waveLevelParallelism.init(0, n_wf * cu->numVectorALUs, 1);
2511 instInterleave.init(cu->numVectorALUs, 0, 20, 1);
2512
2523
2532
2540
2542
2543 // fixed number of TLB levels
2544 for (int i = 0; i < 4; ++i) {
2545 if (!i)
2546 hitsPerTLBLevel.subname(i,"page_table");
2547 else
2548 hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
2549 }
2550
2556
2559}
2560
2561} // namespace gem5
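// --- Editor's illustrative sketch (not part of compute_unit.cc) ---
// Several formula-wiring lines of the ComputeUnitStats constructor were lost
// in the listing above. The per-wavefront, per-kilo-instruction and per-cycle
// statistics declared with ADD_STAT are derived quantities: in gem5 they are
// expressed as statistics::Formula objects over the raw counters and
// evaluated at stats-dump time, not incremented directly. The plain-C++
// sketch below shows the arithmetic such formulas typically encode; the names
// mirror the stats above, but the exact wiring is an assumption, not the
// recovered source.
#include <cstdint>

namespace example {

struct RawCounters {
    double vALUInsts = 0, completedWfs = 0;
    double vectorMemReads = 0, numInstrExecuted = 0;
    double numVecOpsExecuted = 0, totalCycles = 0;
    double tlbCycles = 0, tlbRequests = 0;
};

struct DerivedStats {
    double vALUInstsPerWF;
    double vectorMemReadsPerKiloInst;
    double vpc;
    double ipc;
    double tlbLatency;
};

inline DerivedStats derive(const RawCounters &c)
{
    return {
        c.vALUInsts / c.completedWfs,                      // avg per wavefront
        (c.vectorMemReads / c.numInstrExecuted) * 1000.0,  // per kilo-inst
        c.numVecOpsExecuted / c.totalCycles,               // vector ops / cycle
        c.numInstrExecuted / c.totalCycles,                // instructions / cycle
        c.tlbCycles / c.tlbRequests                        // avg TLB latency
    };
}

} // namespace example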