gem5 [DEVELOP-FOR-25.0]
compute_unit.cc
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include "gpu-compute/compute_unit.hh"
33
34#include <limits>
35
38#include "base/output.hh"
39#include "debug/GPUDisp.hh"
40#include "debug/GPUExec.hh"
41#include "debug/GPUFetch.hh"
42#include "debug/GPUMem.hh"
43#include "debug/GPUPort.hh"
44#include "debug/GPUPrefetch.hh"
45#include "debug/GPUReg.hh"
46#include "debug/GPURename.hh"
47#include "debug/GPUSync.hh"
48#include "debug/GPUTLB.hh"
49#include "debug/GPUTrace.hh"
50#include "enums/GfxVersion.hh"
57#include "gpu-compute/shader.hh"
61#include "mem/page_table.hh"
62#include "sim/process.hh"
63#include "sim/sim_exit.hh"
64
65namespace gem5
66{
67
68ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p),
69 numVectorGlobalMemUnits(p.num_global_mem_pipes),
70 numVectorSharedMemUnits(p.num_shared_mem_pipes),
71 numScalarMemUnits(p.num_scalar_mem_pipes),
72 numVectorALUs(p.num_SIMDs),
73 numScalarALUs(p.num_scalar_cores),
74 vrfToCoalescerBusWidth(p.vrf_to_coalescer_bus_width),
75 coalescerToVrfBusWidth(p.coalescer_to_vrf_bus_width),
76 registerManager(p.register_manager),
77 fetchStage(p, *this),
78 scoreboardCheckStage(p, *this, scoreboardCheckToSchedule),
79 scheduleStage(p, *this, scoreboardCheckToSchedule, scheduleToExecute),
80 execStage(p, *this, scheduleToExecute),
81 globalMemoryPipe(p, *this),
82 localMemoryPipe(p, *this),
83 scalarMemoryPipe(p, *this),
84 tickEvent([this]{ exec(); }, "Compute unit tick event",
85 false, Event::CPU_Tick_Pri),
86 cu_id(p.cu_id),
87 vrf(p.vector_register_file), srf(p.scalar_register_file),
88 rfc(p.register_file_cache),
89 simdWidth(p.simd_width),
90 spBypassPipeLength(p.spbypass_pipe_length),
91 dpBypassPipeLength(p.dpbypass_pipe_length),
92 rfcPipeLength(p.rfc_pipe_length),
93 scalarPipeStages(p.scalar_pipe_length),
94 operandNetworkLength(p.operand_network_length),
95 issuePeriod(p.issue_period),
96 vrf_gm_bus_latency(p.vrf_gm_bus_latency),
97 srf_scm_bus_latency(p.srf_scm_bus_latency),
98 vrf_lm_bus_latency(p.vrf_lm_bus_latency),
99 perLaneTLB(p.perLaneTLB), prefetchDepth(p.prefetch_depth),
100 prefetchStride(p.prefetch_stride), prefetchType(p.prefetch_prev_type),
101 debugSegFault(p.debugSegFault),
102 functionalTLB(p.functionalTLB), localMemBarrier(p.localMemBarrier),
103 countPages(p.countPages),
104 req_tick_latency(p.mem_req_latency * p.clk_domain->clockPeriod()),
105 resp_tick_latency(p.mem_resp_latency * p.clk_domain->clockPeriod()),
106 scalar_req_tick_latency(
107 p.scalar_mem_req_latency * p.clk_domain->clockPeriod()),
108 scalar_resp_tick_latency(
109 p.scalar_mem_resp_latency * p.clk_domain->clockPeriod()),
110 memtime_latency(p.memtime_latency * p.clk_domain->clockPeriod()),
111 mfma_scale(p.mfma_scale),
112 mfma_cycles({
113 // gfx90a is MI200 series (MI210, MI250X). The latency values are the
114 // "passes" in the MI200 Instruction Set Architecture reference listed
115 // for each instruction in section 12.10:
116 // https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/
117 // instruction-set-architectures/
118 // instinct-mi200-cdna2-instruction-set-architecture.pdf
119 {GfxVersion::gfx90a, {
120 {"v_mfma_f32_32x32x1_2b_f32", 64},
121 {"v_mfma_f32_16x16x1_4b_f32", 32},
122 {"v_mfma_f32_4x4x1_16b_f32", 8},
123 {"v_mfma_f32_32x32x2_f32", 64},
124 {"v_mfma_f32_16x16x4_f32", 32},
125 {"v_mfma_f32_32x32x4_2b_f16", 64},
126 {"v_mfma_f32_16x16x4_4b_f16", 32},
127 {"v_mfma_f32_4x4x4_16b_f16", 8},
128 {"v_mfma_f32_32x32x8_f16", 64},
129 {"v_mfma_f32_16x16x16_f16", 32},
130 {"v_mfma_i32_32x32x4_2b_i8", 64},
131 {"v_mfma_i32_16x16x4_4b_i8", 32},
132 {"v_mfma_i32_4x4x4_16b_i8", 8},
133 {"v_mfma_i32_32x32x8_i8", 64},
134 {"v_mfma_i32_16x16x16_i8", 32},
135 {"v_mfma_f32_32x32x2_2b_bf16", 64},
136 {"v_mfma_f32_16x16x2_4b_bf16", 32},
137 {"v_mfma_f32_4x4x2_16b_bf16", 8},
138 {"v_mfma_f32_32x32x4_bf16", 64},
139 {"v_mfma_f32_16x16x8_bf16", 32},
140 {"v_mfma_f64_16x16x4_f64", 32},
141 {"v_mfma_f64_4x4x4_4b_f64", 16},
142 }},
143 // gfx942 is MI300X. The latency values are taken from table 28 in
144 // section 7.1.2 in the MI300 Instruction Set Architecture reference:
145 // https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/
146 // instruction-set-architectures/
147 // amd-instinct-mi300-cdna3-instruction-set-architecture.pdf
148 {GfxVersion::gfx942, {
149 {"v_mfma_f32_32x32x1_2b_f32", 64},
150 {"v_mfma_f32_16x16x1_4b_f32", 32},
151 {"v_mfma_f32_4x4x1_16b_f32", 8},
152 {"v_mfma_f32_32x32x2_f32", 64},
153 {"v_mfma_f32_16x16x4_f32", 32},
154 {"v_mfma_f32_32x32x4_2b_f16", 64},
155 {"v_mfma_f32_16x16x4_4b_f16", 32},
156 {"v_mfma_f32_4x4x4_16b_f16", 8},
157 {"v_mfma_f32_32x32x8_f16", 32},
158 {"v_mfma_f32_16x16x16_f16", 16},
159 {"v_mfma_f32_32x32x4_2b_bf16", 64},
160 {"v_mfma_f32_16x16x4_4b_bf16", 32},
161 {"v_mfma_f32_4x4x4_16b_bf16", 8},
162 {"v_mfma_f32_32x32x8_bf16", 32},
163 {"v_mfma_f32_16x16x16_bf16", 16},
164 {"v_mfma_i32_32x32x4_2b_i8", 64},
165 {"v_mfma_i32_16x16x4_4b_i8", 32},
166 {"v_mfma_i32_4x4x4_16b_i8", 8},
167 {"v_mfma_i32_32x32x16_i8", 32},
168 {"v_mfma_i32_16x16x32_i8", 16},
169 {"v_mfma_f32_16x16x8_xf32", 16},
170 {"v_mfma_f32_32x32x4_xf32", 32},
171 {"v_mfma_f64_16x16x4_f64", 32},
172 {"v_mfma_f64_4x4x4_4b_f64", 16},
173 {"v_mfma_f32_16x16x32_bf8_bf8", 16},
174 {"v_mfma_f32_16x16x32_bf8_fp8", 16},
175 {"v_mfma_f32_32x32x16_fp8_bf8", 32},
176 {"v_mfma_f32_32x32x16_fp8_fp8", 32},
177 }}
178 }),
179 _requestorId(p.system->getRequestorId(this, "ComputeUnit")),
180 lds(*p.localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
181 ldsPort(csprintf("%s-port", name()), this),
182 scalarDataPort(csprintf("%s-port", name()), this),
183 scalarDTLBPort(csprintf("%s-port", name()), this),
184 sqcPort(csprintf("%s-port", name()), this),
185 sqcTLBPort(csprintf("%s-port", name()), this),
186 _cacheLineSize(p.system->cacheLineSize()),
187 _numBarrierSlots(p.num_barrier_slots),
188 globalSeqNum(0), wavefrontSize(p.wf_size),
189 scoreboardCheckToSchedule(p),
190 scheduleToExecute(p),
191 stats(this, p.n_wf)
192{
193 // This is not currently supported and would require adding more handling
194 // for system vs. device memory requests on the functional paths, so we
195 // fatal immediately in the constructor if this configuration is seen.
196 fatal_if(functionalTLB && FullSystem,
197 "Functional TLB not supported in full-system GPU simulation");
198
208 fatal_if(p.wf_size > std::numeric_limits<unsigned long long>::digits ||
209 p.wf_size <= 0,
210 "WF size is larger than the host can support");
211 fatal_if(!isPowerOf2(wavefrontSize),
212 "Wavefront size should be a power of 2");
213 // calculate how many cycles a vector load or store will need to transfer
214 // its data over the corresponding buses
215 numCyclesPerStoreTransfer =
216 (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
217 (double)vrfToCoalescerBusWidth);
218
219 numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
220 / coalescerToVrfBusWidth;
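// A minimal sketch of the same transfer-cycle arithmetic, assuming a
// 64-lane wavefront and illustrative 32-byte bus widths (example values,
// not the defaults of any particular config):
//
//     constexpr int wf_size = 64;                   // lanes
//     constexpr int vrf_to_coalescer_bus = 32;      // bytes per cycle
//     constexpr int coalescer_to_vrf_bus = 32;      // bytes per cycle
//     static_assert((wf_size * 4 + vrf_to_coalescer_bus - 1)
//                       / vrf_to_coalescer_bus == 8);     // store cycles
//     static_assert((wf_size * 4) / coalescer_to_vrf_bus == 8); // loads
//
// Note that the store path rounds up while the load path uses plain
// integer division, so a load bus width that does not evenly divide the
// transfer size is rounded down.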
221
222 // Initialization: all WF slots are assumed STOPPED
223 idleWfs = p.n_wf * numVectorALUs;
224 lastVaddrWF.resize(numVectorALUs);
225 wfList.resize(numVectorALUs);
226
227 wfBarrierSlots.resize(p.num_barrier_slots, WFBarrier());
228
229 for (int i = 0; i < p.num_barrier_slots; ++i) {
230 freeBarrierIds.insert(i);
231 }
232
233 for (int j = 0; j < numVectorALUs; ++j) {
234 lastVaddrWF[j].resize(p.n_wf);
235
236 for (int i = 0; i < p.n_wf; ++i) {
237 lastVaddrWF[j][i].resize(wfSize());
238
239 wfList[j].push_back(p.wavefronts[j * p.n_wf + i]);
240 wfList[j][i]->setParent(this);
241
242 for (int k = 0; k < wfSize(); ++k) {
243 lastVaddrWF[j][i][k] = 0;
244 }
245 }
246 }
247
248 lastVaddrSimd.resize(numVectorALUs);
249
250 for (int i = 0; i < numVectorALUs; ++i) {
251 lastVaddrSimd[i].resize(wfSize(), 0);
252 }
253
254 lastVaddrCU.resize(wfSize());
255
256 lds.setParent(this);
257
258 if (p.execPolicy == "OLDEST-FIRST") {
259 exec_policy = EXEC_POLICY::OLDEST;
260 } else if (p.execPolicy == "ROUND-ROBIN") {
261 exec_policy = EXEC_POLICY::RR;
262 } else {
263 fatal("Invalid WF execution policy (CU)\n");
264 }
265
266 for (int i = 0; i < p.port_memory_port_connection_count; ++i) {
267 memPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
268 }
269
270 for (int i = 0; i < p.port_translation_port_connection_count; ++i) {
271 tlbPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
272 }
273
274 // Setup tokens for response ports. The number of tokens in memPortTokens
275 // is the total token count for the entire vector port (i.e., this CU).
276 memPortTokens = new TokenManager(p.max_cu_tokens);
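// A sketch of the intended token accounting, pieced together from the
// comments elsewhere in this file rather than from the TokenManager
// implementation itself: a token is acquired when a vector memory packet
// is placed on the dispatch list and is handed back when the coalescer
// responds (or immediately for system requests, see
// DTLBPort::recvTimingResp below), so at most p.max_cu_tokens vector
// packets from this CU can be outstanding at once across all memPort
// instances.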
277
278 registerExitCallback([this]() { exitCallback(); });
279
280 lastExecCycle.resize(numVectorALUs, 0);
281
282 for (int i = 0; i < vrf.size(); ++i) {
283 vrf[i]->setParent(this);
284 rfc[i]->setParent(this);
285 }
286 for (int i = 0; i < srf.size(); ++i) {
287 srf[i]->setParent(this);
288 }
289 numVecRegsPerSimd = vrf[0]->numRegs();
290 numScalarRegsPerSimd = srf[0]->numRegs();
291
292 registerManager->setParent(this);
293
294 activeWaves = 0;
295
296 instExecPerSimd.resize(numVectorALUs, 0);
297
298 // Calculate the number of bits to address a cache line
299 panic_if(!isPowerOf2(_cacheLineSize),
300 "Cache line size should be a power of two.");
301 cacheLineBits = floorLog2(_cacheLineSize);
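// Worked example: with a 64-byte cache line, floorLog2(64) = 6, so the
// low 6 bits of an address select a byte within the line and the
// remaining bits identify the line. The power-of-two check above is what
// guarantees this bit split is exact.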
302
303 matrix_core_ready.resize(numVectorALUs);
304 for (int i = 0; i < numVectorALUs; i++) {
305 matrix_core_ready[i] = 0;
306 }
307
308 // Used for periodic pipeline prints
309 execCycles = 0;
310}
311
313{
314 // Delete wavefront slots
315 for (int j = 0; j < numVectorALUs; ++j) {
316 for (int i = 0; i < shader->n_wf; ++i) {
317 delete wfList[j][i];
318 }
319 lastVaddrSimd[j].clear();
320 }
321 lastVaddrCU.clear();
322}
323
324int
330
331// index into readyList of the first memory unit
332int
337
338// index into readyList of the last memory unit
339int
340ComputeUnit::lastMemUnit() const
341{
342 return numExeUnits() - 1;
343}
344
345// index into scalarALUs vector of SALU used by the wavefront
346int
347ComputeUnit::mapWaveToScalarAlu(Wavefront *w) const
348{
349 if (numScalarALUs == 1) {
350 return 0;
351 } else {
352 return w->simdId % numScalarALUs;
353 }
354}
355
356// index into readyList of Scalar ALU unit used by wavefront
357int
362
363// index into readyList of Global Memory unit used by wavefront
364int
366{
367 // TODO: FIXME if more than 1 GM pipe supported
369}
370
371// index into readyList of Local Memory unit used by wavefront
372int
374{
375 // TODO: FIXME if more than 1 LM pipe supported
377}
378
379// index into readyList of Scalar Memory unit used by wavefront
380int
382{
383 // TODO: FIXME if more than 1 ScM pipe supported
386}
387
388void
390{
391 w->resizeRegFiles(task->numVectorRegs(), task->numScalarRegs());
392 w->workGroupSz[0] = task->wgSize(0);
393 w->workGroupSz[1] = task->wgSize(1);
394 w->workGroupSz[2] = task->wgSize(2);
395 w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
396 w->gridSz[0] = task->gridSize(0);
397 w->gridSz[1] = task->gridSize(1);
398 w->gridSz[2] = task->gridSize(2);
399 w->computeActualWgSz(task);
400}
401
402void
404 HSAQueueEntry *task, int bar_id, bool fetchContext)
405{
406 static int _n_wave = 0;
407
408 VectorMask init_mask;
409 init_mask.reset();
410
411 for (int k = 0; k < wfSize(); ++k) {
412 if (k + waveId * wfSize() < w->actualWgSzTotal)
413 init_mask[k] = 1;
414 }
415
416 w->execMask() = init_mask;
417
418 w->kernId = task->dispatchId();
419 w->wfId = waveId;
420 w->initMask = init_mask.to_ullong();
421
422 if (bar_id > WFBarrier::InvalidID) {
423 w->barrierId(bar_id);
424 } else {
425 assert(!w->hasBarrier());
426 }
427
428 for (int k = 0; k < wfSize(); ++k) {
429 w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
430 w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
431 w->actualWgSz[1];
432 w->workItemId[2][k] = (k + waveId * wfSize()) /
433 (w->actualWgSz[0] * w->actualWgSz[1]);
434
435 w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
436 w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
437 w->workItemId[0][k];
438 }
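// Worked example of the lane-to-work-item mapping above (sizes are
// illustrative): with wfSize() = 64, waveId = 1 and actualWgSz =
// {16, 4, 2}, lane k = 21 corresponds to flattened work item
// 64 + 21 = 85, giving workItemId[0] = 85 % 16 = 5,
// workItemId[1] = (85 / 16) % 4 = 1 and workItemId[2] = 85 / (16 * 4) = 1;
// workItemFlatId recombines these as 1 * 16 * 4 + 1 * 16 + 5 = 85.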
439
440 // WG state
441 w->wgId = task->globalWgId();
442 w->dispatchId = task->dispatchId();
443 w->workGroupId[0] = w->wgId % task->numWg(0);
444 w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1);
445 w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1));
446
447 // set the wavefront context to have a pointer to this section of the LDS
448 w->ldsChunk = ldsChunk;
449
450 [[maybe_unused]] int32_t refCount =
451 lds.increaseRefCounter(w->dispatchId, w->wgId);
452 DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
453 cu_id, w->wgId, refCount);
454
455 w->instructionBuffer.clear();
456
457 if (w->pendingFetch)
458 w->dropFetch = true;
459
460 DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
461 "WF[%d][%d]. Ref cnt:%d\n", _n_wave, w->barrierId(), cu_id,
462 w->simdId, w->wfSlotId, refCount);
463
464 w->initRegState(task, w->actualWgSzTotal);
465 w->start(_n_wave++, task->codeAddr());
466
467 stats.waveLevelParallelism.sample(activeWaves);
468 activeWaves++;
469
470 w->vmemIssued.clear();
471 w->lgkmIssued.clear();
472 w->expIssued.clear();
473
474 panic_if(w->wrGmReqsInPipe, "GM write counter for wavefront non-zero\n");
475 panic_if(w->rdGmReqsInPipe, "GM read counter for wavefront non-zero\n");
476 panic_if(w->wrLmReqsInPipe, "LM write counter for wavefront non-zero\n");
477 panic_if(w->rdLmReqsInPipe, "LM read counter for wavefront non-zero\n");
478 panic_if(w->outstandingReqs,
479 "Outstanding reqs counter for wavefront non-zero\n");
480}
481
487void
489 GPUDynInstPtr gpuDynInst
490 = std::make_shared<GPUDynInst>(this, nullptr,
492
493 // kern_id will be used in inv responses
494 gpuDynInst->kern_id = kernId;
495
496 injectGlobalMemFence(gpuDynInst, true, req);
497}
498
504void
506 injectGlobalMemFence(gpuDynInst, true);
507}
508
514void
516 GPUDynInstPtr gpuDynInst
517 = std::make_shared<GPUDynInst>(this, nullptr,
519
520 // kern_id will be used in inv responses
521 gpuDynInst->kern_id = kernId;
522
523 gpuDynInst->staticInstruction()->setFlag(GPUStaticInst::Scalar);
524 scalarMemoryPipe.injectScalarMemFence(gpuDynInst, true, req);
525}
526
527// Resetting the SIMD register pools.
528// There is no better place to do this, and
529// it is needed by this implementation.
530void
532{
533 for (int i=0; i<numVectorALUs; i++)
534 {
535 registerManager->vrfPoolMgrs[i]->resetRegion(numVecRegsPerSimd);
536 registerManager->srfPoolMgrs[i]->resetRegion(numScalarRegsPerSimd);
537 }
538}
539
540void
542{
543 // If we aren't ticking, start it up!
544 if (!tickEvent.scheduled()) {
545 DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id);
546 schedule(tickEvent, nextCycle());
547 }
548
549 // the kernel's invalidate must have finished before any wg dispatch
550 assert(task->isInvDone());
551
552 // reserve the LDS capacity allocated to the work group
553 // disambiguated by the dispatch ID and workgroup ID, which should be
554 // globally unique
555 LdsChunk *ldsChunk = lds.reserveSpace(task->dispatchId(),
556 task->globalWgId(),
557 task->ldsSize());
558
559 panic_if(!ldsChunk, "was not able to reserve space for this WG");
560
561 // calculate the number of 32-bit vector registers required
562 // by each work item
563 int vregDemand = task->numVectorRegs();
564 int sregDemand = task->numScalarRegs();
565 int wave_id = 0;
566
567 int barrier_id = WFBarrier::InvalidID;
568
573 if (num_wfs_in_wg > 1) {
578 barrier_id = getFreeBarrierId();
579 auto &wf_barrier = barrierSlot(barrier_id);
580 assert(!wf_barrier.maxBarrierCnt());
581 assert(!wf_barrier.numAtBarrier());
582 wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);
583
584 DPRINTF(GPUSync, "CU[%d] - Dispatching WG with barrier Id%d. "
585 "%d waves using this barrier.\n", cu_id, barrier_id,
586 num_wfs_in_wg);
587 }
588
589 // Assign WFs according to numWfsToSched vector, which is computed by
590 // hasDispResources()
591 for (int j = 0; j < shader->n_wf; ++j) {
592 for (int i = 0; i < numVectorALUs; ++i) {
593 Wavefront *w = wfList[i][j];
594 // Check if this wavefront slot is available and there are WFs
595 // remaining to be dispatched to current SIMD:
596 // WF slot must be stopped and not waiting
597 // for a release to complete (i.e., not in S_RETURNING)
598 if (w->getStatus() == Wavefront::S_STOPPED &&
599 numWfsToSched[i] > 0) {
600 // decrement number of WFs awaiting dispatch to current SIMD
601 numWfsToSched[i] -= 1;
602
603 fillKernelState(w, task);
604
605 DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] "
606 "vregDemand[%d] sregDemand[%d]\n", i, j, w->wfDynId,
607 vregDemand, sregDemand);
608
609 registerManager->allocateRegisters(w, vregDemand, sregDemand);
610
611 startWavefront(w, wave_id, ldsChunk, task, barrier_id);
612 ++wave_id;
613 }
614 }
615 }
616}
617
618void
620{
621 panic_if(w->instructionBuffer.empty(),
622 "Instruction Buffer of WF%d can't be empty", w->wgId);
623 GPUDynInstPtr ii = w->instructionBuffer.front();
624 pipeMap.emplace(ii->seqNum());
625}
626
627void
629{
630 panic_if(w->instructionBuffer.empty(),
631 "Instruction Buffer of WF%d can't be empty", w->wgId);
632 GPUDynInstPtr ii = w->instructionBuffer.front();
633 // delete the dynamic instruction from the pipeline map
634 auto it = pipeMap.find(ii->seqNum());
635 panic_if(it == pipeMap.end(), "Pipeline Map is empty\n");
636 pipeMap.erase(it);
637}
638
639bool
641{
642 // compute true size of workgroup (after clamping to grid size)
643 int trueWgSize[HSAQueueEntry::MAX_DIM];
644 int trueWgSizeTotal = 1;
645
646 for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
647 trueWgSize[d] = std::min(task->wgSize(d), task->gridSize(d) -
648 task->wgId(d) * task->wgSize(d));
649
650 trueWgSizeTotal *= trueWgSize[d];
651 DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
652 }
653
654 DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
655
656 // calculate the number of WFs in this WG
657 int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
658 num_wfs_in_wg = numWfs;
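// Worked example (sizes are illustrative): with gridSize = {100, 1, 1},
// wgSize = {64, 1, 1} and wgId(0) = 1, the last work-group in x is
// clamped to 100 - 1 * 64 = 36 work items, so trueWgSizeTotal = 36 and,
// with wfSize() = 64, numWfs = (36 + 63) / 64 = 1. A full 64x2
// work-group would instead give trueWgSizeTotal = 128 and numWfs = 2.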
659
660 bool barrier_avail = true;
661
662 if (numWfs > 1 && !freeBarrierIds.size()) {
663 barrier_avail = false;
664 }
665
666 // calculate the number of 32-bit vector registers required by each
667 // work item of the work group
668 int vregDemandPerWI = task->numVectorRegs();
669 // calculate the number of 32-bit scalar registers required by each
670 // work item of the work group
671 int sregDemandPerWI = task->numScalarRegs();
672
673 // check if the total number of VGPRs and SGPRs required by all WFs
674 // of the WG fit in the VRFs of all SIMD units and the CU's SRF
675 panic_if((numWfs * vregDemandPerWI) > (numVectorALUs * numVecRegsPerSimd),
676 "WG with %d WFs and %d VGPRs per WI can not be allocated to CU "
677 "that has %d VGPRs\n",
678 numWfs, vregDemandPerWI, numVectorALUs * numVecRegsPerSimd);
679 panic_if((numWfs * sregDemandPerWI) > numScalarRegsPerSimd,
680 "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU "
681 "with %d SGPRs\n",
682 numWfs, sregDemandPerWI, numScalarRegsPerSimd);
683
684 // number of WF slots that are not occupied
685 int freeWfSlots = 0;
686 // number of Wfs from WG that were successfully mapped to a SIMD
687 int numMappedWfs = 0;
688 numWfsToSched.clear();
689 numWfsToSched.resize(numVectorALUs, 0);
690
691 // attempt to map WFs to the SIMDs, based on WF slot availability
692 // and register file availability
693 for (int j = 0; j < shader->n_wf; ++j) {
694 for (int i = 0; i < numVectorALUs; ++i) {
695 if (wfList[i][j]->getStatus() == Wavefront::S_STOPPED) {
696 ++freeWfSlots;
697 // check if current WF will fit onto current SIMD/VRF
698 // if all WFs have not yet been mapped to the SIMDs
699 if (numMappedWfs < numWfs &&
700 registerManager->canAllocateSgprs(i, numWfsToSched[i] + 1,
701 sregDemandPerWI) &&
702 registerManager->canAllocateVgprs(i, numWfsToSched[i] + 1,
703 vregDemandPerWI)) {
704 numWfsToSched[i]++;
705 numMappedWfs++;
706 }
707 }
708 }
709 }
710
711 // check that the number of mapped WFs is not greater
712 // than the actual number of WFs
713 assert(numMappedWfs <= numWfs);
714
715 bool vregAvail = true;
716 bool sregAvail = true;
717 // if a WF to SIMD mapping was not found, find the limiting resource
718 if (numMappedWfs < numWfs) {
719
720 for (int j = 0; j < numVectorALUs; ++j) {
721 // find if there are enough free VGPRs in the SIMD's VRF
722 // to accommodate the WFs of the new WG that would be mapped
723 // to this SIMD unit
724 vregAvail &= registerManager->
725 canAllocateVgprs(j, numWfsToSched[j], vregDemandPerWI);
726 // find if there are enough free SGPRs in the SIMD's SRF
727 // to accommodate the WFs of the new WG that would be mapped
728 // to this SIMD unit
729 sregAvail &= registerManager->
730 canAllocateSgprs(j, numWfsToSched[j], sregDemandPerWI);
731 }
732 }
733
734 DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, "
735 "VGPR Availability = %d, SGPR Availability = %d\n",
736 freeWfSlots, numMappedWfs, vregAvail, sregAvail);
737
738 if (!vregAvail) {
739 ++stats.numTimesWgBlockedDueVgprAlloc;
740 }
741
742 if (!sregAvail) {
743 ++stats.numTimesWgBlockedDueSgprAlloc;
744 }
745
746 // Return true if enough WF slots to submit workgroup and if there are
747 // enough VGPRs to schedule all WFs to their SIMD units
748 bool ldsAvail = lds.canReserve(task->ldsSize());
749 if (!ldsAvail) {
750 stats.wgBlockedDueLdsAllocation++;
751 }
752
753 if (!barrier_avail) {
754 stats.wgBlockedDueBarrierAllocation++;
755 }
756
757 // Return true if the following are all true:
758 // (a) all WFs of the WG were mapped to free WF slots
759 // (b) there are enough VGPRs to schedule all WFs to their SIMD units
760 // (c) there are enough SGPRs on the CU to schedule all WFs
761 // (d) there is enough space in LDS to allocate for all WFs
762 bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
763 && ldsAvail && barrier_avail;
764 return can_dispatch;
765}
766
767int
769{
770 auto &wf_barrier = barrierSlot(bar_id);
771 return wf_barrier.numYetToReachBarrier();
772}
773
774bool
776{
777 auto &wf_barrier = barrierSlot(bar_id);
778 return wf_barrier.allAtBarrier();
779}
780
781void
783{
784 auto &wf_barrier = barrierSlot(bar_id);
785 wf_barrier.incNumAtBarrier();
786}
787
788int
790{
791 auto &wf_barrier = barrierSlot(bar_id);
792 return wf_barrier.numAtBarrier();
793}
794
795int
797{
798 auto &wf_barrier = barrierSlot(bar_id);
799 return wf_barrier.maxBarrierCnt();
800}
801
802void
804{
805 auto &wf_barrier = barrierSlot(bar_id);
806 wf_barrier.reset();
807}
808
809void
811{
812 auto &wf_barrier = barrierSlot(bar_id);
813 wf_barrier.decMaxBarrierCnt();
814}
815
816void
818{
819 auto &wf_barrier = barrierSlot(bar_id);
820 wf_barrier.release();
821 freeBarrierIds.insert(bar_id);
822}
823
824void
826{
827 for (int i = 0; i < numVectorALUs; ++i) {
828 for (int j = 0; j < shader->n_wf; ++j) {
829 Wavefront *wf = wfList[i][j];
830 if (wf->barrierId() == bar_id) {
831 assert(wf->getStatus() == Wavefront::S_BARRIER);
833 }
834 }
835 }
836}
837
838// Execute one clock worth of work on the ComputeUnit.
839void
841{
842 // process reads and writes in the RFs
843 for (auto &vecRegFile : vrf) {
844 vecRegFile->exec();
845 }
846
847 for (auto &scRegFile : srf) {
848 scRegFile->exec();
849 }
850
851 // Execute pipeline stages in reverse order to simulate
852 // the pipeline latency
853 scalarMemoryPipe.exec();
854 globalMemoryPipe.exec();
855 localMemoryPipe.exec();
856 execStage.exec();
857 scheduleStage.exec();
858 scoreboardCheckStage.exec();
859 fetchStage.exec();
860
861 stats.totalCycles++;
862 execCycles++;
863
864 if (shader->getProgressInterval() != 0 &&
865 execCycles >= shader->getProgressInterval()) {
867 execCycles = 0;
868 }
869
870 // Put this CU to sleep if there is no more work to be done.
871 if (!isDone()) {
872 schedule(tickEvent, nextCycle());
873 } else {
874 shader->notifyCuSleep();
875 DPRINTF(GPUDisp, "CU%d: Going to sleep\n", cu_id);
876 }
877}
878
879void
881{
882 // Initialize CU Bus models and execution resources
883
884 // Vector ALUs
885 vectorALUs.clear();
886 for (int i = 0; i < numVectorALUs; i++) {
887 vectorALUs.emplace_back(this, clockPeriod());
888 }
889
890 // Scalar ALUs
891 scalarALUs.clear();
892 for (int i = 0; i < numScalarALUs; i++) {
893 scalarALUs.emplace_back(this, clockPeriod());
894 }
895
896 // Vector Global Memory
898 "No support for multiple Global Memory Pipelines exists!!!");
899 vectorGlobalMemUnit.init(this, clockPeriod());
901 glbMemToVrfBus.init(this, clockPeriod());
902
903 // Vector Local/Shared Memory
905 "No support for multiple Local Memory Pipelines exists!!!");
906 vectorSharedMemUnit.init(this, clockPeriod());
907 vrfToLocalMemPipeBus.init(this, clockPeriod());
908 locMemToVrfBus.init(this, clockPeriod());
909
910 // Scalar Memory
912 "No support for multiple Scalar Memory Pipelines exists!!!");
913 scalarMemUnit.init(this, clockPeriod());
915 scalarMemToSrfBus.init(this, clockPeriod());
916
919
920 fetchStage.init();
921 scheduleStage.init();
922 execStage.init();
923 globalMemoryPipe.init();
924
925 gmTokenPort.setTokenManager(memPortTokens);
926}
927
928bool
933
934bool
936{
937 // Ruby has completed the memory op. Schedule the mem_resp_event at the
938 // appropriate cycle to process the timing memory response
939 // This delay represents the pipeline delay
940 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
941 PortID index = sender_state->port_index;
942 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
943 GPUDispatcher &dispatcher = computeUnit->shader->dispatcher();
944
945 // MemSyncResp + WriteAckResp are handled completely here and we don't
946 // schedule a MemRespEvent to process the responses further
947 if (pkt->cmd == MemCmd::MemSyncResp) {
948 // This response is for 1 of the following request types:
949 // - kernel launch
950 // - kernel end
951 // - non-kernel mem sync
952
953 // Non-kernel mem sync not from an instruction
954 if (!gpuDynInst) {
955 // If there is no dynamic instruction, a CU must be present.
956 ComputeUnit *cu = sender_state->computeUnit;
957 assert(cu != nullptr);
958
959 if (pkt->req->isInvL2()) {
960 cu->shader->decNumOutstandingInvL2s();
961 assert(cu->shader->getNumOutstandingInvL2s() >= 0);
962 } else {
963 panic("Unknown MemSyncResp not from an instruction");
964 }
965
966 // Cleanup and return, no other response events needed.
967 delete pkt->senderState;
968 delete pkt;
969 return true;
970 }
971
972 // Kernel Launch
973 // wavefront was nullptr when launching kernel, so it is meaningless
974 // here (simdId=-1, wfSlotId=-1)
975 if (gpuDynInst->isKernelLaunch()) {
976 // for kernel launch, the original request must be both kernel-type
977 // and INV_L1
978 assert(pkt->req->isKernel());
979 assert(pkt->req->isInvL1());
980
981 // one D-Cache inv is done, decrement counter
982 dispatcher.updateInvCounter(gpuDynInst->kern_id);
983
984 delete pkt->senderState;
985 delete pkt;
986 return true;
987 }
988
989 // retrieve wavefront from inst
990 Wavefront *w = gpuDynInst->wavefront();
991
992 // Check if we are waiting on Kernel End Flush
993 if (w->getStatus() == Wavefront::S_RETURNING
994 && gpuDynInst->isEndOfKernel()) {
995 // for kernel end, the original request must be both kernel-type
996 // and last-level GPU cache should be flushed if it contains
997 // dirty data. This request may have been quiesced and
998 // immediately responded to if the GL2 is a write-through /
999 // read-only cache.
1000 assert(pkt->req->isKernel());
1001 assert(pkt->req->isGL2CacheFlush());
1002
1003 // once flush done, decrement counter, and return whether all
1004 // dirty writeback operations are done for the kernel
1005 bool isWbDone = dispatcher.updateWbCounter(gpuDynInst->kern_id);
1006
1007 // not all wbs are done for the kernel, just release pkt
1008 // resources
1009 if (!isWbDone) {
1010 delete pkt->senderState;
1011 delete pkt;
1012 return true;
1013 }
1014
1015 // all wbs are completed for the kernel, do retirement work
1016 // for the workgroup
1017 DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
1018 computeUnit->cu_id, w->simdId, w->wfSlotId,
1019 w->wfDynId, w->wgId);
1020
1021 dispatcher.notifyWgCompl(w);
1022 w->setStatus(Wavefront::S_STOPPED);
1023 }
1024
1025 if (!pkt->req->isKernel()) {
1026 w = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
1027 DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing "
1028 "outstanding reqs %d => %d\n", gpuDynInst->simdId,
1029 gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
1030 gpuDynInst->disassemble(), w->outstandingReqs,
1031 w->outstandingReqs - 1);
1032 computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
1033 }
1034
1035 delete pkt->senderState;
1036 delete pkt;
1037 return true;
1038 }
1039
1040 EventFunctionWrapper *mem_resp_event =
1041 computeUnit->memPort[index].createMemRespEvent(pkt);
1042
1043 DPRINTF(GPUPort,
1044 "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
1045 computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1046 gpuDynInst->seqNum(), index, pkt->req->getPaddr());
1047
1048 computeUnit->schedule(mem_resp_event,
1049 curTick() + computeUnit->resp_tick_latency);
1050
1051 return true;
1052}
1053
1054bool
1059
1060bool
1062{
1063 // From scalar cache invalidate that was issued at kernel start.
1064 if (pkt->req->isKernel()) {
1065 delete pkt->senderState;
1066 delete pkt;
1067
1068 return true;
1069 }
1070
1071 assert(!pkt->req->isKernel());
1072
1073 // retrieve sender state
1074 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1075 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1076
1077 assert(pkt->isRead() || pkt->isWrite());
1078 assert(gpuDynInst->numScalarReqs > 0);
1079
1080 gpuDynInst->numScalarReqs--;
1081
1090 if (!gpuDynInst->numScalarReqs) {
1091 if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
1092 computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
1093 gpuDynInst);
1094 } else {
1095 computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
1096 gpuDynInst);
1097 }
1098 }
1099
1100 delete pkt->senderState;
1101 delete pkt;
1102
1103 return true;
1104}
1105
1106void
1108{
1109 while (!retries.empty()) {
1110 PacketPtr pkt = retries.front();
1111 if (!sendTimingReq(pkt)) {
1112 break;
1113 }
1114 retries.pop_front();
1115 }
1116}
1117
1118void
1120{
1121 int len = retries.size();
1122
1123 assert(len > 0);
1124
1125 for (int i = 0; i < len; ++i) {
1126 PacketPtr pkt = retries.front().first;
1127 [[maybe_unused]] GPUDynInstPtr gpuDynInst = retries.front().second;
1128 DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
1129 computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1130 pkt->req->getPaddr());
1131
1135 if (!sendTimingReq(pkt)) {
1136 DPRINTF(GPUMem, "failed again!\n");
1137 break;
1138 } else {
1139 DPRINTF(GPUMem, "successful!\n");
1140 retries.pop_front();
1141 }
1142 }
1143}
1144
1145bool
1147{
1148 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1153 if (sender_state->wavefront != nullptr) {
1154 RequestPtr req = pkt->req;
1155 // If the sender state's isKernDispatch is set, then the request came
1156 // from the gpu command processor. The request fetches information
1157 // that will be used in the kernel dispatch process. It should be
1158 // handled in the gpu command processor. If the flag isn't set,
1159 // then the request is an instruction fetch and can be handled in
1160 // the compute unit
1161 if (sender_state->isKernDispatch) {
1162 int dispType = sender_state->dispatchType;
1163 computeUnit->shader->gpuCmdProc.completeTimingRead(dispType);
1164 } else {
1165 computeUnit->handleSQCReturn(pkt);
1166 }
1167 } else {
1168 delete pkt->senderState;
1169 delete pkt;
1170 }
1171
1172 return true;
1173}
1174
1175void
1177{
1178 fetchStage.processFetchReturn(pkt);
1179}
1180
1181void
1183{
1184 int len = retries.size();
1185
1186 assert(len > 0);
1187
1188 for (int i = 0; i < len; ++i) {
1189 PacketPtr pkt = retries.front().first;
1190 [[maybe_unused]] Wavefront *wavefront = retries.front().second;
1191 DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
1192 computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
1193 pkt->req->getPaddr());
1194 if (!sendTimingReq(pkt)) {
1195 DPRINTF(GPUFetch, "failed again!\n");
1196 break;
1197 } else {
1198 DPRINTF(GPUFetch, "successful!\n");
1199 retries.pop_front();
1200 }
1201 }
1202}
1203
1204const char*
1206{
1207 return "ComputeUnit SQC memory request event";
1208}
1209
1210void
1212{
1213 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1214 [[maybe_unused]] ComputeUnit *compute_unit = sqcPort.computeUnit;
1215
1216 assert(!pkt->req->systemReq());
1217
1218 if (!(sqcPort.sendTimingReq(pkt))) {
1220 (pkt, sender_state->wavefront));
1221 }
1222}
1223
1224void
1226{
1227 // There must be a way around this check to do the globalMemStart...
1228 Addr tmp_vaddr = pkt->req->getVaddr();
1229
1230 updatePageDivergenceDist(tmp_vaddr);
1231
1232 // set PC in request
1233 pkt->req->setPC(gpuDynInst->wavefront()->pc());
1234
1235 pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());
1236
1237 // figure out the type of the request to set read/write
1238 BaseMMU::Mode TLB_mode;
1239 assert(pkt->isRead() || pkt->isWrite());
1240
1241 // only do some things if actually accessing data
1242 bool isDataAccess = pkt->isWrite() || pkt->isRead();
1243
1244 // For dGPUs, real hardware will extract MTYPE from the PTE. SE mode
1245 // uses x86 pagetables which don't have fields to track GPU MTYPEs.
1246 // Rather than hacking up the pagetable to add these bits in, we just
1247 // keep a structure local to our GPUs that is populated in our
1248 // emulated driver whenever memory is allocated. Consult that structure
1249 // here in case we need a memtype override.
1250 //
1251 // In full system mode these can be extracted from the PTE and assigned
1252 // after address translation takes place.
1253 if (!FullSystem) {
1254 shader->gpuCmdProc.driver()->setMtype(pkt->req);
1255 }
1256
1257 // Check write before read for atomic operations
1258 // since atomic operations should use BaseMMU::Write
1259 if (pkt->isWrite()) {
1260 TLB_mode = BaseMMU::Write;
1261 } else if (pkt->isRead()) {
1262 TLB_mode = BaseMMU::Read;
1263 } else {
1264 fatal("pkt is not a read nor a write\n");
1265 }
1266
1267 if (!functionalTLB) {
1268 stats.tlbCycles -= curTick();
1269 }
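// The subtract-now/add-later idiom above accumulates elapsed translation
// time: if a timing translation is issued at tick 1000 and the response
// arrives at tick 1600 (the matching "tlbCycles += curTick()" is in
// DTLBPort::recvTimingResp), the stat gains -1000 + 1600 = 600 ticks for
// this request. The tick values here are illustrative.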
1270 ++stats.tlbRequests;
1271
1272 PortID tlbPort_index = perLaneTLB ? index : 0;
1273
1274 if (shader->timingSim) {
1275 if (!FullSystem && debugSegFault) {
1276 Process *p = shader->gpuTc->getProcessPtr();
1277 Addr vaddr = pkt->req->getVaddr();
1278 unsigned size = pkt->getSize();
1279
1280 if ((vaddr + size - 1) % 64 < vaddr % 64) {
1281 panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
1282 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
1283 }
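// Worked example of the boundary check above (addresses are
// illustrative): for vaddr = 0x7c and size = 8,
// (0x7c + 8 - 1) % 64 == 3 while 0x7c % 64 == 60, so the access wraps
// past a 64-byte boundary and the panic fires; for vaddr = 0x40 and
// size = 8 the values are 7 and 0 and the access is accepted.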
1284
1285 Addr paddr;
1286
1287 if (!p->pTable->translate(vaddr, paddr)) {
1288 if (!p->fixupFault(vaddr)) {
1289 panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
1290 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1291 vaddr);
1292 }
1293 }
1294 }
1295
1296 // This is the SenderState needed upon return
1297 pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);
1298
1299 // This is the senderState needed by the TLB hierarchy to function
1300 GpuTranslationState *translation_state =
1301 new GpuTranslationState(TLB_mode, shader->gpuTc, false,
1302 pkt->senderState);
1303
1304 pkt->senderState = translation_state;
1305
1306 if (functionalTLB) {
1307 tlbPort[tlbPort_index].sendFunctional(pkt);
1308
1309 // update the hitLevel distribution
1310 int hit_level = translation_state->hitLevel;
1311 assert(hit_level != -1);
1312 stats.hitsPerTLBLevel[hit_level]++;
1313
1314 // New SenderState for the memory access
1315 GpuTranslationState *sender_state =
1317
1318 delete sender_state->tlbEntry;
1319 delete sender_state->saved;
1320 delete sender_state;
1321
1322 assert(pkt->req->hasPaddr());
1323 assert(pkt->req->hasSize());
1324
1325 // this is necessary because the GPU TLB receives packets instead
1326 // of requests. When the translation is complete, all relevant
1327 // fields in the request will be populated, but not in the packet.
1328 // here we create the new packet so we can set the size, addr,
1329 // and proper flags.
1330 PacketPtr oldPkt = pkt;
1331 pkt = new Packet(oldPkt->req, oldPkt->cmd);
1332 if (isDataAccess) {
1333 uint8_t *tmpData = oldPkt->getPtr<uint8_t>();
1334 pkt->dataStatic(tmpData);
1335 }
1336 delete oldPkt;
1337
1338
1339 // New SenderState for the memory access
1340 pkt->senderState =
1342 nullptr);
1343
1344 gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
1345 gpuDynInst->tlbHitLevel[index] = hit_level;
1346
1347 // translation is done. Schedule the mem_req_event at the
1348 // appropriate cycle to send the timing memory request to ruby
1349 EventFunctionWrapper *mem_req_event =
1350 memPort[index].createMemReqEvent(pkt);
1351
1352 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
1353 "scheduled\n", cu_id, gpuDynInst->simdId,
1354 gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
1355
1356 schedule(mem_req_event, curTick() + req_tick_latency);
1357 } else if (tlbPort[tlbPort_index].isStalled()) {
1358 assert(tlbPort[tlbPort_index].retries.size() > 0);
1359
1360 DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
1361 "failed!\n", cu_id, gpuDynInst->simdId,
1362 gpuDynInst->wfSlotId, tmp_vaddr);
1363
1364 tlbPort[tlbPort_index].retries.push_back(pkt);
1365 } else if (!tlbPort[tlbPort_index].sendTimingReq(pkt)) {
1366 // Stall the data port;
1367 // no more packets will be issued until
1368 // Ruby indicates resources are freed by
1369 // a recvReqRetry() callback on this port.
1370 tlbPort[tlbPort_index].stallPort();
1371
1372 DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
1373 "failed!\n", cu_id, gpuDynInst->simdId,
1374 gpuDynInst->wfSlotId, tmp_vaddr);
1375
1376 tlbPort[tlbPort_index].retries.push_back(pkt);
1377 } else {
1378 DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x from "
1379 "instruction %s sent!\n", cu_id, gpuDynInst->simdId,
1380 gpuDynInst->wfSlotId, tmp_vaddr,
1381 gpuDynInst->disassemble().c_str());
1382 }
1383 } else {
1384 if (pkt->cmd == MemCmd::MemSyncReq) {
1385 gpuDynInst->resetEntireStatusVector();
1386 } else {
1387 gpuDynInst->decrementStatusVector(index);
1388 }
1389
1390 // New SenderState for the memory access
1391 delete pkt->senderState;
1392
1393 // Because it's an atomic operation, we only need TLB translation state
1394 pkt->senderState = new GpuTranslationState(TLB_mode,
1395 shader->gpuTc);
1396
1397 tlbPort[tlbPort_index].sendFunctional(pkt);
1398
1399 // the addr of the packet is not modified, so we need to create a new
1400 // packet; otherwise the memory access would use the old virtual
1401 // address sent in the translation packet, instead of the physical
1402 // address returned by the translation.
1403 PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
1404 new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1405
1406 // Translation is done. It is safe to send the packet to memory.
1407 memPort[0].sendFunctional(new_pkt);
1408
1409 DPRINTF(GPUMem, "Functional sendRequest\n");
1410 DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
1411 gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
1412 new_pkt->req->getPaddr());
1413
1414 // safe_cast the senderState
1415 GpuTranslationState *sender_state =
1417
1418 delete sender_state->tlbEntry;
1419 delete new_pkt;
1420 delete pkt->senderState;
1421 delete pkt;
1422 }
1423}
1424
1425void
1427{
1428 assert(pkt->isWrite() || pkt->isRead());
1429
1430 BaseMMU::Mode tlb_mode = pkt->isRead() ? BaseMMU::Read : BaseMMU::Write;
1431
1432 pkt->senderState =
1434
1435 pkt->senderState =
1436 new GpuTranslationState(tlb_mode, shader->gpuTc, false,
1437 pkt->senderState);
1438
1439 if (scalarDTLBPort.isStalled()) {
1440 assert(scalarDTLBPort.retries.size());
1441 scalarDTLBPort.retries.push_back(pkt);
1442 } else if (!scalarDTLBPort.sendTimingReq(pkt)) {
1443 scalarDTLBPort.stallPort();
1444 scalarDTLBPort.retries.push_back(pkt);
1445 } else {
1446 DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n",
1447 tlb_mode == BaseMMU::Read ? "read" : "write",
1448 pkt->req->getVaddr());
1449 }
1450}
1451
1452void
1454 bool kernelMemSync,
1455 RequestPtr req)
1456{
1457 assert(gpuDynInst->isGlobalSeg() ||
1458 gpuDynInst->executedAs() == enums::SC_GLOBAL);
1459
1460 // Fences will never be issued to system memory, so we can mark the
1461 // requestor as a device memory ID here.
1462 if (!req) {
1463 req = std::make_shared<Request>(
1464 0, 0, 0, vramRequestorId(), 0, gpuDynInst->wfDynId);
1465 } else {
1466 req->requestorId(vramRequestorId());
1467 }
1468
1469 // all mem sync requests have Paddr == 0
1470 req->setPaddr(0);
1471
1472 PacketPtr pkt = nullptr;
1473
1474 if (kernelMemSync) {
1475 if (gpuDynInst->isKernelLaunch()) {
1476 req->setCacheCoherenceFlags(Request::INV_L1);
1477 req->setReqInstSeqNum(gpuDynInst->seqNum());
1478 req->setFlags(Request::KERNEL);
1479 pkt = new Packet(req, MemCmd::MemSyncReq);
1480 pkt->pushSenderState(
1481 new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1482
1483 EventFunctionWrapper *mem_req_event =
1484 memPort[0].createMemReqEvent(pkt);
1485
1486 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1487 "an acquire\n", cu_id, gpuDynInst->simdId,
1488 gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
1489
1490 schedule(mem_req_event, curTick() + req_tick_latency);
1491 } else {
1492 // kernel end flush of GL2 cache may be quiesced by Ruby if the
1493 // GL2 is a read-only cache
1494 assert(shader->impl_kern_end_rel);
1495 assert(gpuDynInst->isEndOfKernel());
1496
1497 req->setCacheCoherenceFlags(Request::FLUSH_L2);
1498 req->setReqInstSeqNum(gpuDynInst->seqNum());
1499 req->setFlags(Request::KERNEL);
1500 pkt = new Packet(req, MemCmd::MemSyncReq);
1501 pkt->pushSenderState(
1502 new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1503
1504 EventFunctionWrapper *mem_req_event =
1505 memPort[0].createMemReqEvent(pkt);
1506
1507 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1508 "a release\n", cu_id, gpuDynInst->simdId,
1509 gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
1510
1511 schedule(mem_req_event, curTick() + req_tick_latency);
1512 }
1513 } else {
1514 gpuDynInst->setRequestFlags(req);
1515
1516 req->setReqInstSeqNum(gpuDynInst->seqNum());
1517
1518 pkt = new Packet(req, MemCmd::MemSyncReq);
1519 pkt->pushSenderState(
1520 new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
1521
1522 EventFunctionWrapper *mem_req_event =
1523 memPort[0].createMemReqEvent(pkt);
1524
1525 DPRINTF(GPUPort,
1526 "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
1527 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
1528 pkt->req->getPaddr());
1529
1530 schedule(mem_req_event, curTick() + req_tick_latency);
1531 }
1532}
1533
1534void
1536{
1537 auto req = std::make_shared<Request>(paddr, 64, 0, vramRequestorId());
1538 req->setCacheCoherenceFlags(Request::GL2_CACHE_INV);
1539
1540 auto pkt = new Packet(req, MemCmd::MemSyncReq);
1541 pkt->pushSenderState(
1542 new ComputeUnit::DataPort::SenderState(this, 0, nullptr));
1543
1544 EventFunctionWrapper *mem_req_event = memPort[0].createMemReqEvent(pkt);
1545
1546 schedule(mem_req_event, curTick() + req_tick_latency);
1547
1548 shader->incNumOutstandingInvL2s();
1549}
1550
1551void
1553{
1554 DataPort::SenderState *sender_state =
1556
1557 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1558 ComputeUnit *compute_unit = computeUnit;
1559
1560 assert(gpuDynInst);
1561
1562 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
1563 compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1564 pkt->req->getPaddr(), id);
1565
1566 Addr paddr = pkt->req->getPaddr();
1567
1568 // mem sync resp callback must be handled already in
1569 // DataPort::recvTimingResp
1570 assert(pkt->cmd != MemCmd::MemSyncResp);
1571
1572 // The status vector and global memory response for WriteResp packets get
1573 // handled by the WriteCompleteResp packets.
1574 if (pkt->cmd == MemCmd::WriteResp) {
1575 if (!FullSystem || !pkt->req->systemReq()) {
1576 delete pkt;
1577 return;
1578 }
1579 }
1580
1581 // this is for read, write and atomic
1582 int index = gpuDynInst->memStatusVector[paddr].back();
1583
1584 DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
1585 pkt->req->getPaddr(), id);
1586
1587 gpuDynInst->memStatusVector[paddr].pop_back();
1588 gpuDynInst->pAddr = pkt->req->getPaddr();
1589
1590 gpuDynInst->decrementStatusVector(index);
1591 DPRINTF(GPUMem, "bitvector is now %s\n", gpuDynInst->printStatusVector());
1592
1593 if (gpuDynInst->allLanesZero()) {
1594 auto iter = gpuDynInst->memStatusVector.begin();
1595 auto end = gpuDynInst->memStatusVector.end();
1596
1597 while (iter != end) {
1598 assert(iter->second.empty());
1599 ++iter;
1600 }
1601
1602 // Calculate the difference between the arrival times of the first
1603 // and last cache blocks, provided the arrival time of the first
1604 // block was recorded.
1605 if (compute_unit->headTailMap.count(gpuDynInst)) {
1606 Tick headTick = compute_unit->headTailMap.at(gpuDynInst);
1607 compute_unit->stats.headTailLatency.sample(curTick() - headTick);
1608 compute_unit->headTailMap.erase(gpuDynInst);
1609 }
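// Example of the head-tail bookkeeping: the first read response for a
// gpuDynInst records curTick() in headTailMap (see the else branch
// below); when the final response clears the status vector, the sample
// taken here is last arrival minus first arrival, e.g. responses at
// ticks 2000 and 2450 record a head-tail latency of 450 ticks (values
// are illustrative).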
1610
1611 gpuDynInst->memStatusVector.clear();
1612
1613 gpuDynInst->
1614 profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue);
1615 compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
1616
1617 DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
1618 compute_unit->cu_id, gpuDynInst->simdId,
1619 gpuDynInst->wfSlotId);
1620 } else {
1621 if (pkt->isRead()) {
1622 if (!compute_unit->headTailMap.count(gpuDynInst)) {
1623 compute_unit->headTailMap
1624 .insert(std::make_pair(gpuDynInst, curTick()));
1625 }
1626 }
1627 }
1628
1629 delete pkt->senderState;
1630 delete pkt;
1631}
1632
1633bool
1635{
1636 Addr line = pkt->req->getPaddr();
1637
1638 DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
1639 pkt->req->getVaddr(), line);
1640
1641 assert(pkt->senderState);
1642 computeUnit->stats.tlbCycles += curTick();
1643
1644 // pop off the TLB translation state
1645 GpuTranslationState *translation_state =
1647
1648 // no PageFaults are permitted for data accesses
1649 if (!translation_state->tlbEntry) {
1650 DTLBPort::SenderState *sender_state =
1651 safe_cast<DTLBPort::SenderState*>(translation_state->saved);
1652
1653 [[maybe_unused]] Wavefront *w =
1654 computeUnit->wfList[sender_state->_gpuDynInst->simdId]
1655 [sender_state->_gpuDynInst->wfSlotId];
1656
1657 DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
1658 pkt->req->getVaddr());
1659 }
1660
1661 // update the hitLevel distribution
1662 int hit_level = translation_state->hitLevel;
1663 computeUnit->stats.hitsPerTLBLevel[hit_level]++;
1664
1665 delete translation_state->tlbEntry;
1666 assert(!translation_state->ports.size());
1667 pkt->senderState = translation_state->saved;
1668
1669 // for prefetch pkt
1670 BaseMMU::Mode TLB_mode = translation_state->tlbMode;
1671
1672 delete translation_state;
1673
1674 // use the original sender state to know how to close this transaction
1675 DTLBPort::SenderState *sender_state =
1677
1678 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1679 PortID mp_index = sender_state->portIndex;
1680 Addr vaddr = pkt->req->getVaddr();
1681 gpuDynInst->memStatusVector[line].push_back(mp_index);
1682 gpuDynInst->tlbHitLevel[mp_index] = hit_level;
1683
1684 DPRINTF(GPUTrace, "CU%d WF[%d][%d]: Translated %#lx -> %#lx for "
1685 "instruction %s (seqNum: %ld)\n", computeUnit->cu_id,
1686 gpuDynInst->simdId, gpuDynInst->wfSlotId, pkt->req->getVaddr(),
1687 line, gpuDynInst->disassemble().c_str(), gpuDynInst->seqNum());
1688
1689 MemCmd requestCmd;
1690
1691 if (pkt->cmd == MemCmd::ReadResp) {
1692 requestCmd = MemCmd::ReadReq;
1693 } else if (pkt->cmd == MemCmd::WriteResp) {
1694 requestCmd = MemCmd::WriteReq;
1695 } else if (pkt->cmd == MemCmd::SwapResp) {
1696 requestCmd = MemCmd::SwapReq;
1697 } else {
1698 panic("unsupported response to request conversion %s\n",
1699 pkt->cmd.toString());
1700 }
1701
1702 if (computeUnit->prefetchDepth) {
1703 int simdId = gpuDynInst->simdId;
1704 int wfSlotId = gpuDynInst->wfSlotId;
1705 Addr last = 0;
1706
1707 switch(computeUnit->prefetchType) {
1708 case enums::PF_CU:
1709 last = computeUnit->lastVaddrCU[mp_index];
1710 break;
1711 case enums::PF_PHASE:
1712 last = computeUnit->lastVaddrSimd[simdId][mp_index];
1713 break;
1714 case enums::PF_WF:
1715 last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
1716 default:
1717 break;
1718 }
1719
1720 DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
1721 computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
1722
1723 int stride = last ? (roundDown(vaddr, X86ISA::PageBytes) -
1725 : 0;
1726
1727 DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
1728
1729 computeUnit->lastVaddrCU[mp_index] = vaddr;
1730 computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
1731 computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
1732
1733 stride = (computeUnit->prefetchType == enums::PF_STRIDE) ?
1734 computeUnit->prefetchStride: stride;
1735
1736 DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
1737 computeUnit->cu_id, simdId, wfSlotId, mp_index);
1738
1739 DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);
1740
1741 // Prefetch the next few pages atomically
1742 for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
1743 DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
1745
1746 if (!stride)
1747 break;
1748
1749 RequestPtr prefetch_req = std::make_shared<Request>(
1751 sizeof(uint8_t), 0,
1752 computeUnit->requestorId(),
1753 0, 0, nullptr);
1754
1755 PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
1756 uint8_t foo = 0;
1757 prefetch_pkt->dataStatic(&foo);
1758
1759 // Because it's an atomic operation, we only need TLB translation state
1760 prefetch_pkt->senderState =
1761 new GpuTranslationState(TLB_mode,
1762 computeUnit->shader->gpuTc, true);
1763
1764 // Currently prefetches are zero-latency, hence the sendFunctional
1765 sendFunctional(prefetch_pkt);
1766
1767 /* safe_cast the senderState */
1768 GpuTranslationState *tlb_state =
1770 prefetch_pkt->senderState);
1771
1772
1773 delete tlb_state->tlbEntry;
1774 delete tlb_state;
1775 delete prefetch_pkt;
1776 }
1777 }
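// Worked example of the stride computation above (addresses are
// illustrative, assuming 4 KiB x86 pages): if the previous access for
// this lane touched page 0x1000 and the current vaddr falls in page
// 0x3000, the computed stride is (0x3000 - 0x1000) / 0x1000 = 2 pages.
// With prefetchDepth = 2 the loop then issues zero-latency functional
// translations two and four pages ahead (pages 0x5000 and 0x7000), as
// the DPRINTF above suggests; with PF_STRIDE the configured
// prefetchStride overrides the computed value.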
1778
1779 // First we must convert the response cmd back to a request cmd so that
1780 // the request can be sent through the cu's request port
1781 PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
1782 new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1783 delete pkt->senderState;
1784 delete pkt;
1785
1786 // New SenderState for the memory access
1787 new_pkt->senderState =
1788 new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
1789 nullptr);
1790
1791 // Set VRAM ID for device requests
1792 // For now, system vmem requests use functional reads. This is not that
1793 // critical to model as the region of interest should always be accessing
1794 // device memory. System vmem requests are used by blit kernels to do
1795 // memcpys and load code objects into device memory.
1796 if (new_pkt->req->systemReq()) {
1797 // There will be multiple packets returned for the same gpuDynInst,
1798 // so first check if systemReq is not already set and if so, return
1799 // the token acquired when the dispatch list is filled as system
1800 // requests do not require a GPU coalescer token.
1801 if (!gpuDynInst->isSystemReq()) {
1802 computeUnit->getTokenManager()->recvTokens(1);
1803 gpuDynInst->setSystemReq();
1804 }
1805 } else {
1806 new_pkt->req->requestorId(computeUnit->vramRequestorId());
1807 }
1808
1809 // translation is done. Schedule the mem_req_event at the appropriate
1810 // cycle to send the timing memory request to ruby
1811 EventFunctionWrapper *mem_req_event =
1812 computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);
1813
1814 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
1815 computeUnit->cu_id, gpuDynInst->simdId,
1816 gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());
1817
1818 computeUnit->schedule(mem_req_event, curTick() +
1819 computeUnit->req_tick_latency);
1820
1821 return true;
1822}
1823
1826{
1827 return new EventFunctionWrapper(
1828 [this, pkt]{ processMemReqEvent(pkt); },
1829 "ComputeUnit memory request event", true);
1830}
1831
1834{
1835 return new EventFunctionWrapper(
1836 [this, pkt]{ processMemRespEvent(pkt); },
1837 "ComputeUnit memory response event", true);
1838}
1839
1840void
1842{
1843 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1844 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1845 [[maybe_unused]] ComputeUnit *compute_unit = computeUnit;
1846
1847 if (pkt->req->systemReq()) {
1848 assert(compute_unit->shader->systemHub);
1849 SystemHubEvent *resp_event = new SystemHubEvent(pkt, this);
1850 compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
1851 } else if (!(sendTimingReq(pkt))) {
1852 retries.emplace_back(pkt, gpuDynInst);
1853
1854 if (gpuDynInst) {
1855 DPRINTF(GPUPort,
1856 "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
1857 compute_unit->cu_id, gpuDynInst->simdId,
1858 gpuDynInst->wfSlotId, id, pkt->req->getPaddr());
1859 }
1860 } else {
1861 if (gpuDynInst) {
1862 DPRINTF(GPUPort,
1863 "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data"
1864 " req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1865 gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id,
1866 pkt->req->getPaddr());
1867 }
1868 }
1869}
1870
1871const char*
1873{
1874 return "ComputeUnit scalar memory request event";
1875}
1876
1877void
1879{
1880 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1881 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1882 [[maybe_unused]] ComputeUnit *compute_unit = scalarDataPort.computeUnit;
1883
1884 if (pkt->req->systemReq()) {
1885 assert(compute_unit->shader->systemHub);
1886 SystemHubEvent *resp_event = new SystemHubEvent(pkt, &scalarDataPort);
1887 compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
1888 } else if (!(scalarDataPort.sendTimingReq(pkt))) {
1889 scalarDataPort.retries.emplace_back(pkt);
1890
1891 DPRINTF(GPUPort,
1892 "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
1893 compute_unit->cu_id, gpuDynInst->simdId,
1894 gpuDynInst->wfSlotId, pkt->req->getPaddr());
1895 } else {
1896 DPRINTF(GPUPort,
1897 "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data "
1898 "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1899 gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
1900 pkt->req->getPaddr());
1901 }
1902}
1903
1904/*
1905 * The initial translation request could have been rejected,
1906 * if <retries> queue is not empty. Retry sending the translation
1907 * request. sendRetry() is called from the peer port whenever
1908 * a translation completes.
1909 */
1910void
1912{
1913 int len = retries.size();
1914
1915 DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
1916 computeUnit->cu_id, len);
1917
1918 assert(len > 0);
1919 assert(isStalled());
1920 // recvReqRetry is an indication that the resource on which this
1921 // port was stalling is freed. So, remove the stall first
1922 unstallPort();
1923
1924 for (int i = 0; i < len; ++i) {
1925 PacketPtr pkt = retries.front();
1926 [[maybe_unused]] Addr vaddr = pkt->req->getVaddr();
1927 DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x", computeUnit->cu_id, vaddr);
1928
1929 if (!sendTimingReq(pkt)) {
1930 // Stall port
1931 stallPort();
1932 DPRINTF(GPUTLB, ": failed again\n");
1933 break;
1934 } else {
1935 DPRINTF(GPUTLB, ": successful\n");
1936 retries.pop_front();
1937 }
1938 }
1939}
1940
1941bool
1942ComputeUnit::ScalarDTLBPort::recvTimingResp(PacketPtr pkt)
1943{
1944 assert(pkt->senderState);
1945
1946 GpuTranslationState *translation_state =
1947 safe_cast<GpuTranslationState*>(pkt->senderState);
1948
1949 // Page faults are not allowed
1950 fatal_if(!translation_state->tlbEntry,
1951 "Translation of vaddr %#x failed\n", pkt->req->getVaddr());
1952
1953 delete translation_state->tlbEntry;
1954 assert(!translation_state->ports.size());
1955
1956 pkt->senderState = translation_state->saved;
1957 delete translation_state;
1958
1959 ScalarDTLBPort::SenderState *sender_state =
1960 safe_cast<ScalarDTLBPort::SenderState*>(pkt->senderState);
1961
1962 GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1963 delete pkt->senderState;
1964
1965 [[maybe_unused]] Wavefront *w = gpuDynInst->wavefront();
1966
1967 DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
1968 "translation: PA %#x -> %#x\n", computeUnit->cu_id, w->simdId,
1969 w->wfSlotId, w->kernId, pkt->req->getVaddr(), pkt->req->getPaddr());
1970
1971 MemCmd mem_cmd;
1972
1973 if (pkt->cmd == MemCmd::ReadResp) {
1974 mem_cmd = MemCmd::ReadReq;
1975 } else if (pkt->cmd == MemCmd::WriteResp) {
1976 mem_cmd = MemCmd::WriteReq;
1977 } else {
1978 fatal("Scalar DTLB receieved unexpected MemCmd response %s\n",
1979 pkt->cmd.toString());
1980 }
1981
1982 PacketPtr req_pkt = new Packet(pkt->req, mem_cmd);
1983 req_pkt->dataStatic(pkt->getPtr<uint8_t>());
1984 delete pkt;
1985
1986 req_pkt->senderState =
1987 new ComputeUnit::ScalarDataPort::SenderState(gpuDynInst);
1988
1989 // For a system request we want to mark the GPU instruction as a system
1990 // load/store so that after the request is issued to system memory we can
1991 // return any token acquired for the request. Since tokens are returned
1992 // by the coalescer and system requests do not take that path, this needs
1993 // to be tracked.
1994 //
1995 // Device requests change the requestor ID to something in the device
1996 // memory Ruby network.
1997 if (req_pkt->req->systemReq()) {
1998 gpuDynInst->setSystemReq();
1999 } else {
2000 req_pkt->req->requestorId(computeUnit->vramRequestorId());
2001 }
2002
2003 ComputeUnit::ScalarDataPort::MemReqEvent *scalar_mem_req_event
2004 = new ComputeUnit::ScalarDataPort::MemReqEvent
2005 (computeUnit->scalarDataPort, req_pkt);
2006 computeUnit->schedule(scalar_mem_req_event, curTick() +
2007 computeUnit->scalar_req_tick_latency);
2008
2009 return true;
2010}
2011
2012bool
2013ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
2014{
2015 [[maybe_unused]] Addr line = pkt->req->getPaddr();
2016 DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
2017 computeUnit->cu_id, pkt->req->getVaddr(), line);
2018
2019 assert(pkt->senderState);
2020
2021 // pop off the TLB translation state
2022 GpuTranslationState *translation_state
2023 = safe_cast<GpuTranslationState*>(pkt->senderState);
2024
2025 bool success = translation_state->tlbEntry != nullptr;
2026 delete translation_state->tlbEntry;
2027 assert(!translation_state->ports.size());
2028 pkt->senderState = translation_state->saved;
2029 delete translation_state;
2030
2031 // use the original sender state to know how to close this transaction
2032 ITLBPort::SenderState *sender_state =
2033 safe_cast<ITLBPort::SenderState*>(pkt->senderState);
2034
2035 // get the wavefront associated with this translation request
2036 Wavefront *wavefront = sender_state->wavefront;
2037 delete pkt->senderState;
2038
2039 if (success) {
2040 // pkt is reused in fetch(), don't delete it here. However, we must
2041 // reset the command to be a request so that it can be sent through
2042 // the cu's request port
2043 assert(pkt->cmd == MemCmd::ReadResp);
2044 pkt->cmd = MemCmd::ReadReq;
2045
2046 computeUnit->fetchStage.fetch(pkt, wavefront);
2047 } else {
2048 if (wavefront->dropFetch) {
2049 assert(wavefront->instructionBuffer.empty());
2050 wavefront->dropFetch = false;
2051 }
2052
2053 wavefront->pendingFetch = 0;
2054 }
2055
2056 return true;
2057}
2058
2059/*
2060 * The initial translation request could have been rejected, if
2061 * <retries> queue is not empty. Retry sending the translation
2062 * request. sendRetry() is called from the peer port whenever
2063 * a translation completes.
2064 */
2065void
2066ComputeUnit::ITLBPort::recvReqRetry()
2067{
2068
2069 int len = retries.size();
2070 DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n", computeUnit->cu_id, len);
2071
2072 assert(len > 0);
2073 assert(isStalled());
2074
2075 // recvReqRetry is an indication that the resource on which this
2076 // port was stalling on is freed. So, remove the stall first
2077 unstallPort();
2078
2079 for (int i = 0; i < len; ++i) {
2080 PacketPtr pkt = retries.front();
2081 [[maybe_unused]] Addr vaddr = pkt->req->getVaddr();
2082 DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x", computeUnit->cu_id, vaddr);
2083
2084 if (!sendTimingReq(pkt)) {
2085 stallPort(); // Stall port
2086 DPRINTF(GPUTLB, ": failed again\n");
2087 break;
2088 } else {
2089 DPRINTF(GPUTLB, ": successful\n");
2090 retries.pop_front();
2091 }
2092 }
2093}
2094
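// Classify the committed instruction and update the CU's statistics:
// scalar vs. vector ALU and memory counters, FLAT/LDS breakdowns, and
// per-segment read/write counts based on how the access was executed.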
2095void
2096ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
2097{
2098 if (gpuDynInst->isScalar()) {
2099 if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
2100 stats.sALUInsts++;
2101 stats.instCyclesSALU++;
2102 } else if (gpuDynInst->isLoad()) {
2103 stats.scalarMemReads++;
2104 } else if (gpuDynInst->isStore()) {
2105 stats.scalarMemWrites++;
2106 }
2107 } else {
2108 if (gpuDynInst->isALU()) {
2109 shader->total_valu_insts++;
2110 if (shader->total_valu_insts == shader->max_valu_insts) {
2111 exitSimLoop("max vALU insts");
2112 }
2113 stats.vALUInsts++;
2114 stats.instCyclesVALU++;
2115 stats.threadCyclesVALU
2116 += gpuDynInst->wavefront()->execMask().count();
2117 } else if (gpuDynInst->isFlat()) {
2118 if (gpuDynInst->isLocalMem()) {
2119 stats.flatLDSInsts++;
2120 } else {
2121 stats.flatVMemInsts++;
2122 }
2123 } else if (gpuDynInst->isFlatGlobal()) {
2124 stats.flatVMemInsts++;
2125 } else if (gpuDynInst->isFlatScratch()) {
2126 stats.flatVMemInsts++;
2127 } else if (gpuDynInst->isLocalMem()) {
2128 stats.ldsNoFlatInsts++;
2129 } else if (gpuDynInst->isLoad()) {
2130 stats.vectorMemReads++;
2131 } else if (gpuDynInst->isStore()) {
2132 stats.vectorMemWrites++;
2133 }
2134
2135 if (gpuDynInst->isLoad()) {
2136 switch (gpuDynInst->executedAs()) {
2137 case enums::SC_SPILL:
2138 stats.spillReads++;
2139 break;
2140 case enums::SC_GLOBAL:
2141 stats.globalReads++;
2142 break;
2143 case enums::SC_GROUP:
2144 stats.groupReads++;
2145 break;
2146 case enums::SC_PRIVATE:
2147 stats.privReads++;
2148 break;
2149 case enums::SC_READONLY:
2150 stats.readonlyReads++;
2151 break;
2152 case enums::SC_KERNARG:
2153 stats.kernargReads++;
2154 break;
2155 case enums::SC_ARG:
2156 stats.argReads++;
2157 break;
2158 case enums::SC_NONE:
2159 /**
2160 * this case can occur for flat mem insts
2161 * who execute with EXEC = 0
2162 */
2163 break;
2164 default:
2165 fatal("%s has no valid segment\n", gpuDynInst->disassemble());
2166 break;
2167 }
2168 } else if (gpuDynInst->isStore()) {
2169 switch (gpuDynInst->executedAs()) {
2170 case enums::SC_SPILL:
2171 stats.spillWrites++;
2172 break;
2173 case enums::SC_GLOBAL:
2174 stats.globalWrites++;
2175 break;
2176 case enums::SC_GROUP:
2177 stats.groupWrites++;
2178 break;
2179 case enums::SC_PRIVATE:
2180 stats.privWrites++;
2181 break;
2182 case enums::SC_READONLY:
2183 stats.readonlyWrites++;
2184 break;
2185 case enums::SC_KERNARG:
2186 stats.kernargWrites++;
2187 break;
2188 case enums::SC_ARG:
2189 stats.argWrites++;
2190 break;
2191 case enums::SC_NONE:
2192 /**
2193 * this case can occur for flat mem insts
2194 * who execute with EXEC = 0
2195 */
2196 break;
2197 default:
2198 fatal("%s has no valid segment\n", gpuDynInst->disassemble());
2199 break;
2200 }
2201 }
2202 }
2203}
2204
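// Record a touch of the virtual page containing 'addr' so that page
// divergence can be computed per wavefront memory instruction.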
2205void
2206ComputeUnit::updatePageDivergenceDist(Addr addr)
2207{
2208 Addr virt_page_addr = roundDown(addr, X86ISA::PageBytes);
2209
2210 if (!pagesTouched.count(virt_page_addr))
2211 pagesTouched[virt_page_addr] = 1;
2212 else
2213 pagesTouched[virt_page_addr]++;
2214}
2215
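// At simulation exit, optionally dump the per-page access counts gathered
// in pageAccesses to a per-CU output file (enabled by countPages).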
2216void
2217ComputeUnit::exitCallback()
2218{
2219 if (countPages) {
2220 std::ostream *page_stat_file = simout.create(name().c_str())->stream();
2221
2222 *page_stat_file << "page, wavefront accesses, workitem accesses" <<
2223 std::endl;
2224
2225 for (auto iter : pageAccesses) {
2226 *page_stat_file << std::hex << iter.first << ",";
2227 *page_stat_file << std::dec << iter.second.first << ",";
2228 *page_stat_file << std::dec << iter.second.second << std::endl;
2229 }
2230 }
2231}
2232
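// A CU is considered done only when every vector ALU is idle and all
// memory pipelines and their register-file buses have drained.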
2233bool
2234ComputeUnit::isDone() const
2235{
2236 for (int i = 0; i < numVectorALUs; ++i) {
2237 if (!isVectorAluIdle(i)) {
2238 return false;
2239 }
2240 }
2241
2242 // TODO: FIXME if more than 1 of any memory pipe supported
2243 if (!srfToScalarMemPipeBus.rdy()) {
2244 return false;
2245 }
2246 if (!vrfToGlobalMemPipeBus.rdy()) {
2247 return false;
2248 }
2249 if (!vrfToLocalMemPipeBus.rdy()) {
2250 return false;
2251 }
2252
2253 if (!globalMemoryPipe.isGMReqFIFOWrRdy()
2254 || !localMemoryPipe.isLMReqFIFOWrRdy()
2255 || !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy() ||
2256 !glbMemToVrfBus.rdy() || !scalarMemToSrfBus.rdy()) {
2257 return false;
2258 }
2259
2260 return true;
2261}
2262
2263int32_t
2264ComputeUnit::getRefCounter(const uint32_t dispatchId,
2265 const uint32_t wgId) const
2266{
2267 return lds.getRefCounter(dispatchId, wgId);
2268}
2269
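// A SIMD unit is idle when every wavefront slot assigned to it is in the
// S_STOPPED state.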
2270bool
2271ComputeUnit::isVectorAluIdle(uint32_t simdId) const
2272{
2273 assert(simdId < numVectorALUs);
2274
2275 for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){
2276 if (wfList[simdId][i_wf]->getStatus() != Wavefront::S_STOPPED) {
2277 return false;
2278 }
2279 }
2280
2281 return true;
2282}
2283
2284/**
2285 * send a general request to the LDS
2286 * make sure to look at the return value here as your request might be
2287 * NACK'd and returning false means that you have to have some backup plan
2288 */
2289bool
2290ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
2291{
2292 // this is just a request to carry the GPUDynInstPtr
2293 // back and forth
2294 RequestPtr newRequest = std::make_shared<Request>();
2295 newRequest->setPaddr(0x0);
2296
2297 // ReadReq is not evaluted by the LDS but the Packet ctor requires this
2298 PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq);
2299
2300 // This is the SenderState needed upon return
2301 newPacket->senderState = new LDSPort::SenderState(gpuDynInst);
2302
2303 return ldsPort.sendTimingReq(newPacket);
2304}
2305
2306/**
2307 * Forward the VRAM requestor ID needed for device memory from shader.
2308 */
2309RequestorID
2310ComputeUnit::vramRequestorId()
2311{
2312 return FullSystem ? shader->vramRequestorId() : requestorId();
2313}
2314
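// Debug aid: print the progress of every non-stopped wavefront and of the
// global, scalar, and local memory pipelines.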
2315void
2316ComputeUnit::printProgress()
2317{
2318 for (int j = 0; j < numVectorALUs; ++j) {
2319 for (int i = 0; i < shader->n_wf; ++i) {
2320 if (wfList[j][i]->getStatus() == Wavefront::status_e::S_STOPPED) {
2321 continue;
2322 }
2323
2324 std::cout << curTick() << ": ";
2325 std::cout << "CU" << cu_id << " WF[" << j << "][" << i << "] ";
2326 wfList[j][i]->printProgress();
2327 }
2328 }
2329 globalMemoryPipe.printProgress();
2330 scalarMemoryPipe.printProgress();
2331 localMemoryPipe.printProgress();
2332 std::cout << std::endl;
2333}
2334
2335/**
2336 * get the result of packets sent to the LDS when they return
2337 */
2338bool
2339ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
2340{
2341 const ComputeUnit::LDSPort::SenderState *senderState =
2342 dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState);
2343
2344 fatal_if(!senderState, "did not get the right sort of sender state");
2345
2346 GPUDynInstPtr gpuDynInst = senderState->getMemInst();
2347
2348 delete packet->senderState;
2349 delete packet;
2350
2351 computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
2352 return true;
2353}
2354
2355/**
2356 * attempt to send this packet, either the port is already stalled, the
2357 * request is nack'd and must stall or the request goes through
2358 * when a request cannot be sent, add it to the retries queue
2359 */
2360bool
2361ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
2362{
2363 ComputeUnit::LDSPort::SenderState *sender_state =
2364 dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
2365 fatal_if(!sender_state, "packet without a valid sender state");
2366
2367 [[maybe_unused]] GPUDynInstPtr gpuDynInst = sender_state->getMemInst();
2368
2369 if (isStalled()) {
2370 fatal_if(retries.empty(), "must have retries waiting to be stalled");
2371
2372 retries.push(pkt);
2373
2374 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
2375 computeUnit->cu_id, gpuDynInst->simdId,
2376 gpuDynInst->wfSlotId);
2377 return false;
2378 } else if (!RequestPort::sendTimingReq(pkt)) {
2379 // need to stall the LDS port until a recvReqRetry() is received
2380 // this indicates that there is more space
2381 stallPort();
2382 retries.push(pkt);
2383
2384 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
2385 computeUnit->cu_id, gpuDynInst->simdId,
2386 gpuDynInst->wfSlotId, pkt->req->getPaddr());
2387 return false;
2388 } else {
2389 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
2390 computeUnit->cu_id, gpuDynInst->simdId,
2391 gpuDynInst->wfSlotId, pkt->req->getPaddr());
2392 return true;
2393 }
2394}
2395
2396/**
2397 * the bus is telling the port that there is now space
2398 * so retrying stalled requests should work now
2399 * this allows the port to have a request be nack'd and then have the
2400 * receiver say when there is space, rather than simply retrying the send
2401 */
2402void
2403ComputeUnit::LDSPort::recvReqRetry()
2404{
2405 auto queueSize = retries.size();
2406
2407 DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
2408 computeUnit->cu_id, queueSize);
2409
2410 fatal_if(queueSize < 1,
2411 "why was there a recvReqRetry() with no pending reqs?");
2412 fatal_if(!isStalled(),
2413 "recvReqRetry() happened when the port was not stalled");
2414
2415 unstallPort();
2416
2417 while (!retries.empty()) {
2418 PacketPtr packet = retries.front();
2419
2420 DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);
2421
2422 if (!RequestPort::sendTimingReq(packet)) {
2423 // Stall port
2424 stallPort();
2425 DPRINTF(GPUPort, ": LDS send failed again\n");
2426 break;
2427 } else {
2428 DPRINTF(GPUPort, ": LDS send successful\n");
2429 retries.pop();
2430 }
2431 }
2432}
2433
2434ComputeUnit::ComputeUnitStats::ComputeUnitStats(statistics::Group *parent,
2435 int n_wf)
2436 : statistics::Group(parent),
2437 ADD_STAT(vALUInsts, "Number of vector ALU insts issued."),
2438 ADD_STAT(vALUInstsPerWF, "The avg. number of vector ALU insts issued "
2439 "per-wavefront."),
2440 ADD_STAT(sALUInsts, "Number of scalar ALU insts issued."),
2441 ADD_STAT(sALUInstsPerWF, "The avg. number of scalar ALU insts issued "
2442 "per-wavefront."),
2444 "Number of cycles needed to execute VALU insts."),
2446 "Number of cycles needed to execute SALU insts."),
2447 ADD_STAT(threadCyclesVALU, "Number of thread cycles used to execute "
2448 "vector ALU ops. Similar to instCyclesVALU but multiplied by "
2449 "the number of active threads."),
2451 "Percentage of active vector ALU threads in a wave."),
2452 ADD_STAT(ldsNoFlatInsts, "Number of LDS insts issued, not including FLAT"
2453 " accesses that resolve to LDS."),
2454 ADD_STAT(ldsNoFlatInstsPerWF, "The avg. number of LDS insts (not "
2455 "including FLAT accesses that resolve to LDS) per-wavefront."),
2457 "The number of FLAT insts that resolve to vmem issued."),
2458 ADD_STAT(flatVMemInstsPerWF, "The average number of FLAT insts that "
2459 "resolve to vmem issued per-wavefront."),
2461 "The number of FLAT insts that resolve to LDS issued."),
2462 ADD_STAT(flatLDSInstsPerWF, "The average number of FLAT insts that "
2463 "resolve to LDS issued per-wavefront."),
2465 "Number of vector mem write insts (excluding FLAT insts)."),
2466 ADD_STAT(vectorMemWritesPerWF, "The average number of vector mem write "
2467 "insts (excluding FLAT insts) per-wavefront."),
2469 "Number of vector mem read insts (excluding FLAT insts)."),
2470 ADD_STAT(vectorMemReadsPerWF, "The avg. number of vector mem read insts "
2471 "(excluding FLAT insts) per-wavefront."),
2472 ADD_STAT(scalarMemWrites, "Number of scalar mem write insts."),
2474 "The average number of scalar mem write insts per-wavefront."),
2475 ADD_STAT(scalarMemReads, "Number of scalar mem read insts."),
2477 "The average number of scalar mem read insts per-wavefront."),
2479 "Number of vector mem reads per kilo-instruction"),
2481 "Number of vector mem writes per kilo-instruction"),
2483 "Number of vector mem insts per kilo-instruction"),
2485 "Number of scalar mem reads per kilo-instruction"),
2487 "Number of scalar mem writes per kilo-instruction"),
2489 "Number of scalar mem insts per kilo-instruction"),
2490 ADD_STAT(instCyclesVMemPerSimd, "Number of cycles to send address, "
2491 "command, data from VRF to vector memory unit, per SIMD"),
2492 ADD_STAT(instCyclesScMemPerSimd, "Number of cycles to send address, "
2493 "command, data from SRF to scalar memory unit, per SIMD"),
2494 ADD_STAT(instCyclesLdsPerSimd, "Number of cycles to send address, "
2495 "command, data from VRF to LDS unit, per SIMD"),
2496 ADD_STAT(globalReads, "Number of reads to the global segment"),
2497 ADD_STAT(globalWrites, "Number of writes to the global segment"),
2499 "Number of memory instructions sent to the global segment"),
2500 ADD_STAT(argReads, "Number of reads to the arg segment"),
2501 ADD_STAT(argWrites, "Number of writes to the arg segment"),
2503 "Number of memory instructions sent to the arg segment"),
2504 ADD_STAT(spillReads, "Number of reads to the spill segment"),
2505 ADD_STAT(spillWrites, "Number of writes to the spill segment"),
2507 "Number of memory instructions sent to the spill segment"),
2508 ADD_STAT(groupReads, "Number of reads to the group segment"),
2509 ADD_STAT(groupWrites, "Number of writes to the group segment"),
2511 "Number of memory instructions sent to the group segment"),
2512 ADD_STAT(privReads, "Number of reads to the private segment"),
2513 ADD_STAT(privWrites, "Number of writes to the private segment"),
2515 "Number of memory instructions sent to the private segment"),
2516 ADD_STAT(readonlyReads, "Number of reads to the readonly segment"),
2518 "Number of memory instructions sent to the readonly segment"),
2520 "Number of memory instructions sent to the readonly segment"),
2521 ADD_STAT(kernargReads, "Number of reads sent to the kernarg segment"),
2523 "Number of memory instructions sent to the kernarg segment"),
2525 "Number of memory instructions sent to the kernarg segment"),
2527 "wave level parallelism: count of active waves at wave launch"),
2528 ADD_STAT(tlbRequests, "number of uncoalesced requests"),
2530 "total number of cycles for all uncoalesced requests"),
2531 ADD_STAT(tlbLatency, "Avg. translation latency for data translations"),
2533 "TLB hits distribution (0 for page table, x for Lx-TLB)"),
2534 ADD_STAT(ldsBankAccesses, "Total number of LDS bank accesses"),
2536 "Number of bank conflicts per LDS memory packet"),
2538 "pages touched per wf (over all mem. instr.)"),
2540 "dynamic non-flat global memory instruction count"),
2542 "dynamic flat global memory instruction count"),
2543 ADD_STAT(dynamicLMemInstrCnt, "dynamic local memory instruction count"),
2545 "WG dispatch was blocked due to lack of barrier resources"),
2547 "Workgroup blocked due to LDS capacity"),
2548 ADD_STAT(numInstrExecuted, "number of instructions executed"),
2549 ADD_STAT(execRateDist, "Instruction Execution Rate: Number of executed "
2550 "vector instructions per cycle"),
2552 "number of vec ops executed (e.g. WF size/inst)"),
2554 "number of f16 vec ops executed (e.g. WF size/inst)"),
2556 "number of f32 vec ops executed (e.g. WF size/inst)"),
2558 "number of f64 vec ops executed (e.g. WF size/inst)"),
2560 "number of fma16 vec ops executed (e.g. WF size/inst)"),
2562 "number of fma32 vec ops executed (e.g. WF size/inst)"),
2564 "number of fma64 vec ops executed (e.g. WF size/inst)"),
2566 "number of mac16 vec ops executed (e.g. WF size/inst)"),
2568 "number of mac32 vec ops executed (e.g. WF size/inst)"),
2570 "number of mac64 vec ops executed (e.g. WF size/inst)"),
2572 "number of mad16 vec ops executed (e.g. WF size/inst)"),
2574 "number of mad32 vec ops executed (e.g. WF size/inst)"),
2576 "number of mad64 vec ops executed (e.g. WF size/inst)"),
2578 "number of mfma vec ops executed (e.g. WF size/inst)"),
2580 "number of i8 mfma vec ops executed (e.g. WF size/inst)"),
2582 "number of f16 mfma vec ops executed (e.g. WF size/inst)"),
2584 "number of f32 mfma vec ops executed (e.g. WF size/inst)"),
2586 "number of f64 mfma vec ops executed (e.g. WF size/inst)"),
2588 "number of two op FP vec ops executed (e.g. WF size/inst)"),
2589 ADD_STAT(totalCycles, "number of cycles the CU ran for"),
2590 ADD_STAT(vpc, "Vector Operations per cycle (this CU only)"),
2591 ADD_STAT(vpc_f16, "F16 Vector Operations per cycle (this CU only)"),
2592 ADD_STAT(vpc_f32, "F32 Vector Operations per cycle (this CU only)"),
2593 ADD_STAT(vpc_f64, "F64 Vector Operations per cycle (this CU only)"),
2594 ADD_STAT(ipc, "Instructions per cycle (this CU only)"),
2595 ADD_STAT(controlFlowDivergenceDist, "number of lanes active per "
2596 "instruction (over all instructions)"),
2598 "number of active lanes per global memory instruction"),
2600 "number of active lanes per local memory instruction"),
2602 "Number of dynamic non-GM memory insts executed"),
2603 ADD_STAT(numTimesWgBlockedDueVgprAlloc, "Number of times WGs are "
2604 "blocked due to VGPR allocation per SIMD"),
2605 ADD_STAT(numTimesWgBlockedDueSgprAlloc, "Number of times WGs are "
2606 "blocked due to SGPR allocation per SIMD"),
2607 ADD_STAT(numCASOps, "number of compare and swap operations"),
2609 "number of compare and swap operations that failed"),
2610 ADD_STAT(completedWfs, "number of completed wavefronts"),
2611 ADD_STAT(completedWGs, "number of completed workgroups"),
2612 ADD_STAT(headTailLatency, "ticks between first and last cache block "
2613 "arrival at coalescer"),
2614 ADD_STAT(instInterleave, "Measure of instruction interleaving per SIMD")
2615{
2616 ComputeUnit *cu = static_cast<ComputeUnit*>(parent);
2617
2618 instCyclesVMemPerSimd.init(cu->numVectorALUs);
2619 instCyclesScMemPerSimd.init(cu->numVectorALUs);
2620 instCyclesLdsPerSimd.init(cu->numVectorALUs);
2621
2622 hitsPerTLBLevel.init(4);
2623 execRateDist.init(0, 10-1, 2);
2624 ldsBankConflictDist.init(0, cu->wfSize()-1, 2);
2625
2626 pageDivergenceDist.init(1, cu->wfSize(), 4);
2627 controlFlowDivergenceDist.init(1, cu->wfSize(), 4);
2628 activeLanesPerGMemInstrDist.init(1, cu->wfSize(), 4);
2629 activeLanesPerLMemInstrDist.init(1, cu->wfSize(), 4);
2630
2631 headTailLatency.init(0, 1000000-1, 10000).flags(statistics::pdf |
2632 statistics::oneline);
2633 waveLevelParallelism.init(0, n_wf * cu->numVectorALUs, 1);
2634 instInterleave.init(cu->numVectorALUs, 0, 20, 1);
2635
2646
2655
2663
2665
2666 // fixed number of TLB levels
2667 for (int i = 0; i < 4; ++i) {
2668 if (!i)
2669 hitsPerTLBLevel.subname(i,"page_table");
2670 else
2671 hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
2672 }
2673
2679
2682}
2683
2684} // namespace gem5