#include "debug/GPUDisp.hh"
#include "debug/GPUExec.hh"
#include "debug/GPUFetch.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUPort.hh"
#include "debug/GPUPrefetch.hh"
#include "debug/GPUReg.hh"
#include "debug/GPURename.hh"
#include "debug/GPUSync.hh"
#include "debug/GPUTLB.hh"
#include "debug/GPUTrace.hh"
#include "enums/GfxVersion.hh"
    vrf(p.vector_register_file), srf(p.scalar_register_file),
    rfc(p.register_file_cache),
    simdWidth(p.simd_width),
    spBypassPipeLength(p.spbypass_pipe_length),
    dpBypassPipeLength(p.dpbypass_pipe_length),
    rfcPipeLength(p.rfc_pipe_length),
    scalarPipeStages(p.scalar_pipe_length),
    operandNetworkLength(p.operand_network_length),
    issuePeriod(p.issue_period),
    vrf_gm_bus_latency(p.vrf_gm_bus_latency),
    srf_scm_bus_latency(p.srf_scm_bus_latency),
    vrf_lm_bus_latency(p.vrf_lm_bus_latency),
    perLaneTLB(p.perLaneTLB), prefetchDepth(p.prefetch_depth),
    prefetchStride(p.prefetch_stride), prefetchType(p.prefetch_prev_type),
    debugSegFault(p.debugSegFault),
    functionalTLB(p.functionalTLB), localMemBarrier(p.localMemBarrier),
    countPages(p.countPages),
    req_tick_latency(p.mem_req_latency * p.clk_domain->clockPeriod()),
    resp_tick_latency(p.mem_resp_latency * p.clk_domain->clockPeriod()),
    scalar_req_tick_latency(
        p.scalar_mem_req_latency * p.clk_domain->clockPeriod()),
    scalar_resp_tick_latency(
        p.scalar_mem_resp_latency * p.clk_domain->clockPeriod()),
    memtime_latency(p.memtime_latency * p.clk_domain->clockPeriod()),
    mfma_scale(p.mfma_scale),
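    // Note: the *_tick_latency members above convert configured cycle
    // counts into absolute ticks once, at construction:
    //   latency_in_ticks = latency_in_cycles * clk_domain->clockPeriod()
    // For example, assuming a 1 GHz clock domain (1000-tick period, an
    // illustrative figure only), mem_req_latency = 50 cycles becomes
    // 50 * 1000 = 50000 ticks.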
    {GfxVersion::gfx90a, {
        {"v_mfma_f32_32x32x1_2b_f32", 64},
        {"v_mfma_f32_16x16x1_4b_f32", 32},
        {"v_mfma_f32_4x4x1_16b_f32", 8},
        {"v_mfma_f32_32x32x2_f32", 64},
        {"v_mfma_f32_16x16x4_f32", 32},
        {"v_mfma_f32_32x32x4_2b_f16", 64},
        {"v_mfma_f32_16x16x4_4b_f16", 32},
        {"v_mfma_f32_4x4x4_16b_f16", 8},
        {"v_mfma_f32_32x32x8_f16", 64},
        {"v_mfma_f32_16x16x16_f16", 32},
        {"v_mfma_i32_32x32x4_2b_i8", 64},
        {"v_mfma_i32_16x16x4_4b_i8", 32},
        {"v_mfma_i32_4x4x4_16b_i8", 8},
        {"v_mfma_i32_32x32x8_i8", 64},
        {"v_mfma_i32_16x16x16_i8", 32},
        {"v_mfma_f32_32x32x2_2b_bf16", 64},
        {"v_mfma_f32_16x16x2_4b_bf16", 32},
        {"v_mfma_f32_4x4x2_16b_bf16", 8},
        {"v_mfma_f32_32x32x4_bf16", 64},
        {"v_mfma_f32_16x16x8_bf16", 32},
        {"v_mfma_f64_16x16x4_f64", 32},
        {"v_mfma_f64_4x4x4_4b_f64", 16},
    }},
    {GfxVersion::gfx942, {
        {"v_mfma_f32_32x32x1_2b_f32", 64},
        {"v_mfma_f32_16x16x1_4b_f32", 32},
        {"v_mfma_f32_4x4x1_16b_f32", 8},
        {"v_mfma_f32_32x32x2_f32", 64},
        {"v_mfma_f32_16x16x4_f32", 32},
        {"v_mfma_f32_32x32x4_2b_f16", 64},
        {"v_mfma_f32_16x16x4_4b_f16", 32},
        {"v_mfma_f32_4x4x4_16b_f16", 8},
        {"v_mfma_f32_32x32x8_f16", 32},
        {"v_mfma_f32_16x16x16_f16", 16},
        {"v_mfma_f32_32x32x4_2b_bf16", 64},
        {"v_mfma_f32_16x16x4_4b_bf16", 32},
        {"v_mfma_f32_4x4x4_16b_bf16", 8},
        {"v_mfma_f32_32x32x8_bf16", 32},
        {"v_mfma_f32_16x16x16_bf16", 16},
        {"v_mfma_i32_32x32x4_2b_i8", 64},
        {"v_mfma_i32_16x16x4_4b_i8", 32},
        {"v_mfma_i32_4x4x4_16b_i8", 8},
        {"v_mfma_i32_32x32x16_i8", 32},
        {"v_mfma_i32_16x16x32_i8", 16},
        {"v_mfma_f32_16x16x8_xf32", 16},
        {"v_mfma_f32_32x32x4_xf32", 32},
        {"v_mfma_f64_16x16x4_f64", 32},
        {"v_mfma_f64_4x4x4_4b_f64", 16},
        {"v_mfma_f32_16x16x32_bf8_bf8", 16},
        {"v_mfma_f32_16x16x32_bf8_fp8", 16},
        {"v_mfma_f32_32x32x16_fp8_bf8", 32},
        {"v_mfma_f32_32x32x16_fp8_fp8", 32},
    }},
    _requestorId(p.system->getRequestorId(this, "ComputeUnit")),
    lds(*p.localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
    _cacheLineSize(p.system->cacheLineSize()),
    _numBarrierSlots(p.num_barrier_slots),
    globalSeqNum(0), wavefrontSize(p.wf_size),
    scoreboardCheckToSchedule(p),
    scheduleToExecute(p),
197 "Functional TLB not supported in full-system GPU simulation");
208 fatal_if(
p.wf_size > std::numeric_limits<unsigned long long>::digits ||
210 "WF size is larger than the host can support");
212 "Wavefront size should be a power of 2");
    numCyclesPerStoreTransfer =
        (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
                       (double)vrfToCoalescerBusWidth);

    numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
                               / coalescerToVrfBusWidth;
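    // Worked example of the transfer arithmetic above (illustrative
    // values): with a 64-lane wavefront and a 32-byte
    // vrfToCoalescerBusWidth, numCyclesPerStoreTransfer
    //   = ceil((64 * sizeof(uint32_t)) / 32.0) = ceil(256 / 32) = 8 cycles.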
    idleWfs = p.n_wf * numVectorALUs;
    lastVaddrWF.resize(numVectorALUs);
    wfList.resize(numVectorALUs);

    wfBarrierSlots.resize(p.num_barrier_slots, WFBarrier());

    for (int i = 0; i < p.num_barrier_slots; ++i) {
        freeBarrierIds.insert(i);
    }

    for (int j = 0; j < numVectorALUs; ++j) {
        lastVaddrWF[j].resize(p.n_wf);

        for (int i = 0; i < p.n_wf; ++i) {
            lastVaddrWF[j][i].resize(wfSize());

            wfList[j].push_back(p.wavefronts[j * p.n_wf + i]);
            wfList[j][i]->setParent(this);

            for (int k = 0; k < wfSize(); ++k) {
                lastVaddrWF[j][i][k] = 0;
            }
        }
    }

    lastVaddrSimd.resize(numVectorALUs);

    for (int i = 0; i < numVectorALUs; ++i) {
        lastVaddrSimd[i].resize(wfSize(), 0);
    }

    lastVaddrCU.resize(wfSize());
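    // lastVaddrWF, lastVaddrSimd, and lastVaddrCU keep one last-seen
    // virtual address per lane at wavefront, SIMD, and CU granularity;
    // the stride prefetcher later in this file selects its history from
    // one of them according to prefetchType.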
    if (p.execPolicy == "OLDEST-FIRST") {
        exec_policy = EXEC_POLICY::OLDEST;
    } else if (p.execPolicy == "ROUND-ROBIN") {
        exec_policy = EXEC_POLICY::RR;
    } else {
        fatal("Invalid WF execution policy (CU)\n");
    }

    for (int i = 0; i < p.port_memory_port_connection_count; ++i) {
        memPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
    }

    for (int i = 0; i < p.port_translation_port_connection_count; ++i) {
        tlbPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
    }
    lastExecCycle.resize(numVectorALUs, 0);

    for (int i = 0; i < vrf.size(); ++i) {
        vrf[i]->setParent(this);
        rfc[i]->setParent(this);
    }
    for (int i = 0; i < srf.size(); ++i) {
        srf[i]->setParent(this);
    }
    numVecRegsPerSimd = vrf[0]->numRegs();
    numScalarRegsPerSimd = srf[0]->numRegs();

    registerManager->setParent(this);

    instExecPerSimd.resize(numVectorALUs, 0);

    fatal_if(!isPowerOf2(_cacheLineSize),
             "Cache line size should be a power of two.");
    cacheLineBits = floorLog2(_cacheLineSize);

    matrix_core_ready.resize(numVectorALUs);
    for (int i = 0; i < numVectorALUs; i++) {
        matrix_core_ready[i] = 0;
    }
    w->workGroupSz[0] = task->wgSize(0);
    w->workGroupSz[1] = task->wgSize(1);
    w->workGroupSz[2] = task->wgSize(2);
    w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
    w->computeActualWgSz(task);
    static int _n_wave = 0;

    VectorMask init_mask;
    init_mask.reset();

    for (int k = 0; k < wfSize(); ++k) {
        if (k + waveId * wfSize() < w->actualWgSzTotal)
            init_mask[k] = 1;
    }

    w->execMask() = init_mask;
    w->initMask = init_mask.to_ullong();

    if (bar_id > WFBarrier::InvalidID) {
        w->barrierId(bar_id);
    } else {
        assert(!w->hasBarrier());
    }

    for (int k = 0; k < wfSize(); ++k) {
        w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
        w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
                              w->actualWgSz[1];
        w->workItemId[2][k] = (k + waveId * wfSize()) /
                              (w->actualWgSz[0] * w->actualWgSz[1]);

        w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
            w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
            w->workItemId[0][k];
    }
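    // Worked example of the flattening above (illustrative sizes): with
    // actualWgSz = {16, 4, 2} and linear id 21 = k + waveId * wfSize(),
    // workItemId = {21 % 16, (21 / 16) % 4, 21 / 64} = {5, 1, 0}, and
    // workItemFlatId = 0*16*4 + 1*16 + 5 = 21, recovering the linear id.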
    w->workGroupId[0] = w->wgId % task->numWg(0);
    w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1);
    w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1));
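    // The same row-major decomposition applies to the workgroup id: dim 0
    // varies fastest over task->numWg(0), then dim 1, then dim 2.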
    w->ldsChunk = ldsChunk;

    [[maybe_unused]] int32_t refCount =
        lds.increaseRefCounter(w->dispatchId, w->wgId);
    DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
            cu_id, w->wgId, refCount);

    w->instructionBuffer.clear();

    DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
            "WF[%d][%d]. Ref cnt:%d\n", _n_wave, w->barrierId(), cu_id,
            w->simdId, w->wfSlotId, refCount);

    w->initRegState(task, w->actualWgSzTotal);

    w->vmemIssued.clear();
    w->lgkmIssued.clear();
    w->expIssued.clear();
    panic_if(w->wrGmReqsInPipe, "GM write counter for wavefront non-zero\n");
    panic_if(w->rdGmReqsInPipe, "GM read counter for wavefront non-zero\n");
    panic_if(w->wrLmReqsInPipe, "LM write counter for wavefront non-zero\n");
    panic_if(w->rdLmReqsInPipe, "LM read counter for wavefront non-zero\n");
    panic_if(w->outstandingReqs,
             "Outstanding reqs counter for wavefront non-zero\n");
    GPUDynInstPtr gpuDynInst
        = std::make_shared<GPUDynInst>(this, nullptr,
            new KernelLaunchStaticInst(), getAndIncSeqNum());
    gpuDynInst->kern_id = kernId;

    GPUDynInstPtr gpuDynInst
        = std::make_shared<GPUDynInst>(this, nullptr,
            new KernelLaunchStaticInst(), getAndIncSeqNum());
    gpuDynInst->kern_id = kernId;
    gpuDynInst->staticInstruction()->setFlag(GPUStaticInst::Scalar);
    DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id);

    panic_if(!ldsChunk, "was not able to reserve space for this WG");
    if (num_wfs_in_wg > 1) {
        assert(!wf_barrier.maxBarrierCnt());
        assert(!wf_barrier.numAtBarrier());
        wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);

        DPRINTF(GPUSync, "CU[%d] - Dispatching WG with barrier Id%d. "
                "%d waves using this barrier.\n", cu_id, barrier_id,
                num_wfs_in_wg);
    }

    for (int j = 0; j < shader->n_wf; ++j) {
    DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] "
            "vregDemand[%d] sregDemand[%d]\n", i, j, w->wfDynId,
            vregDemand, sregDemand);

    panic_if(w->instructionBuffer.empty(),
             "Instruction Buffer of WF%d can't be empty", w->wgId);

    panic_if(w->instructionBuffer.empty(),
             "Instruction Buffer of WF%d can't be empty", w->wgId);

    auto it = pipeMap.find(ii->seqNum());
    int trueWgSizeTotal = 1;

    for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
        trueWgSizeTotal *= trueWgSize[d];
        DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
    }

    DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);

    int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
    num_wfs_in_wg = numWfs;
    bool barrier_avail = true;

    if (freeBarrierIds.empty()) {
        barrier_avail = false;
    }
676 "WG with %d WFs and %d VGPRs per WI can not be allocated to CU "
677 "that has %d VGPRs\n",
680 "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU "
    int numMappedWfs = 0;

    for (int j = 0; j < shader->n_wf; ++j) {

    if (numMappedWfs < numWfs &&

    assert(numMappedWfs <= numWfs);

    bool vregAvail = true;
    bool sregAvail = true;
    if (numMappedWfs < numWfs) {

    DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, "
            "VGPR Availability = %d, SGPR Availability = %d\n",
            freeWfSlots, numMappedWfs, vregAvail, sregAvail);
        ++stats.numTimesWgBlockedDueVgprAlloc;

        ++stats.numTimesWgBlockedDueSgprAlloc;

    bool ldsAvail = lds.canReserve(task->ldsSize());
    if (!ldsAvail) {
        stats.wgBlockedDueLdsAllocation++;
    }

    if (!barrier_avail) {
        stats.wgBlockedDueBarrierAllocation++;
    }

    bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
                        && ldsAvail && barrier_avail;
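    // Dispatch therefore requires all five resources simultaneously:
    // enough free WF slots to map every wave (numMappedWfs == numWfs),
    // VGPRs, SGPRs, LDS capacity, and a free barrier slot when the WG has
    // multiple waves.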
    return wf_barrier.numYetToReachBarrier();

    return wf_barrier.allAtBarrier();

    wf_barrier.incNumAtBarrier();

    return wf_barrier.numAtBarrier();

    return wf_barrier.maxBarrierCnt();

    wf_barrier.decMaxBarrierCnt();

    wf_barrier.release();
    for (int j = 0; j < shader->n_wf; ++j) {

    for (auto &vecRegFile : vrf) {

    for (auto &scRegFile : srf) {

    if (shader->getProgressInterval() != 0 &&

             "No support for multiple Global Memory Pipelines exists!!!");

             "No support for multiple Local Memory Pipelines exists!!!");

             "No support for multiple Scalar Memory Pipelines exists!!!");
    assert(cu != nullptr);

    if (pkt->req->isInvL2()) {

    panic("Unknown MemSyncResp not from an instruction");
    if (gpuDynInst->isKernelLaunch()) {
        assert(pkt->req->isKernel());
        assert(pkt->req->isInvL1());

    && gpuDynInst->isEndOfKernel()) {
        assert(pkt->req->isKernel());
        assert(pkt->req->isGL2CacheFlush());

    DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
            computeUnit->cu_id, w->simdId, w->wfSlotId,
            w->wfDynId, w->wgId);

    if (!pkt->req->isKernel()) {
        w = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
        DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing "
                "outstanding reqs %d => %d\n", gpuDynInst->simdId,
                gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
                gpuDynInst->disassemble(), w->outstandingReqs,
                w->outstandingReqs - 1);
        computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
    }
1044 "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
1045 computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1046 gpuDynInst->seqNum(),
index, pkt->
req->getPaddr());
    if (pkt->req->isKernel()) {

    assert(!pkt->req->isKernel());

    assert(gpuDynInst->numScalarReqs > 0);

    gpuDynInst->numScalarReqs--;

    if (!gpuDynInst->numScalarReqs) {
        if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
            computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
                gpuDynInst);
        } else {
            computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
                gpuDynInst);
        }
    }
    for (const auto &pkt : retries) {

    for (int i = 0; i < len; ++i) {

        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
                computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                pkt->req->getPaddr());

            DPRINTF(GPUMem, "failed again!\n");

            DPRINTF(GPUMem, "successful!\n");
    if (sender_state->wavefront != nullptr) {

    computeUnit->shader->gpuCmdProc.completeTimingRead(dispType);
    for (int i = 0; i < len; ++i) {

        DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
                pkt->req->getPaddr());

            DPRINTF(GPUFetch, "failed again!\n");

            DPRINTF(GPUFetch, "successful!\n");
1207 return "ComputeUnit SQC memory request event";
1216 assert(!
pkt->req->systemReq());
    Addr tmp_vaddr = pkt->req->getVaddr();

    pkt->req->setPC(gpuDynInst->wavefront()->pc());

    pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());

    shader->gpuCmdProc.driver()->setMtype(pkt->req);

    } else if (pkt->isRead()) {

        fatal("pkt is not a read nor a write\n");
    ++stats.tlbRequests;

    unsigned size = pkt->getSize();

        panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
              cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);

    if (!p->pTable->translate(vaddr, paddr)) {
        if (!p->fixupFault(vaddr)) {
            panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
                  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                  vaddr);
    tlbPort[tlbPort_index].sendFunctional(pkt);

    int hit_level = translation_state->hitLevel;
    assert(hit_level != -1);
    stats.hitsPerTLBLevel[hit_level]++;

    delete sender_state->saved;
    delete sender_state;

    assert(pkt->req->hasPaddr());
    assert(pkt->req->hasSize());

    uint8_t *tmpData = oldPkt->getPtr<uint8_t>();

    gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
    gpuDynInst->tlbHitLevel[index] = hit_level;

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
            "scheduled\n", cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
    } else if (tlbPort[tlbPort_index].isStalled()) {
        assert(tlbPort[tlbPort_index].retries.size() > 0);

        DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
                "failed!\n", cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, tmp_vaddr);

        tlbPort[tlbPort_index].retries.push_back(pkt);
    } else if (!tlbPort[tlbPort_index].sendTimingReq(pkt)) {
        tlbPort[tlbPort_index].stallPort();

        DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
                "failed!\n", cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, tmp_vaddr);

        tlbPort[tlbPort_index].retries.push_back(pkt);
    } else {
        DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x from "
                "instruction %s sent!\n", cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, tmp_vaddr,
                gpuDynInst->disassemble().c_str());
    }
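    // Three outcomes for a timing translation request: the TLB port is
    // already stalled, so the packet joins its retry queue; sendTimingReq()
    // is nack'd, so the port stalls and the packet is queued; or the
    // request was sent successfully.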
    gpuDynInst->resetEntireStatusVector();

    gpuDynInst->decrementStatusVector(index);

    tlbPort[tlbPort_index].sendFunctional(pkt);

    memPort[0].sendFunctional(new_pkt);

    DPRINTF(GPUMem, "Functional sendRequest\n");
    DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
            gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
            new_pkt->req->getPaddr());
    DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n",
            pkt->req->getVaddr());

    assert(gpuDynInst->isGlobalSeg() ||
           gpuDynInst->executedAs() == enums::SC_GLOBAL);
    req = std::make_shared<Request>(

    if (kernelMemSync) {
        if (gpuDynInst->isKernelLaunch()) {
            req->setReqInstSeqNum(gpuDynInst->seqNum());

            memPort[0].createMemReqEvent(pkt);

            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x "
                    "scheduling an acquire\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
        } else {
            assert(shader->impl_kern_end_rel);
            assert(gpuDynInst->isEndOfKernel());

            req->setReqInstSeqNum(gpuDynInst->seqNum());

            memPort[0].createMemReqEvent(pkt);

            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x "
                    "scheduling a release\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
        }
    } else {
        gpuDynInst->setRequestFlags(req);

        req->setReqInstSeqNum(gpuDynInst->seqNum());

        memPort[0].createMemReqEvent(pkt);

        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
                cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
                pkt->req->getPaddr());
    }
    auto req = std::make_shared<Request>(paddr, 64, 0, vramRequestorId());

    pkt->pushSenderState(

    shader->incNumOutstandingInvL2s();
    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
            compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
            pkt->req->getPaddr(), id);

    Addr paddr = pkt->req->getPaddr();

    int index = gpuDynInst->memStatusVector[paddr].back();

    DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
            pkt->req->getPaddr(), id);

    gpuDynInst->memStatusVector[paddr].pop_back();
    gpuDynInst->pAddr = pkt->req->getPaddr();

    gpuDynInst->decrementStatusVector(index);
    DPRINTF(GPUMem, "bitvector is now %s\n", gpuDynInst->printStatusVector());

    if (gpuDynInst->allLanesZero()) {
        auto iter = gpuDynInst->memStatusVector.begin();
        auto end = gpuDynInst->memStatusVector.end();

        while (iter != end) {
            assert(iter->second.empty());

        if (compute_unit->headTailMap.count(gpuDynInst)) {

        gpuDynInst->memStatusVector.clear();

        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
                compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId);

        if (!compute_unit->headTailMap.count(gpuDynInst)) {
            compute_unit->headTailMap
                .insert(std::make_pair(gpuDynInst, curTick()));
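    // headTailMap records the tick of the first (partial) response for an
    // instruction; when the final response arrives, the elapsed interval
    // is what feeds the head-tail latency distribution
    // (stats.headTailLatency).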
    Addr line = pkt->req->getPaddr();

            pkt->req->getVaddr(), line);

    if (!translation_state->tlbEntry) {

        DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
                 pkt->req->getVaddr());

    int hit_level = translation_state->hitLevel;

    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());

    delete translation_state;

    gpuDynInst->memStatusVector[line].push_back(mp_index);
    gpuDynInst->tlbHitLevel[mp_index] = hit_level;

    DPRINTF(GPUTrace, "CU%d WF[%d][%d]: Translated %#lx -> %#lx for "
            "instruction %s (seqNum: %ld)\n", computeUnit->cu_id,
            gpuDynInst->simdId, gpuDynInst->wfSlotId, pkt->req->getVaddr(),
            line, gpuDynInst->disassemble().c_str(), gpuDynInst->seqNum());
    panic("unsupported response to request conversion %s\n",

    int simdId = gpuDynInst->simdId;
    int wfSlotId = gpuDynInst->wfSlotId;

    case enums::PF_PHASE:
        last = computeUnit->lastVaddrSimd[simdId][mp_index];

        last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];

    DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
            computeUnit->cu_id, simdId, wfSlotId, mp_index, last);

    DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,

    RequestPtr prefetch_req = std::make_shared<Request>(
    delete prefetch_pkt;

    if (new_pkt->req->systemReq()) {

    if (!gpuDynInst->isSystemReq()) {
        gpuDynInst->setSystemReq();

    computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
            computeUnit->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());
1829 "ComputeUnit memory request event",
true);
1837 "ComputeUnit memory response event",
true);
1847 if (pkt->
req->systemReq()) {
1852 retries.emplace_back(pkt, gpuDynInst);
1856 "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
1857 compute_unit->
cu_id, gpuDynInst->simdId,
1858 gpuDynInst->wfSlotId,
id, pkt->
req->getPaddr());
1863 "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data"
1864 " req sent!\n", compute_unit->
cu_id, gpuDynInst->simdId,
1865 gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
id,
1866 pkt->
req->getPaddr());
    return "ComputeUnit scalar memory request event";

    if (pkt->req->systemReq()) {

            "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
            compute_unit->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId, pkt->req->getPaddr());

            "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data "
            "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
            pkt->req->getPaddr());
    DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",

    for (int i = 0; i < len; ++i) {

        DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x",
                computeUnit->cu_id, vaddr);

            DPRINTF(GPUTLB, ": failed again\n");

            DPRINTF(GPUTLB, ": successful\n");
1951 "Translation of vaddr %#x failed\n", pkt->
req->getVaddr());
1953 delete translation_state->
tlbEntry;
1954 assert(!translation_state->
ports.size());
1957 delete translation_state;
1965 [[maybe_unused]]
Wavefront *
w = gpuDynInst->wavefront();
1967 DPRINTF(GPUTLB,
"CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
1968 "translation: PA %#x -> %#x\n",
computeUnit->cu_id,
w->simdId,
1969 w->wfSlotId,
w->kernId, pkt->
req->getVaddr(), pkt->
req->getPaddr());
    fatal("Scalar DTLB received unexpected MemCmd response %s\n",

    if (req_pkt->req->systemReq()) {
        gpuDynInst->setSystemReq();
    [[maybe_unused]] Addr line = pkt->req->getPaddr();
    DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",

    bool success = translation_state->tlbEntry != nullptr;
    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());

    delete translation_state;
    DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n", len);

    for (int i = 0; i < len; ++i) {

        DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x",
                computeUnit->cu_id, vaddr);

            DPRINTF(GPUTLB, ": failed again\n");

            DPRINTF(GPUTLB, ": successful\n");
    if (gpuDynInst->isScalar()) {
        if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
            stats.instCyclesSALU++;
        } else if (gpuDynInst->isLoad()) {
            stats.scalarMemReads++;
        } else if (gpuDynInst->isStore()) {
            stats.scalarMemWrites++;
        }
    } else {
        if (gpuDynInst->isALU()) {
            shader->total_valu_insts++;
            if (shader->total_valu_insts == shader->max_valu_insts) {
                exitSimLoop("max vALU insts");
            }
            stats.instCyclesVALU++;
            stats.threadCyclesVALU
                += gpuDynInst->wavefront()->execMask().count();
        } else if (gpuDynInst->isFlat()) {
            if (gpuDynInst->isLocalMem()) {
                stats.flatLDSInsts++;
            } else {
                stats.flatVMemInsts++;
            }
        } else if (gpuDynInst->isFlatGlobal()) {
            stats.flatVMemInsts++;
        } else if (gpuDynInst->isFlatScratch()) {
            stats.flatVMemInsts++;
        } else if (gpuDynInst->isLocalMem()) {
            stats.ldsNoFlatInsts++;
        } else if (gpuDynInst->isLoad()) {
            stats.vectorMemReads++;
        } else if (gpuDynInst->isStore()) {
            stats.vectorMemWrites++;
        }

        if (gpuDynInst->isLoad()) {
            switch (gpuDynInst->executedAs()) {
              case enums::SC_SPILL:
                stats.spillReads++;
                break;
              case enums::SC_GLOBAL:
                stats.globalReads++;
                break;
              case enums::SC_GROUP:
                stats.groupReads++;
                break;
              case enums::SC_PRIVATE:
                stats.privReads++;
                break;
              case enums::SC_READONLY:
                stats.readonlyReads++;
                break;
              case enums::SC_KERNARG:
                stats.kernargReads++;
                break;
              case enums::SC_ARG:
                stats.argReads++;
                break;
              case enums::SC_NONE:
                // no segment to attribute (e.g. a flat inst with EXEC = 0)
                break;
              default:
                fatal("%s has no valid segment\n", gpuDynInst->disassemble());
            }
        } else if (gpuDynInst->isStore()) {
            switch (gpuDynInst->executedAs()) {
              case enums::SC_SPILL:
                stats.spillWrites++;
                break;
              case enums::SC_GLOBAL:
                stats.globalWrites++;
                break;
              case enums::SC_GROUP:
                stats.groupWrites++;
                break;
              case enums::SC_PRIVATE:
                stats.privWrites++;
                break;
              case enums::SC_READONLY:
                stats.readonlyWrites++;
                break;
              case enums::SC_KERNARG:
                stats.kernargWrites++;
                break;
              case enums::SC_ARG:
                stats.argWrites++;
                break;
              case enums::SC_NONE:
                break;
              default:
                fatal("%s has no valid segment\n", gpuDynInst->disassemble());
            }
        }
    }
    std::ostream *page_stat_file = simout.create(name().c_str())->stream();

    *page_stat_file << "page, wavefront accesses, workitem accesses"
                    << std::endl;

    for (auto iter : pageAccesses) {
        *page_stat_file << std::hex << iter.first << ",";
        *page_stat_file << std::dec << iter.second.first << ",";
        *page_stat_file << std::dec << iter.second.second << std::endl;
    }

int32_t
ComputeUnit::getRefCounter(const uint32_t dispatchId,
                           const uint32_t wgId) const
{
    return lds.getRefCounter(dispatchId, wgId);
}
    for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {

    RequestPtr newRequest = std::make_shared<Request>();
    newRequest->setPaddr(0x0);

    return ldsPort.sendTimingReq(newPacket);
    for (int i = 0; i < shader->n_wf; ++i) {
        std::cout << curTick() << ": ";
        std::cout << "CU" << cu_id << " WF[" << j << "][" << i << "] ";
        wfList[j][i]->printProgress();
    }

    std::cout << std::endl;
    fatal_if(!senderState, "did not get the right sort of sender state");

    computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);

    fatal_if(!sender_state, "packet without a valid sender state");

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
            computeUnit->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId);

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
            computeUnit->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId, pkt->req->getPaddr());

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
            computeUnit->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId, pkt->req->getPaddr());
    auto queueSize = retries.size();

    DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, queueSize);

    fatal_if(queueSize < 1,
             "why was there a recvReqRetry() with no pending reqs?");
    fatal_if(!isStalled(),
             "recvReqRetry() happened when the port was not stalled");

        DPRINTF(GPUPort, ": LDS send failed again\n");

        DPRINTF(GPUPort, ": LDS send successful\n");
2444 "Number of cycles needed to execute VALU insts."),
2446 "Number of cycles needed to execute SALU insts."),
2448 "vector ALU ops. Similar to instCyclesVALU but multiplied by "
2449 "the number of active threads."),
2451 "Percentage of active vector ALU threads in a wave."),
2453 " accesses that resolve to LDS."),
2455 "including FLAT accesses that resolve to LDS) per-wavefront."),
2457 "The number of FLAT insts that resolve to vmem issued."),
2459 "resolve to vmem issued per-wavefront."),
2461 "The number of FLAT insts that resolve to LDS issued."),
2463 "resolve to LDS issued per-wavefront."),
2465 "Number of vector mem write insts (excluding FLAT insts)."),
2467 "insts (excluding FLAT insts) per-wavefront."),
2469 "Number of vector mem read insts (excluding FLAT insts)."),
2471 "(excluding FLAT insts) per-wavefront."),
2474 "The average number of scalar mem write insts per-wavefront."),
2477 "The average number of scalar mem read insts per-wavefront."),
2479 "Number of vector mem reads per kilo-instruction"),
2481 "Number of vector mem writes per kilo-instruction"),
2483 "Number of vector mem insts per kilo-instruction"),
2485 "Number of scalar mem reads per kilo-instruction"),
2487 "Number of scalar mem writes per kilo-instruction"),
2489 "Number of scalar mem insts per kilo-instruction"),
2491 "command, data from VRF to vector memory unit, per SIMD"),
2493 "command, data from SRF to scalar memory unit, per SIMD"),
2495 "command, data from VRF to LDS unit, per SIMD"),
2499 "Number of memory instructions sent to the global segment"),
2503 "Number of memory instructions sent to the arg segment"),
2507 "Number of memory instructions sent to the spill segment"),
2511 "Number of memory instructions sent to the group segment"),
2515 "Number of memory instructions sent to the private segment"),
2518 "Number of memory instructions sent to the readonly segment"),
2520 "Number of memory instructions sent to the readonly segment"),
2523 "Number of memory instructions sent to the kernarg segment"),
2525 "Number of memory instructions sent to the kernarg segment"),
2527 "wave level parallelism: count of active waves at wave launch"),
2530 "total number of cycles for all uncoalesced requests"),
2533 "TLB hits distribution (0 for page table, x for Lx-TLB)"),
2536 "Number of bank conflicts per LDS memory packet"),
2538 "pages touched per wf (over all mem. instr.)"),
2540 "dynamic non-flat global memory instruction count"),
2542 "dynamic flat global memory instruction count"),
2545 "WG dispatch was blocked due to lack of barrier resources"),
2547 "Workgroup blocked due to LDS capacity"),
2550 "vector instructions per cycle"),
2552 "number of vec ops executed (e.g. WF size/inst)"),
2554 "number of f16 vec ops executed (e.g. WF size/inst)"),
2556 "number of f32 vec ops executed (e.g. WF size/inst)"),
2558 "number of f64 vec ops executed (e.g. WF size/inst)"),
2560 "number of fma16 vec ops executed (e.g. WF size/inst)"),
2562 "number of fma32 vec ops executed (e.g. WF size/inst)"),
2564 "number of fma64 vec ops executed (e.g. WF size/inst)"),
2566 "number of mac16 vec ops executed (e.g. WF size/inst)"),
2568 "number of mac32 vec ops executed (e.g. WF size/inst)"),
2570 "number of mac64 vec ops executed (e.g. WF size/inst)"),
2572 "number of mad16 vec ops executed (e.g. WF size/inst)"),
2574 "number of mad32 vec ops executed (e.g. WF size/inst)"),
2576 "number of mad64 vec ops executed (e.g. WF size/inst)"),
2578 "number of mfma vec ops executed (e.g. WF size/inst)"),
2580 "number of i8 mfma vec ops executed (e.g. WF size/inst)"),
2582 "number of f16 mfma vec ops executed (e.g. WF size/inst)"),
2584 "number of f32 mfma vec ops executed (e.g. WF size/inst)"),
2586 "number of f64 mfma vec ops executed (e.g. WF size/inst)"),
2588 "number of two op FP vec ops executed (e.g. WF size/inst)"),
2590 ADD_STAT(
vpc,
"Vector Operations per cycle (this CU only)"),
2594 ADD_STAT(
ipc,
"Instructions per cycle (this CU only)"),
2596 "instruction (over all instructions)"),
2598 "number of active lanes per global memory instruction"),
2600 "number of active lanes per local memory instruction"),
2602 "Number of dynamic non-GM memory insts executed"),
2604 "blocked due to VGPR allocation per SIMD"),
2606 "blocked due to SGPR allocation per SIMD"),
2609 "number of compare and swap operations that failed"),
2613 "arrival at coalescer"),
    for (int i = 0; i < 4; ++i) {