#include "debug/GPUDisp.hh"
#include "debug/GPUExec.hh"
#include "debug/GPUFetch.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUPort.hh"
#include "debug/GPUPrefetch.hh"
#include "debug/GPUReg.hh"
#include "debug/GPURename.hh"
#include "debug/GPUSync.hh"
#include "debug/GPUTLB.hh"
    numVectorGlobalMemUnits(p.num_global_mem_pipes),
    numVectorSharedMemUnits(p.num_shared_mem_pipes),
    numScalarMemUnits(p.num_scalar_mem_pipes),
    numVectorALUs(p.num_SIMDs),
    numScalarALUs(p.num_scalar_cores),
    vrfToCoalescerBusWidth(p.vrf_to_coalescer_bus_width),
    coalescerToVrfBusWidth(p.coalescer_to_vrf_bus_width),
    registerManager(p.register_manager),
    scoreboardCheckStage(p, *this, scoreboardCheckToSchedule),
    scheduleStage(p, *this, scoreboardCheckToSchedule, scheduleToExecute),
    execStage(p, *this, scheduleToExecute),
    globalMemoryPipe(p, *this),
    localMemoryPipe(p, *this),
    scalarMemoryPipe(p, *this),
    tickEvent([this]{ exec(); }, "Compute unit tick event",
    vrf(p.vector_register_file), srf(p.scalar_register_file),
    simdWidth(p.simd_width),
    spBypassPipeLength(p.spbypass_pipe_length),
    dpBypassPipeLength(p.dpbypass_pipe_length),
    scalarPipeStages(p.scalar_pipe_length),
    operandNetworkLength(p.operand_network_length),
    issuePeriod(p.issue_period),
    vrf_gm_bus_latency(p.vrf_gm_bus_latency),
    srf_scm_bus_latency(p.srf_scm_bus_latency),
    vrf_lm_bus_latency(p.vrf_lm_bus_latency),
    perLaneTLB(p.perLaneTLB), prefetchDepth(p.prefetch_depth),
    prefetchStride(p.prefetch_stride), prefetchType(p.prefetch_prev_type),
    debugSegFault(p.debugSegFault),
    functionalTLB(p.functionalTLB), localMemBarrier(p.localMemBarrier),
    countPages(p.countPages),
    req_tick_latency(p.mem_req_latency * p.clk_domain->clockPeriod()),
    resp_tick_latency(p.mem_resp_latency * p.clk_domain->clockPeriod()),
    _requestorId(p.system->getRequestorId(this, "ComputeUnit")),
    lds(*p.localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
    _cacheLineSize(p.system->cacheLineSize()),
    _numBarrierSlots(p.num_barrier_slots),
    globalSeqNum(0), wavefrontSize(p.wf_size),
    scoreboardCheckToSchedule(p),
    scheduleToExecute(p),
119 "Functional TLB not supported in full-system GPU simulation");
130 fatal_if(
p.wf_size > std::numeric_limits<unsigned long long>::digits ||
132 "WF size is larger than the host can support");
134 "Wavefront size should be a power of 2");
    numCyclesPerStoreTransfer =
        (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
                (double)vrfToCoalescerBusWidth);

    numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
                               / coalescerToVrfBusWidth;
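    // Worked example (assuming a 64-lane wavefront and 4-byte registers):
    // a full wave moves 64 * 4 = 256 bytes per transfer, so with a 32-byte
    // vrf_to_coalescer_bus_width the store path needs
    // numCyclesPerStoreTransfer = ceil(256.0 / 32.0) = 8 cycles.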
    idleWfs = p.n_wf * numVectorALUs;
    lastVaddrWF.resize(numVectorALUs);
    wfList.resize(numVectorALUs);

    wfBarrierSlots.resize(p.num_barrier_slots, WFBarrier());
    for (int i = 0; i < p.num_barrier_slots; ++i) {
        freeBarrierIds.insert(i);
    }
    for (int j = 0; j < numVectorALUs; ++j) {
        lastVaddrWF[j].resize(p.n_wf);

        for (int i = 0; i < p.n_wf; ++i) {
            lastVaddrWF[j][i].resize(wfSize());

            wfList[j].push_back(p.wavefronts[j * p.n_wf + i]);
            wfList[j][i]->setParent(this);

            for (int k = 0; k < wfSize(); ++k) {
                lastVaddrWF[j][i][k] = 0;
            }
        }
    }
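    // Layout sketch (illustrative values, not defaults): with
    // numVectorALUs = 4 and p.n_wf = 8, wfList[j][i] holds the wavefront in
    // slot i of SIMD j, i.e. p.wavefronts[j * 8 + i], while
    // lastVaddrWF[j][i][k] tracks lane k's last vaddr for the prefetcher.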
    lastVaddrSimd.resize(numVectorALUs);

    for (int i = 0; i < numVectorALUs; ++i) {
        lastVaddrSimd[i].resize(wfSize(), 0);
    }

    lastVaddrCU.resize(wfSize());
    if (p.execPolicy == "OLDEST-FIRST") {
        exec_policy = EXEC_POLICY::OLDEST;
    } else if (p.execPolicy == "ROUND-ROBIN") {
        exec_policy = EXEC_POLICY::RR;
    } else {
        fatal("Invalid WF execution policy (CU)\n");
    }
    for (int i = 0; i < p.port_memory_port_connection_count; ++i) {
        memPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
    }

    for (int i = 0; i < p.port_translation_port_connection_count; ++i) {
        tlbPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
    }

    memPortTokens = new TokenManager(p.max_cu_tokens);

    lastExecCycle.resize(numVectorALUs, 0);
    for (int i = 0; i < vrf.size(); ++i) {
        vrf[i]->setParent(this);
    }

    for (int i = 0; i < srf.size(); ++i) {
        srf[i]->setParent(this);
    }
    numVecRegsPerSimd = vrf[0]->numRegs();
    numScalarRegsPerSimd = srf[0]->numRegs();

    registerManager->setParent(this);

    instExecPerSimd.resize(numVectorALUs, 0);
221 "Cache line size should be a power of two.");
222 cacheLineBits =
floorLog2(_cacheLineSize);
    w->workGroupSz[0] = task->wgSize(0);
    w->workGroupSz[1] = task->wgSize(1);
    w->workGroupSz[2] = task->wgSize(2);
    w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];

    w->computeActualWgSz(task);
    static int _n_wave = 0;

    if (k + waveId * wfSize() < w->actualWgSzTotal)
        init_mask[k] = 1;

    w->execMask() = init_mask;

    w->initMask = init_mask.to_ullong();
        w->barrierId(bar_id);
    } else {
        assert(!w->hasBarrier());
    }
        w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
        w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
                              w->actualWgSz[1];
        w->workItemId[2][k] = (k + waveId * wfSize()) /
                              (w->actualWgSz[0] * w->actualWgSz[1]);

        w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
            w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
            w->workItemId[0][k];
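        // Worked example (hypothetical sizes): with actualWgSz = {16, 4, 2}
        // and flattened work-item 37, the mapping above yields
        // id0 = 37 % 16 = 5, id1 = (37 / 16) % 4 = 2, id2 = 37 / 64 = 0,
        // and workItemFlatId = 0*16*4 + 2*16 + 5 = 37, i.e. its inverse.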
    w->workGroupId[0] = w->wgId % task->numWg(0);
    w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1);
    w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1));
    w->ldsChunk = ldsChunk;

    [[maybe_unused]] int32_t refCount =
        lds.increaseRefCounter(w->dispatchId, w->wgId);
    DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
            cu_id, w->wgId, refCount);

    w->instructionBuffer.clear();
    DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
            "WF[%d][%d]. Ref cnt:%d\n", _n_wave, w->barrierId(), cu_id,
            w->simdId, w->wfSlotId, refCount);

    w->initRegState(task, w->actualWgSzTotal);
        = std::make_shared<GPUDynInst>(this, nullptr,

    gpuDynInst->kern_id = kernId;

    req->setContext(gpuDynInst->wfDynId);
    DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id);

    panic_if(!ldsChunk, "was not able to reserve space for this WG");
    if (num_wfs_in_wg > 1) {

        assert(!wf_barrier.maxBarrierCnt());
        assert(!wf_barrier.numAtBarrier());
        wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);

        DPRINTF(GPUSync, "CU[%d] - Dispatching WG with barrier Id%d. "
                "%d waves using this barrier.\n", cu_id, barrier_id,
                num_wfs_in_wg);
    }
    DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] "
            "vregDemand[%d] sregDemand[%d]\n", i, j, w->wfDynId,
            vregDemand, sregDemand);
508 "Instruction Buffer of WF%d can't be empty",
w->wgId);
517 "Instruction Buffer of WF%d can't be empty",
w->wgId);
    auto it = pipeMap.find(ii->seqNum());
    int trueWgSizeTotal = 1;

        trueWgSizeTotal *= trueWgSize[d];
        DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);

    DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);

    int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
    num_wfs_in_wg = numWfs;
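    // Ceiling-division example: a 100 work-item WG on a 64-lane wavefront
    // needs numWfs = (100 + 63) / 64 = 2 wavefronts; the second wave runs
    // partially populated, with its surplus lanes masked off.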
    bool barrier_avail = true;

    if (freeBarrierIds.empty()) {
        barrier_avail = false;
    }
562 "WG with %d WFs and %d VGPRs per WI can not be allocated to CU "
563 "that has %d VGPRs\n",
566 "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU "
    int numMappedWfs = 0;

    if (numMappedWfs < numWfs &&

    assert(numMappedWfs <= numWfs);
    bool vregAvail = true;
    bool sregAvail = true;

    if (numMappedWfs < numWfs) {
    DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, "
            "VGPR Availability = %d, SGPR Availability = %d\n",
            freeWfSlots, numMappedWfs, vregAvail, sregAvail);
    if (!barrier_avail) {

    bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
                        && ldsAvail && barrier_avail;
    return wf_barrier.numYetToReachBarrier();

    return wf_barrier.allAtBarrier();

    wf_barrier.incNumAtBarrier();

    return wf_barrier.numAtBarrier();

    return wf_barrier.maxBarrierCnt();

    wf_barrier.decMaxBarrierCnt();

    wf_barrier.release();
    for (auto &vecRegFile : vrf) {

    for (auto &scRegFile : srf) {
777 "No support for multiple Global Memory Pipelines exists!!!");
784 "No support for multiple Local Memory Pipelines exists!!!");
791 "No support for multiple Scalar Memory Pipelines exists!!!");
    if (gpuDynInst->isKernelLaunch()) {
        assert(pkt->req->isKernel());
        assert(pkt->req->isInvL1());

               && gpuDynInst->isEndOfKernel()) {
        assert(pkt->req->isKernel());
        assert(pkt->req->isGL2CacheFlush());
    DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
            computeUnit->cu_id, w->simdId, w->wfSlotId,
            w->wfDynId, w->wgId);
    if (!pkt->req->isKernel()) {
        w = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
        DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing "
                "outstanding reqs %d => %d\n", gpuDynInst->simdId,
                gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
                gpuDynInst->disassemble(), w->outstandingReqs,
                w->outstandingReqs - 1);
        computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
    }
        computeUnit->memPort[index].createMemRespEvent(pkt);

    DPRINTF(GPUPort,
            "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
            computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
            gpuDynInst->seqNum(), index, pkt->req->getPaddr());

    computeUnit->schedule(mem_resp_event,
                          curTick() + computeUnit->resp_tick_latency);
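    // The return latency is modeled purely by delaying this event: e.g.,
    // assuming mem_resp_latency = 2 and a 1 ns clock period,
    // resp_tick_latency = 2 ns, so the response is processed two cycles
    // after it arrives at the port.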
    return handleResponse(pkt);

    assert(!pkt->req->isKernel());

    assert(gpuDynInst->numScalarReqs > 0);

    gpuDynInst->numScalarReqs--;
    if (!gpuDynInst->numScalarReqs) {
        if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
            computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
                gpuDynInst);
        } else {
            computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
                gpuDynInst);
        }
    }
    for (const auto &pkt : retries) {
        if (!sendTimingReq(pkt)) {
    int len = retries.size();

    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = retries.front().first;
        [[maybe_unused]] GPUDynInstPtr gpuDynInst = retries.front().second;
        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
                computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                pkt->req->getPaddr());

        if (!sendTimingReq(pkt)) {
            DPRINTF(GPUMem, "failed again!\n");
            break;
        } else {
            DPRINTF(GPUMem, "successful!\n");
            retries.pop_front();
        }
    }
    computeUnit->handleSQCReturn(pkt);

    int len = retries.size();

    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = retries.front().first;
        [[maybe_unused]] Wavefront *wavefront = retries.front().second;
        DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
                computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
                pkt->req->getPaddr());
        if (!sendTimingReq(pkt)) {
            DPRINTF(GPUFetch, "failed again!\n");
            break;
        } else {
            DPRINTF(GPUFetch, "successful!\n");
            retries.pop_front();
        }
    }
    Addr tmp_vaddr = pkt->req->getVaddr();

    pkt->req->setPC(gpuDynInst->wavefront()->pc());

    pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());
    } else if (pkt->isRead()) {

    } else {
        fatal("pkt is neither a read nor a write\n");
    }
    unsigned size = pkt->getSize();

        panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
              cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
        if (!p->pTable->translate(vaddr, paddr)) {
            if (!p->fixupFault(vaddr)) {
                panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
                      cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                      vaddr);
            }
        }
        tlbPort[tlbPort_index].sendFunctional(pkt);

        int hit_level = translation_state->hitLevel;
        assert(hit_level != -1);
            safe_cast<GpuTranslationState*>(pkt->senderState);

        delete sender_state->saved;
        delete sender_state;

        assert(pkt->req->hasPaddr());
        assert(pkt->req->hasSize());
        uint8_t *tmpData = oldPkt->getPtr<uint8_t>();

        gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
        gpuDynInst->tlbHitLevel[index] = hit_level;
        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
                "scheduled\n", cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
    } else if (tlbPort[tlbPort_index].isStalled()) {
        assert(tlbPort[tlbPort_index].retries.size() > 0);

        DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
                "failed!\n", cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, tmp_vaddr);

        tlbPort[tlbPort_index].retries.push_back(pkt);
    } else if (!tlbPort[tlbPort_index].sendTimingReq(pkt)) {
        tlbPort[tlbPort_index].stallPort();

        DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
                "failed!\n", cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, tmp_vaddr);

        tlbPort[tlbPort_index].retries.push_back(pkt);
    } else {
        DPRINTF(GPUTLB,
                "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
                cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
    }
    gpuDynInst->resetEntireStatusVector();

    gpuDynInst->decrementStatusVector(index);

    tlbPort[tlbPort_index].sendFunctional(pkt);

    memPort[0].sendFunctional(new_pkt);
    DPRINTF(GPUMem, "Functional sendRequest\n");
    DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
            gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
            new_pkt->req->getPaddr());

        safe_cast<GpuTranslationState*>(pkt->senderState);
    DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n",
            pkt->req->getVaddr());

    assert(gpuDynInst->isGlobalSeg() ||
           gpuDynInst->executedAs() == enums::SC_GLOBAL);
    req = std::make_shared<Request>(

    if (kernelMemSync) {
        if (gpuDynInst->isKernelLaunch()) {

            req->setReqInstSeqNum(gpuDynInst->seqNum());
                memPort[0].createMemReqEvent(pkt);

            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
                    "an acquire\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
            assert(gpuDynInst->isEndOfKernel());

            req->setReqInstSeqNum(gpuDynInst->seqNum());

                memPort[0].createMemReqEvent(pkt);

            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
                    "a release\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
        gpuDynInst->setRequestFlags(req);

        req->setReqInstSeqNum(gpuDynInst->seqNum());

            memPort[0].createMemReqEvent(pkt);

        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
                cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
                pkt->req->getPaddr());
        safe_cast<DataPort::SenderState*>(pkt->senderState);

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
            compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
            pkt->req->getPaddr(), id);

    Addr paddr = pkt->req->getPaddr();
    int index = gpuDynInst->memStatusVector[paddr].back();

    DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
            pkt->req->getPaddr(), id);

    gpuDynInst->memStatusVector[paddr].pop_back();
    gpuDynInst->pAddr = pkt->req->getPaddr();

    gpuDynInst->decrementStatusVector(index);
    DPRINTF(GPUMem, "bitvector is now %s\n", gpuDynInst->printStatusVector());
    if (gpuDynInst->allLanesZero()) {
        auto iter = gpuDynInst->memStatusVector.begin();
        auto end = gpuDynInst->memStatusVector.end();

        while (iter != end) {
            assert(iter->second.empty());
            ++iter;
        }
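        // memStatusVector maps each line address to the lane indices still
        // waiting on it, so once allLanesZero() holds, every per-address
        // list must already be drained; the walk above only asserts that.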
        if (compute_unit->headTailMap.count(gpuDynInst)) {

        gpuDynInst->memStatusVector.clear();

        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
                compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId);
        if (!compute_unit->headTailMap.count(gpuDynInst)) {
            compute_unit->headTailMap
                .insert(std::make_pair(gpuDynInst, curTick()));
        }
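        // Head-tail bookkeeping sketch: the first response for a dynamic
        // instruction records curTick() here; when its last response
        // arrives, the elapsed ticks are sampled into the headTailLatency
        // distribution ("ticks between first and last cache block arrival").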
    Addr line = pkt->req->getPaddr();

    DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
            pkt->req->getVaddr(), line);

    computeUnit->stats.tlbCycles += curTick();
        safe_cast<GpuTranslationState*>(pkt->senderState);

    if (!translation_state->tlbEntry) {
            safe_cast<DTLBPort::SenderState*>(translation_state->saved);

        computeUnit->wfList[sender_state->_gpuDynInst->simdId]
        DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
                 pkt->req->getVaddr());
    int hit_level = translation_state->hitLevel;
    computeUnit->stats.hitsPerTLBLevel[hit_level]++;

    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());

    delete translation_state;
        safe_cast<DTLBPort::SenderState*>(pkt->senderState);

    gpuDynInst->memStatusVector[line].push_back(mp_index);
    gpuDynInst->tlbHitLevel[mp_index] = hit_level;

        panic("unsupported response to request conversion %s\n",
    if (computeUnit->prefetchDepth) {
        int simdId = gpuDynInst->simdId;
        int wfSlotId = gpuDynInst->wfSlotId;

        switch (computeUnit->prefetchType) {
          case enums::PF_CU:
            last = computeUnit->lastVaddrCU[mp_index];
            break;
          case enums::PF_PHASE:
            last = computeUnit->lastVaddrSimd[simdId][mp_index];
            break;
          case enums::PF_WF:
            last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
          default:
            break;
        }
        DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
                computeUnit->cu_id, simdId, wfSlotId, mp_index, last);

        computeUnit->lastVaddrCU[mp_index] = vaddr;
        computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
        computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;

        stride = (computeUnit->prefetchType == enums::PF_STRIDE) ?
            computeUnit->prefetchStride : stride;
        DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
                computeUnit->cu_id, simdId, wfSlotId, mp_index);

        for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {

            RequestPtr prefetch_req = std::make_shared<Request>(
                computeUnit->requestorId(),

                computeUnit->shader->gpuTc, true);

            sendFunctional(prefetch_pkt);

                safe_cast<GpuTranslationState*>(

            delete prefetch_pkt;
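            // Prefetch sketch (assuming PF_STRIDE with stride 1 and
            // prefetchDepth 4): after a demand access to page P, functional
            // translations are issued for pages P+1 through P+4; a negative
            // stride walks pages in the opposite direction.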
    if (new_pkt->req->systemReq()) {
        if (!gpuDynInst->isSystemReq()) {
            computeUnit->getTokenManager()->recvTokens(1);
            gpuDynInst->setSystemReq();
        }
    } else {
        new_pkt->req->requestorId(computeUnit->vramRequestorId());
    }
        computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
            computeUnit->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());

    computeUnit->schedule(mem_req_event, curTick() +
                          computeUnit->req_tick_latency);
        [this, pkt]{ processMemReqEvent(pkt); },
        "ComputeUnit memory request event", true);

        [this, pkt]{ processMemRespEvent(pkt); },
        "ComputeUnit memory response event", true);
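// Usage sketch: the wrapped lambda captures the packet, so callers only
// schedule the returned event, e.g.
//
//   EventFunctionWrapper *evt = memPort[idx].createMemReqEvent(pkt);
//   schedule(evt, curTick() + req_tick_latency);
//
// The trailing 'true' marks the wrapper auto-delete, freeing it after it
// has been processed.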
    [[maybe_unused]] ComputeUnit *compute_unit = computeUnit;

    if (pkt->req->systemReq()) {

    } else if (!(sendTimingReq(pkt))) {
        retries.push_back(std::make_pair(pkt, gpuDynInst));

        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
                compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                id, pkt->req->getPaddr());
1645 "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data "
1646 "req sent!\n", compute_unit->
cu_id, gpuDynInst->simdId,
1647 gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
id,
1648 pkt->
req->getPaddr());
1655 return "ComputeUnit scalar memory request event";
1661 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1665 if (pkt->req->systemReq()) {
1673 "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
1674 compute_unit->
cu_id, gpuDynInst->simdId,
1675 gpuDynInst->wfSlotId, pkt->req->getPaddr());
1678 "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data "
1679 "req sent!\n", compute_unit->
cu_id, gpuDynInst->simdId,
1680 gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
1681 pkt->req->getPaddr());
    int len = retries.size();

    DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, len);

    assert(isStalled());

    for (int i = 0; i < len; ++i) {
        DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x",
                computeUnit->cu_id, vaddr);

        if (!sendTimingReq(pkt)) {
            DPRINTF(GPUTLB, ": failed again\n");
            break;
        } else {
            DPRINTF(GPUTLB, ": successful\n");
            retries.pop_front();
        }
    }
        safe_cast<GpuTranslationState*>(pkt->senderState);

    fatal_if(!translation_state->tlbEntry,
             "Translation of vaddr %#x failed\n", pkt->req->getVaddr());

    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());

    delete translation_state;

        safe_cast<ScalarDTLBPort::SenderState*>(pkt->senderState);
    [[maybe_unused]] Wavefront *w = gpuDynInst->wavefront();

    DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
            "translation: VA %#x -> PA %#x\n", computeUnit->cu_id, w->simdId,
            w->wfSlotId, w->kernId, pkt->req->getVaddr(), pkt->req->getPaddr());
    fatal("Scalar DTLB received unexpected MemCmd response %s\n",
    if (req_pkt->req->systemReq()) {
        gpuDynInst->setSystemReq();
    } else {
        req_pkt->req->requestorId(computeUnit->vramRequestorId());
    }

        (computeUnit->scalarDataPort, req_pkt);
    computeUnit->schedule(scalar_mem_req_event, curTick() +
                          computeUnit->req_tick_latency);
    [[maybe_unused]] Addr line = pkt->req->getPaddr();
    DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
            computeUnit->cu_id, pkt->req->getVaddr(), line);

        = safe_cast<GpuTranslationState*>(pkt->senderState);

    bool success = translation_state->tlbEntry != nullptr;
    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());

    delete translation_state;
        safe_cast<ITLBPort::SenderState*>(pkt->senderState);

    computeUnit->fetchStage.fetch(pkt, wavefront);
    int len = retries.size();
    DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, len);

    assert(isStalled());

    for (int i = 0; i < len; ++i) {
        DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x",
                computeUnit->cu_id, vaddr);

        if (!sendTimingReq(pkt)) {
            DPRINTF(GPUTLB, ": failed again\n");
            break;
        } else {
            DPRINTF(GPUTLB, ": successful\n");
            retries.pop_front();
        }
    }
    if (gpuDynInst->isScalar()) {
        if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
            stats.sALUInsts++;
            stats.instCyclesSALU++;
        } else if (gpuDynInst->isLoad()) {
            stats.scalarMemReads++;
        } else if (gpuDynInst->isStore()) {
            stats.scalarMemWrites++;
        }
    } else {
        if (gpuDynInst->isALU()) {
            stats.vALUInsts++;
            stats.instCyclesVALU++;
            stats.threadCyclesVALU
                += gpuDynInst->wavefront()->execMask().count();
        } else if (gpuDynInst->isFlat()) {
            if (gpuDynInst->isLocalMem()) {
                stats.flatLDSInsts++;
            } else {
                stats.flatVMemInsts++;
            }
        } else if (gpuDynInst->isFlatGlobal()) {
            stats.flatVMemInsts++;
        } else if (gpuDynInst->isLocalMem()) {
            stats.ldsNoFlatInsts++;
        } else if (gpuDynInst->isLoad()) {
            stats.vectorMemReads++;
        } else if (gpuDynInst->isStore()) {
            stats.vectorMemWrites++;
        }
        if (gpuDynInst->isLoad()) {
            switch (gpuDynInst->executedAs()) {
              case enums::SC_SPILL: stats.spillReads++; break;
              case enums::SC_GLOBAL: stats.globalReads++; break;
              case enums::SC_GROUP: stats.groupReads++; break;
              case enums::SC_PRIVATE: stats.privReads++; break;
              case enums::SC_READONLY: stats.readonlyReads++; break;
              case enums::SC_KERNARG: stats.kernargReads++; break;
              default:
                fatal("%s has no valid segment\n", gpuDynInst->disassemble());
            }
else if (gpuDynInst->isStore()) {
1948 switch (gpuDynInst->executedAs()) {
1949 case enums::SC_SPILL:
1952 case enums::SC_GLOBAL:
1955 case enums::SC_GROUP:
1958 case enums::SC_PRIVATE:
1961 case enums::SC_READONLY:
1964 case enums::SC_KERNARG:
1977 fatal(
"%s has no valid segment\n", gpuDynInst->disassemble());
    *page_stat_file << "page, wavefront accesses, workitem accesses" <<
        std::endl;

        *page_stat_file << std::hex << iter.first << ",";
        *page_stat_file << std::dec << iter.second.first << ",";
        *page_stat_file << std::dec << iter.second.second << std::endl;
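    // Example output line (hypothetical values): "7f1a2b000,12,384" means
    // the page at 0x7f1a2b000 was touched by 12 wavefront-level and 384
    // work-item-level accesses.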
                       const uint32_t wgId) const

    for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
    RequestPtr newRequest = std::make_shared<Request>();
    newRequest->setPaddr(0x0);

    fatal_if(!senderState, "did not get the right sort of sender state");

    computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
    fatal_if(!sender_state, "packet without a valid sender state");

    fatal_if(retries.empty(), "must have retries waiting to be stalled");
    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
            computeUnit->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId);

        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, pkt->req->getPaddr());

        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, pkt->req->getPaddr());
    auto queueSize = retries.size();

    DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, queueSize);

    fatal_if(queueSize < 1,
             "why was there a recvReqRetry() with no pending reqs?");
    fatal_if(!isStalled(),
             "recvReqRetry() happened when the port was not stalled");
    while (!retries.empty()) {

        DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);

            DPRINTF(GPUPort, ": LDS send failed again\n");
            break;
        } else {
            DPRINTF(GPUPort, ": LDS send successful\n");
    : statistics::Group(parent),
      ADD_STAT(vALUInsts, "Number of vector ALU insts issued."),
      ADD_STAT(vALUInstsPerWF, "The avg. number of vector ALU insts issued "
               "per-wavefront."),
      ADD_STAT(sALUInsts, "Number of scalar ALU insts issued."),
      ADD_STAT(sALUInstsPerWF, "The avg. number of scalar ALU insts issued "
               "per-wavefront."),
      ADD_STAT(instCyclesVALU,
               "Number of cycles needed to execute VALU insts."),
      ADD_STAT(instCyclesSALU,
               "Number of cycles needed to execute SALU insts."),
      ADD_STAT(threadCyclesVALU, "Number of thread cycles used to execute "
               "vector ALU ops. Similar to instCyclesVALU but multiplied by "
               "the number of active threads."),
2210 "Percentage of active vector ALU threads in a wave."),
2211 ADD_STAT(ldsNoFlatInsts,
"Number of LDS insts issued, not including FLAT"
2212 " accesses that resolve to LDS."),
2213 ADD_STAT(ldsNoFlatInstsPerWF,
"The avg. number of LDS insts (not "
2214 "including FLAT accesses that resolve to LDS) per-wavefront."),
2216 "The number of FLAT insts that resolve to vmem issued."),
2217 ADD_STAT(flatVMemInstsPerWF,
"The average number of FLAT insts that "
2218 "resolve to vmem issued per-wavefront."),
2220 "The number of FLAT insts that resolve to LDS issued."),
2221 ADD_STAT(flatLDSInstsPerWF,
"The average number of FLAT insts that "
2222 "resolve to LDS issued per-wavefront."),
2224 "Number of vector mem write insts (excluding FLAT insts)."),
2225 ADD_STAT(vectorMemWritesPerWF,
"The average number of vector mem write "
2226 "insts (excluding FLAT insts) per-wavefront."),
2228 "Number of vector mem read insts (excluding FLAT insts)."),
2229 ADD_STAT(vectorMemReadsPerWF,
"The avg. number of vector mem read insts "
2230 "(excluding FLAT insts) per-wavefront."),
2231 ADD_STAT(scalarMemWrites,
"Number of scalar mem write insts."),
2233 "The average number of scalar mem write insts per-wavefront."),
2234 ADD_STAT(scalarMemReads,
"Number of scalar mem read insts."),
2236 "The average number of scalar mem read insts per-wavefront."),
      ADD_STAT(vectorMemReadsPerKiloInst,
               "Number of vector mem reads per kilo-instruction"),
      ADD_STAT(vectorMemWritesPerKiloInst,
               "Number of vector mem writes per kilo-instruction"),
      ADD_STAT(vectorMemInstsPerKiloInst,
               "Number of vector mem insts per kilo-instruction"),
      ADD_STAT(scalarMemReadsPerKiloInst,
               "Number of scalar mem reads per kilo-instruction"),
      ADD_STAT(scalarMemWritesPerKiloInst,
               "Number of scalar mem writes per kilo-instruction"),
      ADD_STAT(scalarMemInstsPerKiloInst,
               "Number of scalar mem insts per kilo-instruction"),
      ADD_STAT(instCyclesVMemPerSimd, "Number of cycles to send address, "
               "command, data from VRF to vector memory unit, per SIMD"),
      ADD_STAT(instCyclesScMemPerSimd, "Number of cycles to send address, "
               "command, data from SRF to scalar memory unit, per SIMD"),
      ADD_STAT(instCyclesLdsPerSimd, "Number of cycles to send address, "
               "command, data from VRF to LDS unit, per SIMD"),
      ADD_STAT(globalReads, "Number of reads to the global segment"),
      ADD_STAT(globalWrites, "Number of writes to the global segment"),
      ADD_STAT(globalMemInsts,
               "Number of memory instructions sent to the global segment"),
      ADD_STAT(argReads, "Number of reads to the arg segment"),
      ADD_STAT(argWrites, "Number of writes to the arg segment"),
      ADD_STAT(argMemInsts,
               "Number of memory instructions sent to the arg segment"),
      ADD_STAT(spillReads, "Number of reads to the spill segment"),
      ADD_STAT(spillWrites, "Number of writes to the spill segment"),
      ADD_STAT(spillMemInsts,
               "Number of memory instructions sent to the spill segment"),
      ADD_STAT(groupReads, "Number of reads to the group segment"),
      ADD_STAT(groupWrites, "Number of writes to the group segment"),
      ADD_STAT(groupMemInsts,
               "Number of memory instructions sent to the group segment"),
      ADD_STAT(privReads, "Number of reads to the private segment"),
      ADD_STAT(privWrites, "Number of writes to the private segment"),
      ADD_STAT(privMemInsts,
               "Number of memory instructions sent to the private segment"),
      ADD_STAT(readonlyReads, "Number of reads to the readonly segment"),
      ADD_STAT(readonlyWrites,
               "Number of writes to the readonly segment"),
      ADD_STAT(readonlyMemInsts,
               "Number of memory instructions sent to the readonly segment"),
      ADD_STAT(kernargReads, "Number of reads sent to the kernarg segment"),
      ADD_STAT(kernargWrites,
               "Number of writes sent to the kernarg segment"),
      ADD_STAT(kernargMemInsts,
               "Number of memory instructions sent to the kernarg segment"),
2286 "wave level parallelism: count of active waves at wave launch"),
2287 ADD_STAT(tlbRequests,
"number of uncoalesced requests"),
2289 "total number of cycles for all uncoalesced requests"),
2290 ADD_STAT(tlbLatency,
"Avg. translation latency for data translations"),
2292 "TLB hits distribution (0 for page table, x for Lx-TLB)"),
2293 ADD_STAT(ldsBankAccesses,
"Total number of LDS bank accesses"),
2295 "Number of bank conflicts per LDS memory packet"),
2297 "pages touched per wf (over all mem. instr.)"),
2299 "dynamic non-flat global memory instruction count"),
2301 "dynamic flat global memory instruction count"),
2302 ADD_STAT(dynamicLMemInstrCnt,
"dynamic local memory intruction count"),
2303 ADD_STAT(wgBlockedDueBarrierAllocation,
2304 "WG dispatch was blocked due to lack of barrier resources"),
2305 ADD_STAT(wgBlockedDueLdsAllocation,
2306 "Workgroup blocked due to LDS capacity"),
2307 ADD_STAT(numInstrExecuted,
"number of instructions executed"),
      ADD_STAT(execRateDist, "Instruction Execution Rate: Number of executed "
               "vector instructions per cycle"),
      ADD_STAT(numVecOpsExecuted,
               "number of vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedF16,
               "number of f16 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedF32,
               "number of f32 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedF64,
               "number of f64 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedFMA16,
               "number of fma16 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedFMA32,
               "number of fma32 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedFMA64,
               "number of fma64 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedMAC16,
               "number of mac16 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedMAC32,
               "number of mac32 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedMAC64,
               "number of mac64 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedMAD16,
               "number of mad16 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedMAD32,
               "number of mad32 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedMAD64,
               "number of mad64 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedTwoOpFP,
               "number of two op FP vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(totalCycles, "number of cycles the CU ran for"),
      ADD_STAT(vpc, "Vector Operations per cycle (this CU only)"),
      ADD_STAT(vpc_f16, "F16 Vector Operations per cycle (this CU only)"),
      ADD_STAT(vpc_f32, "F32 Vector Operations per cycle (this CU only)"),
      ADD_STAT(vpc_f64, "F64 Vector Operations per cycle (this CU only)"),
      ADD_STAT(ipc, "Instructions per cycle (this CU only)"),
      ADD_STAT(controlFlowDivergenceDist, "number of lanes active per "
               "instruction (over all instructions)"),
      ADD_STAT(activeLanesPerGMemInstrDist,
               "number of active lanes per global memory instruction"),
      ADD_STAT(activeLanesPerLMemInstrDist,
               "number of active lanes per local memory instruction"),
2351 "Number of dynamic non-GM memory insts executed"),
2352 ADD_STAT(numTimesWgBlockedDueVgprAlloc,
"Number of times WGs are "
2353 "blocked due to VGPR allocation per SIMD"),
2354 ADD_STAT(numTimesWgBlockedDueSgprAlloc,
"Number of times WGs are "
2355 "blocked due to SGPR allocation per SIMD"),
2356 ADD_STAT(numCASOps,
"number of compare and swap operations"),
2358 "number of compare and swap operations that failed"),
2359 ADD_STAT(completedWfs,
"number of completed wavefronts"),
2360 ADD_STAT(completedWGs,
"number of completed workgroups"),
2361 ADD_STAT(headTailLatency,
"ticks between first and last cache block "
2362 "arrival at coalescer"),
2363 ADD_STAT(instInterleave,
"Measure of instruction interleaving per SIMD")
    for (int i = 0; i < 4; ++i) {