38 #include "debug/GPUDisp.hh"
39 #include "debug/GPUExec.hh"
40 #include "debug/GPUFetch.hh"
41 #include "debug/GPUMem.hh"
42 #include "debug/GPUPort.hh"
43 #include "debug/GPUPrefetch.hh"
44 #include "debug/GPUReg.hh"
45 #include "debug/GPURename.hh"
46 #include "debug/GPUSync.hh"
47 #include "debug/GPUTLB.hh"
65 numVectorGlobalMemUnits(
p.num_global_mem_pipes),
66 numVectorSharedMemUnits(
p.num_shared_mem_pipes),
67 numScalarMemUnits(
p.num_scalar_mem_pipes),
68 numVectorALUs(
p.num_SIMDs),
69 numScalarALUs(
p.num_scalar_cores),
70 vrfToCoalescerBusWidth(
p.vrf_to_coalescer_bus_width),
71 coalescerToVrfBusWidth(
p.coalescer_to_vrf_bus_width),
72 registerManager(
p.register_manager),
74 scoreboardCheckStage(
p, *this, scoreboardCheckToSchedule),
75 scheduleStage(
p, *this, scoreboardCheckToSchedule, scheduleToExecute),
76 execStage(
p, *this, scheduleToExecute),
77 globalMemoryPipe(
p, *this),
78 localMemoryPipe(
p, *this),
79 scalarMemoryPipe(
p, *this),
80 tickEvent([this]{
exec(); },
"Compute unit tick event",
83 vrf(
p.vector_register_file), srf(
p.scalar_register_file),
84 simdWidth(
p.simd_width),
85 spBypassPipeLength(
p.spbypass_pipe_length),
86 dpBypassPipeLength(
p.dpbypass_pipe_length),
87 scalarPipeStages(
p.scalar_pipe_length),
88 operandNetworkLength(
p.operand_network_length),
89 issuePeriod(
p.issue_period),
90 vrf_gm_bus_latency(
p.vrf_gm_bus_latency),
91 srf_scm_bus_latency(
p.srf_scm_bus_latency),
92 vrf_lm_bus_latency(
p.vrf_lm_bus_latency),
93 perLaneTLB(
p.perLaneTLB), prefetchDepth(
p.prefetch_depth),
94 prefetchStride(
p.prefetch_stride), prefetchType(
p.prefetch_prev_type),
95 debugSegFault(
p.debugSegFault),
96 functionalTLB(
p.functionalTLB), localMemBarrier(
p.localMemBarrier),
97 countPages(
p.countPages),
98 req_tick_latency(
p.mem_req_latency *
p.clk_domain->clockPeriod()),
99 resp_tick_latency(
p.mem_resp_latency *
p.clk_domain->clockPeriod()),
100 _requestorId(
p.system->getRequestorId(
this,
"ComputeUnit")),
101 lds(*
p.localDataStore), gmTokenPort(
name() +
".gmTokenPort",
this),
107 _cacheLineSize(
p.system->cacheLineSize()),
108 _numBarrierSlots(
p.num_barrier_slots),
109 globalSeqNum(0), wavefrontSize(
p.wf_size),
110 scoreboardCheckToSchedule(
p),
111 scheduleToExecute(
p),
123 fatal_if(
p.wf_size > std::numeric_limits<unsigned long long>::digits ||
125 "WF size is larger than the host can support");
127 "Wavefront size should be a power of 2");
130 numCyclesPerStoreTransfer =
131 (uint32_t)ceil((
double)(wfSize() *
sizeof(uint32_t)) /
132 (double)vrfToCoalescerBusWidth);
134 numCyclesPerLoadTransfer = (wfSize() *
sizeof(uint32_t))
135 / coalescerToVrfBusWidth;
138 idleWfs =
p.n_wf * numVectorALUs;
139 lastVaddrWF.resize(numVectorALUs);
140 wfList.resize(numVectorALUs);
142 wfBarrierSlots.resize(
p.num_barrier_slots,
WFBarrier());
144 for (
int i = 0;
i <
p.num_barrier_slots; ++
i) {
145 freeBarrierIds.insert(
i);
148 for (
int j = 0;
j < numVectorALUs; ++
j) {
149 lastVaddrWF[
j].resize(
p.n_wf);
151 for (
int i = 0;
i <
p.n_wf; ++
i) {
152 lastVaddrWF[
j][
i].resize(wfSize());
154 wfList[
j].push_back(
p.wavefronts[
j *
p.n_wf +
i]);
155 wfList[
j][
i]->setParent(
this);
157 for (
int k = 0;
k < wfSize(); ++
k) {
158 lastVaddrWF[
j][
i][
k] = 0;
163 lastVaddrSimd.resize(numVectorALUs);
165 for (
int i = 0;
i < numVectorALUs; ++
i) {
166 lastVaddrSimd[
i].resize(wfSize(), 0);
169 lastVaddrCU.resize(wfSize());
173 if (
p.execPolicy ==
"OLDEST-FIRST") {
175 }
else if (
p.execPolicy ==
"ROUND-ROBIN") {
178 fatal(
"Invalid WF execution policy (CU)\n");
181 for (
int i = 0;
i <
p.port_memory_port_connection_count; ++
i) {
185 for (
int i = 0;
i <
p.port_translation_port_connection_count; ++
i) {
191 memPortTokens =
new TokenManager(
p.max_cu_tokens);
195 lastExecCycle.resize(numVectorALUs, 0);
197 for (
int i = 0;
i < vrf.size(); ++
i) {
198 vrf[
i]->setParent(
this);
200 for (
int i = 0;
i < srf.size(); ++
i) {
201 srf[
i]->setParent(
this);
203 numVecRegsPerSimd = vrf[0]->numRegs();
204 numScalarRegsPerSimd = srf[0]->numRegs();
206 registerManager->setParent(
this);
210 instExecPerSimd.resize(numVectorALUs, 0);
214 "Cache line size should be a power of two.");
215 cacheLineBits =
floorLog2(_cacheLineSize);
298 w->workGroupSz[0] = task->
wgSize(0);
299 w->workGroupSz[1] = task->
wgSize(1);
300 w->workGroupSz[2] = task->
wgSize(2);
301 w->wgSz =
w->workGroupSz[0] *
w->workGroupSz[1] *
w->workGroupSz[2];
305 w->computeActualWgSz(task);
312 static int _n_wave = 0;
318 if (
k + waveId *
wfSize() <
w->actualWgSzTotal)
322 w->execMask() = init_mask;
326 w->initMask = init_mask.to_ullong();
329 w->barrierId(bar_id);
331 assert(!
w->hasBarrier());
335 w->workItemId[0][
k] = (
k + waveId *
wfSize()) %
w->actualWgSz[0];
336 w->workItemId[1][
k] = ((
k + waveId *
wfSize()) /
w->actualWgSz[0]) %
338 w->workItemId[2][
k] = (
k + waveId *
wfSize()) /
339 (
w->actualWgSz[0] *
w->actualWgSz[1]);
341 w->workItemFlatId[
k] =
w->workItemId[2][
k] *
w->actualWgSz[0] *
342 w->actualWgSz[1] +
w->workItemId[1][
k] *
w->actualWgSz[0] +
349 w->workGroupId[0] =
w->wgId % task->
numWg(0);
350 w->workGroupId[1] = (
w->wgId / task->
numWg(0)) % task->
numWg(1);
351 w->workGroupId[2] =
w->wgId / (task->
numWg(0) * task->
numWg(1));
354 w->ldsChunk = ldsChunk;
356 [[maybe_unused]] int32_t refCount =
358 DPRINTF(GPUDisp,
"CU%d: increase ref ctr wg[%d] to [%d]\n",
359 cu_id,
w->wgId, refCount);
361 w->instructionBuffer.clear();
366 DPRINTF(GPUDisp,
"Scheduling wfDynId/barrier_id %d/%d on CU%d: "
367 "WF[%d][%d]. Ref cnt:%d\n", _n_wave,
w->barrierId(),
cu_id,
368 w->simdId,
w->wfSlotId, refCount);
370 w->initRegState(task,
w->actualWgSzTotal);
385 = std::make_shared<GPUDynInst>(
this,
nullptr,
389 gpuDynInst->kern_id = kernId;
391 req->setContext(gpuDynInst->wfDynId);
424 DPRINTF(GPUDisp,
"CU%d: Scheduling wakeup next cycle\n",
cu_id);
438 panic_if(!ldsChunk,
"was not able to reserve space for this WG");
452 if (num_wfs_in_wg > 1) {
459 assert(!wf_barrier.maxBarrierCnt());
460 assert(!wf_barrier.numAtBarrier());
461 wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);
463 DPRINTF(GPUSync,
"CU[%d] - Dispatching WG with barrier Id%d. "
464 "%d waves using this barrier.\n",
cu_id, barrier_id,
484 DPRINTF(GPURename,
"SIMD[%d] wfSlotId[%d] WF[%d] "
485 "vregDemand[%d] sregDemand[%d]\n",
i,
j,
w->wfDynId,
486 vregDemand, sregDemand);
501 "Instruction Buffer of WF%d can't be empty",
w->wgId);
510 "Instruction Buffer of WF%d can't be empty",
w->wgId);
513 auto it =
pipeMap.find(ii->seqNum());
523 int trueWgSizeTotal = 1;
529 trueWgSizeTotal *= trueWgSize[
d];
530 DPRINTF(GPUDisp,
"trueWgSize[%d] = %d\n",
d, trueWgSize[
d]);
533 DPRINTF(GPUDisp,
"trueWgSizeTotal = %d\n", trueWgSizeTotal);
536 int numWfs = (trueWgSizeTotal +
wfSize() - 1) /
wfSize();
537 num_wfs_in_wg = numWfs;
539 bool barrier_avail =
true;
542 barrier_avail =
false;
555 "WG with %d WFs and %d VGPRs per WI can not be allocated to CU "
556 "that has %d VGPRs\n",
559 "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU "
566 int numMappedWfs = 0;
578 if (numMappedWfs < numWfs &&
592 assert(numMappedWfs <= numWfs);
594 bool vregAvail =
true;
595 bool sregAvail =
true;
597 if (numMappedWfs < numWfs) {
613 DPRINTF(GPUDisp,
"Free WF slots = %d, Mapped WFs = %d, \
614 VGPR Availability = %d, SGPR Availability = %d\n",
615 freeWfSlots, numMappedWfs, vregAvail, sregAvail);
632 if (!barrier_avail) {
641 bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
642 && ldsAvail && barrier_avail;
650 return wf_barrier.numYetToReachBarrier();
657 return wf_barrier.allAtBarrier();
664 wf_barrier.incNumAtBarrier();
671 return wf_barrier.numAtBarrier();
678 return wf_barrier.maxBarrierCnt();
692 wf_barrier.decMaxBarrierCnt();
699 wf_barrier.release();
722 for (
auto &vecRegFile :
vrf) {
726 for (
auto &scRegFile :
srf) {
770 "No support for multiple Global Memory Pipelines exists!!!");
777 "No support for multiple Local Memory Pipelines exists!!!");
784 "No support for multiple Scalar Memory Pipelines exists!!!");
822 if (gpuDynInst->isKernelLaunch()) {
825 assert(pkt->
req->isKernel());
826 assert(pkt->
req->isInvL1());
841 && gpuDynInst->isEndOfKernel()) {
847 assert(pkt->
req->isKernel());
848 assert(pkt->
req->isGL2CacheFlush());
864 DPRINTF(GPUDisp,
"CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
866 w->wfDynId,
w->wgId);
872 if (!pkt->
req->isKernel()) {
874 DPRINTF(GPUExec,
"MemSyncResp: WF[%d][%d] WV%d %s decrementing "
875 "outstanding reqs %d => %d\n", gpuDynInst->simdId,
876 gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
877 gpuDynInst->disassemble(),
w->outstandingReqs,
878 w->outstandingReqs - 1);
891 "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
893 gpuDynInst->seqNum(),
index, pkt->
req->getPaddr());
904 assert(!pkt->
req->isKernel());
911 assert(gpuDynInst->numScalarReqs > 0);
913 gpuDynInst->numScalarReqs--;
923 if (!gpuDynInst->numScalarReqs) {
924 if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
925 computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
928 computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
942 for (
const auto &pkt : retries) {
943 if (!sendTimingReq(pkt)) {
954 int len = retries.size();
958 for (
int i = 0;
i <
len; ++
i) {
960 [[maybe_unused]]
GPUDynInstPtr gpuDynInst = retries.front().second;
961 DPRINTF(GPUMem,
"CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
962 computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
963 pkt->
req->getPaddr());
968 if (!sendTimingReq(pkt)) {
969 DPRINTF(GPUMem,
"failed again!\n");
972 DPRINTF(GPUMem,
"successful!\n");
981 computeUnit->fetchStage.processFetchReturn(pkt);
988 int len = retries.size();
992 for (
int i = 0;
i <
len; ++
i) {
994 [[maybe_unused]]
Wavefront *wavefront = retries.front().second;
995 DPRINTF(GPUFetch,
"CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
997 pkt->
req->getPaddr());
998 if (!sendTimingReq(pkt)) {
999 DPRINTF(GPUFetch,
"failed again!\n");
1002 DPRINTF(GPUFetch,
"successful!\n");
1003 retries.pop_front();
1012 Addr tmp_vaddr = pkt->
req->getVaddr();
1017 pkt->
req->setPC(gpuDynInst->wavefront()->pc());
1019 pkt->
req->setReqInstSeqNum(gpuDynInst->seqNum());
1040 }
else if (pkt->
isRead()) {
1043 fatal(
"pkt is not a read nor a write\n");
1055 unsigned size = pkt->
getSize();
1058 panic(
"CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
1059 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
vaddr);
1064 if (!
p->pTable->translate(
vaddr, paddr)) {
1065 if (!
p->fixupFault(
vaddr)) {
1066 panic(
"CU%d: WF[%d][%d]: Fault on addr %#x!\n",
1067 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1084 tlbPort[tlbPort_index].sendFunctional(pkt);
1087 int hit_level = translation_state->
hitLevel;
1088 assert(hit_level != -1);
1093 safe_cast<GpuTranslationState*>(pkt->
senderState);
1096 delete sender_state->
saved;
1097 delete sender_state;
1099 assert(pkt->
req->hasPaddr());
1100 assert(pkt->
req->hasSize());
1110 uint8_t *tmpData = oldPkt->
getPtr<uint8_t>();
1121 gpuDynInst->memStatusVector[pkt->
getAddr()].push_back(
index);
1122 gpuDynInst->tlbHitLevel[
index] = hit_level;
1129 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: index %d, addr %#x data "
1130 "scheduled\n",
cu_id, gpuDynInst->simdId,
1131 gpuDynInst->wfSlotId,
index, pkt->
req->getPaddr());
1134 }
else if (
tlbPort[tlbPort_index].isStalled()) {
1135 assert(
tlbPort[tlbPort_index].retries.size() > 0);
1137 DPRINTF(GPUTLB,
"CU%d: WF[%d][%d]: Translation for addr %#x "
1138 "failed!\n",
cu_id, gpuDynInst->simdId,
1139 gpuDynInst->wfSlotId, tmp_vaddr);
1141 tlbPort[tlbPort_index].retries.push_back(pkt);
1142 }
else if (!
tlbPort[tlbPort_index].sendTimingReq(pkt)) {
1147 tlbPort[tlbPort_index].stallPort();
1149 DPRINTF(GPUTLB,
"CU%d: WF[%d][%d]: Translation for addr %#x "
1150 "failed!\n",
cu_id, gpuDynInst->simdId,
1151 gpuDynInst->wfSlotId, tmp_vaddr);
1153 tlbPort[tlbPort_index].retries.push_back(pkt);
1156 "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
1157 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
1161 gpuDynInst->resetEntireStatusVector();
1163 gpuDynInst->decrementStatusVector(
index);
1173 tlbPort[tlbPort_index].sendFunctional(pkt);
1183 memPort[0].sendFunctional(new_pkt);
1185 DPRINTF(GPUMem,
"Functional sendRequest\n");
1186 DPRINTF(GPUMem,
"CU%d: WF[%d][%d]: index %d: addr %#x\n",
cu_id,
1187 gpuDynInst->simdId, gpuDynInst->wfSlotId,
index,
1188 new_pkt->
req->getPaddr());
1192 safe_cast<GpuTranslationState*>(pkt->
senderState);
1222 DPRINTF(GPUTLB,
"sent scalar %s translation request for addr %#x\n",
1224 pkt->
req->getVaddr());
1233 assert(gpuDynInst->isGlobalSeg() ||
1234 gpuDynInst->executedAs() == enums::SC_GLOBAL);
1237 req = std::make_shared<Request>(
1246 if (kernelMemSync) {
1247 if (gpuDynInst->isKernelLaunch()) {
1249 req->setReqInstSeqNum(gpuDynInst->seqNum());
1256 memPort[0].createMemReqEvent(pkt);
1258 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1259 "an acquire\n",
cu_id, gpuDynInst->simdId,
1260 gpuDynInst->wfSlotId, 0, pkt->
req->getPaddr());
1267 assert(gpuDynInst->isEndOfKernel());
1270 req->setReqInstSeqNum(gpuDynInst->seqNum());
1277 memPort[0].createMemReqEvent(pkt);
1279 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1280 "a release\n",
cu_id, gpuDynInst->simdId,
1281 gpuDynInst->wfSlotId, 0, pkt->
req->getPaddr());
1286 gpuDynInst->setRequestFlags(req);
1288 req->setReqInstSeqNum(gpuDynInst->seqNum());
1295 memPort[0].createMemReqEvent(pkt);
1298 "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
1299 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
1300 pkt->
req->getPaddr());
1310 safe_cast<DataPort::SenderState*>(pkt->
senderState);
1317 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
1318 compute_unit->
cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1319 pkt->
req->getPaddr(),
id);
1321 Addr paddr = pkt->
req->getPaddr();
1335 int index = gpuDynInst->memStatusVector[paddr].back();
1337 DPRINTF(GPUMem,
"Response for addr %#x, index %d\n",
1338 pkt->
req->getPaddr(),
id);
1340 gpuDynInst->memStatusVector[paddr].pop_back();
1341 gpuDynInst->pAddr = pkt->
req->getPaddr();
1343 gpuDynInst->decrementStatusVector(
index);
1344 DPRINTF(GPUMem,
"bitvector is now %s\n", gpuDynInst->printStatusVector());
1346 if (gpuDynInst->allLanesZero()) {
1347 auto iter = gpuDynInst->memStatusVector.begin();
1348 auto end = gpuDynInst->memStatusVector.end();
1350 while (iter != end) {
1351 assert(iter->second.empty());
1358 if (compute_unit->
headTailMap.count(gpuDynInst)) {
1364 gpuDynInst->memStatusVector.clear();
1370 DPRINTF(GPUMem,
"CU%d: WF[%d][%d]: packet totally complete\n",
1371 compute_unit->
cu_id, gpuDynInst->simdId,
1372 gpuDynInst->wfSlotId);
1375 if (!compute_unit->
headTailMap.count(gpuDynInst)) {
1377 .insert(std::make_pair(gpuDynInst,
curTick()));
1389 Addr line = pkt->
req->getPaddr();
1391 DPRINTF(GPUTLB,
"CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
1392 pkt->
req->getVaddr(), line);
1395 computeUnit->stats.tlbCycles +=
curTick();
1399 safe_cast<GpuTranslationState*>(pkt->
senderState);
1402 if (!translation_state->
tlbEntry) {
1404 safe_cast<DTLBPort::SenderState*>(translation_state->
saved);
1407 computeUnit->wfList[sender_state->
_gpuDynInst->simdId]
1410 DPRINTFN(
"Wave %d couldn't tranlate vaddr %#x\n",
w->wfDynId,
1411 pkt->
req->getVaddr());
1415 int hit_level = translation_state->
hitLevel;
1416 computeUnit->stats.hitsPerTLBLevel[hit_level]++;
1418 delete translation_state->
tlbEntry;
1419 assert(!translation_state->
ports.size());
1425 delete translation_state;
1429 safe_cast<DTLBPort::SenderState*>(pkt->
senderState);
1434 gpuDynInst->memStatusVector[line].push_back(mp_index);
1435 gpuDynInst->tlbHitLevel[mp_index] = hit_level;
1446 panic(
"unsupported response to request conversion %s\n",
1450 if (computeUnit->prefetchDepth) {
1451 int simdId = gpuDynInst->simdId;
1452 int wfSlotId = gpuDynInst->wfSlotId;
1455 switch(computeUnit->prefetchType) {
1457 last = computeUnit->lastVaddrCU[mp_index];
1459 case enums::PF_PHASE:
1460 last = computeUnit->lastVaddrSimd[simdId][mp_index];
1463 last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
1468 DPRINTF(GPUPrefetch,
"CU[%d][%d][%d][%d]: %#x was last\n",
1469 computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
1477 computeUnit->lastVaddrCU[mp_index] =
vaddr;
1478 computeUnit->lastVaddrSimd[simdId][mp_index] =
vaddr;
1479 computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] =
vaddr;
1481 stride = (computeUnit->prefetchType == enums::PF_STRIDE) ?
1482 computeUnit->prefetchStride:
stride;
1484 DPRINTF(GPUPrefetch,
"%#x to: CU[%d][%d][%d][%d]\n",
vaddr,
1485 computeUnit->cu_id, simdId, wfSlotId, mp_index);
1490 for (
int pf = 1;
pf <= computeUnit->prefetchDepth; ++
pf) {
1497 RequestPtr prefetch_req = std::make_shared<Request>(
1500 computeUnit->requestorId(),
1510 computeUnit->shader->gpuTc,
true);
1513 sendFunctional(prefetch_pkt);
1517 safe_cast<GpuTranslationState*>(
1523 delete prefetch_pkt;
1542 computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);
1544 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
1545 computeUnit->cu_id, gpuDynInst->simdId,
1546 gpuDynInst->wfSlotId, mp_index, new_pkt->
req->getPaddr());
1548 computeUnit->schedule(mem_req_event,
curTick() +
1549 computeUnit->req_tick_latency);
1558 [
this, pkt]{ processMemReqEvent(pkt); },
1559 "ComputeUnit memory request event",
true);
1566 [
this, pkt]{ processMemRespEvent(pkt); },
1567 "ComputeUnit memory response event",
true);
1575 [[maybe_unused]]
ComputeUnit *compute_unit = computeUnit;
1577 if (!(sendTimingReq(pkt))) {
1578 retries.push_back(std::make_pair(pkt, gpuDynInst));
1581 "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
1582 compute_unit->
cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1583 id, pkt->
req->getPaddr());
1586 "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data "
1587 "req sent!\n", compute_unit->
cu_id, gpuDynInst->simdId,
1588 gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
id,
1589 pkt->
req->getPaddr());
1596 return "ComputeUnit scalar memory request event";
1602 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1610 "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
1611 compute_unit->
cu_id, gpuDynInst->simdId,
1612 gpuDynInst->wfSlotId, pkt->req->getPaddr());
1615 "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data "
1616 "req sent!\n", compute_unit->
cu_id, gpuDynInst->simdId,
1617 gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
1618 pkt->req->getPaddr());
1631 int len = retries.size();
1633 DPRINTF(GPUTLB,
"CU%d: DTLB recvReqRetry - %d pending requests\n",
1634 computeUnit->cu_id,
len);
1637 assert(isStalled());
1642 for (
int i = 0;
i <
len; ++
i) {
1645 DPRINTF(GPUTLB,
"CU%d: retrying D-translaton for address%#x",
vaddr);
1647 if (!sendTimingReq(pkt)) {
1650 DPRINTF(GPUTLB,
": failed again\n");
1653 DPRINTF(GPUTLB,
": successful\n");
1654 retries.pop_front();
1665 safe_cast<GpuTranslationState*>(pkt->
senderState);
1669 "Translation of vaddr %#x failed\n", pkt->
req->getVaddr());
1671 delete translation_state->
tlbEntry;
1672 assert(!translation_state->
ports.size());
1675 delete translation_state;
1678 safe_cast<ScalarDTLBPort::SenderState*>(pkt->
senderState);
1683 [[maybe_unused]]
Wavefront *
w = gpuDynInst->wavefront();
1685 DPRINTF(GPUTLB,
"CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
1686 "translation: PA %#x -> %#x\n", computeUnit->cu_id,
w->simdId,
1687 w->wfSlotId,
w->kernId, pkt->
req->getVaddr(), pkt->
req->getPaddr());
1696 fatal(
"Scalar DTLB receieved unexpected MemCmd response %s\n",
1707 if (!computeUnit->scalarDataPort.sendTimingReq(req_pkt)) {
1708 computeUnit->scalarDataPort.retries.push_back(req_pkt);
1709 DPRINTF(GPUMem,
"send scalar req failed for: %s\n",
1710 gpuDynInst->disassemble());
1712 DPRINTF(GPUMem,
"send scalar req for: %s\n",
1713 gpuDynInst->disassemble());
1722 [[maybe_unused]]
Addr line = pkt->
req->getPaddr();
1723 DPRINTF(GPUTLB,
"CU%d: ITLBPort received %#x->%#x\n",
1724 computeUnit->cu_id, pkt->
req->getVaddr(), line);
1730 = safe_cast<GpuTranslationState*>(pkt->
senderState);
1732 bool success = translation_state->
tlbEntry !=
nullptr;
1733 delete translation_state->
tlbEntry;
1734 assert(!translation_state->
ports.size());
1736 delete translation_state;
1740 safe_cast<ITLBPort::SenderState*>(pkt->
senderState);
1753 computeUnit->fetchStage.fetch(pkt, wavefront);
1776 int len = retries.size();
1777 DPRINTF(GPUTLB,
"CU%d: ITLB recvReqRetry - %d pending requests\n",
len);
1780 assert(isStalled());
1786 for (
int i = 0;
i <
len; ++
i) {
1789 DPRINTF(GPUTLB,
"CU%d: retrying I-translaton for address%#x",
vaddr);
1791 if (!sendTimingReq(pkt)) {
1793 DPRINTF(GPUTLB,
": failed again\n");
1796 DPRINTF(GPUTLB,
": successful\n");
1797 retries.pop_front();
1805 if (gpuDynInst->isScalar()) {
1806 if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
1809 }
else if (gpuDynInst->isLoad()) {
1811 }
else if (gpuDynInst->isStore()) {
1815 if (gpuDynInst->isALU()) {
1823 += gpuDynInst->wavefront()->execMask().count();
1824 }
else if (gpuDynInst->isFlat()) {
1825 if (gpuDynInst->isLocalMem()) {
1830 }
else if (gpuDynInst->isFlatGlobal()) {
1832 }
else if (gpuDynInst->isLocalMem()) {
1834 }
else if (gpuDynInst->isLoad()) {
1836 }
else if (gpuDynInst->isStore()) {
1840 if (gpuDynInst->isLoad()) {
1841 switch (gpuDynInst->executedAs()) {
1842 case enums::SC_SPILL:
1845 case enums::SC_GLOBAL:
1848 case enums::SC_GROUP:
1851 case enums::SC_PRIVATE:
1854 case enums::SC_READONLY:
1857 case enums::SC_KERNARG:
1870 fatal(
"%s has no valid segment\n", gpuDynInst->disassemble());
1873 }
else if (gpuDynInst->isStore()) {
1874 switch (gpuDynInst->executedAs()) {
1875 case enums::SC_SPILL:
1878 case enums::SC_GLOBAL:
1881 case enums::SC_GROUP:
1884 case enums::SC_PRIVATE:
1887 case enums::SC_READONLY:
1890 case enums::SC_KERNARG:
1903 fatal(
"%s has no valid segment\n", gpuDynInst->disassemble());
1927 *page_stat_file <<
"page, wavefront accesses, workitem accesses" <<
1931 *page_stat_file << std::hex << iter.first <<
",";
1932 *page_stat_file << std::dec << iter.second.first <<
",";
1933 *page_stat_file << std::dec << iter.second.second << std::endl;
1970 const uint32_t wgId)
const
1980 for (
int i_wf = 0; i_wf <
shader->
n_wf; ++i_wf){
1999 RequestPtr newRequest = std::make_shared<Request>();
2000 newRequest->setPaddr(0x0);
2020 fatal_if(!senderState,
"did not get the right sort of sender state");
2027 computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
2041 fatal_if(!sender_state,
"packet without a valid sender state");
2046 fatal_if(retries.empty(),
"must have retries waiting to be stalled");
2050 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: LDS send failed!\n",
2051 computeUnit->cu_id, gpuDynInst->simdId,
2052 gpuDynInst->wfSlotId);
2060 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
2061 computeUnit->cu_id, gpuDynInst->simdId,
2062 gpuDynInst->wfSlotId, pkt->
req->getPaddr());
2065 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
2066 computeUnit->cu_id, gpuDynInst->simdId,
2067 gpuDynInst->wfSlotId, pkt->
req->getPaddr());
2081 auto queueSize = retries.size();
2083 DPRINTF(GPUPort,
"CU%d: LDSPort recvReqRetry - %d pending requests\n",
2084 computeUnit->cu_id, queueSize);
2087 "why was there a recvReqRetry() with no pending reqs?");
2089 "recvReqRetry() happened when the port was not stalled");
2093 while (!retries.empty()) {
2096 DPRINTF(GPUPort,
"CU%d: retrying LDS send\n", computeUnit->cu_id);
2101 DPRINTF(GPUPort,
": LDS send failed again\n");
2104 DPRINTF(GPUTLB,
": LDS send successful\n");
2112 : statistics::
Group(parent),
2113 ADD_STAT(vALUInsts,
"Number of vector ALU insts issued."),
2114 ADD_STAT(vALUInstsPerWF,
"The avg. number of vector ALU insts issued "
2116 ADD_STAT(sALUInsts,
"Number of scalar ALU insts issued."),
2117 ADD_STAT(sALUInstsPerWF,
"The avg. number of scalar ALU insts issued "
2120 "Number of cycles needed to execute VALU insts."),
2122 "Number of cycles needed to execute SALU insts."),
2123 ADD_STAT(threadCyclesVALU,
"Number of thread cycles used to execute "
2124 "vector ALU ops. Similar to instCyclesVALU but multiplied by "
2125 "the number of active threads."),
2127 "Percentage of active vector ALU threads in a wave."),
2128 ADD_STAT(ldsNoFlatInsts,
"Number of LDS insts issued, not including FLAT"
2129 " accesses that resolve to LDS."),
2130 ADD_STAT(ldsNoFlatInstsPerWF,
"The avg. number of LDS insts (not "
2131 "including FLAT accesses that resolve to LDS) per-wavefront."),
2133 "The number of FLAT insts that resolve to vmem issued."),
2134 ADD_STAT(flatVMemInstsPerWF,
"The average number of FLAT insts that "
2135 "resolve to vmem issued per-wavefront."),
2137 "The number of FLAT insts that resolve to LDS issued."),
2138 ADD_STAT(flatLDSInstsPerWF,
"The average number of FLAT insts that "
2139 "resolve to LDS issued per-wavefront."),
2141 "Number of vector mem write insts (excluding FLAT insts)."),
2142 ADD_STAT(vectorMemWritesPerWF,
"The average number of vector mem write "
2143 "insts (excluding FLAT insts) per-wavefront."),
2145 "Number of vector mem read insts (excluding FLAT insts)."),
2146 ADD_STAT(vectorMemReadsPerWF,
"The avg. number of vector mem read insts "
2147 "(excluding FLAT insts) per-wavefront."),
2148 ADD_STAT(scalarMemWrites,
"Number of scalar mem write insts."),
2150 "The average number of scalar mem write insts per-wavefront."),
2151 ADD_STAT(scalarMemReads,
"Number of scalar mem read insts."),
2153 "The average number of scalar mem read insts per-wavefront."),
2154 ADD_STAT(vectorMemReadsPerKiloInst,
2155 "Number of vector mem reads per kilo-instruction"),
2156 ADD_STAT(vectorMemWritesPerKiloInst,
2157 "Number of vector mem writes per kilo-instruction"),
2158 ADD_STAT(vectorMemInstsPerKiloInst,
2159 "Number of vector mem insts per kilo-instruction"),
2160 ADD_STAT(scalarMemReadsPerKiloInst,
2161 "Number of scalar mem reads per kilo-instruction"),
2162 ADD_STAT(scalarMemWritesPerKiloInst,
2163 "Number of scalar mem writes per kilo-instruction"),
2164 ADD_STAT(scalarMemInstsPerKiloInst,
2165 "Number of scalar mem insts per kilo-instruction"),
2166 ADD_STAT(instCyclesVMemPerSimd,
"Number of cycles to send address, "
2167 "command, data from VRF to vector memory unit, per SIMD"),
2168 ADD_STAT(instCyclesScMemPerSimd,
"Number of cycles to send address, "
2169 "command, data from SRF to scalar memory unit, per SIMD"),
2170 ADD_STAT(instCyclesLdsPerSimd,
"Number of cycles to send address, "
2171 "command, data from VRF to LDS unit, per SIMD"),
2172 ADD_STAT(globalReads,
"Number of reads to the global segment"),
2173 ADD_STAT(globalWrites,
"Number of writes to the global segment"),
2175 "Number of memory instructions sent to the global segment"),
2176 ADD_STAT(argReads,
"Number of reads to the arg segment"),
2177 ADD_STAT(argWrites,
"NUmber of writes to the arg segment"),
2179 "Number of memory instructions sent to the arg segment"),
2180 ADD_STAT(spillReads,
"Number of reads to the spill segment"),
2181 ADD_STAT(spillWrites,
"Number of writes to the spill segment"),
2183 "Number of memory instructions sent to the spill segment"),
2184 ADD_STAT(groupReads,
"Number of reads to the group segment"),
2185 ADD_STAT(groupWrites,
"Number of writes to the group segment"),
2187 "Number of memory instructions sent to the group segment"),
2188 ADD_STAT(privReads,
"Number of reads to the private segment"),
2189 ADD_STAT(privWrites,
"Number of writes to the private segment"),
2191 "Number of memory instructions sent to the private segment"),
2192 ADD_STAT(readonlyReads,
"Number of reads to the readonly segment"),
2194 "Number of memory instructions sent to the readonly segment"),
2196 "Number of memory instructions sent to the readonly segment"),
2197 ADD_STAT(kernargReads,
"Number of reads sent to the kernarg segment"),
2199 "Number of memory instructions sent to the kernarg segment"),
2201 "Number of memory instructions sent to the kernarg segment"),
2203 "wave level parallelism: count of active waves at wave launch"),
2204 ADD_STAT(tlbRequests,
"number of uncoalesced requests"),
2206 "total number of cycles for all uncoalesced requests"),
2207 ADD_STAT(tlbLatency,
"Avg. translation latency for data translations"),
2209 "TLB hits distribution (0 for page table, x for Lx-TLB)"),
2210 ADD_STAT(ldsBankAccesses,
"Total number of LDS bank accesses"),
2212 "Number of bank conflicts per LDS memory packet"),
2214 "pages touched per wf (over all mem. instr.)"),
2216 "dynamic non-flat global memory instruction count"),
2218 "dynamic flat global memory instruction count"),
2219 ADD_STAT(dynamicLMemInstrCnt,
"dynamic local memory intruction count"),
2220 ADD_STAT(wgBlockedDueBarrierAllocation,
2221 "WG dispatch was blocked due to lack of barrier resources"),
2222 ADD_STAT(wgBlockedDueLdsAllocation,
2223 "Workgroup blocked due to LDS capacity"),
2224 ADD_STAT(numInstrExecuted,
"number of instructions executed"),
2225 ADD_STAT(execRateDist,
"Instruction Execution Rate: Number of executed "
2226 "vector instructions per cycle"),
2228 "number of vec ops executed (e.g. WF size/inst)"),
2230 "number of f16 vec ops executed (e.g. WF size/inst)"),
2232 "number of f32 vec ops executed (e.g. WF size/inst)"),
2234 "number of f64 vec ops executed (e.g. WF size/inst)"),
2236 "number of fma16 vec ops executed (e.g. WF size/inst)"),
2238 "number of fma32 vec ops executed (e.g. WF size/inst)"),
2240 "number of fma64 vec ops executed (e.g. WF size/inst)"),
2242 "number of mac16 vec ops executed (e.g. WF size/inst)"),
2244 "number of mac32 vec ops executed (e.g. WF size/inst)"),
2246 "number of mac64 vec ops executed (e.g. WF size/inst)"),
2248 "number of mad16 vec ops executed (e.g. WF size/inst)"),
2250 "number of mad32 vec ops executed (e.g. WF size/inst)"),
2252 "number of mad64 vec ops executed (e.g. WF size/inst)"),
2254 "number of two op FP vec ops executed (e.g. WF size/inst)"),
2255 ADD_STAT(totalCycles,
"number of cycles the CU ran for"),
2256 ADD_STAT(
vpc,
"Vector Operations per cycle (this CU only)"),
2257 ADD_STAT(vpc_f16,
"F16 Vector Operations per cycle (this CU only)"),
2258 ADD_STAT(vpc_f32,
"F32 Vector Operations per cycle (this CU only)"),
2259 ADD_STAT(vpc_f64,
"F64 Vector Operations per cycle (this CU only)"),
2260 ADD_STAT(ipc,
"Instructions per cycle (this CU only)"),
2261 ADD_STAT(controlFlowDivergenceDist,
"number of lanes active per "
2262 "instruction (over all instructions)"),
2263 ADD_STAT(activeLanesPerGMemInstrDist,
2264 "number of active lanes per global memory instruction"),
2265 ADD_STAT(activeLanesPerLMemInstrDist,
2266 "number of active lanes per local memory instruction"),
2268 "Number of dynamic non-GM memory insts executed"),
2269 ADD_STAT(numTimesWgBlockedDueVgprAlloc,
"Number of times WGs are "
2270 "blocked due to VGPR allocation per SIMD"),
2271 ADD_STAT(numTimesWgBlockedDueSgprAlloc,
"Number of times WGs are "
2272 "blocked due to SGPR allocation per SIMD"),
2273 ADD_STAT(numCASOps,
"number of compare and swap operations"),
2275 "number of compare and swap operations that failed"),
2276 ADD_STAT(completedWfs,
"number of completed wavefronts"),
2277 ADD_STAT(completedWGs,
"number of completed workgroups"),
2278 ADD_STAT(headTailLatency,
"ticks between first and last cache block "
2279 "arrival at coalescer"),
2280 ADD_STAT(instInterleave,
"Measure of instruction interleaving per SIMD")
2333 for (
int i = 0;
i < 4; ++
i) {