#include "debug/GPUDisp.hh"
#include "debug/GPUExec.hh"
#include "debug/GPUFetch.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUPort.hh"
#include "debug/GPUPrefetch.hh"
#include "debug/GPUReg.hh"
#include "debug/GPURename.hh"
#include "debug/GPUSync.hh"
#include "debug/GPUTLB.hh"
    numVectorGlobalMemUnits(p.num_global_mem_pipes),
    numVectorSharedMemUnits(p.num_shared_mem_pipes),
    numScalarMemUnits(p.num_scalar_mem_pipes),
    numVectorALUs(p.num_SIMDs),
    numScalarALUs(p.num_scalar_cores),
    vrfToCoalescerBusWidth(p.vrf_to_coalescer_bus_width),
    coalescerToVrfBusWidth(p.coalescer_to_vrf_bus_width),
    registerManager(p.register_manager),
    scoreboardCheckStage(p, *this, scoreboardCheckToSchedule),
    scheduleStage(p, *this, scoreboardCheckToSchedule, scheduleToExecute),
    execStage(p, *this, scheduleToExecute),
    globalMemoryPipe(p, *this),
    localMemoryPipe(p, *this),
    scalarMemoryPipe(p, *this),
    tickEvent([this]{ exec(); }, "Compute unit tick event",
              false, Event::CPU_Tick_Pri),
    vrf(p.vector_register_file), srf(p.scalar_register_file),
    simdWidth(p.simd_width),
    spBypassPipeLength(p.spbypass_pipe_length),
    dpBypassPipeLength(p.dpbypass_pipe_length),
    scalarPipeStages(p.scalar_pipe_length),
    operandNetworkLength(p.operand_network_length),
    issuePeriod(p.issue_period),
    vrf_gm_bus_latency(p.vrf_gm_bus_latency),
    srf_scm_bus_latency(p.srf_scm_bus_latency),
    vrf_lm_bus_latency(p.vrf_lm_bus_latency),
    perLaneTLB(p.perLaneTLB), prefetchDepth(p.prefetch_depth),
    prefetchStride(p.prefetch_stride), prefetchType(p.prefetch_prev_type),
    debugSegFault(p.debugSegFault),
    functionalTLB(p.functionalTLB), localMemBarrier(p.localMemBarrier),
    countPages(p.countPages),
    req_tick_latency(p.mem_req_latency * p.clk_domain->clockPeriod()),
    resp_tick_latency(p.mem_resp_latency * p.clk_domain->clockPeriod()),
    scalar_req_tick_latency(
        p.scalar_mem_req_latency * p.clk_domain->clockPeriod()),
    scalar_resp_tick_latency(
        p.scalar_mem_resp_latency * p.clk_domain->clockPeriod()),
    _requestorId(p.system->getRequestorId(this, "ComputeUnit")),
    lds(*p.localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
    _cacheLineSize(p.system->cacheLineSize()),
    _numBarrierSlots(p.num_barrier_slots),
    globalSeqNum(0), wavefrontSize(p.wf_size),
    scoreboardCheckToSchedule(p),
    scheduleToExecute(p),
123 "Functional TLB not supported in full-system GPU simulation");
134 fatal_if(
p.wf_size > std::numeric_limits<unsigned long long>::digits ||
136 "WF size is larger than the host can support");
138 "Wavefront size should be a power of 2");
    numCyclesPerStoreTransfer =
        (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
                       (double)vrfToCoalescerBusWidth);

    numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
                               / coalescerToVrfBusWidth;
    idleWfs = p.n_wf * numVectorALUs;
    lastVaddrWF.resize(numVectorALUs);
    wfList.resize(numVectorALUs);

    wfBarrierSlots.resize(p.num_barrier_slots, WFBarrier());
    for (int i = 0; i < p.num_barrier_slots; ++i) {
        freeBarrierIds.insert(i);
    }
    for (int j = 0; j < numVectorALUs; ++j) {
        lastVaddrWF[j].resize(p.n_wf);

        for (int i = 0; i < p.n_wf; ++i) {
            lastVaddrWF[j][i].resize(wfSize());

            wfList[j].push_back(p.wavefronts[j * p.n_wf + i]);
            wfList[j][i]->setParent(this);

            for (int k = 0; k < wfSize(); ++k) {
                lastVaddrWF[j][i][k] = 0;
            }
        }
    }
    lastVaddrSimd.resize(numVectorALUs);

    for (int i = 0; i < numVectorALUs; ++i) {
        lastVaddrSimd[i].resize(wfSize(), 0);
    }

    lastVaddrCU.resize(wfSize());
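    // lastVaddrWF, lastVaddrSimd, and lastVaddrCU record the last virtual
    // address seen per lane at wavefront, SIMD, and CU granularity; the
    // data-TLB response path later selects one of these histories based on
    // prefetchType to derive a prefetch stride.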
    if (p.execPolicy == "OLDEST-FIRST") {
        exec_policy = EXEC_POLICY::OLDEST;
    } else if (p.execPolicy == "ROUND-ROBIN") {
        exec_policy = EXEC_POLICY::RR;
    } else {
        fatal("Invalid WF execution policy (CU)\n");
    }
    for (int i = 0; i < p.port_memory_port_connection_count; ++i) {
        memPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
    }

    for (int i = 0; i < p.port_translation_port_connection_count; ++i) {
        tlbPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
    }

    memPortTokens = new TokenManager(p.max_cu_tokens);
    lastExecCycle.resize(numVectorALUs, 0);

    for (int i = 0; i < vrf.size(); ++i) {
        vrf[i]->setParent(this);
    }

    for (int i = 0; i < srf.size(); ++i) {
        srf[i]->setParent(this);
    }

    numVecRegsPerSimd = vrf[0]->numRegs();
    numScalarRegsPerSimd = srf[0]->numRegs();

    registerManager->setParent(this);

    instExecPerSimd.resize(numVectorALUs, 0);

    panic_if(!isPowerOf2(_cacheLineSize),
             "Cache line size should be a power of two.");
    cacheLineBits = floorLog2(_cacheLineSize);
    w->workGroupSz[0] = task->wgSize(0);
    w->workGroupSz[1] = task->wgSize(1);
    w->workGroupSz[2] = task->wgSize(2);
    w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];

    w->computeActualWgSz(task);
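    // Build the initial exec mask: only lanes that map to real work-items
    // start active, since the work-group may not fill its last wavefront.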
    static int _n_wave = 0;

    for (int k = 0; k < wfSize(); ++k) {
        if (k + waveId * wfSize() < w->actualWgSzTotal)
            init_mask[k] = 1;
    }

    w->execMask() = init_mask;

    w->initMask = init_mask.to_ullong();

    if (bar_id > WFBarrier::InvalidID) {
        w->barrierId(bar_id);
    } else {
        assert(!w->hasBarrier());
    }
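    // Recover each lane's 3D work-item ID from its linear index
    // (k + waveId * wfSize()) by successive div/mod with the actual WG
    // dimensions; the flat ID is the usual x + y*X + z*X*Y linearization.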
    for (int k = 0; k < wfSize(); ++k) {
        w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
        w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
            w->actualWgSz[1];
        w->workItemId[2][k] = (k + waveId * wfSize()) /
            (w->actualWgSz[0] * w->actualWgSz[1]);

        w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
            w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
            w->workItemId[0][k];
    }
    w->workGroupId[0] = w->wgId % task->numWg(0);
    w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1);
    w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1));
    w->ldsChunk = ldsChunk;

    [[maybe_unused]] int32_t refCount =
        lds.increaseRefCounter(w->dispatchId, w->wgId);
    DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
            cu_id, w->wgId, refCount);

    w->instructionBuffer.clear();

    DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
            "WF[%d][%d]. Ref cnt:%d\n", _n_wave, w->barrierId(), cu_id,
            w->simdId, w->wfSlotId, refCount);

    w->initRegState(task, w->actualWgSzTotal);
    GPUDynInstPtr gpuDynInst
        = std::make_shared<GPUDynInst>(this, nullptr,
                                       new KernelLaunchStaticInst(),
                                       getAndIncSeqNum());

    gpuDynInst->kern_id = kernId;

    req->setContext(gpuDynInst->wfDynId);
    DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id);

    panic_if(!ldsChunk, "was not able to reserve space for this WG");
    if (num_wfs_in_wg > 1) {
        assert(!wf_barrier.maxBarrierCnt());
        assert(!wf_barrier.numAtBarrier());
        wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);

        DPRINTF(GPUSync, "CU[%d] - Dispatching WG with barrier Id%d. "
                "%d waves using this barrier.\n", cu_id, barrier_id,
                num_wfs_in_wg);
    }
    DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] "
            "vregDemand[%d] sregDemand[%d]\n", i, j, w->wfDynId,
            vregDemand, sregDemand);
512 "Instruction Buffer of WF%d can't be empty",
w->wgId);
521 "Instruction Buffer of WF%d can't be empty",
w->wgId);
524 auto it =
pipeMap.find(ii->seqNum());
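    // Dispatch feasibility check: compute the true work-group size in each
    // dimension, then the number of wavefronts needed, rounding up so a
    // partial wavefront still occupies a full WF slot.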
    int trueWgSizeTotal = 1;

    for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
        // ...
        trueWgSizeTotal *= trueWgSize[d];
        DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
    }

    DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);

    int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
    num_wfs_in_wg = numWfs;
    bool barrier_avail = true;

    if (numWfs > 1 && !freeBarrierIds.size()) {
        barrier_avail = false;
    }
566 "WG with %d WFs and %d VGPRs per WI can not be allocated to CU "
567 "that has %d VGPRs\n",
570 "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU "
    int numMappedWfs = 0;

    if (numMappedWfs < numWfs && /* ... */) {
        // ...
    }

    assert(numMappedWfs <= numWfs);
    bool vregAvail = true;
    bool sregAvail = true;

    if (numMappedWfs < numWfs) {
        // ...
    }

    DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, "
            "VGPR Availability = %d, SGPR Availability = %d\n",
            freeWfSlots, numMappedWfs, vregAvail, sregAvail);
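    // A WG is dispatchable only if every resource check passed: enough WF
    // slots, VGPRs, SGPRs, LDS space, and a free barrier slot.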
    if (!barrier_avail) {
        stats.wgBlockedDueBarrierAllocation++;
    }

    bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
                        && ldsAvail && barrier_avail;
    return wf_barrier.numYetToReachBarrier();

    return wf_barrier.allAtBarrier();

    wf_barrier.incNumAtBarrier();

    return wf_barrier.numAtBarrier();

    return wf_barrier.maxBarrierCnt();

    wf_barrier.decMaxBarrierCnt();

    wf_barrier.release();
    for (auto &vecRegFile : vrf) {
        vecRegFile->exec();
    }

    for (auto &scRegFile : srf) {
        scRegFile->exec();
    }
781 "No support for multiple Global Memory Pipelines exists!!!");
788 "No support for multiple Local Memory Pipelines exists!!!");
795 "No support for multiple Scalar Memory Pipelines exists!!!");
    if (gpuDynInst->isKernelLaunch()) {
        assert(pkt->req->isKernel());
        assert(pkt->req->isInvL1());
        // ...
    } else if (/* ... */ && gpuDynInst->isEndOfKernel()) {
        assert(pkt->req->isKernel());
        assert(pkt->req->isGL2CacheFlush());
        DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
                computeUnit->cu_id, w->simdId, w->wfSlotId,
                w->wfDynId, w->wgId);
    }

    if (!pkt->req->isKernel()) {
        w = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
        DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing "
                "outstanding reqs %d => %d\n", gpuDynInst->simdId,
                gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
                gpuDynInst->disassemble(), w->outstandingReqs,
                w->outstandingReqs - 1);
        computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
    }
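    // Responses are not consumed inline; a MemRespEvent is scheduled
    // resp_tick_latency ticks out to model the return-path delay.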
    EventFunctionWrapper *mem_resp_event =
        computeUnit->memPort[index].createMemRespEvent(pkt);

    DPRINTF(GPUPort,
            "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
            computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
            gpuDynInst->seqNum(), index, pkt->req->getPaddr());

    computeUnit->schedule(mem_resp_event,
                          curTick() + computeUnit->resp_tick_latency);
    return handleResponse(pkt);
    assert(!pkt->req->isKernel());

    assert(gpuDynInst->numScalarReqs > 0);

    gpuDynInst->numScalarReqs--;

    if (!gpuDynInst->numScalarReqs) {
        if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
            computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
                gpuDynInst);
        } else {
            computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
                gpuDynInst);
        }
    }
    for (const auto &pkt : retries) {
        if (!sendTimingReq(pkt)) {
            break;
        }
    }
    int len = retries.size();

    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = retries.front().first;
        [[maybe_unused]] GPUDynInstPtr gpuDynInst = retries.front().second;
        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
                computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                pkt->req->getPaddr());

        if (!sendTimingReq(pkt)) {
            DPRINTF(GPUMem, "failed again!\n");
            break;
        } else {
            DPRINTF(GPUMem, "successful!\n");
            retries.pop_front();
        }
    }
    computeUnit->handleSQCReturn(pkt);
    int len = retries.size();

    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = retries.front().first;
        [[maybe_unused]] Wavefront *wavefront = retries.front().second;
        DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
                computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
                pkt->req->getPaddr());
        if (!sendTimingReq(pkt)) {
            DPRINTF(GPUFetch, "failed again!\n");
            break;
        } else {
            DPRINTF(GPUFetch, "successful!\n");
            retries.pop_front();
        }
    }
    Addr tmp_vaddr = pkt->req->getVaddr();

    pkt->req->setPC(gpuDynInst->wavefront()->pc());

    pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());

    } else if (pkt->isRead()) {
        // ...
    } else {
        fatal("pkt is not a read nor a write\n");
    }
    unsigned size = pkt->getSize();

    if ((vaddr + size - 1) % 64 < vaddr % 64) {
        panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
              cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
    }

    Addr paddr;

    if (!p->pTable->translate(vaddr, paddr)) {
        if (!p->fixupFault(vaddr)) {
            panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
                  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                  vaddr);
        }
    }
    tlbPort[tlbPort_index].sendFunctional(pkt);

    int hit_level = translation_state->hitLevel;
    assert(hit_level != -1);

    GpuTranslationState *sender_state =
        safe_cast<GpuTranslationState*>(pkt->senderState);

    delete sender_state->saved;
    delete sender_state;

    assert(pkt->req->hasPaddr());
    assert(pkt->req->hasSize());
    uint8_t *tmpData = oldPkt->getPtr<uint8_t>();

    gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
    gpuDynInst->tlbHitLevel[index] = hit_level;
        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
                "scheduled\n", cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
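    // (Below: if the TLB port is already stalled, the packet is queued; if
    // the timing send is rejected, the port stalls and the packet is queued;
    // only the final else means the translation request was actually sent.)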
    } else if (tlbPort[tlbPort_index].isStalled()) {
        assert(tlbPort[tlbPort_index].retries.size() > 0);

        DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
                "failed!\n", cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, tmp_vaddr);

        tlbPort[tlbPort_index].retries.push_back(pkt);
    } else if (!tlbPort[tlbPort_index].sendTimingReq(pkt)) {
        tlbPort[tlbPort_index].stallPort();

        DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
                "failed!\n", cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, tmp_vaddr);

        tlbPort[tlbPort_index].retries.push_back(pkt);
    } else {
        DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x from "
                "instruction %s sent!\n", cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, tmp_vaddr,
                gpuDynInst->disassemble().c_str());
    }
    if (pkt->cmd == MemCmd::MemSyncReq) {
        gpuDynInst->resetEntireStatusVector();
    } else {
        gpuDynInst->decrementStatusVector(index);
    }
    tlbPort[tlbPort_index].sendFunctional(pkt);

    memPort[0].sendFunctional(new_pkt);

    DPRINTF(GPUMem, "Functional sendRequest\n");
    DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
            gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
            new_pkt->req->getPaddr());

    GpuTranslationState *sender_state =
        safe_cast<GpuTranslationState*>(pkt->senderState);
    DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n",
            tlb_mode == BaseMMU::Read ? "read" : "write",
            pkt->req->getVaddr());
    assert(gpuDynInst->isGlobalSeg() ||
           gpuDynInst->executedAs() == enums::SC_GLOBAL);

    if (!req) {
        req = std::make_shared<Request>(
            0, 0, 0, requestorId(), 0, gpuDynInst->wfDynId);
    }
    if (kernelMemSync) {
        if (gpuDynInst->isKernelLaunch()) {
            req->setReqInstSeqNum(gpuDynInst->seqNum());

            EventFunctionWrapper *mem_req_event =
                memPort[0].createMemReqEvent(pkt);

            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
                    "an acquire\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
        } else {
            assert(gpuDynInst->isEndOfKernel());

            req->setReqInstSeqNum(gpuDynInst->seqNum());

            EventFunctionWrapper *mem_req_event =
                memPort[0].createMemReqEvent(pkt);

            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
                    "a release\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
        }
    } else {
        gpuDynInst->setRequestFlags(req);

        req->setReqInstSeqNum(gpuDynInst->seqNum());

        EventFunctionWrapper *mem_req_event =
            memPort[0].createMemReqEvent(pkt);

        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
                cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
                pkt->req->getPaddr());
    }
    DataPort::SenderState *sender_state =
        safe_cast<DataPort::SenderState*>(pkt->senderState);

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
            compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
            pkt->req->getPaddr(), id);

    Addr paddr = pkt->req->getPaddr();
    int index = gpuDynInst->memStatusVector[paddr].back();

    DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
            pkt->req->getPaddr(), id);

    gpuDynInst->memStatusVector[paddr].pop_back();
    gpuDynInst->pAddr = pkt->req->getPaddr();

    gpuDynInst->decrementStatusVector(index);
    DPRINTF(GPUMem, "bitvector is now %s\n",
            gpuDynInst->printStatusVector());
    if (gpuDynInst->allLanesZero()) {
        auto iter = gpuDynInst->memStatusVector.begin();
        auto end = gpuDynInst->memStatusVector.end();

        while (iter != end) {
            assert(iter->second.empty());
            ++iter;
        }

        if (compute_unit->headTailMap.count(gpuDynInst)) {
            Tick headTick = compute_unit->headTailMap.at(gpuDynInst);
            compute_unit->stats.headTailLatency.sample(curTick() - headTick);
            compute_unit->headTailMap.erase(gpuDynInst);
        }

        gpuDynInst->memStatusVector.clear();

        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
                compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId);
    } else {
        if (!compute_unit->headTailMap.count(gpuDynInst)) {
            compute_unit->headTailMap
                .insert(std::make_pair(gpuDynInst, curTick()));
        }
    }
    Addr line = pkt->req->getPaddr();

    DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
            pkt->req->getVaddr(), line);

    computeUnit->stats.tlbCycles += curTick();

    GpuTranslationState *translation_state =
        safe_cast<GpuTranslationState*>(pkt->senderState);
    if (!translation_state->tlbEntry) {
        DTLBPort::SenderState *sender_state =
            safe_cast<DTLBPort::SenderState*>(translation_state->saved);

        Wavefront *w =
            computeUnit->wfList[sender_state->_gpuDynInst->simdId]
                [sender_state->_gpuDynInst->wfSlotId];

        DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
                 pkt->req->getVaddr());
    }

    int hit_level = translation_state->hitLevel;
    computeUnit->stats.hitsPerTLBLevel[hit_level]++;

    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());

    delete translation_state;
    DTLBPort::SenderState *sender_state =
        safe_cast<DTLBPort::SenderState*>(pkt->senderState);

    gpuDynInst->memStatusVector[line].push_back(mp_index);
    gpuDynInst->tlbHitLevel[mp_index] = hit_level;
    panic("unsupported response to request conversion %s\n",
          pkt->cmd.toString());
    if (computeUnit->prefetchDepth) {
        int simdId = gpuDynInst->simdId;
        int wfSlotId = gpuDynInst->wfSlotId;

        switch (computeUnit->prefetchType) {
          case enums::PF_CU:
            last = computeUnit->lastVaddrCU[mp_index];
            break;
          case enums::PF_PHASE:
            last = computeUnit->lastVaddrSimd[simdId][mp_index];
            break;
          case enums::PF_WF:
            last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
          default:
            break;
        }

        DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
                computeUnit->cu_id, simdId, wfSlotId, mp_index, last);

        computeUnit->lastVaddrCU[mp_index] = vaddr;
        computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
        computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;

        stride = (computeUnit->prefetchType == enums::PF_STRIDE) ?
            computeUnit->prefetchStride : stride;

        DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
                computeUnit->cu_id, simdId, wfSlotId, mp_index);
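        // The prefetch stride is measured in pages; each of prefetchDepth
        // candidate pages is translated with sendFunctional(), i.e.
        // prefetch translations are modeled as zero-latency.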
        for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
            RequestPtr prefetch_req = std::make_shared<Request>(
                vaddr + stride * pf * X86ISA::PageBytes, sizeof(uint8_t), 0,
                computeUnit->requestorId(), 0, 0, nullptr);

            PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
            prefetch_pkt->senderState =
                new GpuTranslationState(TLB_mode,
                                        computeUnit->shader->gpuTc, true);

            sendFunctional(prefetch_pkt);

            GpuTranslationState *tlb_state =
                safe_cast<GpuTranslationState*>(prefetch_pkt->senderState);

            delete tlb_state->tlbEntry;
            delete tlb_state;
            delete prefetch_pkt;
        }
    if (new_pkt->req->systemReq()) {
        if (!gpuDynInst->isSystemReq()) {
            computeUnit->getTokenManager()->recvTokens(1);
            gpuDynInst->setSystemReq();
        }
    } else {
        new_pkt->req->requestorId(computeUnit->vramRequestorId());
    }

    EventFunctionWrapper *mem_req_event =
        computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);

    DPRINTF(GPUPort,
            "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
            computeUnit->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());

    computeUnit->schedule(mem_req_event, curTick() +
                          computeUnit->req_tick_latency);
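    // The 'true' flag passed to EventFunctionWrapper below presumably marks
    // the event as auto-deleting, since these wrappers are heap-allocated
    // per packet and never freed explicitly by the port.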
    return new EventFunctionWrapper(
        [this, pkt]{ processMemReqEvent(pkt); },
        "ComputeUnit memory request event", true);

    return new EventFunctionWrapper(
        [this, pkt]{ processMemRespEvent(pkt); },
        "ComputeUnit memory response event", true);
    [[maybe_unused]] ComputeUnit *compute_unit = computeUnit;

    if (pkt->req->systemReq()) {
        // ...
    } else if (!(sendTimingReq(pkt))) {
        retries.push_back(std::make_pair(pkt, gpuDynInst));

        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
                compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                id, pkt->req->getPaddr());
    } else {
        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data "
                "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id,
                pkt->req->getPaddr());
    }
    return "ComputeUnit scalar memory request event";
    SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);

    if (pkt->req->systemReq()) {
        // ...
    } else if (!(scalarDataPort.sendTimingReq(pkt))) {
        scalarDataPort.retries.push_back(pkt);

        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
                compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, pkt->req->getPaddr());
    } else {
        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data "
                "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
                pkt->req->getPaddr());
    }
    int len = retries.size();

    DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, len);

    assert(isStalled());
    unstallPort();

    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = retries.front();
        [[maybe_unused]] Addr vaddr = pkt->req->getVaddr();
        DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x",
                computeUnit->cu_id, vaddr);

        if (!sendTimingReq(pkt)) {
            stallPort();
            DPRINTF(GPUTLB, ": failed again\n");
            break;
        } else {
            DPRINTF(GPUTLB, ": successful\n");
            retries.pop_front();
        }
    }
    GpuTranslationState *translation_state =
        safe_cast<GpuTranslationState*>(pkt->senderState);

    fatal_if(!translation_state->tlbEntry,
             "Translation of vaddr %#x failed\n", pkt->req->getVaddr());

    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());

    delete translation_state;

    ScalarDTLBPort::SenderState *sender_state =
        safe_cast<ScalarDTLBPort::SenderState*>(pkt->senderState);
    [[maybe_unused]] Wavefront *w = gpuDynInst->wavefront();

    DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
            "translation: PA %#x -> %#x\n", computeUnit->cu_id, w->simdId,
            w->wfSlotId, w->kernId, pkt->req->getVaddr(),
            pkt->req->getPaddr());
    fatal("Scalar DTLB received unexpected MemCmd response %s\n",
          pkt->cmd.toString());
    if (req_pkt->req->systemReq()) {
        gpuDynInst->setSystemReq();
    } else {
        req_pkt->req->requestorId(computeUnit->vramRequestorId());
    }

    ComputeUnit::ScalarDataPort::MemReqEvent *scalar_mem_req_event
        = new ComputeUnit::ScalarDataPort::MemReqEvent
            (computeUnit->scalarDataPort, req_pkt);
    computeUnit->schedule(scalar_mem_req_event, curTick() +
                          computeUnit->scalar_req_tick_latency);
    [[maybe_unused]] Addr line = pkt->req->getPaddr();
    DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
            computeUnit->cu_id, pkt->req->getVaddr(), line);

    GpuTranslationState *translation_state
        = safe_cast<GpuTranslationState*>(pkt->senderState);

    bool success = translation_state->tlbEntry != nullptr;
    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());

    delete translation_state;

    ITLBPort::SenderState *sender_state =
        safe_cast<ITLBPort::SenderState*>(pkt->senderState);
    computeUnit->fetchStage.fetch(pkt, wavefront);
    int len = retries.size();
    DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, len);

    assert(isStalled());
    unstallPort();

    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = retries.front();
        [[maybe_unused]] Addr vaddr = pkt->req->getVaddr();
        DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x",
                computeUnit->cu_id, vaddr);

        if (!sendTimingReq(pkt)) {
            stallPort();
            DPRINTF(GPUTLB, ": failed again\n");
            break;
        } else {
            DPRINTF(GPUTLB, ": successful\n");
            retries.pop_front();
        }
    }
    if (gpuDynInst->isScalar()) {
        if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
            stats.sALUInsts++;
            stats.instCyclesSALU++;
        } else if (gpuDynInst->isLoad()) {
            stats.scalarMemReads++;
        } else if (gpuDynInst->isStore()) {
            stats.scalarMemWrites++;
        }
    } else {
        if (gpuDynInst->isALU()) {
            stats.vALUInsts++;
            stats.instCyclesVALU++;
            stats.threadCyclesVALU
                += gpuDynInst->wavefront()->execMask().count();
        } else if (gpuDynInst->isFlat()) {
            if (gpuDynInst->isLocalMem()) {
                stats.flatLDSInsts++;
            } else {
                stats.flatVMemInsts++;
            }
        } else if (gpuDynInst->isFlatGlobal()) {
            stats.flatVMemInsts++;
        } else if (gpuDynInst->isLocalMem()) {
            stats.ldsNoFlatInsts++;
        } else if (gpuDynInst->isLoad()) {
            stats.vectorMemReads++;
        } else if (gpuDynInst->isStore()) {
            stats.vectorMemWrites++;
        }
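        // Loads and stores are additionally binned by the memory segment
        // they executed as (spill, global, group, private, readonly,
        // kernarg, arg).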
        if (gpuDynInst->isLoad()) {
            switch (gpuDynInst->executedAs()) {
              case enums::SC_SPILL:    stats.spillReads++;    break;
              case enums::SC_GLOBAL:   stats.globalReads++;   break;
              case enums::SC_GROUP:    stats.groupReads++;    break;
              case enums::SC_PRIVATE:  stats.privReads++;     break;
              case enums::SC_READONLY: stats.readonlyReads++; break;
              case enums::SC_KERNARG:  stats.kernargReads++;  break;
              case enums::SC_ARG:      stats.argReads++;      break;
              default:
                fatal("%s has no valid segment\n", gpuDynInst->disassemble());
            }
        } else if (gpuDynInst->isStore()) {
            switch (gpuDynInst->executedAs()) {
              case enums::SC_SPILL:    stats.spillWrites++;    break;
              case enums::SC_GLOBAL:   stats.globalWrites++;   break;
              case enums::SC_GROUP:    stats.groupWrites++;    break;
              case enums::SC_PRIVATE:  stats.privWrites++;     break;
              case enums::SC_READONLY: stats.readonlyWrites++; break;
              case enums::SC_KERNARG:  stats.kernargWrites++;  break;
              case enums::SC_ARG:      stats.argWrites++;      break;
              default:
                fatal("%s has no valid segment\n", gpuDynInst->disassemble());
            }
        }
    *page_stat_file << "page, wavefront accesses, workitem accesses"
                    << std::endl;

    for (auto iter : pageAccesses) {
        *page_stat_file << std::hex << iter.first << ",";
        *page_stat_file << std::dec << iter.second.first << ",";
        *page_stat_file << std::dec << iter.second.second << std::endl;
    }
int32_t
ComputeUnit::getRefCounter(const uint32_t dispatchId,
                           const uint32_t wgId) const
{
    return lds.getRefCounter(dispatchId, wgId);
}

    for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
        if (wfList[i][i_wf]->getStatus() != Wavefront::S_STOPPED) {
            return false;
        }
    }
    RequestPtr newRequest = std::make_shared<Request>();
    newRequest->setPaddr(0x0);
    fatal_if(!senderState, "did not get the right sort of sender state");

    computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
    fatal_if(!sender_state, "packet without a valid sender state");

    fatal_if(retries.empty(), "must have retries waiting to be stalled");
        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId);
        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, pkt->req->getPaddr());

        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, pkt->req->getPaddr());
    auto queueSize = retries.size();

    DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, queueSize);

    fatal_if(queueSize < 1,
             "why was there a recvReqRetry() with no pending reqs?");
    fatal_if(!isStalled(),
             "recvReqRetry() happened when the port was not stalled");
    while (!retries.empty()) {
        PacketPtr packet = retries.front();

        DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);

        if (!sendTimingReq(packet)) {
            stallPort();
            DPRINTF(GPUPort, ": LDS send failed again\n");
            break;
        } else {
            DPRINTF(GPUPort, ": LDS send successful\n");
            retries.pop();
        }
    }
    : statistics::Group(parent),
      ADD_STAT(vALUInsts, "Number of vector ALU insts issued."),
      ADD_STAT(vALUInstsPerWF, "The avg. number of vector ALU insts issued "
               "per-wavefront."),
      ADD_STAT(sALUInsts, "Number of scalar ALU insts issued."),
      ADD_STAT(sALUInstsPerWF, "The avg. number of scalar ALU insts issued "
               "per-wavefront."),
      ADD_STAT(instCyclesVALU,
               "Number of cycles needed to execute VALU insts."),
      ADD_STAT(instCyclesSALU,
               "Number of cycles needed to execute SALU insts."),
      ADD_STAT(threadCyclesVALU, "Number of thread cycles used to execute "
               "vector ALU ops. Similar to instCyclesVALU but multiplied by "
               "the number of active threads."),
      ADD_STAT(vALUUtilization,
               "Percentage of active vector ALU threads in a wave."),
      ADD_STAT(ldsNoFlatInsts, "Number of LDS insts issued, not including FLAT"
               " accesses that resolve to LDS."),
      ADD_STAT(ldsNoFlatInstsPerWF, "The avg. number of LDS insts (not "
               "including FLAT accesses that resolve to LDS) per-wavefront."),
      ADD_STAT(flatVMemInsts,
               "The number of FLAT insts that resolve to vmem issued."),
      ADD_STAT(flatVMemInstsPerWF, "The average number of FLAT insts that "
               "resolve to vmem issued per-wavefront."),
      ADD_STAT(flatLDSInsts,
               "The number of FLAT insts that resolve to LDS issued."),
      ADD_STAT(flatLDSInstsPerWF, "The average number of FLAT insts that "
               "resolve to LDS issued per-wavefront."),
      ADD_STAT(vectorMemWrites,
               "Number of vector mem write insts (excluding FLAT insts)."),
      ADD_STAT(vectorMemWritesPerWF, "The average number of vector mem write "
               "insts (excluding FLAT insts) per-wavefront."),
      ADD_STAT(vectorMemReads,
               "Number of vector mem read insts (excluding FLAT insts)."),
      ADD_STAT(vectorMemReadsPerWF, "The avg. number of vector mem read insts "
               "(excluding FLAT insts) per-wavefront."),
      ADD_STAT(scalarMemWrites, "Number of scalar mem write insts."),
      ADD_STAT(scalarMemWritesPerWF,
               "The average number of scalar mem write insts per-wavefront."),
      ADD_STAT(scalarMemReads, "Number of scalar mem read insts."),
      ADD_STAT(scalarMemReadsPerWF,
               "The average number of scalar mem read insts per-wavefront."),
      ADD_STAT(vectorMemReadsPerKiloInst,
               "Number of vector mem reads per kilo-instruction"),
      ADD_STAT(vectorMemWritesPerKiloInst,
               "Number of vector mem writes per kilo-instruction"),
      ADD_STAT(vectorMemInstsPerKiloInst,
               "Number of vector mem insts per kilo-instruction"),
      ADD_STAT(scalarMemReadsPerKiloInst,
               "Number of scalar mem reads per kilo-instruction"),
      ADD_STAT(scalarMemWritesPerKiloInst,
               "Number of scalar mem writes per kilo-instruction"),
      ADD_STAT(scalarMemInstsPerKiloInst,
               "Number of scalar mem insts per kilo-instruction"),
      ADD_STAT(instCyclesVMemPerSimd, "Number of cycles to send address, "
               "command, data from VRF to vector memory unit, per SIMD"),
      ADD_STAT(instCyclesScMemPerSimd, "Number of cycles to send address, "
               "command, data from SRF to scalar memory unit, per SIMD"),
      ADD_STAT(instCyclesLdsPerSimd, "Number of cycles to send address, "
               "command, data from VRF to LDS unit, per SIMD"),
      ADD_STAT(globalReads, "Number of reads to the global segment"),
      ADD_STAT(globalWrites, "Number of writes to the global segment"),
      ADD_STAT(globalMemInsts,
               "Number of memory instructions sent to the global segment"),
      ADD_STAT(argReads, "Number of reads to the arg segment"),
      ADD_STAT(argWrites, "Number of writes to the arg segment"),
      ADD_STAT(argMemInsts,
               "Number of memory instructions sent to the arg segment"),
      ADD_STAT(spillReads, "Number of reads to the spill segment"),
      ADD_STAT(spillWrites, "Number of writes to the spill segment"),
      ADD_STAT(spillMemInsts,
               "Number of memory instructions sent to the spill segment"),
      ADD_STAT(groupReads, "Number of reads to the group segment"),
      ADD_STAT(groupWrites, "Number of writes to the group segment"),
      ADD_STAT(groupMemInsts,
               "Number of memory instructions sent to the group segment"),
      ADD_STAT(privReads, "Number of reads to the private segment"),
      ADD_STAT(privWrites, "Number of writes to the private segment"),
      ADD_STAT(privMemInsts,
               "Number of memory instructions sent to the private segment"),
      ADD_STAT(readonlyReads, "Number of reads to the readonly segment"),
      ADD_STAT(readonlyWrites, "Number of writes to the readonly segment"),
      ADD_STAT(readonlyMemInsts,
               "Number of memory instructions sent to the readonly segment"),
      ADD_STAT(kernargReads, "Number of reads sent to the kernarg segment"),
      ADD_STAT(kernargWrites, "Number of writes sent to the kernarg segment"),
      ADD_STAT(kernargMemInsts,
               "Number of memory instructions sent to the kernarg segment"),
      ADD_STAT(waveLevelParallelism,
               "wave level parallelism: count of active waves at wave launch"),
      ADD_STAT(tlbRequests, "number of uncoalesced requests"),
      ADD_STAT(tlbCycles,
               "total number of cycles for all uncoalesced requests"),
      ADD_STAT(tlbLatency, "Avg. translation latency for data translations"),
      ADD_STAT(hitsPerTLBLevel,
               "TLB hits distribution (0 for page table, x for Lx-TLB)"),
      ADD_STAT(ldsBankAccesses, "Total number of LDS bank accesses"),
      ADD_STAT(ldsBankConflictDist,
               "Number of bank conflicts per LDS memory packet"),
      ADD_STAT(pageDivergenceDist,
               "pages touched per wf (over all mem. instr.)"),
      ADD_STAT(dynamicGMemInstrCnt,
               "dynamic non-flat global memory instruction count"),
      ADD_STAT(dynamicFlatMemInstrCnt,
               "dynamic flat global memory instruction count"),
      ADD_STAT(dynamicLMemInstrCnt, "dynamic local memory instruction count"),
      ADD_STAT(wgBlockedDueBarrierAllocation,
               "WG dispatch was blocked due to lack of barrier resources"),
      ADD_STAT(wgBlockedDueLdsAllocation,
               "Workgroup blocked due to LDS capacity"),
      ADD_STAT(numInstrExecuted, "number of instructions executed"),
      ADD_STAT(execRateDist, "Instruction Execution Rate: Number of executed "
               "vector instructions per cycle"),
      ADD_STAT(numVecOpsExecuted,
               "number of vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedF16,
               "number of f16 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedF32,
               "number of f32 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedF64,
               "number of f64 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedFMA16,
               "number of fma16 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedFMA32,
               "number of fma32 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedFMA64,
               "number of fma64 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedMAC16,
               "number of mac16 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedMAC32,
               "number of mac32 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedMAC64,
               "number of mac64 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedMAD16,
               "number of mad16 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedMAD32,
               "number of mad32 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedMAD64,
               "number of mad64 vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(numVecOpsExecutedTwoOpFP,
               "number of two op FP vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(totalCycles, "number of cycles the CU ran for"),
      ADD_STAT(vpc, "Vector Operations per cycle (this CU only)"),
      ADD_STAT(vpc_f16, "F16 Vector Operations per cycle (this CU only)"),
      ADD_STAT(vpc_f32, "F32 Vector Operations per cycle (this CU only)"),
      ADD_STAT(vpc_f64, "F64 Vector Operations per cycle (this CU only)"),
      ADD_STAT(ipc, "Instructions per cycle (this CU only)"),
      ADD_STAT(controlFlowDivergenceDist, "number of lanes active per "
               "instruction (over all instructions)"),
      ADD_STAT(activeLanesPerGMemInstrDist,
               "number of active lanes per global memory instruction"),
      ADD_STAT(activeLanesPerLMemInstrDist,
               "number of active lanes per local memory instruction"),
      ADD_STAT(numALUInstsExecuted,
               "Number of dynamic non-GM memory insts executed"),
      ADD_STAT(numTimesWgBlockedDueVgprAlloc, "Number of times WGs are "
               "blocked due to VGPR allocation per SIMD"),
      ADD_STAT(numTimesWgBlockedDueSgprAlloc, "Number of times WGs are "
               "blocked due to SGPR allocation per SIMD"),
      ADD_STAT(numCASOps, "number of compare and swap operations"),
      ADD_STAT(numFailedCASOps,
               "number of compare and swap operations that failed"),
      ADD_STAT(completedWfs, "number of completed wavefronts"),
      ADD_STAT(completedWGs, "number of completed workgroups"),
      ADD_STAT(headTailLatency, "ticks between first and last cache block "
               "arrival at coalescer"),
      ADD_STAT(instInterleave, "Measure of instruction interleaving per SIMD")
    for (int i = 0; i < 4; ++i) {