#include "debug/GPUDisp.hh"
#include "debug/GPUExec.hh"
#include "debug/GPUFetch.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUPort.hh"
#include "debug/GPUPrefetch.hh"
#include "debug/GPUReg.hh"
#include "debug/GPURename.hh"
#include "debug/GPUSync.hh"
#include "debug/GPUTLB.hh"
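
// ComputeUnit constructor (excerpted): the initializer list below pulls the
// pipeline counts, bus widths, latencies, and register-file handles straight
// out of the ComputeUnitParams instance `p`.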
    numVectorGlobalMemUnits(p.num_global_mem_pipes),
    numVectorSharedMemUnits(p.num_shared_mem_pipes),
    numScalarMemUnits(p.num_scalar_mem_pipes),
    numVectorALUs(p.num_SIMDs),
    numScalarALUs(p.num_scalar_cores),
    vrfToCoalescerBusWidth(p.vrf_to_coalescer_bus_width),
    coalescerToVrfBusWidth(p.coalescer_to_vrf_bus_width),
    registerManager(p.register_manager),
    scoreboardCheckStage(p, *this, scoreboardCheckToSchedule),
    scheduleStage(p, *this, scoreboardCheckToSchedule, scheduleToExecute),
    execStage(p, *this, scheduleToExecute),
    globalMemoryPipe(p, *this),
    localMemoryPipe(p, *this),
    scalarMemoryPipe(p, *this),
    tickEvent([this]{ exec(); }, "Compute unit tick event",
              /* ... */),
    vrf(p.vector_register_file), srf(p.scalar_register_file),
    simdWidth(p.simd_width),
    spBypassPipeLength(p.spbypass_pipe_length),
    dpBypassPipeLength(p.dpbypass_pipe_length),
    scalarPipeStages(p.scalar_pipe_length),
    operandNetworkLength(p.operand_network_length),
    issuePeriod(p.issue_period),
    vrf_gm_bus_latency(p.vrf_gm_bus_latency),
    srf_scm_bus_latency(p.srf_scm_bus_latency),
    vrf_lm_bus_latency(p.vrf_lm_bus_latency),
    perLaneTLB(p.perLaneTLB), prefetchDepth(p.prefetch_depth),
    prefetchStride(p.prefetch_stride), prefetchType(p.prefetch_prev_type),
    debugSegFault(p.debugSegFault),
    functionalTLB(p.functionalTLB), localMemBarrier(p.localMemBarrier),
    countPages(p.countPages),
    req_tick_latency(p.mem_req_latency * p.clk_domain->clockPeriod()),
    resp_tick_latency(p.mem_resp_latency * p.clk_domain->clockPeriod()),
    _requestorId(p.system->getRequestorId(this, "ComputeUnit")),
    lds(*p.localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
    _cacheLineSize(p.system->cacheLineSize()),
    _numBarrierSlots(p.num_barrier_slots),
    globalSeqNum(0), wavefrontSize(p.wf_size),
    scoreboardCheckToSchedule(p),
    scheduleToExecute(p),
    fatal_if(p.wf_size > std::numeric_limits<unsigned long long>::digits ||
             p.wf_size <= 0,
             "WF size is larger than the host can support");
    fatal_if(!isPowerOf2(wfSize()),
             "Wavefront size should be a power of 2");
    numCyclesPerStoreTransfer =
        (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
                (double)vrfToCoalescerBusWidth);

    numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
                               / coalescerToVrfBusWidth;
    idleWfs = p.n_wf * numVectorALUs;
    lastVaddrWF.resize(numVectorALUs);
    wfList.resize(numVectorALUs);

    wfBarrierSlots.resize(p.num_barrier_slots, WFBarrier());
    for (int i = 0; i < p.num_barrier_slots; ++i) {
        freeBarrierIds.insert(i);
    }
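
    // Wire up the Wavefront objects: wfList is indexed [SIMD][WF slot], and
    // the per-lane last-virtual-address history used by the prefetcher
    // starts out zeroed.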
    for (int j = 0; j < numVectorALUs; ++j) {
        lastVaddrWF[j].resize(p.n_wf);

        for (int i = 0; i < p.n_wf; ++i) {
            lastVaddrWF[j][i].resize(wfSize());

            wfList[j].push_back(p.wavefronts[j * p.n_wf + i]);
            wfList[j][i]->setParent(this);

            for (int k = 0; k < wfSize(); ++k) {
                lastVaddrWF[j][i][k] = 0;
            }
        }
    }
    lastVaddrSimd.resize(numVectorALUs);

    for (int i = 0; i < numVectorALUs; ++i) {
        lastVaddrSimd[i].resize(wfSize(), 0);
    }

    lastVaddrCU.resize(wfSize());
    if (p.execPolicy == "OLDEST-FIRST") {
        // ...
    } else if (p.execPolicy == "ROUND-ROBIN") {
        // ...
    } else {
        fatal("Invalid WF execution policy (CU)\n");
    }
    for (int i = 0; i < p.port_memory_port_connection_count; ++i) {
        // ...
    }

    for (int i = 0; i < p.port_translation_port_connection_count; ++i) {
        // ...
    }

    memPortTokens = new TokenManager(p.max_cu_tokens);
    lastExecCycle.resize(numVectorALUs, 0);

    for (int i = 0; i < vrf.size(); ++i) {
        vrf[i]->setParent(this);
    }

    for (int i = 0; i < srf.size(); ++i) {
        srf[i]->setParent(this);
    }

    numVecRegsPerSimd = vrf[0]->numRegs();
    numScalarRegsPerSimd = srf[0]->numRegs();

    registerManager->setParent(this);

    instExecPerSimd.resize(numVectorALUs, 0);
216 "Cache line size should be a power of two.");
217 cacheLineBits =
floorLog2(_cacheLineSize);
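
    // --- Wavefront dispatch (excerpted) ---
    // Initialize a newly dispatched Wavefront `w` from the queue entry
    // `task`: workgroup dimensions first, then per-lane work-item IDs.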
    w->workGroupSz[0] = task->wgSize(0);
    w->workGroupSz[1] = task->wgSize(1);
    w->workGroupSz[2] = task->wgSize(2);
    w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
    // ...
    w->computeActualWgSz(task);
    static int _n_wave = 0;
        if (k + waveId * wfSize() < w->actualWgSzTotal)
            init_mask[k] = 1;
    // ...
    w->execMask() = init_mask;
    // ...
    w->initMask = init_mask.to_ullong();
    w->barrierId(bar_id);
    // ...
    assert(!w->hasBarrier());
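
    // Decompose each lane's linear position in the workgroup into 3-D
    // work-item IDs (dimension 0 varies fastest), then re-linearize them in
    // the same order to get the flat ID.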
    w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
    w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
                          w->actualWgSz[1];
    w->workItemId[2][k] = (k + waveId * wfSize()) /
                          (w->actualWgSz[0] * w->actualWgSz[1]);

    w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
        w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
        w->workItemId[0][k];
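
    // The workgroup's own 3-D ID is recovered the same way from the linear
    // workgroup ID, using the grid's per-dimension workgroup counts.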
    w->workGroupId[0] = w->wgId % task->numWg(0);
    w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1);
    w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1));
    w->ldsChunk = ldsChunk;

    GEM5_VAR_USED int32_t refCount =
        lds.increaseRefCounter(w->dispatchId, w->wgId);
    DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
            cu_id, w->wgId, refCount);
    w->instructionBuffer.clear();
    // ...
    DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
            "WF[%d][%d]. Ref cnt:%d\n", _n_wave, w->barrierId(), cu_id,
            w->simdId, w->wfSlotId, refCount);

    w->initRegState(task, w->actualWgSzTotal);
    // ...
        = std::make_shared<GPUDynInst>(this, nullptr, /* ... */);
    gpuDynInst->kern_id = kernId;
    // ...
    req->setContext(gpuDynInst->wfDynId);
    DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id);
    // ...
    panic_if(!ldsChunk, "was not able to reserve space for this WG");
    if (num_wfs_in_wg > 1) {
        // ...
        assert(!wf_barrier.maxBarrierCnt());
        assert(!wf_barrier.numAtBarrier());
        wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);

        DPRINTF(GPUSync, "CU[%d] - Dispatching WG with barrier Id%d. "
                "%d waves using this barrier.\n", cu_id, barrier_id,
                num_wfs_in_wg);
    }
    // ...
    DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] "
            "vregDemand[%d] sregDemand[%d]\n", i, j, w->wfDynId,
            vregDemand, sregDemand);
503 "Instruction Buffer of WF%d can't be empty",
w->wgId);
512 "Instruction Buffer of WF%d can't be empty",
w->wgId);
515 auto it =
pipeMap.find(ii->seqNum());
    int trueWgSizeTotal = 1;
    // ...
        trueWgSizeTotal *= trueWgSize[d];
        DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
    // ...
    DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
    int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
    num_wfs_in_wg = numWfs;

    bool barrier_avail = true;
    // ...
        barrier_avail = false;
557 "WG with %d WFs and %d VGPRs per WI can not be allocated to CU "
558 "that has %d VGPRs\n",
561 "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU "
    // ...
    int numMappedWfs = 0;
    // ...
    if (numMappedWfs < numWfs &&
        // ...
    assert(numMappedWfs <= numWfs);

    bool vregAvail = true;
    bool sregAvail = true;
    // ...
    if (numMappedWfs < numWfs) {
    // ...
    DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, \
VGPR Availability = %d, SGPR Availability = %d\n",
            freeWfSlots, numMappedWfs, vregAvail, sregAvail);
    // ...
    if (!barrier_avail) {
        // ...
    }
    // ...
    bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
                        && ldsAvail && barrier_avail;
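
    // --- Barrier bookkeeping (excerpted) ---
    // Thin accessors that forward to the WFBarrier slot owned by the
    // workgroup; each statement below is the body of a separate one-line
    // member function.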
    return wf_barrier.numYetToReachBarrier();
    // ...
    return wf_barrier.allAtBarrier();
    // ...
    wf_barrier.incNumAtBarrier();
    // ...
    return wf_barrier.numAtBarrier();
    // ...
    return wf_barrier.maxBarrierCnt();
    // ...
    wf_barrier.decMaxBarrierCnt();
    // ...
    wf_barrier.release();
    for (auto &vecRegFile : vrf) {
        // ...
    }

    for (auto &scRegFile : srf) {
        // ...
    }
772 "No support for multiple Global Memory Pipelines exists!!!");
779 "No support for multiple Local Memory Pipelines exists!!!");
786 "No support for multiple Scalar Memory Pipelines exists!!!");
    if (gpuDynInst->isKernelLaunch()) {
        // ...
        assert(pkt->req->isKernel());
        assert(pkt->req->isInvL1());
        // ...
    } else if (/* ... */
               && gpuDynInst->isEndOfKernel()) {
        // ...
        assert(pkt->req->isKernel());
        assert(pkt->req->isGL2CacheFlush());
        DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
                cu_id, w->simdId, w->wfSlotId,
                w->wfDynId, w->wgId);
    if (!pkt->req->isKernel()) {
        // ...
        DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing "
                "outstanding reqs %d => %d\n", gpuDynInst->simdId,
                gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
                gpuDynInst->disassemble(), w->outstandingReqs,
                w->outstandingReqs - 1);
893 "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
895 gpuDynInst->seqNum(),
index, pkt->
req->getPaddr());
906 assert(!pkt->
req->isKernel());
    assert(gpuDynInst->numScalarReqs > 0);

    gpuDynInst->numScalarReqs--;
    // ...
    if (!gpuDynInst->numScalarReqs) {
        if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
            computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
                gpuDynInst);
        } else {
            computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
                gpuDynInst);
        }
    }
    for (const auto &pkt : retries) {
        if (!sendTimingReq(pkt)) {
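
    // Port retry handling (excerpted): walk the retry queue in order,
    // resending from the front and stopping at the first request that fails
    // again; only successfully resent packets are popped.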
    int len = retries.size();
    // ...
    for (int i = 0; i < len; ++i) {
        // ...
        GEM5_VAR_USED GPUDynInstPtr gpuDynInst = retries.front().second;
        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
                computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                pkt->req->getPaddr());
        // ...
        if (!sendTimingReq(pkt)) {
            DPRINTF(GPUMem, "failed again!\n");
            // ...
        } else {
            DPRINTF(GPUMem, "successful!\n");
            // ...
        }
    }
    computeUnit->fetchStage.processFetchReturn(pkt);
    // ...
    int len = retries.size();
    // ...
    for (int i = 0; i < len; ++i) {
        // ...
        GEM5_VAR_USED Wavefront *wavefront = retries.front().second;
        DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
                computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
                pkt->req->getPaddr());
        if (!sendTimingReq(pkt)) {
            DPRINTF(GPUFetch, "failed again!\n");
            // ...
        } else {
            DPRINTF(GPUFetch, "successful!\n");
            retries.pop_front();
        }
    }
    Addr tmp_vaddr = pkt->req->getVaddr();
    // ...
    pkt->req->setPC(gpuDynInst->wavefront()->pc());
    // ...
    pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());
    // ...
    } else if (pkt->isRead()) {
        // ...
    } else {
        fatal("pkt is not a read nor a write\n");
    }
    unsigned size = pkt->getSize();
    // ...
        panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
              cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
        if (!p->pTable->translate(vaddr, paddr)) {
            if (!p->fixupFault(vaddr)) {
                panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
                      cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                      vaddr);
            }
        }
        tlbPort[tlbPort_index].sendFunctional(pkt);
        // ...
        int hit_level = translation_state->hitLevel;
        assert(hit_level != -1);
        // ...
            safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
        // ...
        delete sender_state->saved;
        delete sender_state;
        // ...
        assert(pkt->req->hasPaddr());
        assert(pkt->req->hasSize());
        uint8_t *tmpData = oldPkt->getPtr<uint8_t>();
        // ...
        gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
        gpuDynInst->tlbHitLevel[index] = hit_level;
        // ...
        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
                "scheduled\n", cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
    } else if (tlbPort[tlbPort_index].isStalled()) {
        assert(tlbPort[tlbPort_index].retries.size() > 0);

        DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
                "failed!\n", cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, tmp_vaddr);

        tlbPort[tlbPort_index].retries.push_back(pkt);
    } else if (!tlbPort[tlbPort_index].sendTimingReq(pkt)) {
        // ...
        tlbPort[tlbPort_index].stallPort();

        DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
                "failed!\n", cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, tmp_vaddr);

        tlbPort[tlbPort_index].retries.push_back(pkt);
    } else {
        DPRINTF(GPUTLB,
                "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
                cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
    }
    gpuDynInst->resetEntireStatusVector();
    // ...
        gpuDynInst->decrementStatusVector(index);
    // ...
    tlbPort[tlbPort_index].sendFunctional(pkt);
    // ...
    memPort[0].sendFunctional(new_pkt);
    DPRINTF(GPUMem, "Functional sendRequest\n");
    DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
            gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
            new_pkt->req->getPaddr());
    // ...
        safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
    DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n",
            // ...
            pkt->req->getVaddr());
    // ...
    assert(gpuDynInst->isGlobalSeg() ||
           gpuDynInst->executedAs() == enums::SC_GLOBAL);
    // ...
    req = std::make_shared<Request>(/* ... */);
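
    // Global memory-fence path (excerpted): a kernel-launch fence goes out
    // as an "acquire" (L1 invalidate) and an end-of-kernel fence as a
    // "release" (GL2 flush); both are stamped with the instruction's
    // sequence number.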
    if (kernelMemSync) {
        if (gpuDynInst->isKernelLaunch()) {
            // ...
            req->setReqInstSeqNum(gpuDynInst->seqNum());
            // ...
            memPort[0].createMemReqEvent(pkt);

            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
                    "an acquire\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
        } else {
            assert(gpuDynInst->isEndOfKernel());
            // ...
            req->setReqInstSeqNum(gpuDynInst->seqNum());
            // ...
            memPort[0].createMemReqEvent(pkt);

            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
                    "a release\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
        }
    } else {
        gpuDynInst->setRequestFlags(req);
        // ...
        req->setReqInstSeqNum(gpuDynInst->seqNum());
        // ...
        memPort[0].createMemReqEvent(pkt);

        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
                cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
                pkt->req->getPaddr());
    }
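
    // DataPort::processMemRespEvent (excerpted): match the response back to
    // its GPUDynInst through memStatusVector, clear the lane bits the packet
    // covers, and treat the instruction as complete once all lanes are zero.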
        safe_cast<DataPort::SenderState*>(pkt->senderState);
    // ...
    DPRINTF(GPUPort,
            "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
            compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
            pkt->req->getPaddr(), id);

    Addr paddr = pkt->req->getPaddr();
        int index = gpuDynInst->memStatusVector[paddr].back();

        DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
                pkt->req->getPaddr(), id);

        gpuDynInst->memStatusVector[paddr].pop_back();
        gpuDynInst->pAddr = pkt->req->getPaddr();
        // ...
        gpuDynInst->decrementStatusVector(index);
        DPRINTF(GPUMem, "bitvector is now %s\n",
                gpuDynInst->printStatusVector());
        if (gpuDynInst->allLanesZero()) {
            auto iter = gpuDynInst->memStatusVector.begin();
            auto end = gpuDynInst->memStatusVector.end();

            while (iter != end) {
                assert(iter->second.empty());
                // ...
            }

            // ...
            if (compute_unit->headTailMap.count(gpuDynInst)) {
                // ...
            }

            gpuDynInst->memStatusVector.clear();
            DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
                    compute_unit->cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId);
        } else {
            if (!compute_unit->headTailMap.count(gpuDynInst)) {
                compute_unit->headTailMap
                    .insert(std::make_pair(gpuDynInst, curTick()));
            }
        }
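
    // DTLBPort::recvTimingResp (excerpted): a null tlbEntry means the
    // translation failed and the faulting wavefront must be looked up;
    // otherwise the per-level TLB hit statistics are updated before the
    // translation is turned back into a memory request.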
    Addr line = pkt->req->getPaddr();

    DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
            pkt->req->getVaddr(), line);
    // ...
    computeUnit->stats.tlbCycles += curTick();
    // ...
        safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
    if (!translation_state->tlbEntry) {
        // ...
            safe_cast<DTLBPort::SenderState*>(translation_state->saved);
        // ...
        computeUnit->wfList[sender_state->_gpuDynInst->simdId]
            // ...
        DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
                 pkt->req->getVaddr());
    int hit_level = translation_state->hitLevel;
    computeUnit->stats.hitsPerTLBLevel[hit_level]++;
    // ...
    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());
    // ...
    delete translation_state;
    // ...
        safe_cast<DTLBPort::SenderState*>(pkt->senderState);
    // ...
    gpuDynInst->memStatusVector[line].push_back(mp_index);
    gpuDynInst->tlbHitLevel[mp_index] = hit_level;
        panic("unsupported response to request conversion %s\n",
              /* ... */);
    if (computeUnit->prefetchDepth) {
        int simdId = gpuDynInst->simdId;
        int wfSlotId = gpuDynInst->wfSlotId;
        // ...
        switch (computeUnit->prefetchType) {
          case enums::PF_CU:
            last = computeUnit->lastVaddrCU[mp_index];
            break;
          case enums::PF_PHASE:
            last = computeUnit->lastVaddrSimd[simdId][mp_index];
            break;
          case enums::PF_WF:
            last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
          // ...
        }

        DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
                computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
        computeUnit->lastVaddrCU[mp_index] = vaddr;
        computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
        computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
        // ...
        stride = (computeUnit->prefetchType == enums::PF_STRIDE) ?
            computeUnit->prefetchStride : stride;

        DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
                computeUnit->cu_id, simdId, wfSlotId, mp_index);
        for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
            // ...
            RequestPtr prefetch_req = std::make_shared<Request>(
                /* ... */ computeUnit->requestorId(), /* ... */);
            // ...
                computeUnit->shader->gpuTc, true);
            // ...
            sendFunctional(prefetch_pkt);
            // ...
                safe_cast<X86ISA::GpuTLB::TranslationState*>(/* ... */);
            // ...
            delete prefetch_pkt;
        }
    // ...
    computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);

    DPRINTF(GPUPort,
            "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
            computeUnit->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());

    computeUnit->schedule(mem_req_event, curTick() +
                          computeUnit->req_tick_latency);
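
    // The request/response paths hand the packet to self-deleting
    // EventFunctionWrappers so the port can schedule the processing after
    // the modeled request or response latency.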
    return new EventFunctionWrapper(
        [this, pkt]{ processMemReqEvent(pkt); },
        "ComputeUnit memory request event", true);
    // ...
    return new EventFunctionWrapper(
        [this, pkt]{ processMemRespEvent(pkt); },
        "ComputeUnit memory response event", true);
    GEM5_VAR_USED ComputeUnit *compute_unit = computeUnit;

    if (!(sendTimingReq(pkt))) {
        retries.push_back(std::make_pair(pkt, gpuDynInst));
        // ...
                "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
                compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                id, pkt->req->getPaddr());
    } else {
        // ...
                "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data "
                "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id,
                pkt->req->getPaddr());
    }
1598 return "ComputeUnit scalar memory request event";
1604 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1612 "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
1613 compute_unit->cu_id, gpuDynInst->simdId,
1614 gpuDynInst->wfSlotId, pkt->req->getPaddr());
1617 "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data "
1618 "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1619 gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
1620 pkt->req->getPaddr());
    int len = retries.size();

    DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, len);
    // ...
    assert(isStalled());
    for (int i = 0; i < len; ++i) {
        // ...
        DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x",
                computeUnit->cu_id, vaddr);

        if (!sendTimingReq(pkt)) {
            // ...
            DPRINTF(GPUTLB, ": failed again\n");
            // ...
        } else {
            DPRINTF(GPUTLB, ": successful\n");
            retries.pop_front();
        }
    }
    // ...
        safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
    // ...
    fatal_if(!translation_state->tlbEntry,
             "Translation of vaddr %#x failed\n", pkt->req->getVaddr());

    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());
    // ...
    delete translation_state;
        safe_cast<ScalarDTLBPort::SenderState*>(pkt->senderState);
    // ...
    GEM5_VAR_USED Wavefront *w = gpuDynInst->wavefront();

    DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
            "translation: VA %#x -> PA %#x\n", computeUnit->cu_id, w->simdId,
            w->wfSlotId, w->kernId, pkt->req->getVaddr(),
            pkt->req->getPaddr());
    fatal("Scalar DTLB received unexpected MemCmd response %s\n",
    if (!computeUnit->scalarDataPort.sendTimingReq(req_pkt)) {
        computeUnit->scalarDataPort.retries.push_back(req_pkt);
        DPRINTF(GPUMem, "send scalar req failed for: %s\n",
                gpuDynInst->disassemble());
    } else {
        DPRINTF(GPUMem, "send scalar req for: %s\n",
                gpuDynInst->disassemble());
    }
    GEM5_VAR_USED Addr line = pkt->req->getPaddr();
    DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
            computeUnit->cu_id, pkt->req->getVaddr(), line);
    // ...
        = safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);

    bool success = translation_state->tlbEntry != nullptr;
    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());
    // ...
    delete translation_state;
    // ...
        safe_cast<ITLBPort::SenderState*>(pkt->senderState);
    computeUnit->fetchStage.fetch(pkt, wavefront);
    // ...
    int len = retries.size();
    DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, len);
    // ...
    assert(isStalled());
    for (int i = 0; i < len; ++i) {
        // ...
        DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x",
                computeUnit->cu_id, vaddr);

        if (!sendTimingReq(pkt)) {
            // ...
            DPRINTF(GPUTLB, ": failed again\n");
            // ...
        } else {
            DPRINTF(GPUTLB, ": successful\n");
            retries.pop_front();
        }
    }
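
    // updateInstStats (excerpted): classify each executed instruction for
    // the stat counters. Scalar and vector instructions are binned
    // separately, and FLAT instructions are attributed to LDS or global
    // memory based on where they actually resolved.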
    if (gpuDynInst->isScalar()) {
        if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
            // ...
        } else if (gpuDynInst->isLoad()) {
            // ...
        } else if (gpuDynInst->isStore()) {
            // ...
        }
    } else {
        if (gpuDynInst->isALU()) {
            // ...
                += gpuDynInst->wavefront()->execMask().count();
        } else if (gpuDynInst->isFlat()) {
            if (gpuDynInst->isLocalMem()) {
                // ...
            }
            // ...
        } else if (gpuDynInst->isLocalMem()) {
            // ...
        } else if (gpuDynInst->isLoad()) {
            // ...
        } else if (gpuDynInst->isStore()) {
            // ...
        }
        if (gpuDynInst->isLoad()) {
            switch (gpuDynInst->executedAs()) {
              case enums::SC_SPILL:
                // ...
              case enums::SC_GLOBAL:
                // ...
              case enums::SC_GROUP:
                // ...
              case enums::SC_PRIVATE:
                // ...
              case enums::SC_READONLY:
                // ...
              case enums::SC_KERNARG:
                // ...
              default:
                fatal("%s has no valid segment\n", gpuDynInst->disassemble());
            }
        } else if (gpuDynInst->isStore()) {
            switch (gpuDynInst->executedAs()) {
              case enums::SC_SPILL:
                // ...
              case enums::SC_GLOBAL:
                // ...
              case enums::SC_GROUP:
                // ...
              case enums::SC_PRIVATE:
                // ...
              case enums::SC_READONLY:
                // ...
              case enums::SC_KERNARG:
                // ...
              default:
                fatal("%s has no valid segment\n", gpuDynInst->disassemble());
            }
        }
    *page_stat_file << "page, wavefront accesses, workitem accesses"
                    << std::endl;
    // ...
        *page_stat_file << std::hex << iter.first << ",";
        *page_stat_file << std::dec << iter.second.first << ",";
        *page_stat_file << std::dec << iter.second.second << std::endl;
// ...
    const uint32_t wgId) const
// ...
    for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
    RequestPtr newRequest = std::make_shared<Request>();
    newRequest->setPaddr(0x0);
    // ...
    fatal_if(!senderState, "did not get the right sort of sender state");
    // ...
    computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
    fatal_if(!sender_state, "packet without a valid sender state");
    // ...
    fatal_if(retries.empty(), "must have retries waiting to be stalled");
    // ...
    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
            computeUnit->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId);
        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, pkt->req->getPaddr());
    // ...
        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, pkt->req->getPaddr());
    auto queueSize = retries.size();

    DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, queueSize);

    fatal_if(retries.empty(),
             "why was there a recvReqRetry() with no pending reqs?");
    fatal_if(!isStalled(),
             "recvReqRetry() happened when the port was not stalled");
    while (!retries.empty()) {
        // ...
        DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);
        // ...
            DPRINTF(GPUPort, ": LDS send failed again\n");
        // ...
            DPRINTF(GPUPort, ": LDS send successful\n");
    : statistics::Group(parent),
      ADD_STAT(vALUInsts, "Number of vector ALU insts issued."),
      ADD_STAT(vALUInstsPerWF, "The avg. number of vector ALU insts issued "
               "per-wavefront."),
      ADD_STAT(sALUInsts, "Number of scalar ALU insts issued."),
      ADD_STAT(sALUInstsPerWF, "The avg. number of scalar ALU insts issued "
               "per-wavefront."),
      ADD_STAT(instCyclesVALU,
               "Number of cycles needed to execute VALU insts."),
      ADD_STAT(instCyclesSALU,
               "Number of cycles needed to execute SALU insts."),
      ADD_STAT(threadCyclesVALU, "Number of thread cycles used to execute "
               "vector ALU ops. Similar to instCyclesVALU but multiplied by "
               "the number of active threads."),
2127 "Percentage of active vector ALU threads in a wave."),
      ADD_STAT(ldsNoFlatInsts, "Number of LDS insts issued, not including FLAT"
               " accesses that resolve to LDS."),
      ADD_STAT(ldsNoFlatInstsPerWF, "The avg. number of LDS insts (not "
               "including FLAT accesses that resolve to LDS) per-wavefront."),
      ADD_STAT(flatVMemInsts,
               "The number of FLAT insts that resolve to vmem issued."),
      ADD_STAT(flatVMemInstsPerWF, "The average number of FLAT insts that "
               "resolve to vmem issued per-wavefront."),
      ADD_STAT(flatLDSInsts,
               "The number of FLAT insts that resolve to LDS issued."),
      ADD_STAT(flatLDSInstsPerWF, "The average number of FLAT insts that "
               "resolve to LDS issued per-wavefront."),
      ADD_STAT(vectorMemWrites,
               "Number of vector mem write insts (excluding FLAT insts)."),
      ADD_STAT(vectorMemWritesPerWF, "The average number of vector mem write "
               "insts (excluding FLAT insts) per-wavefront."),
      ADD_STAT(vectorMemReads,
               "Number of vector mem read insts (excluding FLAT insts)."),
      ADD_STAT(vectorMemReadsPerWF, "The avg. number of vector mem read insts "
               "(excluding FLAT insts) per-wavefront."),
      ADD_STAT(scalarMemWrites, "Number of scalar mem write insts."),
      ADD_STAT(scalarMemWritesPerWF,
               "The average number of scalar mem write insts per-wavefront."),
      ADD_STAT(scalarMemReads, "Number of scalar mem read insts."),
      ADD_STAT(scalarMemReadsPerWF,
               "The average number of scalar mem read insts per-wavefront."),
      ADD_STAT(vectorMemReadsPerKiloInst,
               "Number of vector mem reads per kilo-instruction"),
      ADD_STAT(vectorMemWritesPerKiloInst,
               "Number of vector mem writes per kilo-instruction"),
      ADD_STAT(vectorMemInstsPerKiloInst,
               "Number of vector mem insts per kilo-instruction"),
      ADD_STAT(scalarMemReadsPerKiloInst,
               "Number of scalar mem reads per kilo-instruction"),
      ADD_STAT(scalarMemWritesPerKiloInst,
               "Number of scalar mem writes per kilo-instruction"),
      ADD_STAT(scalarMemInstsPerKiloInst,
               "Number of scalar mem insts per kilo-instruction"),
      ADD_STAT(instCyclesVMemPerSimd, "Number of cycles to send address, "
               "command, data from VRF to vector memory unit, per SIMD"),
      ADD_STAT(instCyclesScMemPerSimd, "Number of cycles to send address, "
               "command, data from SRF to scalar memory unit, per SIMD"),
      ADD_STAT(instCyclesLdsPerSimd, "Number of cycles to send address, "
               "command, data from VRF to LDS unit, per SIMD"),
      ADD_STAT(globalReads, "Number of reads to the global segment"),
      ADD_STAT(globalWrites, "Number of writes to the global segment"),
      ADD_STAT(globalMemInsts,
               "Number of memory instructions sent to the global segment"),
      ADD_STAT(argReads, "Number of reads to the arg segment"),
      ADD_STAT(argWrites, "Number of writes to the arg segment"),
      ADD_STAT(argMemInsts,
               "Number of memory instructions sent to the arg segment"),
      ADD_STAT(spillReads, "Number of reads to the spill segment"),
      ADD_STAT(spillWrites, "Number of writes to the spill segment"),
      ADD_STAT(spillMemInsts,
               "Number of memory instructions sent to the spill segment"),
      ADD_STAT(groupReads, "Number of reads to the group segment"),
      ADD_STAT(groupWrites, "Number of writes to the group segment"),
      ADD_STAT(groupMemInsts,
               "Number of memory instructions sent to the group segment"),
      ADD_STAT(privReads, "Number of reads to the private segment"),
      ADD_STAT(privWrites, "Number of writes to the private segment"),
      ADD_STAT(privMemInsts,
               "Number of memory instructions sent to the private segment"),
      ADD_STAT(readonlyReads, "Number of reads to the readonly segment"),
      ADD_STAT(readonlyWrites, "Number of writes to the readonly segment"),
      ADD_STAT(readonlyMemInsts,
               "Number of memory instructions sent to the readonly segment"),
      ADD_STAT(kernargReads, "Number of reads sent to the kernarg segment"),
      ADD_STAT(kernargWrites, "Number of writes sent to the kernarg segment"),
      ADD_STAT(kernargMemInsts,
               "Number of memory instructions sent to the kernarg segment"),
2203 "wave level parallelism: count of active waves at wave launch"),
2204 ADD_STAT(tlbRequests,
"number of uncoalesced requests"),
2206 "total number of cycles for all uncoalesced requests"),
2207 ADD_STAT(tlbLatency,
"Avg. translation latency for data translations"),
2209 "TLB hits distribution (0 for page table, x for Lx-TLB)"),
2210 ADD_STAT(ldsBankAccesses,
"Total number of LDS bank accesses"),
2212 "Number of bank conflicts per LDS memory packet"),
2214 "pages touched per wf (over all mem. instr.)"),
2216 "dynamic non-flat global memory instruction count"),
2218 "dynamic flat global memory instruction count"),
2219 ADD_STAT(dynamicLMemInstrCnt,
"dynamic local memory intruction count"),
2220 ADD_STAT(wgBlockedDueBarrierAllocation,
2221 "WG dispatch was blocked due to lack of barrier resources"),
2222 ADD_STAT(wgBlockedDueLdsAllocation,
2223 "Workgroup blocked due to LDS capacity"),
      ADD_STAT(numInstrExecuted, "number of instructions executed"),
      ADD_STAT(execRateDist, "Instruction Execution Rate: Number of executed "
               "vector instructions per cycle"),
2228 "number of vec ops executed (e.g. WF size/inst)"),
2230 "number of f16 vec ops executed (e.g. WF size/inst)"),
2232 "number of f32 vec ops executed (e.g. WF size/inst)"),
2234 "number of f64 vec ops executed (e.g. WF size/inst)"),
2236 "number of fma16 vec ops executed (e.g. WF size/inst)"),
2238 "number of fma32 vec ops executed (e.g. WF size/inst)"),
2240 "number of fma64 vec ops executed (e.g. WF size/inst)"),
2242 "number of mac16 vec ops executed (e.g. WF size/inst)"),
2244 "number of mac32 vec ops executed (e.g. WF size/inst)"),
2246 "number of mac64 vec ops executed (e.g. WF size/inst)"),
2248 "number of mad16 vec ops executed (e.g. WF size/inst)"),
2250 "number of mad32 vec ops executed (e.g. WF size/inst)"),
2252 "number of mad64 vec ops executed (e.g. WF size/inst)"),
2254 "number of two op FP vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(totalCycles, "number of cycles the CU ran for"),
      ADD_STAT(vpc, "Vector Operations per cycle (this CU only)"),
      ADD_STAT(vpc_f16, "F16 Vector Operations per cycle (this CU only)"),
      ADD_STAT(vpc_f32, "F32 Vector Operations per cycle (this CU only)"),
      ADD_STAT(vpc_f64, "F64 Vector Operations per cycle (this CU only)"),
      ADD_STAT(ipc, "Instructions per cycle (this CU only)"),
      ADD_STAT(controlFlowDivergenceDist, "number of lanes active per "
               "instruction (over all instructions)"),
      ADD_STAT(activeLanesPerGMemInstrDist,
               "number of active lanes per global memory instruction"),
      ADD_STAT(activeLanesPerLMemInstrDist,
               "number of active lanes per local memory instruction"),
2268 "Number of dynamic non-GM memory insts executed"),
2269 ADD_STAT(numTimesWgBlockedDueVgprAlloc,
"Number of times WGs are "
2270 "blocked due to VGPR allocation per SIMD"),
2271 ADD_STAT(numTimesWgBlockedDueSgprAlloc,
"Number of times WGs are "
2272 "blocked due to SGPR allocation per SIMD"),
2273 ADD_STAT(numCASOps,
"number of compare and swap operations"),
2275 "number of compare and swap operations that failed"),
      ADD_STAT(completedWfs, "number of completed wavefronts"),
      ADD_STAT(completedWGs, "number of completed workgroups"),
      ADD_STAT(headTailLatency, "ticks between first and last cache block "
               "arrival at coalescer"),
      ADD_STAT(instInterleave, "Measure of instruction interleaving per SIMD")
    // ...
    for (int i = 0; i < 4; ++i) {