#include "debug/GPUDisp.hh"
#include "debug/GPUExec.hh"
#include "debug/GPUFetch.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUPort.hh"
#include "debug/GPUPrefetch.hh"
#include "debug/GPUReg.hh"
#include "debug/GPURename.hh"
#include "debug/GPUSync.hh"
#include "debug/GPUTLB.hh"
    numVectorGlobalMemUnits(p.num_global_mem_pipes),
    numVectorSharedMemUnits(p.num_shared_mem_pipes),
    numScalarMemUnits(p.num_scalar_mem_pipes),
    numVectorALUs(p.num_SIMDs),
    numScalarALUs(p.num_scalar_cores),
    vrfToCoalescerBusWidth(p.vrf_to_coalescer_bus_width),
    coalescerToVrfBusWidth(p.coalescer_to_vrf_bus_width),
    registerManager(p.register_manager),
    scoreboardCheckStage(p, *this, scoreboardCheckToSchedule),
    scheduleStage(p, *this, scoreboardCheckToSchedule, scheduleToExecute),
    execStage(p, *this, scheduleToExecute),
    globalMemoryPipe(p, *this),
    localMemoryPipe(p, *this),
    scalarMemoryPipe(p, *this),
    tickEvent([this]{ exec(); }, "Compute unit tick event",
              false, Event::CPU_Tick_Pri),
    vrf(p.vector_register_file), srf(p.scalar_register_file),
    simdWidth(p.simd_width),
    spBypassPipeLength(p.spbypass_pipe_length),
    dpBypassPipeLength(p.dpbypass_pipe_length),
    scalarPipeStages(p.scalar_pipe_length),
    operandNetworkLength(p.operand_network_length),
    issuePeriod(p.issue_period),
    vrf_gm_bus_latency(p.vrf_gm_bus_latency),
    srf_scm_bus_latency(p.srf_scm_bus_latency),
    vrf_lm_bus_latency(p.vrf_lm_bus_latency),
    perLaneTLB(p.perLaneTLB), prefetchDepth(p.prefetch_depth),
    prefetchStride(p.prefetch_stride), prefetchType(p.prefetch_prev_type),
    debugSegFault(p.debugSegFault),
    functionalTLB(p.functionalTLB), localMemBarrier(p.localMemBarrier),
    countPages(p.countPages),
    req_tick_latency(p.mem_req_latency * p.clk_domain->clockPeriod()),
    resp_tick_latency(p.mem_resp_latency * p.clk_domain->clockPeriod()),
    _requestorId(p.system->getRequestorId(this, "ComputeUnit")),
    lds(*p.localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
    _cacheLineSize(p.system->cacheLineSize()),
    _numBarrierSlots(p.num_barrier_slots),
    globalSeqNum(0), wavefrontSize(p.wf_size),
    scoreboardCheckToSchedule(p),
    scheduleToExecute(p),
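
    // Sanity checks on the configured wavefront size: the per-lane execution
    // mask is kept in an unsigned long long, so wfSize() must fit in the
    // host's widest integer type and be a power of two.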
    fatal_if(p.wf_size > std::numeric_limits<unsigned long long>::digits ||
             p.wf_size <= 0,
             "WF size is larger than the host can support");
    fatal_if(!isPowerOf2(wfSize()),
             "Wavefront size should be a power of 2");
    numCyclesPerStoreTransfer =
        (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
                (double)vrfToCoalescerBusWidth);

    numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
                               / coalescerToVrfBusWidth;

    idleWfs = p.n_wf * numVectorALUs;
    lastVaddrWF.resize(numVectorALUs);
    wfList.resize(numVectorALUs);

    wfBarrierSlots.resize(p.num_barrier_slots, WFBarrier());
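
    // All barrier slots start out free; IDs are handed out from this set as
    // multi-wavefront workgroups are dispatched and returned on release.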
    for (int i = 0; i < p.num_barrier_slots; ++i) {
        freeBarrierIds.insert(i);
    }
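
    // p.wavefronts is a flat, SIMD-major vector, so the wavefront in slot i
    // of SIMD j lives at index j * n_wf + i.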
    for (int j = 0; j < numVectorALUs; ++j) {
        lastVaddrWF[j].resize(p.n_wf);

        for (int i = 0; i < p.n_wf; ++i) {
            lastVaddrWF[j][i].resize(wfSize());

            wfList[j].push_back(p.wavefronts[j * p.n_wf + i]);
            wfList[j][i]->setParent(this);

            for (int k = 0; k < wfSize(); ++k) {
                lastVaddrWF[j][i][k] = 0;
            }
        }
    }
    lastVaddrSimd.resize(numVectorALUs);

    for (int i = 0; i < numVectorALUs; ++i) {
        lastVaddrSimd[i].resize(wfSize(), 0);
    }

    lastVaddrCU.resize(wfSize());
    if (p.execPolicy == "OLDEST-FIRST") {
        exec_policy = EXEC_POLICY::OLDEST;
    } else if (p.execPolicy == "ROUND-ROBIN") {
        exec_policy = EXEC_POLICY::RR;
    } else {
        fatal("Invalid WF execution policy (CU)\n");
    }
    for (int i = 0; i < p.port_memory_port_connection_count; ++i) {
        memPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
    }

    for (int i = 0; i < p.port_translation_port_connection_count; ++i) {
        tlbPort.emplace_back(csprintf("%s-port%d", name(), i), this, i);
    }
    lastExecCycle.resize(numVectorALUs, 0);

    for (int i = 0; i < vrf.size(); ++i) {
        vrf[i]->setParent(this);
    }

    for (int i = 0; i < srf.size(); ++i) {
        srf[i]->setParent(this);
    }

    numVecRegsPerSimd = vrf[0]->numRegs();
    numScalarRegsPerSimd = srf[0]->numRegs();

    registerManager->setParent(this);

    instExecPerSimd.resize(numVectorALUs, 0);

    panic_if(!isPowerOf2(_cacheLineSize),
             "Cache line size should be a power of two.");
    cacheLineBits = floorLog2(_cacheLineSize);
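
    // For a power-of-two line size, floorLog2 gives the number of byte-offset
    // bits within a cache line; masking off these bits yields line addresses.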
    w->workGroupSz[0] = task->wgSize(0);
    w->workGroupSz[1] = task->wgSize(1);
    w->workGroupSz[2] = task->wgSize(2);
    w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];

    w->computeActualWgSz(task);
    static int _n_wave = 0;

    VectorMask init_mask;
    init_mask.reset();

    for (int k = 0; k < wfSize(); ++k) {
        if (k + waveId * wfSize() < w->actualWgSzTotal)
            init_mask[k] = 1;
    }

    w->execMask() = init_mask;

    w->initMask = init_mask.to_ullong();

    if (num_wfs_in_wg > 1) {
        w->barrierId(bar_id);
    } else {
        assert(!w->hasBarrier());
    }

    for (int k = 0; k < wfSize(); ++k) {
        w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
        w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
                              w->actualWgSz[1];
        w->workItemId[2][k] = (k + waveId * wfSize()) /
                              (w->actualWgSz[0] * w->actualWgSz[1]);

        w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
            w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
            w->workItemId[0][k];
    }
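
    // The flat ID is the row-major linearization of the lane's 3-D work-item
    // coordinates: flat = z * Dx * Dy + y * Dx + x, the inverse of the
    // div/mod decomposition used to compute workItemId above.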
    w->workGroupId[0] = w->wgId % task->numWg(0);
    w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1);
    w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1));
    w->ldsChunk = ldsChunk;

    M5_VAR_USED int32_t refCount =
        lds.increaseRefCounter(w->dispatchId, w->wgId);
    DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
            cu_id, w->wgId, refCount);

    w->instructionBuffer.clear();

    DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
            "WF[%d][%d]. Ref cnt:%d\n", _n_wave, w->barrierId(), cu_id,
            w->simdId, w->wfSlotId, refCount);

    w->initRegState(task, w->actualWgSzTotal);
        = std::make_shared<GPUDynInst>(this, nullptr,
                                       new KernelLaunchStaticInst(),
                                       getAndIncSeqNum());

    gpuDynInst->kern_id = kernId;

    req->setContext(gpuDynInst->wfDynId);
    DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id);

    panic_if(!ldsChunk, "was not able to reserve space for this WG");
    if (num_wfs_in_wg > 1) {
        assert(!wf_barrier.maxBarrierCnt());
        assert(!wf_barrier.numAtBarrier());
        wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);

        DPRINTF(GPUSync, "CU[%d] - Dispatching WG with barrier Id%d. "
                "%d waves using this barrier.\n", cu_id, barrier_id,
                num_wfs_in_wg);
    }
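
    // Only multi-wavefront workgroups claim a barrier slot; a lone wavefront
    // has nothing to synchronize with.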
    DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] "
            "vregDemand[%d] sregDemand[%d]\n", i, j, w->wfDynId,
            vregDemand, sregDemand);
499 "Instruction Buffer of WF%d can't be empty",
w->wgId);
508 "Instruction Buffer of WF%d can't be empty",
w->wgId);
511 auto it =
pipeMap.find(ii->seqNum());
    int trueWgSizeTotal = 1;

    for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
        trueWgSizeTotal *= trueWgSize[d];
        DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
    }

    DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);

    int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
    num_wfs_in_wg = numWfs;
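
    // Round up: a workgroup whose flattened size is not a multiple of the
    // wavefront size still occupies one final, partially populated wavefront.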
    bool barrier_avail = true;

    if (freeBarrierIds.empty()) {
        barrier_avail = false;
    }
553 "WG with %d WFs and %d VGPRs per WI can not be allocated to CU "
554 "that has %d VGPRs\n",
557 "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU "
    int numMappedWfs = 0;

    if (numMappedWfs < numWfs &&

    assert(numMappedWfs <= numWfs);

    bool vregAvail = true;
    bool sregAvail = true;

    if (numMappedWfs < numWfs) {

    DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, \
            VGPR Availability = %d, SGPR Availability = %d\n",
            freeWfSlots, numMappedWfs, vregAvail, sregAvail);
    if (!barrier_avail) {
        stats.wgBlockedDueBarrierAllocation++;
    }

    bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
                        && ldsAvail && barrier_avail;
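
    // A workgroup is dispatchable only when every one of its wavefronts got a
    // slot and vector registers, scalar registers, LDS space, and (if needed)
    // a barrier slot are all available.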
    return wf_barrier.numYetToReachBarrier();

    return wf_barrier.allAtBarrier();

    wf_barrier.incNumAtBarrier();

    return wf_barrier.numAtBarrier();

    return wf_barrier.maxBarrierCnt();

    wf_barrier.decMaxBarrierCnt();

    wf_barrier.release();
    for (auto &vecRegFile : vrf) {
        vecRegFile->exec();
    }

    for (auto &scRegFile : srf) {
        scRegFile->exec();
    }
768 "No support for multiple Global Memory Pipelines exists!!!");
775 "No support for multiple Local Memory Pipelines exists!!!");
782 "No support for multiple Scalar Memory Pipelines exists!!!");
    if (gpuDynInst->isKernelLaunch()) {
        assert(pkt->req->isKernel());
        assert(pkt->req->isInvL1());

               && gpuDynInst->isEndOfKernel()) {

        assert(pkt->req->isKernel());
        assert(pkt->req->isGL2CacheFlush());

        DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
                computeUnit->cu_id, w->simdId, w->wfSlotId,
                w->wfDynId, w->wgId);

    if (!pkt->req->isKernel()) {
        DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing "
                "outstanding reqs %d => %d\n", gpuDynInst->simdId,
                gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
                gpuDynInst->disassemble(), w->outstandingReqs,
                w->outstandingReqs - 1);

            "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
            gpuDynInst->seqNum(), index, pkt->req->getPaddr());

    assert(!pkt->req->isKernel());

    assert(gpuDynInst->numScalarReqs > 0);

    gpuDynInst->numScalarReqs--;
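
    // Scalar accesses may be split into several requests; the instruction is
    // handed to the scalar memory pipeline's load or store response FIFO only
    // when the last outstanding response has arrived.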
    if (!gpuDynInst->numScalarReqs) {
        if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
            computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
                gpuDynInst);
        } else {
            computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
                gpuDynInst);
        }
    }
    for (const auto &pkt : retries) {
        if (!sendTimingReq(pkt)) {
            break;
        }
    }

    int len = retries.size();

    for (int i = 0; i < len; ++i) {
        M5_VAR_USED GPUDynInstPtr gpuDynInst = retries.front().second;
        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
                computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                pkt->req->getPaddr());

        if (!sendTimingReq(pkt)) {
            DPRINTF(GPUMem, "failed again!\n");
            break;
        } else {
            DPRINTF(GPUMem, "successful!\n");
            retries.pop_front();
        }
    }
    computeUnit->fetchStage.processFetchReturn(pkt);

    int len = retries.size();

    for (int i = 0; i < len; ++i) {
        M5_VAR_USED Wavefront *wavefront = retries.front().second;
        DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
                computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
                pkt->req->getPaddr());
        if (!sendTimingReq(pkt)) {
            DPRINTF(GPUFetch, "failed again!\n");
            break;
        } else {
            DPRINTF(GPUFetch, "successful!\n");
            retries.pop_front();
        }
    }
    Addr tmp_vaddr = pkt->req->getVaddr();

    pkt->req->setPC(gpuDynInst->wavefront()->pc());

    pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());

    } else if (pkt->isRead()) {

        fatal("pkt is not a read nor a write\n");

    unsigned size = pkt->getSize();

        panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
              cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);

    if (!p->pTable->translate(vaddr, paddr)) {
        if (!p->fixupFault(vaddr)) {
            panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
                  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                  vaddr);
        }
    }
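
    // The access must be resolvable through the process page table here;
    // fixupFault gives SE mode a chance to demand-allocate the page before
    // the access is treated as a genuine fault.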
    tlbPort[tlbPort_index].sendFunctional(pkt);

    int hit_level = translation_state->hitLevel;
    assert(hit_level != -1);

        safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);

    delete sender_state->saved;
    delete sender_state;

    assert(pkt->req->hasPaddr());
    assert(pkt->req->hasSize());

    uint8_t *tmpData = oldPkt->getPtr<uint8_t>();

    gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
    gpuDynInst->tlbHitLevel[index] = hit_level;

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
            "scheduled\n", cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
    } else if (tlbPort[tlbPort_index].isStalled()) {
        assert(tlbPort[tlbPort_index].retries.size() > 0);

        DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
                "failed!\n", cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, tmp_vaddr);

        tlbPort[tlbPort_index].retries.push_back(pkt);
    } else if (!tlbPort[tlbPort_index].sendTimingReq(pkt)) {
        tlbPort[tlbPort_index].stallPort();

        DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
                "failed!\n", cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, tmp_vaddr);

        tlbPort[tlbPort_index].retries.push_back(pkt);
    } else {
        DPRINTF(GPUTLB,
                "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
                cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
    }
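
    // Three outcomes for a translation request: the TLB port is already
    // stalled (queue behind earlier retries), the send itself fails (stall
    // the port and queue), or the request goes out cleanly.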
    gpuDynInst->resetEntireStatusVector();

    gpuDynInst->decrementStatusVector(index);

    tlbPort[tlbPort_index].sendFunctional(pkt);

    memPort[0].sendFunctional(new_pkt);

    DPRINTF(GPUMem, "Functional sendRequest\n");
    DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
            gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
            new_pkt->req->getPaddr());
        safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);

    DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n",
            pkt->req->getVaddr());
    assert(gpuDynInst->isGlobalSeg() ||
           gpuDynInst->executedAs() == Enums::SC_GLOBAL);

    req = std::make_shared<Request>(
        0, 0, 0, requestorId(), 0, gpuDynInst->wfDynId);

    if (kernelMemSync) {
        if (gpuDynInst->isKernelLaunch()) {
            req->setReqInstSeqNum(gpuDynInst->seqNum());

            EventFunctionWrapper *mem_req_event =
                memPort[0].createMemReqEvent(pkt);

            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
                    "an acquire\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
        } else {
            assert(gpuDynInst->isEndOfKernel());

            req->setReqInstSeqNum(gpuDynInst->seqNum());

            EventFunctionWrapper *mem_req_event =
                memPort[0].createMemReqEvent(pkt);

            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
                    "a release\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
        }
    } else {
        gpuDynInst->setRequestFlags(req);

        req->setReqInstSeqNum(gpuDynInst->seqNum());

        EventFunctionWrapper *mem_req_event =
            memPort[0].createMemReqEvent(pkt);

        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
                cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
                pkt->req->getPaddr());
    }
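
    // All paths above funnel into a memory-port event tagged with the
    // instruction's sequence number, so responses can be matched back to the
    // issuing wavefront.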
        safe_cast<DataPort::SenderState*>(pkt->senderState);

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
            compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
            pkt->req->getPaddr(), id);

    Addr paddr = pkt->req->getPaddr();

    int index = gpuDynInst->memStatusVector[paddr].back();

    DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
            pkt->req->getPaddr(), id);

    gpuDynInst->memStatusVector[paddr].pop_back();
    gpuDynInst->pAddr = pkt->req->getPaddr();

    gpuDynInst->decrementStatusVector(index);
    DPRINTF(GPUMem, "bitvector is now %s\n", gpuDynInst->printStatusVector());

    if (gpuDynInst->allLanesZero()) {
        auto iter = gpuDynInst->memStatusVector.begin();
        auto end = gpuDynInst->memStatusVector.end();

        while (iter != end) {
            assert(iter->second.empty());
            ++iter;
        }

        if (compute_unit->headTailMap.count(gpuDynInst)) {
            // sample the ticks between the first and last returning cache
            // block, then retire the entry
            compute_unit->stats.headTailLatency.sample(
                curTick() - compute_unit->headTailMap.at(gpuDynInst));
            compute_unit->headTailMap.erase(gpuDynInst);
        }

        gpuDynInst->memStatusVector.clear();

        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
                compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId);

        if (!compute_unit->headTailMap.count(gpuDynInst)) {
            compute_unit->headTailMap
                .insert(std::make_pair(gpuDynInst, curTick()));
        }
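
    // headTailMap remembers the tick at which the first cache block for an
    // instruction returned; the entry is consumed to sample head-tail latency
    // when the last block arrives.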
    Addr line = pkt->req->getPaddr();

    DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
            pkt->req->getVaddr(), line);

    computeUnit->stats.tlbCycles += curTick();

        safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);

    if (!translation_state->tlbEntry) {
            safe_cast<DTLBPort::SenderState*>(translation_state->saved);

        Wavefront *w =
            computeUnit->wfList[sender_state->_gpuDynInst->simdId]
            [sender_state->_gpuDynInst->wfSlotId];

        DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
                 pkt->req->getVaddr());

    int hit_level = translation_state->hitLevel;
    computeUnit->stats.hitsPerTLBLevel[hit_level]++;

    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());

    delete translation_state;

        safe_cast<DTLBPort::SenderState*>(pkt->senderState);

    gpuDynInst->memStatusVector[line].push_back(mp_index);
    gpuDynInst->tlbHitLevel[mp_index] = hit_level;

    panic("unsupported response to request conversion %s\n",
    if (computeUnit->prefetchDepth) {
        int simdId = gpuDynInst->simdId;
        int wfSlotId = gpuDynInst->wfSlotId;
        Addr last = 0;

        switch(computeUnit->prefetchType) {
          case Enums::PF_CU:
            last = computeUnit->lastVaddrCU[mp_index];
            break;
          case Enums::PF_PHASE:
            last = computeUnit->lastVaddrSimd[simdId][mp_index];
            break;
          case Enums::PF_WF:
            last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
          default:
            break;
        }

        DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
                computeUnit->cu_id, simdId, wfSlotId, mp_index, last);

        computeUnit->lastVaddrCU[mp_index] = vaddr;
        computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
        computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;

        stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
            computeUnit->prefetchStride : stride;

        DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
                computeUnit->cu_id, simdId, wfSlotId, mp_index);

        for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
            RequestPtr prefetch_req = std::make_shared<Request>(
                computeUnit->requestorId(),

                computeUnit->shader->gpuTc, true);

            sendFunctional(prefetch_pkt);

                safe_cast<X86ISA::GpuTLB::TranslationState*>(
                    prefetch_pkt->senderState);

            delete prefetch_pkt;
    EventFunctionWrapper *mem_req_event =
        computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
            computeUnit->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());

    computeUnit->schedule(mem_req_event, curTick() +
                          computeUnit->req_tick_latency);

    return new EventFunctionWrapper(
        [this, pkt]{ processMemReqEvent(pkt); },
        "ComputeUnit memory request event", true);

    return new EventFunctionWrapper(
        [this, pkt]{ processMemRespEvent(pkt); },
        "ComputeUnit memory response event", true);
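
    // The trailing 'true' marks these per-packet events as auto-deleting, so
    // each wrapper frees itself once it has fired.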
    M5_VAR_USED ComputeUnit *compute_unit = computeUnit;

    if (!(sendTimingReq(pkt))) {
        retries.push_back(std::make_pair(pkt, gpuDynInst));

        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
                compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                id, pkt->req->getPaddr());
    } else {
        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data "
                "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id,
                pkt->req->getPaddr());
    }

    return "ComputeUnit scalar memory request event";

    SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);

        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
                compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, pkt->req->getPaddr());
    } else {
        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data "
                "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
                pkt->req->getPaddr());
    }
    int len = retries.size();

    DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, len);

    assert(isStalled());

    for (int i = 0; i < len; ++i) {
        DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x",
                computeUnit->cu_id, vaddr);

        if (!sendTimingReq(pkt)) {
            DPRINTF(GPUTLB, ": failed again\n");
            break;
        } else {
            DPRINTF(GPUTLB, ": successful\n");
            retries.pop_front();
        }
    }
        safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);

    fatal_if(!translation_state->tlbEntry,
             "Translation of vaddr %#x failed\n", pkt->req->getVaddr());

    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());

    delete translation_state;

        safe_cast<ScalarDTLBPort::SenderState*>(pkt->senderState);

    M5_VAR_USED Wavefront *w = gpuDynInst->wavefront();

    DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
            "translation: PA %#x -> %#x\n", computeUnit->cu_id, w->simdId,
            w->wfSlotId, w->kernId, pkt->req->getVaddr(), pkt->req->getPaddr());
    fatal("Scalar DTLB received unexpected MemCmd response %s\n",

    if (!computeUnit->scalarDataPort.sendTimingReq(req_pkt)) {
        computeUnit->scalarDataPort.retries.push_back(req_pkt);
        DPRINTF(GPUMem, "send scalar req failed for: %s\n",
                gpuDynInst->disassemble());
    } else {
        DPRINTF(GPUMem, "send scalar req for: %s\n",
                gpuDynInst->disassemble());
    }
    M5_VAR_USED Addr line = pkt->req->getPaddr();
    DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
            computeUnit->cu_id, pkt->req->getVaddr(), line);

        = safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);

    bool success = translation_state->tlbEntry != nullptr;
    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());

    delete translation_state;

        safe_cast<ITLBPort::SenderState*>(pkt->senderState);
    computeUnit->fetchStage.fetch(pkt, wavefront);

    int len = retries.size();
    DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, len);

    assert(isStalled());

    for (int i = 0; i < len; ++i) {
        DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x",
                computeUnit->cu_id, vaddr);

        if (!sendTimingReq(pkt)) {
            DPRINTF(GPUTLB, ": failed again\n");
            break;
        } else {
            DPRINTF(GPUTLB, ": successful\n");
            retries.pop_front();
        }
    }
    if (gpuDynInst->isScalar()) {
        if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
            stats.sALUInsts++;
        } else if (gpuDynInst->isLoad()) {
            stats.scalarMemReads++;
        } else if (gpuDynInst->isStore()) {
            stats.scalarMemWrites++;
        }
    } else {
        if (gpuDynInst->isALU()) {
            stats.vALUInsts++;
            stats.threadCyclesVALU
                += gpuDynInst->wavefront()->execMask().count();
        } else if (gpuDynInst->isFlat()) {
            if (gpuDynInst->isLocalMem()) {
                stats.flatLDSInsts++;
            } else {
                stats.flatVMemInsts++;
            }
        } else if (gpuDynInst->isLocalMem()) {
            stats.ldsNoFlatInsts++;
        } else if (gpuDynInst->isLoad()) {
            stats.vectorMemReads++;
        } else if (gpuDynInst->isStore()) {
            stats.vectorMemWrites++;
        }

        if (gpuDynInst->isLoad()) {
            switch (gpuDynInst->executedAs()) {
              case Enums::SC_SPILL:
                stats.spillReads++;
                break;
              case Enums::SC_GLOBAL:
                stats.globalReads++;
                break;
              case Enums::SC_GROUP:
                stats.groupReads++;
                break;
              case Enums::SC_PRIVATE:
                stats.privReads++;
                break;
              case Enums::SC_READONLY:
                stats.readonlyReads++;
                break;
              case Enums::SC_KERNARG:
                stats.kernargReads++;
                break;
              default:
                fatal("%s has no valid segment\n", gpuDynInst->disassemble());
            }
        } else if (gpuDynInst->isStore()) {
            switch (gpuDynInst->executedAs()) {
              case Enums::SC_SPILL:
                stats.spillWrites++;
                break;
              case Enums::SC_GLOBAL:
                stats.globalWrites++;
                break;
              case Enums::SC_GROUP:
                stats.groupWrites++;
                break;
              case Enums::SC_PRIVATE:
                stats.privWrites++;
                break;
              case Enums::SC_READONLY:
                stats.readonlyWrites++;
                break;
              case Enums::SC_KERNARG:
                stats.kernargWrites++;
                break;
              default:
                fatal("%s has no valid segment\n", gpuDynInst->disassemble());
    *page_stat_file << "page, wavefront accesses, workitem accesses" <<
        std::endl;

    for (auto iter : pageAccesses) {
        *page_stat_file << std::hex << iter.first << ",";
        *page_stat_file << std::dec << iter.second.first << ",";
        *page_stat_file << std::dec << iter.second.second << std::endl;
    }
                            const uint32_t wgId) const

    for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
    RequestPtr newRequest = std::make_shared<Request>();
    newRequest->setPaddr(0x0);

    fatal_if(!senderState, "did not get the right sort of sender state");
    computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);

    fatal_if(!sender_state, "packet without a valid sender state");

    fatal_if(retries.empty(), "must have retries waiting to be stalled");
    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
            computeUnit->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId);

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
            computeUnit->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId, pkt->req->getPaddr());

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
            computeUnit->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId, pkt->req->getPaddr());
    auto queueSize = retries.size();

    DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, queueSize);

    fatal_if(queueSize < 1,
             "why was there a recvReqRetry() with no pending reqs?");
    fatal_if(!isStalled(),
             "recvReqRetry() happened when the port was not stalled");

    while (!retries.empty()) {
        PacketPtr packet = retries.front();

        DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);

        if (!sendTimingReq(packet)) {
            DPRINTF(GPUPort, ": LDS send failed again\n");
            break;
        } else {
            DPRINTF(GPUPort, ": LDS send successful\n");
            retries.pop_front();
        }
    }
      ADD_STAT(vALUInsts, "Number of vector ALU insts issued."),
      ADD_STAT(vALUInstsPerWF, "The avg. number of vector ALU insts issued "
               "per-wavefront."),
      ADD_STAT(sALUInsts, "Number of scalar ALU insts issued."),
      ADD_STAT(sALUInstsPerWF, "The avg. number of scalar ALU insts issued "
               "per-wavefront."),
               "Number of cycles needed to execute VALU insts."),
               "Number of cycles needed to execute SALU insts."),
      ADD_STAT(threadCyclesVALU, "Number of thread cycles used to execute "
               "vector ALU ops. Similar to instCyclesVALU but multiplied by "
               "the number of active threads."),
               "Percentage of active vector ALU threads in a wave."),
      ADD_STAT(ldsNoFlatInsts, "Number of LDS insts issued, not including FLAT"
               " accesses that resolve to LDS."),
      ADD_STAT(ldsNoFlatInstsPerWF, "The avg. number of LDS insts (not "
               "including FLAT accesses that resolve to LDS) per-wavefront."),
               "The number of FLAT insts that resolve to vmem issued."),
      ADD_STAT(flatVMemInstsPerWF, "The average number of FLAT insts that "
               "resolve to vmem issued per-wavefront."),
               "The number of FLAT insts that resolve to LDS issued."),
      ADD_STAT(flatLDSInstsPerWF, "The average number of FLAT insts that "
               "resolve to LDS issued per-wavefront."),
               "Number of vector mem write insts (excluding FLAT insts)."),
      ADD_STAT(vectorMemWritesPerWF, "The average number of vector mem write "
               "insts (excluding FLAT insts) per-wavefront."),
               "Number of vector mem read insts (excluding FLAT insts)."),
      ADD_STAT(vectorMemReadsPerWF, "The avg. number of vector mem read insts "
               "(excluding FLAT insts) per-wavefront."),
      ADD_STAT(scalarMemWrites, "Number of scalar mem write insts."),
               "The average number of scalar mem write insts per-wavefront."),
      ADD_STAT(scalarMemReads, "Number of scalar mem read insts."),
               "The average number of scalar mem read insts per-wavefront."),
      ADD_STAT(vectorMemReadsPerKiloInst,
               "Number of vector mem reads per kilo-instruction"),
      ADD_STAT(vectorMemWritesPerKiloInst,
               "Number of vector mem writes per kilo-instruction"),
      ADD_STAT(vectorMemInstsPerKiloInst,
               "Number of vector mem insts per kilo-instruction"),
      ADD_STAT(scalarMemReadsPerKiloInst,
               "Number of scalar mem reads per kilo-instruction"),
      ADD_STAT(scalarMemWritesPerKiloInst,
               "Number of scalar mem writes per kilo-instruction"),
      ADD_STAT(scalarMemInstsPerKiloInst,
               "Number of scalar mem insts per kilo-instruction"),
      ADD_STAT(instCyclesVMemPerSimd, "Number of cycles to send address, "
               "command, data from VRF to vector memory unit, per SIMD"),
      ADD_STAT(instCyclesScMemPerSimd, "Number of cycles to send address, "
               "command, data from SRF to scalar memory unit, per SIMD"),
      ADD_STAT(instCyclesLdsPerSimd, "Number of cycles to send address, "
               "command, data from VRF to LDS unit, per SIMD"),
      ADD_STAT(globalReads, "Number of reads to the global segment"),
      ADD_STAT(globalWrites, "Number of writes to the global segment"),
               "Number of memory instructions sent to the global segment"),
      ADD_STAT(argReads, "Number of reads to the arg segment"),
      ADD_STAT(argWrites, "Number of writes to the arg segment"),
               "Number of memory instructions sent to the arg segment"),
      ADD_STAT(spillReads, "Number of reads to the spill segment"),
      ADD_STAT(spillWrites, "Number of writes to the spill segment"),
               "Number of memory instructions sent to the spill segment"),
      ADD_STAT(groupReads, "Number of reads to the group segment"),
      ADD_STAT(groupWrites, "Number of writes to the group segment"),
               "Number of memory instructions sent to the group segment"),
      ADD_STAT(privReads, "Number of reads to the private segment"),
      ADD_STAT(privWrites, "Number of writes to the private segment"),
               "Number of memory instructions sent to the private segment"),
      ADD_STAT(readonlyReads, "Number of reads to the readonly segment"),
               "Number of writes to the readonly segment"),
               "Number of memory instructions sent to the readonly segment"),
      ADD_STAT(kernargReads, "Number of reads sent to the kernarg segment"),
               "Number of writes sent to the kernarg segment"),
               "Number of memory instructions sent to the kernarg segment"),
               "wave level parallelism: count of active waves at wave launch"),
      ADD_STAT(tlbRequests, "number of uncoalesced requests"),
               "total number of cycles for all uncoalesced requests"),
      ADD_STAT(tlbLatency, "Avg. translation latency for data translations"),
               "TLB hits distribution (0 for page table, x for Lx-TLB)"),
      ADD_STAT(ldsBankAccesses, "Total number of LDS bank accesses"),
               "Number of bank conflicts per LDS memory packet"),
               "pages touched per wf (over all mem. instr.)"),
               "dynamic non-flat global memory instruction count"),
               "dynamic flat global memory instruction count"),
      ADD_STAT(dynamicLMemInstrCnt, "dynamic local memory instruction count"),
      ADD_STAT(wgBlockedDueBarrierAllocation,
               "WG dispatch was blocked due to lack of barrier resources"),
      ADD_STAT(wgBlockedDueLdsAllocation,
               "Workgroup blocked due to LDS capacity"),
      ADD_STAT(numInstrExecuted, "number of instructions executed"),
      ADD_STAT(execRateDist, "Instruction Execution Rate: Number of executed "
               "vector instructions per cycle"),
               "number of vec ops executed (e.g. WF size/inst)"),
               "number of f16 vec ops executed (e.g. WF size/inst)"),
               "number of f32 vec ops executed (e.g. WF size/inst)"),
               "number of f64 vec ops executed (e.g. WF size/inst)"),
               "number of fma16 vec ops executed (e.g. WF size/inst)"),
               "number of fma32 vec ops executed (e.g. WF size/inst)"),
               "number of fma64 vec ops executed (e.g. WF size/inst)"),
               "number of mac16 vec ops executed (e.g. WF size/inst)"),
               "number of mac32 vec ops executed (e.g. WF size/inst)"),
               "number of mac64 vec ops executed (e.g. WF size/inst)"),
               "number of mad16 vec ops executed (e.g. WF size/inst)"),
               "number of mad32 vec ops executed (e.g. WF size/inst)"),
               "number of mad64 vec ops executed (e.g. WF size/inst)"),
               "number of two op FP vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(totalCycles, "number of cycles the CU ran for"),
      ADD_STAT(vpc, "Vector Operations per cycle (this CU only)"),
      ADD_STAT(vpc_f16, "F16 Vector Operations per cycle (this CU only)"),
      ADD_STAT(vpc_f32, "F32 Vector Operations per cycle (this CU only)"),
      ADD_STAT(vpc_f64, "F64 Vector Operations per cycle (this CU only)"),
      ADD_STAT(ipc, "Instructions per cycle (this CU only)"),
      ADD_STAT(controlFlowDivergenceDist, "number of lanes active per "
               "instruction (over all instructions)"),
      ADD_STAT(activeLanesPerGMemInstrDist,
               "number of active lanes per global memory instruction"),
      ADD_STAT(activeLanesPerLMemInstrDist,
               "number of active lanes per local memory instruction"),
               "Number of dynamic non-GM memory insts executed"),
      ADD_STAT(numTimesWgBlockedDueVgprAlloc, "Number of times WGs are "
               "blocked due to VGPR allocation per SIMD"),
      ADD_STAT(numTimesWgBlockedDueSgprAlloc, "Number of times WGs are "
               "blocked due to SGPR allocation per SIMD"),
      ADD_STAT(numCASOps, "number of compare and swap operations"),
               "number of compare and swap operations that failed"),
      ADD_STAT(completedWfs, "number of completed wavefronts"),
      ADD_STAT(completedWGs, "number of completed workgroups"),
      ADD_STAT(headTailLatency, "ticks between first and last cache block "
               "arrival at coalescer"),
      ADD_STAT(instInterleave, "Measure of instruction interleaving per SIMD")
    for (int i = 0; i < 4; ++i) {