40 #include "debug/GPUDisp.hh"
41 #include "debug/GPUExec.hh"
42 #include "debug/GPUFetch.hh"
43 #include "debug/GPUMem.hh"
44 #include "debug/GPUPort.hh"
45 #include "debug/GPUPrefetch.hh"
46 #include "debug/GPUReg.hh"
47 #include "debug/GPURename.hh"
48 #include "debug/GPUSync.hh"
49 #include "debug/GPUTLB.hh"
63 numVectorGlobalMemUnits(
p->num_global_mem_pipes),
64 numVectorSharedMemUnits(
p->num_shared_mem_pipes),
65 numScalarMemUnits(
p->num_scalar_mem_pipes),
66 numVectorALUs(
p->num_SIMDs),
67 numScalarALUs(
p->num_scalar_cores),
68 vrfToCoalescerBusWidth(
p->vrf_to_coalescer_bus_width),
69 coalescerToVrfBusWidth(
p->coalescer_to_vrf_bus_width),
70 registerManager(
p->register_manager),
72 scoreboardCheckStage(
p, *this, scoreboardCheckToSchedule),
73 scheduleStage(
p, *this, scoreboardCheckToSchedule, scheduleToExecute),
74 execStage(
p, *this, scheduleToExecute),
75 globalMemoryPipe(
p, *this),
76 localMemoryPipe(
p, *this),
77 scalarMemoryPipe(
p, *this),
78 tickEvent([this]{
exec(); },
"Compute unit tick event",
81 vrf(
p->vector_register_file), srf(
p->scalar_register_file),
82 simdWidth(
p->simd_width),
83 spBypassPipeLength(
p->spbypass_pipe_length),
84 dpBypassPipeLength(
p->dpbypass_pipe_length),
85 scalarPipeStages(
p->scalar_pipe_length),
86 operandNetworkLength(
p->operand_network_length),
87 issuePeriod(
p->issue_period),
88 vrf_gm_bus_latency(
p->vrf_gm_bus_latency),
89 srf_scm_bus_latency(
p->srf_scm_bus_latency),
90 vrf_lm_bus_latency(
p->vrf_lm_bus_latency),
91 perLaneTLB(
p->perLaneTLB), prefetchDepth(
p->prefetch_depth),
92 prefetchStride(
p->prefetch_stride), prefetchType(
p->prefetch_prev_type),
93 debugSegFault(
p->debugSegFault),
94 functionalTLB(
p->functionalTLB), localMemBarrier(
p->localMemBarrier),
95 countPages(
p->countPages),
96 req_tick_latency(
p->mem_req_latency *
p->clk_domain->clockPeriod()),
97 resp_tick_latency(
p->mem_resp_latency *
p->clk_domain->clockPeriod()),
98 _requestorId(
p->system->getRequestorId(
this,
"ComputeUnit")),
99 lds(*
p->localDataStore), gmTokenPort(
name() +
".gmTokenPort",
this),
105 _cacheLineSize(
p->system->cacheLineSize()),
106 _numBarrierSlots(
p->num_barrier_slots),
107 globalSeqNum(0), wavefrontSize(
p->wf_size),
108 scoreboardCheckToSchedule(
p),
120 fatal_if(
p->wf_size > std::numeric_limits<unsigned long long>::digits ||
122 "WF size is larger than the host can support");
124 "Wavefront size should be a power of 2");
127 numCyclesPerStoreTransfer =
128 (uint32_t)ceil((
double)(wfSize() *
sizeof(uint32_t)) /
129 (double)vrfToCoalescerBusWidth);
131 numCyclesPerLoadTransfer = (wfSize() *
sizeof(uint32_t))
132 / coalescerToVrfBusWidth;
135 idleWfs =
p->n_wf * numVectorALUs;
136 lastVaddrWF.resize(numVectorALUs);
137 wfList.resize(numVectorALUs);
139 wfBarrierSlots.resize(
p->num_barrier_slots,
WFBarrier());
141 for (
int i = 0;
i <
p->num_barrier_slots; ++
i) {
142 freeBarrierIds.insert(
i);
145 for (
int j = 0;
j < numVectorALUs; ++
j) {
146 lastVaddrWF[
j].resize(
p->n_wf);
148 for (
int i = 0;
i <
p->n_wf; ++
i) {
149 lastVaddrWF[
j][
i].resize(wfSize());
151 wfList[
j].push_back(
p->wavefronts[
j *
p->n_wf +
i]);
152 wfList[
j][
i]->setParent(
this);
154 for (
int k = 0;
k < wfSize(); ++
k) {
155 lastVaddrWF[
j][
i][
k] = 0;
160 lastVaddrSimd.resize(numVectorALUs);
162 for (
int i = 0;
i < numVectorALUs; ++
i) {
163 lastVaddrSimd[
i].resize(wfSize(), 0);
166 lastVaddrCU.resize(wfSize());
170 if (
p->execPolicy ==
"OLDEST-FIRST") {
172 }
else if (
p->execPolicy ==
"ROUND-ROBIN") {
175 fatal(
"Invalid WF execution policy (CU)\n");
178 for (
int i = 0;
i <
p->port_memory_port_connection_count; ++
i) {
182 for (
int i = 0;
i <
p->port_translation_port_connection_count; ++
i) {
192 lastExecCycle.resize(numVectorALUs, 0);
194 for (
int i = 0;
i < vrf.size(); ++
i) {
195 vrf[
i]->setParent(
this);
197 for (
int i = 0;
i < srf.size(); ++
i) {
198 srf[
i]->setParent(
this);
200 numVecRegsPerSimd = vrf[0]->numRegs();
201 numScalarRegsPerSimd = srf[0]->numRegs();
203 registerManager->setParent(
this);
207 instExecPerSimd.resize(numVectorALUs, 0);
211 "Cache line size should be a power of two.");
212 cacheLineBits =
floorLog2(_cacheLineSize);
295 w->workGroupSz[0] = task->
wgSize(0);
296 w->workGroupSz[1] = task->
wgSize(1);
297 w->workGroupSz[2] = task->
wgSize(2);
298 w->wgSz =
w->workGroupSz[0] *
w->workGroupSz[1] *
w->workGroupSz[2];
302 w->computeActualWgSz(task);
309 static int _n_wave = 0;
315 if (
k + waveId *
wfSize() <
w->actualWgSzTotal)
319 w->execMask() = init_mask;
323 w->initMask = init_mask.to_ullong();
326 w->barrierId(bar_id);
328 assert(!
w->hasBarrier());
332 w->workItemId[0][
k] = (
k + waveId *
wfSize()) %
w->actualWgSz[0];
333 w->workItemId[1][
k] = ((
k + waveId *
wfSize()) /
w->actualWgSz[0]) %
335 w->workItemId[2][
k] = (
k + waveId *
wfSize()) /
336 (
w->actualWgSz[0] *
w->actualWgSz[1]);
338 w->workItemFlatId[
k] =
w->workItemId[2][
k] *
w->actualWgSz[0] *
339 w->actualWgSz[1] +
w->workItemId[1][
k] *
w->actualWgSz[0] +
346 w->workGroupId[0] =
w->wgId % task->
numWg(0);
347 w->workGroupId[1] = (
w->wgId / task->
numWg(0)) % task->
numWg(1);
348 w->workGroupId[2] =
w->wgId / (task->
numWg(0) * task->
numWg(1));
351 w->ldsChunk = ldsChunk;
353 int32_t refCount M5_VAR_USED =
355 DPRINTF(GPUDisp,
"CU%d: increase ref ctr wg[%d] to [%d]\n",
356 cu_id,
w->wgId, refCount);
358 w->instructionBuffer.clear();
363 DPRINTF(GPUDisp,
"Scheduling wfDynId/barrier_id %d/%d on CU%d: "
364 "WF[%d][%d]. Ref cnt:%d\n", _n_wave,
w->barrierId(),
cu_id,
365 w->simdId,
w->wfSlotId, refCount);
367 w->initRegState(task,
w->actualWgSzTotal);
382 = std::make_shared<GPUDynInst>(
this,
nullptr,
386 gpuDynInst->kern_id = kernId;
388 req->setContext(gpuDynInst->wfDynId);
408 DPRINTF(GPUDisp,
"CU%d: Scheduling wakeup next cycle\n",
cu_id);
422 panic_if(!ldsChunk,
"was not able to reserve space for this WG");
436 if (num_wfs_in_wg > 1) {
443 assert(!wf_barrier.maxBarrierCnt());
444 assert(!wf_barrier.numAtBarrier());
445 wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);
447 DPRINTF(GPUSync,
"CU[%d] - Dispatching WG with barrier Id%d. "
448 "%d waves using this barrier.\n",
cu_id, barrier_id,
468 DPRINTF(GPURename,
"SIMD[%d] wfSlotId[%d] WF[%d] "
469 "vregDemand[%d] sregDemand[%d]\n",
i,
j,
w->wfDynId,
470 vregDemand, sregDemand);
485 "Instruction Buffer of WF%d can't be empty",
w->wgId);
494 "Instruction Buffer of WF%d can't be empty",
w->wgId);
497 auto it =
pipeMap.find(ii->seqNum());
507 int trueWgSizeTotal = 1;
513 trueWgSizeTotal *= trueWgSize[
d];
514 DPRINTF(GPUDisp,
"trueWgSize[%d] = %d\n",
d, trueWgSize[
d]);
517 DPRINTF(GPUDisp,
"trueWgSizeTotal = %d\n", trueWgSizeTotal);
520 int numWfs = (trueWgSizeTotal +
wfSize() - 1) /
wfSize();
521 num_wfs_in_wg = numWfs;
523 bool barrier_avail =
true;
526 barrier_avail =
false;
539 "WG with %d WFs and %d VGPRs per WI can not be allocated to CU "
540 "that has %d VGPRs\n",
543 "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU "
550 int numMappedWfs = 0;
562 if (numMappedWfs < numWfs &&
576 assert(numMappedWfs <= numWfs);
578 bool vregAvail =
true;
579 bool sregAvail =
true;
581 if (numMappedWfs < numWfs) {
597 DPRINTF(GPUDisp,
"Free WF slots = %d, Mapped WFs = %d, \
598 VGPR Availability = %d, SGPR Availability = %d\n",
599 freeWfSlots, numMappedWfs, vregAvail, sregAvail);
616 if (!barrier_avail) {
625 bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
626 && ldsAvail && barrier_avail;
634 return wf_barrier.numYetToReachBarrier();
641 return wf_barrier.allAtBarrier();
648 wf_barrier.incNumAtBarrier();
655 return wf_barrier.numAtBarrier();
662 return wf_barrier.maxBarrierCnt();
676 wf_barrier.decMaxBarrierCnt();
683 wf_barrier.release();
706 for (
auto &vecRegFile :
vrf) {
710 for (
auto &scRegFile :
srf) {
754 "No support for multiple Global Memory Pipelines exists!!!");
761 "No support for multiple Local Memory Pipelines exists!!!");
768 "No support for multiple Scalar Memory Pipelines exists!!!");
806 if (gpuDynInst->isKernelLaunch()) {
809 assert(pkt->
req->isKernel());
810 assert(pkt->
req->isAcquire());
825 && gpuDynInst->isEndOfKernel()) {
828 assert(pkt->
req->isKernel());
829 assert(pkt->
req->isRelease());
845 DPRINTF(GPUDisp,
"CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
847 w->wfDynId,
w->wgId);
853 if (!pkt->
req->isKernel()) {
855 DPRINTF(GPUExec,
"MemSyncResp: WF[%d][%d] WV%d %s decrementing "
856 "outstanding reqs %d => %d\n", gpuDynInst->simdId,
857 gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
858 gpuDynInst->disassemble(),
w->outstandingReqs,
859 w->outstandingReqs - 1);
873 DPRINTF(GPUExec,
"WriteCompleteResp: WF[%d][%d] WV%d %s decrementing "
874 "outstanding reqs %d => %d\n", gpuDynInst->simdId,
875 gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
876 gpuDynInst->disassemble(),
w->outstandingReqs,
877 w->outstandingReqs - 1);
878 if (gpuDynInst->allLanesZero()) {
884 DPRINTF(GPUMem,
"CU%d: WF[%d][%d]: write totally complete\n",
886 gpuDynInst->wfSlotId);
899 "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
901 gpuDynInst->seqNum(),
index, pkt->
req->getPaddr());
912 assert(!pkt->
req->isKernel());
919 assert(gpuDynInst->numScalarReqs > 0);
921 gpuDynInst->numScalarReqs--;
931 if (!gpuDynInst->numScalarReqs) {
932 if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
933 computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
936 computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
950 for (
const auto &pkt : retries) {
951 if (!sendTimingReq(pkt)) {
962 int len = retries.size();
966 for (
int i = 0;
i <
len; ++
i) {
968 GPUDynInstPtr gpuDynInst M5_VAR_USED = retries.front().second;
969 DPRINTF(GPUMem,
"CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
970 computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
971 pkt->
req->getPaddr());
976 if (!sendTimingReq(pkt)) {
977 DPRINTF(GPUMem,
"failed again!\n");
980 DPRINTF(GPUMem,
"successful!\n");
989 computeUnit->fetchStage.processFetchReturn(pkt);
996 int len = retries.size();
1000 for (
int i = 0;
i <
len; ++
i) {
1002 Wavefront *wavefront M5_VAR_USED = retries.front().second;
1003 DPRINTF(GPUFetch,
"CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
1004 computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
1005 pkt->
req->getPaddr());
1006 if (!sendTimingReq(pkt)) {
1007 DPRINTF(GPUFetch,
"failed again!\n");
1010 DPRINTF(GPUFetch,
"successful!\n");
1011 retries.pop_front();
1020 Addr tmp_vaddr = pkt->
req->getVaddr();
1025 pkt->
req->setPC(gpuDynInst->wavefront()->pc());
1027 pkt->
req->setReqInstSeqNum(gpuDynInst->seqNum());
1040 }
else if (pkt->
isRead()) {
1043 fatal(
"pkt is not a read nor a write\n");
1055 unsigned size = pkt->
getSize();
1058 panic(
"CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
1059 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
vaddr);
1064 if (!
p->pTable->translate(
vaddr, paddr)) {
1065 if (!
p->fixupFault(
vaddr)) {
1066 panic(
"CU%d: WF[%d][%d]: Fault on addr %#x!\n",
1067 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1077 TheISA::GpuTLB::TranslationState *translation_state =
1078 new TheISA::GpuTLB::TranslationState(TLB_mode,
shader->
gpuTc,
false,
1084 tlbPort[tlbPort_index].sendFunctional(pkt);
1087 int hit_level = translation_state->hitLevel;
1088 assert(hit_level != -1);
1093 safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->
senderState);
1096 delete sender_state->
saved;
1097 delete sender_state;
1099 assert(pkt->
req->hasPaddr());
1100 assert(pkt->
req->hasSize());
1110 uint8_t *tmpData = oldPkt->
getPtr<uint8_t>();
1121 gpuDynInst->memStatusVector[pkt->
getAddr()].push_back(
index);
1122 gpuDynInst->tlbHitLevel[
index] = hit_level;
1129 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: index %d, addr %#x data "
1130 "scheduled\n",
cu_id, gpuDynInst->simdId,
1131 gpuDynInst->wfSlotId,
index, pkt->
req->getPaddr());
1134 }
else if (
tlbPort[tlbPort_index].isStalled()) {
1135 assert(
tlbPort[tlbPort_index].retries.size() > 0);
1137 DPRINTF(GPUTLB,
"CU%d: WF[%d][%d]: Translation for addr %#x "
1138 "failed!\n",
cu_id, gpuDynInst->simdId,
1139 gpuDynInst->wfSlotId, tmp_vaddr);
1141 tlbPort[tlbPort_index].retries.push_back(pkt);
1142 }
else if (!
tlbPort[tlbPort_index].sendTimingReq(pkt)) {
1147 tlbPort[tlbPort_index].stallPort();
1149 DPRINTF(GPUTLB,
"CU%d: WF[%d][%d]: Translation for addr %#x "
1150 "failed!\n",
cu_id, gpuDynInst->simdId,
1151 gpuDynInst->wfSlotId, tmp_vaddr);
1153 tlbPort[tlbPort_index].retries.push_back(pkt);
1156 "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
1157 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
1161 gpuDynInst->resetEntireStatusVector();
1163 gpuDynInst->decrementStatusVector(
index);
1170 pkt->
senderState =
new TheISA::GpuTLB::TranslationState(TLB_mode,
1173 tlbPort[tlbPort_index].sendFunctional(pkt);
1183 memPort[0].sendFunctional(new_pkt);
1185 DPRINTF(GPUMem,
"Functional sendRequest\n");
1186 DPRINTF(GPUMem,
"CU%d: WF[%d][%d]: index %d: addr %#x\n",
cu_id,
1187 gpuDynInst->simdId, gpuDynInst->wfSlotId,
index,
1188 new_pkt->
req->getPaddr());
1191 TheISA::GpuTLB::TranslationState *sender_state =
1192 safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->
senderState);
1194 delete sender_state->tlbEntry;
1212 new TheISA::GpuTLB::TranslationState(tlb_mode,
shader->
gpuTc,
false,
1222 DPRINTF(GPUTLB,
"sent scalar %s translation request for addr %#x\n",
1224 pkt->
req->getVaddr());
1233 assert(gpuDynInst->isGlobalSeg() ||
1234 gpuDynInst->executedAs() == Enums::SC_GLOBAL);
1237 req = std::make_shared<Request>(
1246 if (kernelMemSync) {
1247 if (gpuDynInst->isKernelLaunch()) {
1249 req->setReqInstSeqNum(gpuDynInst->seqNum());
1256 memPort[0].createMemReqEvent(pkt);
1258 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1259 "an acquire\n",
cu_id, gpuDynInst->simdId,
1260 gpuDynInst->wfSlotId, 0, pkt->
req->getPaddr());
1266 assert(gpuDynInst->isEndOfKernel());
1269 req->setReqInstSeqNum(gpuDynInst->seqNum());
1276 memPort[0].createMemReqEvent(pkt);
1278 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1279 "a release\n",
cu_id, gpuDynInst->simdId,
1280 gpuDynInst->wfSlotId, 0, pkt->
req->getPaddr());
1285 gpuDynInst->setRequestFlags(req);
1287 req->setReqInstSeqNum(gpuDynInst->seqNum());
1294 memPort[0].createMemReqEvent(pkt);
1297 "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
1298 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
1299 pkt->
req->getPaddr());
1309 safe_cast<DataPort::SenderState*>(pkt->
senderState);
1316 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
1317 compute_unit->
cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1318 pkt->
req->getPaddr(),
id);
1320 Addr paddr = pkt->
req->getPaddr();
1328 int index = gpuDynInst->memStatusVector[paddr].back();
1330 DPRINTF(GPUMem,
"Response for addr %#x, index %d\n",
1331 pkt->
req->getPaddr(),
id);
1333 gpuDynInst->memStatusVector[paddr].pop_back();
1334 gpuDynInst->pAddr = pkt->
req->getPaddr();
1336 gpuDynInst->decrementStatusVector(
index);
1337 DPRINTF(GPUMem,
"bitvector is now %s\n", gpuDynInst->printStatusVector());
1339 if (gpuDynInst->allLanesZero()) {
1340 auto iter = gpuDynInst->memStatusVector.begin();
1341 auto end = gpuDynInst->memStatusVector.end();
1343 while (iter != end) {
1344 assert(iter->second.empty());
1351 if (compute_unit->
headTailMap.count(gpuDynInst)) {
1357 gpuDynInst->memStatusVector.clear();
1366 DPRINTF(GPUMem,
"CU%d: WF[%d][%d]: packet totally complete\n",
1367 compute_unit->
cu_id, gpuDynInst->simdId,
1368 gpuDynInst->wfSlotId);
1372 if (!compute_unit->
headTailMap.count(gpuDynInst)) {
1374 .insert(std::make_pair(gpuDynInst,
curTick()));
1384 ComputeUnitParams::create()
1392 Addr line = pkt->
req->getPaddr();
1394 DPRINTF(GPUTLB,
"CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
1395 pkt->
req->getVaddr(), line);
1398 computeUnit->tlbCycles +=
curTick();
1401 TheISA::GpuTLB::TranslationState *translation_state =
1402 safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->
senderState);
1405 if (!translation_state->tlbEntry) {
1407 safe_cast<DTLBPort::SenderState*>(translation_state->saved);
1410 computeUnit->wfList[sender_state->
_gpuDynInst->simdId]
1413 DPRINTFN(
"Wave %d couldn't tranlate vaddr %#x\n",
w->wfDynId,
1414 pkt->
req->getVaddr());
1418 int hit_level = translation_state->hitLevel;
1419 computeUnit->hitsPerTLBLevel[hit_level]++;
1421 delete translation_state->tlbEntry;
1422 assert(!translation_state->ports.size());
1428 delete translation_state;
1432 safe_cast<DTLBPort::SenderState*>(pkt->
senderState);
1437 gpuDynInst->memStatusVector[line].push_back(mp_index);
1438 gpuDynInst->tlbHitLevel[mp_index] = hit_level;
1449 panic(
"unsupported response to request conversion %s\n",
1453 if (computeUnit->prefetchDepth) {
1454 int simdId = gpuDynInst->simdId;
1455 int wfSlotId = gpuDynInst->wfSlotId;
1458 switch(computeUnit->prefetchType) {
1460 last = computeUnit->lastVaddrCU[mp_index];
1462 case Enums::PF_PHASE:
1463 last = computeUnit->lastVaddrSimd[simdId][mp_index];
1466 last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
1471 DPRINTF(GPUPrefetch,
"CU[%d][%d][%d][%d]: %#x was last\n",
1472 computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
1480 computeUnit->lastVaddrCU[mp_index] =
vaddr;
1481 computeUnit->lastVaddrSimd[simdId][mp_index] =
vaddr;
1482 computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] =
vaddr;
1484 stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
1485 computeUnit->prefetchStride:
stride;
1487 DPRINTF(GPUPrefetch,
"%#x to: CU[%d][%d][%d][%d]\n",
vaddr,
1488 computeUnit->cu_id, simdId, wfSlotId, mp_index);
1493 for (
int pf = 1;
pf <= computeUnit->prefetchDepth; ++
pf) {
1500 RequestPtr prefetch_req = std::make_shared<Request>(
1503 computeUnit->requestorId(),
1512 new TheISA::GpuTLB::TranslationState(TLB_mode,
1513 computeUnit->shader->gpuTc,
true);
1516 sendFunctional(prefetch_pkt);
1519 TheISA::GpuTLB::TranslationState *tlb_state =
1520 safe_cast<TheISA::GpuTLB::TranslationState*>(
1524 delete tlb_state->tlbEntry;
1526 delete prefetch_pkt;
1545 computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);
1547 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
1548 computeUnit->cu_id, gpuDynInst->simdId,
1549 gpuDynInst->wfSlotId, mp_index, new_pkt->
req->getPaddr());
1551 computeUnit->schedule(mem_req_event,
curTick() +
1552 computeUnit->req_tick_latency);
1561 [
this, pkt]{ processMemReqEvent(pkt); },
1562 "ComputeUnit memory request event",
true);
1569 [
this, pkt]{ processMemRespEvent(pkt); },
1570 "ComputeUnit memory response event",
true);
1578 ComputeUnit *compute_unit M5_VAR_USED = computeUnit;
1580 if (!(sendTimingReq(pkt))) {
1581 retries.push_back(std::make_pair(pkt, gpuDynInst));
1584 "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
1585 compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1586 id, pkt->
req->getPaddr());
1589 "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data "
1590 "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1591 gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
id,
1592 pkt->
req->getPaddr());
1599 return "ComputeUnit scalar memory request event";
1605 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1613 "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
1614 compute_unit->cu_id, gpuDynInst->simdId,
1615 gpuDynInst->wfSlotId, pkt->req->getPaddr());
1618 "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data "
1619 "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
1620 gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
1621 pkt->req->getPaddr());
1634 int len = retries.size();
1636 DPRINTF(GPUTLB,
"CU%d: DTLB recvReqRetry - %d pending requests\n",
1637 computeUnit->cu_id,
len);
1640 assert(isStalled());
1645 for (
int i = 0;
i <
len; ++
i) {
1648 DPRINTF(GPUTLB,
"CU%d: retrying D-translaton for address%#x",
vaddr);
1650 if (!sendTimingReq(pkt)) {
1653 DPRINTF(GPUTLB,
": failed again\n");
1656 DPRINTF(GPUTLB,
": successful\n");
1657 retries.pop_front();
1667 TheISA::GpuTLB::TranslationState *translation_state =
1668 safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->
senderState);
1671 fatal_if(!translation_state->tlbEntry,
1672 "Translation of vaddr %#x failed\n", pkt->
req->getVaddr());
1674 delete translation_state->tlbEntry;
1675 assert(!translation_state->ports.size());
1678 delete translation_state;
1681 safe_cast<ScalarDTLBPort::SenderState*>(pkt->
senderState);
1686 Wavefront *
w M5_VAR_USED = gpuDynInst->wavefront();
1688 DPRINTF(GPUTLB,
"CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
1689 "translation: PA %#x -> %#x\n", computeUnit->cu_id,
w->simdId,
1690 w->wfSlotId,
w->kernId, pkt->
req->getVaddr(), pkt->
req->getPaddr());
1699 fatal(
"Scalar DTLB receieved unexpected MemCmd response %s\n",
1710 if (!computeUnit->scalarDataPort.sendTimingReq(req_pkt)) {
1711 computeUnit->scalarDataPort.retries.push_back(req_pkt);
1712 DPRINTF(GPUMem,
"send scalar req failed for: %s\n",
1713 gpuDynInst->disassemble());
1715 DPRINTF(GPUMem,
"send scalar req for: %s\n",
1716 gpuDynInst->disassemble());
1725 Addr line M5_VAR_USED = pkt->
req->getPaddr();
1726 DPRINTF(GPUTLB,
"CU%d: ITLBPort received %#x->%#x\n",
1727 computeUnit->cu_id, pkt->
req->getVaddr(), line);
1732 TheISA::GpuTLB::TranslationState *translation_state
1733 = safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->
senderState);
1735 bool success = translation_state->tlbEntry !=
nullptr;
1736 delete translation_state->tlbEntry;
1737 assert(!translation_state->ports.size());
1739 delete translation_state;
1743 safe_cast<ITLBPort::SenderState*>(pkt->
senderState);
1756 computeUnit->fetchStage.fetch(pkt, wavefront);
1779 int len = retries.size();
1780 DPRINTF(GPUTLB,
"CU%d: ITLB recvReqRetry - %d pending requests\n",
len);
1783 assert(isStalled());
1789 for (
int i = 0;
i <
len; ++
i) {
1792 DPRINTF(GPUTLB,
"CU%d: retrying I-translaton for address%#x",
vaddr);
1794 if (!sendTimingReq(pkt)) {
1796 DPRINTF(GPUTLB,
": failed again\n");
1799 DPRINTF(GPUTLB,
": successful\n");
1800 retries.pop_front();
1812 .
desc(
"Number of vector ALU insts issued.")
1815 .
name(
name() +
".valu_insts_per_wf")
1816 .
desc(
"The avg. number of vector ALU insts issued per-wavefront.")
1820 .
desc(
"Number of scalar ALU insts issued.")
1823 .
name(
name() +
".salu_insts_per_wf")
1824 .
desc(
"The avg. number of scalar ALU insts issued per-wavefront.")
1827 .
name(
name() +
".inst_cycles_valu")
1828 .
desc(
"Number of cycles needed to execute VALU insts.")
1831 .
name(
name() +
".inst_cycles_salu")
1832 .
desc(
"Number of cycles needed to execute SALU insts.")
1835 .
name(
name() +
".thread_cycles_valu")
1836 .
desc(
"Number of thread cycles used to execute vector ALU ops. "
1837 "Similar to instCyclesVALU but multiplied by the number of "
1841 .
name(
name() +
".valu_utilization")
1842 .
desc(
"Percentage of active vector ALU threads in a wave.")
1845 .
name(
name() +
".lds_no_flat_insts")
1846 .
desc(
"Number of LDS insts issued, not including FLAT "
1847 "accesses that resolve to LDS.")
1850 .
name(
name() +
".lds_no_flat_insts_per_wf")
1851 .
desc(
"The avg. number of LDS insts (not including FLAT "
1852 "accesses that resolve to LDS) per-wavefront.")
1856 .
desc(
"The number of FLAT insts that resolve to vmem issued.")
1859 .
name(
name() +
".flat_vmem_insts_per_wf")
1860 .
desc(
"The average number of FLAT insts that resolve to vmem "
1861 "issued per-wavefront.")
1865 .
desc(
"The number of FLAT insts that resolve to LDS issued.")
1868 .
name(
name() +
".flat_lds_insts_per_wf")
1869 .
desc(
"The average number of FLAT insts that resolve to LDS "
1870 "issued per-wavefront.")
1873 .
name(
name() +
".vector_mem_writes")
1874 .
desc(
"Number of vector mem write insts (excluding FLAT insts).")
1877 .
name(
name() +
".vector_mem_writes_per_wf")
1878 .
desc(
"The average number of vector mem write insts "
1879 "(excluding FLAT insts) per-wavefront.")
1882 .
name(
name() +
".vector_mem_reads")
1883 .
desc(
"Number of vector mem read insts (excluding FLAT insts).")
1886 .
name(
name() +
".vector_mem_reads_per_wf")
1887 .
desc(
"The avg. number of vector mem read insts (excluding "
1888 "FLAT insts) per-wavefront.")
1891 .
name(
name() +
".scalar_mem_writes")
1892 .
desc(
"Number of scalar mem write insts.")
1895 .
name(
name() +
".scalar_mem_writes_per_wf")
1896 .
desc(
"The average number of scalar mem write insts per-wavefront.")
1899 .
name(
name() +
".scalar_mem_reads")
1900 .
desc(
"Number of scalar mem read insts.")
1903 .
name(
name() +
".scalar_mem_reads_per_wf")
1904 .
desc(
"The average number of scalar mem read insts per-wavefront.")
1919 .
name(
name() +
".vector_mem_reads_per_kilo_inst")
1920 .desc(
"Number of vector mem reads per kilo-instruction")
1924 .
name(
name() +
".vector_mem_writes_per_kilo_inst")
1925 .desc(
"Number of vector mem writes per kilo-instruction")
1929 .
name(
name() +
".vector_mem_insts_per_kilo_inst")
1930 .desc(
"Number of vector mem insts per kilo-instruction")
1935 .
name(
name() +
".scalar_mem_reads_per_kilo_inst")
1936 .desc(
"Number of scalar mem reads per kilo-instruction")
1940 .
name(
name() +
".scalar_mem_writes_per_kilo_inst")
1941 .desc(
"Number of scalar mem writes per kilo-instruction")
1945 .
name(
name() +
".scalar_mem_insts_per_kilo_inst")
1946 .desc(
"Number of scalar mem insts per kilo-instruction")
1953 .
name(
name() +
".inst_cycles_vector_memory")
1954 .desc(
"Number of cycles to send address, command, data from VRF to "
1955 "vector memory unit, per SIMD")
1960 .
name(
name() +
".inst_cycles_scalar_memory")
1961 .desc(
"Number of cycles to send address, command, data from SRF to "
1962 "scalar memory unit, per SIMD")
1968 .desc(
"Number of cycles to send address, command, data from VRF to "
1969 "LDS unit, per SIMD")
1973 .
name(
name() +
".global_mem_reads")
1974 .desc(
"Number of reads to the global segment")
1977 .
name(
name() +
".global_mem_writes")
1978 .desc(
"Number of writes to the global segment")
1981 .
name(
name() +
".global_mem_insts")
1982 .desc(
"Number of memory instructions sent to the global segment")
1987 .desc(
"Number of reads to the arg segment")
1991 .desc(
"NUmber of writes to the arg segment")
1995 .desc(
"Number of memory instructions sent to the arg segment")
2000 .desc(
"Number of reads to the spill segment")
2004 .desc(
"Number of writes to the spill segment")
2008 .desc(
"Number of memory instructions sent to the spill segment")
2013 .desc(
"Number of reads to the group segment")
2017 .desc(
"Number of writes to the group segment")
2021 .desc(
"Number of memory instructions sent to the group segment")
2026 .desc(
"Number of reads to the private segment")
2030 .desc(
"Number of writes to the private segment")
2033 .
name(
name() +
".private_mem_insts")
2034 .desc(
"Number of memory instructions sent to the private segment")
2039 .desc(
"Number of reads to the readonly segment")
2043 .desc(
"Number of memory instructions sent to the readonly segment")
2046 .
name(
name() +
".readonly_mem_insts")
2047 .desc(
"Number of memory instructions sent to the readonly segment")
2052 .desc(
"Number of reads sent to the kernarg segment")
2056 .desc(
"Number of memory instructions sent to the kernarg segment")
2059 .
name(
name() +
".kernarg_mem_insts")
2060 .desc(
"Number of memory instructions sent to the kernarg segment")
2066 .desc(
"total number of cycles for all uncoalesced requests")
2071 .desc(
"number of uncoalesced requests")
2075 .
name(
name() +
".avg_translation_latency")
2076 .desc(
"Avg. translation latency for data translations")
2083 .
name(
name() +
".TLB_hits_distribution")
2084 .desc(
"TLB hits distribution (0 for page table, x for Lx-TLB")
2088 for (
int i = 0;
i < 4; ++
i) {
2098 .desc(
"Instruction Execution Rate: Number of executed vector "
2099 "instructions per cycle")
2104 .
name(
name() +
".lds_bank_conflicts")
2105 .desc(
"Number of bank conflicts per LDS memory packet")
2109 .
name(
name() +
".lds_bank_access_cnt")
2110 .desc(
"Total number of LDS bank accesses")
2118 .
name(
name() +
".page_divergence_dist")
2119 .desc(
"pages touched per wf (over all mem. instr.)")
2124 .
name(
name() +
".warp_execution_dist")
2125 .desc(
"number of lanes active per instruction (oval all instructions)")
2130 .
name(
name() +
".gmem_lanes_execution_dist")
2131 .desc(
"number of active lanes per global memory instruction")
2136 .
name(
name() +
".lmem_lanes_execution_dist")
2137 .desc(
"number of active lanes per local memory instruction")
2141 .
name(
name() +
".num_instr_executed")
2142 .desc(
"number of instructions executed")
2146 .
name(
name() +
".num_vec_ops_executed")
2147 .desc(
"number of vec ops executed (e.g. WF size/inst)")
2151 .
name(
name() +
".num_vec_ops_f16_executed")
2152 .desc(
"number of f16 vec ops executed (e.g. WF size/inst)")
2156 .
name(
name() +
".num_vec_ops_f32_executed")
2157 .desc(
"number of f32 vec ops executed (e.g. WF size/inst)")
2161 .
name(
name() +
".num_vec_ops_f64_executed")
2162 .desc(
"number of f64 vec ops executed (e.g. WF size/inst)")
2166 .
name(
name() +
".num_vec_ops_fma16_executed")
2167 .desc(
"number of fma16 vec ops executed (e.g. WF size/inst)")
2171 .
name(
name() +
".num_vec_ops_fma32_executed")
2172 .desc(
"number of fma32 vec ops executed (e.g. WF size/inst)")
2176 .
name(
name() +
".num_vec_ops_fma64_executed")
2177 .desc(
"number of fma64 vec ops executed (e.g. WF size/inst)")
2181 .
name(
name() +
".num_vec_ops_mad16_executed")
2182 .desc(
"number of mad16 vec ops executed (e.g. WF size/inst)")
2186 .
name(
name() +
".num_vec_ops_mad32_executed")
2187 .desc(
"number of mad32 vec ops executed (e.g. WF size/inst)")
2191 .
name(
name() +
".num_vec_ops_mad64_executed")
2192 .desc(
"number of mad64 vec ops executed (e.g. WF size/inst)")
2196 .
name(
name() +
".num_vec_ops_mac16_executed")
2197 .desc(
"number of mac16 vec ops executed (e.g. WF size/inst)")
2201 .
name(
name() +
".num_vec_ops_mac32_executed")
2202 .desc(
"number of mac32 vec ops executed (e.g. WF size/inst)")
2206 .
name(
name() +
".num_vec_ops_mac64_executed")
2207 .desc(
"number of mac64 vec ops executed (e.g. WF size/inst)")
2211 .
name(
name() +
".num_vec_ops_two_op_fp_executed")
2212 .desc(
"number of two op FP vec ops executed (e.g. WF size/inst)")
2216 .
name(
name() +
".num_total_cycles")
2217 .desc(
"number of cycles the CU ran for")
2222 .desc(
"Instructions per cycle (this CU only)")
2227 .desc(
"Vector Operations per cycle (this CU only)")
2232 .desc(
"F16 Vector Operations per cycle (this CU only)")
2237 .desc(
"F32 Vector Operations per cycle (this CU only)")
2242 .desc(
"F64 Vector Operations per cycle (this CU only)")
2246 .
name(
name() +
".num_alu_insts_executed")
2247 .desc(
"Number of dynamic non-GM memory insts executed")
2251 .
name(
name() +
".wg_blocked_due_barrier_alloc")
2252 .desc(
"WG dispatch was blocked due to lack of barrier resources")
2256 .
name(
name() +
".wg_blocked_due_lds_alloc")
2257 .desc(
"Workgroup blocked due to LDS capacity")
2267 .
name(
name() +
".times_wg_blocked_due_vgpr_alloc")
2268 .desc(
"Number of times WGs are blocked due to VGPR allocation per "
2273 .
name(
name() +
".times_wg_blocked_due_sgpr_alloc")
2274 .desc(
"Number of times WGs are blocked due to SGPR allocation per "
2279 .
name(
name() +
".global_mem_instr_cnt")
2280 .desc(
"dynamic non-flat global memory instruction count")
2284 .
name(
name() +
".flat_global_mem_instr_cnt")
2285 .desc(
"dynamic flat global memory instruction count")
2289 .
name(
name() +
".local_mem_instr_cnt")
2290 .desc(
"dynamic local memory intruction count")
2297 .
name(
name() +
".num_completed_wfs")
2298 .desc(
"number of completed wavefronts")
2302 .
name(
name() +
".num_completed_wgs")
2303 .desc(
"number of completed workgroups")
2308 .desc(
"number of compare and swap operations")
2312 .
name(
name() +
".num_failed_CAS_ops")
2313 .desc(
"number of compare and swap operations that failed")
2317 .
init(0, 1000000, 10000)
2318 .
name(
name() +
".head_tail_latency")
2319 .desc(
"ticks between first and last cache block arrival at coalescer")
2326 .desc(
"wave level parallelism: count of active waves at wave launch")
2332 .desc(
"Measure of instruction interleaving per SIMD")
2352 if (gpuDynInst->isScalar()) {
2353 if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
2356 }
else if (gpuDynInst->isLoad()) {
2358 }
else if (gpuDynInst->isStore()) {
2362 if (gpuDynInst->isALU()) {
2370 }
else if (gpuDynInst->isFlat()) {
2371 if (gpuDynInst->isLocalMem()) {
2376 }
else if (gpuDynInst->isLocalMem()) {
2378 }
else if (gpuDynInst->isLoad()) {
2380 }
else if (gpuDynInst->isStore()) {
2384 if (gpuDynInst->isLoad()) {
2385 switch (gpuDynInst->executedAs()) {
2386 case Enums::SC_SPILL:
2389 case Enums::SC_GLOBAL:
2392 case Enums::SC_GROUP:
2395 case Enums::SC_PRIVATE:
2398 case Enums::SC_READONLY:
2401 case Enums::SC_KERNARG:
2414 fatal(
"%s has no valid segment\n", gpuDynInst->disassemble());
2417 }
else if (gpuDynInst->isStore()) {
2418 switch (gpuDynInst->executedAs()) {
2419 case Enums::SC_SPILL:
2422 case Enums::SC_GLOBAL:
2425 case Enums::SC_GROUP:
2428 case Enums::SC_PRIVATE:
2431 case Enums::SC_READONLY:
2434 case Enums::SC_KERNARG:
2447 fatal(
"%s has no valid segment\n", gpuDynInst->disassemble());
2471 *page_stat_file <<
"page, wavefront accesses, workitem accesses" <<
2475 *page_stat_file << std::hex << iter.first <<
",";
2476 *page_stat_file << std::dec << iter.second.first <<
",";
2477 *page_stat_file << std::dec << iter.second.second << std::endl;
2514 const uint32_t wgId)
const
2524 for (
int i_wf = 0; i_wf <
shader->
n_wf; ++i_wf){
2543 RequestPtr newRequest = std::make_shared<Request>();
2544 newRequest->setPaddr(0x0);
2564 fatal_if(!senderState,
"did not get the right sort of sender state");
2571 computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
2585 fatal_if(!sender_state,
"packet without a valid sender state");
2590 fatal_if(retries.empty(),
"must have retries waiting to be stalled");
2594 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: LDS send failed!\n",
2595 computeUnit->cu_id, gpuDynInst->simdId,
2596 gpuDynInst->wfSlotId);
2604 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
2605 computeUnit->cu_id, gpuDynInst->simdId,
2606 gpuDynInst->wfSlotId, pkt->
req->getPaddr());
2609 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
2610 computeUnit->cu_id, gpuDynInst->simdId,
2611 gpuDynInst->wfSlotId, pkt->
req->getPaddr());
2625 auto queueSize = retries.size();
2627 DPRINTF(GPUPort,
"CU%d: LDSPort recvReqRetry - %d pending requests\n",
2628 computeUnit->cu_id, queueSize);
2631 "why was there a recvReqRetry() with no pending reqs?");
2633 "recvReqRetry() happened when the port was not stalled");
2637 while (!retries.empty()) {
2640 DPRINTF(GPUPort,
"CU%d: retrying LDS send\n", computeUnit->cu_id);
2645 DPRINTF(GPUPort,
": LDS send failed again\n");
2648 DPRINTF(GPUTLB,
": LDS send successful\n");