39 #include "debug/GPUDisp.hh" 40 #include "debug/GPUExec.hh" 41 #include "debug/GPUFetch.hh" 42 #include "debug/GPUMem.hh" 43 #include "debug/GPUPort.hh" 44 #include "debug/GPUPrefetch.hh" 45 #include "debug/GPUSync.hh" 46 #include "debug/GPUTLB.hh" 59 scoreboardCheckStage(p), scheduleStage(p), execStage(p),
60 globalMemoryPipe(p), localMemoryPipe(p), rrNextMemID(0), rrNextALUWp(0),
61 cu_id(p->cu_id), vrf(p->vector_register_file), numSIMDs(p->num_SIMDs),
62 spBypassPipeLength(p->spbypass_pipe_length),
63 dpBypassPipeLength(p->dpbypass_pipe_length),
64 issuePeriod(p->issue_period),
65 numGlbMemUnits(p->num_global_mem_pipes),
66 numLocMemUnits(p->num_shared_mem_pipes),
67 perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth),
68 prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type),
69 xact_cas_mode(p->xactCasMode), debugSegFault(p->debugSegFault),
70 functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier),
71 countPages(p->countPages), barrier_id(0),
72 vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
73 coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
74 req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
75 resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
76 _masterId(p->
system->getMasterId(this,
"ComputeUnit")),
77 lds(*p->localDataStore), _cacheLineSize(p->
system->cacheLineSize()),
78 globalSeqNum(0), wavefrontSize(p->wfSize),
    fatal_if(p->wfSize > std::numeric_limits<unsigned long long>::digits ||
             p->wfSize <= 0,
             "WF size is larger than the host can support");
    fatal_if(!isPowerOf2(wfSize()),
             "Wavefront size should be a power of 2");

    // number of full-bus cycles needed to move one wavefront's worth of
    // 32-bit values between the VRF and the coalescer
    numCyclesPerStoreTransfer =
        (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
                       (double)vrfToCoalescerBusWidth);
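    // A quick illustration of the arithmetic above, pulled out as a
    // standalone sketch (cyclesPerTransfer and busWidthBytes are
    // hypothetical names, not part of this file): one 32-bit value per lane
    // must cross the VRF<->coalescer bus, so the cycle count is the byte
    // total divided by the bus width, rounded up.
    //
    //     static inline uint32_t
    //     cyclesPerTransfer(uint32_t wfSize, uint32_t busWidthBytes)
    //     {
    //         return (uint32_t)std::ceil((double)(wfSize * sizeof(uint32_t))
    //                                    / (double)busWidthBytes);
    //     }
    //
    //     // e.g. cyclesPerTransfer(64, 32) == ceil(256 / 32.0) == 8 cycles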
    // ...

    for (int i = 0; i < p->n_wf; ++i) {
        for (int j = 0; j < numSIMDs; ++j) {
            wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
        }
    }
    // ...

    if (p->execPolicy == "OLDEST-FIRST") {
        // ...
    } else if (p->execPolicy == "ROUND-ROBIN") {
        // ...
    } else {
        fatal("Invalid WF execution policy (CU)\n");
    }
    // ...

    for (int i = 0; i < vrf.size(); ++i) {
        vrf[i]->setParent(this);
    }
    // ...
}

// ...

void
ComputeUnit::updateEvents()
{
    // ...

    while (i < vecSize) {
        // ...
        vrf[regInfo.first]->markReg(regInfo.second, sizeof(uint32_t),
                                    statusVec[i]);
        // ...
    }

    // ...

    for (int i = 0; i < numSIMDs; ++i) {
        vrf[i]->updateEvents();
    }
}
// ...

void
ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
                            NDRange *ndr)
{
    static int _n_wave = 0;

    // ...

    w->initMask = init_mask.to_ullong();
    // ...
}

void
ComputeUnit::StartWorkgroup(NDRange *ndr)
{
    // ...

    int32_t refCount M5_VAR_USED =
        lds.increaseRefCounter(ndr->dispatchId, ndr->globalWgId);

    DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
            cu_id, ndr->globalWgId, refCount);

    // ...

    DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
            "WF[%d][%d]\n", w->wfDynId, w->barrierId, cu_id, w->simdId,
            w->wfSlotId);

    // ...

    // the kernel-begin acquire is sent without a continuation
    gpuDynInst->useContinuation = false;
    // ...

    uint32_t normSize = 0;

    // reserve vector registers for the scheduled wavefront
    w->startVgprIndex = vrf[w->simdId]->manager->
        allocateRegion(vregDemand, &normSize);
    // ...
}
int
ComputeUnit::ReadyWorkgroup(NDRange *ndr)
{
    // compute the true size of the workgroup in each dimension
    // ...
    int trueWgSizeTotal = 1;

    for (int d = 0; d < 3; ++d) {
        // ...
        trueWgSizeTotal *= trueWgSize[d];
        DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
    }

    DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
    // ...

    bool vregAvail = true;
    // round up: a partially filled wavefront still occupies a full WF slot
    int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
    // ...
    int numMappedWfs = 0;
    // ...

    if (numMappedWfs < numWfs) {
        // ...
    }

    // ...

    if (freeWfSlots >= numWfs) {
        // check that each SIMD has enough free VGPRs for the wavefronts
        // mapped to it
        // ...
        vregAvail = vrf[j]->manager->canAllocate(numWfsPerSimd[j],
                                                 vregDemand);
        // ...
    }

    DPRINTF(GPUDisp, "Free WF slots = %d, VGPR Availability = %d\n",
            freeWfSlots, vregAvail);
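    // Worked example (hypothetical sizes, for illustration only): a 10x10x1
    // workgroup gives trueWgSizeTotal == 100; with wfSize() == 64 the
    // round-up division above yields (100 + 63) / 64 == 2 wavefronts, and
    // the workgroup is ready only if both a free WF slot and sufficient
    // VGPRs exist for each of them.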
    // ...
}

int
ComputeUnit::AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots)
{
    DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id);
    int ccnt = 0;

    for (int i_simd = 0; i_simd < numSIMDs; ++i_simd) {
        for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
            Wavefront *w = wfList[i_simd][i_wf];

            if (w->status == Wavefront::S_RUNNING) {
                DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf);

                DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n",
                        w->barrierId, _barrier_id);

                DPRINTF(GPUSync, "wf->barrier_cnt %d, bcnt = %d\n",
                        w->barrierCnt, bcnt);
            }

            if (w->status == Wavefront::S_RUNNING &&
                w->barrierId == _barrier_id && w->barrierCnt == bcnt &&
                !w->outstandingReqs) {
                ++ccnt;

                DPRINTF(GPUSync, "WF[%d][%d] at barrier, increment ccnt to "
                        "%d\n", i_simd, i_wf, ccnt);
            }
        }
    }

    DPRINTF(GPUSync, "CU%d: returning allAtBarrier ccnt = %d, bslots = %d\n",
            cu_id, ccnt, bslots);

    return ccnt == bslots;
}
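// Note on AllAtBarrier(): a wavefront counts toward the barrier only if it
// matches both the barrier id and the current barrier count, so stale
// wavefronts from a previous barrier instance are never mistaken for
// arrivals; the barrier completes exactly when ccnt reaches bslots, the
// number of wavefront slots participating in this barrier.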
// ...

bool
ComputeUnit::cedeSIMD(int simdId, int wfSlotId)
{
    // ...

    if (!curWaveIDQueue.empty()) {
        for (auto it : curWaveIDQueue) {
            waveIdentifier cur_wave = it;

            if (cur_wave.simdId == simdId &&
                cur_wave.wfSlotId == wfSlotId) {
                // ...
            }
        }
    }
    // ...
}
555 "No support for multiple Global Memory Pipelines exists!!!");
563 "No support for multiple Local Memory Pipelines exists!!!");
591 readyList.resize(numSIMDs + numGlbMemUnits + numLocMemUnits);
bool
ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
{
    // ...

    // is the returned packet a kernel-end release or kernel-begin acquire?
    if (pkt->req->isKernel() && pkt->req->isRelease()) {
        Wavefront *w =
            computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];

        // ...
        DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG id completed %d\n",
                computeUnit->cu_id, w->simdId, w->wfSlotId, w->wfDynId,
                w->wgId);

        computeUnit->shader->dispatcher->notifyWgCompl(w);
        // ...

        DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrier_cnt = %d\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, w->barrierCnt);

        if (gpuDynInst->useContinuation) {
            assert(!gpuDynInst->isNoScope());
            gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
                                         gpuDynInst);
        }

        // ...
        return true;
    } else if (pkt->req->isKernel() && pkt->req->isAcquire()) {
        if (gpuDynInst->useContinuation) {
            assert(!gpuDynInst->isNoScope());
            gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
                                         gpuDynInst);
        }

        // ...
        return true;
    }

    EventFunctionWrapper *mem_resp_event =
        computeUnit->memPort[index]->createMemRespEvent(pkt);

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x received!\n",
            computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
            index, pkt->req->getPaddr());

    computeUnit->schedule(mem_resp_event,
                          curTick() + computeUnit->resp_tick_latency);

    return true;
}
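// The isKernel()+isRelease() and isKernel()+isAcquire() cases above are the
// two halves of the kernel boundary sync: release marks kernel completion
// (and may notify the dispatcher that the workgroup is done), acquire marks
// kernel begin; in both cases any attached continuation runs once the fence
// response returns.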
void
ComputeUnit::DataPort::recvReqRetry()
{
    int len = retries.size();

    assert(len > 0);

    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = retries.front().first;
        GPUDynInstPtr gpuDynInst M5_VAR_USED = retries.front().second;
        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
                computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                pkt->req->getPaddr());

        // ...

        if (!sendTimingReq(pkt)) {
            DPRINTF(GPUMem, "failed again!\n");
            break;
        } else {
            DPRINTF(GPUMem, "successful!\n");
            retries.pop_front();
        }
    }
}
bool
ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
{
    computeUnit->fetchStage.processFetchReturn(pkt);
    return true;
}

void
ComputeUnit::SQCPort::recvReqRetry()
{
    int len = retries.size();

    assert(len > 0);

    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = retries.front().first;
        Wavefront *wavefront M5_VAR_USED = retries.front().second;
        DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
                computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
                pkt->req->getPaddr());

        if (!sendTimingReq(pkt)) {
            DPRINTF(GPUFetch, "failed again!\n");
            break;
        } else {
            DPRINTF(GPUFetch, "successful!\n");
            retries.pop_front();
        }
    }
}
void
ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
{
    // ...
    Addr tmp_vaddr = pkt->req->getVaddr();

    // ...
    pkt->req->setPC(gpuDynInst->wavefront()->pc());
    // ...
    pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());

    // figure out the type of the request to pick the TLB mode; check write
    // before read since atomics must translate as writes
    BaseTLB::Mode TLB_mode;

    if (pkt->isWrite()) {
        TLB_mode = BaseTLB::Write;
    } else if (pkt->isRead()) {
        TLB_mode = BaseTLB::Read;
    } else {
        fatal("pkt is not a read nor a write\n");
    }

    tlbCycles -= curTick();
    ++tlbRequests;

    int tlbPort_index = perLaneTLB ? index : 0;
    if (shader->timingSim) {
        if (debugSegFault) {
            Process *p = shader->gpuTc->getProcessPtr();
            Addr vaddr = pkt->req->getVaddr();
            unsigned size = pkt->getSize();

            if ((vaddr + size - 1) % 64 < vaddr % 64) {
                panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
                      cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
            }

            Addr paddr;

            if (!p->pTable->translate(vaddr, paddr)) {
                if (!p->fixupFault(vaddr)) {
                    panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
                          cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                          vaddr);
                }
            }
        }
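        // The modulo test above flags accesses that straddle a 64-byte
        // boundary: e.g. vaddr 0x3c with size 8 gives (0x3c + 7) % 64 == 3,
        // which is less than 0x3c % 64 == 60, so the access wraps into the
        // next cache line and is treated as unaligned.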
        // the TLB hierarchy expects the translation state to ride along as
        // sender state
        TheISA::GpuTLB::TranslationState *translation_state =
            new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc,
                                                 false, pkt->senderState);
        pkt->senderState = translation_state;

        if (functionalTLB) {
            tlbPort[tlbPort_index]->sendFunctional(pkt);

            // update the hitLevel distribution
            int hit_level = translation_state->hitLevel;
            assert(hit_level != -1);
            hitsPerTLBLevel[hit_level]++;

            // ...
            delete sender_state->saved;
            delete sender_state;

            assert(pkt->req->hasPaddr());
            assert(pkt->req->hasSize());

            uint8_t *tmpData = pkt->getPtr<uint8_t>();

            // the GPU TLB works on packets rather than requests: after
            // translation the request holds the physical address but the
            // packet does not, so rebuild the packet around the translated
            // request
            PacketPtr oldPkt = pkt;
            pkt = new Packet(oldPkt->req, oldPkt->cmd);
            delete oldPkt;
            pkt->dataStatic(tmpData);

            // ...
            gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
            gpuDynInst->tlbHitLevel[index] = hit_level;

            // ...

            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
                    "scheduled\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, index, pkt->req->getPaddr());

            // ...
        } else if (tlbPort[tlbPort_index]->isStalled()) {
            assert(tlbPort[tlbPort_index]->retries.size() > 0);

            DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
                    "failed!\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, tmp_vaddr);

            tlbPort[tlbPort_index]->retries.push_back(pkt);
        } else if (!tlbPort[tlbPort_index]->sendTimingReq(pkt)) {
            // the send was nack'd: stall the port until a retry arrives
            tlbPort[tlbPort_index]->stallPort();

            DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
                    "failed!\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, tmp_vaddr);

            tlbPort[tlbPort_index]->retries.push_back(pkt);
        } else {
            DPRINTF(GPUTLB,
                    "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
                    cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                    tmp_vaddr);
        }
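        // The three arms above follow the usual gem5 timing-port discipline:
        // queue behind earlier retries if the port is already stalled, stall
        // and queue if this send is nack'd, otherwise the translation
        // request is in flight.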
    } else {
        // atomic/functional mode: translate and access memory inline
        gpuDynInst->statusBitVector &= (~(1ll << index));

        // ...

        // only the TLB translation state is needed on this path
        pkt->senderState =
            new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc);

        tlbPort[tlbPort_index]->sendFunctional(pkt);

        // the packet still carries the virtual address; rebuild it around
        // the translated request before touching memory
        PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
        new_pkt->dataStatic(pkt->getPtr<uint8_t>());

        memPort[0]->sendFunctional(new_pkt);

        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
                gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
                new_pkt->req->getPaddr());

        TheISA::GpuTLB::TranslationState *sender_state =
            safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

        delete sender_state->tlbEntry;
        delete new_pkt;
        // ...
    }
}
void
ComputeUnit::sendSyncRequest(GPUDynInstPtr gpuDynInst, int index,
                             PacketPtr pkt)
{
    // ...
    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
            cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
            pkt->req->getPaddr());
    // ...
}
void
ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch,
                                  RequestPtr req)
{
    assert(gpuDynInst->isGlobalSeg());

    if (!req) {
        req = std::make_shared<Request>(
            0, 0, 0, masterId(), 0, gpuDynInst->wfDynId);
    }
    // ...
    gpuDynInst->setRequestFlags(req, kernelLaunch);
    // ...
    assert(req->isAcquire() || req->isRelease());
    // ...
}
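// injectGlobalMemFence() serves both ends of a kernel: at launch the
// synthetic request is marked acquire, at completion release, which is what
// the assertion above enforces; when no request is supplied, the function
// builds an empty one tagged only with the master ID and the wavefront's
// dynamic ID.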
void
ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
{
    DataPort::SenderState *sender_state =
        safe_cast<DataPort::SenderState*>(pkt->senderState);

    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
    ComputeUnit *compute_unit = computeUnit;

    assert(gpuDynInst);

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
            compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
            pkt->req->getPaddr(), index);

    Addr paddr = pkt->req->getPaddr();

    if (pkt->cmd != MemCmd::MemFenceResp) {
        int index = gpuDynInst->memStatusVector[paddr].back();

        DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
                paddr, index);

        gpuDynInst->memStatusVector[paddr].pop_back();
        gpuDynInst->pAddr = pkt->req->getPaddr();

        // small accesses clear their lane bit directly; wide vector
        // accesses keep a per-lane count and clear the bit when it drains
        if (gpuDynInst->n_reg <= MAX_REGS_FOR_NON_VEC_MEM_INST) {
            gpuDynInst->statusBitVector &= (~(1ULL << index));
        } else {
            assert(gpuDynInst->statusVector[index] > 0);
            gpuDynInst->statusVector[index]--;

            if (!gpuDynInst->statusVector[index])
                gpuDynInst->statusBitVector &= (~(1ULL << index));
        }

        DPRINTF(GPUMem, "bitvector is now %#x\n",
                gpuDynInst->statusBitVector);

        if (gpuDynInst->statusBitVector == VectorMask(0)) {
            auto iter = gpuDynInst->memStatusVector.begin();
            auto end = gpuDynInst->memStatusVector.end();

            while (iter != end) {
                assert(iter->second.empty());
                ++iter;
            }

            gpuDynInst->memStatusVector.clear();
            gpuDynInst->statusVector.clear();

            // ...

            DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
                    compute_unit->cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId);

            // ...

            if (gpuDynInst->useContinuation) {
                assert(!gpuDynInst->isNoScope());
                gpuDynInst->execContinuation(
                    gpuDynInst->staticInstruction(),
                    gpuDynInst);
            }
        }
    } else {
        if (gpuDynInst->useContinuation) {
            assert(!gpuDynInst->isNoScope());
            gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
                                         gpuDynInst);
        }
    }

    delete pkt->senderState;
    delete pkt;
}
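// Response bookkeeping summary: memStatusVector maps each physical address
// to the lane indices still waiting on it, and statusBitVector holds one bit
// per outstanding lane, so the instruction is complete exactly when the
// bitvector reaches zero; at that point any continuation (e.g. a scoped
// release) is executed.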
ComputeUnit*
ComputeUnitParams::create()
{
    return new ComputeUnit(this);
}
bool
ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
{
    Addr line = pkt->req->getPaddr();

    DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
            pkt->req->getVaddr(), line);

    assert(pkt->senderState);
    // the translation has returned; accumulate the round-trip latency
    computeUnit->tlbCycles += curTick();

    // pop off the TLB translation state
    TheISA::GpuTLB::TranslationState *translation_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    BaseTLB::Mode TLB_mode = translation_state->tlbMode;

    // no page faults are permitted for data accesses
    if (!translation_state->tlbEntry) {
        DTLBPort::SenderState *sender_state =
            safe_cast<DTLBPort::SenderState*>(translation_state->saved);

        Wavefront *w M5_VAR_USED =
            computeUnit->wfList[sender_state->_gpuDynInst->simdId]
            [sender_state->_gpuDynInst->wfSlotId];

        DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
                 pkt->req->getVaddr());
    }

    // update the hitLevel distribution
    int hit_level = translation_state->hitLevel;
    computeUnit->hitsPerTLBLevel[hit_level]++;

    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());
    pkt->senderState = translation_state->saved;

    delete translation_state;

    // recover the requesting instruction from the saved sender state;
    // mp_index, the data-port index, is recovered the same way
    DTLBPort::SenderState *sender_state =
        safe_cast<DTLBPort::SenderState*>(pkt->senderState);

    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
    // ...
    Addr vaddr = pkt->req->getVaddr();
    // ...
    gpuDynInst->memStatusVector[line].push_back(mp_index);
    gpuDynInst->tlbHitLevel[mp_index] = hit_level;

    // convert the TLB response back into a request command before the
    // packet is re-issued to the data port
    MemCmd requestCmd;

    if (pkt->cmd == MemCmd::ReadResp) {
        requestCmd = MemCmd::ReadReq;
    } else if (pkt->cmd == MemCmd::WriteResp) {
        requestCmd = MemCmd::WriteReq;
    } else {
        panic("unsupported response to request conversion %s\n",
              pkt->cmd.toString());
    }
    if (computeUnit->prefetchDepth) {
        int simdId = gpuDynInst->simdId;
        int wfSlotId = gpuDynInst->wfSlotId;
        Addr last = 0;

        switch(computeUnit->prefetchType) {
        case Enums::PF_CU:
            last = computeUnit->lastVaddrCU[mp_index];
            break;
        case Enums::PF_PHASE:
            last = computeUnit->lastVaddrSimd[simdId][mp_index];
            break;
        case Enums::PF_WF:
            last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
        default:
            break;
        }

        DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
                computeUnit->cu_id, simdId, wfSlotId, mp_index, last);

        int stride = last ? (roundDown(vaddr, TheISA::PageBytes) -
                             roundDown(last, TheISA::PageBytes))
                            >> TheISA::PageShift
                          : 0;

        DPRINTF(GPUPrefetch, "Stride is %d\n", stride);

        computeUnit->lastVaddrCU[mp_index] = vaddr;
        computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
        computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;

        stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
            computeUnit->prefetchStride : stride;

        DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
                computeUnit->cu_id, simdId, wfSlotId, mp_index);

        DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);

        // prefetch translations for the next few pages, functionally
        for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
            DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
                    vaddr + stride * pf * TheISA::PageBytes);

            if (!stride)
                break;

            RequestPtr prefetch_req = std::make_shared<Request>(
                vaddr + stride * pf * TheISA::PageBytes,
                sizeof(uint8_t), 0,
                computeUnit->masterId(),
                0, 0, nullptr);

            PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
            uint8_t foo = 0;
            prefetch_pkt->dataStatic(&foo);

            // only the TLB translation state is needed for this probe
            prefetch_pkt->senderState =
                new TheISA::GpuTLB::TranslationState(TLB_mode,
                        computeUnit->shader->gpuTc, true);

            // currently prefetches are zero-latency, hence sendFunctional
            sendFunctional(prefetch_pkt);

            // unpack the TLB state and clean up
            TheISA::GpuTLB::TranslationState *tlb_state =
                safe_cast<TheISA::GpuTLB::TranslationState*>(
                        prefetch_pkt->senderState);

            delete tlb_state->tlbEntry;
            delete tlb_state;
            delete prefetch_pkt;
        }
    }
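    // Prefetch sketch: the observed stride is measured in whole pages
    // between this and the previous faulting vaddr (or taken directly from
    // prefetchStride under PF_STRIDE), and each of the prefetchDepth
    // translations is warmed with a functional TLB access at
    // vaddr + pf * stride * PageBytes; no data is fetched from memory.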
    // ...

    PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
    // ...

    EventFunctionWrapper *mem_req_event =
        computeUnit->memPort[mp_index]->createMemReqEvent(new_pkt);

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
            computeUnit->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());

    computeUnit->schedule(mem_req_event, curTick() +
                          computeUnit->req_tick_latency);

    return true;
}
EventFunctionWrapper*
ComputeUnit::DataPort::createMemReqEvent(PacketPtr pkt)
{
    return new EventFunctionWrapper(
        [this, pkt]{ processMemReqEvent(pkt); },
        "ComputeUnit memory request event", true);
}

EventFunctionWrapper*
ComputeUnit::DataPort::createMemRespEvent(PacketPtr pkt)
{
    return new EventFunctionWrapper(
        [this, pkt]{ processMemRespEvent(pkt); },
        "ComputeUnit memory response event", true);
}
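// The final "true" argument asks EventFunctionWrapper to free itself after
// it fires, which fits the one-event-per-packet pattern here: each request
// or response gets its own heap-allocated event, scheduled
// req_tick_latency/resp_tick_latency ticks out to model the port latency.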
void
ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt)
{
    SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
    ComputeUnit *compute_unit M5_VAR_USED = computeUnit;

    if (!(sendTimingReq(pkt))) {
        retries.push_back(std::make_pair(pkt, gpuDynInst));

        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
                compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
    } else {
        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: index %d, addr %#x data req sent!\n",
                compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
    }
}
void
ComputeUnit::DTLBPort::recvReqRetry()
{
    int len = retries.size();

    DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, len);

    assert(len > 0);
    assert(isStalled());

    // unstall the port before retrying
    unstallPort();

    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = retries.front();
        Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
        DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x",
                computeUnit->cu_id, vaddr);

        if (!sendTimingReq(pkt)) {
            // re-stall on the first failure
            stallPort();
            DPRINTF(GPUTLB, ": failed again\n");
            break;
        } else {
            DPRINTF(GPUTLB, ": successful\n");
            retries.pop_front();
        }
    }
}
bool
ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
{
    Addr line M5_VAR_USED = pkt->req->getPaddr();
    DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
            computeUnit->cu_id, pkt->req->getVaddr(), line);

    assert(pkt->senderState);

    // pop off the TLB translation state
    TheISA::GpuTLB::TranslationState *translation_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    bool success = translation_state->tlbEntry != nullptr;
    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());
    pkt->senderState = translation_state->saved;
    delete translation_state;

    // the saved sender state identifies the fetching wavefront
    ITLBPort::SenderState *sender_state =
        safe_cast<ITLBPort::SenderState*>(pkt->senderState);
    Wavefront *wavefront = sender_state->wavefront;

    if (success) {
        // ...
        computeUnit->fetchStage.fetch(pkt, wavefront);
    } else {
        if (wavefront->dropFetch) {
            assert(wavefront->instructionBuffer.empty());
            wavefront->dropFetch = false;
        }

        wavefront->pendingFetch = 0;
    }

    return true;
}
void
ComputeUnit::ITLBPort::recvReqRetry()
{
    int len = retries.size();
    DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, len);

    assert(len > 0);
    assert(isStalled());

    // unstall the port before retrying
    unstallPort();

    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = retries.front();
        Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
        DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x",
                computeUnit->cu_id, vaddr);

        if (!sendTimingReq(pkt)) {
            stallPort(); // re-stall on the first failure
            DPRINTF(GPUTLB, ": failed again\n");
            break;
        } else {
            DPRINTF(GPUTLB, ": successful\n");
            retries.pop_front();
        }
    }
}
void
ComputeUnit::regStats()
{
    ClockedObject::regStats();

    vALUInsts
        .name(name() + ".valu_insts")
        .desc("Number of vector ALU insts issued.")
        ;
    vALUInstsPerWF
        .name(name() + ".valu_insts_per_wf")
        .desc("The avg. number of vector ALU insts issued per-wavefront.")
        ;
    sALUInsts
        .name(name() + ".salu_insts")
        .desc("Number of scalar ALU insts issued.")
        ;
    sALUInstsPerWF
        .name(name() + ".salu_insts_per_wf")
        .desc("The avg. number of scalar ALU insts issued per-wavefront.")
        ;
    instCyclesVALU
        .name(name() + ".inst_cycles_valu")
        .desc("Number of cycles needed to execute VALU insts.")
        ;
    instCyclesSALU
        .name(name() + ".inst_cycles_salu")
        .desc("Number of cycles needed to execute SALU insts.")
        ;
    threadCyclesVALU
        .name(name() + ".thread_cycles_valu")
        .desc("Number of thread cycles used to execute vector ALU ops. "
              "Similar to instCyclesVALU but multiplied by the number of "
              "active threads.")
        ;
    vALUUtilization
        .name(name() + ".valu_utilization")
        .desc("Percentage of active vector ALU threads in a wave.")
        ;
    ldsNoFlatInsts
        .name(name() + ".lds_no_flat_insts")
        .desc("Number of LDS insts issued, not including FLAT "
              "accesses that resolve to LDS.")
        ;
    ldsNoFlatInstsPerWF
        .name(name() + ".lds_no_flat_insts_per_wf")
        .desc("The avg. number of LDS insts (not including FLAT "
              "accesses that resolve to LDS) per-wavefront.")
        ;
    flatVMemInsts
        .name(name() + ".flat_vmem_insts")
        .desc("The number of FLAT insts that resolve to vmem issued.")
        ;
    flatVMemInstsPerWF
        .name(name() + ".flat_vmem_insts_per_wf")
        .desc("The average number of FLAT insts that resolve to vmem "
              "issued per-wavefront.")
        ;
    flatLDSInsts
        .name(name() + ".flat_lds_insts")
        .desc("The number of FLAT insts that resolve to LDS issued.")
        ;
    flatLDSInstsPerWF
        .name(name() + ".flat_lds_insts_per_wf")
        .desc("The average number of FLAT insts that resolve to LDS "
              "issued per-wavefront.")
        ;
    vectorMemWrites
        .name(name() + ".vector_mem_writes")
        .desc("Number of vector mem write insts (excluding FLAT insts).")
        ;
    vectorMemWritesPerWF
        .name(name() + ".vector_mem_writes_per_wf")
        .desc("The average number of vector mem write insts "
              "(excluding FLAT insts) per-wavefront.")
        ;
    vectorMemReads
        .name(name() + ".vector_mem_reads")
        .desc("Number of vector mem read insts (excluding FLAT insts).")
        ;
    vectorMemReadsPerWF
        .name(name() + ".vector_mem_reads_per_wf")
        .desc("The avg. number of vector mem read insts (excluding "
              "FLAT insts) per-wavefront.")
        ;
    scalarMemWrites
        .name(name() + ".scalar_mem_writes")
        .desc("Number of scalar mem write insts.")
        ;
    scalarMemWritesPerWF
        .name(name() + ".scalar_mem_writes_per_wf")
        .desc("The average number of scalar mem write insts per-wavefront.")
        ;
    scalarMemReads
        .name(name() + ".scalar_mem_reads")
        .desc("Number of scalar mem read insts.")
        ;
    scalarMemReadsPerWF
        .name(name() + ".scalar_mem_reads_per_wf")
        .desc("The average number of scalar mem read insts per-wavefront.")
        ;

    // (each *PerWF formula above is defined as the corresponding counter
    // divided by completedWfs; the formula assignments are elided)

    tlbCycles
        .name(name() + ".tlb_cycles")
        .desc("total number of cycles for all uncoalesced requests")
        ;
    tlbRequests
        .name(name() + ".tlb_requests")
        .desc("number of uncoalesced requests")
        ;
    tlbLatency
        .name(name() + ".avg_translation_latency")
        .desc("Avg. translation latency for data translations")
        ;
    tlbLatency = tlbCycles / tlbRequests;

    hitsPerTLBLevel
        .init(4)
        .name(name() + ".TLB_hits_distribution")
        .desc("TLB hits distribution (0 for page table, x for Lx-TLB)")
        ;

    // fixed number of TLB levels
    for (int i = 0; i < 4; ++i) {
        if (!i)
            hitsPerTLBLevel.subname(i, "page_table");
        else
            hitsPerTLBLevel.subname(i, csprintf("L%d_TLB", i));
    }

    // (the Distribution stats below also take .init(min, max, bucket_size)
    // before .name(); the init parameters are elided)

    execRateDist
        .name(name() + ".inst_exec_rate")
        .desc("Instruction Execution Rate: Number of executed vector "
              "instructions per cycle")
        ;
    ldsBankConflictDist
        .name(name() + ".lds_bank_conflicts")
        .desc("Number of bank conflicts per LDS memory packet")
        ;
    ldsBankAccesses
        .name(name() + ".lds_bank_access_cnt")
        .desc("Total number of LDS bank accesses")
        ;
    pageDivergenceDist
        .name(name() + ".page_divergence_dist")
        .desc("pages touched per wf (over all mem. instr.)")
        ;
    controlFlowDivergenceDist
        .name(name() + ".warp_execution_dist")
        .desc("number of lanes active per instruction (over all "
              "instructions)")
        ;
    activeLanesPerGMemInstrDist
        .name(name() + ".gmem_lanes_execution_dist")
        .desc("number of active lanes per global memory instruction")
        ;
    activeLanesPerLMemInstrDist
        .name(name() + ".lmem_lanes_execution_dist")
        .desc("number of active lanes per local memory instruction")
        ;
    numInstrExecuted
        .name(name() + ".num_instr_executed")
        .desc("number of instructions executed")
        ;
    numVecOpsExecuted
        .name(name() + ".num_vec_ops_executed")
        .desc("number of vec ops executed (e.g. WF size/inst)")
        ;
    totalCycles
        .name(name() + ".num_total_cycles")
        .desc("number of cycles the CU ran for")
        ;
    ipc
        .name(name() + ".ipc")
        .desc("Instructions per cycle (this CU only)")
        ;
    vpc
        .name(name() + ".vpc")
        .desc("Vector Operations per cycle (this CU only)")
        ;
    ipc = numInstrExecuted / totalCycles;
    vpc = numVecOpsExecuted / totalCycles;

    numALUInstsExecuted
        .name(name() + ".num_alu_insts_executed")
        .desc("Number of dynamic non-GM memory insts executed")
        ;
    wgBlockedDueLdsAllocation
        .name(name() + ".wg_blocked_due_lds_alloc")
        .desc("Workgroup blocked due to LDS capacity")
        ;
    numTimesWgBlockedDueVgprAlloc
        .name(name() + ".times_wg_blocked_due_vgpr_alloc")
        .desc("Number of times WGs are blocked due to VGPR allocation per "
              "SIMD")
        ;
    dynamicGMemInstrCnt
        .name(name() + ".global_mem_instr_cnt")
        .desc("dynamic global memory instructions count")
        ;
    dynamicLMemInstrCnt
        .name(name() + ".local_mem_instr_cnt")
        .desc("dynamic local memory instruction count")
        ;
    completedWfs
        .name(name() + ".num_completed_wfs")
        .desc("number of completed wavefronts")
        ;
    numCASOps
        .name(name() + ".num_CAS_ops")
        .desc("number of compare and swap operations")
        ;
    numFailedCASOps
        .name(name() + ".num_failed_CAS_ops")
        .desc("number of compare and swap operations that failed")
        ;
    // ...
}
// ...

void
ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
{
    if (gpuDynInst->isScalar()) {
        if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
            sALUInsts++;
            instCyclesSALU++;
        } else if (gpuDynInst->isLoad()) {
            scalarMemReads++;
        } else if (gpuDynInst->isStore()) {
            scalarMemWrites++;
        }
    } else {
        if (gpuDynInst->isALU()) {
            vALUInsts++;
            instCyclesVALU++;
            threadCyclesVALU += gpuDynInst->wavefront()->execMask().count();
        } else if (gpuDynInst->isFlat()) {
            if (gpuDynInst->isLocalMem()) {
                flatLDSInsts++;
            } else {
                flatVMemInsts++;
            }
        } else if (gpuDynInst->isLocalMem()) {
            ldsNoFlatInsts++;
        } else if (gpuDynInst->isLoad()) {
            vectorMemReads++;
        } else if (gpuDynInst->isStore()) {
            vectorMemWrites++;
        }
    }
}
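// Classification note for updateInstStats(): FLAT instructions are counted
// by where they resolved (LDS vs. vmem), keeping the flat_* stats disjoint
// from lds_no_flat_insts and the vector_mem_* counters, so the per-category
// stats can be summed without double counting.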
// ...

void
CUExitCallback::process()
{
    if (computeUnit->countPages) {
        std::ostream *page_stat_file =
            simout.create(computeUnit->name().c_str())->stream();

        *page_stat_file << "page, wavefront accesses, workitem accesses" <<
            std::endl;

        for (auto iter : computeUnit->pageAccesses) {
            *page_stat_file << std::hex << iter.first << ",";
            *page_stat_file << std::dec << iter.second.first << ",";
            *page_stat_file << std::dec << iter.second.second << std::endl;
        }
    }
}
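// Each row written above is "<page vaddr in hex>,<wavefront accesses>,
// <work-item accesses>", one line per entry in pageAccesses; the file is
// only produced when countPages is set on the CU.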
void
ComputeUnit::exec()
{
    // ...
    bool glbMemBusRdy = true;
    // ...
    bool locMemBusRdy = true;
    // ...

    for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){
        // ...
    }
    // ...
}
// ...

bool
ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
{
    // the request carried here is only a vehicle for the GPUDynInstPtr;
    // the address is meaningless
    RequestPtr newRequest = std::make_shared<Request>();
    newRequest->setPaddr(0x0);
    // ...
}
bool
ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
{
    // ...
    fatal_if(!senderState, "did not get the right sort of sender state");
    // ...
    computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
    // ...
    return true;
}

// ...

bool
ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
{
    ComputeUnit::LDSPort::SenderState *sender_state =
        dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
    fatal_if(!sender_state, "packet without a valid sender state");

    GPUDynInstPtr gpuDynInst M5_VAR_USED = sender_state->getMemInst();

    if (isStalled()) {
        fatal_if(retries.empty(), "must have retries waiting to be stalled");

        retries.push(pkt);

        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId);
        return false;
    } else if (!MasterPort::sendTimingReq(pkt)) {
        // stall the LDS port until a recvReqRetry() signals more space
        stallPort();
        retries.push(pkt);

        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, pkt->req->getPaddr());
        return false;
    } else {
        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, pkt->req->getPaddr());
        return true;
    }
}

/**
 * the bus is telling the port that there is now space, so retrying stalled
 * requests should work now
 */
void
ComputeUnit::LDSPort::recvReqRetry()
{
    auto queueSize = retries.size();

    DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, queueSize);

    fatal_if(queueSize < 1,
             "why was there a recvReqRetry() with no pending reqs?");
    fatal_if(!isStalled(),
             "recvReqRetry() happened when the port was not stalled");

    unstallPort();

    while (!retries.empty()) {
        PacketPtr packet = retries.front();

        DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);

        if (!MasterPort::sendTimingReq(packet)) {
            // re-stall on the first failure
            stallPort();
            DPRINTF(GPUPort, ": LDS send failed again\n");
            break;
        } else {
            DPRINTF(GPUPort, ": LDS send successful\n");
            retries.pop();
        }
    }
}