#include "debug/GPUDisp.hh"
#include "debug/GPUExec.hh"
#include "debug/GPUFetch.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUPort.hh"
#include "debug/GPUPrefetch.hh"
#include "debug/GPUSync.hh"
#include "debug/GPUTLB.hh"

ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), fetchStage(p),
    scoreboardCheckStage(p), scheduleStage(p), execStage(p),
    globalMemoryPipe(p), localMemoryPipe(p), rrNextMemID(0), rrNextALUWp(0),
    cu_id(p->cu_id), vrf(p->vector_register_file), numSIMDs(p->num_SIMDs),
    spBypassPipeLength(p->spbypass_pipe_length),
    dpBypassPipeLength(p->dpbypass_pipe_length),
    issuePeriod(p->issue_period),
    numGlbMemUnits(p->num_global_mem_pipes),
    numLocMemUnits(p->num_shared_mem_pipes),
    perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth),
    prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type),
    xact_cas_mode(p->xactCasMode), debugSegFault(p->debugSegFault),
    functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier),
    countPages(p->countPages), barrier_id(0),
    vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
    coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
    req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
    resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
    _masterId(p->system->getMasterId(this, "ComputeUnit")),
    lds(*p->localDataStore), _cacheLineSize(p->system->cacheLineSize()),
    globalSeqNum(0), wavefrontSize(p->wfSize)
    // ... (remaining member initializers elided)
{
    fatal_if(p->wfSize > std::numeric_limits<unsigned long long>::digits ||
             p->wfSize <= 0,
             "WF size is larger than the host can support");
    fatal_if(!isPowerOf2(wfSize()),
             "Wavefront size should be a power of 2");
    numCyclesPerStoreTransfer =
        (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
                       (double)vrfToCoalescerBusWidth);
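// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original source): the expression above
// sizes one wavefront's register transfer in bus cycles, i.e.
// ceil(bytes_per_wavefront / bus_width_bytes). Hypothetical standalone form:
#include <cmath>
#include <cstdint>

static uint32_t
cyclesPerTransfer(uint32_t wf_size, uint32_t bus_width_bytes)
{
    // e.g. 64 lanes * 4 bytes = 256 bytes; over a 32-byte bus this is
    // ceil(256 / 32) = 8 bus cycles.
    return (uint32_t)std::ceil((double)(wf_size * sizeof(uint32_t)) /
                               (double)bus_width_bytes);
}
// ---------------------------------------------------------------------------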
    for (int i = 0; i < p->n_wf; ++i) {
        for (int j = 0; j < numSIMDs; ++j) {
            wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
        }
    }
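// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original source): p->wavefronts is a
// flat array of numSIMDs * n_wf entries, and the loop above files slot i of
// SIMD j under index j * n_wf + i. Hypothetical standalone equivalent:
#include <vector>

static std::vector<std::vector<int>>
splitPerSimd(const std::vector<int> &flat, int num_simds, int n_wf)
{
    std::vector<std::vector<int>> per_simd(num_simds);
    for (int i = 0; i < n_wf; ++i)
        for (int j = 0; j < num_simds; ++j)
            per_simd[j].push_back(flat[j * n_wf + i]);
    return per_simd;
}
// ---------------------------------------------------------------------------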
    if (p->execPolicy == "OLDEST-FIRST") {
        // ... (select oldest-first wavefront scheduling)
    } else if (p->execPolicy == "ROUND-ROBIN") {
        // ... (select round-robin wavefront scheduling)
    } else {
        fatal("Invalid WF execution policy (CU)\n");
    }
    for (int i = 0; i < vrf.size(); ++i) {
        vrf[i]->setParent(this);
    }
    while (i < vecSize) {
        // ...
        vrf[regInfo.first]->markReg(regInfo.second, sizeof(uint32_t),
                                    /* ... */);
        // ...
    }

    // ...
    vrf[i]->updateEvents();
    static int _n_wave = 0;

    // ...
    w->initMask = init_mask.to_ullong();

    // ...
    DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
            /* ... */);

    // ...
    DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
            /* ... */);

    // ...
    gpuDynInst->useContinuation = false;
    uint32_t normSize = 0;

    // ...
    allocateRegion(vregDemand, &normSize);

    // ...
    int trueWgSizeTotal = 1;

    for (int d = 0; d < 3; ++d) {
        // ...
        trueWgSizeTotal *= trueWgSize[d];
        DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
    }

    DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
    bool vregAvail = true;
    int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
    // ...
    int numMappedWfs = 0;
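// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original source): numWfs above is the
// ceiling of work-items over the wavefront size (cf. divCeil).
static int
wavesForWorkItems(int total_work_items, int wf_size)
{
    return (total_work_items + wf_size - 1) / wf_size; // e.g. (100+63)/64 = 2
}
// ---------------------------------------------------------------------------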
    if (numMappedWfs < numWfs) {
        // ...
    }

    // ...
    if (freeWfSlots >= numWfs) {
        // ...
        vregAvail = vrf[j]->manager->canAllocate(numWfsPerSimd[j],
                                                 /* ... */);
        // ...
    }

    DPRINTF(GPUDisp, "Free WF slots = %d, VGPR Availability = %d\n",
            freeWfSlots, vregAvail);
    DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id);

    for (int i_simd = 0; i_simd < numSIMDs; ++i_simd) {
        for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
            // ...
            DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf);
            DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n",
                    /* ... */);
            DPRINTF(GPUSync, "wf->barrier_cnt %d, bcnt = %d\n",
                    /* ... */);
            // ...
            DPRINTF(GPUSync, "WF[%d][%d] at barrier, increment ccnt to "
                    "%d\n", i_simd, i_wf, ccnt);
        }
    }

    DPRINTF(GPUSync, "CU%d: returning allAtBarrier ccnt = %d, bslots = %d\n",
            cu_id, ccnt, bslots);

    return ccnt == bslots;
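// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original source): AllAtBarrier above
// counts wavefronts whose barrier id and barrier count match the arguments,
// and reports completion once that count reaches bslots. Hedged standalone
// shape with a hypothetical WfState record:
#include <vector>

struct WfState { int barrier_id; int barrier_cnt; };

static bool
allAtBarrierSketch(const std::vector<WfState> &wfs, int id, int bcnt,
                   int bslots)
{
    int ccnt = 0;
    for (const auto &wf : wfs)
        if (wf.barrier_id == id && wf.barrier_cnt == bcnt)
            ++ccnt;
    return ccnt == bslots;
}
// ---------------------------------------------------------------------------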
    if (!curWaveIDQueue.empty()) {
        for (auto it : curWaveIDQueue) {
            // ...
            if (cur_wave.simdId == simdId &&
                /* ... */) {
                // ...
            }
        }
    }

    // ...
    fatal_if(/* ... */,
             "No support for multiple Global Memory Pipelines exists!!!");

    // ...
    fatal_if(/* ... */,
             "No support for multiple Local Memory Pipelines exists!!!");

    // ...
    readyList.resize(numSIMDs + numGlbMemUnits + numLocMemUnits);
    if (pkt->req->isKernel() && pkt->req->isRelease()) {
        Wavefront *w =
            computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];

        // ...
        DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG id completed %d\n",
                /* ... */);

        // ...
        computeUnit->shader->dispatcher->notifyWgCompl(w);

        // ...
        DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrier_cnt = %d\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                /* ... */);
        if (gpuDynInst->useContinuation) {
            assert(!gpuDynInst->isNoScope());
            gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
                                         gpuDynInst);
        }
        // ...
    } else if (pkt->req->isKernel() && pkt->req->isAcquire()) {
        if (gpuDynInst->useContinuation) {
            assert(!gpuDynInst->isNoScope());
            gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
                                         gpuDynInst);
        }
    EventFunctionWrapper *mem_resp_event =
        computeUnit->memPort[index]->createMemRespEvent(pkt);

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x received!\n",
            computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
            index, pkt->req->getPaddr());

    computeUnit->schedule(mem_resp_event,
                          curTick() + computeUnit->resp_tick_latency);
    int len = retries.size();

    // ...
    for (int i = 0; i < len; ++i) {
        // ...
        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
                computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                pkt->req->getPaddr());

        if (!sendTimingReq(pkt)) {
            DPRINTF(GPUMem, "failed again!\n");
            break;
        } else {
            DPRINTF(GPUMem, "successful!\n");
            retries.pop_front();
        }
    }
    computeUnit->fetchStage.processFetchReturn(pkt);

    // ...
    int len = retries.size();

    // ...
    for (int i = 0; i < len; ++i) {
        // ...
        DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
                computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
                pkt->req->getPaddr());

        if (!sendTimingReq(pkt)) {
            DPRINTF(GPUFetch, "failed again!\n");
            break;
        } else {
            DPRINTF(GPUFetch, "successful!\n");
            retries.pop_front();
        }
    }
    Addr tmp_vaddr = pkt->req->getVaddr();

    // ...
    pkt->req->setPC(gpuDynInst->wavefront()->pc());

    pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());

    // ...
    } else if (pkt->isRead()) {
        // ...
    } else {
        fatal("pkt is not a read nor a write\n");
    }

    // ...
    unsigned size = pkt->getSize();

    if ((vaddr + size - 1) % 64 < vaddr % 64) {
        panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
              cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
    }
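// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original source): the modulo
// comparison above flags an access whose last byte lands in a different
// 64-byte block than its first byte; when the access wraps past a block
// boundary, the end offset is smaller than the start offset.
#include <cstdint>

static bool
crosses64ByteBlock(uint64_t vaddr, unsigned size)
{
    // e.g. vaddr 0x3c, size 8: start offset 60, end offset 3 -> crossing
    return (vaddr + size - 1) % 64 < vaddr % 64;
}
// ---------------------------------------------------------------------------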
    // ...
    panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
          cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
          vaddr);
        TheISA::GpuTLB::TranslationState *translation_state =
            new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc,
                                                 false, /* ... */);

        // ...
        tlbPort[tlbPort_index]->sendFunctional(pkt);

        // ...
        int hit_level = translation_state->hitLevel;
        assert(hit_level != -1);

        // ...
        delete sender_state->saved;

        // ...
        assert(pkt->req->hasPaddr());
        assert(pkt->req->hasSize());

        uint8_t *tmpData = pkt->getPtr<uint8_t>();
        // ...
        pkt = new Packet(oldPkt->req, oldPkt->cmd);

        // ...
        gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
        gpuDynInst->tlbHitLevel[index] = hit_level;

        // ...
        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
                "scheduled\n", cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
    } else if (tlbPort[tlbPort_index]->isStalled()) {
        assert(tlbPort[tlbPort_index]->retries.size() > 0);

        DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
                "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                tmp_vaddr);

        tlbPort[tlbPort_index]->retries.push_back(pkt);
    } else if (!tlbPort[tlbPort_index]->sendTimingReq(pkt)) {
        // ...
        tlbPort[tlbPort_index]->stallPort();

        DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
                "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                tmp_vaddr);

        tlbPort[tlbPort_index]->retries.push_back(pkt);
    } else {
        DPRINTF(GPUTLB,
                "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
                cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
    }
    gpuDynInst->statusBitVector &= (~(1ll << index));

    // ...
    pkt->senderState =
        new TheISA::GpuTLB::TranslationState(TLB_mode, /* ... */);

    tlbPort[tlbPort_index]->sendFunctional(pkt);
    memPort[0]->sendFunctional(new_pkt);

    DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
            gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
            new_pkt->req->getPaddr());

    // ...
    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    // ...
    delete sender_state->tlbEntry;
    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
            cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
            pkt->req->getPaddr());
    assert(gpuDynInst->isGlobalSeg());

    // ...
    req = std::make_shared<Request>(
        0, 0, 0, 0, masterId(), 0, gpuDynInst->wfDynId);

    // ...
    gpuDynInst->setRequestFlags(req, kernelLaunch);

    // ...
    assert(req->isAcquire() || req->isRelease());
    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
            compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
            /* ... */);

    Addr paddr = pkt->req->getPaddr();

    // ...
    int index = gpuDynInst->memStatusVector[paddr].back();

    DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
            paddr, index);

    gpuDynInst->memStatusVector[paddr].pop_back();
    gpuDynInst->pAddr = pkt->req->getPaddr();
    gpuDynInst->statusBitVector &= (~(1ULL << index));

    // ...
    assert(gpuDynInst->statusVector[index] > 0);
    gpuDynInst->statusVector[index]--;

    if (!gpuDynInst->statusVector[index])
        gpuDynInst->statusBitVector &= (~(1ULL << index));
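// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original source): statusVector keeps a
// per-port count of outstanding responses, and statusBitVector mirrors it
// with one bit per port that is cleared only when the count drains to zero.
// Hypothetical standalone shape:
#include <cstdint>
#include <vector>

struct OutstandingTracker {
    uint64_t bits = 0;
    std::vector<int> counts = std::vector<int>(64, 0);

    void issue(int index)
    {
        counts[index]++;
        bits |= (1ULL << index);
    }

    void retire(int index)
    {
        if (--counts[index] == 0)
            bits &= ~(1ULL << index); // last response for this port index
    }

    bool allDone() const { return bits == 0; }
};
// ---------------------------------------------------------------------------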
    DPRINTF(GPUMem, "bitvector is now %#x\n",
            gpuDynInst->statusBitVector);

    if (gpuDynInst->statusBitVector == VectorMask(0)) {
        auto iter = gpuDynInst->memStatusVector.begin();
        auto end = gpuDynInst->memStatusVector.end();

        while (iter != end) {
            assert(iter->second.empty());
            // ...
        }

        gpuDynInst->memStatusVector.clear();
        // ...
        gpuDynInst->statusVector.clear();

        // ...
        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
                compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId);
        if (gpuDynInst->useContinuation) {
            assert(!gpuDynInst->isNoScope());
            gpuDynInst->execContinuation(
                gpuDynInst->staticInstruction(),
                gpuDynInst);
        }

        // ...
        if (gpuDynInst->useContinuation) {
            assert(!gpuDynInst->isNoScope());
            gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
                                         gpuDynInst);
        }
ComputeUnit*
ComputeUnitParams::create()
{
    return new ComputeUnit(this);
}
    Addr line = pkt->req->getPaddr();

    DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
            pkt->req->getVaddr(), line);

    // ...
    computeUnit->tlbCycles += curTick();

    // ...
    TheISA::GpuTLB::TranslationState *translation_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    // ...
    if (!translation_state->tlbEntry) {
        Wavefront *w =
            computeUnit->wfList[sender_state->_gpuDynInst->simdId]
                               [sender_state->_gpuDynInst->wfSlotId];

        DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
                 pkt->req->getVaddr());
    }

    // ...
    int hit_level = translation_state->hitLevel;
    computeUnit->hitsPerTLBLevel[hit_level]++;

    // ...
    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());

    // ...
    delete translation_state;

    // ...
    gpuDynInst->memStatusVector[line].push_back(mp_index);
    gpuDynInst->tlbHitLevel[mp_index] = hit_level;
        panic("unsupported response to request conversion %s\n",
              pkt->cmd.toString());

    // ...
    if (computeUnit->prefetchDepth) {
        int simdId = gpuDynInst->simdId;
        int wfSlotId = gpuDynInst->wfSlotId;
        Addr last = 0;

        switch (computeUnit->prefetchType) {
          case Enums::PF_CU:
            last = computeUnit->lastVaddrCU[mp_index];
            break;
          case Enums::PF_PHASE:
            last = computeUnit->lastVaddrSimd[simdId][mp_index];
            break;
          case Enums::PF_WF:
            last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
          default:
            break;
        }

        DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
                computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
        DPRINTF(GPUPrefetch, "Stride is %d\n", stride);

        computeUnit->lastVaddrCU[mp_index] = vaddr;
        computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
        computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;

        stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
            computeUnit->prefetchStride : stride;

        DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
                computeUnit->cu_id, simdId, wfSlotId, mp_index);

        DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);
        for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
            DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
                    /* ... */);

            // ...
            RequestPtr prefetch_req = std::make_shared<Request>(
                    /* ... */
                    computeUnit->masterId(),
                    /* ... */);

            // ...
            prefetch_pkt->dataStatic(&foo);

            // ...
            prefetch_pkt->senderState =
                new TheISA::GpuTLB::TranslationState(TLB_mode,
                        computeUnit->shader->gpuTc, /* ... */);

            // ...
            sendFunctional(prefetch_pkt);

            // ...
            TheISA::GpuTLB::TranslationState *tlb_state =
                safe_cast<TheISA::GpuTLB::TranslationState*>(
                        prefetch_pkt->senderState);

            // ...
            delete tlb_state->tlbEntry;
            // ...
            delete prefetch_pkt;
        }
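// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original source): the loop above
// issues prefetchDepth translations, stepping `stride` pages per iteration
// from the page that just missed. Hypothetical address generator, assuming
// 4 KiB pages:
#include <cstdint>
#include <vector>

static std::vector<uint64_t>
prefetchAddrs(uint64_t vaddr, int stride, int depth)
{
    const int64_t page_bytes = 4096;
    std::vector<uint64_t> out;
    for (int pf = 1; pf <= depth; ++pf)
        out.push_back(vaddr + (int64_t)stride * pf * page_bytes);
    // e.g. vaddr 0x10000, stride 1, depth 2 -> 0x11000, 0x12000
    return out;
}
// ---------------------------------------------------------------------------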
    EventFunctionWrapper *mem_req_event =
        computeUnit->memPort[mp_index]->createMemReqEvent(new_pkt);

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
            computeUnit->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());

    computeUnit->schedule(mem_req_event, curTick() +
                          computeUnit->req_tick_latency);
EventFunctionWrapper*
ComputeUnit::DataPort::createMemReqEvent(PacketPtr pkt)
{
    return new EventFunctionWrapper(
        [this, pkt]{ processMemReqEvent(pkt); },
        "ComputeUnit memory request event", true);
}

EventFunctionWrapper*
ComputeUnit::DataPort::createMemRespEvent(PacketPtr pkt)
{
    return new EventFunctionWrapper(
        [this, pkt]{ processMemRespEvent(pkt); },
        "ComputeUnit memory response event", true);
}
    if (!(sendTimingReq(pkt))) {
        retries.push_back(std::make_pair(pkt, gpuDynInst));

        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
                compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, index,
                pkt->req->getPaddr());
    } else {
        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: index %d, addr %#x data req sent!\n",
                compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, index,
                pkt->req->getPaddr());
    }
    int len = retries.size();

    DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, len);

    // ...
    assert(isStalled());

    // ...
    for (int i = 0; i < len; ++i) {
        // ...
        DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x",
                computeUnit->cu_id, vaddr);

        if (!sendTimingReq(pkt)) {
            // ...
            DPRINTF(GPUTLB, ": failed again\n");
            break;
        } else {
            DPRINTF(GPUTLB, ": successful\n");
            retries.pop_front();
        }
    }
    DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
            computeUnit->cu_id, pkt->req->getVaddr(), line);

    // ...
    TheISA::GpuTLB::TranslationState *translation_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    bool success = translation_state->tlbEntry != nullptr;
    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());
    // ...
    delete translation_state;
    computeUnit->fetchStage.fetch(pkt, wavefront);

    if (wavefront->dropFetch) {
        assert(wavefront->instructionBuffer.empty());
        wavefront->dropFetch = false;
    }

    // ...
    wavefront->pendingFetch = 0;
    int len = retries.size();
    DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, len);

    // ...
    assert(isStalled());

    // ...
    for (int i = 0; i < len; ++i) {
        // ...
        DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x",
                computeUnit->cu_id, vaddr);

        if (!sendTimingReq(pkt)) {
            // ...
            DPRINTF(GPUTLB, ": failed again\n");
            break;
        } else {
            DPRINTF(GPUTLB, ": successful\n");
            retries.pop_front();
        }
    }
void
ComputeUnit::regStats()
{
    vALUInsts
        .name(name() + ".valu_insts")
        .desc("Number of vector ALU insts issued.")
        ;
    vALUInstsPerWF
        .name(name() + ".valu_insts_per_wf")
        .desc("The avg. number of vector ALU insts issued per-wavefront.")
        ;
    sALUInsts
        .name(name() + ".salu_insts")
        .desc("Number of scalar ALU insts issued.")
        ;
    sALUInstsPerWF
        .name(name() + ".salu_insts_per_wf")
        .desc("The avg. number of scalar ALU insts issued per-wavefront.")
        ;
    instCyclesVALU
        .name(name() + ".inst_cycles_valu")
        .desc("Number of cycles needed to execute VALU insts.")
        ;
    instCyclesSALU
        .name(name() + ".inst_cycles_salu")
        .desc("Number of cycles needed to execute SALU insts.")
        ;
    threadCyclesVALU
        .name(name() + ".thread_cycles_valu")
        .desc("Number of thread cycles used to execute vector ALU ops. "
              "Similar to instCyclesVALU but multiplied by the number of "
              "active threads.")
        ;
    vALUUtilization
        .name(name() + ".valu_utilization")
        .desc("Percentage of active vector ALU threads in a wave.")
        ;
    ldsNoFlatInsts
        .name(name() + ".lds_no_flat_insts")
        .desc("Number of LDS insts issued, not including FLAT "
              "accesses that resolve to LDS.")
        ;
    ldsNoFlatInstsPerWF
        .name(name() + ".lds_no_flat_insts_per_wf")
        .desc("The avg. number of LDS insts (not including FLAT "
              "accesses that resolve to LDS) per-wavefront.")
        ;
    flatVMemInsts
        .name(name() + ".flat_vmem_insts")
        .desc("The number of FLAT insts that resolve to vmem issued.")
        ;
    flatVMemInstsPerWF
        .name(name() + ".flat_vmem_insts_per_wf")
        .desc("The average number of FLAT insts that resolve to vmem "
              "issued per-wavefront.")
        ;
    flatLDSInsts
        .name(name() + ".flat_lds_insts")
        .desc("The number of FLAT insts that resolve to LDS issued.")
        ;
    flatLDSInstsPerWF
        .name(name() + ".flat_lds_insts_per_wf")
        .desc("The average number of FLAT insts that resolve to LDS "
              "issued per-wavefront.")
        ;
    vectorMemWrites
        .name(name() + ".vector_mem_writes")
        .desc("Number of vector mem write insts (excluding FLAT insts).")
        ;
    vectorMemWritesPerWF
        .name(name() + ".vector_mem_writes_per_wf")
        .desc("The average number of vector mem write insts "
              "(excluding FLAT insts) per-wavefront.")
        ;
    vectorMemReads
        .name(name() + ".vector_mem_reads")
        .desc("Number of vector mem read insts (excluding FLAT insts).")
        ;
    vectorMemReadsPerWF
        .name(name() + ".vector_mem_reads_per_wf")
        .desc("The avg. number of vector mem read insts (excluding "
              "FLAT insts) per-wavefront.")
        ;
    scalarMemWrites
        .name(name() + ".scalar_mem_writes")
        .desc("Number of scalar mem write insts.")
        ;
    scalarMemWritesPerWF
        .name(name() + ".scalar_mem_writes_per_wf")
        .desc("The average number of scalar mem write insts per-wavefront.")
        ;
    scalarMemReads
        .name(name() + ".scalar_mem_reads")
        .desc("Number of scalar mem read insts.")
        ;
    scalarMemReadsPerWF
        .name(name() + ".scalar_mem_reads_per_wf")
        .desc("The average number of scalar mem read insts per-wavefront.")
        ;

    // ...
    tlbCycles
        .name(name() + ".tlb_cycles")
        .desc("total number of cycles for all uncoalesced requests")
        ;
    tlbRequests
        .name(name() + ".tlb_requests")
        .desc("number of uncoalesced requests")
        ;
    tlbLatency
        .name(name() + ".avg_translation_latency")
        .desc("Avg. translation latency for data translations")
        ;
    hitsPerTLBLevel
        .name(name() + ".TLB_hits_distribution")
        .desc("TLB hits distribution (0 for page table, x for Lx-TLB)")
        ;

    // fixed number of TLB levels
    for (int i = 0; i < 4; ++i) {
        if (!i)
            hitsPerTLBLevel.subname(i, "page_table");
        else
            hitsPerTLBLevel.subname(i, csprintf("L%d_TLB", i));
    }

    execRateDist
        .name(name() + ".inst_exec_rate")
        .desc("Instruction Execution Rate: Number of executed vector "
              "instructions per cycle")
        ;
    ldsBankConflictDist
        .name(name() + ".lds_bank_conflicts")
        .desc("Number of bank conflicts per LDS memory packet")
        ;
    ldsBankAccesses
        .name(name() + ".lds_bank_access_cnt")
        .desc("Total number of LDS bank accesses")
        ;
    pageDivergenceDist
        .name(name() + ".page_divergence_dist")
        .desc("pages touched per wf (over all mem. instr.)")
        ;
    controlFlowDivergenceDist
        .name(name() + ".warp_execution_dist")
        .desc("number of lanes active per instruction (over all "
              "instructions)")
        ;
    activeLanesPerGMemInstrDist
        .name(name() + ".gmem_lanes_execution_dist")
        .desc("number of active lanes per global memory instruction")
        ;
    activeLanesPerLMemInstrDist
        .name(name() + ".lmem_lanes_execution_dist")
        .desc("number of active lanes per local memory instruction")
        ;
    numInstrExecuted
        .name(name() + ".num_instr_executed")
        .desc("number of instructions executed")
        ;
    numVecOpsExecuted
        .name(name() + ".num_vec_ops_executed")
        .desc("number of vec ops executed (e.g. WF size/inst)")
        ;
    totalCycles
        .name(name() + ".num_total_cycles")
        .desc("number of cycles the CU ran for")
        ;
    ipc
        .name(name() + ".ipc")
        .desc("Instructions per cycle (this CU only)")
        ;
    vpc
        .name(name() + ".vpc")
        .desc("Vector Operations per cycle (this CU only)")
        ;
    numALUInstsExecuted
        .name(name() + ".num_alu_insts_executed")
        .desc("Number of dynamic non-GM memory insts executed")
        ;
    wgBlockedDueLdsAllocation
        .name(name() + ".wg_blocked_due_lds_alloc")
        .desc("Workgroup blocked due to LDS capacity")
        ;
    numTimesWgBlockedDueVgprAlloc
        .name(name() + ".times_wg_blocked_due_vgpr_alloc")
        .desc("Number of times WGs are blocked due to VGPR allocation per "
              "SIMD")
        ;
    dynamicGMemInstrCnt
        .name(name() + ".global_mem_instr_cnt")
        .desc("dynamic global memory instructions count")
        ;
    dynamicLMemInstrCnt
        .name(name() + ".local_mem_instr_cnt")
        .desc("dynamic local memory instruction count")
        ;
    completedWfs
        .name(name() + ".num_completed_wfs")
        .desc("number of completed wavefronts")
        ;
    numCASOps
        .name(name() + ".num_CAS_ops")
        .desc("number of compare and swap operations")
        ;
    numFailedCASOps
        .name(name() + ".num_failed_CAS_ops")
        .desc("number of compare and swap operations that failed")
        ;

    // ...
}
void
ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
{
    if (gpuDynInst->isScalar()) {
        if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
            sALUInsts++;
            instCyclesSALU++;
        } else if (gpuDynInst->isLoad()) {
            scalarMemReads++;
        } else if (gpuDynInst->isStore()) {
            scalarMemWrites++;
        }
    } else {
        if (gpuDynInst->isALU()) {
            vALUInsts++;
            instCyclesVALU++;
            // ...
        } else if (gpuDynInst->isFlat()) {
            if (gpuDynInst->isLocalMem()) {
                flatLDSInsts++;
            } else {
                flatVMemInsts++;
            }
        } else if (gpuDynInst->isLocalMem()) {
            ldsNoFlatInsts++;
        } else if (gpuDynInst->isLoad()) {
            vectorMemReads++;
        } else if (gpuDynInst->isStore()) {
            vectorMemWrites++;
        }
    }
}
    if (computeUnit->countPages) {
        std::ostream *page_stat_file =
            /* ... */;

        *page_stat_file << "page, wavefront accesses, workitem accesses"
                        << std::endl;

        for (auto iter : computeUnit->pageAccesses) {
            *page_stat_file << std::hex << iter.first << ",";
            *page_stat_file << std::dec << iter.second.first << ",";
            *page_stat_file << std::dec << iter.second.second << std::endl;
        }
    }
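// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original source): the dump above
// writes one CSV row per touched page, e.g. "ff7e9000,12,768" -- the page
// address in hex, then the wavefront and work-item access counts in decimal.
// Hypothetical standalone writer over the same map shape:
#include <cstdint>
#include <iostream>
#include <map>
#include <utility>

static void
dumpPageAccesses(std::ostream &os,
                 const std::map<uint64_t, std::pair<int, int>> &pages)
{
    os << "page, wavefront accesses, workitem accesses" << std::endl;
    for (const auto &p : pages) {
        os << std::hex << p.first << ",";
        os << std::dec << p.second.first << ",";
        os << std::dec << p.second.second << std::endl;
    }
}
// ---------------------------------------------------------------------------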
    bool glbMemBusRdy = true;
    // ...
    bool locMemBusRdy = true;
    for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
        // ...
    }

    // ...
    RequestPtr newRequest = std::make_shared<Request>();
    newRequest->setPaddr(0x0);
    fatal_if(!senderState, "did not get the right sort of sender state");

    // ...
    computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);

    // ...
    fatal_if(!sender_state, "packet without a valid sender state");

    // ...
    fatal_if(retries.empty(), "must have retries waiting to be stalled");
    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
            computeUnit->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId);

    // ...
        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, pkt->req->getPaddr());
    // ...
        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, pkt->req->getPaddr());
    auto queueSize = retries.size();

    DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, queueSize);

    fatal_if(queueSize < 1,
             "why was there a recvReqRetry() with no pending reqs?");
    fatal_if(!isStalled(),
             "recvReqRetry() happened when the port was not stalled");
    while (!retries.empty()) {
        PacketPtr pkt = retries.front();

        DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);

        if (!sendTimingReq(pkt)) {
            // ...
            DPRINTF(GPUPort, ": LDS send failed again\n");
            break;
        } else {
            DPRINTF(GPUPort, ": LDS send successful\n");
            retries.pop_front();
        }
    }