39 #include "debug/GPUDisp.hh" 40 #include "debug/GPUExec.hh" 41 #include "debug/GPUFetch.hh" 42 #include "debug/GPUMem.hh" 43 #include "debug/GPUPort.hh" 44 #include "debug/GPUPrefetch.hh" 45 #include "debug/GPUSync.hh" 46 #include "debug/GPUTLB.hh" 59 scoreboardCheckStage(p), scheduleStage(p), execStage(p),
60 globalMemoryPipe(p), localMemoryPipe(p), rrNextMemID(0), rrNextALUWp(0),
61 cu_id(p->cu_id), vrf(p->vector_register_file), numSIMDs(p->num_SIMDs),
62 spBypassPipeLength(p->spbypass_pipe_length),
63 dpBypassPipeLength(p->dpbypass_pipe_length),
64 issuePeriod(p->issue_period),
65 numGlbMemUnits(p->num_global_mem_pipes),
66 numLocMemUnits(p->num_shared_mem_pipes),
67 perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth),
68 prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type),
69 xact_cas_mode(p->xactCasMode), debugSegFault(p->debugSegFault),
70 functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier),
71 countPages(p->countPages), barrier_id(0),
72 vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
73 coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
74 req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
75 resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
76 _masterId(p->
system->getMasterId(this,
"ComputeUnit")),
77 lds(*p->localDataStore), gmTokenPort(
name() +
".gmTokenPort", this),
78 _cacheLineSize(p->
system->cacheLineSize()), globalSeqNum(0),
90 fatal_if(p->wfSize > std::numeric_limits<unsigned long long>::digits ||
92 "WF size is larger than the host can support");
94 "Wavefront size should be a power of 2");
98 (uint32_t)ceil((
double)(
wfSize() *
sizeof(uint32_t)) /
110 for (
int i = 0;
i < p->n_wf; ++
i) {
113 wfList[
j].push_back(p->wavefronts[
j * p->n_wf +
i]);
132 if (p->execPolicy ==
"OLDEST-FIRST") {
134 }
else if (p->execPolicy ==
"ROUND-ROBIN") {
137 fatal(
"Invalid WF execution policy (CU)\n");
156 for (
int i = 0;
i <
vrf.size(); ++
i) {
157 vrf[
i]->setParent(
this);
207 while (i < vecSize) {
210 vrf[regInfo.first]->markReg(regInfo.second,
sizeof(uint32_t),
223 vrf[
i]->updateEvents();
232 static int _n_wave = 0;
244 w->
initMask = init_mask.to_ullong();
287 int32_t refCount M5_VAR_USED =
289 DPRINTF(GPUDisp,
"CU%d: increase ref ctr wg[%d] to [%d]\n",
306 DPRINTF(GPUDisp,
"Scheduling wfDynId/barrier_id %d/%d on CU%d: " 328 gpuDynInst->useContinuation =
false;
351 uint32_t normSize = 0;
354 allocateRegion(vregDemand, &normSize);
371 int trueWgSizeTotal = 1;
373 for (
int d = 0;
d < 3; ++
d) {
377 trueWgSizeTotal *= trueWgSize[
d];
378 DPRINTF(GPUDisp,
"trueWgSize[%d] = %d\n", d, trueWgSize[d]);
381 DPRINTF(GPUDisp,
"trueWgSizeTotal = %d\n", trueWgSizeTotal);
386 bool vregAvail =
true;
387 int numWfs = (trueWgSizeTotal +
wfSize() - 1) /
wfSize();
392 int numMappedWfs = 0;
401 if (numMappedWfs < numWfs) {
412 if (freeWfSlots >= numWfs) {
417 vregAvail =
vrf[
j]->manager->canAllocate(numWfsPerSimd[
j],
429 DPRINTF(GPUDisp,
"Free WF slots = %d, VGPR Availability = %d\n",
430 freeWfSlots, vregAvail);
451 DPRINTF(GPUSync,
"CU%d: Checking for All At Barrier\n",
cu_id);
454 for (
int i_simd = 0; i_simd <
numSIMDs; ++i_simd) {
455 for (
int i_wf = 0; i_wf <
shader->
n_wf; ++i_wf) {
459 DPRINTF(GPUSync,
"Checking WF[%d][%d]\n", i_simd, i_wf);
461 DPRINTF(GPUSync,
"wf->barrier_id = %d, _barrier_id = %d\n",
464 DPRINTF(GPUSync,
"wf->barrier_cnt %d, bcnt = %d\n",
473 DPRINTF(GPUSync,
"WF[%d][%d] at barrier, increment ccnt to " 474 "%d\n", i_simd, i_wf, ccnt);
479 DPRINTF(GPUSync,
"CU%d: returning allAtBarrier ccnt = %d, bslots = %d\n",
480 cu_id, ccnt, bslots);
482 return ccnt == bslots;
504 if (!curWaveIDQueue.empty()) {
505 for (
auto it : curWaveIDQueue) {
508 if (cur_wave.
simdId == simdId &&
559 "No support for multiple Global Memory Pipelines exists!!!");
567 "No support for multiple Local Memory Pipelines exists!!!");
595 readyList.resize(numSIMDs + numGlbMemUnits + numLocMemUnits);
634 if (pkt->
req->isKernel() && pkt->
req->isRelease()) {
636 computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
640 DPRINTF(GPUDisp,
"CU%d: WF[%d][%d][wv=%d]: WG id completed %d\n",
644 computeUnit->shader->dispatcher->notifyWgCompl(w);
650 DPRINTF(GPUSync,
"CU%d: WF[%d][%d]: barrier_cnt = %d\n",
651 computeUnit->cu_id, gpuDynInst->simdId,
654 if (gpuDynInst->useContinuation) {
655 assert(!gpuDynInst->isNoScope());
656 gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
663 }
else if (pkt->
req->isKernel() && pkt->
req->isAcquire()) {
664 if (gpuDynInst->useContinuation) {
665 assert(!gpuDynInst->isNoScope());
666 gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
676 computeUnit->memPort[
index]->createMemRespEvent(pkt);
678 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: index %d, addr %#x received!\n",
679 computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
680 index, pkt->
req->getPaddr());
682 computeUnit->schedule(mem_resp_event,
683 curTick() + computeUnit->resp_tick_latency);
690 int len = retries.size();
694 for (
int i = 0;
i <
len; ++
i) {
696 GPUDynInstPtr gpuDynInst M5_VAR_USED = retries.front().second;
697 DPRINTF(GPUMem,
"CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
698 computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
699 pkt->
req->getPaddr());
704 if (!sendTimingReq(pkt)) {
705 DPRINTF(GPUMem,
"failed again!\n");
708 DPRINTF(GPUMem,
"successful!\n");
717 computeUnit->fetchStage.processFetchReturn(pkt);
725 int len = retries.size();
729 for (
int i = 0;
i <
len; ++
i) {
731 Wavefront *wavefront M5_VAR_USED = retries.front().second;
732 DPRINTF(GPUFetch,
"CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
733 computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
734 pkt->
req->getPaddr());
735 if (!sendTimingReq(pkt)) {
736 DPRINTF(GPUFetch,
"failed again!\n");
739 DPRINTF(GPUFetch,
"successful!\n");
749 Addr tmp_vaddr = pkt->
req->getVaddr();
754 pkt->
req->setPC(gpuDynInst->wavefront()->pc());
756 pkt->
req->setReqInstSeqNum(gpuDynInst->seqNum());
766 }
else if (pkt->
isRead()) {
769 fatal(
"pkt is not a read nor a write\n");
781 unsigned size = pkt->
getSize();
783 if ((vaddr + size - 1) % 64 < vaddr % 64) {
784 panic(
"CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
785 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
792 panic(
"CU%d: WF[%d][%d]: Fault on addr %#x!\n",
793 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
803 TheISA::GpuTLB::TranslationState *translation_state =
804 new TheISA::GpuTLB::TranslationState(TLB_mode,
shader->
gpuTc,
false,
810 tlbPort[tlbPort_index]->sendFunctional(pkt);
813 int hit_level = translation_state->hitLevel;
814 assert(hit_level != -1);
822 delete sender_state->
saved;
825 assert(pkt->
req->hasPaddr());
826 assert(pkt->
req->hasSize());
828 uint8_t *tmpData = pkt->
getPtr<uint8_t>();
836 pkt =
new Packet(oldPkt->req, oldPkt->cmd);
845 gpuDynInst->memStatusVector[pkt->
getAddr()].push_back(index);
846 gpuDynInst->tlbHitLevel[
index] = hit_level;
854 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: index %d, addr %#x data " 855 "scheduled\n",
cu_id, gpuDynInst->simdId,
856 gpuDynInst->wfSlotId, index, pkt->
req->getPaddr());
859 }
else if (
tlbPort[tlbPort_index]->isStalled()) {
860 assert(
tlbPort[tlbPort_index]->retries.size() > 0);
862 DPRINTF(GPUTLB,
"CU%d: WF[%d][%d]: Translation for addr %#x " 863 "failed!\n",
cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
866 tlbPort[tlbPort_index]->retries.push_back(pkt);
867 }
else if (!
tlbPort[tlbPort_index]->sendTimingReq(pkt)) {
872 tlbPort[tlbPort_index]->stallPort();
874 DPRINTF(GPUTLB,
"CU%d: WF[%d][%d]: Translation for addr %#x " 875 "failed!\n",
cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
878 tlbPort[tlbPort_index]->retries.push_back(pkt);
881 "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
882 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
888 gpuDynInst->statusBitVector &= (~(1ll <<
index));
895 pkt->
senderState =
new TheISA::GpuTLB::TranslationState(TLB_mode,
898 tlbPort[tlbPort_index]->sendFunctional(pkt);
908 memPort[0]->sendFunctional(new_pkt);
910 DPRINTF(GPUMem,
"CU%d: WF[%d][%d]: index %d: addr %#x\n",
cu_id,
911 gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
912 new_pkt->
req->getPaddr());
915 TheISA::GpuTLB::TranslationState *sender_state =
918 delete sender_state->tlbEntry;
936 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
937 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
938 pkt->
req->getPaddr());
947 assert(gpuDynInst->isGlobalSeg());
950 req = std::make_shared<Request>(
951 0, 0, 0,
masterId(), 0, gpuDynInst->wfDynId);
962 gpuDynInst->setRequestFlags(req, kernelLaunch);
965 assert(req->isAcquire() || req->isRelease());
989 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
990 compute_unit->
cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
993 Addr paddr = pkt->
req->getPaddr();
996 int index = gpuDynInst->memStatusVector[paddr].back();
998 DPRINTF(GPUMem,
"Response for addr %#x, index %d\n",
1001 gpuDynInst->memStatusVector[paddr].pop_back();
1002 gpuDynInst->pAddr = pkt->
req->getPaddr();
1007 gpuDynInst->statusBitVector &= (~(1
ULL <<
index));
1009 assert(gpuDynInst->statusVector[index] > 0);
1010 gpuDynInst->statusVector[
index]--;
1012 if (!gpuDynInst->statusVector[index])
1013 gpuDynInst->statusBitVector &= (~(1
ULL <<
index));
1016 DPRINTF(GPUMem,
"bitvector is now %#x\n",
1017 gpuDynInst->statusBitVector);
1019 if (gpuDynInst->statusBitVector ==
VectorMask(0)) {
1020 auto iter = gpuDynInst->memStatusVector.begin();
1021 auto end = gpuDynInst->memStatusVector.end();
1023 while (iter != end) {
1024 assert(iter->second.empty());
1028 gpuDynInst->memStatusVector.clear();
1031 gpuDynInst->statusVector.clear();
1035 DPRINTF(GPUMem,
"CU%d: WF[%d][%d]: packet totally complete\n",
1036 compute_unit->
cu_id, gpuDynInst->simdId,
1037 gpuDynInst->wfSlotId);
1043 if (gpuDynInst->useContinuation) {
1044 assert(!gpuDynInst->isNoScope());
1045 gpuDynInst->execContinuation(
1046 gpuDynInst->staticInstruction(),
1054 if (gpuDynInst->useContinuation) {
1055 assert(!gpuDynInst->isNoScope());
1056 gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
1066 ComputeUnitParams::create()
1074 Addr line = pkt->
req->getPaddr();
1076 DPRINTF(GPUTLB,
"CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
1077 pkt->
req->getVaddr(), line);
1080 computeUnit->tlbCycles +=
curTick();
1083 TheISA::GpuTLB::TranslationState *translation_state =
1087 if (!translation_state->tlbEntry) {
1092 computeUnit->wfList[sender_state->
_gpuDynInst->simdId]
1095 DPRINTFN(
"Wave %d couldn't tranlate vaddr %#x\n",
w->wfDynId,
1096 pkt->
req->getVaddr());
1100 int hit_level = translation_state->hitLevel;
1101 computeUnit->hitsPerTLBLevel[hit_level]++;
1103 delete translation_state->tlbEntry;
1104 assert(!translation_state->ports.size());
1110 delete translation_state;
1119 gpuDynInst->memStatusVector[line].push_back(mp_index);
1120 gpuDynInst->tlbHitLevel[mp_index] = hit_level;
1131 panic(
"unsupported response to request conversion %s\n",
1135 if (computeUnit->prefetchDepth) {
1136 int simdId = gpuDynInst->simdId;
1137 int wfSlotId = gpuDynInst->wfSlotId;
1140 switch(computeUnit->prefetchType) {
1142 last = computeUnit->lastVaddrCU[mp_index];
1144 case Enums::PF_PHASE:
1145 last = computeUnit->lastVaddrSimd[simdId][mp_index];
1148 last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
1153 DPRINTF(GPUPrefetch,
"CU[%d][%d][%d][%d]: %#x was last\n",
1154 computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
1160 DPRINTF(GPUPrefetch,
"Stride is %d\n", stride);
1162 computeUnit->lastVaddrCU[mp_index] =
vaddr;
1163 computeUnit->lastVaddrSimd[simdId][mp_index] =
vaddr;
1164 computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] =
vaddr;
1166 stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
1167 computeUnit->prefetchStride: stride;
1169 DPRINTF(GPUPrefetch,
"%#x to: CU[%d][%d][%d][%d]\n", vaddr,
1170 computeUnit->cu_id, simdId, wfSlotId, mp_index);
1172 DPRINTF(GPUPrefetch,
"Prefetching from %#x:", vaddr);
1175 for (
int pf = 1;
pf <= computeUnit->prefetchDepth; ++
pf) {
1176 DPRINTF(GPUPrefetch,
"%d * %d: %#x\n",
pf, stride,
1182 RequestPtr prefetch_req = std::make_shared<Request>(
1185 computeUnit->masterId(),
1190 prefetch_pkt->dataStatic(&foo);
1193 prefetch_pkt->senderState =
1194 new TheISA::GpuTLB::TranslationState(TLB_mode,
1195 computeUnit->shader->gpuTc,
1199 sendFunctional(prefetch_pkt);
1202 TheISA::GpuTLB::TranslationState *tlb_state =
1203 safe_cast<TheISA::GpuTLB::TranslationState*>(
1204 prefetch_pkt->senderState);
1207 delete tlb_state->tlbEntry;
1209 delete prefetch_pkt;
1228 computeUnit->memPort[mp_index]->createMemReqEvent(new_pkt);
1230 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
1231 computeUnit->cu_id, gpuDynInst->simdId,
1232 gpuDynInst->wfSlotId, mp_index, new_pkt->
req->getPaddr());
1234 computeUnit->schedule(mem_req_event,
curTick() +
1235 computeUnit->req_tick_latency);
1244 [
this, pkt]{ processMemReqEvent(pkt); },
1245 "ComputeUnit memory request event",
true);
1252 [
this, pkt]{ processMemRespEvent(pkt); },
1253 "ComputeUnit memory response event",
true);
1261 ComputeUnit *compute_unit M5_VAR_USED = computeUnit;
1263 if (!(sendTimingReq(pkt))) {
1264 retries.push_back(std::make_pair(pkt, gpuDynInst));
1267 "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
1268 compute_unit->cu_id, gpuDynInst->simdId,
1269 gpuDynInst->wfSlotId,
index,
1270 pkt->
req->getPaddr());
1273 "CU%d: WF[%d][%d]: index %d, addr %#x data req sent!\n",
1274 compute_unit->cu_id, gpuDynInst->simdId,
1275 gpuDynInst->wfSlotId,
index,
1276 pkt->
req->getPaddr());
1289 int len = retries.size();
1291 DPRINTF(GPUTLB,
"CU%d: DTLB recvReqRetry - %d pending requests\n",
1292 computeUnit->cu_id, len);
1295 assert(isStalled());
1300 for (
int i = 0;
i <
len; ++
i) {
1303 DPRINTF(GPUTLB,
"CU%d: retrying D-translaton for address%#x",
vaddr);
1305 if (!sendTimingReq(pkt)) {
1308 DPRINTF(GPUTLB,
": failed again\n");
1311 DPRINTF(GPUTLB,
": successful\n");
1312 retries.pop_front();
1320 Addr line M5_VAR_USED = pkt->
req->getPaddr();
1321 DPRINTF(GPUTLB,
"CU%d: ITLBPort received %#x->%#x\n",
1322 computeUnit->cu_id, pkt->
req->getVaddr(), line);
1327 TheISA::GpuTLB::TranslationState *translation_state =
1330 bool success = translation_state->tlbEntry !=
nullptr;
1331 delete translation_state->tlbEntry;
1332 assert(!translation_state->ports.size());
1334 delete translation_state;
1351 computeUnit->fetchStage.fetch(pkt, wavefront);
1353 if (wavefront->dropFetch) {
1354 assert(wavefront->instructionBuffer.empty());
1355 wavefront->dropFetch =
false;
1358 wavefront->pendingFetch = 0;
1374 int len = retries.size();
1375 DPRINTF(GPUTLB,
"CU%d: ITLB recvReqRetry - %d pending requests\n", len);
1378 assert(isStalled());
1384 for (
int i = 0;
i <
len; ++
i) {
1387 DPRINTF(GPUTLB,
"CU%d: retrying I-translaton for address%#x",
vaddr);
1389 if (!sendTimingReq(pkt)) {
1391 DPRINTF(GPUTLB,
": failed again\n");
1394 DPRINTF(GPUTLB,
": successful\n");
1395 retries.pop_front();
1407 .
desc(
"Number of vector ALU insts issued.")
1410 .
name(
name() +
".valu_insts_per_wf")
1411 .
desc(
"The avg. number of vector ALU insts issued per-wavefront.")
1415 .
desc(
"Number of scalar ALU insts issued.")
1418 .
name(
name() +
".salu_insts_per_wf")
1419 .
desc(
"The avg. number of scalar ALU insts issued per-wavefront.")
1422 .
name(
name() +
".inst_cycles_valu")
1423 .
desc(
"Number of cycles needed to execute VALU insts.")
1426 .
name(
name() +
".inst_cycles_salu")
1427 .
desc(
"Number of cycles needed to execute SALU insts.")
1430 .
name(
name() +
".thread_cycles_valu")
1431 .
desc(
"Number of thread cycles used to execute vector ALU ops. " 1432 "Similar to instCyclesVALU but multiplied by the number of " 1436 .
name(
name() +
".valu_utilization")
1437 .
desc(
"Percentage of active vector ALU threads in a wave.")
1440 .
name(
name() +
".lds_no_flat_insts")
1441 .
desc(
"Number of LDS insts issued, not including FLAT " 1442 "accesses that resolve to LDS.")
1445 .
name(
name() +
".lds_no_flat_insts_per_wf")
1446 .
desc(
"The avg. number of LDS insts (not including FLAT " 1447 "accesses that resolve to LDS) per-wavefront.")
1451 .
desc(
"The number of FLAT insts that resolve to vmem issued.")
1454 .
name(
name() +
".flat_vmem_insts_per_wf")
1455 .
desc(
"The average number of FLAT insts that resolve to vmem " 1456 "issued per-wavefront.")
1460 .
desc(
"The number of FLAT insts that resolve to LDS issued.")
1463 .
name(
name() +
".flat_lds_insts_per_wf")
1464 .
desc(
"The average number of FLAT insts that resolve to LDS " 1465 "issued per-wavefront.")
1468 .
name(
name() +
".vector_mem_writes")
1469 .
desc(
"Number of vector mem write insts (excluding FLAT insts).")
1472 .
name(
name() +
".vector_mem_writes_per_wf")
1473 .
desc(
"The average number of vector mem write insts " 1474 "(excluding FLAT insts) per-wavefront.")
1477 .
name(
name() +
".vector_mem_reads")
1478 .
desc(
"Number of vector mem read insts (excluding FLAT insts).")
1481 .
name(
name() +
".vector_mem_reads_per_wf")
1482 .
desc(
"The avg. number of vector mem read insts (excluding " 1483 "FLAT insts) per-wavefront.")
1486 .
name(
name() +
".scalar_mem_writes")
1487 .
desc(
"Number of scalar mem write insts.")
1490 .
name(
name() +
".scalar_mem_writes_per_wf")
1491 .
desc(
"The average number of scalar mem write insts per-wavefront.")
1494 .
name(
name() +
".scalar_mem_reads")
1495 .
desc(
"Number of scalar mem read insts.")
1498 .
name(
name() +
".scalar_mem_reads_per_wf")
1499 .
desc(
"The average number of scalar mem read insts per-wavefront.")
1515 .
desc(
"total number of cycles for all uncoalesced requests")
1520 .
desc(
"number of uncoalesced requests")
1524 .
name(
name() +
".avg_translation_latency")
1525 .
desc(
"Avg. translation latency for data translations")
1532 .
name(
name() +
".TLB_hits_distribution")
1533 .
desc(
"TLB hits distribution (0 for page table, x for Lx-TLB")
1537 for (
int i = 0;
i < 4; ++
i) {
1547 .
desc(
"Instruction Execution Rate: Number of executed vector " 1548 "instructions per cycle")
1553 .
name(
name() +
".lds_bank_conflicts")
1554 .
desc(
"Number of bank conflicts per LDS memory packet")
1558 .
name(
name() +
".lds_bank_access_cnt")
1559 .
desc(
"Total number of LDS bank accesses")
1567 .
name(
name() +
".page_divergence_dist")
1568 .
desc(
"pages touched per wf (over all mem. instr.)")
1573 .
name(
name() +
".warp_execution_dist")
1574 .
desc(
"number of lanes active per instruction (oval all instructions)")
1579 .
name(
name() +
".gmem_lanes_execution_dist")
1580 .
desc(
"number of active lanes per global memory instruction")
1585 .
name(
name() +
".lmem_lanes_execution_dist")
1586 .
desc(
"number of active lanes per local memory instruction")
1590 .
name(
name() +
".num_instr_executed")
1591 .
desc(
"number of instructions executed")
1595 .
name(
name() +
".num_vec_ops_executed")
1596 .
desc(
"number of vec ops executed (e.g. WF size/inst)")
1600 .
name(
name() +
".num_total_cycles")
1601 .
desc(
"number of cycles the CU ran for")
1606 .
desc(
"Instructions per cycle (this CU only)")
1611 .
desc(
"Vector Operations per cycle (this CU only)")
1615 .
name(
name() +
".num_alu_insts_executed")
1616 .
desc(
"Number of dynamic non-GM memory insts executed")
1620 .
name(
name() +
".wg_blocked_due_lds_alloc")
1621 .
desc(
"Workgroup blocked due to LDS capacity")
1628 .
name(
name() +
".times_wg_blocked_due_vgpr_alloc")
1629 .
desc(
"Number of times WGs are blocked due to VGPR allocation per SIMD")
1633 .
name(
name() +
".global_mem_instr_cnt")
1634 .
desc(
"dynamic global memory instructions count")
1638 .
name(
name() +
".local_mem_instr_cnt")
1639 .
desc(
"dynamic local memory intruction count")
1646 .
name(
name() +
".num_completed_wfs")
1647 .
desc(
"number of completed wavefronts")
1652 .
desc(
"number of compare and swap operations")
1656 .
name(
name() +
".num_failed_CAS_ops")
1657 .
desc(
"number of compare and swap operations that failed")
1674 if (gpuDynInst->isScalar()) {
1675 if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
1678 }
else if (gpuDynInst->isLoad()) {
1680 }
else if (gpuDynInst->isStore()) {
1684 if (gpuDynInst->isALU()) {
1688 }
else if (gpuDynInst->isFlat()) {
1689 if (gpuDynInst->isLocalMem()) {
1694 }
else if (gpuDynInst->isLocalMem()) {
1696 }
else if (gpuDynInst->isLoad()) {
1698 }
else if (gpuDynInst->isStore()) {
1718 if (computeUnit->countPages) {
1719 std::ostream *page_stat_file =
1722 *page_stat_file <<
"page, wavefront accesses, workitem accesses" <<
1725 for (
auto iter : computeUnit->pageAccesses) {
1726 *page_stat_file << std::hex << iter.first <<
",";
1727 *page_stat_file << std::dec << iter.second.first <<
",";
1728 *page_stat_file << std::dec << iter.second.second << std::endl;
1742 bool glbMemBusRdy =
true;
1746 bool locMemBusRdy =
true;
1785 for (
int i_wf = 0; i_wf <
shader->
n_wf; ++i_wf){
1804 RequestPtr newRequest = std::make_shared<Request>();
1805 newRequest->setPaddr(0x0);
1825 fatal_if(!senderState,
"did not get the right sort of sender state");
1832 computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
1846 fatal_if(!sender_state,
"packet without a valid sender state");
1851 fatal_if(retries.empty(),
"must have retries waiting to be stalled");
1855 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: LDS send failed!\n",
1856 computeUnit->cu_id, gpuDynInst->simdId,
1857 gpuDynInst->wfSlotId);
1865 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
1866 computeUnit->cu_id, gpuDynInst->simdId,
1867 gpuDynInst->wfSlotId, pkt->
req->getPaddr());
1870 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
1871 computeUnit->cu_id, gpuDynInst->simdId,
1872 gpuDynInst->wfSlotId, pkt->
req->getPaddr());
1886 auto queueSize = retries.size();
1888 DPRINTF(GPUPort,
"CU%d: LDSPort recvReqRetry - %d pending requests\n",
1889 computeUnit->cu_id, queueSize);
1892 "why was there a recvReqRetry() with no pending reqs?");
1894 "recvReqRetry() happened when the port was not stalled");
1898 while (!retries.empty()) {
1901 DPRINTF(GPUPort,
"CU%d: retrying LDS send\n", computeUnit->cu_id);
1906 DPRINTF(GPUPort,
": LDS send failed again\n");
1909 DPRINTF(GPUTLB,
": LDS send successful\n");
uint32_t numVecRegsPerSimd
#define panic(...)
This implements a cprintf based panic() function.
void updatePageDivergenceDist(Addr addr)
Stats::Formula tlbLatency
RubyTester::SenderState SenderState
GPUDynInstPtr _gpuDynInst
void processMemReqEvent(PacketPtr pkt)
bool isGMReqFIFOWrRdy(uint32_t pendReqs=0) const
Stats::Scalar flatLDSInsts
std::vector< bool > vectorAluInstAvail
void handleResponse(GPUDynInstPtr gpuDynInst)
this method handles responses sent to this GM pipeline by the CU.
Derived & subname(off_type index, const std::string &name)
Set the subfield name for the given index, and marks this stat to print at the end of simulation...
const Regs::Info & regInfo(Addr daddr)
void injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch=true, RequestPtr req=nullptr)
std::true_type foo(void(*)(ThreadContext *, const Ret &ret, State &state))
#define fatal(...)
This implements a cprintf based fatal() function.
uint32_t numCyclesPerLoadTransfer
const std::string & name()
static const int MAX_REGS_FOR_NON_VEC_MEM_INST
std::map< unsigned, waveQueue > xactCasLoadMap
void init(ComputeUnit *cu)
std::vector< std::vector< std::pair< Wavefront *, WAVE_STATUS > > > waveStatusList
void init(ComputeUnit *cu)
OutputStream * create(const std::string &name, bool binary=false, bool no_gz=false)
Creates a file in this directory (optionally compressed).
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
TLB TranslationState: this is currently somewhat of a bastardization of the usage of SenderState...
void fillKernelState(Wavefront *w, NDRange *ndr)
EventFunctionWrapper * createMemRespEvent(PacketPtr pkt)
Stats::Vector hitsPerTLBLevel
Stats::Scalar dynamicGMemInstrCnt
ScheduleStage scheduleStage
Stats::Formula flatLDSInstsPerWF
bool isGMStRespFIFOWrRdy() const
Stats::Distribution controlFlowDivergenceDist
const std::string & toString() const
Return the string to a cmd given by idx.
Bitfield< 21, 20 > stride
std::vector< std::vector< Wavefront * > > readyList
std::shared_ptr< Request > RequestPtr
GPUDynInstPtr _gpuDynInst
Stats::Scalar vectorMemWrites
std::bitset< std::numeric_limits< unsigned long long >::digits > VectorMask
virtual Process * getProcessPtr()=0
virtual void init() override
init() is called after all C++ SimObjects have been created and all ports are connected.
bool isGMLdRespFIFOWrRdy() const
CUExitCallback * cuExitCallback
void pushToReconvergenceStack(uint32_t pc, uint32_t rpc, const VectorMask &exec_mask)
bool sendTimingReq(PacketPtr pkt)
Attempt to send a timing request to the slave port by calling its corresponding receive function...
void init(ComputeUnit *cu)
std::vector< DTLBPort * > tlbPort
std::vector< std::vector< Wavefront * > > wfList
this represents a slice of the overall LDS, intended to be associated with an individual workgroup ...
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
Stats::Scalar dynamicLMemInstrCnt
SenderState is information carried along with the packet throughout the TLB hierarchy.
Stats::Formula numALUInstsExecuted
T * getPtr()
get a pointer to the data ptr.
void setTokenManager(TokenManager *_tokenManager)
Specify a token manager, which will handle tracking of tokens for a TokenMasterPort/SlaveMasterPort pa...
GPUStaticInst * kernelLaunchInst
Stats::Scalar numInstrExecuted
Derived & init(size_type size)
Set this vector to have the given size.
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
Stats::Distribution ldsBankConflictDist
SenderState is information carried along with the packet throughout the TLB hierarchy.
std::vector< WaitClass > vrfToLocalMemPipeBus
Stats::Formula vectorMemWritesPerWF
Stats::Scalar wgBlockedDueLdsAllocation
TokenManager * memPortTokens
std::vector< std::vector< std::vector< Addr > > > lastVaddrWF
RequestPtr req
A pointer to the original request.
std::vector< WaitClass > aluPipe
uint32_t numCyclesPerStoreTransfer
void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, NDRange *ndr)
ComputeUnit(const Params *p)
GlobalMemPipeline globalMemoryPipe
uint32_t coalescerToVrfBusWidth
Stats::Formula vALUUtilization
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Stats::Distribution activeLanesPerLMemInstrDist
Stats::Formula scalarMemWritesPerWF
Stats::Scalar numTimesWgBlockedDueVgprAlloc
std::vector< uint32_t > workItemId[3]
Tick curTick()
The current simulated tick.
std::deque< GPUDynInstPtr > instructionBuffer
Stats::Distribution execRateDist
Stats::Formula vectorMemReadsPerWF
void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
std::vector< std::pair< uint32_t, uint32_t > > regIdxVec
std::string csprintf(const char *format, const Args &...args)
bool translate(Addr vaddr, Addr &paddr)
Translate function.
std::vector< uint32_t > workItemFlatId
Stats::Distribution pageDivergenceDist
virtual bool recvTimingResp(PacketPtr pkt)
get the result of packets sent to the LDS when they return
Stats::Scalar tlbRequests
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
The ClockedObject class extends the SimObject with a clock and accessor functions to relate ticks to ...
bool canReserve(uint32_t x_size) const
can this much space be reserved for a workgroup?
std::vector< WaitClass > vrfToGlobalMemPipeBus
void updateInstStats(GPUDynInstPtr gpuDynInst)
std::vector< int > barCnt
Stats::Scalar flatVMemInsts
GPUDynInstPtr getMemInst() const
The request should be marked with KERNEL.
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
std::vector< DataPort * > memPort
The memory port for SIMD data accesses.
void registerExitCallback(Callback *callback)
Register an exit callback.
std::vector< std::vector< Addr > > lastVaddrSimd
bool isPowerOf2(const T &n)
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
uint32_t vrfToCoalescerBusWidth
int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots)
void schedule(Event &event, Tick when)
void computeActualWgSz(NDRange *ndr)
T roundDown(const T &val, const U &align)
This function is used to align addresses in memory.
void StartWorkgroup(NDRange *ndr)
Distribution & init(Counter min, Counter max, Counter bkt)
Set the parameters of this distribution.
Stats::Formula sALUInstsPerWF
void init(ComputeUnit *cu)
int increaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
use the dynamic wave id to create or just increase the reference count
Stats::Scalar scalarMemWrites
Stats::Scalar scalarMemReads
uint64_t Addr
Address type. This will probably be moved somewhere else in the near future.
Packet::SenderState * saved
#define ULL(N)
uint64_t constant
Stats::Scalar ldsNoFlatInsts
std::vector< std::pair< Wavefront *, DISPATCH_STATUS > > dispatchList
A Packet is used to encapsulate a transfer between two objects in the memory system (e...
bool sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result))
Send a general request to the LDS; make sure to look at the return value here, as your request might be...
bool cedeSIMD(int simdId, int wfSlotId)
uint8_t args[KER_ARGS_LENGTH]
Stats::Scalar instCyclesVALU
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
Stats::Scalar completedWfs
Stats::Formula scalarMemReadsPerWF
Derived & name(const std::string &name)
Set the name and marks this stat to print at the end of simulation.
Stats::Formula vALUInstsPerWF
virtual const std::string name() const
int32_t getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
EmulationPageTable * pTable
Stats::Distribution activeLanesPerGMemInstrDist
Declarations of a non-full system Page Table.
bool fixupFault(Addr vaddr)
Attempt to fix up a fault at vaddr by allocating a page on the stack.
void init(uint64_t *_tcnt, uint32_t _numStages=0)
SenderState is information carried along with the packet, esp.
bool isLMRespFIFOWrRdy() const
bool isLMReqFIFOWrRdy(uint32_t pendReqs=0) const
Stats::Scalar numVecOpsExecuted
std::vector< VectorRegisterFile * > vrf
SenderState * senderState
This packet's sender state.
uint32_t spillSizePerItem
MemCmd cmd
The command field of the packet.
void start(uint64_t _wfDynId, uint64_t _base_ptr)
void init(ComputeUnit *cu)
Tick ticks(int numCycles) const
LdsChunk * reserveSpace(const uint32_t dispatchId, const uint32_t wgId, const uint32_t size)
assign a parent and request this amount of space be set aside for this wgid
Stats::Scalar numFailedCASOps
T divCeil(const T &a, const U &b)
int ReadyWorkgroup(NDRange *ndr)
virtual bool sendTimingReq(PacketPtr pkt)
attempt to send this packet, either the port is already stalled, the request is nack'd and must stall...
void init(ComputeUnit *cu)
std::map< Addr, int > pagesTouched
Stats::Scalar instCyclesSALU
virtual void process()
virtual process function that is invoked when the callback queue is executed.
int getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
return the current reference count for this workgroup id
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
Stats::Formula flatVMemInstsPerWF
std::vector< uint8_t > statusVec
EventFunctionWrapper * createMemReqEvent(PacketPtr pkt)
Derived & desc(const std::string &_desc)
Set the description and marks this stat to print at the end of simulation.
std::vector< uint64_t > lastExecCycle
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
std::vector< WaitClass > wfWait
LocalMemPipeline localMemoryPipe
virtual void regStats()
Callback to set stat parameters.
int impl_kern_boundary_sync
Stats::Scalar ldsBankAccesses
Stats::Scalar totalCycles
LDSPort * ldsPort
The port to access the Local Data Store Can be connected to a LDS object.
std::vector< uint64_t > timestampVec
Stats::Scalar vectorMemReads
std::vector< Addr > lastVaddrCU
void regStats() override
Callback to set stat parameters.
void setParent(ComputeUnit *x_parent)
set the parent and name based on the parent
Stats::Formula ldsNoFlatInstsPerWF
bool isSimdDone(uint32_t) const
uint64_t getAndIncSeqNum()
Stats::Scalar threadCyclesVALU
void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
void processMemRespEvent(PacketPtr pkt)
virtual void recvReqRetry()
The bus is telling the port that there is now space, so retrying stalled requests should work now; this...
std::vector< int > vectorRegsReserved
ProbePointArg< PacketInfo > Packet
Packet probe point.
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
ScoreboardCheckStage scoreboardCheckStage