39 #include "debug/GPUDisp.hh"
40 #include "debug/GPUExec.hh"
41 #include "debug/GPUFetch.hh"
42 #include "debug/GPUMem.hh"
43 #include "debug/GPUPort.hh"
44 #include "debug/GPUPrefetch.hh"
45 #include "debug/GPUReg.hh"
46 #include "debug/GPURename.hh"
47 #include "debug/GPUSync.hh"
48 #include "debug/GPUTLB.hh"
66 numVectorGlobalMemUnits(
p.num_global_mem_pipes),
67 numVectorSharedMemUnits(
p.num_shared_mem_pipes),
68 numScalarMemUnits(
p.num_scalar_mem_pipes),
69 numVectorALUs(
p.num_SIMDs),
70 numScalarALUs(
p.num_scalar_cores),
71 vrfToCoalescerBusWidth(
p.vrf_to_coalescer_bus_width),
72 coalescerToVrfBusWidth(
p.coalescer_to_vrf_bus_width),
73 registerManager(
p.register_manager),
75 scoreboardCheckStage(
p, *this, scoreboardCheckToSchedule),
76 scheduleStage(
p, *this, scoreboardCheckToSchedule, scheduleToExecute),
77 execStage(
p, *this, scheduleToExecute),
78 globalMemoryPipe(
p, *this),
79 localMemoryPipe(
p, *this),
80 scalarMemoryPipe(
p, *this),
81 tickEvent([this]{
exec(); },
"Compute unit tick event",
84 vrf(
p.vector_register_file), srf(
p.scalar_register_file),
85 simdWidth(
p.simd_width),
86 spBypassPipeLength(
p.spbypass_pipe_length),
87 dpBypassPipeLength(
p.dpbypass_pipe_length),
88 scalarPipeStages(
p.scalar_pipe_length),
89 operandNetworkLength(
p.operand_network_length),
90 issuePeriod(
p.issue_period),
91 vrf_gm_bus_latency(
p.vrf_gm_bus_latency),
92 srf_scm_bus_latency(
p.srf_scm_bus_latency),
93 vrf_lm_bus_latency(
p.vrf_lm_bus_latency),
94 perLaneTLB(
p.perLaneTLB), prefetchDepth(
p.prefetch_depth),
95 prefetchStride(
p.prefetch_stride), prefetchType(
p.prefetch_prev_type),
96 debugSegFault(
p.debugSegFault),
97 functionalTLB(
p.functionalTLB), localMemBarrier(
p.localMemBarrier),
98 countPages(
p.countPages),
99 req_tick_latency(
p.mem_req_latency *
p.clk_domain->clockPeriod()),
100 resp_tick_latency(
p.mem_resp_latency *
p.clk_domain->clockPeriod()),
101 _requestorId(
p.system->getRequestorId(
this,
"ComputeUnit")),
102 lds(*
p.localDataStore), gmTokenPort(
name() +
".gmTokenPort",
this),
108 _cacheLineSize(
p.system->cacheLineSize()),
109 _numBarrierSlots(
p.num_barrier_slots),
110 globalSeqNum(0), wavefrontSize(
p.wf_size),
111 scoreboardCheckToSchedule(
p),
112 scheduleToExecute(
p),
119 "Functional TLB not supported in full-system GPU simulation");
130 fatal_if(
p.wf_size > std::numeric_limits<unsigned long long>::digits ||
132 "WF size is larger than the host can support");
134 "Wavefront size should be a power of 2");
137 numCyclesPerStoreTransfer =
138 (uint32_t)ceil((
double)(wfSize() *
sizeof(uint32_t)) /
139 (double)vrfToCoalescerBusWidth);
141 numCyclesPerLoadTransfer = (wfSize() *
sizeof(uint32_t))
142 / coalescerToVrfBusWidth;
145 idleWfs =
p.n_wf * numVectorALUs;
146 lastVaddrWF.resize(numVectorALUs);
147 wfList.resize(numVectorALUs);
149 wfBarrierSlots.resize(
p.num_barrier_slots,
WFBarrier());
151 for (
int i = 0;
i <
p.num_barrier_slots; ++
i) {
152 freeBarrierIds.insert(
i);
155 for (
int j = 0;
j < numVectorALUs; ++
j) {
156 lastVaddrWF[
j].resize(
p.n_wf);
158 for (
int i = 0;
i <
p.n_wf; ++
i) {
159 lastVaddrWF[
j][
i].resize(wfSize());
161 wfList[
j].push_back(
p.wavefronts[
j *
p.n_wf +
i]);
162 wfList[
j][
i]->setParent(
this);
164 for (
int k = 0;
k < wfSize(); ++
k) {
165 lastVaddrWF[
j][
i][
k] = 0;
170 lastVaddrSimd.resize(numVectorALUs);
172 for (
int i = 0;
i < numVectorALUs; ++
i) {
173 lastVaddrSimd[
i].resize(wfSize(), 0);
176 lastVaddrCU.resize(wfSize());
180 if (
p.execPolicy ==
"OLDEST-FIRST") {
182 }
else if (
p.execPolicy ==
"ROUND-ROBIN") {
185 fatal(
"Invalid WF execution policy (CU)\n");
188 for (
int i = 0;
i <
p.port_memory_port_connection_count; ++
i) {
192 for (
int i = 0;
i <
p.port_translation_port_connection_count; ++
i) {
198 memPortTokens =
new TokenManager(
p.max_cu_tokens);
202 lastExecCycle.resize(numVectorALUs, 0);
204 for (
int i = 0;
i < vrf.size(); ++
i) {
205 vrf[
i]->setParent(
this);
207 for (
int i = 0;
i < srf.size(); ++
i) {
208 srf[
i]->setParent(
this);
210 numVecRegsPerSimd = vrf[0]->numRegs();
211 numScalarRegsPerSimd = srf[0]->numRegs();
213 registerManager->setParent(
this);
217 instExecPerSimd.resize(numVectorALUs, 0);
221 "Cache line size should be a power of two.");
222 cacheLineBits =
floorLog2(_cacheLineSize);
305 w->workGroupSz[0] = task->
wgSize(0);
306 w->workGroupSz[1] = task->
wgSize(1);
307 w->workGroupSz[2] = task->
wgSize(2);
308 w->wgSz =
w->workGroupSz[0] *
w->workGroupSz[1] *
w->workGroupSz[2];
312 w->computeActualWgSz(task);
319 static int _n_wave = 0;
325 if (
k + waveId *
wfSize() <
w->actualWgSzTotal)
329 w->execMask() = init_mask;
333 w->initMask = init_mask.to_ullong();
336 w->barrierId(bar_id);
338 assert(!
w->hasBarrier());
342 w->workItemId[0][
k] = (
k + waveId *
wfSize()) %
w->actualWgSz[0];
343 w->workItemId[1][
k] = ((
k + waveId *
wfSize()) /
w->actualWgSz[0]) %
345 w->workItemId[2][
k] = (
k + waveId *
wfSize()) /
346 (
w->actualWgSz[0] *
w->actualWgSz[1]);
348 w->workItemFlatId[
k] =
w->workItemId[2][
k] *
w->actualWgSz[0] *
349 w->actualWgSz[1] +
w->workItemId[1][
k] *
w->actualWgSz[0] +
356 w->workGroupId[0] =
w->wgId % task->
numWg(0);
357 w->workGroupId[1] = (
w->wgId / task->
numWg(0)) % task->
numWg(1);
358 w->workGroupId[2] =
w->wgId / (task->
numWg(0) * task->
numWg(1));
361 w->ldsChunk = ldsChunk;
363 [[maybe_unused]] int32_t refCount =
365 DPRINTF(GPUDisp,
"CU%d: increase ref ctr wg[%d] to [%d]\n",
366 cu_id,
w->wgId, refCount);
368 w->instructionBuffer.clear();
373 DPRINTF(GPUDisp,
"Scheduling wfDynId/barrier_id %d/%d on CU%d: "
374 "WF[%d][%d]. Ref cnt:%d\n", _n_wave,
w->barrierId(),
cu_id,
375 w->simdId,
w->wfSlotId, refCount);
377 w->initRegState(task,
w->actualWgSzTotal);
392 = std::make_shared<GPUDynInst>(
this,
nullptr,
396 gpuDynInst->kern_id = kernId;
398 req->setContext(gpuDynInst->wfDynId);
431 DPRINTF(GPUDisp,
"CU%d: Scheduling wakeup next cycle\n",
cu_id);
445 panic_if(!ldsChunk,
"was not able to reserve space for this WG");
459 if (num_wfs_in_wg > 1) {
466 assert(!wf_barrier.maxBarrierCnt());
467 assert(!wf_barrier.numAtBarrier());
468 wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);
470 DPRINTF(GPUSync,
"CU[%d] - Dispatching WG with barrier Id%d. "
471 "%d waves using this barrier.\n",
cu_id, barrier_id,
491 DPRINTF(GPURename,
"SIMD[%d] wfSlotId[%d] WF[%d] "
492 "vregDemand[%d] sregDemand[%d]\n",
i,
j,
w->wfDynId,
493 vregDemand, sregDemand);
508 "Instruction Buffer of WF%d can't be empty",
w->wgId);
517 "Instruction Buffer of WF%d can't be empty",
w->wgId);
520 auto it =
pipeMap.find(ii->seqNum());
530 int trueWgSizeTotal = 1;
536 trueWgSizeTotal *= trueWgSize[
d];
537 DPRINTF(GPUDisp,
"trueWgSize[%d] = %d\n",
d, trueWgSize[
d]);
540 DPRINTF(GPUDisp,
"trueWgSizeTotal = %d\n", trueWgSizeTotal);
543 int numWfs = (trueWgSizeTotal +
wfSize() - 1) /
wfSize();
544 num_wfs_in_wg = numWfs;
546 bool barrier_avail =
true;
549 barrier_avail =
false;
562 "WG with %d WFs and %d VGPRs per WI can not be allocated to CU "
563 "that has %d VGPRs\n",
566 "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU "
573 int numMappedWfs = 0;
585 if (numMappedWfs < numWfs &&
599 assert(numMappedWfs <= numWfs);
601 bool vregAvail =
true;
602 bool sregAvail =
true;
604 if (numMappedWfs < numWfs) {
620 DPRINTF(GPUDisp,
"Free WF slots = %d, Mapped WFs = %d, \
621 VGPR Availability = %d, SGPR Availability = %d\n",
622 freeWfSlots, numMappedWfs, vregAvail, sregAvail);
639 if (!barrier_avail) {
648 bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
649 && ldsAvail && barrier_avail;
657 return wf_barrier.numYetToReachBarrier();
664 return wf_barrier.allAtBarrier();
671 wf_barrier.incNumAtBarrier();
678 return wf_barrier.numAtBarrier();
685 return wf_barrier.maxBarrierCnt();
699 wf_barrier.decMaxBarrierCnt();
706 wf_barrier.release();
729 for (
auto &vecRegFile :
vrf) {
733 for (
auto &scRegFile :
srf) {
777 "No support for multiple Global Memory Pipelines exists!!!");
784 "No support for multiple Local Memory Pipelines exists!!!");
791 "No support for multiple Scalar Memory Pipelines exists!!!");
835 if (gpuDynInst->isKernelLaunch()) {
838 assert(pkt->
req->isKernel());
839 assert(pkt->
req->isInvL1());
854 && gpuDynInst->isEndOfKernel()) {
860 assert(pkt->
req->isKernel());
861 assert(pkt->
req->isGL2CacheFlush());
877 DPRINTF(GPUDisp,
"CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
878 computeUnit->cu_id,
w->simdId,
w->wfSlotId,
879 w->wfDynId,
w->wgId);
885 if (!pkt->
req->isKernel()) {
886 w = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
887 DPRINTF(GPUExec,
"MemSyncResp: WF[%d][%d] WV%d %s decrementing "
888 "outstanding reqs %d => %d\n", gpuDynInst->simdId,
889 gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
890 gpuDynInst->disassemble(),
w->outstandingReqs,
891 w->outstandingReqs - 1);
892 computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
901 computeUnit->memPort[
index].createMemRespEvent(pkt);
904 "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
905 computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
906 gpuDynInst->seqNum(),
index, pkt->
req->getPaddr());
908 computeUnit->schedule(mem_resp_event,
909 curTick() + computeUnit->resp_tick_latency);
917 return handleResponse(pkt);
923 assert(!pkt->
req->isKernel());
930 assert(gpuDynInst->numScalarReqs > 0);
932 gpuDynInst->numScalarReqs--;
942 if (!gpuDynInst->numScalarReqs) {
943 if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
944 computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
947 computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
961 for (
const auto &pkt : retries) {
962 if (!sendTimingReq(pkt)) {
973 int len = retries.size();
977 for (
int i = 0;
i <
len; ++
i) {
979 [[maybe_unused]]
GPUDynInstPtr gpuDynInst = retries.front().second;
980 DPRINTF(GPUMem,
"CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
981 computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
982 pkt->
req->getPaddr());
987 if (!sendTimingReq(pkt)) {
988 DPRINTF(GPUMem,
"failed again!\n");
991 DPRINTF(GPUMem,
"successful!\n");
1000 computeUnit->handleSQCReturn(pkt);
1014 int len = retries.size();
1018 for (
int i = 0;
i <
len; ++
i) {
1020 [[maybe_unused]]
Wavefront *wavefront = retries.front().second;
1021 DPRINTF(GPUFetch,
"CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
1023 pkt->
req->getPaddr());
1024 if (!sendTimingReq(pkt)) {
1025 DPRINTF(GPUFetch,
"failed again!\n");
1028 DPRINTF(GPUFetch,
"successful!\n");
1029 retries.pop_front();
1038 Addr tmp_vaddr = pkt->
req->getVaddr();
1043 pkt->
req->setPC(gpuDynInst->wavefront()->pc());
1045 pkt->
req->setReqInstSeqNum(gpuDynInst->seqNum());
1071 }
else if (pkt->
isRead()) {
1074 fatal(
"pkt is not a read nor a write\n");
1086 unsigned size = pkt->
getSize();
1089 panic(
"CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
1090 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
vaddr);
1095 if (!
p->pTable->translate(
vaddr, paddr)) {
1096 if (!
p->fixupFault(
vaddr)) {
1097 panic(
"CU%d: WF[%d][%d]: Fault on addr %#x!\n",
1098 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1115 tlbPort[tlbPort_index].sendFunctional(pkt);
1118 int hit_level = translation_state->
hitLevel;
1119 assert(hit_level != -1);
1124 safe_cast<GpuTranslationState*>(pkt->
senderState);
1127 delete sender_state->
saved;
1128 delete sender_state;
1130 assert(pkt->
req->hasPaddr());
1131 assert(pkt->
req->hasSize());
1141 uint8_t *tmpData = oldPkt->
getPtr<uint8_t>();
1152 gpuDynInst->memStatusVector[pkt->
getAddr()].push_back(
index);
1153 gpuDynInst->tlbHitLevel[
index] = hit_level;
1160 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: index %d, addr %#x data "
1161 "scheduled\n",
cu_id, gpuDynInst->simdId,
1162 gpuDynInst->wfSlotId,
index, pkt->
req->getPaddr());
1165 }
else if (
tlbPort[tlbPort_index].isStalled()) {
1166 assert(
tlbPort[tlbPort_index].retries.size() > 0);
1168 DPRINTF(GPUTLB,
"CU%d: WF[%d][%d]: Translation for addr %#x "
1169 "failed!\n",
cu_id, gpuDynInst->simdId,
1170 gpuDynInst->wfSlotId, tmp_vaddr);
1172 tlbPort[tlbPort_index].retries.push_back(pkt);
1173 }
else if (!
tlbPort[tlbPort_index].sendTimingReq(pkt)) {
1178 tlbPort[tlbPort_index].stallPort();
1180 DPRINTF(GPUTLB,
"CU%d: WF[%d][%d]: Translation for addr %#x "
1181 "failed!\n",
cu_id, gpuDynInst->simdId,
1182 gpuDynInst->wfSlotId, tmp_vaddr);
1184 tlbPort[tlbPort_index].retries.push_back(pkt);
1186 DPRINTF(GPUTLB,
"CU%d: WF[%d][%d]: Translation for addr %#x from "
1187 "instruction %s sent!\n",
cu_id, gpuDynInst->simdId,
1188 gpuDynInst->wfSlotId, tmp_vaddr,
1189 gpuDynInst->disassemble().c_str());
1193 gpuDynInst->resetEntireStatusVector();
1195 gpuDynInst->decrementStatusVector(
index);
1205 tlbPort[tlbPort_index].sendFunctional(pkt);
1215 memPort[0].sendFunctional(new_pkt);
1217 DPRINTF(GPUMem,
"Functional sendRequest\n");
1218 DPRINTF(GPUMem,
"CU%d: WF[%d][%d]: index %d: addr %#x\n",
cu_id,
1219 gpuDynInst->simdId, gpuDynInst->wfSlotId,
index,
1220 new_pkt->
req->getPaddr());
1224 safe_cast<GpuTranslationState*>(pkt->
senderState);
1254 DPRINTF(GPUTLB,
"sent scalar %s translation request for addr %#x\n",
1256 pkt->
req->getVaddr());
1265 assert(gpuDynInst->isGlobalSeg() ||
1266 gpuDynInst->executedAs() == enums::SC_GLOBAL);
1271 req = std::make_shared<Request>(
1282 if (kernelMemSync) {
1283 if (gpuDynInst->isKernelLaunch()) {
1285 req->setReqInstSeqNum(gpuDynInst->seqNum());
1292 memPort[0].createMemReqEvent(pkt);
1294 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1295 "an acquire\n",
cu_id, gpuDynInst->simdId,
1296 gpuDynInst->wfSlotId, 0, pkt->
req->getPaddr());
1303 assert(gpuDynInst->isEndOfKernel());
1306 req->setReqInstSeqNum(gpuDynInst->seqNum());
1313 memPort[0].createMemReqEvent(pkt);
1315 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
1316 "a release\n",
cu_id, gpuDynInst->simdId,
1317 gpuDynInst->wfSlotId, 0, pkt->
req->getPaddr());
1322 gpuDynInst->setRequestFlags(req);
1324 req->setReqInstSeqNum(gpuDynInst->seqNum());
1331 memPort[0].createMemReqEvent(pkt);
1334 "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
1335 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
1336 pkt->
req->getPaddr());
1346 safe_cast<DataPort::SenderState*>(pkt->
senderState);
1353 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
1354 compute_unit->
cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1355 pkt->
req->getPaddr(),
id);
1357 Addr paddr = pkt->
req->getPaddr();
1373 int index = gpuDynInst->memStatusVector[paddr].back();
1375 DPRINTF(GPUMem,
"Response for addr %#x, index %d\n",
1376 pkt->
req->getPaddr(),
id);
1378 gpuDynInst->memStatusVector[paddr].pop_back();
1379 gpuDynInst->pAddr = pkt->
req->getPaddr();
1381 gpuDynInst->decrementStatusVector(
index);
1382 DPRINTF(GPUMem,
"bitvector is now %s\n", gpuDynInst->printStatusVector());
1384 if (gpuDynInst->allLanesZero()) {
1385 auto iter = gpuDynInst->memStatusVector.begin();
1386 auto end = gpuDynInst->memStatusVector.end();
1388 while (iter != end) {
1389 assert(iter->second.empty());
1396 if (compute_unit->
headTailMap.count(gpuDynInst)) {
1402 gpuDynInst->memStatusVector.clear();
1408 DPRINTF(GPUMem,
"CU%d: WF[%d][%d]: packet totally complete\n",
1409 compute_unit->
cu_id, gpuDynInst->simdId,
1410 gpuDynInst->wfSlotId);
1413 if (!compute_unit->
headTailMap.count(gpuDynInst)) {
1415 .insert(std::make_pair(gpuDynInst,
curTick()));
1427 Addr line = pkt->
req->getPaddr();
1429 DPRINTF(GPUTLB,
"CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
1430 pkt->
req->getVaddr(), line);
1433 computeUnit->stats.tlbCycles +=
curTick();
1437 safe_cast<GpuTranslationState*>(pkt->
senderState);
1440 if (!translation_state->
tlbEntry) {
1442 safe_cast<DTLBPort::SenderState*>(translation_state->
saved);
1445 computeUnit->wfList[sender_state->
_gpuDynInst->simdId]
1448 DPRINTFN(
"Wave %d couldn't tranlate vaddr %#x\n",
w->wfDynId,
1449 pkt->
req->getVaddr());
1453 int hit_level = translation_state->
hitLevel;
1454 computeUnit->stats.hitsPerTLBLevel[hit_level]++;
1456 delete translation_state->
tlbEntry;
1457 assert(!translation_state->
ports.size());
1463 delete translation_state;
1467 safe_cast<DTLBPort::SenderState*>(pkt->
senderState);
1472 gpuDynInst->memStatusVector[line].push_back(mp_index);
1473 gpuDynInst->tlbHitLevel[mp_index] = hit_level;
1484 panic(
"unsupported response to request conversion %s\n",
1488 if (computeUnit->prefetchDepth) {
1489 int simdId = gpuDynInst->simdId;
1490 int wfSlotId = gpuDynInst->wfSlotId;
1493 switch(computeUnit->prefetchType) {
1495 last = computeUnit->lastVaddrCU[mp_index];
1497 case enums::PF_PHASE:
1498 last = computeUnit->lastVaddrSimd[simdId][mp_index];
1501 last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
1506 DPRINTF(GPUPrefetch,
"CU[%d][%d][%d][%d]: %#x was last\n",
1507 computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
1515 computeUnit->lastVaddrCU[mp_index] =
vaddr;
1516 computeUnit->lastVaddrSimd[simdId][mp_index] =
vaddr;
1517 computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] =
vaddr;
1519 stride = (computeUnit->prefetchType == enums::PF_STRIDE) ?
1520 computeUnit->prefetchStride:
stride;
1522 DPRINTF(GPUPrefetch,
"%#x to: CU[%d][%d][%d][%d]\n",
vaddr,
1523 computeUnit->cu_id, simdId, wfSlotId, mp_index);
1528 for (
int pf = 1;
pf <= computeUnit->prefetchDepth; ++
pf) {
1535 RequestPtr prefetch_req = std::make_shared<Request>(
1538 computeUnit->requestorId(),
1548 computeUnit->shader->gpuTc,
true);
1551 sendFunctional(prefetch_pkt);
1555 safe_cast<GpuTranslationState*>(
1561 delete prefetch_pkt;
1582 if (new_pkt->
req->systemReq()) {
1587 if (!gpuDynInst->isSystemReq()) {
1588 computeUnit->getTokenManager()->recvTokens(1);
1589 gpuDynInst->setSystemReq();
1592 new_pkt->
req->requestorId(computeUnit->vramRequestorId());
1598 computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);
1600 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
1601 computeUnit->cu_id, gpuDynInst->simdId,
1602 gpuDynInst->wfSlotId, mp_index, new_pkt->
req->getPaddr());
1604 computeUnit->schedule(mem_req_event,
curTick() +
1605 computeUnit->req_tick_latency);
1614 [
this, pkt]{ processMemReqEvent(pkt); },
1615 "ComputeUnit memory request event",
true);
1622 [
this, pkt]{ processMemRespEvent(pkt); },
1623 "ComputeUnit memory response event",
true);
1631 [[maybe_unused]]
ComputeUnit *compute_unit = computeUnit;
1633 if (pkt->
req->systemReq()) {
1637 }
else if (!(sendTimingReq(pkt))) {
1638 retries.push_back(std::make_pair(pkt, gpuDynInst));
1641 "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
1642 compute_unit->
cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1643 id, pkt->
req->getPaddr());
1646 "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data "
1647 "req sent!\n", compute_unit->
cu_id, gpuDynInst->simdId,
1648 gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
id,
1649 pkt->
req->getPaddr());
1656 return "ComputeUnit scalar memory request event";
1662 SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1666 if (pkt->req->systemReq()) {
1674 "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
1675 compute_unit->
cu_id, gpuDynInst->simdId,
1676 gpuDynInst->wfSlotId, pkt->req->getPaddr());
1679 "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data "
1680 "req sent!\n", compute_unit->
cu_id, gpuDynInst->simdId,
1681 gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
1682 pkt->req->getPaddr());
1695 int len = retries.size();
1697 DPRINTF(GPUTLB,
"CU%d: DTLB recvReqRetry - %d pending requests\n",
1698 computeUnit->cu_id,
len);
1701 assert(isStalled());
1706 for (
int i = 0;
i <
len; ++
i) {
1709 DPRINTF(GPUTLB,
"CU%d: retrying D-translaton for address%#x",
vaddr);
1711 if (!sendTimingReq(pkt)) {
1714 DPRINTF(GPUTLB,
": failed again\n");
1717 DPRINTF(GPUTLB,
": successful\n");
1718 retries.pop_front();
1729 safe_cast<GpuTranslationState*>(pkt->
senderState);
1733 "Translation of vaddr %#x failed\n", pkt->
req->getVaddr());
1735 delete translation_state->
tlbEntry;
1736 assert(!translation_state->
ports.size());
1739 delete translation_state;
1742 safe_cast<ScalarDTLBPort::SenderState*>(pkt->
senderState);
1747 [[maybe_unused]]
Wavefront *
w = gpuDynInst->wavefront();
1749 DPRINTF(GPUTLB,
"CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
1750 "translation: PA %#x -> %#x\n", computeUnit->cu_id,
w->simdId,
1751 w->wfSlotId,
w->kernId, pkt->
req->getVaddr(), pkt->
req->getPaddr());
1760 fatal(
"Scalar DTLB receieved unexpected MemCmd response %s\n",
1779 if (req_pkt->
req->systemReq()) {
1780 gpuDynInst->setSystemReq();
1782 req_pkt->
req->requestorId(computeUnit->vramRequestorId());
1787 (computeUnit->scalarDataPort, req_pkt);
1788 computeUnit->schedule(scalar_mem_req_event,
curTick() +
1789 computeUnit->req_tick_latency);
1797 [[maybe_unused]]
Addr line = pkt->
req->getPaddr();
1798 DPRINTF(GPUTLB,
"CU%d: ITLBPort received %#x->%#x\n",
1799 computeUnit->cu_id, pkt->
req->getVaddr(), line);
1805 = safe_cast<GpuTranslationState*>(pkt->
senderState);
1807 bool success = translation_state->
tlbEntry !=
nullptr;
1808 delete translation_state->
tlbEntry;
1809 assert(!translation_state->
ports.size());
1811 delete translation_state;
1815 safe_cast<ITLBPort::SenderState*>(pkt->
senderState);
1828 computeUnit->fetchStage.fetch(pkt, wavefront);
1851 int len = retries.size();
1852 DPRINTF(GPUTLB,
"CU%d: ITLB recvReqRetry - %d pending requests\n",
len);
1855 assert(isStalled());
1861 for (
int i = 0;
i <
len; ++
i) {
1864 DPRINTF(GPUTLB,
"CU%d: retrying I-translaton for address%#x",
vaddr);
1866 if (!sendTimingReq(pkt)) {
1868 DPRINTF(GPUTLB,
": failed again\n");
1871 DPRINTF(GPUTLB,
": successful\n");
1872 retries.pop_front();
1880 if (gpuDynInst->isScalar()) {
1881 if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
1884 }
else if (gpuDynInst->isLoad()) {
1886 }
else if (gpuDynInst->isStore()) {
1890 if (gpuDynInst->isALU()) {
1898 += gpuDynInst->wavefront()->execMask().count();
1899 }
else if (gpuDynInst->isFlat()) {
1900 if (gpuDynInst->isLocalMem()) {
1905 }
else if (gpuDynInst->isFlatGlobal()) {
1907 }
else if (gpuDynInst->isLocalMem()) {
1909 }
else if (gpuDynInst->isLoad()) {
1911 }
else if (gpuDynInst->isStore()) {
1915 if (gpuDynInst->isLoad()) {
1916 switch (gpuDynInst->executedAs()) {
1917 case enums::SC_SPILL:
1920 case enums::SC_GLOBAL:
1923 case enums::SC_GROUP:
1926 case enums::SC_PRIVATE:
1929 case enums::SC_READONLY:
1932 case enums::SC_KERNARG:
1945 fatal(
"%s has no valid segment\n", gpuDynInst->disassemble());
1948 }
else if (gpuDynInst->isStore()) {
1949 switch (gpuDynInst->executedAs()) {
1950 case enums::SC_SPILL:
1953 case enums::SC_GLOBAL:
1956 case enums::SC_GROUP:
1959 case enums::SC_PRIVATE:
1962 case enums::SC_READONLY:
1965 case enums::SC_KERNARG:
1978 fatal(
"%s has no valid segment\n", gpuDynInst->disassemble());
2002 *page_stat_file <<
"page, wavefront accesses, workitem accesses" <<
2006 *page_stat_file << std::hex << iter.first <<
",";
2007 *page_stat_file << std::dec << iter.second.first <<
",";
2008 *page_stat_file << std::dec << iter.second.second << std::endl;
2045 const uint32_t wgId)
const
2055 for (
int i_wf = 0; i_wf <
shader->
n_wf; ++i_wf){
2074 RequestPtr newRequest = std::make_shared<Request>();
2075 newRequest->setPaddr(0x0);
2104 fatal_if(!senderState,
"did not get the right sort of sender state");
2111 computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
2125 fatal_if(!sender_state,
"packet without a valid sender state");
2130 fatal_if(retries.empty(),
"must have retries waiting to be stalled");
2134 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: LDS send failed!\n",
2135 computeUnit->cu_id, gpuDynInst->simdId,
2136 gpuDynInst->wfSlotId);
2144 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
2145 computeUnit->cu_id, gpuDynInst->simdId,
2146 gpuDynInst->wfSlotId, pkt->
req->getPaddr());
2149 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
2150 computeUnit->cu_id, gpuDynInst->simdId,
2151 gpuDynInst->wfSlotId, pkt->
req->getPaddr());
2165 auto queueSize = retries.size();
2167 DPRINTF(GPUPort,
"CU%d: LDSPort recvReqRetry - %d pending requests\n",
2168 computeUnit->cu_id, queueSize);
2171 "why was there a recvReqRetry() with no pending reqs?");
2173 "recvReqRetry() happened when the port was not stalled");
2177 while (!retries.empty()) {
2180 DPRINTF(GPUPort,
"CU%d: retrying LDS send\n", computeUnit->cu_id);
2185 DPRINTF(GPUPort,
": LDS send failed again\n");
2188 DPRINTF(GPUTLB,
": LDS send successful\n");
2196 : statistics::
Group(parent),
2197 ADD_STAT(vALUInsts,
"Number of vector ALU insts issued."),
2198 ADD_STAT(vALUInstsPerWF,
"The avg. number of vector ALU insts issued "
2200 ADD_STAT(sALUInsts,
"Number of scalar ALU insts issued."),
2201 ADD_STAT(sALUInstsPerWF,
"The avg. number of scalar ALU insts issued "
2204 "Number of cycles needed to execute VALU insts."),
2206 "Number of cycles needed to execute SALU insts."),
2207 ADD_STAT(threadCyclesVALU,
"Number of thread cycles used to execute "
2208 "vector ALU ops. Similar to instCyclesVALU but multiplied by "
2209 "the number of active threads."),
2211 "Percentage of active vector ALU threads in a wave."),
2212 ADD_STAT(ldsNoFlatInsts,
"Number of LDS insts issued, not including FLAT"
2213 " accesses that resolve to LDS."),
2214 ADD_STAT(ldsNoFlatInstsPerWF,
"The avg. number of LDS insts (not "
2215 "including FLAT accesses that resolve to LDS) per-wavefront."),
2217 "The number of FLAT insts that resolve to vmem issued."),
2218 ADD_STAT(flatVMemInstsPerWF,
"The average number of FLAT insts that "
2219 "resolve to vmem issued per-wavefront."),
2221 "The number of FLAT insts that resolve to LDS issued."),
2222 ADD_STAT(flatLDSInstsPerWF,
"The average number of FLAT insts that "
2223 "resolve to LDS issued per-wavefront."),
2225 "Number of vector mem write insts (excluding FLAT insts)."),
2226 ADD_STAT(vectorMemWritesPerWF,
"The average number of vector mem write "
2227 "insts (excluding FLAT insts) per-wavefront."),
2229 "Number of vector mem read insts (excluding FLAT insts)."),
2230 ADD_STAT(vectorMemReadsPerWF,
"The avg. number of vector mem read insts "
2231 "(excluding FLAT insts) per-wavefront."),
2232 ADD_STAT(scalarMemWrites,
"Number of scalar mem write insts."),
2234 "The average number of scalar mem write insts per-wavefront."),
2235 ADD_STAT(scalarMemReads,
"Number of scalar mem read insts."),
2237 "The average number of scalar mem read insts per-wavefront."),
2238 ADD_STAT(vectorMemReadsPerKiloInst,
2239 "Number of vector mem reads per kilo-instruction"),
2240 ADD_STAT(vectorMemWritesPerKiloInst,
2241 "Number of vector mem writes per kilo-instruction"),
2242 ADD_STAT(vectorMemInstsPerKiloInst,
2243 "Number of vector mem insts per kilo-instruction"),
2244 ADD_STAT(scalarMemReadsPerKiloInst,
2245 "Number of scalar mem reads per kilo-instruction"),
2246 ADD_STAT(scalarMemWritesPerKiloInst,
2247 "Number of scalar mem writes per kilo-instruction"),
2248 ADD_STAT(scalarMemInstsPerKiloInst,
2249 "Number of scalar mem insts per kilo-instruction"),
2250 ADD_STAT(instCyclesVMemPerSimd,
"Number of cycles to send address, "
2251 "command, data from VRF to vector memory unit, per SIMD"),
2252 ADD_STAT(instCyclesScMemPerSimd,
"Number of cycles to send address, "
2253 "command, data from SRF to scalar memory unit, per SIMD"),
2254 ADD_STAT(instCyclesLdsPerSimd,
"Number of cycles to send address, "
2255 "command, data from VRF to LDS unit, per SIMD"),
2256 ADD_STAT(globalReads,
"Number of reads to the global segment"),
2257 ADD_STAT(globalWrites,
"Number of writes to the global segment"),
2259 "Number of memory instructions sent to the global segment"),
2260 ADD_STAT(argReads,
"Number of reads to the arg segment"),
2261 ADD_STAT(argWrites,
"NUmber of writes to the arg segment"),
2263 "Number of memory instructions sent to the arg segment"),
2264 ADD_STAT(spillReads,
"Number of reads to the spill segment"),
2265 ADD_STAT(spillWrites,
"Number of writes to the spill segment"),
2267 "Number of memory instructions sent to the spill segment"),
2268 ADD_STAT(groupReads,
"Number of reads to the group segment"),
2269 ADD_STAT(groupWrites,
"Number of writes to the group segment"),
2271 "Number of memory instructions sent to the group segment"),
2272 ADD_STAT(privReads,
"Number of reads to the private segment"),
2273 ADD_STAT(privWrites,
"Number of writes to the private segment"),
2275 "Number of memory instructions sent to the private segment"),
2276 ADD_STAT(readonlyReads,
"Number of reads to the readonly segment"),
2278 "Number of memory instructions sent to the readonly segment"),
2280 "Number of memory instructions sent to the readonly segment"),
2281 ADD_STAT(kernargReads,
"Number of reads sent to the kernarg segment"),
2283 "Number of memory instructions sent to the kernarg segment"),
2285 "Number of memory instructions sent to the kernarg segment"),
2287 "wave level parallelism: count of active waves at wave launch"),
2288 ADD_STAT(tlbRequests,
"number of uncoalesced requests"),
2290 "total number of cycles for all uncoalesced requests"),
2291 ADD_STAT(tlbLatency,
"Avg. translation latency for data translations"),
2293 "TLB hits distribution (0 for page table, x for Lx-TLB)"),
2294 ADD_STAT(ldsBankAccesses,
"Total number of LDS bank accesses"),
2296 "Number of bank conflicts per LDS memory packet"),
2298 "pages touched per wf (over all mem. instr.)"),
2300 "dynamic non-flat global memory instruction count"),
2302 "dynamic flat global memory instruction count"),
2303 ADD_STAT(dynamicLMemInstrCnt,
"dynamic local memory intruction count"),
2304 ADD_STAT(wgBlockedDueBarrierAllocation,
2305 "WG dispatch was blocked due to lack of barrier resources"),
2306 ADD_STAT(wgBlockedDueLdsAllocation,
2307 "Workgroup blocked due to LDS capacity"),
2308 ADD_STAT(numInstrExecuted,
"number of instructions executed"),
2309 ADD_STAT(execRateDist,
"Instruction Execution Rate: Number of executed "
2310 "vector instructions per cycle"),
2312 "number of vec ops executed (e.g. WF size/inst)"),
2314 "number of f16 vec ops executed (e.g. WF size/inst)"),
2316 "number of f32 vec ops executed (e.g. WF size/inst)"),
2318 "number of f64 vec ops executed (e.g. WF size/inst)"),
2320 "number of fma16 vec ops executed (e.g. WF size/inst)"),
2322 "number of fma32 vec ops executed (e.g. WF size/inst)"),
2324 "number of fma64 vec ops executed (e.g. WF size/inst)"),
2326 "number of mac16 vec ops executed (e.g. WF size/inst)"),
2328 "number of mac32 vec ops executed (e.g. WF size/inst)"),
2330 "number of mac64 vec ops executed (e.g. WF size/inst)"),
2332 "number of mad16 vec ops executed (e.g. WF size/inst)"),
2334 "number of mad32 vec ops executed (e.g. WF size/inst)"),
2336 "number of mad64 vec ops executed (e.g. WF size/inst)"),
2338 "number of two op FP vec ops executed (e.g. WF size/inst)"),
2339 ADD_STAT(totalCycles,
"number of cycles the CU ran for"),
2340 ADD_STAT(
vpc,
"Vector Operations per cycle (this CU only)"),
2341 ADD_STAT(vpc_f16,
"F16 Vector Operations per cycle (this CU only)"),
2342 ADD_STAT(vpc_f32,
"F32 Vector Operations per cycle (this CU only)"),
2343 ADD_STAT(vpc_f64,
"F64 Vector Operations per cycle (this CU only)"),
2344 ADD_STAT(ipc,
"Instructions per cycle (this CU only)"),
2345 ADD_STAT(controlFlowDivergenceDist,
"number of lanes active per "
2346 "instruction (over all instructions)"),
2347 ADD_STAT(activeLanesPerGMemInstrDist,
2348 "number of active lanes per global memory instruction"),
2349 ADD_STAT(activeLanesPerLMemInstrDist,
2350 "number of active lanes per local memory instruction"),
2352 "Number of dynamic non-GM memory insts executed"),
2353 ADD_STAT(numTimesWgBlockedDueVgprAlloc,
"Number of times WGs are "
2354 "blocked due to VGPR allocation per SIMD"),
2355 ADD_STAT(numTimesWgBlockedDueSgprAlloc,
"Number of times WGs are "
2356 "blocked due to SGPR allocation per SIMD"),
2357 ADD_STAT(numCASOps,
"number of compare and swap operations"),
2359 "number of compare and swap operations that failed"),
2360 ADD_STAT(completedWfs,
"number of completed wavefronts"),
2361 ADD_STAT(completedWGs,
"number of completed workgroups"),
2362 ADD_STAT(headTailLatency,
"ticks between first and last cache block "
2363 "arrival at coalescer"),
2364 ADD_STAT(instInterleave,
"Measure of instruction interleaving per SIMD")
2417 for (
int i = 0;
i < 4; ++
i) {
void sendRequest(PacketPtr pkt, Event *callback)
The ClockedObject class extends the SimObject with a clock and accessor functions to relate ticks to ...
Tick nextCycle() const
Based on the clock of the object, determine the start tick of the first cycle that is at least one cy...
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
void processMemReqEvent(PacketPtr pkt)
EventFunctionWrapper * createMemReqEvent(PacketPtr pkt)
EventFunctionWrapper * createMemRespEvent(PacketPtr pkt)
void processMemRespEvent(PacketPtr pkt)
bool handleResponse(PacketPtr pkt)
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
SenderState is information carried along with the packet, esp.
GPUDynInstPtr getMemInst() const
virtual bool recvTimingResp(PacketPtr pkt)
get the result of packets sent to the LDS when they return
virtual bool sendTimingReq(PacketPtr pkt)
attempt to send this packet, either the port is already stalled, the request is nack'd and must stall...
virtual void recvReqRetry()
the bus is telling the port that there is now space so retrying stalled requests should work now this...
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
std::deque< PacketPtr > retries
bool recvTimingResp(PacketPtr pkt) override
Receive a timing response from the peer.
const char * description() const
Return a C string describing the event.
bool recvTimingResp(PacketPtr pkt) override
Receive a timing response from the peer.
ComputeUnit * computeUnit
bool handleResponse(PacketPtr pkt)
void recvReqRetry() override
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
std::deque< PacketPtr > retries
void releaseBarrier(int bar_id)
int mapWaveToScalarAlu(Wavefront *w) const
ComputeUnit(const Params &p)
void updatePageDivergenceDist(Addr addr)
std::vector< WaitClass > scalarALUs
RequestorID vramRequestorId()
Forward the VRAM requestor ID needed for device memory from shader.
virtual void init() override
init() is called after all C++ SimObjects have been created and all ports are connected.
int numVectorGlobalMemUnits
std::unordered_set< uint64_t > pipeMap
void updateInstStats(GPUDynInstPtr gpuDynInst)
WaitClass vectorGlobalMemUnit
void doInvalidate(RequestPtr req, int kernId)
trigger invalidate operation in the cu
std::vector< int > numWfsToSched
Number of WFs to schedule to each SIMD.
LocalMemPipeline localMemoryPipe
int mapWaveToGlobalMem(Wavefront *w) const
int mapWaveToLocalMem(Wavefront *w) const
WaitClass scalarMemToSrfBus
ScalarDTLBPort scalarDTLBPort
void releaseWFsFromBarrier(int bar_id)
int numYetToReachBarrier(int bar_id)
WaitClass vrfToLocalMemPipeBus
int32_t getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
void resetBarrier(int bar_id)
std::vector< std::vector< Addr > > lastVaddrSimd
int numVectorSharedMemUnits
std::unordered_set< int > freeBarrierIds
A set used to easily retrieve a free barrier ID.
pageDataStruct pageAccesses
WaitClass srfToScalarMemPipeBus
ScalarMemPipeline scalarMemoryPipe
bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg)
void sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
LDSPort ldsPort
The port to access the Local Data Store. Can be connected to an LDS object.
GlobalMemPipeline globalMemoryPipe
std::map< Addr, int > pagesTouched
bool sendToLds(GPUDynInstPtr gpuDynInst)
Send a general request to the LDS; make sure to look at the return value here, as your request might be...
int maxBarrierCnt(int bar_id)
void insertInPipeMap(Wavefront *w)
int numAtBarrier(int bar_id)
void incNumAtBarrier(int bar_id)
void injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelMemSync, RequestPtr req=nullptr)
std::vector< int > vectorRegsReserved
std::vector< ScalarRegisterFile * > srf
ScoreboardCheckStage scoreboardCheckStage
std::vector< WaitClass > vectorALUs
int mapWaveToScalarMem(Wavefront *w) const
RegisterManager * registerManager
void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, HSAQueueEntry *task, int bar_id, bool fetchContext=false)
EventFunctionWrapper tickEvent
TokenManager * memPortTokens
ScalarDataPort scalarDataPort
void fillKernelState(Wavefront *w, HSAQueueEntry *task)
void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg)
WaitClass vectorSharedMemUnit
std::vector< int > scalarRegsReserved
std::vector< DTLBPort > tlbPort
std::vector< std::vector< Wavefront * > > wfList
int mapWaveToScalarAluGlobalIdx(Wavefront *w) const
WFBarrier & barrierSlot(int bar_id)
std::vector< VectorRegisterFile * > vrf
void decMaxBarrierCnt(int bar_id)
std::unordered_map< GPUDynInstPtr, Tick > headTailMap
std::vector< Addr > lastVaddrCU
WaitClass vrfToGlobalMemPipeBus
ScheduleStage scheduleStage
bool allAtBarrier(int bar_id)
bool isVectorAluIdle(uint32_t simdId) const
InstSeqNum getAndIncSeqNum()
void doFlush(GPUDynInstPtr gpuDynInst)
trigger flush operation in the cu
RequestorID requestorId()
std::vector< DataPort > memPort
The memory port for SIMD data accesses.
void deleteFromPipeMap(Wavefront *w)
void handleSQCReturn(PacketPtr pkt)
void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt)
gem5::ComputeUnit::ComputeUnitStats stats
void processFetchReturn(PacketPtr pkt)
GPUComputeDriver * driver()
void setMtype(RequestPtr req)
Called by the compute units right before a request is issued to ruby.
void updateInvCounter(int kern_id, int val=-1)
update the counter of outstanding inv requests for the kernel kern_id: kernel id val: +1/-1,...
bool updateWbCounter(int kern_id, int val=-1)
update the counter of outstanding wb requests for the kernel kern_id: kernel id val: +1/-1,...
void notifyWgCompl(Wavefront *wf)
When an end program instruction detects that the last WF in a WG has completed it will call this meth...
void handleResponse(GPUDynInstPtr gpuDynInst)
This method handles responses sent to this GM pipeline by the CU.
bool isGMReqFIFOWrRdy(uint32_t pendReqs=0) const
int wgSize(int dim) const
int numVectorRegs() const
bool isInvDone() const
Is invalidate done?
int gridSize(int dim) const
int numScalarRegs() const
this represents a slice of the overall LDS, intended to be associated with an individual workgroup
LdsChunk * reserveSpace(const uint32_t dispatchId, const uint32_t wgId, const uint32_t size)
assign a parent and request this amount of space be set aside for this wgid
int increaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
use the dynamic wave id to create or just increase the reference count
bool canReserve(uint32_t x_size) const
can this much space be reserved for a workgroup?
int getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
return the current reference count for this workgroup id
bool isLMRespFIFOWrRdy() const
bool isLMReqFIFOWrRdy(uint32_t pendReqs=0) const
const std::string & toString() const
Return the string to a cmd given by idx.
virtual std::string name() const
OutputStream * create(const std::string &name, bool binary=false, bool no_gz=false)
Creates a file in this directory (optionally compressed).
std::ostream * stream() const
Get the output underlying output stream.
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
T * getPtr()
get a pointer to the data ptr.
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
SenderState * senderState
This packet's sender state.
void pushSenderState(SenderState *sender_state)
Push a new sender state to the packet and make the current sender state the predecessor of the new on...
RequestPtr req
A pointer to the original request.
MemCmd cmd
The command field of the packet.
void allocateRegisters(Wavefront *w, int vectorDemand, int scalarDemand)
std::vector< PoolManager * > vrfPoolMgrs
bool canAllocateSgprs(int simdId, int nWfs, int demandPerWf)
std::vector< PoolManager * > srfPoolMgrs
bool canAllocateVgprs(int simdId, int nWfs, int demandPerWf)
bool sendTimingReq(PacketPtr pkt)
Attempt to send a timing request to the responder port by calling its corresponding receive function.
@ KERNEL
The request should be marked with KERNEL.
GPUDispatcher & dispatcher()
RequestorID vramRequestorId()
Forward the VRAM requestor ID needed for device memory from CP.
AMDGPUSystemHub * systemHub
GPUCommandProcessor & gpuCmdProc
virtual Process * getProcessPtr()=0
void setTokenManager(TokenManager *_tokenManager)
Specify a token manager, which will handle tracking of tokens for a TokenRequestPort/ResponseRequestPo...
static const int InvalidID
void init(ClockedObject *_clockedObject, uint64_t _numStages=0)
bool rdy(Cycles cycles=Cycles(0)) const
void setStatus(status_e newStatus)
std::deque< GPUDynInstPtr > instructionBuffer
void barrierId(int bar_id)
@ S_BARRIER
WF is stalled at a barrier.
Derived & subname(off_type index, const std::string &name)
Set the subfield name for the given index, and marks this stat to print at the end of simulation.
Derived & flags(Flags _flags)
Set the flags and marks this stat to print at the end of simulation.
void sample(const U &v, int n=1)
Add a value to the distribution n times.
Distribution & init(Counter min, Counter max, Counter bkt)
Set the parameters of this distribution.
Derived & init(size_type size)
Set this vector to have the given size.
VectorDistribution & init(size_type size, Counter min, Counter max, Counter bkt)
Initialize storage and parameters for this distribution.
The GPUDispatcher is the component of the shader that is responsible for creating and dispatching WGs...
The GPUCommandProcessor (CP) is responsible for accepting commands, in the form of HSA AQL packets,...
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
static constexpr std::enable_if_t< std::is_integral_v< T >, int > floorLog2(T x)
static constexpr bool isPowerOf2(const T &n)
static constexpr T roundDown(const T &val, const U &align)
This function is used to align addresses in memory.
bool scheduled() const
Determine if the current event is scheduled.
void schedule(Event &event, Tick when)
static const Priority CPU_Tick_Pri
CPU ticks must come after other associated CPU events (such as writebacks).
#define panic(...)
This implements a cprintf based panic() function.
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
#define fatal(...)
This implements a cprintf based fatal() function.
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Bitfield< 21, 20 > stride
ProbePointArg< PacketInfo > Packet
Packet probe point.
const FlagsType pdf
Print the percent of the total that this entry represents.
const FlagsType oneline
Print all values on a single line.
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
std::shared_ptr< Request > RequestPtr
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Tick curTick()
The universal simulation clock.
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
int16_t PortID
Port index/ID type, and a symbolic name for an invalid port id.
bool FullSystem
The FullSystem variable can be used to determine the current mode of simulation.
uint64_t Tick
Tick count type.
void exitSimLoop(const std::string &message, int exit_code, Tick when, Tick repeat, bool serialize)
Schedule an event to exit the simulation loop (returning to Python) at the end of the current cycle (...
RubyTester::SenderState SenderState
std::bitset< std::numeric_limits< unsigned long long >::digits > VectorMask
std::string csprintf(const char *format, const Args &...args)
void registerExitCallback(const std::function< void()> &callback)
Register an exit callback.
Declarations of a non-full system Page Table.
statistics::Scalar spillReads
statistics::Scalar groupWrites
statistics::Scalar numVecOpsExecutedF64
statistics::Scalar numVecOpsExecuted
statistics::Formula vpc_f64
statistics::Scalar instCyclesSALU
statistics::Formula vectorMemWritesPerWF
statistics::Scalar argWrites
statistics::Scalar globalReads
statistics::Distribution activeLanesPerLMemInstrDist
statistics::Formula vALUInstsPerWF
statistics::Formula vectorMemWritesPerKiloInst
statistics::Formula sALUInstsPerWF
statistics::Formula readonlyMemInsts
statistics::Formula vALUUtilization
ComputeUnitStats(statistics::Group *parent, int n_wf)
statistics::Formula privMemInsts
statistics::VectorDistribution instInterleave
statistics::Scalar flatVMemInsts
statistics::Formula vpc_f16
statistics::Scalar wgBlockedDueBarrierAllocation
statistics::Scalar wgBlockedDueLdsAllocation
statistics::Scalar dynamicLMemInstrCnt
statistics::Formula flatLDSInstsPerWF
statistics::Vector instCyclesVMemPerSimd
statistics::Formula flatVMemInstsPerWF
statistics::Scalar argReads
statistics::Distribution waveLevelParallelism
statistics::Scalar numVecOpsExecutedF32
statistics::Scalar scalarMemWrites
statistics::Formula scalarMemInstsPerKiloInst
statistics::Distribution controlFlowDivergenceDist
statistics::Formula groupMemInsts
statistics::Scalar privReads
statistics::Scalar numTimesWgBlockedDueSgprAlloc
statistics::Formula numALUInstsExecuted
statistics::Scalar completedWfs
statistics::Distribution ldsBankConflictDist
statistics::Scalar vectorMemWrites
statistics::Scalar numInstrExecuted
statistics::Scalar vectorMemReads
statistics::Formula argMemInsts
statistics::Scalar tlbCycles
statistics::Formula scalarMemWritesPerKiloInst
statistics::Scalar scalarMemReads
statistics::Scalar tlbRequests
statistics::Formula kernargMemInsts
statistics::Formula vectorMemReadsPerKiloInst
statistics::Scalar numVecOpsExecutedF16
statistics::Scalar groupReads
statistics::Scalar privWrites
statistics::Scalar kernargReads
statistics::Scalar instCyclesVALU
statistics::Formula scalarMemWritesPerWF
statistics::Scalar readonlyWrites
statistics::Formula vectorMemReadsPerWF
statistics::Scalar dynamicGMemInstrCnt
statistics::Formula vpc_f32
statistics::Formula tlbLatency
statistics::Scalar vALUInsts
statistics::Formula scalarMemReadsPerKiloInst
statistics::Formula globalMemInsts
statistics::Formula scalarMemReadsPerWF
statistics::Vector hitsPerTLBLevel
statistics::Scalar numTimesWgBlockedDueVgprAlloc
statistics::Scalar threadCyclesVALU
statistics::Scalar ldsNoFlatInsts
statistics::Scalar flatLDSInsts
statistics::Scalar spillWrites
statistics::Formula ldsNoFlatInstsPerWF
statistics::Formula spillMemInsts
statistics::Vector instCyclesLdsPerSimd
statistics::Vector instCyclesScMemPerSimd
statistics::Scalar kernargWrites
statistics::Distribution pageDivergenceDist
statistics::Distribution activeLanesPerGMemInstrDist
statistics::Scalar globalWrites
statistics::Distribution headTailLatency
statistics::Scalar totalCycles
statistics::Distribution execRateDist
statistics::Formula vectorMemInstsPerKiloInst
statistics::Scalar readonlyReads
statistics::Scalar sALUInsts
SenderState is information carried along with the packet throughout the TLB hierarchy.
GPUDynInstPtr _gpuDynInst
GPUDynInstPtr _gpuDynInst
SenderState is information carried along with the packet throughout the TLB hierarchy.
GPUDynInstPtr _gpuDynInst
GPUDynInstPtr _gpuDynInst
GPU TranslationState: this currently is a somewhat bastardization of the usage of SenderState,...
std::vector< ResponsePort * > ports
Packet::SenderState * saved
const std::string & name()