39#include "debug/GPUDisp.hh"
40#include "debug/GPUExec.hh"
41#include "debug/GPUFetch.hh"
42#include "debug/GPUMem.hh"
43#include "debug/GPUPort.hh"
44#include "debug/GPUPrefetch.hh"
45#include "debug/GPUReg.hh"
46#include "debug/GPURename.hh"
47#include "debug/GPUSync.hh"
48#include "debug/GPUTLB.hh"
    numVectorGlobalMemUnits(p.num_global_mem_pipes),
    numVectorSharedMemUnits(p.num_shared_mem_pipes),
    numScalarMemUnits(p.num_scalar_mem_pipes),
    numVectorALUs(p.num_SIMDs),
    numScalarALUs(p.num_scalar_cores),
    vrfToCoalescerBusWidth(p.vrf_to_coalescer_bus_width),
    coalescerToVrfBusWidth(p.coalescer_to_vrf_bus_width),
    registerManager(p.register_manager),
    scoreboardCheckStage(p, *this, scoreboardCheckToSchedule),
    scheduleStage(p, *this, scoreboardCheckToSchedule, scheduleToExecute),
    execStage(p, *this, scheduleToExecute),
    globalMemoryPipe(p, *this),
    localMemoryPipe(p, *this),
    scalarMemoryPipe(p, *this),
    tickEvent([this]{ exec(); }, "Compute unit tick event",
    vrf(p.vector_register_file), srf(p.scalar_register_file),
    rfc(p.register_file_cache),
    simdWidth(p.simd_width),
    spBypassPipeLength(p.spbypass_pipe_length),
    dpBypassPipeLength(p.dpbypass_pipe_length),
    rfcPipeLength(p.rfc_pipe_length),
    scalarPipeStages(p.scalar_pipe_length),
    operandNetworkLength(p.operand_network_length),
    issuePeriod(p.issue_period),
    vrf_gm_bus_latency(p.vrf_gm_bus_latency),
    srf_scm_bus_latency(p.srf_scm_bus_latency),
    vrf_lm_bus_latency(p.vrf_lm_bus_latency),
    perLaneTLB(p.perLaneTLB), prefetchDepth(p.prefetch_depth),
    prefetchStride(p.prefetch_stride), prefetchType(p.prefetch_prev_type),
    debugSegFault(p.debugSegFault),
    functionalTLB(p.functionalTLB), localMemBarrier(p.localMemBarrier),
    countPages(p.countPages),
    req_tick_latency(p.mem_req_latency * p.clk_domain->clockPeriod()),
    resp_tick_latency(p.mem_resp_latency * p.clk_domain->clockPeriod()),
    scalar_req_tick_latency(
        p.scalar_mem_req_latency * p.clk_domain->clockPeriod()),
    scalar_resp_tick_latency(
        p.scalar_mem_resp_latency * p.clk_domain->clockPeriod()),
    memtime_latency(p.memtime_latency * p.clk_domain->clockPeriod()),
    _requestorId(p.system->getRequestorId(this, "ComputeUnit")),
    lds(*p.localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
    _cacheLineSize(p.system->cacheLineSize()),
    _numBarrierSlots(p.num_barrier_slots),
    globalSeqNum(0), wavefrontSize(p.wf_size),
    scoreboardCheckToSchedule(p),
    scheduleToExecute(p),
127 "Functional TLB not supported in full-system GPU simulation");
138 fatal_if(
p.wf_size > std::numeric_limits<unsigned long long>::digits ||
140 "WF size is larger than the host can support");
142 "Wavefront size should be a power of 2");
    numCyclesPerStoreTransfer =
        (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
                (double)vrfToCoalescerBusWidth);

    numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
                               / coalescerToVrfBusWidth;
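
    // Worked example (not from the source; assumes the common configuration
    // of wfSize() == 64 lanes and a 32-byte VRF-to-coalescer bus): a store
    // moves 64 * sizeof(uint32_t) = 256 bytes, so numCyclesPerStoreTransfer
    // = ceil(256 / 32) = 8, and with the same bus width on the return path
    // numCyclesPerLoadTransfer = 256 / 32 = 8 cycles.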
    idleWfs = p.n_wf * numVectorALUs;
    lastVaddrWF.resize(numVectorALUs);
    wfList.resize(numVectorALUs);

    wfBarrierSlots.resize(p.num_barrier_slots, WFBarrier());

    for (int i = 0; i < p.num_barrier_slots; ++i) {
        freeBarrierIds.insert(i);
    for (int j = 0; j < numVectorALUs; ++j) {
        lastVaddrWF[j].resize(p.n_wf);

        for (int i = 0; i < p.n_wf; ++i) {
            lastVaddrWF[j][i].resize(wfSize());

            wfList[j].push_back(p.wavefronts[j * p.n_wf + i]);
            wfList[j][i]->setParent(this);

            for (int k = 0; k < wfSize(); ++k) {
                lastVaddrWF[j][i][k] = 0;
    lastVaddrSimd.resize(numVectorALUs);

    for (int i = 0; i < numVectorALUs; ++i) {
        lastVaddrSimd[i].resize(wfSize(), 0);

    lastVaddrCU.resize(wfSize());
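
    // The three tables above record, per lane slot, the last data vaddr seen
    // at CU, SIMD and wavefront granularity. The DTLB response path later in
    // this file selects one of them based on prefetchType (the PF_PHASE case
    // shown there reads the per-SIMD table) when deciding where to prefetch.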
    if (p.execPolicy == "OLDEST-FIRST") {

    } else if (p.execPolicy == "ROUND-ROBIN") {

        fatal("Invalid WF execution policy (CU)\n");

    for (int i = 0; i < p.port_memory_port_connection_count; ++i) {

    for (int i = 0; i < p.port_translation_port_connection_count; ++i) {

    memPortTokens = new TokenManager(p.max_cu_tokens);
    lastExecCycle.resize(numVectorALUs, 0);

    for (int i = 0; i < vrf.size(); ++i) {
        vrf[i]->setParent(this);
        rfc[i]->setParent(this);

    for (int i = 0; i < srf.size(); ++i) {
        srf[i]->setParent(this);

    numVecRegsPerSimd = vrf[0]->numRegs();
    numScalarRegsPerSimd = srf[0]->numRegs();

    registerManager->setParent(this);

    instExecPerSimd.resize(numVectorALUs, 0);

             "Cache line size should be a power of two.");
    cacheLineBits = floorLog2(_cacheLineSize);
    w->workGroupSz[0] = task->wgSize(0);
    w->workGroupSz[1] = task->wgSize(1);
    w->workGroupSz[2] = task->wgSize(2);
    w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];

    w->computeActualWgSz(task);

    static int _n_wave = 0;

        if (k + waveId * wfSize() < w->actualWgSzTotal)

    w->execMask() = init_mask;

    w->initMask = init_mask.to_ullong();

        w->barrierId(bar_id);

        assert(!w->hasBarrier());
        w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
        w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
        w->workItemId[2][k] = (k + waveId * wfSize()) /
                              (w->actualWgSz[0] * w->actualWgSz[1]);

        w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
            w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +

    w->workGroupId[0] = w->wgId % task->numWg(0);
    w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1);
    w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1));
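
    // Worked example of the decomposition above (illustrative values, not
    // from the source): with actualWgSz = {8, 8, 1} and a flat lane index of
    // k + waveId * wfSize() == 19, the work-item id becomes x = 19 % 8 = 3,
    // y = (19 / 8) % 8 = 2, z = 19 / (8 * 8) = 0, and workItemFlatId rebuilds
    // z * 8 * 8 + y * 8 + x = 19. The work-group id is unflattened from wgId
    // in the same way, using task->numWg(dim) as the divisors.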
    w->ldsChunk = ldsChunk;

    [[maybe_unused]] int32_t refCount =

    DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
            cu_id, w->wgId, refCount);

    w->instructionBuffer.clear();

    DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
            "WF[%d][%d]. Ref cnt:%d\n", _n_wave, w->barrierId(), cu_id,
            w->simdId, w->wfSlotId, refCount);

    w->initRegState(task, w->actualWgSzTotal);

    panic_if(w->wrGmReqsInPipe, "GM write counter for wavefront non-zero\n");
    panic_if(w->rdGmReqsInPipe, "GM read counter for wavefront non-zero\n");
    panic_if(w->wrLmReqsInPipe, "LM write counter for wavefront non-zero\n");
    panic_if(w->rdLmReqsInPipe, "LM read counter for wavefront non-zero\n");
             "Outstanding reqs counter for wavefront non-zero\n");
        = std::make_shared<GPUDynInst>(this, nullptr,

    gpuDynInst->kern_id = kernId;

        = std::make_shared<GPUDynInst>(this, nullptr,

    gpuDynInst->kern_id = kernId;

    gpuDynInst->staticInstruction()->setFlag(GPUStaticInst::Scalar);

    DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id);

    panic_if(!ldsChunk, "was not able to reserve space for this WG");
    if (num_wfs_in_wg > 1) {

        assert(!wf_barrier.maxBarrierCnt());
        assert(!wf_barrier.numAtBarrier());
        wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);

        DPRINTF(GPUSync, "CU[%d] - Dispatching WG with barrier Id%d. "
                "%d waves using this barrier.\n", cu_id, barrier_id,

            DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] "
                    "vregDemand[%d] sregDemand[%d]\n", i, j, w->wfDynId,
                    vregDemand, sregDemand);

             "Instruction Buffer of WF%d can't be empty", w->wgId);

             "Instruction Buffer of WF%d can't be empty", w->wgId);

    auto it = pipeMap.find(ii->seqNum());
    int trueWgSizeTotal = 1;

        trueWgSizeTotal *= trueWgSize[d];
        DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);

    DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);

    int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
    num_wfs_in_wg = numWfs;

    bool barrier_avail = true;

        barrier_avail = false;

             "WG with %d WFs and %d VGPRs per WI can not be allocated to CU "
             "that has %d VGPRs\n",

             "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU "

    int numMappedWfs = 0;

        if (numMappedWfs < numWfs &&

    assert(numMappedWfs <= numWfs);

    bool vregAvail = true;
    bool sregAvail = true;

    if (numMappedWfs < numWfs) {

    DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, \
            VGPR Availability = %d, SGPR Availability = %d\n",
            freeWfSlots, numMappedWfs, vregAvail, sregAvail);

    if (!barrier_avail) {

    bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
                        && ldsAvail && barrier_avail;
    return wf_barrier.numYetToReachBarrier();

    return wf_barrier.allAtBarrier();

    wf_barrier.incNumAtBarrier();

    return wf_barrier.numAtBarrier();

    return wf_barrier.maxBarrierCnt();

    wf_barrier.decMaxBarrierCnt();

    wf_barrier.release();

    for (auto &vecRegFile : vrf) {

    for (auto &scRegFile : srf) {

             "No support for multiple Global Memory Pipelines exists!!!");

             "No support for multiple Local Memory Pipelines exists!!!");

             "No support for multiple Scalar Memory Pipelines exists!!!");
    assert(cu != nullptr);

    if (pkt->req->isInvL2()) {

        panic("Unknown MemSyncResp not from an instruction");

    if (gpuDynInst->isKernelLaunch()) {

        assert(pkt->req->isKernel());
        assert(pkt->req->isInvL1());

               && gpuDynInst->isEndOfKernel()) {

        assert(pkt->req->isKernel());
        assert(pkt->req->isGL2CacheFlush());

                DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
                        computeUnit->cu_id, w->simdId, w->wfSlotId,
                        w->wfDynId, w->wgId);

    if (!pkt->req->isKernel()) {
        w = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
        DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing "
                "outstanding reqs %d => %d\n", gpuDynInst->simdId,
                gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
                gpuDynInst->disassemble(), w->outstandingReqs,
                w->outstandingReqs - 1);
        computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
        computeUnit->memPort[index].createMemRespEvent(pkt);

           "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
           computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
           gpuDynInst->seqNum(), index, pkt->req->getPaddr());

    computeUnit->schedule(mem_resp_event,
                          curTick() + computeUnit->resp_tick_latency);
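
    // The response is not handed to the CU pipelines immediately: the packet
    // is wrapped in a mem-response event and scheduled resp_tick_latency
    // ticks in the future (mem_resp_latency cycles scaled by the CU clock
    // period in the constructor), which models a fixed return latency from
    // the memory port back into the CU.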
    return handleResponse(pkt);

    if (pkt->req->isKernel()) {

    assert(!pkt->req->isKernel());

    assert(gpuDynInst->numScalarReqs > 0);

    gpuDynInst->numScalarReqs--;

    if (!gpuDynInst->numScalarReqs) {
        if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
            computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(

            computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(

    for (const auto &pkt : retries) {
        if (!sendTimingReq(pkt)) {

        retries.pop_front();
    for (int i = 0; i < len; ++i) {

        [[maybe_unused]] GPUDynInstPtr gpuDynInst = retries.front().second;
        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
                computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                pkt->req->getPaddr());

        if (!sendTimingReq(pkt)) {
            DPRINTF(GPUMem, "failed again!\n");

            DPRINTF(GPUMem, "successful!\n");
            retries.pop_front();

    if (sender_state->wavefront != nullptr) {

        computeUnit->shader->gpuCmdProc.completeTimingRead();

    computeUnit->handleSQCReturn(pkt);

    int len = retries.size();

    for (int i = 0; i < len; ++i) {

        [[maybe_unused]] Wavefront *wavefront = retries.front().second;
        DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
                pkt->req->getPaddr());
        if (!sendTimingReq(pkt)) {
            DPRINTF(GPUFetch, "failed again!\n");

            DPRINTF(GPUFetch, "successful!\n");
            retries.pop_front();

    return "ComputeUnit SQC memory request event";
    SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);

    assert(!pkt->req->systemReq());

    Addr tmp_vaddr = pkt->req->getVaddr();

    pkt->req->setPC(gpuDynInst->wavefront()->pc());

    pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());

    } else if (pkt->isRead()) {

        fatal("pkt is not a read nor a write\n");

        unsigned size = pkt->getSize();

            panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
                  cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);

        if (!p->pTable->translate(vaddr, paddr)) {
            if (!p->fixupFault(vaddr)) {
                panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
                      cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,

        tlbPort[tlbPort_index].sendFunctional(pkt);

        int hit_level = translation_state->hitLevel;
        assert(hit_level != -1);

            safe_cast<GpuTranslationState*>(pkt->senderState);

        delete sender_state->saved;
        delete sender_state;

        assert(pkt->req->hasPaddr());
        assert(pkt->req->hasSize());

        uint8_t *tmpData = oldPkt->getPtr<uint8_t>();

        gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
        gpuDynInst->tlbHitLevel[index] = hit_level;

        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
                "scheduled\n", cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
    } else if (tlbPort[tlbPort_index].isStalled()) {
        assert(tlbPort[tlbPort_index].retries.size() > 0);

        DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
                "failed!\n", cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, tmp_vaddr);

        tlbPort[tlbPort_index].retries.push_back(pkt);
    } else if (!tlbPort[tlbPort_index].sendTimingReq(pkt)) {

        tlbPort[tlbPort_index].stallPort();

        DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
                "failed!\n", cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, tmp_vaddr);

        tlbPort[tlbPort_index].retries.push_back(pkt);

        DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x from "
                "instruction %s sent!\n", cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, tmp_vaddr,
                gpuDynInst->disassemble().c_str());
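
    // Translation issue path, as reconstructed from the branches above: the
    // first branch (not fully shown here) translates the packet functionally
    // through the TLB port; otherwise, if the per-lane TLB port is already
    // stalled the packet simply joins that port's retry queue, and if a
    // timing send is attempted but rejected the port is stalled and the
    // packet queued for retry. Only a successful sendTimingReq() logs the
    // translation as sent.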
        gpuDynInst->resetEntireStatusVector();

        gpuDynInst->decrementStatusVector(index);

        tlbPort[tlbPort_index].sendFunctional(pkt);

        memPort[0].sendFunctional(new_pkt);

        DPRINTF(GPUMem, "Functional sendRequest\n");
        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
                gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
                new_pkt->req->getPaddr());

            safe_cast<GpuTranslationState*>(pkt->senderState);

    DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n",
            pkt->req->getVaddr());

    assert(gpuDynInst->isGlobalSeg() ||
           gpuDynInst->executedAs() == enums::SC_GLOBAL);
        req = std::make_shared<Request>(

    if (kernelMemSync) {
        if (gpuDynInst->isKernelLaunch()) {

            req->setReqInstSeqNum(gpuDynInst->seqNum());

                memPort[0].createMemReqEvent(pkt);

            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
                    "an acquire\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());

            assert(gpuDynInst->isEndOfKernel());

            req->setReqInstSeqNum(gpuDynInst->seqNum());

                memPort[0].createMemReqEvent(pkt);

            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
                    "a release\n", cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());

        gpuDynInst->setRequestFlags(req);

        req->setReqInstSeqNum(gpuDynInst->seqNum());

            memPort[0].createMemReqEvent(pkt);

               "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
               cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
               pkt->req->getPaddr());

    auto req = std::make_shared<Request>(paddr, 64, 0, vramRequestorId());

    pkt->pushSenderState(
        safe_cast<DataPort::SenderState*>(pkt->senderState);

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
            compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
            pkt->req->getPaddr(), id);

    Addr paddr = pkt->req->getPaddr();

        int index = gpuDynInst->memStatusVector[paddr].back();

        DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
                pkt->req->getPaddr(), id);

        gpuDynInst->memStatusVector[paddr].pop_back();
        gpuDynInst->pAddr = pkt->req->getPaddr();

        gpuDynInst->decrementStatusVector(index);
        DPRINTF(GPUMem, "bitvector is now %s\n",
                gpuDynInst->printStatusVector());

        if (gpuDynInst->allLanesZero()) {
            auto iter = gpuDynInst->memStatusVector.begin();
            auto end = gpuDynInst->memStatusVector.end();

            while (iter != end) {
                assert(iter->second.empty());

            if (compute_unit->headTailMap.count(gpuDynInst)) {

            gpuDynInst->memStatusVector.clear();

            DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
                    compute_unit->cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId);

            if (!compute_unit->headTailMap.count(gpuDynInst)) {
                    .insert(std::make_pair(gpuDynInst, curTick()));
    Addr line = pkt->req->getPaddr();

    DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
            pkt->req->getVaddr(), line);

    computeUnit->stats.tlbCycles += curTick();

        safe_cast<GpuTranslationState*>(pkt->senderState);

    if (!translation_state->tlbEntry) {
            safe_cast<DTLBPort::SenderState*>(translation_state->saved);

            computeUnit->wfList[sender_state->_gpuDynInst->simdId]

        DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
                 pkt->req->getVaddr());

    int hit_level = translation_state->hitLevel;
    computeUnit->stats.hitsPerTLBLevel[hit_level]++;

    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());

    delete translation_state;

        safe_cast<DTLBPort::SenderState*>(pkt->senderState);
    gpuDynInst->memStatusVector[line].push_back(mp_index);
    gpuDynInst->tlbHitLevel[mp_index] = hit_level;

        panic("unsupported response to request conversion %s\n",

    if (computeUnit->prefetchDepth) {
        int simdId = gpuDynInst->simdId;
        int wfSlotId = gpuDynInst->wfSlotId;

        switch(computeUnit->prefetchType) {
            last = computeUnit->lastVaddrCU[mp_index];

          case enums::PF_PHASE:
            last = computeUnit->lastVaddrSimd[simdId][mp_index];

            last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];

        DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
                computeUnit->cu_id, simdId, wfSlotId, mp_index, last);

        computeUnit->lastVaddrCU[mp_index] = vaddr;
        computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
        computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;

        stride = (computeUnit->prefetchType == enums::PF_STRIDE) ?
            computeUnit->prefetchStride : stride;

        DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
                computeUnit->cu_id, simdId, wfSlotId, mp_index);

        for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {

            RequestPtr prefetch_req = std::make_shared<Request>(
                computeUnit->requestorId(),

                computeUnit->shader->gpuTc, true);

            sendFunctional(prefetch_pkt);

                safe_cast<GpuTranslationState*>(

            delete prefetch_pkt;
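
    // Hedged sketch of the prefetch loop above (the strided address
    // arithmetic itself is elided from this excerpt): when prefetchType is
    // PF_STRIDE the stride is forced to prefetchStride, and for each of
    // pf = 1..prefetchDepth a prefetch request is built and translated with
    // sendFunctional(); the packet is then deleted, so the visible effect is
    // to warm the TLB ahead of the demand stream rather than to move data.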
    if (new_pkt->req->systemReq()) {

        if (!gpuDynInst->isSystemReq()) {
            computeUnit->getTokenManager()->recvTokens(1);
            gpuDynInst->setSystemReq();

        new_pkt->req->requestorId(computeUnit->vramRequestorId());

        computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
            computeUnit->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());

    computeUnit->schedule(mem_req_event, curTick() +
                          computeUnit->req_tick_latency);

        [this, pkt]{ processMemReqEvent(pkt); },
        "ComputeUnit memory request event", true);

        [this, pkt]{ processMemRespEvent(pkt); },
        "ComputeUnit memory response event", true);
    [[maybe_unused]] ComputeUnit *compute_unit = computeUnit;

    if (pkt->req->systemReq()) {

    } else if (!(sendTimingReq(pkt))) {
        retries.emplace_back(pkt, gpuDynInst);

                "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
                compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, id, pkt->req->getPaddr());

                "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data"
                " req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id,
                pkt->req->getPaddr());

    return "ComputeUnit scalar memory request event";

    SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);

    if (pkt->req->systemReq()) {

                "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
                compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, pkt->req->getPaddr());

                "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data "
                "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
                pkt->req->getPaddr());
    int len = retries.size();

    DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, len);

    assert(isStalled());

    for (int i = 0; i < len; ++i) {

        DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x",
                computeUnit->cu_id, vaddr);

        if (!sendTimingReq(pkt)) {

            DPRINTF(GPUTLB, ": failed again\n");

            DPRINTF(GPUTLB, ": successful\n");
            retries.pop_front();
        safe_cast<GpuTranslationState*>(pkt->senderState);

             "Translation of vaddr %#x failed\n", pkt->req->getVaddr());

    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());

    delete translation_state;

        safe_cast<ScalarDTLBPort::SenderState*>(pkt->senderState);

    [[maybe_unused]] Wavefront *w = gpuDynInst->wavefront();

    DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
            "translation: PA %#x -> %#x\n", computeUnit->cu_id, w->simdId,
            w->wfSlotId, w->kernId, pkt->req->getVaddr(),
            pkt->req->getPaddr());

        fatal("Scalar DTLB received unexpected MemCmd response %s\n",

    if (req_pkt->req->systemReq()) {
        gpuDynInst->setSystemReq();

        req_pkt->req->requestorId(computeUnit->vramRequestorId());

        (computeUnit->scalarDataPort, req_pkt);
    computeUnit->schedule(scalar_mem_req_event, curTick() +
                          computeUnit->scalar_req_tick_latency);
    [[maybe_unused]] Addr line = pkt->req->getPaddr();
    DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
            computeUnit->cu_id, pkt->req->getVaddr(), line);

        = safe_cast<GpuTranslationState*>(pkt->senderState);

    bool success = translation_state->tlbEntry != nullptr;
    delete translation_state->tlbEntry;
    assert(!translation_state->ports.size());

    delete translation_state;

        safe_cast<ITLBPort::SenderState*>(pkt->senderState);

        computeUnit->fetchStage.fetch(pkt, wavefront);

    int len = retries.size();
    DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, len);

    assert(isStalled());

    for (int i = 0; i < len; ++i) {

        DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x",
                computeUnit->cu_id, vaddr);

        if (!sendTimingReq(pkt)) {

            DPRINTF(GPUTLB, ": failed again\n");

            DPRINTF(GPUTLB, ": successful\n");
            retries.pop_front();
    if (gpuDynInst->isScalar()) {
        if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {

        } else if (gpuDynInst->isLoad()) {

        } else if (gpuDynInst->isStore()) {

        if (gpuDynInst->isALU()) {

                += gpuDynInst->wavefront()->execMask().count();
        } else if (gpuDynInst->isFlat()) {
            if (gpuDynInst->isLocalMem()) {

        } else if (gpuDynInst->isFlatGlobal()) {

        } else if (gpuDynInst->isFlatScratch()) {

        } else if (gpuDynInst->isLocalMem()) {

        } else if (gpuDynInst->isLoad()) {

        } else if (gpuDynInst->isStore()) {

        if (gpuDynInst->isLoad()) {
            switch (gpuDynInst->executedAs()) {
              case enums::SC_SPILL:

              case enums::SC_GLOBAL:

              case enums::SC_GROUP:

              case enums::SC_PRIVATE:

              case enums::SC_READONLY:

              case enums::SC_KERNARG:

              case enums::SC_NONE:

                fatal("%s has no valid segment\n", gpuDynInst->disassemble());

        } else if (gpuDynInst->isStore()) {
            switch (gpuDynInst->executedAs()) {
              case enums::SC_SPILL:

              case enums::SC_GLOBAL:

              case enums::SC_GROUP:

              case enums::SC_PRIVATE:

              case enums::SC_READONLY:

              case enums::SC_KERNARG:

              case enums::SC_NONE:

                fatal("%s has no valid segment\n", gpuDynInst->disassemble());
        *page_stat_file << "page, wavefront accesses, workitem accesses" <<

            *page_stat_file << std::hex << iter.first << ",";
            *page_stat_file << std::dec << iter.second.first << ",";
            *page_stat_file << std::dec << iter.second.second << std::endl;

                                  const uint32_t wgId) const

    for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){

    RequestPtr newRequest = std::make_shared<Request>();
    newRequest->setPaddr(0x0);
    fatal_if(!senderState, "did not get the right sort of sender state");

    computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);

    fatal_if(!sender_state, "packet without a valid sender state");

    fatal_if(retries.empty(), "must have retries waiting to be stalled");

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
            computeUnit->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId);

        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, pkt->req->getPaddr());

        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
                computeUnit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, pkt->req->getPaddr());

    auto queueSize = retries.size();

    DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, queueSize);

             "why was there a recvReqRetry() with no pending reqs?");

             "recvReqRetry() happened when the port was not stalled");

    while (!retries.empty()) {

        DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);

            DPRINTF(GPUPort, ": LDS send failed again\n");

            DPRINTF(GPUPort, ": LDS send successful\n");
    : statistics::Group(parent),
      ADD_STAT(vALUInsts, "Number of vector ALU insts issued."),
      ADD_STAT(vALUInstsPerWF, "The avg. number of vector ALU insts issued "
      ADD_STAT(sALUInsts, "Number of scalar ALU insts issued."),
      ADD_STAT(sALUInstsPerWF, "The avg. number of scalar ALU insts issued "
               "Number of cycles needed to execute VALU insts."),
               "Number of cycles needed to execute SALU insts."),
      ADD_STAT(threadCyclesVALU, "Number of thread cycles used to execute "
               "vector ALU ops. Similar to instCyclesVALU but multiplied by "
               "the number of active threads."),
               "Percentage of active vector ALU threads in a wave."),
      ADD_STAT(ldsNoFlatInsts, "Number of LDS insts issued, not including FLAT"
               " accesses that resolve to LDS."),
      ADD_STAT(ldsNoFlatInstsPerWF, "The avg. number of LDS insts (not "
               "including FLAT accesses that resolve to LDS) per-wavefront."),
               "The number of FLAT insts that resolve to vmem issued."),
      ADD_STAT(flatVMemInstsPerWF, "The average number of FLAT insts that "
               "resolve to vmem issued per-wavefront."),
               "The number of FLAT insts that resolve to LDS issued."),
      ADD_STAT(flatLDSInstsPerWF, "The average number of FLAT insts that "
               "resolve to LDS issued per-wavefront."),
               "Number of vector mem write insts (excluding FLAT insts)."),
      ADD_STAT(vectorMemWritesPerWF, "The average number of vector mem write "
               "insts (excluding FLAT insts) per-wavefront."),
               "Number of vector mem read insts (excluding FLAT insts)."),
      ADD_STAT(vectorMemReadsPerWF, "The avg. number of vector mem read insts "
               "(excluding FLAT insts) per-wavefront."),
      ADD_STAT(scalarMemWrites, "Number of scalar mem write insts."),
               "The average number of scalar mem write insts per-wavefront."),
      ADD_STAT(scalarMemReads, "Number of scalar mem read insts."),
               "The average number of scalar mem read insts per-wavefront."),
      ADD_STAT(vectorMemReadsPerKiloInst,
               "Number of vector mem reads per kilo-instruction"),
      ADD_STAT(vectorMemWritesPerKiloInst,
               "Number of vector mem writes per kilo-instruction"),
      ADD_STAT(vectorMemInstsPerKiloInst,
               "Number of vector mem insts per kilo-instruction"),
      ADD_STAT(scalarMemReadsPerKiloInst,
               "Number of scalar mem reads per kilo-instruction"),
      ADD_STAT(scalarMemWritesPerKiloInst,
               "Number of scalar mem writes per kilo-instruction"),
      ADD_STAT(scalarMemInstsPerKiloInst,
               "Number of scalar mem insts per kilo-instruction"),
      ADD_STAT(instCyclesVMemPerSimd, "Number of cycles to send address, "
               "command, data from VRF to vector memory unit, per SIMD"),
      ADD_STAT(instCyclesScMemPerSimd, "Number of cycles to send address, "
               "command, data from SRF to scalar memory unit, per SIMD"),
      ADD_STAT(instCyclesLdsPerSimd, "Number of cycles to send address, "
               "command, data from VRF to LDS unit, per SIMD"),
      ADD_STAT(globalReads, "Number of reads to the global segment"),
      ADD_STAT(globalWrites, "Number of writes to the global segment"),
               "Number of memory instructions sent to the global segment"),
      ADD_STAT(argReads, "Number of reads to the arg segment"),
      ADD_STAT(argWrites, "Number of writes to the arg segment"),
               "Number of memory instructions sent to the arg segment"),
      ADD_STAT(spillReads, "Number of reads to the spill segment"),
      ADD_STAT(spillWrites, "Number of writes to the spill segment"),
               "Number of memory instructions sent to the spill segment"),
      ADD_STAT(groupReads, "Number of reads to the group segment"),
      ADD_STAT(groupWrites, "Number of writes to the group segment"),
               "Number of memory instructions sent to the group segment"),
      ADD_STAT(privReads, "Number of reads to the private segment"),
      ADD_STAT(privWrites, "Number of writes to the private segment"),
               "Number of memory instructions sent to the private segment"),
      ADD_STAT(readonlyReads, "Number of reads to the readonly segment"),
               "Number of memory instructions sent to the readonly segment"),
               "Number of memory instructions sent to the readonly segment"),
      ADD_STAT(kernargReads, "Number of reads sent to the kernarg segment"),
               "Number of memory instructions sent to the kernarg segment"),
               "Number of memory instructions sent to the kernarg segment"),
               "wave level parallelism: count of active waves at wave launch"),
      ADD_STAT(tlbRequests, "number of uncoalesced requests"),
               "total number of cycles for all uncoalesced requests"),
      ADD_STAT(tlbLatency, "Avg. translation latency for data translations"),
               "TLB hits distribution (0 for page table, x for Lx-TLB)"),
      ADD_STAT(ldsBankAccesses, "Total number of LDS bank accesses"),
               "Number of bank conflicts per LDS memory packet"),
               "pages touched per wf (over all mem. instr.)"),
               "dynamic non-flat global memory instruction count"),
               "dynamic flat global memory instruction count"),
      ADD_STAT(dynamicLMemInstrCnt, "dynamic local memory instruction count"),
      ADD_STAT(wgBlockedDueBarrierAllocation,
               "WG dispatch was blocked due to lack of barrier resources"),
      ADD_STAT(wgBlockedDueLdsAllocation,
               "Workgroup blocked due to LDS capacity"),
      ADD_STAT(numInstrExecuted, "number of instructions executed"),
      ADD_STAT(execRateDist, "Instruction Execution Rate: Number of executed "
               "vector instructions per cycle"),
               "number of vec ops executed (e.g. WF size/inst)"),
               "number of f16 vec ops executed (e.g. WF size/inst)"),
               "number of f32 vec ops executed (e.g. WF size/inst)"),
               "number of f64 vec ops executed (e.g. WF size/inst)"),
               "number of fma16 vec ops executed (e.g. WF size/inst)"),
               "number of fma32 vec ops executed (e.g. WF size/inst)"),
               "number of fma64 vec ops executed (e.g. WF size/inst)"),
               "number of mac16 vec ops executed (e.g. WF size/inst)"),
               "number of mac32 vec ops executed (e.g. WF size/inst)"),
               "number of mac64 vec ops executed (e.g. WF size/inst)"),
               "number of mad16 vec ops executed (e.g. WF size/inst)"),
               "number of mad32 vec ops executed (e.g. WF size/inst)"),
               "number of mad64 vec ops executed (e.g. WF size/inst)"),
               "number of mfma vec ops executed (e.g. WF size/inst)"),
               "number of i8 mfma vec ops executed (e.g. WF size/inst)"),
               "number of f16 mfma vec ops executed (e.g. WF size/inst)"),
               "number of f32 mfma vec ops executed (e.g. WF size/inst)"),
               "number of f64 mfma vec ops executed (e.g. WF size/inst)"),
               "number of two op FP vec ops executed (e.g. WF size/inst)"),
      ADD_STAT(totalCycles, "number of cycles the CU ran for"),
      ADD_STAT(vpc, "Vector Operations per cycle (this CU only)"),
      ADD_STAT(vpc_f16, "F16 Vector Operations per cycle (this CU only)"),
      ADD_STAT(vpc_f32, "F32 Vector Operations per cycle (this CU only)"),
      ADD_STAT(vpc_f64, "F64 Vector Operations per cycle (this CU only)"),
      ADD_STAT(ipc, "Instructions per cycle (this CU only)"),
      ADD_STAT(controlFlowDivergenceDist, "number of lanes active per "
               "instruction (over all instructions)"),
      ADD_STAT(activeLanesPerGMemInstrDist,
               "number of active lanes per global memory instruction"),
      ADD_STAT(activeLanesPerLMemInstrDist,
               "number of active lanes per local memory instruction"),
               "Number of dynamic non-GM memory insts executed"),
      ADD_STAT(numTimesWgBlockedDueVgprAlloc, "Number of times WGs are "
               "blocked due to VGPR allocation per SIMD"),
      ADD_STAT(numTimesWgBlockedDueSgprAlloc, "Number of times WGs are "
               "blocked due to SGPR allocation per SIMD"),
      ADD_STAT(numCASOps, "number of compare and swap operations"),
               "number of compare and swap operations that failed"),
      ADD_STAT(completedWfs, "number of completed wavefronts"),
      ADD_STAT(completedWGs, "number of completed workgroups"),
      ADD_STAT(headTailLatency, "ticks between first and last cache block "
               "arrival at coalescer"),
      ADD_STAT(instInterleave, "Measure of instruction interleaving per SIMD")
    for (int i = 0; i < 4; ++i) {