38 #include "debug/GPUDisp.hh" 
   39 #include "debug/GPUExec.hh" 
   40 #include "debug/GPUFetch.hh" 
   41 #include "debug/GPUMem.hh" 
   42 #include "debug/GPUPort.hh" 
   43 #include "debug/GPUPrefetch.hh" 
   44 #include "debug/GPUReg.hh" 
   45 #include "debug/GPURename.hh" 
   46 #include "debug/GPUSync.hh" 
   47 #include "debug/GPUTLB.hh" 
   65     numVectorGlobalMemUnits(
p.num_global_mem_pipes),
 
   66     numVectorSharedMemUnits(
p.num_shared_mem_pipes),
 
   67     numScalarMemUnits(
p.num_scalar_mem_pipes),
 
   68     numVectorALUs(
p.num_SIMDs),
 
   69     numScalarALUs(
p.num_scalar_cores),
 
   70     vrfToCoalescerBusWidth(
p.vrf_to_coalescer_bus_width),
 
   71     coalescerToVrfBusWidth(
p.coalescer_to_vrf_bus_width),
 
   72     registerManager(
p.register_manager),
 
   74     scoreboardCheckStage(
p, *this, scoreboardCheckToSchedule),
 
   75     scheduleStage(
p, *this, scoreboardCheckToSchedule, scheduleToExecute),
 
   76     execStage(
p, *this, scheduleToExecute),
 
   77     globalMemoryPipe(
p, *this),
 
   78     localMemoryPipe(
p, *this),
 
   79     scalarMemoryPipe(
p, *this),
 
   80     tickEvent([this]{ 
exec(); }, 
"Compute unit tick event",
 
   83     vrf(
p.vector_register_file), srf(
p.scalar_register_file),
 
   84     simdWidth(
p.simd_width),
 
   85     spBypassPipeLength(
p.spbypass_pipe_length),
 
   86     dpBypassPipeLength(
p.dpbypass_pipe_length),
 
   87     scalarPipeStages(
p.scalar_pipe_length),
 
   88     operandNetworkLength(
p.operand_network_length),
 
   89     issuePeriod(
p.issue_period),
 
   90     vrf_gm_bus_latency(
p.vrf_gm_bus_latency),
 
   91     srf_scm_bus_latency(
p.srf_scm_bus_latency),
 
   92     vrf_lm_bus_latency(
p.vrf_lm_bus_latency),
 
   93     perLaneTLB(
p.perLaneTLB), prefetchDepth(
p.prefetch_depth),
 
   94     prefetchStride(
p.prefetch_stride), prefetchType(
p.prefetch_prev_type),
 
   95     debugSegFault(
p.debugSegFault),
 
   96     functionalTLB(
p.functionalTLB), localMemBarrier(
p.localMemBarrier),
 
   97     countPages(
p.countPages),
 
   98     req_tick_latency(
p.mem_req_latency * 
p.clk_domain->clockPeriod()),
 
   99     resp_tick_latency(
p.mem_resp_latency * 
p.clk_domain->clockPeriod()),
 
  100     _requestorId(
p.system->getRequestorId(
this, 
"ComputeUnit")),
 
  101     lds(*
p.localDataStore), gmTokenPort(
name() + 
".gmTokenPort", 
this),
 
  107     _cacheLineSize(
p.system->cacheLineSize()),
 
  108     _numBarrierSlots(
p.num_barrier_slots),
 
  109     globalSeqNum(0), wavefrontSize(
p.wf_size),
 
  110     scoreboardCheckToSchedule(
p),
 
  111     scheduleToExecute(
p),
 
  123     fatal_if(
p.wf_size > std::numeric_limits<unsigned long long>::digits ||
 
  125              "WF size is larger than the host can support");
 
  127              "Wavefront size should be a power of 2");
 
  130     numCyclesPerStoreTransfer =
 
  131         (uint32_t)ceil((
double)(wfSize() * 
sizeof(uint32_t)) /
 
  132                 (double)vrfToCoalescerBusWidth);
 
  134     numCyclesPerLoadTransfer = (wfSize() * 
sizeof(uint32_t))
 
  135                                / coalescerToVrfBusWidth;
 
  138     idleWfs = 
p.n_wf * numVectorALUs;
 
  139     lastVaddrWF.resize(numVectorALUs);
 
  140     wfList.resize(numVectorALUs);
 
  142     wfBarrierSlots.resize(
p.num_barrier_slots, 
WFBarrier());
 
  144     for (
int i = 0; 
i < 
p.num_barrier_slots; ++
i) {
 
  145         freeBarrierIds.insert(
i);
 
  148     for (
int j = 0; 
j < numVectorALUs; ++
j) {
 
  149         lastVaddrWF[
j].resize(
p.n_wf);
 
  151         for (
int i = 0; 
i < 
p.n_wf; ++
i) {
 
  152             lastVaddrWF[
j][
i].resize(wfSize());
 
  154             wfList[
j].push_back(
p.wavefronts[
j * 
p.n_wf + 
i]);
 
  155             wfList[
j][
i]->setParent(
this);
 
  157             for (
int k = 0; 
k < wfSize(); ++
k) {
 
  158                 lastVaddrWF[
j][
i][
k] = 0;
 
  163     lastVaddrSimd.resize(numVectorALUs);
 
  165     for (
int i = 0; 
i < numVectorALUs; ++
i) {
 
  166         lastVaddrSimd[
i].resize(wfSize(), 0);
 
  169     lastVaddrCU.resize(wfSize());
 
  173     if (
p.execPolicy == 
"OLDEST-FIRST") {
 
  175     } 
else if (
p.execPolicy == 
"ROUND-ROBIN") {
 
  178         fatal(
"Invalid WF execution policy (CU)\n");
 
  181     for (
int i = 0; 
i < 
p.port_memory_port_connection_count; ++
i) {
 
  185     for (
int i = 0; 
i < 
p.port_translation_port_connection_count; ++
i) {
 
  191     memPortTokens = 
new TokenManager(
p.max_cu_tokens);
 
  195     lastExecCycle.resize(numVectorALUs, 0);
 
  197     for (
int i = 0; 
i < vrf.size(); ++
i) {
 
  198         vrf[
i]->setParent(
this);
 
  200     for (
int i = 0; 
i < srf.size(); ++
i) {
 
  201         srf[
i]->setParent(
this);
 
  203     numVecRegsPerSimd = vrf[0]->numRegs();
 
  204     numScalarRegsPerSimd = srf[0]->numRegs();
 
  206     registerManager->setParent(
this);
 
  210     instExecPerSimd.resize(numVectorALUs, 0);
 
  214         "Cache line size should be a power of two.");
 
  215     cacheLineBits = 
floorLog2(_cacheLineSize);
 
  298     w->workGroupSz[0] = task->
wgSize(0);
 
  299     w->workGroupSz[1] = task->
wgSize(1);
 
  300     w->workGroupSz[2] = task->
wgSize(2);
 
  301     w->wgSz = 
w->workGroupSz[0] * 
w->workGroupSz[1] * 
w->workGroupSz[2];
 
  305     w->computeActualWgSz(task);
 
  312     static int _n_wave = 0;
 
  318         if (
k + waveId * 
wfSize() < 
w->actualWgSzTotal)
 
  322     w->execMask() = init_mask;
 
  326     w->initMask = init_mask.to_ullong();
 
  329         w->barrierId(bar_id);
 
  331         assert(!
w->hasBarrier());
 
  335         w->workItemId[0][
k] = (
k + waveId * 
wfSize()) % 
w->actualWgSz[0];
 
  336         w->workItemId[1][
k] = ((
k + waveId * 
wfSize()) / 
w->actualWgSz[0]) %
 
  338         w->workItemId[2][
k] = (
k + waveId * 
wfSize()) /
 
  339                               (
w->actualWgSz[0] * 
w->actualWgSz[1]);
 
  341         w->workItemFlatId[
k] = 
w->workItemId[2][
k] * 
w->actualWgSz[0] *
 
  342             w->actualWgSz[1] + 
w->workItemId[1][
k] * 
w->actualWgSz[0] +
 
  349     w->workGroupId[0] = 
w->wgId % task->
numWg(0);
 
  350     w->workGroupId[1] = (
w->wgId / task->
numWg(0)) % task->
numWg(1);
 
  351     w->workGroupId[2] = 
w->wgId / (task->
numWg(0) * task->
numWg(1));
 
  354     w->ldsChunk = ldsChunk;
 
  356     [[maybe_unused]] int32_t refCount =
 
  358     DPRINTF(GPUDisp, 
"CU%d: increase ref ctr wg[%d] to [%d]\n",
 
  359                     cu_id, 
w->wgId, refCount);
 
  361     w->instructionBuffer.clear();
 
  366     DPRINTF(GPUDisp, 
"Scheduling wfDynId/barrier_id %d/%d on CU%d: " 
  367             "WF[%d][%d]. Ref cnt:%d\n", _n_wave, 
w->barrierId(), 
cu_id,
 
  368             w->simdId, 
w->wfSlotId, refCount);
 
  370     w->initRegState(task, 
w->actualWgSzTotal);
 
  385         = std::make_shared<GPUDynInst>(
this, 
nullptr,
 
  389     gpuDynInst->kern_id = kernId;
 
  391     req->setContext(gpuDynInst->wfDynId);
 
  424         DPRINTF(GPUDisp, 
"CU%d: Scheduling wakeup next cycle\n", 
cu_id);
 
  438     panic_if(!ldsChunk, 
"was not able to reserve space for this WG");
 
  452     if (num_wfs_in_wg > 1) {
 
  459         assert(!wf_barrier.maxBarrierCnt());
 
  460         assert(!wf_barrier.numAtBarrier());
 
  461         wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);
 
  463         DPRINTF(GPUSync, 
"CU[%d] - Dispatching WG with barrier Id%d. " 
  464                 "%d waves using this barrier.\n", 
cu_id, barrier_id,
 
  484                 DPRINTF(GPURename, 
"SIMD[%d] wfSlotId[%d] WF[%d] " 
  485                     "vregDemand[%d] sregDemand[%d]\n", 
i, 
j, 
w->wfDynId,
 
  486                     vregDemand, sregDemand);
 
  501              "Instruction Buffer of WF%d can't be empty", 
w->wgId);
 
  510              "Instruction Buffer of WF%d can't be empty", 
w->wgId);
 
  513     auto it = 
pipeMap.find(ii->seqNum());
 
  523     int trueWgSizeTotal = 1;
 
  529         trueWgSizeTotal *= trueWgSize[
d];
 
  530         DPRINTF(GPUDisp, 
"trueWgSize[%d] =  %d\n", 
d, trueWgSize[
d]);
 
  533     DPRINTF(GPUDisp, 
"trueWgSizeTotal =  %d\n", trueWgSizeTotal);
 
  536     int numWfs = (trueWgSizeTotal + 
wfSize() - 1) / 
wfSize();
 
  537     num_wfs_in_wg = numWfs;
 
  539     bool barrier_avail = 
true;
 
  542         barrier_avail = 
false;
 
  555              "WG with %d WFs and %d VGPRs per WI can not be allocated to CU " 
  556              "that has %d VGPRs\n",
 
  559              "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU " 
  566     int numMappedWfs = 0;
 
  578                 if (numMappedWfs < numWfs &&
 
  592     assert(numMappedWfs <= numWfs);
 
  594     bool vregAvail = 
true;
 
  595     bool sregAvail = 
true;
 
  597     if (numMappedWfs < numWfs) {
 
  613     DPRINTF(GPUDisp, 
"Free WF slots =  %d, Mapped WFs = %d, \ 
  614             VGPR Availability = %d, SGPR Availability = %d\n",
 
  615             freeWfSlots, numMappedWfs, vregAvail, sregAvail);
 
  632     if (!barrier_avail) {
 
  641     bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
 
  642                         && ldsAvail && barrier_avail;
 
  650     return wf_barrier.numYetToReachBarrier();
 
  657     return wf_barrier.allAtBarrier();
 
  664     wf_barrier.incNumAtBarrier();
 
  671     return wf_barrier.numAtBarrier();
 
  678     return wf_barrier.maxBarrierCnt();
 
  692     wf_barrier.decMaxBarrierCnt();
 
  699     wf_barrier.release();
 
  722     for (
auto &vecRegFile : 
vrf) {
 
  726     for (
auto &scRegFile : 
srf) {
 
  770              "No support for multiple Global Memory Pipelines exists!!!");
 
  777              "No support for multiple Local Memory Pipelines exists!!!");
 
  784              "No support for multiple Scalar Memory Pipelines exists!!!");
 
  822         if (gpuDynInst->isKernelLaunch()) {
 
  825             assert(pkt->
req->isKernel());
 
  826             assert(pkt->
req->isInvL1());
 
  841             && gpuDynInst->isEndOfKernel()) {
 
  847             assert(pkt->
req->isKernel());
 
  848             assert(pkt->
req->isGL2CacheFlush());
 
  864             DPRINTF(GPUDisp, 
"CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
 
  866                     w->wfDynId, 
w->wgId);
 
  872         if (!pkt->
req->isKernel()) {
 
  874             DPRINTF(GPUExec, 
"MemSyncResp: WF[%d][%d] WV%d %s decrementing " 
  875                             "outstanding reqs %d => %d\n", gpuDynInst->simdId,
 
  876                             gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
 
  877                             gpuDynInst->disassemble(), 
w->outstandingReqs,
 
  878                             w->outstandingReqs - 1);
 
  891             "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
 
  893             gpuDynInst->seqNum(), 
index, pkt->
req->getPaddr());
 
  904     assert(!pkt->
req->isKernel());
 
  911     assert(gpuDynInst->numScalarReqs > 0);
 
  913     gpuDynInst->numScalarReqs--;
 
  923     if (!gpuDynInst->numScalarReqs) {
 
  924         if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
 
  925                 computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
 
  928                 computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
 
  942     for (
const auto &pkt : retries) {
 
  943         if (!sendTimingReq(pkt)) {
 
  954     int len = retries.size();
 
  958     for (
int i = 0; 
i < 
len; ++
i) {
 
  960         [[maybe_unused]] 
GPUDynInstPtr gpuDynInst = retries.front().second;
 
  961         DPRINTF(GPUMem, 
"CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
 
  962                 computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
 
  963                 pkt->
req->getPaddr());
 
  968         if (!sendTimingReq(pkt)) {
 
  969             DPRINTF(GPUMem, 
"failed again!\n");
 
  972             DPRINTF(GPUMem, 
"successful!\n");
 
  981     computeUnit->fetchStage.processFetchReturn(pkt);
 
  988     int len = retries.size();
 
  992     for (
int i = 0; 
i < 
len; ++
i) {
 
  994         [[maybe_unused]] 
Wavefront *wavefront = retries.front().second;
 
  995         DPRINTF(GPUFetch, 
"CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
 
  997                 pkt->
req->getPaddr());
 
  998         if (!sendTimingReq(pkt)) {
 
  999             DPRINTF(GPUFetch, 
"failed again!\n");
 
 1002             DPRINTF(GPUFetch, 
"successful!\n");
 
 1003             retries.pop_front();
 
 1012     Addr tmp_vaddr = pkt->
req->getVaddr();
 
 1017     pkt->
req->setPC(gpuDynInst->wavefront()->pc());
 
 1019     pkt->
req->setReqInstSeqNum(gpuDynInst->seqNum());
 
 1040     } 
else if (pkt->
isRead()) {
 
 1043         fatal(
"pkt is not a read nor a write\n");
 
 1055             unsigned size = pkt->
getSize();
 
 1058                 panic(
"CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
 
 1059                       cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 
vaddr);
 
 1064             if (!
p->pTable->translate(
vaddr, paddr)) {
 
 1065                 if (!
p->fixupFault(
vaddr)) {
 
 1066                     panic(
"CU%d: WF[%d][%d]: Fault on addr %#x!\n",
 
 1067                           cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
 
 1084             tlbPort[tlbPort_index].sendFunctional(pkt);
 
 1087             int hit_level = translation_state->
hitLevel;
 
 1088             assert(hit_level != -1);
 
 1093                 safe_cast<GpuTranslationState*>(pkt->
senderState);
 
 1096             delete sender_state->
saved;
 
 1097             delete sender_state;
 
 1099             assert(pkt->
req->hasPaddr());
 
 1100             assert(pkt->
req->hasSize());
 
 1110                 uint8_t *tmpData = oldPkt->
getPtr<uint8_t>();
 
 1121             gpuDynInst->memStatusVector[pkt->
getAddr()].push_back(
index);
 
 1122             gpuDynInst->tlbHitLevel[
index] = hit_level;
 
 1129             DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: index %d, addr %#x data " 
 1130                     "scheduled\n", 
cu_id, gpuDynInst->simdId,
 
 1131                     gpuDynInst->wfSlotId, 
index, pkt->
req->getPaddr());
 
 1134         } 
else if (
tlbPort[tlbPort_index].isStalled()) {
 
 1135             assert(
tlbPort[tlbPort_index].retries.size() > 0);
 
 1137             DPRINTF(GPUTLB, 
"CU%d: WF[%d][%d]: Translation for addr %#x " 
 1138                     "failed!\n", 
cu_id, gpuDynInst->simdId,
 
 1139                     gpuDynInst->wfSlotId, tmp_vaddr);
 
 1141             tlbPort[tlbPort_index].retries.push_back(pkt);
 
 1142         } 
else if (!
tlbPort[tlbPort_index].sendTimingReq(pkt)) {
 
 1147             tlbPort[tlbPort_index].stallPort();
 
 1149             DPRINTF(GPUTLB, 
"CU%d: WF[%d][%d]: Translation for addr %#x " 
 1150                     "failed!\n", 
cu_id, gpuDynInst->simdId,
 
 1151                     gpuDynInst->wfSlotId, tmp_vaddr);
 
 1153             tlbPort[tlbPort_index].retries.push_back(pkt);
 
 1156                    "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
 
 1157                    cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
 
 1161             gpuDynInst->resetEntireStatusVector();
 
 1163             gpuDynInst->decrementStatusVector(
index);
 
 1173         tlbPort[tlbPort_index].sendFunctional(pkt);
 
 1183         memPort[0].sendFunctional(new_pkt);
 
 1185         DPRINTF(GPUMem, 
"Functional sendRequest\n");
 
 1186         DPRINTF(GPUMem, 
"CU%d: WF[%d][%d]: index %d: addr %#x\n", 
cu_id,
 
 1187                 gpuDynInst->simdId, gpuDynInst->wfSlotId, 
index,
 
 1188                 new_pkt->
req->getPaddr());
 
 1192              safe_cast<GpuTranslationState*>(pkt->
senderState);
 
 1222         DPRINTF(GPUTLB, 
"sent scalar %s translation request for addr %#x\n",
 
 1224                 pkt->
req->getVaddr());
 
 1233     assert(gpuDynInst->isGlobalSeg() ||
 
 1234            gpuDynInst->executedAs() == enums::SC_GLOBAL);
 
 1237         req = std::make_shared<Request>(
 
 1246     if (kernelMemSync) {
 
 1247         if (gpuDynInst->isKernelLaunch()) {
 
 1249             req->setReqInstSeqNum(gpuDynInst->seqNum());
 
 1256               memPort[0].createMemReqEvent(pkt);
 
 1258             DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: index %d, addr %#x scheduling " 
 1259                     "an acquire\n", 
cu_id, gpuDynInst->simdId,
 
 1260                     gpuDynInst->wfSlotId, 0, pkt->
req->getPaddr());
 
 1267           assert(gpuDynInst->isEndOfKernel());
 
 1270           req->setReqInstSeqNum(gpuDynInst->seqNum());
 
 1277             memPort[0].createMemReqEvent(pkt);
 
 1279           DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: index %d, addr %#x scheduling " 
 1280                   "a release\n", 
cu_id, gpuDynInst->simdId,
 
 1281                   gpuDynInst->wfSlotId, 0, pkt->
req->getPaddr());
 
 1286         gpuDynInst->setRequestFlags(req);
 
 1288         req->setReqInstSeqNum(gpuDynInst->seqNum());
 
 1295           memPort[0].createMemReqEvent(pkt);
 
 1298                 "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
 
 1299                 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
 
 1300                 pkt->
req->getPaddr());
 
 1310         safe_cast<DataPort::SenderState*>(pkt->
senderState);
 
 1317     DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
 
 1318             compute_unit->
cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
 
 1319             pkt->
req->getPaddr(), 
id);
 
 1321     Addr paddr = pkt->
req->getPaddr();
 
 1335     int index = gpuDynInst->memStatusVector[paddr].back();
 
 1337     DPRINTF(GPUMem, 
"Response for addr %#x, index %d\n",
 
 1338             pkt->
req->getPaddr(), 
id);
 
 1340     gpuDynInst->memStatusVector[paddr].pop_back();
 
 1341     gpuDynInst->pAddr = pkt->
req->getPaddr();
 
 1343     gpuDynInst->decrementStatusVector(
index);
 
 1344     DPRINTF(GPUMem, 
"bitvector is now %s\n", gpuDynInst->printStatusVector());
 
 1346     if (gpuDynInst->allLanesZero()) {
 
 1347         auto iter = gpuDynInst->memStatusVector.begin();
 
 1348         auto end = gpuDynInst->memStatusVector.end();
 
 1350         while (iter != end) {
 
 1351             assert(iter->second.empty());
 
 1358         if (compute_unit->
headTailMap.count(gpuDynInst)) {
 
 1364         gpuDynInst->memStatusVector.clear();
 
 1370         DPRINTF(GPUMem, 
"CU%d: WF[%d][%d]: packet totally complete\n",
 
 1371                 compute_unit->
cu_id, gpuDynInst->simdId,
 
 1372                 gpuDynInst->wfSlotId);
 
 1375             if (!compute_unit->
headTailMap.count(gpuDynInst)) {
 
 1377                     .insert(std::make_pair(gpuDynInst, 
curTick()));
 
 1389     Addr line = pkt->
req->getPaddr();
 
 1391     DPRINTF(GPUTLB, 
"CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
 
 1392             pkt->
req->getVaddr(), line);
 
 1395     computeUnit->stats.tlbCycles += 
curTick();
 
 1399                safe_cast<GpuTranslationState*>(pkt->
senderState);
 
 1402     if (!translation_state->
tlbEntry) {
 
 1404             safe_cast<DTLBPort::SenderState*>(translation_state->
saved);
 
 1407             computeUnit->wfList[sender_state->
_gpuDynInst->simdId]
 
 1410         DPRINTFN(
"Wave %d couldn't tranlate vaddr %#x\n", 
w->wfDynId,
 
 1411                  pkt->
req->getVaddr());
 
 1415     int hit_level = translation_state->
hitLevel;
 
 1416     computeUnit->stats.hitsPerTLBLevel[hit_level]++;
 
 1418     delete translation_state->
tlbEntry;
 
 1419     assert(!translation_state->
ports.size());
 
 1425     delete translation_state;
 
 1429         safe_cast<DTLBPort::SenderState*>(pkt->
senderState);
 
 1434     gpuDynInst->memStatusVector[line].push_back(mp_index);
 
 1435     gpuDynInst->tlbHitLevel[mp_index] = hit_level;
 
 1446         panic(
"unsupported response to request conversion %s\n",
 
 1450     if (computeUnit->prefetchDepth) {
 
 1451         int simdId = gpuDynInst->simdId;
 
 1452         int wfSlotId = gpuDynInst->wfSlotId;
 
 1455         switch(computeUnit->prefetchType) {
 
 1457             last = computeUnit->lastVaddrCU[mp_index];
 
 1459         case enums::PF_PHASE:
 
 1460             last = computeUnit->lastVaddrSimd[simdId][mp_index];
 
 1463             last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
 
 1468         DPRINTF(GPUPrefetch, 
"CU[%d][%d][%d][%d]: %#x was last\n",
 
 1469                 computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
 
 1477         computeUnit->lastVaddrCU[mp_index] = 
vaddr;
 
 1478         computeUnit->lastVaddrSimd[simdId][mp_index] = 
vaddr;
 
 1479         computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = 
vaddr;
 
 1481         stride = (computeUnit->prefetchType == enums::PF_STRIDE) ?
 
 1482             computeUnit->prefetchStride: 
stride;
 
 1484         DPRINTF(GPUPrefetch, 
"%#x to: CU[%d][%d][%d][%d]\n", 
vaddr,
 
 1485                 computeUnit->cu_id, simdId, wfSlotId, mp_index);
 
 1490         for (
int pf = 1; 
pf <= computeUnit->prefetchDepth; ++
pf) {
 
 1497             RequestPtr prefetch_req = std::make_shared<Request>(
 
 1500                 computeUnit->requestorId(),
 
 1510                     computeUnit->shader->gpuTc, 
true);
 
 1513             sendFunctional(prefetch_pkt);
 
 1517                  safe_cast<GpuTranslationState*>(
 
 1523             delete prefetch_pkt;
 
 1542         computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);
 
 1544     DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
 
 1545             computeUnit->cu_id, gpuDynInst->simdId,
 
 1546             gpuDynInst->wfSlotId, mp_index, new_pkt->
req->getPaddr());
 
 1548     computeUnit->schedule(mem_req_event, 
curTick() +
 
 1549                           computeUnit->req_tick_latency);
 
 1558         [
this, pkt]{ processMemReqEvent(pkt); },
 
 1559         "ComputeUnit memory request event", 
true);
 
 1566         [
this, pkt]{ processMemRespEvent(pkt); },
 
 1567         "ComputeUnit memory response event", 
true);
 
 1575     [[maybe_unused]] 
ComputeUnit *compute_unit = computeUnit;
 
 1577     if (!(sendTimingReq(pkt))) {
 
 1578         retries.push_back(std::make_pair(pkt, gpuDynInst));
 
 1581                 "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
 
 1582                 compute_unit->
cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
 
 1583                 id, pkt->
req->getPaddr());
 
 1586                 "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data " 
 1587                 "req sent!\n", compute_unit->
cu_id, gpuDynInst->simdId,
 
 1588                 gpuDynInst->wfSlotId, gpuDynInst->seqNum(), 
id,
 
 1589                 pkt->
req->getPaddr());
 
 1596     return "ComputeUnit scalar memory request event";
 
 1602     SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
 
 1610                 "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
 
 1611                 compute_unit->
cu_id, gpuDynInst->simdId,
 
 1612                 gpuDynInst->wfSlotId, pkt->req->getPaddr());
 
 1615                 "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data " 
 1616                 "req sent!\n", compute_unit->
cu_id, gpuDynInst->simdId,
 
 1617                 gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
 
 1618                 pkt->req->getPaddr());
 
 1631     int len = retries.size();
 
 1633     DPRINTF(GPUTLB, 
"CU%d: DTLB recvReqRetry - %d pending requests\n",
 
 1634             computeUnit->cu_id, 
len);
 
 1637     assert(isStalled());
 
 1642     for (
int i = 0; 
i < 
len; ++
i) {
 
 1645         DPRINTF(GPUTLB, 
"CU%d: retrying D-translaton for address%#x", 
vaddr);
 
 1647         if (!sendTimingReq(pkt)) {
 
 1650             DPRINTF(GPUTLB, 
": failed again\n");
 
 1653             DPRINTF(GPUTLB, 
": successful\n");
 
 1654             retries.pop_front();
 
 1665         safe_cast<GpuTranslationState*>(pkt->
senderState);
 
 1669             "Translation of vaddr %#x failed\n", pkt->
req->getVaddr());
 
 1671     delete translation_state->
tlbEntry;
 
 1672     assert(!translation_state->
ports.size());
 
 1675     delete translation_state;
 
 1678         safe_cast<ScalarDTLBPort::SenderState*>(pkt->
senderState);
 
 1683     [[maybe_unused]] 
Wavefront *
w = gpuDynInst->wavefront();
 
 1685     DPRINTF(GPUTLB, 
"CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received " 
 1686         "translation: PA %#x -> %#x\n", computeUnit->cu_id, 
w->simdId,
 
 1687         w->wfSlotId, 
w->kernId, pkt->
req->getVaddr(), pkt->
req->getPaddr());
 
 1696       fatal(
"Scalar DTLB receieved unexpected MemCmd response %s\n",
 
 1707     if (!computeUnit->scalarDataPort.sendTimingReq(req_pkt)) {
 
 1708         computeUnit->scalarDataPort.retries.push_back(req_pkt);
 
 1709         DPRINTF(GPUMem, 
"send scalar req failed for: %s\n",
 
 1710                 gpuDynInst->disassemble());
 
 1712         DPRINTF(GPUMem, 
"send scalar req for: %s\n",
 
 1713                 gpuDynInst->disassemble());
 
 1722     [[maybe_unused]] 
Addr line = pkt->
req->getPaddr();
 
 1723     DPRINTF(GPUTLB, 
"CU%d: ITLBPort received %#x->%#x\n",
 
 1724             computeUnit->cu_id, pkt->
req->getVaddr(), line);
 
 1730         = safe_cast<GpuTranslationState*>(pkt->
senderState);
 
 1732     bool success = translation_state->
tlbEntry != 
nullptr;
 
 1733     delete translation_state->
tlbEntry;
 
 1734     assert(!translation_state->
ports.size());
 
 1736     delete translation_state;
 
 1740         safe_cast<ITLBPort::SenderState*>(pkt->
senderState);
 
 1753         computeUnit->fetchStage.fetch(pkt, wavefront);
 
 1776     int len = retries.size();
 
 1777     DPRINTF(GPUTLB, 
"CU%d: ITLB recvReqRetry - %d pending requests\n", 
len);
 
 1780     assert(isStalled());
 
 1786     for (
int i = 0; 
i < 
len; ++
i) {
 
 1789         DPRINTF(GPUTLB, 
"CU%d: retrying I-translaton for address%#x", 
vaddr);
 
 1791         if (!sendTimingReq(pkt)) {
 
 1793             DPRINTF(GPUTLB, 
": failed again\n");
 
 1796             DPRINTF(GPUTLB, 
": successful\n");
 
 1797             retries.pop_front();
 
 1805     if (gpuDynInst->isScalar()) {
 
 1806         if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
 
 1809         } 
else if (gpuDynInst->isLoad()) {
 
 1811         } 
else if (gpuDynInst->isStore()) {
 
 1815         if (gpuDynInst->isALU()) {
 
 1823                 += gpuDynInst->wavefront()->execMask().count();
 
 1824         } 
else if (gpuDynInst->isFlat()) {
 
 1825             if (gpuDynInst->isLocalMem()) {
 
 1830         } 
else if (gpuDynInst->isFlatGlobal()) {
 
 1832         } 
else if (gpuDynInst->isLocalMem()) {
 
 1834         } 
else if (gpuDynInst->isLoad()) {
 
 1836         } 
else if (gpuDynInst->isStore()) {
 
 1840         if (gpuDynInst->isLoad()) {
 
 1841             switch (gpuDynInst->executedAs()) {
 
 1842               case enums::SC_SPILL:
 
 1845               case enums::SC_GLOBAL:
 
 1848               case enums::SC_GROUP:
 
 1851               case enums::SC_PRIVATE:
 
 1854               case enums::SC_READONLY:
 
 1857               case enums::SC_KERNARG:
 
 1870                 fatal(
"%s has no valid segment\n", gpuDynInst->disassemble());
 
 1873         } 
else if (gpuDynInst->isStore()) {
 
 1874             switch (gpuDynInst->executedAs()) {
 
 1875               case enums::SC_SPILL:
 
 1878               case enums::SC_GLOBAL:
 
 1881               case enums::SC_GROUP:
 
 1884               case enums::SC_PRIVATE:
 
 1887               case enums::SC_READONLY:
 
 1890               case enums::SC_KERNARG:
 
 1903                 fatal(
"%s has no valid segment\n", gpuDynInst->disassemble());
 
 1927         *page_stat_file << 
"page, wavefront accesses, workitem accesses" <<
 
 1931             *page_stat_file << std::hex << iter.first << 
",";
 
 1932             *page_stat_file << std::dec << iter.second.first << 
",";
 
 1933             *page_stat_file << std::dec << iter.second.second << std::endl;
 
 1970     const uint32_t wgId)
 const 
 1980     for (
int i_wf = 0; i_wf < 
shader->
n_wf; ++i_wf){
 
 1999     RequestPtr newRequest = std::make_shared<Request>();
 
 2000     newRequest->setPaddr(0x0);
 
 2020     fatal_if(!senderState, 
"did not get the right sort of sender state");
 
 2027     computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
 
 2041     fatal_if(!sender_state, 
"packet without a valid sender state");
 
 2046         fatal_if(retries.empty(), 
"must have retries waiting to be stalled");
 
 2050         DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: LDS send failed!\n",
 
 2051                         computeUnit->cu_id, gpuDynInst->simdId,
 
 2052                         gpuDynInst->wfSlotId);
 
 2060         DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
 
 2061                 computeUnit->cu_id, gpuDynInst->simdId,
 
 2062                 gpuDynInst->wfSlotId, pkt->
req->getPaddr());
 
 2065         DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
 
 2066                 computeUnit->cu_id, gpuDynInst->simdId,
 
 2067                 gpuDynInst->wfSlotId, pkt->
req->getPaddr());
 
 2081     auto queueSize = retries.size();
 
 2083     DPRINTF(GPUPort, 
"CU%d: LDSPort recvReqRetry - %d pending requests\n",
 
 2084             computeUnit->cu_id, queueSize);
 
 2087              "why was there a recvReqRetry() with no pending reqs?");
 
 2089              "recvReqRetry() happened when the port was not stalled");
 
 2093     while (!retries.empty()) {
 
 2096         DPRINTF(GPUPort, 
"CU%d: retrying LDS send\n", computeUnit->cu_id);
 
 2101             DPRINTF(GPUPort, 
": LDS send failed again\n");
 
 2104             DPRINTF(GPUPort, ": LDS send successful\n");  // LDS port trace: same flag as the failure branch
 
 2112     : statistics::
Group(parent),
 
 2113       ADD_STAT(vALUInsts, 
"Number of vector ALU insts issued."),
 
 2114       ADD_STAT(vALUInstsPerWF, 
"The avg. number of vector ALU insts issued " 
 2116       ADD_STAT(sALUInsts, 
"Number of scalar ALU insts issued."),
 
 2117       ADD_STAT(sALUInstsPerWF, 
"The avg. number of scalar ALU insts issued " 
 2120                "Number of cycles needed to execute VALU insts."),
 
 2122                "Number of cycles needed to execute SALU insts."),
 
 2123       ADD_STAT(threadCyclesVALU, 
"Number of thread cycles used to execute " 
 2124                "vector ALU ops. Similar to instCyclesVALU but multiplied by " 
 2125                "the number of active threads."),
 
 2127                "Percentage of active vector ALU threads in a wave."),
 
 2128       ADD_STAT(ldsNoFlatInsts, 
"Number of LDS insts issued, not including FLAT" 
 2129                " accesses that resolve to LDS."),
 
 2130       ADD_STAT(ldsNoFlatInstsPerWF, 
"The avg. number of LDS insts (not " 
 2131                "including FLAT accesses that resolve to LDS) per-wavefront."),
 
 2133                "The number of FLAT insts that resolve to vmem issued."),
 
 2134       ADD_STAT(flatVMemInstsPerWF, 
"The average number of FLAT insts that " 
 2135                "resolve to vmem issued per-wavefront."),
 
 2137                "The number of FLAT insts that resolve to LDS issued."),
 
 2138       ADD_STAT(flatLDSInstsPerWF, 
"The average number of FLAT insts that " 
 2139                "resolve to LDS issued per-wavefront."),
 
 2141                "Number of vector mem write insts (excluding FLAT insts)."),
 
 2142       ADD_STAT(vectorMemWritesPerWF, 
"The average number of vector mem write " 
 2143                "insts (excluding FLAT insts) per-wavefront."),
 
 2145                "Number of vector mem read insts (excluding FLAT insts)."),
 
 2146       ADD_STAT(vectorMemReadsPerWF, 
"The avg. number of vector mem read insts " 
 2147                "(excluding FLAT insts) per-wavefront."),
 
 2148       ADD_STAT(scalarMemWrites, 
"Number of scalar mem write insts."),
 
 2150                "The average number of scalar mem write insts per-wavefront."),
 
 2151       ADD_STAT(scalarMemReads, 
"Number of scalar mem read insts."),
 
 2153                "The average number of scalar mem read insts per-wavefront."),
 
 2154       ADD_STAT(vectorMemReadsPerKiloInst,
 
 2155                "Number of vector mem reads per kilo-instruction"),
 
 2156       ADD_STAT(vectorMemWritesPerKiloInst,
 
 2157                "Number of vector mem writes per kilo-instruction"),
 
 2158       ADD_STAT(vectorMemInstsPerKiloInst,
 
 2159                "Number of vector mem insts per kilo-instruction"),
 
 2160       ADD_STAT(scalarMemReadsPerKiloInst,
 
 2161                "Number of scalar mem reads per kilo-instruction"),
 
 2162       ADD_STAT(scalarMemWritesPerKiloInst,
 
 2163                "Number of scalar mem writes per kilo-instruction"),
 
 2164       ADD_STAT(scalarMemInstsPerKiloInst,
 
 2165                "Number of scalar mem insts per kilo-instruction"),
 
 2166       ADD_STAT(instCyclesVMemPerSimd, 
"Number of cycles to send address, " 
 2167                "command, data from VRF to vector memory unit, per SIMD"),
 
 2168       ADD_STAT(instCyclesScMemPerSimd, 
"Number of cycles to send address, " 
 2169                "command, data from SRF to scalar memory unit, per SIMD"),
 
 2170       ADD_STAT(instCyclesLdsPerSimd, 
"Number of cycles to send address, " 
 2171                "command, data from VRF to LDS unit, per SIMD"),
 
 2172       ADD_STAT(globalReads, 
"Number of reads to the global segment"),
 
 2173       ADD_STAT(globalWrites, 
"Number of writes to the global segment"),
 
 2175                "Number of memory instructions sent to the global segment"),
 
 2176       ADD_STAT(argReads, 
"Number of reads to the arg segment"),
 
 2177       ADD_STAT(argWrites, "Number of writes to the arg segment"),
 
 2179                "Number of memory instructions sent to the arg segment"),
 
 2180       ADD_STAT(spillReads, 
"Number of reads to the spill segment"),
 
 2181       ADD_STAT(spillWrites, 
"Number of writes to the spill segment"),
 
 2183                "Number of memory instructions sent to the spill segment"),
 
 2184       ADD_STAT(groupReads, 
"Number of reads to the group segment"),
 
 2185       ADD_STAT(groupWrites, 
"Number of writes to the group segment"),
 
 2187                "Number of memory instructions sent to the group segment"),
 
 2188       ADD_STAT(privReads, 
"Number of reads to the private segment"),
 
 2189       ADD_STAT(privWrites, 
"Number of writes to the private segment"),
 
 2191                "Number of memory instructions sent to the private segment"),
 
 2192       ADD_STAT(readonlyReads, 
"Number of reads to the readonly segment"),
 
 2194                "Number of memory instructions sent to the readonly segment"),
 
 2196                "Number of memory instructions sent to the readonly segment"),
 
 2197       ADD_STAT(kernargReads, 
"Number of reads sent to the kernarg segment"),
 
 2199                "Number of memory instructions sent to the kernarg segment"),
 
 2201                "Number of memory instructions sent to the kernarg segment"),
 
 2203                "wave level parallelism: count of active waves at wave launch"),
 
 2204       ADD_STAT(tlbRequests, 
"number of uncoalesced requests"),
 
 2206                "total number of cycles for all uncoalesced requests"),
 
 2207       ADD_STAT(tlbLatency, 
"Avg. translation latency for data translations"),
 
 2209                "TLB hits distribution (0 for page table, x for Lx-TLB)"),
 
 2210       ADD_STAT(ldsBankAccesses, 
"Total number of LDS bank accesses"),
 
 2212                "Number of bank conflicts per LDS memory packet"),
 
 2214                "pages touched per wf (over all mem. instr.)"),
 
 2216                "dynamic non-flat global memory instruction count"),
 
 2218                "dynamic flat global memory instruction count"),
 
 2219       ADD_STAT(dynamicLMemInstrCnt, "dynamic local memory instruction count"),
 
 2220       ADD_STAT(wgBlockedDueBarrierAllocation,
 
 2221                "WG dispatch was blocked due to lack of barrier resources"),
 
 2222       ADD_STAT(wgBlockedDueLdsAllocation,
 
 2223                "Workgroup blocked due to LDS capacity"),
 
 2224       ADD_STAT(numInstrExecuted, 
"number of instructions executed"),
 
 2225       ADD_STAT(execRateDist, 
"Instruction Execution Rate: Number of executed " 
 2226                "vector instructions per cycle"),
 
 2228                "number of vec ops executed (e.g. WF size/inst)"),
 
 2230                "number of f16 vec ops executed (e.g. WF size/inst)"),
 
 2232                "number of f32 vec ops executed (e.g. WF size/inst)"),
 
 2234                "number of f64 vec ops executed (e.g. WF size/inst)"),
 
 2236                "number of fma16 vec ops executed (e.g. WF size/inst)"),
 
 2238                "number of fma32 vec ops executed (e.g. WF size/inst)"),
 
 2240                "number of fma64 vec ops executed (e.g. WF size/inst)"),
 
 2242                "number of mac16 vec ops executed (e.g. WF size/inst)"),
 
 2244                "number of mac32 vec ops executed (e.g. WF size/inst)"),
 
 2246                "number of mac64 vec ops executed (e.g. WF size/inst)"),
 
 2248                "number of mad16 vec ops executed (e.g. WF size/inst)"),
 
 2250                "number of mad32 vec ops executed (e.g. WF size/inst)"),
 
 2252                "number of mad64 vec ops executed (e.g. WF size/inst)"),
 
 2254                "number of two op FP vec ops executed (e.g. WF size/inst)"),
 
 2255       ADD_STAT(totalCycles, 
"number of cycles the CU ran for"),
 
 2256       ADD_STAT(
vpc, 
"Vector Operations per cycle (this CU only)"),
 
 2257       ADD_STAT(vpc_f16, 
"F16 Vector Operations per cycle (this CU only)"),
 
 2258       ADD_STAT(vpc_f32, 
"F32 Vector Operations per cycle (this CU only)"),
 
 2259       ADD_STAT(vpc_f64, 
"F64 Vector Operations per cycle (this CU only)"),
 
 2260       ADD_STAT(ipc, 
"Instructions per cycle (this CU only)"),
 
 2261       ADD_STAT(controlFlowDivergenceDist, 
"number of lanes active per " 
 2262                "instruction (over all instructions)"),
 
 2263       ADD_STAT(activeLanesPerGMemInstrDist,
 
 2264                "number of active lanes per global memory instruction"),
 
 2265       ADD_STAT(activeLanesPerLMemInstrDist,
 
 2266                "number of active lanes per local memory instruction"),
 
 2268                "Number of dynamic non-GM memory insts executed"),
 
 2269       ADD_STAT(numTimesWgBlockedDueVgprAlloc, 
"Number of times WGs are " 
 2270                "blocked due to VGPR allocation per SIMD"),
 
 2271       ADD_STAT(numTimesWgBlockedDueSgprAlloc, 
"Number of times WGs are " 
 2272                "blocked due to SGPR allocation per SIMD"),
 
 2273       ADD_STAT(numCASOps, 
"number of compare and swap operations"),
 
 2275                "number of compare and swap operations that failed"),
 
 2276       ADD_STAT(completedWfs, 
"number of completed wavefronts"),
 
 2277       ADD_STAT(completedWGs, 
"number of completed workgroups"),
 
 2278       ADD_STAT(headTailLatency, 
"ticks between first and last cache block " 
 2279                "arrival at coalescer"),
 
 2280       ADD_STAT(instInterleave, 
"Measure of instruction interleaving per SIMD")
 
 2333     for (
int i = 0; 
i < 4; ++
i) {