40 #include "debug/GPUDisp.hh" 
   41 #include "debug/GPUExec.hh" 
   42 #include "debug/GPUFetch.hh" 
   43 #include "debug/GPUMem.hh" 
   44 #include "debug/GPUPort.hh" 
   45 #include "debug/GPUPrefetch.hh" 
   46 #include "debug/GPUReg.hh" 
   47 #include "debug/GPURename.hh" 
   48 #include "debug/GPUSync.hh" 
   49 #include "debug/GPUTLB.hh" 
   63     numVectorGlobalMemUnits(
p.num_global_mem_pipes),
 
   64     numVectorSharedMemUnits(
p.num_shared_mem_pipes),
 
   65     numScalarMemUnits(
p.num_scalar_mem_pipes),
 
   66     numVectorALUs(
p.num_SIMDs),
 
   67     numScalarALUs(
p.num_scalar_cores),
 
   68     vrfToCoalescerBusWidth(
p.vrf_to_coalescer_bus_width),
 
   69     coalescerToVrfBusWidth(
p.coalescer_to_vrf_bus_width),
 
   70     registerManager(
p.register_manager),
 
   72     scoreboardCheckStage(
p, *this, scoreboardCheckToSchedule),
 
   73     scheduleStage(
p, *this, scoreboardCheckToSchedule, scheduleToExecute),
 
   74     execStage(
p, *this, scheduleToExecute),
 
   75     globalMemoryPipe(
p, *this),
 
   76     localMemoryPipe(
p, *this),
 
   77     scalarMemoryPipe(
p, *this),
 
   78     tickEvent([this]{ 
exec(); }, 
"Compute unit tick event",
 
   81     vrf(
p.vector_register_file), srf(
p.scalar_register_file),
 
   82     simdWidth(
p.simd_width),
 
   83     spBypassPipeLength(
p.spbypass_pipe_length),
 
   84     dpBypassPipeLength(
p.dpbypass_pipe_length),
 
   85     scalarPipeStages(
p.scalar_pipe_length),
 
   86     operandNetworkLength(
p.operand_network_length),
 
   87     issuePeriod(
p.issue_period),
 
   88     vrf_gm_bus_latency(
p.vrf_gm_bus_latency),
 
   89     srf_scm_bus_latency(
p.srf_scm_bus_latency),
 
   90     vrf_lm_bus_latency(
p.vrf_lm_bus_latency),
 
   91     perLaneTLB(
p.perLaneTLB), prefetchDepth(
p.prefetch_depth),
 
   92     prefetchStride(
p.prefetch_stride), prefetchType(
p.prefetch_prev_type),
 
   93     debugSegFault(
p.debugSegFault),
 
   94     functionalTLB(
p.functionalTLB), localMemBarrier(
p.localMemBarrier),
 
   95     countPages(
p.countPages),
 
   96     req_tick_latency(
p.mem_req_latency * 
p.clk_domain->clockPeriod()),
 
   97     resp_tick_latency(
p.mem_resp_latency * 
p.clk_domain->clockPeriod()),
 
   98     _requestorId(
p.system->getRequestorId(
this, 
"ComputeUnit")),
 
   99     lds(*
p.localDataStore), gmTokenPort(
name() + 
".gmTokenPort", 
this),
 
  105     _cacheLineSize(
p.system->cacheLineSize()),
 
  106     _numBarrierSlots(
p.num_barrier_slots),
 
  107     globalSeqNum(0), wavefrontSize(
p.wf_size),
 
  108     scoreboardCheckToSchedule(
p),
 
  109     scheduleToExecute(
p),
 
  121     fatal_if(
p.wf_size > std::numeric_limits<unsigned long long>::digits ||
 
  123              "WF size is larger than the host can support");
 
  125              "Wavefront size should be a power of 2");
 
  128     numCyclesPerStoreTransfer =
 
  129         (uint32_t)ceil((
double)(wfSize() * 
sizeof(uint32_t)) /
 
  130                 (double)vrfToCoalescerBusWidth);
 
  132     numCyclesPerLoadTransfer = (wfSize() * 
sizeof(uint32_t))
 
  133                                / coalescerToVrfBusWidth;
 
  136     idleWfs = 
p.n_wf * numVectorALUs;
 
  137     lastVaddrWF.resize(numVectorALUs);
 
  138     wfList.resize(numVectorALUs);
 
  140     wfBarrierSlots.resize(
p.num_barrier_slots, 
WFBarrier());
 
  142     for (
int i = 0; 
i < 
p.num_barrier_slots; ++
i) {
 
  143         freeBarrierIds.insert(
i);
 
  146     for (
int j = 0; 
j < numVectorALUs; ++
j) {
 
  147         lastVaddrWF[
j].resize(
p.n_wf);
 
  149         for (
int i = 0; 
i < 
p.n_wf; ++
i) {
 
  150             lastVaddrWF[
j][
i].resize(wfSize());
 
  152             wfList[
j].push_back(
p.wavefronts[
j * 
p.n_wf + 
i]);
 
  153             wfList[
j][
i]->setParent(
this);
 
  155             for (
int k = 0; 
k < wfSize(); ++
k) {
 
  156                 lastVaddrWF[
j][
i][
k] = 0;
 
  161     lastVaddrSimd.resize(numVectorALUs);
 
  163     for (
int i = 0; 
i < numVectorALUs; ++
i) {
 
  164         lastVaddrSimd[
i].resize(wfSize(), 0);
 
  167     lastVaddrCU.resize(wfSize());
 
  171     if (
p.execPolicy == 
"OLDEST-FIRST") {
 
  173     } 
else if (
p.execPolicy == 
"ROUND-ROBIN") {
 
  176         fatal(
"Invalid WF execution policy (CU)\n");
 
  179     for (
int i = 0; 
i < 
p.port_memory_port_connection_count; ++
i) {
 
  183     for (
int i = 0; 
i < 
p.port_translation_port_connection_count; ++
i) {
 
  193     lastExecCycle.resize(numVectorALUs, 0);
 
  195     for (
int i = 0; 
i < vrf.size(); ++
i) {
 
  196         vrf[
i]->setParent(
this);
 
  198     for (
int i = 0; 
i < srf.size(); ++
i) {
 
  199         srf[
i]->setParent(
this);
 
  201     numVecRegsPerSimd = vrf[0]->numRegs();
 
  202     numScalarRegsPerSimd = srf[0]->numRegs();
 
  204     registerManager->setParent(
this);
 
  208     instExecPerSimd.resize(numVectorALUs, 0);
 
  212         "Cache line size should be a power of two.");
 
  213     cacheLineBits = 
floorLog2(_cacheLineSize);
 
  296     w->workGroupSz[0] = task->
wgSize(0);
 
  297     w->workGroupSz[1] = task->
wgSize(1);
 
  298     w->workGroupSz[2] = task->
wgSize(2);
 
  299     w->wgSz = 
w->workGroupSz[0] * 
w->workGroupSz[1] * 
w->workGroupSz[2];
 
  303     w->computeActualWgSz(task);
 
  310     static int _n_wave = 0;
 
  316         if (
k + waveId * 
wfSize() < 
w->actualWgSzTotal)
 
  320     w->execMask() = init_mask;
 
  324     w->initMask = init_mask.to_ullong();
 
  327         w->barrierId(bar_id);
 
  329         assert(!
w->hasBarrier());
 
  333         w->workItemId[0][
k] = (
k + waveId * 
wfSize()) % 
w->actualWgSz[0];
 
  334         w->workItemId[1][
k] = ((
k + waveId * 
wfSize()) / 
w->actualWgSz[0]) %
 
  336         w->workItemId[2][
k] = (
k + waveId * 
wfSize()) /
 
  337                               (
w->actualWgSz[0] * 
w->actualWgSz[1]);
 
  339         w->workItemFlatId[
k] = 
w->workItemId[2][
k] * 
w->actualWgSz[0] *
 
  340             w->actualWgSz[1] + 
w->workItemId[1][
k] * 
w->actualWgSz[0] +
 
  347     w->workGroupId[0] = 
w->wgId % task->
numWg(0);
 
  348     w->workGroupId[1] = (
w->wgId / task->
numWg(0)) % task->
numWg(1);
 
  349     w->workGroupId[2] = 
w->wgId / (task->
numWg(0) * task->
numWg(1));
 
  352     w->ldsChunk = ldsChunk;
 
  354     M5_VAR_USED int32_t refCount =
 
  356     DPRINTF(GPUDisp, 
"CU%d: increase ref ctr wg[%d] to [%d]\n",
 
  357                     cu_id, 
w->wgId, refCount);
 
  359     w->instructionBuffer.clear();
 
  364     DPRINTF(GPUDisp, 
"Scheduling wfDynId/barrier_id %d/%d on CU%d: " 
  365             "WF[%d][%d]. Ref cnt:%d\n", _n_wave, 
w->barrierId(), 
cu_id,
 
  366             w->simdId, 
w->wfSlotId, refCount);
 
  368     w->initRegState(task, 
w->actualWgSzTotal);
 
  383         = std::make_shared<GPUDynInst>(
this, 
nullptr,
 
  387     gpuDynInst->kern_id = kernId;
 
  389     req->setContext(gpuDynInst->wfDynId);
 
  422         DPRINTF(GPUDisp, 
"CU%d: Scheduling wakeup next cycle\n", 
cu_id);
 
  436     panic_if(!ldsChunk, 
"was not able to reserve space for this WG");
 
  450     if (num_wfs_in_wg > 1) {
 
  457         assert(!wf_barrier.maxBarrierCnt());
 
  458         assert(!wf_barrier.numAtBarrier());
 
  459         wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);
 
  461         DPRINTF(GPUSync, 
"CU[%d] - Dispatching WG with barrier Id%d. " 
  462                 "%d waves using this barrier.\n", 
cu_id, barrier_id,
 
  482                 DPRINTF(GPURename, 
"SIMD[%d] wfSlotId[%d] WF[%d] " 
  483                     "vregDemand[%d] sregDemand[%d]\n", 
i, 
j, 
w->wfDynId,
 
  484                     vregDemand, sregDemand);
 
  499              "Instruction Buffer of WF%d can't be empty", 
w->wgId);
 
  508              "Instruction Buffer of WF%d can't be empty", 
w->wgId);
 
  511     auto it = 
pipeMap.find(ii->seqNum());
 
  521     int trueWgSizeTotal = 1;
 
  527         trueWgSizeTotal *= trueWgSize[
d];
 
  528         DPRINTF(GPUDisp, 
"trueWgSize[%d] =  %d\n", 
d, trueWgSize[
d]);
 
  531     DPRINTF(GPUDisp, 
"trueWgSizeTotal =  %d\n", trueWgSizeTotal);
 
  534     int numWfs = (trueWgSizeTotal + 
wfSize() - 1) / 
wfSize();
 
  535     num_wfs_in_wg = numWfs;
 
  537     bool barrier_avail = 
true;
 
  540         barrier_avail = 
false;
 
  553              "WG with %d WFs and %d VGPRs per WI can not be allocated to CU " 
  554              "that has %d VGPRs\n",
 
  557              "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU " 
  564     int numMappedWfs = 0;
 
  576                 if (numMappedWfs < numWfs &&
 
  590     assert(numMappedWfs <= numWfs);
 
  592     bool vregAvail = 
true;
 
  593     bool sregAvail = 
true;
 
  595     if (numMappedWfs < numWfs) {
 
  611     DPRINTF(GPUDisp, 
"Free WF slots =  %d, Mapped WFs = %d, \ 
  612             VGPR Availability = %d, SGPR Availability = %d\n",
 
  613             freeWfSlots, numMappedWfs, vregAvail, sregAvail);
 
  630     if (!barrier_avail) {
 
  639     bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
 
  640                         && ldsAvail && barrier_avail;
 
  648     return wf_barrier.numYetToReachBarrier();
 
  655     return wf_barrier.allAtBarrier();
 
  662     wf_barrier.incNumAtBarrier();
 
  669     return wf_barrier.numAtBarrier();
 
  676     return wf_barrier.maxBarrierCnt();
 
  690     wf_barrier.decMaxBarrierCnt();
 
  697     wf_barrier.release();
 
  720     for (
auto &vecRegFile : 
vrf) {
 
  724     for (
auto &scRegFile : 
srf) {
 
  768              "No support for multiple Global Memory Pipelines exists!!!");
 
  775              "No support for multiple Local Memory Pipelines exists!!!");
 
  782              "No support for multiple Scalar Memory Pipelines exists!!!");
 
  820         if (gpuDynInst->isKernelLaunch()) {
 
  823             assert(pkt->
req->isKernel());
 
  824             assert(pkt->
req->isInvL1());
 
  839             && gpuDynInst->isEndOfKernel()) {
 
  845             assert(pkt->
req->isKernel());
 
  846             assert(pkt->
req->isGL2CacheFlush());
 
  862             DPRINTF(GPUDisp, 
"CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
 
  864                     w->wfDynId, 
w->wgId);
 
  870         if (!pkt->
req->isKernel()) {
 
  872             DPRINTF(GPUExec, 
"MemSyncResp: WF[%d][%d] WV%d %s decrementing " 
  873                             "outstanding reqs %d => %d\n", gpuDynInst->simdId,
 
  874                             gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
 
  875                             gpuDynInst->disassemble(), 
w->outstandingReqs,
 
  876                             w->outstandingReqs - 1);
 
  889             "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
 
  891             gpuDynInst->seqNum(), 
index, pkt->
req->getPaddr());
 
  902     assert(!pkt->
req->isKernel());
 
  909     assert(gpuDynInst->numScalarReqs > 0);
 
  911     gpuDynInst->numScalarReqs--;
 
  921     if (!gpuDynInst->numScalarReqs) {
 
  922         if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
 
  923                 computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
 
  926                 computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
 
  940     for (
const auto &pkt : retries) {
 
  941         if (!sendTimingReq(pkt)) {
 
  952     int len = retries.size();
 
  956     for (
int i = 0; 
i < 
len; ++
i) {
 
  958         M5_VAR_USED 
GPUDynInstPtr gpuDynInst = retries.front().second;
 
  959         DPRINTF(GPUMem, 
"CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
 
  960                 computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
 
  961                 pkt->
req->getPaddr());
 
  966         if (!sendTimingReq(pkt)) {
 
  967             DPRINTF(GPUMem, 
"failed again!\n");
 
  970             DPRINTF(GPUMem, 
"successful!\n");
 
  979     computeUnit->fetchStage.processFetchReturn(pkt);
 
  986     int len = retries.size();
 
  990     for (
int i = 0; 
i < 
len; ++
i) {
 
  992         M5_VAR_USED 
Wavefront *wavefront = retries.front().second;
 
  993         DPRINTF(GPUFetch, 
"CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
 
  994                 computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
 
  995                 pkt->
req->getPaddr());
 
  996         if (!sendTimingReq(pkt)) {
 
  997             DPRINTF(GPUFetch, 
"failed again!\n");
 
 1000             DPRINTF(GPUFetch, 
"successful!\n");
 
 1001             retries.pop_front();
 
 1010     Addr tmp_vaddr = pkt->
req->getVaddr();
 
 1015     pkt->
req->setPC(gpuDynInst->wavefront()->pc());
 
 1017     pkt->
req->setReqInstSeqNum(gpuDynInst->seqNum());
 
 1030     } 
else if (pkt->
isRead()) {
 
 1033         fatal(
"pkt is not a read nor a write\n");
 
 1045             unsigned size = pkt->
getSize();
 
 1048                 panic(
"CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
 
 1049                       cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 
vaddr);
 
 1054             if (!
p->pTable->translate(
vaddr, paddr)) {
 
 1055                 if (!
p->fixupFault(
vaddr)) {
 
 1056                     panic(
"CU%d: WF[%d][%d]: Fault on addr %#x!\n",
 
 1057                           cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
 
 1074             tlbPort[tlbPort_index].sendFunctional(pkt);
 
 1077             int hit_level = translation_state->
hitLevel;
 
 1078             assert(hit_level != -1);
 
 1083                 safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->
senderState);
 
 1086             delete sender_state->
saved;
 
 1087             delete sender_state;
 
 1089             assert(pkt->
req->hasPaddr());
 
 1090             assert(pkt->
req->hasSize());
 
 1100                 uint8_t *tmpData = oldPkt->
getPtr<uint8_t>();
 
 1111             gpuDynInst->memStatusVector[pkt->
getAddr()].push_back(
index);
 
 1112             gpuDynInst->tlbHitLevel[
index] = hit_level;
 
 1119             DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: index %d, addr %#x data " 
 1120                     "scheduled\n", 
cu_id, gpuDynInst->simdId,
 
 1121                     gpuDynInst->wfSlotId, 
index, pkt->
req->getPaddr());
 
 1124         } 
else if (
tlbPort[tlbPort_index].isStalled()) {
 
 1125             assert(
tlbPort[tlbPort_index].retries.size() > 0);
 
 1127             DPRINTF(GPUTLB, 
"CU%d: WF[%d][%d]: Translation for addr %#x " 
 1128                     "failed!\n", 
cu_id, gpuDynInst->simdId,
 
 1129                     gpuDynInst->wfSlotId, tmp_vaddr);
 
 1131             tlbPort[tlbPort_index].retries.push_back(pkt);
 
 1132         } 
else if (!
tlbPort[tlbPort_index].sendTimingReq(pkt)) {
 
 1137             tlbPort[tlbPort_index].stallPort();
 
 1139             DPRINTF(GPUTLB, 
"CU%d: WF[%d][%d]: Translation for addr %#x " 
 1140                     "failed!\n", 
cu_id, gpuDynInst->simdId,
 
 1141                     gpuDynInst->wfSlotId, tmp_vaddr);
 
 1143             tlbPort[tlbPort_index].retries.push_back(pkt);
 
 1146                    "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
 
 1147                    cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
 
 1151             gpuDynInst->resetEntireStatusVector();
 
 1153             gpuDynInst->decrementStatusVector(
index);
 
 1163         tlbPort[tlbPort_index].sendFunctional(pkt);
 
 1173         memPort[0].sendFunctional(new_pkt);
 
 1175         DPRINTF(GPUMem, 
"Functional sendRequest\n");
 
 1176         DPRINTF(GPUMem, 
"CU%d: WF[%d][%d]: index %d: addr %#x\n", 
cu_id,
 
 1177                 gpuDynInst->simdId, gpuDynInst->wfSlotId, 
index,
 
 1178                 new_pkt->
req->getPaddr());
 
 1182              safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->
senderState);
 
 1212         DPRINTF(GPUTLB, 
"sent scalar %s translation request for addr %#x\n",
 
 1214                 pkt->
req->getVaddr());
 
 1223     assert(gpuDynInst->isGlobalSeg() ||
 
 1224            gpuDynInst->executedAs() == Enums::SC_GLOBAL);
 
 1227         req = std::make_shared<Request>(
 
 1236     if (kernelMemSync) {
 
 1237         if (gpuDynInst->isKernelLaunch()) {
 
 1239             req->setReqInstSeqNum(gpuDynInst->seqNum());
 
 1246               memPort[0].createMemReqEvent(pkt);
 
 1248             DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: index %d, addr %#x scheduling " 
 1249                     "an acquire\n", 
cu_id, gpuDynInst->simdId,
 
 1250                     gpuDynInst->wfSlotId, 0, pkt->
req->getPaddr());
 
 1257           assert(gpuDynInst->isEndOfKernel());
 
 1260           req->setReqInstSeqNum(gpuDynInst->seqNum());
 
 1267             memPort[0].createMemReqEvent(pkt);
 
 1269           DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: index %d, addr %#x scheduling " 
 1270                   "a release\n", 
cu_id, gpuDynInst->simdId,
 
 1271                   gpuDynInst->wfSlotId, 0, pkt->
req->getPaddr());
 
 1276         gpuDynInst->setRequestFlags(req);
 
 1278         req->setReqInstSeqNum(gpuDynInst->seqNum());
 
 1285           memPort[0].createMemReqEvent(pkt);
 
 1288                 "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
 
 1289                 cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
 
 1290                 pkt->
req->getPaddr());
 
 1300         safe_cast<DataPort::SenderState*>(pkt->
senderState);
 
 1307     DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
 
 1308             compute_unit->
cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
 
 1309             pkt->
req->getPaddr(), 
id);
 
 1311     Addr paddr = pkt->
req->getPaddr();
 
 1325     int index = gpuDynInst->memStatusVector[paddr].back();
 
 1327     DPRINTF(GPUMem, 
"Response for addr %#x, index %d\n",
 
 1328             pkt->
req->getPaddr(), 
id);
 
 1330     gpuDynInst->memStatusVector[paddr].pop_back();
 
 1331     gpuDynInst->pAddr = pkt->
req->getPaddr();
 
 1333     gpuDynInst->decrementStatusVector(
index);
 
 1334     DPRINTF(GPUMem, 
"bitvector is now %s\n", gpuDynInst->printStatusVector());
 
 1336     if (gpuDynInst->allLanesZero()) {
 
 1337         auto iter = gpuDynInst->memStatusVector.begin();
 
 1338         auto end = gpuDynInst->memStatusVector.end();
 
 1340         while (iter != end) {
 
 1341             assert(iter->second.empty());
 
 1348         if (compute_unit->
headTailMap.count(gpuDynInst)) {
 
 1354         gpuDynInst->memStatusVector.clear();
 
 1360         DPRINTF(GPUMem, 
"CU%d: WF[%d][%d]: packet totally complete\n",
 
 1361                 compute_unit->
cu_id, gpuDynInst->simdId,
 
 1362                 gpuDynInst->wfSlotId);
 
 1365             if (!compute_unit->
headTailMap.count(gpuDynInst)) {
 
 1367                     .insert(std::make_pair(gpuDynInst, 
curTick()));
 
 1379     Addr line = pkt->
req->getPaddr();
 
 1381     DPRINTF(GPUTLB, 
"CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
 
 1382             pkt->
req->getVaddr(), line);
 
 1385     computeUnit->stats.tlbCycles += 
curTick();
 
 1389                safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->
senderState);
 
 1392     if (!translation_state->
tlbEntry) {
 
 1394             safe_cast<DTLBPort::SenderState*>(translation_state->
saved);
 
 1397             computeUnit->wfList[sender_state->
_gpuDynInst->simdId]
 
 1400         DPRINTFN(
"Wave %d couldn't tranlate vaddr %#x\n", 
w->wfDynId,
 
 1401                  pkt->
req->getVaddr());
 
 1405     int hit_level = translation_state->
hitLevel;
 
 1406     computeUnit->stats.hitsPerTLBLevel[hit_level]++;
 
 1408     delete translation_state->
tlbEntry;
 
 1409     assert(!translation_state->
ports.size());
 
 1415     delete translation_state;
 
 1419         safe_cast<DTLBPort::SenderState*>(pkt->
senderState);
 
 1424     gpuDynInst->memStatusVector[line].push_back(mp_index);
 
 1425     gpuDynInst->tlbHitLevel[mp_index] = hit_level;
 
 1436         panic(
"unsupported response to request conversion %s\n",
 
 1440     if (computeUnit->prefetchDepth) {
 
 1441         int simdId = gpuDynInst->simdId;
 
 1442         int wfSlotId = gpuDynInst->wfSlotId;
 
 1445         switch(computeUnit->prefetchType) {
 
 1447             last = computeUnit->lastVaddrCU[mp_index];
 
 1449         case Enums::PF_PHASE:
 
 1450             last = computeUnit->lastVaddrSimd[simdId][mp_index];
 
 1453             last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
 
 1458         DPRINTF(GPUPrefetch, 
"CU[%d][%d][%d][%d]: %#x was last\n",
 
 1459                 computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
 
 1467         computeUnit->lastVaddrCU[mp_index] = 
vaddr;
 
 1468         computeUnit->lastVaddrSimd[simdId][mp_index] = 
vaddr;
 
 1469         computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = 
vaddr;
 
 1471         stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
 
 1472             computeUnit->prefetchStride: 
stride;
 
 1474         DPRINTF(GPUPrefetch, 
"%#x to: CU[%d][%d][%d][%d]\n", 
vaddr,
 
 1475                 computeUnit->cu_id, simdId, wfSlotId, mp_index);
 
 1480         for (
int pf = 1; 
pf <= computeUnit->prefetchDepth; ++
pf) {
 
 1487             RequestPtr prefetch_req = std::make_shared<Request>(
 
 1490                 computeUnit->requestorId(),
 
 1500                     computeUnit->shader->gpuTc, 
true);
 
 1503             sendFunctional(prefetch_pkt);
 
 1507                  safe_cast<X86ISA::GpuTLB::TranslationState*>(
 
 1513             delete prefetch_pkt;
 
 1532         computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);
 
 1534     DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
 
 1535             computeUnit->cu_id, gpuDynInst->simdId,
 
 1536             gpuDynInst->wfSlotId, mp_index, new_pkt->
req->getPaddr());
 
 1538     computeUnit->schedule(mem_req_event, 
curTick() +
 
 1539                           computeUnit->req_tick_latency);
 
 1548         [
this, pkt]{ processMemReqEvent(pkt); },
 
 1549         "ComputeUnit memory request event", 
true);
 
 1556         [
this, pkt]{ processMemRespEvent(pkt); },
 
 1557         "ComputeUnit memory response event", 
true);
 
 1565     M5_VAR_USED 
ComputeUnit *compute_unit = computeUnit;
 
 1567     if (!(sendTimingReq(pkt))) {
 
 1568         retries.push_back(std::make_pair(pkt, gpuDynInst));
 
 1571                 "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
 
 1572                 compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
 
 1573                 id, pkt->
req->getPaddr());
 
 1576                 "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data " 
 1577                 "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
 
 1578                 gpuDynInst->wfSlotId, gpuDynInst->seqNum(), 
id,
 
 1579                 pkt->
req->getPaddr());
 
 1586     return "ComputeUnit scalar memory request event";
 
 1592     SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
 
 1600                 "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
 
 1601                 compute_unit->cu_id, gpuDynInst->simdId,
 
 1602                 gpuDynInst->wfSlotId, pkt->req->getPaddr());
 
 1605                 "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data " 
 1606                 "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
 
 1607                 gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
 
 1608                 pkt->req->getPaddr());
 
 1621     int len = retries.size();
 
 1623     DPRINTF(GPUTLB, 
"CU%d: DTLB recvReqRetry - %d pending requests\n",
 
 1624             computeUnit->cu_id, 
len);
 
 1627     assert(isStalled());
 
 1632     for (
int i = 0; 
i < 
len; ++
i) {
 
 1635         DPRINTF(GPUTLB, 
"CU%d: retrying D-translaton for address%#x", 
vaddr);
 
 1637         if (!sendTimingReq(pkt)) {
 
 1640             DPRINTF(GPUTLB, 
": failed again\n");
 
 1643             DPRINTF(GPUTLB, 
": successful\n");
 
 1644             retries.pop_front();
 
 1655         safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->
senderState);
 
 1659             "Translation of vaddr %#x failed\n", pkt->
req->getVaddr());
 
 1661     delete translation_state->
tlbEntry;
 
 1662     assert(!translation_state->
ports.size());
 
 1665     delete translation_state;
 
 1668         safe_cast<ScalarDTLBPort::SenderState*>(pkt->
senderState);
 
 1673     M5_VAR_USED 
Wavefront *
w = gpuDynInst->wavefront();
 
 1675     DPRINTF(GPUTLB, 
"CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received " 
 1676         "translation: PA %#x -> %#x\n", computeUnit->cu_id, 
w->simdId,
 
 1677         w->wfSlotId, 
w->kernId, pkt->
req->getVaddr(), pkt->
req->getPaddr());
 
 1686       fatal(
"Scalar DTLB receieved unexpected MemCmd response %s\n",
 
 1697     if (!computeUnit->scalarDataPort.sendTimingReq(req_pkt)) {
 
 1698         computeUnit->scalarDataPort.retries.push_back(req_pkt);
 
 1699         DPRINTF(GPUMem, 
"send scalar req failed for: %s\n",
 
 1700                 gpuDynInst->disassemble());
 
 1702         DPRINTF(GPUMem, 
"send scalar req for: %s\n",
 
 1703                 gpuDynInst->disassemble());
 
 1712     M5_VAR_USED 
Addr line = pkt->
req->getPaddr();
 
 1713     DPRINTF(GPUTLB, 
"CU%d: ITLBPort received %#x->%#x\n",
 
 1714             computeUnit->cu_id, pkt->
req->getVaddr(), line);
 
 1720         = safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->
senderState);
 
 1722     bool success = translation_state->
tlbEntry != 
nullptr;
 
 1723     delete translation_state->
tlbEntry;
 
 1724     assert(!translation_state->
ports.size());
 
 1726     delete translation_state;
 
 1730         safe_cast<ITLBPort::SenderState*>(pkt->
senderState);
 
 1743         computeUnit->fetchStage.fetch(pkt, wavefront);
 
 1766     int len = retries.size();
 
 1767     DPRINTF(GPUTLB, 
"CU%d: ITLB recvReqRetry - %d pending requests\n", 
len);
 
 1770     assert(isStalled());
 
 1776     for (
int i = 0; 
i < 
len; ++
i) {
 
 1779         DPRINTF(GPUTLB, 
"CU%d: retrying I-translaton for address%#x", 
vaddr);
 
 1781         if (!sendTimingReq(pkt)) {
 
 1783             DPRINTF(GPUTLB, 
": failed again\n");
 
 1786             DPRINTF(GPUTLB, 
": successful\n");
 
 1787             retries.pop_front();
 
 1795     if (gpuDynInst->isScalar()) {
 
 1796         if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
 
 1799         } 
else if (gpuDynInst->isLoad()) {
 
 1801         } 
else if (gpuDynInst->isStore()) {
 
 1805         if (gpuDynInst->isALU()) {
 
 1813                 += gpuDynInst->wavefront()->execMask().count();
 
 1814         } 
else if (gpuDynInst->isFlat()) {
 
 1815             if (gpuDynInst->isLocalMem()) {
 
 1820         } 
else if (gpuDynInst->isLocalMem()) {
 
 1822         } 
else if (gpuDynInst->isLoad()) {
 
 1824         } 
else if (gpuDynInst->isStore()) {
 
 1828         if (gpuDynInst->isLoad()) {
 
 1829             switch (gpuDynInst->executedAs()) {
 
 1830               case Enums::SC_SPILL:
 
 1833               case Enums::SC_GLOBAL:
 
 1836               case Enums::SC_GROUP:
 
 1839               case Enums::SC_PRIVATE:
 
 1842               case Enums::SC_READONLY:
 
 1845               case Enums::SC_KERNARG:
 
 1858                 fatal(
"%s has no valid segment\n", gpuDynInst->disassemble());
 
 1861         } 
else if (gpuDynInst->isStore()) {
 
 1862             switch (gpuDynInst->executedAs()) {
 
 1863               case Enums::SC_SPILL:
 
 1866               case Enums::SC_GLOBAL:
 
 1869               case Enums::SC_GROUP:
 
 1872               case Enums::SC_PRIVATE:
 
 1875               case Enums::SC_READONLY:
 
 1878               case Enums::SC_KERNARG:
 
 1891                 fatal(
"%s has no valid segment\n", gpuDynInst->disassemble());
 
 1915         *page_stat_file << 
"page, wavefront accesses, workitem accesses" <<
 
 1919             *page_stat_file << std::hex << iter.first << 
",";
 
 1920             *page_stat_file << std::dec << iter.second.first << 
",";
 
 1921             *page_stat_file << std::dec << iter.second.second << std::endl;
 
 1958     const uint32_t wgId)
 const 
 1968     for (
int i_wf = 0; i_wf < 
shader->
n_wf; ++i_wf){
 
 1987     RequestPtr newRequest = std::make_shared<Request>();
 
 1988     newRequest->setPaddr(0x0);
 
 2008     fatal_if(!senderState, 
"did not get the right sort of sender state");
 
 2015     computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
 
 2029     fatal_if(!sender_state, 
"packet without a valid sender state");
 
 2034         fatal_if(retries.empty(), 
"must have retries waiting to be stalled");
 
 2038         DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: LDS send failed!\n",
 
 2039                         computeUnit->cu_id, gpuDynInst->simdId,
 
 2040                         gpuDynInst->wfSlotId);
 
 2048         DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
 
 2049                 computeUnit->cu_id, gpuDynInst->simdId,
 
 2050                 gpuDynInst->wfSlotId, pkt->
req->getPaddr());
 
 2053         DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
 
 2054                 computeUnit->cu_id, gpuDynInst->simdId,
 
 2055                 gpuDynInst->wfSlotId, pkt->
req->getPaddr());
 
 2069     auto queueSize = retries.size();
 
 2071     DPRINTF(GPUPort, 
"CU%d: LDSPort recvReqRetry - %d pending requests\n",
 
 2072             computeUnit->cu_id, queueSize);
 
 2075              "why was there a recvReqRetry() with no pending reqs?");
 
 2077              "recvReqRetry() happened when the port was not stalled");
 
 2081     while (!retries.empty()) {
 
 2084         DPRINTF(GPUPort, 
"CU%d: retrying LDS send\n", computeUnit->cu_id);
 
 2089             DPRINTF(GPUPort, 
": LDS send failed again\n");
 
 2092             DPRINTF(GPUPort, 
": LDS send successful\n");
 
 2100       ADD_STAT(vALUInsts, 
"Number of vector ALU insts issued."),
 
 2101       ADD_STAT(vALUInstsPerWF, 
"The avg. number of vector ALU insts issued " 
 2103       ADD_STAT(sALUInsts, 
"Number of scalar ALU insts issued."),
 
 2104       ADD_STAT(sALUInstsPerWF, 
"The avg. number of scalar ALU insts issued " 
 2107                "Number of cycles needed to execute VALU insts."),
 
 2109                "Number of cycles needed to execute SALU insts."),
 
 2110       ADD_STAT(threadCyclesVALU, 
"Number of thread cycles used to execute " 
 2111                "vector ALU ops. Similar to instCyclesVALU but multiplied by " 
 2112                "the number of active threads."),
 
 2114                "Percentage of active vector ALU threads in a wave."),
 
 2115       ADD_STAT(ldsNoFlatInsts, 
"Number of LDS insts issued, not including FLAT" 
 2116                " accesses that resolve to LDS."),
 
 2117       ADD_STAT(ldsNoFlatInstsPerWF, 
"The avg. number of LDS insts (not " 
 2118                "including FLAT accesses that resolve to LDS) per-wavefront."),
 
 2120                "The number of FLAT insts that resolve to vmem issued."),
 
 2121       ADD_STAT(flatVMemInstsPerWF, 
"The average number of FLAT insts that " 
 2122                "resolve to vmem issued per-wavefront."),
 
 2124                "The number of FLAT insts that resolve to LDS issued."),
 
 2125       ADD_STAT(flatLDSInstsPerWF, 
"The average number of FLAT insts that " 
 2126                "resolve to LDS issued per-wavefront."),
 
 2128                "Number of vector mem write insts (excluding FLAT insts)."),
 
 2129       ADD_STAT(vectorMemWritesPerWF, 
"The average number of vector mem write " 
 2130                "insts (excluding FLAT insts) per-wavefront."),
 
 2132                "Number of vector mem read insts (excluding FLAT insts)."),
 
 2133       ADD_STAT(vectorMemReadsPerWF, 
"The avg. number of vector mem read insts " 
 2134                "(excluding FLAT insts) per-wavefront."),
 
 2135       ADD_STAT(scalarMemWrites, 
"Number of scalar mem write insts."),
 
 2137                "The average number of scalar mem write insts per-wavefront."),
 
 2138       ADD_STAT(scalarMemReads, 
"Number of scalar mem read insts."),
 
 2140                "The average number of scalar mem read insts per-wavefront."),
 
 2141       ADD_STAT(vectorMemReadsPerKiloInst,
 
 2142                "Number of vector mem reads per kilo-instruction"),
 
 2143       ADD_STAT(vectorMemWritesPerKiloInst,
 
 2144                "Number of vector mem writes per kilo-instruction"),
 
 2145       ADD_STAT(vectorMemInstsPerKiloInst,
 
 2146                "Number of vector mem insts per kilo-instruction"),
 
 2147       ADD_STAT(scalarMemReadsPerKiloInst,
 
 2148                "Number of scalar mem reads per kilo-instruction"),
 
 2149       ADD_STAT(scalarMemWritesPerKiloInst,
 
 2150                "Number of scalar mem writes per kilo-instruction"),
 
 2151       ADD_STAT(scalarMemInstsPerKiloInst,
 
 2152                "Number of scalar mem insts per kilo-instruction"),
 
 2153       ADD_STAT(instCyclesVMemPerSimd, 
"Number of cycles to send address, " 
 2154                "command, data from VRF to vector memory unit, per SIMD"),
 
 2155       ADD_STAT(instCyclesScMemPerSimd, 
"Number of cycles to send address, " 
 2156                "command, data from SRF to scalar memory unit, per SIMD"),
 
 2157       ADD_STAT(instCyclesLdsPerSimd, 
"Number of cycles to send address, " 
 2158                "command, data from VRF to LDS unit, per SIMD"),
 
 2159       ADD_STAT(globalReads, 
"Number of reads to the global segment"),
 
 2160       ADD_STAT(globalWrites, 
"Number of writes to the global segment"),
 
 2162                "Number of memory instructions sent to the global segment"),
 
 2163       ADD_STAT(argReads, 
"Number of reads to the arg segment"),
 
 2164       ADD_STAT(argWrites, 
"Number of writes to the arg segment"),
 
 2166                "Number of memory instructions sent to the arg segment"),
 
 2167       ADD_STAT(spillReads, 
"Number of reads to the spill segment"),
 
 2168       ADD_STAT(spillWrites, 
"Number of writes to the spill segment"),
 
 2170                "Number of memory instructions sent to the spill segment"),
 
 2171       ADD_STAT(groupReads, 
"Number of reads to the group segment"),
 
 2172       ADD_STAT(groupWrites, 
"Number of writes to the group segment"),
 
 2174                "Number of memory instructions sent to the group segment"),
 
 2175       ADD_STAT(privReads, 
"Number of reads to the private segment"),
 
 2176       ADD_STAT(privWrites, 
"Number of writes to the private segment"),
 
 2178                "Number of memory instructions sent to the private segment"),
 
 2179       ADD_STAT(readonlyReads, 
"Number of reads to the readonly segment"),
 
 2181                "Number of memory instructions sent to the readonly segment"),
 
 2183                "Number of memory instructions sent to the readonly segment"),
 
 2184       ADD_STAT(kernargReads, 
"Number of reads sent to the kernarg segment"),
 
 2186                "Number of memory instructions sent to the kernarg segment"),
 
 2188                "Number of memory instructions sent to the kernarg segment"),
 
 2190                "wave level parallelism: count of active waves at wave launch"),
 
 2191       ADD_STAT(tlbRequests, 
"number of uncoalesced requests"),
 
 2193                "total number of cycles for all uncoalesced requests"),
 
 2194       ADD_STAT(tlbLatency, 
"Avg. translation latency for data translations"),
 
 2196                "TLB hits distribution (0 for page table, x for Lx-TLB)"),
 
 2197       ADD_STAT(ldsBankAccesses, 
"Total number of LDS bank accesses"),
 
 2199                "Number of bank conflicts per LDS memory packet"),
 
 2201                "pages touched per wf (over all mem. instr.)"),
 
 2203                "dynamic non-flat global memory instruction count"),
 
 2205                "dynamic flat global memory instruction count"),
 
 2206       ADD_STAT(dynamicLMemInstrCnt, 
"dynamic local memory instruction count"),
 
 2207       ADD_STAT(wgBlockedDueBarrierAllocation,
 
 2208                "WG dispatch was blocked due to lack of barrier resources"),
 
 2209       ADD_STAT(wgBlockedDueLdsAllocation,
 
 2210                "Workgroup blocked due to LDS capacity"),
 
 2211       ADD_STAT(numInstrExecuted, 
"number of instructions executed"),
 
 2212       ADD_STAT(execRateDist, 
"Instruction Execution Rate: Number of executed " 
 2213                "vector instructions per cycle"),
 
 2215                "number of vec ops executed (e.g. WF size/inst)"),
 
 2217                "number of f16 vec ops executed (e.g. WF size/inst)"),
 
 2219                "number of f32 vec ops executed (e.g. WF size/inst)"),
 
 2221                "number of f64 vec ops executed (e.g. WF size/inst)"),
 
 2223                "number of fma16 vec ops executed (e.g. WF size/inst)"),
 
 2225                "number of fma32 vec ops executed (e.g. WF size/inst)"),
 
 2227                "number of fma64 vec ops executed (e.g. WF size/inst)"),
 
 2229                "number of mac16 vec ops executed (e.g. WF size/inst)"),
 
 2231                "number of mac32 vec ops executed (e.g. WF size/inst)"),
 
 2233                "number of mac64 vec ops executed (e.g. WF size/inst)"),
 
 2235                "number of mad16 vec ops executed (e.g. WF size/inst)"),
 
 2237                "number of mad32 vec ops executed (e.g. WF size/inst)"),
 
 2239                "number of mad64 vec ops executed (e.g. WF size/inst)"),
 
 2241                "number of two op FP vec ops executed (e.g. WF size/inst)"),
 
 2242       ADD_STAT(totalCycles, 
"number of cycles the CU ran for"),
 
 2243       ADD_STAT(
vpc, 
"Vector Operations per cycle (this CU only)"),
 
 2244       ADD_STAT(vpc_f16, 
"F16 Vector Operations per cycle (this CU only)"),
 
 2245       ADD_STAT(vpc_f32, 
"F32 Vector Operations per cycle (this CU only)"),
 
 2246       ADD_STAT(vpc_f64, 
"F64 Vector Operations per cycle (this CU only)"),
 
 2247       ADD_STAT(ipc, 
"Instructions per cycle (this CU only)"),
 
 2248       ADD_STAT(controlFlowDivergenceDist, 
"number of lanes active per " 
 2249                "instruction (over all instructions)"),
 
 2250       ADD_STAT(activeLanesPerGMemInstrDist,
 
 2251                "number of active lanes per global memory instruction"),
 
 2252       ADD_STAT(activeLanesPerLMemInstrDist,
 
 2253                "number of active lanes per local memory instruction"),
 
 2255                "Number of dynamic non-GM memory insts executed"),
 
 2256       ADD_STAT(numTimesWgBlockedDueVgprAlloc, 
"Number of times WGs are " 
 2257                "blocked due to VGPR allocation per SIMD"),
 
 2258       ADD_STAT(numTimesWgBlockedDueSgprAlloc, 
"Number of times WGs are " 
 2259                "blocked due to SGPR allocation per SIMD"),
 
 2260       ADD_STAT(numCASOps, 
"number of compare and swap operations"),
 
 2262                "number of compare and swap operations that failed"),
 
 2263       ADD_STAT(completedWfs, 
"number of completed wavefronts"),
 
 2264       ADD_STAT(completedWGs, 
"number of completed workgroups"),
 
 2265       ADD_STAT(headTailLatency, 
"ticks between first and last cache block " 
 2266                "arrival at coalescer"),
 
 2267       ADD_STAT(instInterleave, 
"Measure of instruction interleaving per SIMD")
 
 2319     for (
int i = 0; 
i < 4; ++
i) {