39#include "debug/GPUDisp.hh" 
   40#include "debug/GPUExec.hh" 
   41#include "debug/GPUFetch.hh" 
   42#include "debug/GPUMem.hh" 
   43#include "debug/GPUPort.hh" 
   44#include "debug/GPUPrefetch.hh" 
   45#include "debug/GPUReg.hh" 
   46#include "debug/GPURename.hh" 
   47#include "debug/GPUSync.hh" 
   48#include "debug/GPUTLB.hh" 
   49#include "debug/GPUTrace.hh" 
   50#include "enums/GfxVersion.hh" 
   87    vrf(
p.vector_register_file), srf(
p.scalar_register_file),
 
   88    rfc(
p.register_file_cache),
 
   89    simdWidth(
p.simd_width),
 
   90    spBypassPipeLength(
p.spbypass_pipe_length),
 
   91    dpBypassPipeLength(
p.dpbypass_pipe_length),
 
   92    rfcPipeLength(
p.rfc_pipe_length),
 
   93    scalarPipeStages(
p.scalar_pipe_length),
 
   94    operandNetworkLength(
p.operand_network_length),
 
   95    issuePeriod(
p.issue_period),
 
   96    vrf_gm_bus_latency(
p.vrf_gm_bus_latency),
 
   97    srf_scm_bus_latency(
p.srf_scm_bus_latency),
 
   98    vrf_lm_bus_latency(
p.vrf_lm_bus_latency),
 
   99    perLaneTLB(
p.perLaneTLB), prefetchDepth(
p.prefetch_depth),
 
  100    prefetchStride(
p.prefetch_stride), prefetchType(
p.prefetch_prev_type),
 
  101    debugSegFault(
p.debugSegFault),
 
  102    functionalTLB(
p.functionalTLB), localMemBarrier(
p.localMemBarrier),
 
  103    countPages(
p.countPages),
 
  104    req_tick_latency(
p.mem_req_latency * 
p.clk_domain->clockPeriod()),
 
  105    resp_tick_latency(
p.mem_resp_latency * 
p.clk_domain->clockPeriod()),
 
  106    scalar_req_tick_latency(
 
  107            p.scalar_mem_req_latency * 
p.clk_domain->clockPeriod()),
 
  108    scalar_resp_tick_latency(
 
  109            p.scalar_mem_resp_latency * 
p.clk_domain->clockPeriod()),
 
  110    memtime_latency(
p.memtime_latency * 
p.clk_domain->clockPeriod()),
 
  111    mfma_scale(
p.mfma_scale),
 
  119        {GfxVersion::gfx90a, {
 
  120            {
"v_mfma_f32_32x32x1_2b_f32", 64},
 
  121            {
"v_mfma_f32_16x16x1_4b_f32", 32},
 
  122            {
"v_mfma_f32_4x4x1_16b_f32", 8},
 
  123            {
"v_mfma_f32_32x32x2_f32", 64},
 
  124            {
"v_mfma_f32_16x16x4_f32", 32},
 
  125            {
"v_mfma_f32_32x32x4_2b_f16", 64},
 
  126            {
"v_mfma_f32_16x16x4_4b_f16", 32},
 
  127            {
"v_mfma_f32_4x4x4_16b_f16", 8},
 
  128            {
"v_mfma_f32_32x32x8_f16", 64},
 
  129            {
"v_mfma_f32_16x16x16_f16", 32},
 
  130            {
"v_mfma_i32_32x32x4_2b_i8", 64},
 
  131            {
"v_mfma_i32_16x16x4_4b_i8", 32},
 
  132            {
"v_mfma_i32_4x4x4_16b_i8", 8},
 
  133            {
"v_mfma_i32_32x32x8_i8", 64},
 
  134            {
"v_mfma_i32_16x16x16_i8", 32},
 
  135            {
"v_mfma_f32_32x32x2_2b_bf16", 64},
 
  136            {
"v_mfma_f32_16x16x2_4b_bf16", 32},
 
  137            {
"v_mfma_f32_4x4x2_16b_bf16", 8},
 
  138            {
"v_mfma_f32_32x32x4_bf16", 64},
 
  139            {
"v_mfma_f32_16x16x8_bf16", 32},
 
  140            {
"v_mfma_f64_16x16x4_f64", 32},
 
  141            {
"v_mfma_f64_4x4x4_4b_f64", 16},
 
  148        {GfxVersion::gfx942, {
 
  149            {
"v_mfma_f32_32x32x1_2b_f32", 64},
 
  150            {
"v_mfma_f32_16x16x1_4b_f32", 32},
 
  151            {
"v_mfma_f32_4x4x1_16b_f32", 8},
 
  152            {
"v_mfma_f32_32x32x2_f32", 64},
 
  153            {
"v_mfma_f32_16x16x4_f32", 32},
 
  154            {
"v_mfma_f32_32x32x4_2b_f16", 64},
 
  155            {
"v_mfma_f32_16x16x4_4b_f16", 32},
 
  156            {
"v_mfma_f32_4x4x4_16b_f16", 8},
 
  157            {
"v_mfma_f32_32x32x8_f16", 32},
 
  158            {
"v_mfma_f32_16x16x16_f16", 16},
 
  159            {
"v_mfma_f32_32x32x4_2b_bf16", 64},
 
  160            {
"v_mfma_f32_16x16x4_4b_bf16", 32},
 
  161            {
"v_mfma_f32_4x4x4_16b_bf16", 8},
 
  162            {
"v_mfma_f32_32x32x8_bf16", 32},
 
  163            {
"v_mfma_f32_16x16x16_bf16", 16},
 
  164            {
"v_mfma_i32_32x32x4_2b_i8", 64},
 
  165            {
"v_mfma_i32_16x16x4_4b_i8", 32},
 
  166            {
"v_mfma_i32_4x4x4_16b_i8", 8},
 
  167            {
"v_mfma_i32_32x32x16_i8", 32},
 
  168            {
"v_mfma_i32_16x16x32_i8", 16},
 
  169            {
"v_mfma_f32_16x16x8_xf32", 16},
 
  170            {
"v_mfma_f32_32x32x4_xf32", 32},
 
  171            {
"v_mfma_f64_16x16x4_f64", 32},
 
  172            {
"v_mfma_f64_4x4x4_4b_f64", 16},
 
  173            {
"v_mfma_f32_16x16x32_bf8_bf8", 16},
 
  174            {
"v_mfma_f32_16x16x32_bf8_fp8", 16},
 
  175            {
"v_mfma_f32_32x32x16_fp8_bf8", 32},
 
  176            {
"v_mfma_f32_32x32x16_fp8_fp8", 32},
 
  179    _requestorId(
p.system->getRequestorId(
this, 
"ComputeUnit")),
 
  180    lds(*
p.localDataStore), gmTokenPort(
name() + 
".gmTokenPort", 
this),
 
  186    _cacheLineSize(
p.system->cacheLineSize()),
 
  187    _numBarrierSlots(
p.num_barrier_slots),
 
  188    globalSeqNum(0), wavefrontSize(
p.wf_size),
 
  189    scoreboardCheckToSchedule(
p),
 
  190    scheduleToExecute(
p),
 
  197             "Functional TLB not supported in full-system GPU simulation");
 
  208    fatal_if(
p.wf_size > std::numeric_limits<unsigned long long>::digits ||
 
  210             "WF size is larger than the host can support");
 
  212             "Wavefront size should be a power of 2");
 
  215    numCyclesPerStoreTransfer =
 
  216        (uint32_t)ceil((
double)(wfSize() * 
sizeof(uint32_t)) /
 
  217                (
double)vrfToCoalescerBusWidth);
 
  219    numCyclesPerLoadTransfer = (wfSize() * 
sizeof(uint32_t))
 
  220                               / coalescerToVrfBusWidth;
 
  223    idleWfs = 
p.n_wf * numVectorALUs;
 
  224    lastVaddrWF.resize(numVectorALUs);
 
  225    wfList.resize(numVectorALUs);
 
  227    wfBarrierSlots.resize(
p.num_barrier_slots, 
WFBarrier());
 
  229    for (
int i = 0; 
i < 
p.num_barrier_slots; ++
i) {
 
  230        freeBarrierIds.insert(
i);
 
  233    for (
int j = 0; j < numVectorALUs; ++j) {
 
  234        lastVaddrWF[j].resize(
p.n_wf);
 
  236        for (
int i = 0; 
i < 
p.n_wf; ++
i) {
 
  237            lastVaddrWF[j][
i].resize(wfSize());
 
  239            wfList[j].push_back(
p.wavefronts[j * 
p.n_wf + 
i]);
 
  240            wfList[j][
i]->setParent(
this);
 
  242            for (
int k = 0; 
k < wfSize(); ++
k) {
 
  243                lastVaddrWF[j][
i][
k] = 0;
 
  248    lastVaddrSimd.resize(numVectorALUs);
 
  250    for (
int i = 0; 
i < numVectorALUs; ++
i) {
 
  251        lastVaddrSimd[
i].resize(wfSize(), 0);
 
  254    lastVaddrCU.resize(wfSize());
 
  258    if (
p.execPolicy == 
"OLDEST-FIRST") {
 
  260    } 
else if (
p.execPolicy == 
"ROUND-ROBIN") {
 
  263        fatal(
"Invalid WF execution policy (CU)\n");
 
  266    for (
int i = 0; 
i < 
p.port_memory_port_connection_count; ++
i) {
 
  270    for (
int i = 0; 
i < 
p.port_translation_port_connection_count; ++
i) {
 
  280    lastExecCycle.resize(numVectorALUs, 0);
 
  282    for (
int i = 0; 
i < vrf.size(); ++
i) {
 
  283        vrf[
i]->setParent(
this);
 
  284        rfc[
i]->setParent(
this);
 
  286    for (
int i = 0; 
i < srf.size(); ++
i) {
 
  287        srf[
i]->setParent(
this);
 
  289    numVecRegsPerSimd = vrf[0]->numRegs();
 
  290    numScalarRegsPerSimd = srf[0]->numRegs();
 
  292    registerManager->setParent(
this);
 
  296    instExecPerSimd.resize(numVectorALUs, 0);
 
  300        "Cache line size should be a power of two.");
 
  301    cacheLineBits = 
floorLog2(_cacheLineSize);
 
  303    matrix_core_ready.resize(numVectorALUs);
 
  304    for (
int i = 0; 
i < numVectorALUs; 
i++) {
 
  305        matrix_core_ready[
i] = 0;
 
 
  392    w->workGroupSz[0] = task->
wgSize(0);
 
  393    w->workGroupSz[1] = task->
wgSize(1);
 
  394    w->workGroupSz[2] = task->
wgSize(2);
 
  395    w->wgSz = 
w->workGroupSz[0] * 
w->workGroupSz[1] * 
w->workGroupSz[2];
 
  399    w->computeActualWgSz(task);
 
 
  406    static int _n_wave = 0;
 
  412        if (
k + waveId * 
wfSize() < 
w->actualWgSzTotal)
 
  416    w->execMask() = init_mask;
 
  420    w->initMask = init_mask.to_ullong();
 
  423        w->barrierId(bar_id);
 
  425        assert(!
w->hasBarrier());
 
  429        w->workItemId[0][
k] = (
k + waveId * 
wfSize()) % 
w->actualWgSz[0];
 
  430        w->workItemId[1][
k] = ((
k + waveId * 
wfSize()) / 
w->actualWgSz[0]) %
 
  432        w->workItemId[2][
k] = (
k + waveId * 
wfSize()) /
 
  433                              (
w->actualWgSz[0] * 
w->actualWgSz[1]);
 
  435        w->workItemFlatId[
k] = 
w->workItemId[2][
k] * 
w->actualWgSz[0] *
 
  436            w->actualWgSz[1] + 
w->workItemId[1][
k] * 
w->actualWgSz[0] +
 
  443    w->workGroupId[0] = 
w->wgId % task->
numWg(0);
 
  444    w->workGroupId[1] = (
w->wgId / task->
numWg(0)) % task->
numWg(1);
 
  445    w->workGroupId[2] = 
w->wgId / (task->
numWg(0) * task->
numWg(1));
 
  448    w->ldsChunk = ldsChunk;
 
  450    [[maybe_unused]] int32_t refCount =
 
  451                lds.increaseRefCounter(
w->dispatchId, 
w->wgId);
 
  452    DPRINTF(GPUDisp, 
"CU%d: increase ref ctr wg[%d] to [%d]\n",
 
  453                    cu_id, 
w->wgId, refCount);
 
  455    w->instructionBuffer.clear();
 
  460    DPRINTF(GPUDisp, 
"Scheduling wfDynId/barrier_id %d/%d on CU%d: " 
  461            "WF[%d][%d]. Ref cnt:%d\n", _n_wave, 
w->barrierId(), 
cu_id,
 
  462            w->simdId, 
w->wfSlotId, refCount);
 
  464    w->initRegState(task, 
w->actualWgSzTotal);
 
  470    w->vmemIssued.clear();
 
  471    w->lgkmIssued.clear();
 
  472    w->expIssued.clear();
 
  474    panic_if(
w->wrGmReqsInPipe, 
"GM write counter for wavefront non-zero\n");
 
  475    panic_if(
w->rdGmReqsInPipe, 
"GM read counter for wavefront non-zero\n");
 
  476    panic_if(
w->wrLmReqsInPipe, 
"LM write counter for wavefront non-zero\n");
 
  477    panic_if(
w->rdLmReqsInPipe, 
"GM read counter for wavefront non-zero\n");
 
  479             "Outstanding reqs counter for wavefront non-zero\n");
 
 
  490        = std::make_shared<GPUDynInst>(
this, 
nullptr,
 
  494    gpuDynInst->kern_id = kernId;
 
 
  517        = std::make_shared<GPUDynInst>(
this, 
nullptr,
 
  521    gpuDynInst->kern_id = kernId;
 
  523    gpuDynInst->staticInstruction()->setFlag(GPUStaticInst::Scalar);
 
 
  545        DPRINTF(GPUDisp, 
"CU%d: Scheduling wakeup next cycle\n", 
cu_id);
 
  559    panic_if(!ldsChunk, 
"was not able to reserve space for this WG");
 
  573    if (num_wfs_in_wg > 1) {
 
  580        assert(!wf_barrier.maxBarrierCnt());
 
  581        assert(!wf_barrier.numAtBarrier());
 
  582        wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);
 
  584        DPRINTF(GPUSync, 
"CU[%d] - Dispatching WG with barrier Id%d. " 
  585                "%d waves using this barrier.\n", 
cu_id, barrier_id,
 
  591    for (
int j = 0; j < 
shader->n_wf; ++j) {
 
  605                DPRINTF(GPURename, 
"SIMD[%d] wfSlotId[%d] WF[%d] " 
  606                    "vregDemand[%d] sregDemand[%d]\n", 
i, j, 
w->wfDynId,
 
  607                    vregDemand, sregDemand);
 
 
  622             "Instruction Buffer of WF%d can't be empty", 
w->wgId);
 
 
  631             "Instruction Buffer of WF%d can't be empty", 
w->wgId);
 
  634    auto it = 
pipeMap.find(ii->seqNum());
 
 
  644    int trueWgSizeTotal = 1;
 
  650        trueWgSizeTotal *= trueWgSize[
d];
 
  651        DPRINTF(GPUDisp, 
"trueWgSize[%d] =  %d\n", 
d, trueWgSize[
d]);
 
  654    DPRINTF(GPUDisp, 
"trueWgSizeTotal =  %d\n", trueWgSizeTotal);
 
  657    int numWfs = (trueWgSizeTotal + 
wfSize() - 1) / 
wfSize();
 
  658    num_wfs_in_wg = numWfs;
 
  660    bool barrier_avail = 
true;
 
  663        barrier_avail = 
false;
 
  676             "WG with %d WFs and %d VGPRs per WI can not be allocated to CU " 
  677             "that has %d VGPRs\n",
 
  680             "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU " 
  687    int numMappedWfs = 0;
 
  693    for (
int j = 0; j < 
shader->n_wf; ++j) {
 
  699                if (numMappedWfs < numWfs &&
 
  713    assert(numMappedWfs <= numWfs);
 
  715    bool vregAvail = 
true;
 
  716    bool sregAvail = 
true;
 
  718    if (numMappedWfs < numWfs) {
 
  734    DPRINTF(GPUDisp, 
"Free WF slots =  %d, Mapped WFs = %d, \ 
  735            VGPR Availability = %d, SGPR Availability = %d\n",
 
  736            freeWfSlots, numMappedWfs, vregAvail, sregAvail);
 
  739        ++
stats.numTimesWgBlockedDueVgprAlloc;
 
  743        ++
stats.numTimesWgBlockedDueSgprAlloc;
 
  748    bool ldsAvail = 
lds.canReserve(task->
ldsSize());
 
  750        stats.wgBlockedDueLdsAllocation++;
 
  753    if (!barrier_avail) {
 
  754        stats.wgBlockedDueBarrierAllocation++;
 
  762    bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
 
  763                        && ldsAvail && barrier_avail;
 
 
  771    return wf_barrier.numYetToReachBarrier();
 
 
  778    return wf_barrier.allAtBarrier();
 
 
  785    wf_barrier.incNumAtBarrier();
 
 
  792    return wf_barrier.numAtBarrier();
 
 
  799    return wf_barrier.maxBarrierCnt();
 
 
  813    wf_barrier.decMaxBarrierCnt();
 
 
  820    wf_barrier.release();
 
 
  828        for (
int j = 0; j < 
shader->n_wf; ++j) {
 
 
  843    for (
auto &vecRegFile : 
vrf) {
 
  847    for (
auto &scRegFile : 
srf) {
 
  864    if (
shader->getProgressInterval() != 0 &&
 
 
  898             "No support for multiple Global Memory Pipelines exists!!!");
 
  905             "No support for multiple Local Memory Pipelines exists!!!");
 
  912             "No support for multiple Scalar Memory Pipelines exists!!!");
 
 
  957            assert(cu != 
nullptr);
 
  959            if (pkt->
req->isInvL2()) {
 
  963                panic(
"Unknown MemSyncResp not from an instruction");
 
  975        if (gpuDynInst->isKernelLaunch()) {
 
  978            assert(pkt->
req->isKernel());
 
  979            assert(pkt->
req->isInvL1());
 
  994            && gpuDynInst->isEndOfKernel()) {
 
 1000            assert(pkt->
req->isKernel());
 
 1001            assert(pkt->
req->isGL2CacheFlush());
 
 1017            DPRINTF(GPUDisp, 
"CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
 
 1019                    w->wfDynId, 
w->wgId);
 
 1025        if (!pkt->
req->isKernel()) {
 
 1026            w = 
computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
 
 1027            DPRINTF(GPUExec, 
"MemSyncResp: WF[%d][%d] WV%d %s decrementing " 
 1028                            "outstanding reqs %d => %d\n", gpuDynInst->simdId,
 
 1029                            gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
 
 1030                            gpuDynInst->disassemble(), 
w->outstandingReqs,
 
 1031                            w->outstandingReqs - 1);
 
 1032            computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
 
 1044            "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
 
 1045            computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
 
 1046            gpuDynInst->seqNum(), 
index, pkt->
req->getPaddr());
 
 
 1064    if (pkt->
req->isKernel()) {
 
 1071    assert(!pkt->
req->isKernel());
 
 1078    assert(gpuDynInst->numScalarReqs > 0);
 
 1080    gpuDynInst->numScalarReqs--;
 
 1090    if (!gpuDynInst->numScalarReqs) {
 
 1091        if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
 
 1092                computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
 
 1095                computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
 
 
 1109    for (
const auto &pkt : 
retries) {
 
 
 1125    for (
int i = 0; 
i < 
len; ++
i) {
 
 1128        DPRINTF(GPUMem, 
"CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
 
 1129                computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
 
 1130                pkt->
req->getPaddr());
 
 1136            DPRINTF(GPUMem, 
"failed again!\n");
 
 1139            DPRINTF(GPUMem, 
"successful!\n");
 
 
 1153    if (sender_state->
wavefront != 
nullptr) {
 
 1163            computeUnit->shader->gpuCmdProc.completeTimingRead(dispType);
 
 
 1188    for (
int i = 0; 
i < 
len; ++
i) {
 
 1191        DPRINTF(GPUFetch, 
"CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
 
 1193                pkt->
req->getPaddr());
 
 1195            DPRINTF(GPUFetch, 
"failed again!\n");
 
 1198            DPRINTF(GPUFetch, 
"successful!\n");
 
 
 1207    return "ComputeUnit SQC memory request event";
 
 
 1216    assert(!
pkt->req->systemReq());
 
 
 1228    Addr tmp_vaddr = pkt->
req->getVaddr();
 
 1233    pkt->
req->setPC(gpuDynInst->wavefront()->pc());
 
 1235    pkt->
req->setReqInstSeqNum(gpuDynInst->seqNum());
 
 1254        shader->gpuCmdProc.driver()->setMtype(pkt->
req);
 
 1261    } 
else if (pkt->
isRead()) {
 
 1264        fatal(
"pkt is not a read nor a write\n");
 
 1270    ++
stats.tlbRequests;
 
 1278            unsigned size = pkt->
getSize();
 
 1281                panic(
"CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
 
 1282                      cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 
vaddr);
 
 1287            if (!
p->pTable->translate(
vaddr, paddr)) {
 
 1288                if (!
p->fixupFault(
vaddr)) {
 
 1289                    panic(
"CU%d: WF[%d][%d]: Fault on addr %#x!\n",
 
 1290                          cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
 
 1307            tlbPort[tlbPort_index].sendFunctional(pkt);
 
 1310            int hit_level = translation_state->
hitLevel;
 
 1311            assert(hit_level != -1);
 
 1312            stats.hitsPerTLBLevel[hit_level]++;
 
 1319            delete sender_state->
saved;
 
 1320            delete sender_state;
 
 1322            assert(pkt->
req->hasPaddr());
 
 1323            assert(pkt->
req->hasSize());
 
 1333                uint8_t *tmpData = oldPkt->
getPtr<uint8_t>();
 
 1344            gpuDynInst->memStatusVector[pkt->
getAddr()].push_back(
index);
 
 1345            gpuDynInst->tlbHitLevel[
index] = hit_level;
 
 1352            DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: index %d, addr %#x data " 
 1353                    "scheduled\n", 
cu_id, gpuDynInst->simdId,
 
 1354                    gpuDynInst->wfSlotId, 
index, pkt->
req->getPaddr());
 
 1357        } 
else if (
tlbPort[tlbPort_index].isStalled()) {
 
 1358            assert(
tlbPort[tlbPort_index].retries.size() > 0);
 
 1360            DPRINTF(GPUTLB, 
"CU%d: WF[%d][%d]: Translation for addr %#x " 
 1361                    "failed!\n", 
cu_id, gpuDynInst->simdId,
 
 1362                    gpuDynInst->wfSlotId, tmp_vaddr);
 
 1364            tlbPort[tlbPort_index].retries.push_back(pkt);
 
 1365        } 
else if (!
tlbPort[tlbPort_index].sendTimingReq(pkt)) {
 
 1370            tlbPort[tlbPort_index].stallPort();
 
 1372            DPRINTF(GPUTLB, 
"CU%d: WF[%d][%d]: Translation for addr %#x " 
 1373                    "failed!\n", 
cu_id, gpuDynInst->simdId,
 
 1374                    gpuDynInst->wfSlotId, tmp_vaddr);
 
 1376            tlbPort[tlbPort_index].retries.push_back(pkt);
 
 1378           DPRINTF(GPUTLB, 
"CU%d: WF[%d][%d]: Translation for addr %#x from " 
 1379                   "instruction %s sent!\n", 
cu_id, gpuDynInst->simdId,
 
 1380                   gpuDynInst->wfSlotId, tmp_vaddr,
 
 1381                   gpuDynInst->disassemble().c_str());
 
 1385            gpuDynInst->resetEntireStatusVector();
 
 1387            gpuDynInst->decrementStatusVector(
index);
 
 1397        tlbPort[tlbPort_index].sendFunctional(pkt);
 
 1407        memPort[0].sendFunctional(new_pkt);
 
 1409        DPRINTF(GPUMem, 
"Functional sendRequest\n");
 
 1410        DPRINTF(GPUMem, 
"CU%d: WF[%d][%d]: index %d: addr %#x\n", 
cu_id,
 
 1411                gpuDynInst->simdId, gpuDynInst->wfSlotId, 
index,
 
 1412                new_pkt->
req->getPaddr());
 
 
 1446        DPRINTF(GPUTLB, 
"sent scalar %s translation request for addr %#x\n",
 
 1448                pkt->
req->getVaddr());
 
 
 1457    assert(gpuDynInst->isGlobalSeg() ||
 
 1458           gpuDynInst->executedAs() == enums::SC_GLOBAL);
 
 1463        req = std::make_shared<Request>(
 
 1474    if (kernelMemSync) {
 
 1475        if (gpuDynInst->isKernelLaunch()) {
 
 1477            req->setReqInstSeqNum(gpuDynInst->seqNum());
 
 1484              memPort[0].createMemReqEvent(pkt);
 
 1486            DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: index %d, addr %#x scheduling " 
 1487                    "an acquire\n", 
cu_id, gpuDynInst->simdId,
 
 1488                    gpuDynInst->wfSlotId, 0, pkt->
req->getPaddr());
 
 1494          assert(
shader->impl_kern_end_rel);
 
 1495          assert(gpuDynInst->isEndOfKernel());
 
 1498          req->setReqInstSeqNum(gpuDynInst->seqNum());
 
 1505            memPort[0].createMemReqEvent(pkt);
 
 1507          DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: index %d, addr %#x scheduling " 
 1508                  "a release\n", 
cu_id, gpuDynInst->simdId,
 
 1509                  gpuDynInst->wfSlotId, 0, pkt->
req->getPaddr());
 
 1514        gpuDynInst->setRequestFlags(req);
 
 1516        req->setReqInstSeqNum(gpuDynInst->seqNum());
 
 1523          memPort[0].createMemReqEvent(pkt);
 
 1526                "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
 
 1527                cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
 
 1528                pkt->
req->getPaddr());
 
 
 1537    auto req = std::make_shared<Request>(paddr, 64, 0, 
vramRequestorId());
 
 1541    pkt->pushSenderState(
 
 1548    shader->incNumOutstandingInvL2s();
 
 
 1562    DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
 
 1563            compute_unit->
cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
 
 1564            pkt->
req->getPaddr(), 
id);
 
 1566    Addr paddr = pkt->
req->getPaddr();
 
 1582    int index = gpuDynInst->memStatusVector[paddr].back();
 
 1584    DPRINTF(GPUMem, 
"Response for addr %#x, index %d\n",
 
 1585            pkt->
req->getPaddr(), 
id);
 
 1587    gpuDynInst->memStatusVector[paddr].pop_back();
 
 1588    gpuDynInst->pAddr = pkt->
req->getPaddr();
 
 1590    gpuDynInst->decrementStatusVector(
index);
 
 1591    DPRINTF(GPUMem, 
"bitvector is now %s\n", gpuDynInst->printStatusVector());
 
 1593    if (gpuDynInst->allLanesZero()) {
 
 1594        auto iter = gpuDynInst->memStatusVector.begin();
 
 1595        auto end = gpuDynInst->memStatusVector.end();
 
 1597        while (iter != end) {
 
 1598            assert(iter->second.empty());
 
 1605        if (compute_unit->
headTailMap.count(gpuDynInst)) {
 
 1611        gpuDynInst->memStatusVector.clear();
 
 1617        DPRINTF(GPUMem, 
"CU%d: WF[%d][%d]: packet totally complete\n",
 
 1618                compute_unit->
cu_id, gpuDynInst->simdId,
 
 1619                gpuDynInst->wfSlotId);
 
 1622            if (!compute_unit->
headTailMap.count(gpuDynInst)) {
 
 1624                    .insert(std::make_pair(gpuDynInst, 
curTick()));
 
 
 1636    Addr line = pkt->
req->getPaddr();
 
 1639            pkt->
req->getVaddr(), line);
 
 1649    if (!translation_state->
tlbEntry) {
 
 1657        DPRINTFN(
"Wave %d couldn't tranlate vaddr %#x\n", 
w->wfDynId,
 
 1658                 pkt->
req->getVaddr());
 
 1662    int hit_level = translation_state->
hitLevel;
 
 1665    delete translation_state->
tlbEntry;
 
 1666    assert(!translation_state->
ports.size());
 
 1672    delete translation_state;
 
 1681    gpuDynInst->memStatusVector[line].push_back(mp_index);
 
 1682    gpuDynInst->tlbHitLevel[mp_index] = hit_level;
 
 1684    DPRINTF(GPUTrace, 
"CU%d WF[%d][%d]: Translated %#lx -> %#lx for " 
 1685            "instruction %s (seqNum: %ld)\n", 
computeUnit->cu_id,
 
 1686            gpuDynInst->simdId, gpuDynInst->wfSlotId, pkt->
req->getVaddr(),
 
 1687            line, gpuDynInst->disassemble().c_str(), gpuDynInst->seqNum());
 
 1698        panic(
"unsupported response to request conversion %s\n",
 
 1703        int simdId = gpuDynInst->simdId;
 
 1704        int wfSlotId = gpuDynInst->wfSlotId;
 
 1711        case enums::PF_PHASE:
 
 1712            last = 
computeUnit->lastVaddrSimd[simdId][mp_index];
 
 1715            last = 
computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
 
 1720        DPRINTF(GPUPrefetch, 
"CU[%d][%d][%d][%d]: %#x was last\n",
 
 1721                computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
 
 1736        DPRINTF(GPUPrefetch, 
"%#x to: CU[%d][%d][%d][%d]\n", 
vaddr,
 
 1749            RequestPtr prefetch_req = std::make_shared<Request>(
 
 1775            delete prefetch_pkt;
 
 1796    if (new_pkt->
req->systemReq()) {
 
 1801        if (!gpuDynInst->isSystemReq()) {
 
 1803            gpuDynInst->setSystemReq();
 
 1812        computeUnit->memPort[mp_index].createMemReqEvent(new_pkt);
 
 1814    DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
 
 1816            gpuDynInst->wfSlotId, mp_index, new_pkt->
req->getPaddr());
 
 
 1829        "ComputeUnit memory request event", 
true);
 
 
 1837        "ComputeUnit memory response event", 
true);
 
 
 1847    if (pkt->
req->systemReq()) {
 
 1852        retries.emplace_back(pkt, gpuDynInst);
 
 1856                    "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
 
 1857                    compute_unit->
cu_id, gpuDynInst->simdId,
 
 1858                    gpuDynInst->wfSlotId, 
id, pkt->
req->getPaddr());
 
 1863                    "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data" 
 1864                    " req sent!\n", compute_unit->
cu_id, gpuDynInst->simdId,
 
 1865                    gpuDynInst->wfSlotId, gpuDynInst->seqNum(), 
id,
 
 1866                    pkt->
req->getPaddr());
 
 
 1874    return "ComputeUnit scalar memory request event";
 
 
 1884    if (
pkt->req->systemReq()) {
 
 1892                "CU%d: WF[%d][%d]: addr %#x data req failed!\n",
 
 1893                compute_unit->
cu_id, gpuDynInst->simdId,
 
 1894                gpuDynInst->wfSlotId, 
pkt->req->getPaddr());
 
 1897                "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data " 
 1898                "req sent!\n", compute_unit->
cu_id, gpuDynInst->simdId,
 
 1899                gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
 
 1900                pkt->req->getPaddr());
 
 
 1915    DPRINTF(GPUTLB, 
"CU%d: DTLB recvReqRetry - %d pending requests\n",
 
 1924    for (
int i = 0; 
i < 
len; ++
i) {
 
 1927        DPRINTF(GPUTLB, 
"CU%d: retrying D-translaton for address%#x", 
vaddr);
 
 1932            DPRINTF(GPUTLB, 
": failed again\n");
 
 1935            DPRINTF(GPUTLB, 
": successful\n");
 
 
 1951            "Translation of vaddr %#x failed\n", pkt->
req->getVaddr());
 
 1953    delete translation_state->
tlbEntry;
 
 1954    assert(!translation_state->
ports.size());
 
 1957    delete translation_state;
 
 1965    [[maybe_unused]] 
Wavefront *
w = gpuDynInst->wavefront();
 
 1967    DPRINTF(GPUTLB, 
"CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received " 
 1968        "translation: PA %#x -> %#x\n", 
computeUnit->cu_id, 
w->simdId,
 
 1969        w->wfSlotId, 
w->kernId, pkt->
req->getVaddr(), pkt->
req->getPaddr());
 
 1978      fatal(
"Scalar DTLB receieved unexpected MemCmd response %s\n",
 
 1997    if (req_pkt->
req->systemReq()) {
 
 1998        gpuDynInst->setSystemReq();
 
 
 2015    [[maybe_unused]] 
Addr line = pkt->
req->getPaddr();
 
 2016    DPRINTF(GPUTLB, 
"CU%d: ITLBPort received %#x->%#x\n",
 
 2025    bool success = translation_state->
tlbEntry != 
nullptr;
 
 2026    delete translation_state->
tlbEntry;
 
 2027    assert(!translation_state->
ports.size());
 
 2029    delete translation_state;
 
 
 2070    DPRINTF(GPUTLB, 
"CU%d: ITLB recvReqRetry - %d pending requests\n", 
len);
 
 2079    for (
int i = 0; 
i < 
len; ++
i) {
 
 2082        DPRINTF(GPUTLB, 
"CU%d: retrying I-translaton for address%#x", 
vaddr);
 
 2086            DPRINTF(GPUTLB, 
": failed again\n");
 
 2089            DPRINTF(GPUTLB, 
": successful\n");
 
 
 2098    if (gpuDynInst->isScalar()) {
 
 2099        if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
 
 2101            stats.instCyclesSALU++;
 
 2102        } 
else if (gpuDynInst->isLoad()) {
 
 2103            stats.scalarMemReads++;
 
 2104        } 
else if (gpuDynInst->isStore()) {
 
 2105            stats.scalarMemWrites++;
 
 2108        if (gpuDynInst->isALU()) {
 
 2109            shader->total_valu_insts++;
 
 2110            if (
shader->total_valu_insts == 
shader->max_valu_insts) {
 
 2114            stats.instCyclesVALU++;
 
 2115            stats.threadCyclesVALU
 
 2116                += gpuDynInst->wavefront()->execMask().count();
 
 2117        } 
else if (gpuDynInst->isFlat()) {
 
 2118            if (gpuDynInst->isLocalMem()) {
 
 2119                stats.flatLDSInsts++;
 
 2121                stats.flatVMemInsts++;
 
 2123        } 
else if (gpuDynInst->isFlatGlobal()) {
 
 2124            stats.flatVMemInsts++;
 
 2125        } 
else if (gpuDynInst->isFlatScratch()) {
 
 2126            stats.flatVMemInsts++;
 
 2127        } 
else if (gpuDynInst->isLocalMem()) {
 
 2128            stats.ldsNoFlatInsts++;
 
 2129        } 
else if (gpuDynInst->isLoad()) {
 
 2130            stats.vectorMemReads++;
 
 2131        } 
else if (gpuDynInst->isStore()) {
 
 2132            stats.vectorMemWrites++;
 
 2135        if (gpuDynInst->isLoad()) {
 
 2136            switch (gpuDynInst->executedAs()) {
 
 2137              case enums::SC_SPILL:
 
 2140              case enums::SC_GLOBAL:
 
 2141                stats.globalReads++;
 
 2143              case enums::SC_GROUP:
 
 2146              case enums::SC_PRIVATE:
 
 2149              case enums::SC_READONLY:
 
 2150                stats.readonlyReads++;
 
 2152              case enums::SC_KERNARG:
 
 2153                stats.kernargReads++;
 
 2158              case enums::SC_NONE:
 
 2165                fatal(
"%s has no valid segment\n", gpuDynInst->disassemble());
 
 2168        } 
else if (gpuDynInst->isStore()) {
 
 2169            switch (gpuDynInst->executedAs()) {
 
 2170              case enums::SC_SPILL:
 
 2171                stats.spillWrites++;
 
 2173              case enums::SC_GLOBAL:
 
 2174                stats.globalWrites++;
 
 2176              case enums::SC_GROUP:
 
 2177                stats.groupWrites++;
 
 2179              case enums::SC_PRIVATE:
 
 2182              case enums::SC_READONLY:
 
 2183                stats.readonlyWrites++;
 
 2185              case enums::SC_KERNARG:
 
 2186                stats.kernargWrites++;
 
 2191              case enums::SC_NONE:
 
 2198                fatal(
"%s has no valid segment\n", gpuDynInst->disassemble());
 
 
 2220        std::ostream *page_stat_file = 
simout.create(
name().c_str())->stream();
 
 2222        *page_stat_file << 
"page, wavefront accesses, workitem accesses" <<
 
 2226            *page_stat_file << std::hex << iter.first << 
",";
 
 2227            *page_stat_file << std::dec << iter.second.first << 
",";
 
 2228            *page_stat_file << std::dec << iter.second.second << std::endl;
 
 
 2265    const uint32_t wgId)
 const 
 2267    return lds.getRefCounter(dispatchId, wgId);
 
 
 2275    for (
int i_wf = 0; i_wf < 
shader->n_wf; ++i_wf){
 
 
 2294    RequestPtr newRequest = std::make_shared<Request>();
 
 2295    newRequest->setPaddr(0x0);
 
 2303    return ldsPort.sendTimingReq(newPacket);
 
 
 2319        for (
int i = 0; 
i < 
shader->n_wf; ++
i) {
 
 2324            std::cout << 
curTick() << 
": ";
 
 2325            std::cout << 
"CU" << 
cu_id << 
" WF[" << j << 
"][" << 
i << 
"] ";
 
 2326            wfList[j][
i]->printProgress();
 
 2332    std::cout << std::endl;
 
 
 2344    fatal_if(!senderState, 
"did not get the right sort of sender state");
 
 2351    computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
 
 
 2365    fatal_if(!sender_state, 
"packet without a valid sender state");
 
 2374        DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: LDS send failed!\n",
 
 2376                        gpuDynInst->wfSlotId);
 
 2384        DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
 
 2386                gpuDynInst->wfSlotId, pkt->
req->getPaddr());
 
 2389        DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
 
 2391                gpuDynInst->wfSlotId, pkt->
req->getPaddr());
 
 
 2405    auto queueSize = 
retries.size();
 
 2407    DPRINTF(GPUPort, 
"CU%d: LDSPort recvReqRetry - %d pending requests\n",
 
 2411             "why was there a recvReqRetry() with no pending reqs?");
 
 2413             "recvReqRetry() happened when the port was not stalled");
 
 2425            DPRINTF(GPUPort, 
": LDS send failed again\n");
 
 2428            DPRINTF(GPUTLB, 
": LDS send successful\n");
 
 
 2444               "Number of cycles needed to execute VALU insts."),
 
 2446               "Number of cycles needed to execute SALU insts."),
 
 2448               "vector ALU ops. Similar to instCyclesVALU but multiplied by " 
 2449               "the number of active threads."),
 
 2451               "Percentage of active vector ALU threads in a wave."),
 
 2453               " accesses that resolve to LDS."),
 
 2455               "including FLAT accesses that resolve to LDS) per-wavefront."),
 
 2457               "The number of FLAT insts that resolve to vmem issued."),
 
 2459               "resolve to vmem issued per-wavefront."),
 
 2461               "The number of FLAT insts that resolve to LDS issued."),
 
 2463               "resolve to LDS issued per-wavefront."),
 
 2465               "Number of vector mem write insts (excluding FLAT insts)."),
 
 2467               "insts (excluding FLAT insts) per-wavefront."),
 
 2469               "Number of vector mem read insts (excluding FLAT insts)."),
 
 2471               "(excluding FLAT insts) per-wavefront."),
 
 2474               "The average number of scalar mem write insts per-wavefront."),
 
 2477               "The average number of scalar mem read insts per-wavefront."),
 
 2479               "Number of vector mem reads per kilo-instruction"),
 
 2481               "Number of vector mem writes per kilo-instruction"),
 
 2483               "Number of vector mem insts per kilo-instruction"),
 
 2485               "Number of scalar mem reads per kilo-instruction"),
 
 2487               "Number of scalar mem writes per kilo-instruction"),
 
 2489               "Number of scalar mem insts per kilo-instruction"),
 
 2491               "command, data from VRF to vector memory unit, per SIMD"),
 
 2493               "command, data from SRF to scalar memory unit, per SIMD"),
 
 2495               "command, data from VRF to LDS unit, per SIMD"),
 
 2499               "Number of memory instructions sent to the global segment"),
 
 2503               "Number of memory instructions sent to the arg segment"),
 
 2507               "Number of memory instructions sent to the spill segment"),
 
 2511               "Number of memory instructions sent to the group segment"),
 
 2515               "Number of memory instructions sent to the private segment"),
 
 2518               "Number of memory instructions sent to the readonly segment"),
 
 2520               "Number of memory instructions sent to the readonly segment"),
 
 2523               "Number of memory instructions sent to the kernarg segment"),
 
 2525               "Number of memory instructions sent to the kernarg segment"),
 
 2527               "wave level parallelism: count of active waves at wave launch"),
 
 2530               "total number of cycles for all uncoalesced requests"),
 
 2533               "TLB hits distribution (0 for page table, x for Lx-TLB)"),
 
 2536               "Number of bank conflicts per LDS memory packet"),
 
 2538               "pages touched per wf (over all mem. instr.)"),
 
 2540               "dynamic non-flat global memory instruction count"),
 
 2542               "dynamic flat global memory instruction count"),
 
 2545               "WG dispatch was blocked due to lack of barrier resources"),
 
 2547               "Workgroup blocked due to LDS capacity"),
 
 2550               "vector instructions per cycle"),
 
 2552               "number of vec ops executed (e.g. WF size/inst)"),
 
 2554               "number of f16 vec ops executed (e.g. WF size/inst)"),
 
 2556               "number of f32 vec ops executed (e.g. WF size/inst)"),
 
 2558               "number of f64 vec ops executed (e.g. WF size/inst)"),
 
 2560               "number of fma16 vec ops executed (e.g. WF size/inst)"),
 
 2562               "number of fma32 vec ops executed (e.g. WF size/inst)"),
 
 2564               "number of fma64 vec ops executed (e.g. WF size/inst)"),
 
 2566               "number of mac16 vec ops executed (e.g. WF size/inst)"),
 
 2568               "number of mac32 vec ops executed (e.g. WF size/inst)"),
 
 2570               "number of mac64 vec ops executed (e.g. WF size/inst)"),
 
 2572               "number of mad16 vec ops executed (e.g. WF size/inst)"),
 
 2574               "number of mad32 vec ops executed (e.g. WF size/inst)"),
 
 2576               "number of mad64 vec ops executed (e.g. WF size/inst)"),
 
 2578               "number of mfma vec ops executed (e.g. WF size/inst)"),
 
 2580               "number of i8 mfma vec ops executed (e.g. WF size/inst)"),
 
 2582               "number of f16 mfma vec ops executed (e.g. WF size/inst)"),
 
 2584               "number of f32 mfma vec ops executed (e.g. WF size/inst)"),
 
 2586               "number of f64 mfma vec ops executed (e.g. WF size/inst)"),
 
 2588               "number of two op FP vec ops executed (e.g. WF size/inst)"),
 
 2590      ADD_STAT(
vpc, 
"Vector Operations per cycle (this CU only)"),
 
 2594      ADD_STAT(
ipc, 
"Instructions per cycle (this CU only)"),
 
 2596               "instruction (over all instructions)"),
 
 2598               "number of active lanes per global memory instruction"),
 
 2600               "number of active lanes per local memory instruction"),
 
 2602               "Number of dynamic non-GM memory insts executed"),
 
 2604               "blocked due to VGPR allocation per SIMD"),
 
 2606               "blocked due to SGPR allocation per SIMD"),
 
 2609               "number of compare and swap operations that failed"),
 
 2613               "arrival at coalescer"),
 
 2667    for (
int i = 0; 
i < 4; ++
i) {
 
 
void sendRequest(PacketPtr pkt, Event *callback)
ClockedObject(const ClockedObjectParams &p)
Tick nextCycle() const
Based on the clock of the object, determine the start tick of the first cycle that is at least one cy...
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
ComputeUnit * computeUnit
std::deque< PacketPtr > retries
here we queue all the translation requests that were not successfully sent.
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
void processMemReqEvent(PacketPtr pkt)
EventFunctionWrapper * createMemReqEvent(PacketPtr pkt)
EventFunctionWrapper * createMemRespEvent(PacketPtr pkt)
std::deque< std::pair< PacketPtr, GPUDynInstPtr > > retries
void processMemRespEvent(PacketPtr pkt)
bool handleResponse(PacketPtr pkt)
ComputeUnit * computeUnit
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
std::deque< PacketPtr > retries
here we queue all the translation requests that were not successfully sent.
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
ComputeUnit * computeUnit
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
SenderState is information carried along with the packet, esp.
GPUDynInstPtr getMemInst() const
virtual bool recvTimingResp(PacketPtr pkt)
get the result of packets sent to the LDS when they return
virtual bool sendTimingReq(PacketPtr pkt)
attempt to send this packet, either the port is already stalled, the request is nack'd and must stall...
virtual void recvReqRetry()
the bus is telling the port that there is now space so retrying stalled requests should work now this...
ComputeUnit * computeUnit
std::queue< PacketPtr > retries
here we queue all the requests that were not successfully sent.
const char * description() const
Return a C string describing the event.
std::deque< std::pair< PacketPtr, Wavefront * > > retries
ComputeUnit * computeUnit
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
bool recvTimingResp(PacketPtr pkt) override
Receive a timing response from the peer.
ComputeUnit * computeUnit
const char * description() const
Return a C string describing the event.
ScalarDataPort & scalarDataPort
bool recvTimingResp(PacketPtr pkt) override
Receive a timing response from the peer.
ComputeUnit * computeUnit
bool handleResponse(PacketPtr pkt)
void recvReqRetry() override
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
std::deque< PacketPtr > retries
void releaseBarrier(int bar_id)
int mapWaveToScalarAlu(Wavefront *w) const
ComputeUnit(const Params &p)
WFBarrier & barrierSlot(int bar_id)
void updatePageDivergenceDist(Addr addr)
std::vector< WaitClass > scalarALUs
RequestorID vramRequestorId()
Forward the VRAM requestor ID needed for device memory from shader.
virtual void init() override
init() is called after all C++ SimObjects have been created and all ports are connected.
int numVectorGlobalMemUnits
std::unordered_set< uint64_t > pipeMap
void updateInstStats(GPUDynInstPtr gpuDynInst)
WaitClass vectorGlobalMemUnit
void doInvalidate(RequestPtr req, int kernId)
trigger invalidate operation in the CU
std::vector< int > numWfsToSched
Number of WFs to schedule to each SIMD.
LocalMemPipeline localMemoryPipe
int mapWaveToGlobalMem(Wavefront *w) const
int mapWaveToLocalMem(Wavefront *w) const
WaitClass scalarMemToSrfBus
ScalarDTLBPort scalarDTLBPort
void releaseWFsFromBarrier(int bar_id)
int numYetToReachBarrier(int bar_id)
WaitClass vrfToLocalMemPipeBus
int32_t getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
void doSQCInvalidate(RequestPtr req, int kernId)
trigger SQCinvalidate operation in the CU
void resetBarrier(int bar_id)
std::vector< std::vector< Addr > > lastVaddrSimd
int numVectorSharedMemUnits
std::unordered_set< int > freeBarrierIds
A set used to easily retrieve a free barrier ID.
pageDataStruct pageAccesses
WaitClass srfToScalarMemPipeBus
ScalarMemPipeline scalarMemoryPipe
bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg)
void sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
void sendInvL2(Addr paddr)
LDSPort ldsPort
The port to access the Local Data Store. Can be connected to an LDS object.
GlobalMemPipeline globalMemoryPipe
std::map< Addr, int > pagesTouched
int vrfToCoalescerBusWidth
bool sendToLds(GPUDynInstPtr gpuDynInst)
send a general request to the LDS; make sure to look at the return value here, as your request might be...
int maxBarrierCnt(int bar_id)
void insertInPipeMap(Wavefront *w)
int numAtBarrier(int bar_id)
ScoreboardCheckToSchedule scoreboardCheckToSchedule
TODO: Update these comments once the pipe stage interface has been fully refactored.
void incNumAtBarrier(int bar_id)
void injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelMemSync, RequestPtr req=nullptr)
std::vector< int > vectorRegsReserved
std::vector< ScalarRegisterFile * > srf
ScoreboardCheckStage scoreboardCheckStage
std::vector< WaitClass > vectorALUs
int mapWaveToScalarMem(Wavefront *w) const
RegisterManager * registerManager
int coalescerToVrfBusWidth
void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, HSAQueueEntry *task, int bar_id, bool fetchContext=false)
EventFunctionWrapper tickEvent
TokenManager * memPortTokens
void fillKernelState(Wavefront *w, HSAQueueEntry *task)
void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg)
WaitClass vectorSharedMemUnit
std::vector< int > scalarRegsReserved
std::vector< DTLBPort > tlbPort
std::vector< std::vector< Wavefront * > > wfList
int mapWaveToScalarAluGlobalIdx(Wavefront *w) const
ScheduleToExecute scheduleToExecute
std::vector< VectorRegisterFile * > vrf
void decMaxBarrierCnt(int bar_id)
std::unordered_map< GPUDynInstPtr, Tick > headTailMap
std::vector< Addr > lastVaddrCU
WaitClass vrfToGlobalMemPipeBus
ScheduleStage scheduleStage
bool allAtBarrier(int bar_id)
bool isVectorAluIdle(uint32_t simdId) const
InstSeqNum getAndIncSeqNum()
void doFlush(GPUDynInstPtr gpuDynInst)
trigger flush operation in the cu
RequestorID requestorId()
std::vector< DataPort > memPort
The memory port for SIMD data accesses.
void deleteFromPipeMap(Wavefront *w)
void handleSQCReturn(PacketPtr pkt)
void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt)
gem5::ComputeUnit::ComputeUnitStats stats
void updateInvCounter(int kern_id, int val=-1)
update the counter of outstanding inv requests for the kernel kern_id: kernel id val: +1/-1,...
bool updateWbCounter(int kern_id, int val=-1)
update the counter of outstanding wb requests for the kernel kern_id: kernel id val: +1/-1,...
void notifyWgCompl(Wavefront *wf)
When an end program instruction detects that the last WF in a WG has completed it will call this meth...
void handleResponse(GPUDynInstPtr gpuDynInst)
This method handles responses sent to this GM pipeline by the CU.
int wgSize(int dim) const
int numVectorRegs() const
bool isInvDone() const
Is invalidate done?
int gridSize(int dim) const
int numScalarRegs() const
this represents a slice of the overall LDS, intended to be associated with an individual workgroup
const std::string & toString() const
Return the string to a cmd given by idx.
virtual std::string name() const
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
SenderState * senderState
This packet's sender state.
void pushSenderState(SenderState *sender_state)
Push a new sender state to the packet and make the current sender state the predecessor of the new on...
T * getPtr()
get a pointer to the data ptr.
RequestPtr req
A pointer to the original request.
MemCmd cmd
The command field of the packet.
bool sendTimingReq(PacketPtr pkt)
Attempt to send a timing request to the responder port by calling its corresponding receive function.
void sendFunctional(PacketPtr pkt) const
Send a functional request packet, where the data is instantly updated everywhere in the memory system...
@ KERNEL
The request should be marked with KERNEL.
int getNumOutstandingInvL2s() const
void decNumOutstandingInvL2s()
AMDGPUSystemHub * systemHub
static const int InvalidID
void setStatus(status_e newStatus)
std::deque< GPUDynInstPtr > instructionBuffer
void barrierId(int bar_id)
@ S_BARRIER
WF is stalled at a barrier.
void sample(const U &v, int n=1)
Add a value to the distribution n times.
The GPUDispatcher is the component of the shader that is responsible for creating and dispatching WGs...
The GPUCommandProcessor (CP) is responsible for accepting commands, in the form of HSA AQL packets,...
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
static constexpr std::enable_if_t< std::is_integral_v< T >, int > floorLog2(T x)
static constexpr bool isPowerOf2(const T &n)
static constexpr T roundDown(const T &val, const U &align)
This function is used to align addresses in memory.
void schedule(Event &event, Tick when)
static const Priority CPU_Tick_Pri
CPU ticks must come after other associated CPU events (such as writebacks).
#define panic(...)
This implements a cprintf based panic() function.
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
#define fatal(...)
This implements a cprintf based fatal() function.
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Bitfield< 21, 20 > stride
const FlagsType pdf
Print the percent of the total that this entry represents.
const FlagsType oneline
Print all values on a single line.
Copyright (c) 2024 Arm Limited All rights reserved.
T safe_cast(U &&ref_or_ptr)
std::shared_ptr< Request > RequestPtr
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Tick curTick()
The universal simulation clock.
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
int16_t PortID
Port index/ID type, and a symbolic name for an invalid port id.
void exitSimLoop(const std::string &message, int exit_code, Tick when, Tick repeat, bool serialize)
The "old style" exitSimLoop functions.
bool FullSystem
The FullSystem variable can be used to determine the current mode of simulation.
uint64_t Tick
Tick count type.
std::bitset< std::numeric_limits< unsigned long long >::digits > VectorMask
std::string csprintf(const char *format, const Args &...args)
void registerExitCallback(const std::function< void()> &callback)
Register an exit callback.
Declarations of a non-full system Page Table.
statistics::Scalar spillReads
statistics::Scalar groupWrites
statistics::Scalar numVecOpsExecutedF64
statistics::Scalar numFailedCASOps
statistics::Scalar numVecOpsExecuted
statistics::Formula vpc_f64
statistics::Scalar instCyclesSALU
statistics::Formula vectorMemWritesPerWF
statistics::Scalar argWrites
statistics::Scalar globalReads
statistics::Scalar numCASOps
statistics::Scalar completedWGs
statistics::Distribution activeLanesPerLMemInstrDist
statistics::Formula vALUInstsPerWF
statistics::Formula vectorMemWritesPerKiloInst
statistics::Formula sALUInstsPerWF
statistics::Formula readonlyMemInsts
statistics::Formula vALUUtilization
ComputeUnitStats(statistics::Group *parent, int n_wf)
statistics::Formula privMemInsts
statistics::VectorDistribution instInterleave
statistics::Scalar flatVMemInsts
statistics::Scalar numVecOpsExecutedMAC64
statistics::Formula vpc_f16
statistics::Scalar wgBlockedDueBarrierAllocation
statistics::Scalar wgBlockedDueLdsAllocation
statistics::Scalar dynamicLMemInstrCnt
statistics::Formula flatLDSInstsPerWF
statistics::Scalar numVecOpsExecutedMFMAF16
statistics::Vector instCyclesVMemPerSimd
statistics::Formula flatVMemInstsPerWF
statistics::Scalar argReads
statistics::Scalar numVecOpsExecutedTwoOpFP
statistics::Distribution waveLevelParallelism
statistics::Scalar numVecOpsExecutedF32
statistics::Scalar numVecOpsExecutedFMA64
statistics::Scalar scalarMemWrites
statistics::Formula scalarMemInstsPerKiloInst
statistics::Distribution controlFlowDivergenceDist
statistics::Formula groupMemInsts
statistics::Scalar privReads
statistics::Scalar numVecOpsExecutedMAC16
statistics::Scalar numTimesWgBlockedDueSgprAlloc
statistics::Formula numALUInstsExecuted
statistics::Scalar completedWfs
statistics::Distribution ldsBankConflictDist
statistics::Scalar vectorMemWrites
statistics::Scalar numInstrExecuted
statistics::Scalar vectorMemReads
statistics::Formula argMemInsts
statistics::Scalar tlbCycles
statistics::Formula scalarMemWritesPerKiloInst
statistics::Scalar scalarMemReads
statistics::Scalar tlbRequests
statistics::Formula kernargMemInsts
statistics::Formula vectorMemReadsPerKiloInst
statistics::Scalar numVecOpsExecutedF16
statistics::Scalar groupReads
statistics::Scalar privWrites
statistics::Scalar kernargReads
statistics::Scalar instCyclesVALU
statistics::Formula scalarMemWritesPerWF
statistics::Scalar readonlyWrites
statistics::Scalar numVecOpsExecutedMAD64
statistics::Scalar numVecOpsExecutedMFMAF64
statistics::Formula vectorMemReadsPerWF
statistics::Scalar dynamicGMemInstrCnt
statistics::Formula vpc_f32
statistics::Scalar ldsBankAccesses
statistics::Formula tlbLatency
statistics::Scalar vALUInsts
statistics::Scalar numVecOpsExecutedFMA32
statistics::Formula scalarMemReadsPerKiloInst
statistics::Formula globalMemInsts
statistics::Formula scalarMemReadsPerWF
statistics::Scalar numVecOpsExecutedMAD16
statistics::Vector hitsPerTLBLevel
statistics::Scalar numVecOpsExecutedMAC32
statistics::Scalar numTimesWgBlockedDueVgprAlloc
statistics::Scalar threadCyclesVALU
statistics::Scalar ldsNoFlatInsts
statistics::Scalar flatLDSInsts
statistics::Scalar numVecOpsExecutedMFMAF32
statistics::Scalar numVecOpsExecutedFMA16
statistics::Scalar spillWrites
statistics::Formula ldsNoFlatInstsPerWF
statistics::Scalar numVecOpsExecutedMAD32
statistics::Formula spillMemInsts
statistics::Scalar numVecOpsExecutedMFMAI8
statistics::Vector instCyclesLdsPerSimd
statistics::Vector instCyclesScMemPerSimd
statistics::Scalar kernargWrites
statistics::Distribution pageDivergenceDist
statistics::Distribution activeLanesPerGMemInstrDist
statistics::Scalar globalWrites
statistics::Scalar dynamicFlatMemInstrCnt
statistics::Distribution headTailLatency
statistics::Scalar totalCycles
statistics::Distribution execRateDist
statistics::Formula vectorMemInstsPerKiloInst
statistics::Scalar readonlyReads
statistics::Scalar sALUInsts
statistics::Scalar numVecOpsExecutedMFMA
SenderState is information carried along with the packet throughout the TLB hierarchy.
GPUDynInstPtr _gpuDynInst
ComputeUnit * computeUnit
GPUDynInstPtr _gpuDynInst
SenderState is information carried along with the packet throughout the TLB hierarchy.
GPUDynInstPtr _gpuDynInst
GPUDynInstPtr _gpuDynInst
GPU TranslationState: this currently is a somewhat bastardization of the usage of SenderState,...
std::vector< ResponsePort * > ports
Packet::SenderState * saved
const std::string & name()