#include "debug/GPUExec.hh"
#include "debug/GPUInitAbi.hh"
#include "debug/WavefrontStack.hh"
Wavefront::Wavefront(const Params &p)
    : SimObject(p), wfSlotId(p.wf_slot_id), simdId(p.simdId),
      maxIbSize(p.max_ib_size), _gpuISA(*this),
      vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
      vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
      sleepCnt(0), barId(WFBarrier::InvalidID), stats(this)
{
    // ...
    for (int i = 0; i < 3; ++i) {
    // ...
}

void
Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
{
    // ...
    uint32_t firstWave = 0;
    int orderedAppendTerm = 0;
    uint32_t finalValue = 0;
    Addr hidden_priv_base(0);
        // Each ABI-enabled SGPR is mapped to a physical register and
        // written in turn (cases of the sgprBitEnabled() switch). The
        // scratch resource descriptor is four dwords; the block below
        // repeats for scratch_resource_descriptor[1..3].
        physSgprIdx =
            computeUnit->registerManager->mapSgpr(this, regInitIdx);
        computeUnit->srf[simdId]->write(physSgprIdx,
                task->amdQueue.scratch_resource_descriptor[0]);
        ++regInitIdx;
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting PrivateSegBuffer: s[%d] = %x\n",
                computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                physSgprIdx,
                task->amdQueue.scratch_resource_descriptor[0]);
        // ...
        // The 64-bit dispatch packet address is written as two
        // consecutive 32-bit SGPRs: low dword first, high dword second.
        computeUnit->srf[simdId]->write(physSgprIdx,
                bits(host_disp_pkt_addr, 31, 0));
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting DispatchPtr: s[%d] = %x\n",
                computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                physSgprIdx, bits(host_disp_pkt_addr, 31, 0));
        // ...
        computeUnit->srf[simdId]->write(physSgprIdx,
                bits(host_disp_pkt_addr, 63, 32));
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting DispatchPtr: s[%d] = %x\n",
                computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                physSgprIdx, bits(host_disp_pkt_addr, 63, 32));
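        // Aside (sketch, hypothetical address value): bits(val, first,
        // last) extracts the inclusive bitfield [first:last] and right
        // justifies it, so each 64-bit host pointer lands in an SGPR
        // pair, low dword first:
        //
        //   uint64_t addr = 0x00007f1234567890ULL;
        //   uint32_t lo = bits(addr, 31, 0);    // 0x34567890 -> s[n]
        //   uint32_t hi = bits(addr, 63, 32);   // 0x00007f12 -> s[n+1]
        //   assert(((uint64_t)hi << 32 | lo) == addr);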
        // ...
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting QueuePtr: s[%d] = %x\n",
                computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                physSgprIdx, bits(task->hostAMDQueueAddr, 31, 0));
        // ...
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting QueuePtr: s[%d] = %x\n",
                computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                physSgprIdx, bits(task->hostAMDQueueAddr, 63, 32));
        // ...
        computeUnit->srf[simdId]->write(physSgprIdx,
                bits(kernarg_addr, 31, 0));
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting KernargSegPtr: s[%d] = %x\n",
                computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                physSgprIdx, bits(kernarg_addr, 31, 0));
        // ...
        computeUnit->srf[simdId]->write(physSgprIdx,
                bits(kernarg_addr, 63, 32));
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting KernargSegPtr: s[%d] = %x\n",
                computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                physSgprIdx, bits(kernarg_addr, 63, 32));
        // ...
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting DispatchId: s[%d] = %x\n", ...);
        // ...
        computeUnit->srf[simdId]->write(physSgprIdx,
                (TheGpuISA::ScalarRegU32)(task->amdQueue
                .scratch_backing_memory_location));
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting FlatScratch Addr: s[%d] = %x\n",
                computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                physSgprIdx,
                (TheGpuISA::ScalarRegU32)(task->amdQueue
                .scratch_backing_memory_location));
        // ...
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting FlatScratch size: s[%d] = %x\n", ...);
        // ...
        // Recover the scratch base address from the first two dwords of
        // the scratch resource descriptor (low 16 bits of dword 1 hold
        // the upper address bits).
        hidden_priv_base =
            (uint64_t)task->amdQueue.scratch_resource_descriptor[0] |
            (((uint64_t)task->amdQueue.scratch_resource_descriptor[1]
            & 0x000000000000ffff) << 32);
        // ...
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting private segment size: s[%d] = %x\n", ...);
        // ...
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting WG ID X: s[%d] = %x\n", ...);
        // ...
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting WG ID Y: s[%d] = %x\n", ...);
        // ...
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting WG ID Z: s[%d] = %x\n", ...);
        // ...
        // gfx942 uses the architected flat scratch register.
        if (task->gfxVersion() == GfxVersion::gfx942) {
            // ...
            DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                    "Setting architected flat scratch = %x\n", ...);
        }
        // ...
        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting Private Seg Offset: s[%d] = %x\n", ...);
        // ...
        firstWave = (wfId == 0) ? 1 : 0;
        numWfsInWg = divCeil(wgSizeInWorkItems,
                             computeUnit->wfSize());
        finalValue = firstWave << ((sizeof(uint32_t) * 8) - 1);
        finalValue |= (orderedAppendTerm << 6);
        finalValue |= numWfsInWg;
        // ...
        computeUnit->srf[simdId]->write(physSgprIdx, finalValue);

        DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                "Setting WG Info: s[%d] = %x\n",
                computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                physSgprIdx, finalValue);
        default:
            fatal("SGPR enable bit %i not supported\n", en_bit);
    // ...
    bool packed_work_item_id = false;

    if (task->gfxVersion() == GfxVersion::gfx90a ||
        task->gfxVersion() == GfxVersion::gfx942) {
        packed_work_item_id = true;
    }
    // In packed mode all three work-item IDs share one VGPR per lane.
    if (packed_work_item_id) {
        TheGpuISA::VecRegContainerU32 raw_vgpr;
        TheGpuISA::VecElemU32 *packed_vgpr
            = raw_vgpr.as<TheGpuISA::VecElemU32>();

        for (int lane = 0; lane < workItemId[0].size(); ++lane) {
            packed_vgpr[lane] = workItemId[0][lane] & 0x3ff;
        }
        for (int lane = 0; lane < workItemId[1].size(); ++lane) {
            packed_vgpr[lane] |= ((workItemId[1][lane] & 0x3ff) << 10);
        }
        for (int lane = 0; lane < workItemId[2].size(); ++lane) {
            packed_vgpr[lane] |= ((workItemId[2][lane] & 0x3ff) << 20);
        }
        // ...
    }
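    // Aside (sketch): each lane's packed VGPR holds x in bits [9:0],
    // y in [19:10], z in [29:20]; unpacking a hypothetical lane value:
    //
    //   uint32_t packed = (5 & 0x3ff) | ((7 & 0x3ff) << 10)
    //                   | ((2 & 0x3ff) << 20);
    //   uint32_t x =  packed        & 0x3ff;    // 5
    //   uint32_t y = (packed >> 10) & 0x3ff;    // 7
    //   uint32_t z = (packed >> 20) & 0x3ff;    // 2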
    // In unpacked mode, X, Y, and Z each get their own mapped VGPR.
    uint32_t physVgprIdx = 0;
    TheGpuISA::VecRegContainerU32 raw_vgpr;
    // ...
    TheGpuISA::VecElemU32 *vgpr_x
        = raw_vgpr.as<TheGpuISA::VecElemU32>();
    for (int lane = 0; lane < workItemId[0].size(); ++lane) {
        vgpr_x[lane] = workItemId[0][lane];
    }
    // ...
    TheGpuISA::VecElemU32 *vgpr_y
        = raw_vgpr.as<TheGpuISA::VecElemU32>();
    for (int lane = 0; lane < workItemId[1].size(); ++lane) {
        vgpr_y[lane] = workItemId[1][lane];
    }
    // ...
    physVgprIdx = computeUnit->registerManager->mapVgpr(this, regInitIdx);
    TheGpuISA::VecElemU32 *vgpr_z
        = raw_vgpr.as<TheGpuISA::VecElemU32>();
    for (int lane = 0; lane < workItemId[2].size(); ++lane) {
        vgpr_z[lane] = workItemId[2][lane];
    }
598 "CU%d has been idle for %d ticks at tick %d",
// ...

// isGmInstruction(): flat ops count as global memory once they have
// resolved to the global segment.
    if (ii->isGlobalMem() ||
        (ii->isFlat() && ii->executedAs() == enums::SC_GLOBAL)) {
        return true;
    }
// ...

// isLmInstruction(): flat ops count as local (LDS) memory once they
// have resolved to the group segment.
    if (ii->isLocalMem() ||
        (ii->isFlat() && ii->executedAs() == enums::SC_GROUP)) {
        return true;
    }
// ...

// isOldestInstWaitcnt():
    if (ii->isWaitcnt()) {
        // s_waitcnt is a scalar instruction.
        assert(ii->isScalar());
        // ...
    }
// ...

// isOldestInstScalarALU():
    if (status != S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn()
        || ii->isEndOfKernel() || ii->isBranch() || ii->isALU() ||
        (ii->isKernArgSeg() && ii->isLoad()))) {
        // ...
    }
// ...

// isOldestInstVectorALU():
    if (status != S_STOPPED && !ii->isScalar() && (ii->isNop() ||
        ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel()
        || (ii->isKernArgSeg() && ii->isLoad()))) {
        // ...
    }
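// Aside (sketch): every isOldestInst*() predicate inspects the oldest
// entry of the in-order instruction buffer before deciding whether it
// may issue:
//
//   GPUDynInstPtr ii = instructionBuffer.front();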
// ...
    if (ii->isReturn() || ii->isBranch() ||
        ii->isEndOfKernel()) {
        // ...
    }
806 "Negative requests in pipe for WF%d for slot%d"
807 " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d,"
808 " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d,"
809 " Outstanding Reqs=%d\n",
// ...

// reserveGmResource(): account the request in the right global-memory
// counters, vector vs. scalar.
    if (!ii->isScalar()) {
        if (ii->isLoad()) {
            // ...
        } else if (ii->isStore()) {
            // ...
        } else if (ii->isAtomic() || ii->isMemSync()) {
            // ...
        } else {
            panic("Invalid memory operation!\n");
        }
        // ...
    } else {
        if (ii->isLoad()) {
            // ...
        } else if (ii->isStore()) {
            // ...
        } else if (ii->isAtomic() || ii->isMemSync()) {
            // ...
        } else {
            panic("Invalid memory operation!\n");
        }
        // ...
    }
// ...

// reserveLmResource(): LDS traffic is vector-only.
    fatal_if(ii->isScalar(),
             "Scalar instructions can not access Shared memory!!!");
    if (ii->isLoad()) {
        // ...
    } else if (ii->isStore()) {
        // ...
    } else if (ii->isAtomic() || ii->isMemSync()) {
        // ...
    } else {
        panic("Invalid memory operation!\n");
    }
// ...

// reserveResources(): pick the execution unit(s) the oldest instruction
// needs and return their IDs.
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() || ii->isNop() ||
        (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
        ii->isReturn() || ii->isEndOfKernel()) {
        if (!ii->isScalar()) {
            // ...
        }
        // ...
    } else if (ii->isBarrier()) {
        // ...
    } else if (ii->isFlat()) {
        assert(!ii->isScalar());
        // ...
    } else if (ii->isGlobalMem()) {
        // ...
    } else if (ii->isLocalMem()) {
        // ...
    } else if (ii->isPrivateSeg()) {
        fatal_if(ii->isScalar(),
                 "Scalar instructions can not access Private memory!!!");
        // ...
    } else {
        panic("reserveResources -> Couldn't process op!\n");
    }

    assert(execUnitIds.size());
// ...
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
            "(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId,
            wfDynId, ii->disassemble(), old_pc, ii->seqNum());
    // ...
    if (!ii->isScalar()) {
    // ...
    // Track RAW distances: sample on source reads, record on
    // destination writes.
    for (const auto& srcVecOp : ii->srcVecRegOperands()) {
        for (const auto& virtIdx : srcVecOp.virtIndices()) {
            // ...
        }
    }
    // ...
    for (const auto& dstVecOp : ii->dstVecRegOperands()) {
        for (const auto& virtIdx : dstVecOp.virtIndices()) {
            // ...
        }
    }
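    // Aside (sketch of what the elided loop bodies do, per the rawDist
    // map and vecRawDistance stat): destination writes record the
    // current dynamic instruction count; source reads sample the
    // distance since the last write of that virtual register:
    //
    //   // on a source read of virtIdx:
    //   if (rawDist.find(virtIdx) != rawDist.end()) {
    //       stats.vecRawDistance.sample(stats.numInstrExecuted.value() -
    //                                   rawDist[virtIdx]);
    //   }
    //   // on a destination write of virtIdx:
    //   rawDist[virtIdx] = stats.numInstrExecuted.value();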
    if (pc() == old_pc) {
        // PC unchanged by the instruction: advance to the next buffered
        // instruction.
        // ...
    } else {
        DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave%d %s taken branch\n",
                computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                ii->disassemble());
        // ...
    }
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId, pc());

    const int num_active_lanes = execMask().count();
    // ...
    stats.numVecOpsExecuted += num_active_lanes;
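    // Aside: execMask() is the per-lane VectorMask (a std::bitset), so
    // count() gives the number of active lanes that all the per-op
    // stats below scale by. E.g.:
    //
    //   VectorMask m;                    // hypothetical mask
    //   m.set(0); m.set(5); m.set(63);
    //   assert(m.count() == 3);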
    if (ii->isF16() && ii->isALU()) {
        if (ii->isF32() || ii->isF64()) {
            fatal("Instruction is tagged as both (1) F16, and (2) "
                  "either F32 or F64.");
        }
        stats.numVecOpsExecutedF16 += num_active_lanes;
        if (ii->isFMA()) {
            stats.numVecOpsExecutedFMA16 += num_active_lanes;
        } else if (ii->isMAC()) {
            stats.numVecOpsExecutedMAC16 += num_active_lanes;
        } else if (ii->isMAD()) {
            stats.numVecOpsExecutedMAD16 += num_active_lanes;
        } else if (ii->isMFMA()) {
            stats.numVecOpsExecutedMFMAF16 += num_active_lanes;
        }
    }
    if (ii->isF32() && ii->isALU()) {
        if (ii->isF16() || ii->isF64()) {
            fatal("Instruction is tagged as both (1) F32, and (2) "
                  "either F16 or F64.");
        }
        stats.numVecOpsExecutedF32 += num_active_lanes;
        if (ii->isFMA()) {
            stats.numVecOpsExecutedFMA32 += num_active_lanes;
        } else if (ii->isMAC()) {
            stats.numVecOpsExecutedMAC32 += num_active_lanes;
        } else if (ii->isMAD()) {
            stats.numVecOpsExecutedMAD32 += num_active_lanes;
        } else if (ii->isMFMA()) {
            stats.numVecOpsExecutedMFMAF32 += num_active_lanes;
        }
    }
    if (ii->isF64() && ii->isALU()) {
        if (ii->isF16() || ii->isF32()) {
            fatal("Instruction is tagged as both (1) F64, and (2) "
                  "either F16 or F32.");
        }
        stats.numVecOpsExecutedF64 += num_active_lanes;
        if (ii->isFMA()) {
            stats.numVecOpsExecutedFMA64 += num_active_lanes;
        } else if (ii->isMAC()) {
            stats.numVecOpsExecutedMAC64 += num_active_lanes;
        } else if (ii->isMAD()) {
            stats.numVecOpsExecutedMAD64 += num_active_lanes;
        } else if (ii->isMFMA()) {
            stats.numVecOpsExecutedMFMAF64 += num_active_lanes;
        }
    }
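    // Aside (sketch): the three fatal() checks above enforce that an
    // ALU op carries exactly one FP-width tag; the same invariant in a
    // single hypothetical check:
    //
    //   if ((int)ii->isF16() + (int)ii->isF32() + (int)ii->isF64() > 1)
    //       fatal("Instruction tagged with more than one FP width.");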
// ...

// updateInstStats(): flat ops are classified only after their segment
// is resolved; SC_PRIVATE is accounted with global memory.
    bool flat_as_gm = false;
    bool flat_as_lm = false;
    if (ii->isFlat()) {
        flat_as_gm = (ii->executedAs() == enums::SC_GLOBAL) ||
                     (ii->executedAs() == enums::SC_PRIVATE);
        flat_as_lm = (ii->executedAs() == enums::SC_GROUP);
    }
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() || ii->isNop() ||
        (ii->isKernArgSeg() && ii->isLoad()) ||
        ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) {
        if (!ii->isScalar()) {
            // ...
        }
        // ...
    } else if (ii->isBarrier()) {
        // ...
    } else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) {
        if (!ii->isScalar()) {
            // ...
        }
        // ...
    } else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) {
        if (!ii->isScalar()) {
            // ...
        }
        // ...
    } else if ((ii->isAtomic() || ii->isMemSync()) &&
               (ii->isGlobalMem() || flat_as_gm)) {
        if (!ii->isScalar()) {
            // ...
        }
        // ...
    } else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) {
        // ...
    } else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
        // ...
    } else if ((ii->isAtomic() || ii->isMemSync()) &&
               (ii->isLocalMem() || flat_as_lm)) {
        // ...
    } else {
        panic("Bad instruction type!\n");
    }
// ...

// setWaitCnts(): install the decoded s_waitcnt fields.
    assert(vm_wait_cnt >= 0);
    assert(exp_wait_cnt >= 0);
    assert(lgkm_wait_cnt >= 0);
    // ...
    assert(vm_wait_cnt <= 0xf);
    assert(exp_wait_cnt <= 0x7);
    assert(lgkm_wait_cnt <= 0x1f);
    // ...
    // A counter at its all-ones value means "don't wait on this class".
    if (vm_wait_cnt != 0xf)
        vmWaitCnt = vm_wait_cnt;
    if (exp_wait_cnt != 0x7)
        expWaitCnt = exp_wait_cnt;
    if (lgkm_wait_cnt != 0x1f)
        lgkmWaitCnt = lgkm_wait_cnt;
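// Aside (sketch; the exact simm16 layout differs across gfx
// generations): the three counters have different widths -- vm 4 bits,
// exp 3 bits, lgkm 5 bits -- which is why 0xf, 0x7, and 0x1f are the
// "don't wait" sentinels filtered out above. Decoding under an assumed
// lgkm[12:8] | exp[6:4] | vm[3:0] layout:
//
//   int vm_wait_cnt   =  simm16        & 0xf;
//   int exp_wait_cnt  = (simm16 >> 4)  & 0x7;
//   int lgkm_wait_cnt = (simm16 >> 8)  & 0x1f;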
// ...
    assert(bar_id < computeUnit->numBarrierSlots());
// ...

Wavefront::WavefrontStats::WavefrontStats(statistics::Group *parent)
    : statistics::Group(parent),
      ADD_STAT(numInstrExecuted,
               "number of instructions executed by this WF slot"),
      ADD_STAT(schCycles, "number of cycles spent in schedule stage"),
      ADD_STAT(schStalls, "number of cycles WF is stalled in SCH stage"),
      ADD_STAT(schRfAccessStalls, "number of cycles wave selected in SCH but "
               "RF denied adding instruction"),
      ADD_STAT(schResourceStalls, "number of cycles stalled in sch by resource"
               " not available"),
      ADD_STAT(schOpdNrdyStalls, "number of cycles stalled in sch waiting for "
               "RF reads to complete"),
      ADD_STAT(schLdsArbStalls,
               "number of cycles wave stalled due to LDS-VRF arbitration"),
      ADD_STAT(numTimesBlockedDueWAXDependencies, "number of times the wf's "
               "instructions are blocked due to WAW or WAR dependencies"),
      ADD_STAT(numTimesBlockedDueRAWDependencies, "number of times the wf's "
               "instructions are blocked due to RAW dependencies"),
      ADD_STAT(vecRawDistance,
               "Count of RAW distance in dynamic instructions for this WF"),
      ADD_STAT(readsPerWrite, "Count of Vector reads per write for this WF")
{
    // ...
}