gem5 v22.1.0.0
schedule_stage.cc
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/schedule_stage.hh"

#include <unordered_set>

#include "base/compiler.hh"
#include "debug/GPUSched.hh"
#include "debug/GPUVRF.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"

namespace gem5
{

ScheduleStage::ScheduleStage(const ComputeUnitParams &p, ComputeUnit &cu,
                             ScoreboardCheckToSchedule &from_scoreboard_check,
                             ScheduleToExecute &to_execute)
    : computeUnit(cu), fromScoreboardCheck(from_scoreboard_check),
      toExecute(to_execute),
      _name(cu.name() + ".ScheduleStage"),
      vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
      scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
      locMemBusRdy(false), locMemIssueRdy(false), stats(&cu, cu.numExeUnits())
{
    for (int j = 0; j < cu.numExeUnits(); ++j) {
        scheduler.emplace_back(p);
    }
    wavesInSch.clear();
    schList.resize(cu.numExeUnits());
    for (auto &dq : schList) {
        dq.clear();
    }
}

ScheduleStage::~ScheduleStage()
{
    scheduler.clear();
    wavesInSch.clear();
    schList.clear();
}

void
ScheduleStage::init()
{

    fatal_if(scheduler.size() != fromScoreboardCheck.numReadyLists(),
             "Scheduler should have same number of entries as CU's readyList");
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        scheduler[j].bindList(&fromScoreboardCheck.readyWFs(j));
    }

    assert(computeUnit.numVectorGlobalMemUnits == 1);
    assert(computeUnit.numVectorSharedMemUnits == 1);
}

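// exec() is called every cycle and drives the SCH stage:
//  1) prune the ready lists delivered by scoreboard-check and try to add
//     one wave per execution resource to schList, scheduling its RF
//     operand reads;
//  2) mark waves whose operand reads have completed as RFREADY
//     (checkRfOperandReadComplete);
//  3) pick the oldest dispatch-ready wave per resource for the dispatch
//     list (fillDispatchList);
//  4) arbitrate the VRF->LDS bus between FLAT and LM waves
//     (arbitrateVrfToLdsBus);
//  5) schedule RF writes for destination operands (scheduleRfDestOperands);
//  6) reserve execution resources for the waves that execute next cycle
//     (reserveResources).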
void
ScheduleStage::exec()
{
    toExecute.reset();

    // Update readyList
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        // Remove waves that were marked ready at scoreboard-check but no
        // longer have an instruction available to schedule, then drop any
        // wave that already has an instruction in flight in SCH.
        fromScoreboardCheck.updateReadyList(j);
        for (auto wIt = fromScoreboardCheck.readyWFs(j).begin();
             wIt != fromScoreboardCheck.readyWFs(j).end();) {
            if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) {
                *wIt = nullptr;
                wIt = fromScoreboardCheck.readyWFs(j).erase(wIt);
            } else {
                wIt++;
            }
        }
    }

    // Attempt to add another wave for each EXE type to schList queues
    // VMEM resources are iterated first, effectively giving priority
    // to VMEM over VALU for scheduling read of operands to the RFs.
    // Scalar memory units are iterated after VMEM

    // Iterate VMEM and SMEM
    int firstMemUnit = computeUnit.firstMemUnit();
    int lastMemUnit = computeUnit.lastMemUnit();
    for (int j = firstMemUnit; j <= lastMemUnit; j++) {
        int readyListSize = fromScoreboardCheck.readyWFs(j).size();
        // If no wave is ready to be scheduled on the execution resource
        // then skip scheduling for this execution resource
        if (!readyListSize) {
            stats.rdyListEmpty[j]++;
            continue;
        }
        stats.rdyListNotEmpty[j]++;

        // Pick a wave and attempt to add it to schList
        Wavefront *wf = scheduler[j].chooseWave();
        GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
        assert(gpu_dyn_inst);
        if (!addToSchList(j, gpu_dyn_inst)) {
            // For waves not added to schList, increment count of cycles
            // this wave spends in SCH stage.
            wf->stats.schCycles++;
            stats.addToSchListStalls[j]++;
        } else {
            if (gpu_dyn_inst->isScalar() || gpu_dyn_inst->isGroupSeg()) {
                wf->incLGKMInstsIssued();
            } else {
                wf->incVMemInstsIssued();
                if (gpu_dyn_inst->isFlat()) {
                    wf->incLGKMInstsIssued();
                }
            }
            if (gpu_dyn_inst->isStore() && gpu_dyn_inst->isGlobalSeg()) {
                wf->incExpInstsIssued();
            }
        }
    }

154 
155  // Iterate everything else
156  for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
157  // skip the VMEM resources
158  if (j >= firstMemUnit && j <= lastMemUnit) {
159  continue;
160  }
161  int readyListSize = fromScoreboardCheck.readyWFs(j).size();
162  // If no wave is ready to be scheduled on the execution resource
163  // then skip scheduling for this execution resource
164  if (!readyListSize) {
165  stats.rdyListEmpty[j]++;
166  continue;
167  }
169 
170  // Pick a wave and attempt to add it to schList
171  Wavefront *wf = scheduler[j].chooseWave();
172  GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
173  assert(gpu_dyn_inst);
174  if (!addToSchList(j, gpu_dyn_inst)) {
175  // For waves not added to schList, increment count of cycles
176  // this wave spends in SCH stage.
177  wf->stats.schCycles++;
179  }
180  }
181 
182  // At this point, the schList queue per EXE type may contain
183  // multiple waves, in order of age (oldest to youngest).
184  // Wave may be in RFBUSY, indicating they are waiting for registers
185  // to be read, or in RFREADY, indicating they are candidates for
186  // the dispatchList and execution
187 
188  // Iterate schList queues and check if any of the waves have finished
189  // reading their operands, moving those waves to RFREADY status
191 
192  // Fill the dispatch list with the oldest wave of each EXE type that
193  // is ready to execute
194  // Wave is picked if status in schList is RFREADY and it passes resource
195  // ready checks similar to those currently in SCB
197 
198  // Resource arbitration on waves in dispatchList
199  // Losing waves are re-inserted to the schList at a location determined
200  // by wave age
201 
202  // Arbitrate access to the VRF->LDS bus
204 
205  // Schedule write operations to the register files
207 
208  // Lastly, reserve resources for waves that are ready to execute.
210 }
211 
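// Helpers that update the Schedule->Execute interface (toExecute). Each
// execution resource's dispatch slot is either EMPTY (nothing to issue),
// EXREADY (a wave is ready to execute next cycle), or SKIP (the slot is
// consumed by the LM half of a FLAT instruction issuing on the GM pipe).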
void
ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s,
                                        const GPUDynInstPtr &gpu_dyn_inst)
{
    toExecute.dispatchTransition(gpu_dyn_inst, unitId, s);
}

void
ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s)
{
    toExecute.dispatchTransition(unitId, s);
}

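// Attempt to reserve write ports in the VRF/SRF for the destination
// operands of a wave that is about to issue. Returns false (and records
// stall statistics) if either register file cannot accept the writes.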
bool
ScheduleStage::schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
{
    assert(gpu_dyn_inst);
    Wavefront *wf = gpu_dyn_inst->wavefront();
    bool accessVrfWr = true;
    if (!gpu_dyn_inst->isScalar()) {
        accessVrfWr = computeUnit.vrf[wf->simdId]
            ->canScheduleWriteOperands(wf, gpu_dyn_inst);
    }
    bool accessSrfWr = computeUnit.srf[wf->simdId]
        ->canScheduleWriteOperands(wf, gpu_dyn_inst);
    bool accessRf = accessVrfWr && accessSrfWr;
    if (accessRf) {
        if (!gpu_dyn_inst->isScalar()) {
            computeUnit.vrf[wf->simdId]->scheduleWriteOperands(wf,
                                                               gpu_dyn_inst);
        }
        computeUnit.srf[wf->simdId]->scheduleWriteOperands(wf, gpu_dyn_inst);
        return true;
    } else {
        stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
        if (!accessSrfWr) {
            stats.rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++;
        }
        if (!accessVrfWr) {
            stats.rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++;
        }

        // Increment stall counts for WF
        wf->stats.schStalls++;
        wf->stats.schRfAccessStalls++;
    }
    return false;
}

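// For every wave placed on the dispatch list this cycle, try to schedule
// its register-file destination writes. A wave that cannot reserve its
// write resources loses its dispatch slot and is re-inserted into schList
// (in age order) to try again in a later cycle.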
void
ScheduleStage::scheduleRfDestOperands()
{
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        if (toExecute.dispatchStatus(j) == EMPTY ||
            toExecute.dispatchStatus(j) == SKIP) {
            continue;
        }

        // get the wave on dispatch list and attempt to allocate write
        // resources in the RFs
        const GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j);
        assert(gpu_dyn_inst);
        Wavefront *wf = gpu_dyn_inst->wavefront();
        if (!schedRfWrites(j, gpu_dyn_inst)) {
            reinsertToSchList(j, gpu_dyn_inst);
            doDispatchListTransition(j, EMPTY);
            // if this is a flat inst, also transition the LM pipe to empty
            // Note: since FLAT/LM arbitration occurs before scheduling
            // destination operands to the RFs, it is possible that a LM
            // instruction lost arbitration, but would have been able to
            // pass the RF destination operand check here, and execute
            // instead of the FLAT.
            if (wf->instructionBuffer.front()->isFlat()) {
                assert(toExecute.dispatchStatus(wf->localMem)
                       == SKIP);
                doDispatchListTransition(wf->localMem, EMPTY);
            }
        }
    }
}

bool
ScheduleStage::addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
{
    // Attempt to add the wave to the schList if the VRF can support the
    // wave's next instruction
    assert(gpu_dyn_inst);
    Wavefront *wf = gpu_dyn_inst->wavefront();
    bool accessVrf = true;
    if (!gpu_dyn_inst->isScalar()) {
        accessVrf = computeUnit.vrf[wf->simdId]
            ->canScheduleReadOperands(wf, gpu_dyn_inst);
    }
    bool accessSrf = computeUnit.srf[wf->simdId]
        ->canScheduleReadOperands(wf, gpu_dyn_inst);
    // If RFs can support instruction, add to schList in RFBUSY state,
    // place wave in wavesInSch and pipeMap, and schedule Rd/Wr operands
    // to the VRF
    bool accessRf = accessVrf && accessSrf;
    if (accessRf) {
        DPRINTF(GPUSched, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n",
                exeType, wf->simdId, wf->wfDynId,
                gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());

        computeUnit.insertInPipeMap(wf);
        wavesInSch.emplace(wf->wfDynId);
        schList.at(exeType).push_back(std::make_pair(gpu_dyn_inst, RFBUSY));
        if (wf->isOldestInstBarrier() && wf->hasBarrier()) {
            wf->setStatus(Wavefront::S_BARRIER);
        }
        if (wf->isOldestInstWaitcnt()) {
            wf->setStatus(Wavefront::S_WAITCNT);
        }
        if (wf->isOldestInstSleep()) {
            wf->setStatus(Wavefront::S_STALLED_SLEEP);
        }
        if (!gpu_dyn_inst->isScalar()) {
            computeUnit.vrf[wf->simdId]
                ->scheduleReadOperands(wf, gpu_dyn_inst);
        }
        computeUnit.srf[wf->simdId]->scheduleReadOperands(wf, gpu_dyn_inst);

        DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
                exeType, wf->simdId, wf->wfDynId,
                gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
        return true;
    } else {
        // Number of stall cycles due to RF access denied
        stats.rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
        // Count number of denials due to each reason
        // Multiple items may contribute to the denied request
        if (!accessVrf) {
            stats.rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++;
        }
        if (!accessSrf) {
            stats.rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++;
        }

        // Increment stall counts for WF
        wf->stats.schStalls++;
        wf->stats.schRfAccessStalls++;
        DPRINTF(GPUSched, "schList[%d]: Could not add: "
                "SIMD[%d] WV[%d]: %d: %s\n",
                exeType, wf->simdId, wf->wfDynId,
                gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
    }
    return false;
}

void
ScheduleStage::reinsertToSchList(int exeType,
                                 const GPUDynInstPtr &gpu_dyn_inst)
{
    // Insert wave w into schList for specified exeType.
    // Wave is inserted in age order, with oldest wave being at the
    // front of the schList
    assert(gpu_dyn_inst);
    auto schIter = schList.at(exeType).begin();
    while (schIter != schList.at(exeType).end()
           && schIter->first->wfDynId < gpu_dyn_inst->wfDynId) {
        schIter++;
    }
    schList.at(exeType).insert(schIter, std::make_pair(gpu_dyn_inst, RFREADY));
}

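// Sample the availability of the memory pipelines and their register-file
// buses one cycle ahead, so that dispatchReady() can tell whether a memory
// instruction could actually issue next cycle.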
void
ScheduleStage::checkMemResources()
{
    // Check for resource availability in the next cycle
    scalarMemBusRdy = false;
    scalarMemIssueRdy = false;
    // check if there is a SRF->Global Memory bus available
    if (computeUnit.srfToScalarMemPipeBus.rdy(Cycles(1))) {
        scalarMemBusRdy = true;
    }
    // check if we can issue a scalar memory instruction
    if (computeUnit.scalarMemUnit.rdy(Cycles(1))) {
        scalarMemIssueRdy = true;
    }

    glbMemBusRdy = false;
    glbMemIssueRdy = false;
    // check if there is a VRF->Global Memory bus available
    if (computeUnit.vrfToGlobalMemPipeBus.rdy(Cycles(1))) {
        glbMemBusRdy = true;
    }
    // check if we can issue a Global memory instruction
    if (computeUnit.vectorGlobalMemUnit.rdy(Cycles(1))) {
        glbMemIssueRdy = true;
    }

    locMemBusRdy = false;
    locMemIssueRdy = false;
    // check if there is a VRF->LDS bus available
    if (computeUnit.vrfToLocalMemPipeBus.rdy(Cycles(1))) {
        locMemBusRdy = true;
    }
    // check if we can issue a LDS instruction
    if (computeUnit.vectorSharedMemUnit.rdy(Cycles(1))) {
        locMemIssueRdy = true;
    }
}

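// Resource-readiness check for the oldest instruction of a candidate wave.
// The checks depend on the instruction class: ALU/branch/barrier/NOP
// instructions need a free scalar or vector ALU; memory instructions
// additionally need the issue unit, the RF->memory bus, and (for
// global/flat accesses) coalescer and request-FIFO capacity.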
bool
ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
{
    assert(gpu_dyn_inst);
    Wavefront *wf = gpu_dyn_inst->wavefront();
    vectorAluRdy = false;
    scalarAluRdy = false;
    // check for available vector/scalar ALUs in the next cycle
    if (computeUnit.vectorALUs[wf->simdId].rdy(Cycles(1))) {
        vectorAluRdy = true;
    }
    if (computeUnit.scalarALUs[wf->scalarAlu].rdy(Cycles(1))) {
        scalarAluRdy = true;
    }

    if (gpu_dyn_inst->isNop()) {
        // S_NOP requires SALU. V_NOP requires VALU.
        // TODO: Scalar NOP does not require SALU in hardware,
        // and is executed out of IB directly.
        if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
            stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
            stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
            return false;
        }
    } else if (gpu_dyn_inst->isEndOfKernel()) {
        // EndPgm instruction
        if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
            stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        }
    } else if (gpu_dyn_inst->isBarrier() || gpu_dyn_inst->isBranch()
               || gpu_dyn_inst->isALU()) {
        // Barrier, Branch, or ALU instruction
        if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
            stats.dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
            stats.dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
            return false;
        }
    } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
        // Vector Global Memory instruction
        bool rdy = true;
        if (!glbMemIssueRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
        }
        if (!glbMemBusRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
            rdy = false;
            stats.dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
            rdy = false;
            stats.dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
        // Scalar Global Memory instruction
        bool rdy = true;
        if (!scalarMemIssueRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++;
        }
        if (!scalarMemBusRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.scalarMemoryPipe
            .isGMReqFIFOWrRdy(wf->scalarRdGmReqsInPipe
                              + wf->scalarWrGmReqsInPipe))
        {
            rdy = false;
            stats.dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isLocalMem()) {
        // Vector Local Memory instruction
        bool rdy = true;
        if (!locMemIssueRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++;
        }
        if (!locMemBusRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.localMemoryPipe.
                isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
            rdy = false;
            stats.dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isFlat()) {
        // Vector Flat memory instruction
        bool rdy = true;
        if (!glbMemIssueRdy || !locMemIssueRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++;
        }
        if (!glbMemBusRdy || !locMemBusRdy) {
            rdy = false;
            stats.dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
            rdy = false;
            stats.dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
            rdy = false;
            stats.dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
        }
        if (!computeUnit.localMemoryPipe.
                isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
            rdy = false;
            stats.dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else {
        panic("%s: unknown instr checked for readiness",
              gpu_dyn_inst->disassemble());
        return false;
    }
    stats.dispNrdyStalls[SCH_RDY]++;
    return true;
}

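// For each execution resource, walk its schList queue (oldest first) and
// move the first RFREADY wave that passes dispatchReady() onto the
// dispatch list for execution next cycle. Global memory operations acquire
// a coalescer token at this point; waves that are passed over accumulate
// stall statistics.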
void
ScheduleStage::fillDispatchList()
{
    // update execution resource status
    checkMemResources();
    // iterate execution resources
    for (int j = 0; j < computeUnit.numExeUnits(); j++) {
        assert(toExecute.dispatchStatus(j) == EMPTY);

        // iterate waves in schList to pick one for dispatch
        auto schIter = schList.at(j).begin();
        bool dispatched = false;
        while (schIter != schList.at(j).end()) {
            // only attempt to dispatch if status is RFREADY
            if (schIter->second == RFREADY) {
                // Check if this wave is ready for dispatch
                bool dispRdy = dispatchReady(schIter->first);
                if (!dispatched && dispRdy) {
                    // No other wave has been dispatched for this exe
                    // resource, and this wave is ready. Place this wave
                    // on dispatchList and make it ready for execution
                    // next cycle.

                    // Acquire a coalescer token if it is a global mem
                    // operation.
                    GPUDynInstPtr mp = schIter->first;
                    if (!mp->isMemSync() && !mp->isScalar() &&
                        (mp->isGlobalMem() || mp->isFlat())) {
                        computeUnit.globalMemoryPipe.acqCoalescerToken(mp);
                    }

                    // Set instruction's exec_mask if it's a mem operation
                    if (mp->isMemRef()) {
                        mp->exec_mask = mp->wavefront()->execMask();
                    }

                    doDispatchListTransition(j, EXREADY, schIter->first);
                    DPRINTF(GPUSched, "dispatchList[%d]: fillDispatchList: "
                            "EMPTY->EXREADY\n", j);
                    schIter->first = nullptr;
                    schIter = schList.at(j).erase(schIter);
                    dispatched = true;
                } else {
                    // Either another wave has been dispatched, or this wave
                    // was not ready, so it is stalled this cycle
                    schIter->first->wavefront()->stats.schStalls++;
                    if (!dispRdy) {
                        // not ready for dispatch, increment stall stat
                        schIter->first->wavefront()->stats.schResourceStalls++;
                    }
                    // Examine next wave for this resource
                    schIter++;
                }
            } else {
                // Wave not in RFREADY, try next wave
                schIter++;
            }
        }

        // Increment stall count if no wave sent to dispatchList for
        // current execution resource
        if (!dispatched) {
            stats.schListToDispListStalls[j]++;
        } else {
            stats.schListToDispList[j]++;
        }
    }
}

void
ScheduleStage::arbitrateVrfToLdsBus()
{
    // Arbitrate the VRF->GM and VRF->LDS buses for Flat memory ops
    // Note: a Flat instruction in GFx8 reserves both VRF->Glb memory bus
    // and a VRF->LDS bus. In GFx9, this is not the case.

    // iterate the GM pipelines
    for (int i = 0; i < computeUnit.numVectorGlobalMemUnits; i++) {
        // get the GM pipe index in the dispatchList
        int gm_exe_unit = computeUnit.firstMemUnit() + i;
        // get the wave in the dispatchList
        GPUDynInstPtr &gpu_dyn_inst
            = toExecute.readyInst(gm_exe_unit);
        // If the WF is valid, ready to execute, and the instruction
        // is a flat access, arbitrate with the WF's assigned LM pipe
        if (gpu_dyn_inst && toExecute.dispatchStatus(gm_exe_unit)
            == EXREADY && gpu_dyn_inst->isFlat()) {
            Wavefront *wf = gpu_dyn_inst->wavefront();
            // If the associated LM pipe also has a wave selected, block
            // that wave and let the Flat instruction issue. The WF in the
            // LM pipe is added back to the schList for consideration next
            // cycle.
            if (toExecute.dispatchStatus(wf->localMem) == EXREADY) {
                reinsertToSchList(wf->localMem, toExecute
                                  .readyInst(wf->localMem));
                // Increment stall stats for LDS-VRF arbitration
                stats.ldsBusArbStalls++;
                toExecute.readyInst(wf->localMem)
                    ->wavefront()->stats.schLdsArbStalls++;
            }
            // With arbitration of LM pipe complete, transition the
            // LM pipe to SKIP state in the dispatchList to inform EX stage
            // that a Flat instruction is executing next cycle
            doDispatchListTransition(wf->localMem, SKIP, gpu_dyn_inst);
            DPRINTF(GPUSched, "dispatchList[%d]: arbVrfLds: "
                    "EXREADY->SKIP\n", wf->localMem);
        }
    }
}

void
ScheduleStage::checkRfOperandReadComplete()
{
    // Iterate the schList queues and check if operand reads
    // have completed in the RFs. If so, mark the wave as ready for
    // selection for dispatchList
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        for (auto &p : schList.at(j)) {
            const GPUDynInstPtr &gpu_dyn_inst = p.first;
            assert(gpu_dyn_inst);
            Wavefront *wf = gpu_dyn_inst->wavefront();

            // Increment the number of cycles the wave spends in the
            // SCH stage, since this loop visits every wave in SCH.
            wf->stats.schCycles++;

            bool vrfRdy = true;
            if (!gpu_dyn_inst->isScalar()) {
                vrfRdy = computeUnit.vrf[wf->simdId]
                    ->operandReadComplete(wf, gpu_dyn_inst);
            }
            bool srfRdy = computeUnit.srf[wf->simdId]
                ->operandReadComplete(wf, gpu_dyn_inst);
            bool operandsReady = vrfRdy && srfRdy;
            if (operandsReady) {
                DPRINTF(GPUSched, "schList[%d]: WV[%d] operands ready for: "
                        "%d: %s\n", j, wf->wfDynId, gpu_dyn_inst->seqNum(),
                        gpu_dyn_inst->disassemble());
                DPRINTF(GPUSched, "schList[%d]: WV[%d] RFBUSY->RFREADY\n",
                        j, wf->wfDynId);
                p.second = RFREADY;
            } else {
                DPRINTF(GPUSched, "schList[%d]: WV[%d] operands not ready "
                        "for: %d: %s\n", j, wf->wfDynId,
                        gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());

                // operands not ready yet, increment SCH stage stats
                // aggregate to all wavefronts on the CU
                p.second = RFBUSY;

                // Increment stall stats
                wf->stats.schStalls++;
                wf->stats.schOpdNrdyStalls++;

                stats.opdNrdyStalls[SCH_RF_OPD_NRDY]++;
                if (!vrfRdy) {
                    stats.opdNrdyStalls[SCH_VRF_OPD_NRDY]++;
                }
                if (!srfRdy) {
                    stats.opdNrdyStalls[SCH_SRF_OPD_NRDY]++;
                }
            }
        }
    }
}

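// Final step of the SCH stage: for every wave left on the dispatch list,
// reserve its execution resources for next cycle and dispatch the
// instruction to the register files. FLAT instructions reserve both a
// global and a local memory unit; the local unit must already be in the
// SKIP state from VRF->LDS arbitration.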
void
ScheduleStage::reserveResources()
{
    std::vector<bool> exeUnitReservations;
    exeUnitReservations.resize(computeUnit.numExeUnits(), false);

    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j);
        if (gpu_dyn_inst) {
            DISPATCH_STATUS s = toExecute.dispatchStatus(j);
            Wavefront *wf = gpu_dyn_inst->wavefront();
            if (s == EMPTY) {
                continue;
            } else if (s == EXREADY) {
                // Wave is ready for execution
                std::vector<int> execUnitIds = wf->reserveResources();

                if (!gpu_dyn_inst->isScalar()) {
                    computeUnit.vrf[wf->simdId]
                        ->dispatchInstruction(gpu_dyn_inst);
                }
                computeUnit.srf[wf->simdId]->dispatchInstruction(gpu_dyn_inst);

                std::stringstream ss;
                for (auto id : execUnitIds) {
                    ss << id << " ";
                }
                DPRINTF(GPUSched, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s"
                        " Reserving ExeRes[ %s]\n",
                        j, wf->simdId, wf->wfDynId, gpu_dyn_inst->seqNum(),
                        gpu_dyn_inst->disassemble(), ss.str());
                // mark the resources as reserved for this cycle
                for (auto execUnitId : execUnitIds) {
                    panic_if(exeUnitReservations.at(execUnitId),
                             "Execution unit %d is reserved!!!\n"
                             "SIMD[%d] WV[%d]: %d: %s",
                             execUnitId, wf->simdId, wf->wfDynId,
                             gpu_dyn_inst->seqNum(),
                             gpu_dyn_inst->disassemble());
                    exeUnitReservations.at(execUnitId) = true;
                }

                // If wavefront::reserveResources reserved multiple resources,
                // then we're executing a flat memory instruction. This means
                // that we've reserved a global and local memory unit. Thus,
                // we need to mark the latter execution unit as not available.
                if (execUnitIds.size() > 1) {
                    [[maybe_unused]] int lm_exec_unit = wf->localMem;
                    assert(toExecute.dispatchStatus(lm_exec_unit)
                           == SKIP);
                }
            } else if (s == SKIP) {
                // Shared Memory pipe reserved for FLAT instruction.
                // Verify the GM pipe for this wave is ready to execute
                // and the wave in the GM pipe is the same as the wave
                // in the LM pipe
                [[maybe_unused]] int gm_exec_unit = wf->globalMem;
                assert(wf->wfDynId == toExecute
                       .readyInst(gm_exec_unit)->wfDynId);
                assert(toExecute.dispatchStatus(gm_exec_unit)
                       == EXREADY);
            }
        }
    }
}

void
ScheduleStage::deleteFromSch(Wavefront *w)
{
    wavesInSch.erase(w->wfDynId);
}

ScheduleStage::ScheduleStageStats::ScheduleStageStats(
    statistics::Group *parent, int num_exec_units)
    : statistics::Group(parent, "ScheduleStage"),
      ADD_STAT(rdyListEmpty, "number of cycles no wave on ready list per "
               "execution resource"),
      ADD_STAT(rdyListNotEmpty, "number of cycles one or more wave on ready "
               "list per execution resource"),
      ADD_STAT(addToSchListStalls, "number of cycles a wave is not added to "
               "schList per execution resource when ready list is not empty"),
      ADD_STAT(schListToDispList, "number of cycles a wave is added to "
               "dispatchList per execution resource"),
      ADD_STAT(schListToDispListStalls, "number of cycles no wave is added to"
               " dispatchList per execution resource"),
      ADD_STAT(rfAccessStalls, "number of stalls due to RF access denied"),
      ADD_STAT(ldsBusArbStalls, "number of stalls due to VRF->LDS bus "
               "conflicts"),
      ADD_STAT(opdNrdyStalls, "number of stalls in SCH due to operands not "
               "ready"),
      ADD_STAT(dispNrdyStalls, "number of stalls in SCH due to resource not "
               "ready")
{
    rdyListNotEmpty.init(num_exec_units);
    rdyListEmpty.init(num_exec_units);
    addToSchListStalls.init(num_exec_units);
    schListToDispList.init(num_exec_units);
    schListToDispListStalls.init(num_exec_units);
    opdNrdyStalls.init(SCH_RF_OPD_NRDY_CONDITIONS);
    dispNrdyStalls.init(SCH_NRDY_CONDITIONS);
    rfAccessStalls.init(SCH_RF_ACCESS_NRDY_CONDITIONS);

    opdNrdyStalls.subname(SCH_VRF_OPD_NRDY, csprintf("VRF"));
    opdNrdyStalls.subname(SCH_SRF_OPD_NRDY, csprintf("SRF"));
    opdNrdyStalls.subname(SCH_RF_OPD_NRDY, csprintf("RF"));

    dispNrdyStalls.subname(SCH_SCALAR_ALU_NRDY, csprintf("ScalarAlu"));
    dispNrdyStalls.subname(SCH_VECTOR_ALU_NRDY, csprintf("VectorAlu"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_ISSUE_NRDY,
                           csprintf("VectorMemIssue"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_BUS_BUSY_NRDY,
                           csprintf("VectorMemBusBusy"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_COALESCER_NRDY,
                           csprintf("VectorMemCoalescer"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_REQS_NRDY,
                           csprintf("VectorMemReqs"));
    dispNrdyStalls.subname(SCH_SCALAR_MEM_ISSUE_NRDY,
                           csprintf("ScalarMemIssue"));
    dispNrdyStalls.subname(SCH_SCALAR_MEM_BUS_BUSY_NRDY,
                           csprintf("ScalarMemBusBusy"));
    dispNrdyStalls.subname(SCH_SCALAR_MEM_FIFO_NRDY,
                           csprintf("ScalarMemFIFO"));
    dispNrdyStalls.subname(SCH_LOCAL_MEM_ISSUE_NRDY,
                           csprintf("LocalMemIssue"));
    dispNrdyStalls.subname(SCH_LOCAL_MEM_BUS_BUSY_NRDY,
                           csprintf("LocalMemBusBusy"));
    dispNrdyStalls.subname(SCH_LOCAL_MEM_FIFO_NRDY,
                           csprintf("LocalMemFIFO"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_ISSUE_NRDY,
                           csprintf("FlatMemIssue"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_BUS_BUSY_NRDY,
                           csprintf("FlatMemBusBusy"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_COALESCER_NRDY,
                           csprintf("FlatMemCoalescer"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_FIFO_NRDY,
                           csprintf("FlatMemFIFO"));
    dispNrdyStalls.subname(SCH_RDY, csprintf("Ready"));

    rfAccessStalls.subname(SCH_VRF_RD_ACCESS_NRDY, csprintf("VrfRd"));
    rfAccessStalls.subname(SCH_VRF_WR_ACCESS_NRDY, csprintf("VrfWr"));
    rfAccessStalls.subname(SCH_SRF_RD_ACCESS_NRDY, csprintf("SrfRd"));
    rfAccessStalls.subname(SCH_SRF_WR_ACCESS_NRDY, csprintf("SrfWr"));
    rfAccessStalls.subname(SCH_RF_ACCESS_NRDY, csprintf("Any"));
}

} // namespace gem5