develop/compute__unit_8hh_source.html

/*

 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.

 * All rights reserved.

 *

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions are met:

 *

 * 1. Redistributions of source code must retain the above copyright notice,

 * this list of conditions and the following disclaimer.

 *

 * 2. Redistributions in binary form must reproduce the above copyright notice,

 * this list of conditions and the following disclaimer in the documentation

 * and/or other materials provided with the distribution.

 *

 * 3. Neither the name of the copyright holder nor the names of its

 * contributors may be used to endorse or promote products derived from this

 * software without specific prior written permission.

 *

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE

 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

 * POSSIBILITY OF SUCH DAMAGE.

 */


#ifndef __COMPUTE_UNIT_HH__

#define __COMPUTE_UNIT_HH__


#include <cassert>
#include <cstddef>
#include <deque>
#include <map>
#include <queue>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include "base/callback.hh"
#include "base/compiler.hh"
#include "base/statistics.hh"
#include "base/stats/group.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "enums/PrefetchType.hh"
#include "gpu-compute/comm.hh"
#include "gpu-compute/exec_stage.hh"
#include "gpu-compute/fetch_stage.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/local_memory_pipeline.hh"
#include "gpu-compute/register_manager.hh"
#include "gpu-compute/scalar_memory_pipeline.hh"
#include "gpu-compute/schedule_stage.hh"
#include "gpu-compute/scoreboard_check_stage.hh"
#include "mem/port.hh"
#include "mem/token_port.hh"
#include "sim/clocked_object.hh"


namespace gem5

{


class HSAQueueEntry;

class LdsChunk;

class ScalarRegisterFile;

class Shader;

class VectorRegisterFile;


struct ComputeUnitParams;


/**
 * Selector stored in ComputeUnit::exec_policy.
 * NOTE(review): the enumerator names suggest "oldest first" and
 * "round robin"; the code that interprets them is not in this header.
 */
enum EXEC_POLICY
{
    OLDEST = 0,
    RR = 1
};


/**
 * The four combinations of a TLB outcome (miss/hit) and a cache outcome
 * (miss/hit), numbered 0 through 3 in the order listed.
 */
enum TLB_CACHE
{
    TLB_MISS_CACHE_MISS = 0,
    TLB_MISS_CACHE_HIT = 1,
    TLB_HIT_CACHE_MISS = 2,
    TLB_HIT_CACHE_HIT = 3
};


/**
 * Bookkeeping for a single barrier: the number of participants expected
 * at the barrier (_maxBarrierCnt) and the number that have arrived so far
 * (_numAtBarrier). Both counters start at zero.
 */
class WFBarrier
{
  public:
    WFBarrier() = default;

    /**
     * Barrier ID value that never names a valid barrier. It is constexpr
     * (and therefore implicitly inline) so that it can be ODR-used, e.g.
     * bound to a const reference, without an out-of-class definition.
     */
    static constexpr int InvalidID = -1;

    /** @return how many participants have arrived at the barrier. */
    int
    numAtBarrier() const
    {
        return _numAtBarrier;
    }

    /** @return how many expected participants have not arrived yet. */
    int
    numYetToReachBarrier() const
    {
        return _maxBarrierCnt - _numAtBarrier;
    }

    /** @return how many participants the barrier currently expects. */
    int
    maxBarrierCnt() const
    {
        return _maxBarrierCnt;
    }

    /** Set the number of participants the barrier expects. */
    void
    setMaxBarrierCnt(int max_barrier_cnt)
    {
        _maxBarrierCnt = max_barrier_cnt;
    }

    /**
     * Record one more arrival. Asserts that the barrier is not already
     * full, i.e. that the arrival count is below the expected count.
     */
    void
    incNumAtBarrier()
    {
        assert(_numAtBarrier < _maxBarrierCnt);
        ++_numAtBarrier;
    }

    /** @return true when the arrival count equals the expected count. */
    bool
    allAtBarrier() const
    {
        return _numAtBarrier == _maxBarrierCnt;
    }

    /**
     * Lower the expected participant count by one. Asserts that the
     * expected count is still positive.
     */
    void
    decMaxBarrierCnt()
    {
        assert(_maxBarrierCnt > 0);
        --_maxBarrierCnt;
    }

    /** Clear both the arrival count and the expected count. */
    void
    release()
    {
        _numAtBarrier = 0;
        _maxBarrierCnt = 0;
    }

    /** Clear only the arrival count; the expected count is kept. */
    void
    reset()
    {
        _numAtBarrier = 0;
    }

  private:
    /** Number of participants that have arrived at the barrier. */
    int _numAtBarrier = 0;

    /** Number of participants that must arrive to satisfy the barrier. */
    int _maxBarrierCnt = 0;
};


class ComputeUnit : public ClockedObject

{

  public:


    // Execution resources

    //

    // The ordering of units is:

    // Vector ALUs

    // Scalar ALUs

    // GM Pipe

    // LM Pipe

    // Scalar Mem Pipe

    //

    // Note: the ordering of units is important and the code assumes the

    // above ordering. However, there may be more than one resource of

    // each type (e.g., 4 VALUs or 2 SALUs)


    int numVectorGlobalMemUnits;

    // Resource control for global memory to VRF data/address bus

    WaitClass glbMemToVrfBus;

    // Resource control for Vector Register File->Global Memory pipe buses

    WaitClass vrfToGlobalMemPipeBus;

    // Resource control for Vector Global Memory execution unit

    WaitClass vectorGlobalMemUnit;


    int numVectorSharedMemUnits;

    // Resource control for local memory to VRF data/address bus

    WaitClass locMemToVrfBus;

    // Resource control for Vector Register File->Local Memory pipe buses

    WaitClass vrfToLocalMemPipeBus;

    // Resource control for Vector Shared/Local Memory execution unit

    WaitClass vectorSharedMemUnit;


    int numScalarMemUnits;

    // Resource control for scalar memory to SRF data/address bus

    WaitClass scalarMemToSrfBus;

    // Resource control for Scalar Register File->Scalar Memory pipe buses

    WaitClass srfToScalarMemPipeBus;

    // Resource control for Scalar Memory execution unit

    WaitClass scalarMemUnit;


    // vector ALU execution resources

    int numVectorALUs;

    std::vector<WaitClass> vectorALUs;


    // scalar ALU execution resources

    int numScalarALUs;

    std::vector<WaitClass> scalarALUs;


    // Return total number of execution units on this CU

    int numExeUnits() const;

    // index into readyList of the first memory unit

    int firstMemUnit() const;

    // index into readyList of the last memory unit

    int lastMemUnit() const;

    // index into scalarALUs vector of SALU used by the wavefront

    int mapWaveToScalarAlu(Wavefront *w) const;

    // index into readyList of SALU used by wavefront

    int mapWaveToScalarAluGlobalIdx(Wavefront *w) const;

    // index into readyList of Global Memory unit used by wavefront

    int mapWaveToGlobalMem(Wavefront *w) const;

    // index into readyList of Local Memory unit used by wavefront

    int mapWaveToLocalMem(Wavefront *w) const;

    // index into readyList of Scalar Memory unit used by wavefront

    int mapWaveToScalarMem(Wavefront *w) const;


    int vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes

    int coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes

    int numCyclesPerStoreTransfer;  // number of cycles per vector store

    int numCyclesPerLoadTransfer;  // number of cycles per vector load


    // track presence of dynamic instructions in the Schedule pipeline

    // stage. This is used to check the readiness of the oldest,

    // non-dispatched instruction of every WF in the Scoreboard stage.

    std::unordered_set<uint64_t> pipeMap;


    RegisterManager* registerManager;


    FetchStage fetchStage;

    ScoreboardCheckStage scoreboardCheckStage;

    ScheduleStage scheduleStage;

    ExecStage execStage;

    GlobalMemPipeline globalMemoryPipe;

    LocalMemPipeline localMemoryPipe;

    ScalarMemPipeline scalarMemoryPipe;


    EventFunctionWrapper tickEvent;


    typedef ComputeUnitParams Params;

    std::vector<std::vector<Wavefront*>> wfList;

    int cu_id;


    // array of vector register files, one per SIMD

    std::vector<VectorRegisterFile*> vrf;

    // array of scalar register files, one per SIMD

    std::vector<ScalarRegisterFile*> srf;


    // Width per VALU/SIMD unit: number of work items that can be executed

    // on the vector ALU simultaneously in a SIMD unit

    int simdWidth;

    // number of pipe stages for bypassing data to next dependent single

    // precision vector instruction inside the vector ALU pipeline

    int spBypassPipeLength;

    // number of pipe stages for bypassing data to next dependent double

    // precision vector instruction inside the vector ALU pipeline

    int dpBypassPipeLength;

    // number of pipe stages for scalar ALU

    int scalarPipeStages;

    // number of pipe stages for operand collection & distribution network

    int operandNetworkLength;

    // number of cycles per instruction issue period

    Cycles issuePeriod;


    // VRF to GM Bus latency

    Cycles vrf_gm_bus_latency;

    // SRF to Scalar Mem Bus latency

    Cycles srf_scm_bus_latency;

    // VRF to LM Bus latency

    Cycles vrf_lm_bus_latency;


    // tracks the last cycle a vector instruction was executed on a SIMD

    std::vector<uint64_t> lastExecCycle;


    // tracks the number of dyn inst executed per SIMD

    std::vector<uint64_t> instExecPerSimd;


    // true if we allow a separate TLB per lane

    bool perLaneTLB;

    // if 0, TLB prefetching is off.

    int prefetchDepth;

    // if fixed-stride prefetching, this is the stride.

    int prefetchStride;


    std::vector<Addr> lastVaddrCU;

    std::vector<std::vector<Addr>> lastVaddrSimd;

    std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;

    enums::PrefetchType prefetchType;

    EXEC_POLICY exec_policy;


    bool debugSegFault;

    // Idle CU timeout in ticks

    Tick idleCUTimeout;

    int idleWfs;

    bool functionalTLB;

    bool localMemBarrier;


    /*

     * for Counting page accesses

     */

    bool countPages;


    Shader *shader;


    Tick req_tick_latency;

    Tick resp_tick_latency;

    Tick scalar_req_tick_latency;

    Tick scalar_resp_tick_latency;


    std::vector<int> numWfsToSched;


    // number of currently reserved vector registers per SIMD unit

    std::vector<int> vectorRegsReserved;

    // number of currently reserved scalar registers per SIMD unit

    std::vector<int> scalarRegsReserved;

    // number of vector registers per SIMD unit

    int numVecRegsPerSimd;

    // number of available scalar registers per SIMD unit

    int numScalarRegsPerSimd;


    // this hash map will keep track of page divergence

    // per memory instruction per wavefront. The hash map

    // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.

    std::map<Addr, int> pagesTouched;


    void insertInPipeMap(Wavefront *w);

    void deleteFromPipeMap(Wavefront *w);


    ComputeUnit(const Params &p);

    ~ComputeUnit();


    // Timing accessors: each returns one configuration member unchanged.

    /** Pipe stages of the operand collection & distribution network. */
    int
    oprNetPipeLength() const
    {
        return operandNetworkLength;
    }

    /** Width of one VALU/SIMD unit (the simdWidth member). */
    int
    simdUnitWidth() const
    {
        return simdWidth;
    }

    /** Single-precision bypass pipe length (spBypassPipeLength). */
    int
    spBypassLength() const
    {
        return spBypassPipeLength;
    }

    /** Double-precision bypass pipe length (dpBypassPipeLength). */
    int
    dpBypassLength() const
    {
        return dpBypassPipeLength;
    }

    /** Number of pipe stages of the scalar ALU (scalarPipeStages). */
    int
    scalarPipeLength() const
    {
        return scalarPipeStages;
    }

    /** Number of cycles per vector store (numCyclesPerStoreTransfer). */
    int
    storeBusLength() const
    {
        return numCyclesPerStoreTransfer;
    }

    /** Number of cycles per vector load (numCyclesPerLoadTransfer). */
    int
    loadBusLength() const
    {
        return numCyclesPerLoadTransfer;
    }

    /** Value of the private wavefrontSize member. */
    int
    wfSize() const
    {
        return wavefrontSize;
    }


    void exec();

    void initiateFetch(Wavefront *wavefront);

    void fetch(PacketPtr pkt, Wavefront *wavefront);

    void fillKernelState(Wavefront *w, HSAQueueEntry *task);


    void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,

                        HSAQueueEntry *task, int bar_id,

                        bool fetchContext=false);


    void doInvalidate(RequestPtr req, int kernId);

    void doFlush(GPUDynInstPtr gpuDynInst);


    void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg);

    bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg);


    /** Value of the const _cacheLineSize member. */
    int
    cacheLineSize() const
    {
        return _cacheLineSize;
    }

    /** Value of the cacheLineBits member. */
    int
    getCacheLineBits() const
    {
        return cacheLineBits;
    }


    void resetRegisterPool();


  private:

    /**
     * Fetch the barrier slot for a barrier ID.
     *
     * Asserts that bar_id is greater than WFBarrier::InvalidID; the
     * lookup itself uses vector::at(), which bounds-checks the index.
     *
     * @param bar_id index into wfBarrierSlots.
     * @return reference to the selected slot.
     */
    WFBarrier&
    barrierSlot(int bar_id)
    {
        assert(WFBarrier::InvalidID < bar_id);
        WFBarrier &slot = wfBarrierSlots.at(bar_id);
        return slot;
    }


    /**
     * Take one ID out of the freeBarrierIds set and return it.
     *
     * Asserts that the set is not empty. The ID handed out is whichever
     * element begin() refers to; since the set is unordered, no
     * particular ID is preferred.
     */
    int
    getFreeBarrierId()
    {
        assert(!freeBarrierIds.empty());
        const auto first_free = freeBarrierIds.begin();
        const int claimed_id = *first_free;
        freeBarrierIds.erase(first_free);
        return claimed_id;
    }


  public:

    int numYetToReachBarrier(int bar_id);

    bool allAtBarrier(int bar_id);

    void incNumAtBarrier(int bar_id);

    int numAtBarrier(int bar_id);

    int maxBarrierCnt(int bar_id);

    void resetBarrier(int bar_id);

    void decMaxBarrierCnt(int bar_id);

    void releaseBarrier(int bar_id);

    void releaseWFsFromBarrier(int bar_id);

    /** Value of the const _numBarrierSlots member. */
    int
    numBarrierSlots() const
    {
        return _numBarrierSlots;
    }


    template<typename c0, typename c1>

    void doSmReturn(GPUDynInstPtr gpuDynInst);


    virtual void init() override;

    void sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt);

    void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt);

    void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,

                              bool kernelMemSync,

                              RequestPtr req=nullptr);

    void handleMemPacket(PacketPtr pkt, int memport_index);

    bool processTimingPacket(PacketPtr pkt);

    void processFetchReturn(PacketPtr pkt);

    void updatePageDivergenceDist(Addr addr);


    /**
     * Read-only accessor for the _requestorId member. Declared const so
     * that it can also be called through a const ComputeUnit.
     */
    RequestorID requestorId() const { return _requestorId; }

    RequestorID vramRequestorId();


    bool isDone() const;

    bool isVectorAluIdle(uint32_t simdId) const;


    void handleSQCReturn(PacketPtr pkt);


  protected:

    RequestorID _requestorId;


    LdsState &lds;


  public:

    /** @return the LdsState object bound to the `lds` reference member. */
    LdsState &getLds() const { return lds; }


    int32_t

    getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;


    [[nodiscard]] bool sendToLds(GPUDynInstPtr gpuDynInst);


    typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;

    pageDataStruct pageAccesses;


    void exitCallback();


    class GMTokenPort : public TokenRequestPort

    {

      public:

        GMTokenPort(const std::string& name, SimObject *owner,

                    PortID id = InvalidPortID)

            : TokenRequestPort(name, owner, id)

        { }

        ~GMTokenPort() { }


      protected:

        bool recvTimingResp(PacketPtr) { return false; }

        void recvReqRetry() { }

    };


    // Manager for the number of tokens available to this compute unit to

    // send global memory request packets to the coalescer this is only used

    // between global memory pipe and TCP coalescer.

    TokenManager *memPortTokens;

    GMTokenPort gmTokenPort;


    class DataPort : public RequestPort

    {

      public:

        DataPort(const std::string &_name, ComputeUnit *_cu, PortID id)

            : RequestPort(_name, id), computeUnit(_cu) { }


        bool snoopRangeSent;


        struct SenderState : public Packet::SenderState

        {

            GPUDynInstPtr _gpuDynInst;

            PortID port_index;

            Packet::SenderState *saved;


            SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,

                        Packet::SenderState *sender_state=nullptr)

                : _gpuDynInst(gpuDynInst),

                  port_index(_port_index),

                  saved(sender_state) { }

        };


        class SystemHubEvent : public Event

        {

          DataPort *dataPort;

          PacketPtr reqPkt;


          public:

            SystemHubEvent(PacketPtr pkt, DataPort *_dataPort)

                : dataPort(_dataPort), reqPkt(pkt)

            {

                setFlags(Event::AutoDelete);

            }


            void

            process()

            {

                // DMAs do not operate on packets and therefore do not

                // convert to a response. Do that here instead.

                reqPkt->makeResponse();

                dataPort->handleResponse(reqPkt);

            }

        };


        void processMemReqEvent(PacketPtr pkt);

        EventFunctionWrapper *createMemReqEvent(PacketPtr pkt);


        void processMemRespEvent(PacketPtr pkt);

        EventFunctionWrapper *createMemRespEvent(PacketPtr pkt);


        std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;


        bool handleResponse(PacketPtr pkt);


      protected:

        ComputeUnit *computeUnit;


        virtual bool recvTimingResp(PacketPtr pkt);

        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }

        virtual void recvFunctional(PacketPtr pkt) { }

        virtual void recvRangeChange() { }

        virtual void recvReqRetry();


        virtual void

        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)

        {

            resp.clear();

            snoop = true;

        }


    };


    // Scalar data cache access port

    class ScalarDataPort : public RequestPort

    {

      public:

        ScalarDataPort(const std::string &_name, ComputeUnit *_cu)

            : RequestPort(_name), computeUnit(_cu)

        {

        }


        bool recvTimingResp(PacketPtr pkt) override;

        void recvReqRetry() override;


        struct SenderState : public Packet::SenderState

        {

            SenderState(GPUDynInstPtr gpuDynInst,

                        Packet::SenderState *sender_state=nullptr)

                : _gpuDynInst(gpuDynInst), saved(sender_state)

            {

            }


            GPUDynInstPtr _gpuDynInst;

            Packet::SenderState *saved;

        };


        class MemReqEvent : public Event

        {

          private:

            ScalarDataPort &scalarDataPort;

            PacketPtr pkt;


          public:

            MemReqEvent(ScalarDataPort &_scalar_data_port, PacketPtr _pkt)

                : Event(), scalarDataPort(_scalar_data_port), pkt(_pkt)

            {

              setFlags(Event::AutoDelete);

            }


            void process();

            const char *description() const;

        };


        class SystemHubEvent : public Event

        {

          ScalarDataPort *dataPort;

          PacketPtr reqPkt;


          public:

            SystemHubEvent(PacketPtr pkt, ScalarDataPort *_dataPort)

                : dataPort(_dataPort), reqPkt(pkt)

            {

                setFlags(Event::AutoDelete);

            }


            void

            process()

            {

                // DMAs do not operate on packets and therefore do not

                // convert to a response. Do that here instead.

                reqPkt->makeResponse();

                dataPort->handleResponse(reqPkt);

            }

        };


        bool handleResponse(PacketPtr pkt);


        std::deque<PacketPtr> retries;


      private:

        ComputeUnit *computeUnit;

    };


    /**
     * Instruction cache access port. Keeps a deque of
     * (packet, wavefront) pairs in `retries`.
     */
    class SQCPort : public RequestPort
    {
      public:
        SQCPort(const std::string &_name, ComputeUnit *_cu)
            : RequestPort(_name), computeUnit(_cu) { }

        // Initialized in-class: the constructor never assigns this flag,
        // so without an initializer reading it would be undefined.
        bool snoopRangeSent = false;

        /**
         * Per-packet sender state carrying the requesting wavefront.
         * `saved` stores the sender state passed to the constructor.
         */
        struct SenderState : public Packet::SenderState
        {
            Wavefront *wavefront;
            Packet::SenderState *saved;
            // kernel id to be used in handling I-Cache invalidate response
            int kernId;

            SenderState(Wavefront *_wavefront, Packet::SenderState
                    *sender_state=nullptr, int _kernId=-1)
                : wavefront(_wavefront), saved(sender_state),
                  kernId(_kernId) { }
        };

        std::deque<std::pair<PacketPtr, Wavefront*>> retries;

      protected:
        ComputeUnit *computeUnit;

        virtual bool recvTimingResp(PacketPtr pkt);
        // Stub: always reports a latency of 0.
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        // Stubs: functional accesses and range changes are ignored.
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        /** Reports an empty address range list and sets snoop to true. */
        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }
    };


    /**
     * Request port used for the CU's `tlbPort` vector. Tracks a
     * `stalled` flag (initially false) and a deque of packets in
     * `retries`.
     */
    class DTLBPort : public RequestPort
    {
      public:
        DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID id)
            : RequestPort(_name, id), computeUnit(_cu),
              stalled(false)
        { }

        // const, matching ScalarDTLBPort::isStalled and
        // LDSPort::isStalled.
        bool isStalled() const { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        std::deque<PacketPtr> retries;

        struct SenderState: public Packet::SenderState
        {
            // the memInst that this is associated with
            GPUDynInstPtr _gpuDynInst;

            // the lane in the memInst this is associated with, so we send
            // the memory request down the right port
            PortID portIndex;

            // constructor used for packets involved in timing accesses
            SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
                : _gpuDynInst(gpuDynInst), portIndex(port_index) { }

        };

      protected:
        ComputeUnit *computeUnit;
        bool stalled;

        virtual bool recvTimingResp(PacketPtr pkt);
        // Stub: always reports a latency of 0.
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        // Stubs: functional accesses and range changes are ignored.
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };


    /**
     * Request port for the scalar data TLB. Tracks a `stalled` flag
     * (initially false) and a deque of packets in `retries`.
     */
    class ScalarDTLBPort : public RequestPort
    {
      public:
        ScalarDTLBPort(const std::string &_name, ComputeUnit *_cu)
            : RequestPort(_name), computeUnit(_cu)
        { }

        /** Per-packet sender state carrying the issuing instruction. */
        struct SenderState : public Packet::SenderState
        {
            SenderState(GPUDynInstPtr gpuDynInst) : _gpuDynInst(gpuDynInst) { }
            GPUDynInstPtr _gpuDynInst;
        };

        bool recvTimingResp(PacketPtr pkt) override;

        // A retry is treated as a programming error on this port.
        void recvReqRetry() override { assert(false); }

        bool isStalled() const { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        std::deque<PacketPtr> retries;

      private:
        ComputeUnit *computeUnit;
        bool stalled = false;
    };


    /**
     * Request port used for the CU's `sqcTLBPort` member. Tracks a
     * `stalled` flag (initially false) and a deque of packets in
     * `retries`.
     */
    class ITLBPort : public RequestPort
    {
      public:
        ITLBPort(const std::string &_name, ComputeUnit *_cu)
            : RequestPort(_name), computeUnit(_cu), stalled(false) { }

        // const, matching ScalarDTLBPort::isStalled and
        // LDSPort::isStalled.
        bool isStalled() const { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        std::deque<PacketPtr> retries;

        struct SenderState: public Packet::SenderState
        {
            // The wavefront associated with this request
            Wavefront *wavefront;

            SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
        };

      protected:
        ComputeUnit *computeUnit;
        bool stalled;

        virtual bool recvTimingResp(PacketPtr pkt);
        // Stub: always reports a latency of 0.
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        // Stubs: functional accesses and range changes are ignored.
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };


    /**
     * Request port used for the CU's `ldsPort` member. Tracks a
     * `stalled` flag (initially false) and a FIFO of packets in
     * `retries`.
     */
    class LDSPort : public RequestPort
    {
      public:
        LDSPort(const std::string &_name, ComputeUnit *_cu)
            : RequestPort(_name), computeUnit(_cu)
        { }

        bool isStalled() const { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        std::queue<PacketPtr> retries;

        /** Per-packet sender state exposing the issuing instruction. */
        class SenderState: public Packet::SenderState
        {
          protected:
            // The actual read/write/atomic request that goes with this command
            GPUDynInstPtr _gpuDynInst = nullptr;

          public:
            SenderState(GPUDynInstPtr gpuDynInst)
                : _gpuDynInst(gpuDynInst)
            { }

            /** @return the instruction stored at construction. */
            GPUDynInstPtr getMemInst() const { return _gpuDynInst; }
        };

        virtual bool sendTimingReq(PacketPtr pkt);

      protected:
        bool stalled = false;

        ComputeUnit *computeUnit;

        virtual bool recvTimingResp(PacketPtr pkt);

        // Stub: always reports a latency of 0.
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }

        // Stubs: functional accesses and range changes are ignored.
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }

        virtual void recvReqRetry();
    };


    LDSPort ldsPort;


    /** @return the memPortTokens pointer, unchanged. */
    TokenManager *getTokenManager() { return memPortTokens; }


    std::vector<DataPort> memPort;

    // port to the TLB hierarchy (i.e., the L1 TLB)

    std::vector<DTLBPort> tlbPort;

    // port to the scalar data cache

    ScalarDataPort scalarDataPort;

    // port to the scalar data TLB

    ScalarDTLBPort scalarDTLBPort;

    // port to the SQC (i.e. the I-cache)

    SQCPort sqcPort;

    // port to the SQC TLB (there's a separate TLB for each I-cache)

    ITLBPort sqcTLBPort;


    /**
     * Resolve a port name (and, for vector ports, an index) to a port of
     * this CU.
     *
     * "memory_port" and "translation_port" select an element of memPort
     * and tlbPort respectively and require idx to be a valid element
     * index. The remaining names ("scalar_port", "scalar_tlb_port",
     * "sqc_port", "sqc_tlb_port", "ldsPort", "gmTokenPort") select a
     * single port and ignore idx. Anything else, including a vector port
     * name with an out-of-range index, is forwarded to
     * ClockedObject::getPort().
     *
     * @param if_name name of the requested port.
     * @param idx element index for vector ports.
     * @return reference to the selected port.
     */
    Port &
    getPort(const std::string &if_name, PortID idx) override
    {
        // Check the sign explicitly rather than relying on a negative
        // index wrapping to a huge unsigned value in the size comparison.
        const bool idx_non_negative = idx >= 0;

        if (if_name == "memory_port" && idx_non_negative &&
            static_cast<std::size_t>(idx) < memPort.size()) {
            return memPort[idx];
        }
        if (if_name == "translation_port" && idx_non_negative &&
            static_cast<std::size_t>(idx) < tlbPort.size()) {
            return tlbPort[idx];
        }
        if (if_name == "scalar_port") {
            return scalarDataPort;
        }
        if (if_name == "scalar_tlb_port") {
            return scalarDTLBPort;
        }
        if (if_name == "sqc_port") {
            return sqcPort;
        }
        if (if_name == "sqc_tlb_port") {
            return sqcTLBPort;
        }
        if (if_name == "ldsPort") {
            return ldsPort;
        }
        if (if_name == "gmTokenPort") {
            return gmTokenPort;
        }

        return ClockedObject::getPort(if_name, idx);
    }


    /**
     * Hand out the current value of globalSeqNum and advance the counter
     * by one (post-increment semantics).
     */
    InstSeqNum
    getAndIncSeqNum()
    {
        return globalSeqNum++;
    }


  private:

    const int _cacheLineSize;

    const int _numBarrierSlots;

    int cacheLineBits;

    InstSeqNum globalSeqNum;

    int wavefrontSize;


    ScoreboardCheckToSchedule scoreboardCheckToSchedule;

    ScheduleToExecute scheduleToExecute;


    std::vector<WFBarrier> wfBarrierSlots;

    std::unordered_set<int> freeBarrierIds;


    // Holds the arrival time of the first cache block related to
    // a particular GPUDynInst. This is used to calculate the difference
    // between the first and last cache block arrival times.

    std::unordered_map<GPUDynInstPtr, Tick> headTailMap;


  public:

    void updateInstStats(GPUDynInstPtr gpuDynInst);

    int activeWaves;


    /**
     * Statistics group for a ComputeUnit; the `stats` member declared at
     * the end of this struct is the instance. Formula members are
     * derived statistics whose expressions are set up outside this
     * header.
     *
     * @param parent statistics group this group is attached to.
     * @param n_wf presumably a wavefront count used to size per-WF
     *             statistics -- confirm against the constructor.
     */
    struct ComputeUnitStats : public statistics::Group
    {
        ComputeUnitStats(statistics::Group *parent, int n_wf);

        statistics::Scalar vALUInsts;
        statistics::Formula vALUInstsPerWF;
        statistics::Scalar sALUInsts;
        statistics::Formula sALUInstsPerWF;
        statistics::Scalar instCyclesVALU;
        statistics::Scalar instCyclesSALU;
        statistics::Scalar threadCyclesVALU;
        statistics::Formula vALUUtilization;
        statistics::Scalar ldsNoFlatInsts;
        statistics::Formula ldsNoFlatInstsPerWF;
        statistics::Scalar flatVMemInsts;
        statistics::Formula flatVMemInstsPerWF;
        statistics::Scalar flatLDSInsts;
        statistics::Formula flatLDSInstsPerWF;
        statistics::Scalar vectorMemWrites;
        statistics::Formula vectorMemWritesPerWF;
        statistics::Scalar vectorMemReads;
        statistics::Formula vectorMemReadsPerWF;
        statistics::Scalar scalarMemWrites;
        statistics::Formula scalarMemWritesPerWF;
        statistics::Scalar scalarMemReads;
        statistics::Formula scalarMemReadsPerWF;

        statistics::Formula vectorMemReadsPerKiloInst;
        statistics::Formula vectorMemWritesPerKiloInst;
        statistics::Formula vectorMemInstsPerKiloInst;
        statistics::Formula scalarMemReadsPerKiloInst;
        statistics::Formula scalarMemWritesPerKiloInst;
        statistics::Formula scalarMemInstsPerKiloInst;

        // Cycles required to send register source (addr and data) from
        // register files to memory pipeline, per SIMD.
        statistics::Vector instCyclesVMemPerSimd;
        statistics::Vector instCyclesScMemPerSimd;
        statistics::Vector instCyclesLdsPerSimd;

        statistics::Scalar globalReads;
        statistics::Scalar globalWrites;
        statistics::Formula globalMemInsts;
        statistics::Scalar argReads;
        statistics::Scalar argWrites;
        statistics::Formula argMemInsts;
        statistics::Scalar spillReads;
        statistics::Scalar spillWrites;
        statistics::Formula spillMemInsts;
        statistics::Scalar groupReads;
        statistics::Scalar groupWrites;
        statistics::Formula groupMemInsts;
        statistics::Scalar privReads;
        statistics::Scalar privWrites;
        statistics::Formula privMemInsts;
        statistics::Scalar readonlyReads;
        statistics::Scalar readonlyWrites;
        statistics::Formula readonlyMemInsts;
        statistics::Scalar kernargReads;
        statistics::Scalar kernargWrites;
        statistics::Formula kernargMemInsts;

        statistics::Distribution waveLevelParallelism;

        // the following stats compute the avg. TLB access latency per
        // uncoalesced request (only for data)
        statistics::Scalar tlbRequests;
        statistics::Scalar tlbCycles;
        statistics::Formula tlbLatency;
        // hitsPerTLBLevel[x] are the hits in Level x TLB.
        // x = 0 is the page table.
        statistics::Vector hitsPerTLBLevel;

        statistics::Scalar ldsBankAccesses;
        statistics::Distribution ldsBankConflictDist;

        // over all memory instructions executed over all wavefronts
        // how many touched 0-4 pages, 4-8, ..., 60-64 pages
        statistics::Distribution pageDivergenceDist;
        // count of non-flat global memory vector instructions executed
        statistics::Scalar dynamicGMemInstrCnt;
        // count of flat global memory vector instructions executed
        statistics::Scalar dynamicFlatMemInstrCnt;
        statistics::Scalar dynamicLMemInstrCnt;

        statistics::Scalar wgBlockedDueBarrierAllocation;
        statistics::Scalar wgBlockedDueLdsAllocation;
        // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
        // active when the instruction is committed, this number is still
        // incremented by 1
        statistics::Scalar numInstrExecuted;
        // Number of cycles among successive instruction executions across all
        // wavefronts of the same CU
        statistics::Distribution execRateDist;
        // number of individual vector operations executed
        statistics::Scalar numVecOpsExecuted;
        // number of individual f16 vector operations executed
        statistics::Scalar numVecOpsExecutedF16;
        // number of individual f32 vector operations executed
        statistics::Scalar numVecOpsExecutedF32;
        // number of individual f64 vector operations executed
        statistics::Scalar numVecOpsExecutedF64;
        // number of individual FMA 16,32,64 vector operations executed
        statistics::Scalar numVecOpsExecutedFMA16;
        statistics::Scalar numVecOpsExecutedFMA32;
        statistics::Scalar numVecOpsExecutedFMA64;
        // number of individual MAC 16,32,64 vector operations executed
        statistics::Scalar numVecOpsExecutedMAC16;
        statistics::Scalar numVecOpsExecutedMAC32;
        statistics::Scalar numVecOpsExecutedMAC64;
        // number of individual MAD 16,32,64 vector operations executed
        statistics::Scalar numVecOpsExecutedMAD16;
        statistics::Scalar numVecOpsExecutedMAD32;
        statistics::Scalar numVecOpsExecutedMAD64;
        // total number of two op FP vector operations executed
        statistics::Scalar numVecOpsExecutedTwoOpFP;
        // Total cycles that something is running on the GPU
        statistics::Scalar totalCycles;
        // NOTE(review): the three vpc_f* formulas below share one
        // comment; going by their names they are presumably the f16,
        // f32 and f64 variants of vpc -- confirm where they are defined.
        statistics::Formula vpc; // vector ops per cycle
        statistics::Formula vpc_f16; // vector ops per cycle
        statistics::Formula vpc_f32; // vector ops per cycle
        statistics::Formula vpc_f64; // vector ops per cycle
        statistics::Formula ipc; // vector instructions per cycle
        statistics::Distribution controlFlowDivergenceDist;
        statistics::Distribution activeLanesPerGMemInstrDist;
        statistics::Distribution activeLanesPerLMemInstrDist;
        // number of vector ALU instructions received
        statistics::Formula numALUInstsExecuted;
        // number of times a WG cannot start due to lack of free VGPRs in SIMDs
        statistics::Scalar numTimesWgBlockedDueVgprAlloc;
        // number of times a WG cannot start due to lack of free SGPRs in SIMDs
        statistics::Scalar numTimesWgBlockedDueSgprAlloc;
        statistics::Scalar numCASOps;
        statistics::Scalar numFailedCASOps;
        statistics::Scalar completedWfs;
        statistics::Scalar completedWGs;

        // distribution of the latency difference between the first and
        // last cache block arrival ticks
        statistics::Distribution headTailLatency;

        // Track the amount of interleaving between wavefronts on each SIMD.
        // This stat is sampled using instExecPerSimd to compute the number
        // of instructions that have been executed on a SIMD between a WF
        // executing two successive instructions.
        statistics::VectorDistribution instInterleave;
    } stats;

};


} // namespace gem5


#endif // __COMPUTE_UNIT_HH__