32#ifndef __COMPUTE_UNIT_HH__
33#define __COMPUTE_UNIT_HH__
37#include <unordered_set>
45#include "config/the_gpu_isa.hh"
46#include "enums/PrefetchType.hh"
66class ScalarRegisterFile;
68class VectorRegisterFile;
69class RegisterFileCache;
71struct ComputeUnitParams;
411 bool fetchContext=
false);
438 int bar_id = *free_bar_id;
455 template<
typename c0,
typename c1>
458 virtual void init()
override;
492 getRefCounter(
const uint32_t dispatchId,
const uint32_t wgId)
const;
541 saved(sender_state) { }
547 saved(sender_state) { }
688 *sender_state=
nullptr,
int _kernId=-1)
946 if (if_name ==
"memory_port" && idx <
memPort.size()) {
948 }
else if (if_name ==
"translation_port" && idx <
tlbPort.size()) {
950 }
else if (if_name ==
"scalar_port") {
952 }
else if (if_name ==
"scalar_tlb_port") {
954 }
else if (if_name ==
"sqc_port") {
956 }
else if (if_name ==
"sqc_tlb_port") {
958 }
else if (if_name ==
"ldsPort") {
960 }
else if (if_name ==
"gmTokenPort") {
Defines global host-dependent types: Counter, Tick, and (indirectly) {int,uint}{8,...
The ClockedObject class extends the SimObject with a clock and accessor functions to relate ticks to ...
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
ComputeUnit * computeUnit
std::deque< PacketPtr > retries
here we queue all the translation requests that were not successfully sent.
DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID id)
virtual void recvRangeChange()
Called to receive an address range change from the peer response port.
virtual void recvFunctional(PacketPtr pkt)
virtual Tick recvAtomic(PacketPtr pkt)
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
SystemHubEvent(PacketPtr pkt, DataPort *_dataPort)
virtual void recvRangeChange()
Called to receive an address range change from the peer response port.
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
void processMemReqEvent(PacketPtr pkt)
EventFunctionWrapper * createMemReqEvent(PacketPtr pkt)
virtual void getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
EventFunctionWrapper * createMemRespEvent(PacketPtr pkt)
virtual void recvFunctional(PacketPtr pkt)
virtual Tick recvAtomic(PacketPtr pkt)
std::deque< std::pair< PacketPtr, GPUDynInstPtr > > retries
void processMemRespEvent(PacketPtr pkt)
bool handleResponse(PacketPtr pkt)
ComputeUnit * computeUnit
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
DataPort(const std::string &_name, ComputeUnit *_cu, PortID id)
GMTokenPort(const std::string &name, SimObject *owner, PortID id=InvalidPortID)
bool recvTimingResp(PacketPtr)
Receive a timing response from the peer.
void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
std::deque< PacketPtr > retries
here we queue all the translation requests that were not successfully sent.
virtual void recvFunctional(PacketPtr pkt)
virtual Tick recvAtomic(PacketPtr pkt)
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
ComputeUnit * computeUnit
virtual void recvRangeChange()
Called to receive an address range change from the peer response port.
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
ITLBPort(const std::string &_name, ComputeUnit *_cu)
SenderState is information carried along with the packet, esp.
GPUDynInstPtr getMemInst() const
GPUDynInstPtr _gpuDynInst
SenderState(GPUDynInstPtr gpuDynInst)
the port intended to communicate between the CU and its LDS
bool stalled
whether or not it is stalled
virtual bool recvTimingResp(PacketPtr pkt)
get the result of packets sent to the LDS when they return
virtual bool sendTimingReq(PacketPtr pkt)
attempt to send this packet, either the port is already stalled, the request is nack'd and must stall...
virtual void recvReqRetry()
the bus is telling the port that there is now space so retrying stalled requests should work now this...
LDSPort(const std::string &_name, ComputeUnit *_cu)
virtual void recvRangeChange()
Called to receive an address range change from the peer response port.
virtual Tick recvAtomic(PacketPtr pkt)
ComputeUnit * computeUnit
virtual void recvFunctional(PacketPtr pkt)
std::queue< PacketPtr > retries
here we queue all the requests that were not successfully sent.
const char * description() const
Return a C string describing the event.
MemReqEvent(SQCPort &_sqc_port, PacketPtr _pkt)
std::deque< std::pair< PacketPtr, Wavefront * > > retries
ComputeUnit * computeUnit
virtual void recvRangeChange()
Called to receive an address range change from the peer response port.
SQCPort(const std::string &_name, ComputeUnit *_cu)
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
virtual void recvFunctional(PacketPtr pkt)
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
virtual void getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
virtual Tick recvAtomic(PacketPtr pkt)
void recvReqRetry() override
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
std::deque< PacketPtr > retries
bool recvTimingResp(PacketPtr pkt) override
Receive a timing response from the peer.
ScalarDTLBPort(const std::string &_name, ComputeUnit *_cu)
ComputeUnit * computeUnit
const char * description() const
Return a C string describing the event.
MemReqEvent(ScalarDataPort &_scalar_data_port, PacketPtr _pkt)
ScalarDataPort & scalarDataPort
ScalarDataPort * dataPort
SystemHubEvent(PacketPtr pkt, ScalarDataPort *_dataPort)
bool recvTimingResp(PacketPtr pkt) override
Receive a timing response from the peer.
ComputeUnit * computeUnit
bool handleResponse(PacketPtr pkt)
ScalarDataPort(const std::string &_name, ComputeUnit *_cu)
void recvReqRetry() override
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
std::deque< PacketPtr > retries
int numCyclesPerLoadTransfer
int oprNetPipeLength() const
int simdUnitWidth() const
void releaseBarrier(int bar_id)
int mapWaveToScalarAlu(Wavefront *w) const
ComputeUnit(const Params &p)
bool processTimingPacket(PacketPtr pkt)
WFBarrier & barrierSlot(int bar_id)
void updatePageDivergenceDist(Addr addr)
std::vector< WaitClass > scalarALUs
RequestorID vramRequestorId()
Forward the VRAM requestor ID needed for device memory from shader.
Cycles srf_scm_bus_latency
std::vector< uint64_t > instExecPerSimd
virtual void init() override
init() is called after all C++ SimObjects have been created and all ports are connected.
int numVectorGlobalMemUnits
std::unordered_set< uint64_t > pipeMap
void updateInstStats(GPUDynInstPtr gpuDynInst)
WaitClass vectorGlobalMemUnit
void doInvalidate(RequestPtr req, int kernId)
trigger invalidate operation in the CU
Cycles vrf_gm_bus_latency
std::vector< int > numWfsToSched
Number of WFs to schedule to each SIMD.
LocalMemPipeline localMemoryPipe
int mapWaveToGlobalMem(Wavefront *w) const
void handleMemPacket(PacketPtr pkt, int memport_index)
int mapWaveToLocalMem(Wavefront *w) const
std::vector< RegisterFileCache * > rfc
WaitClass scalarMemToSrfBus
ScalarDTLBPort scalarDTLBPort
void doSmReturn(GPUDynInstPtr gpuDynInst)
void releaseWFsFromBarrier(int bar_id)
int numYetToReachBarrier(int bar_id)
WaitClass vrfToLocalMemPipeBus
int32_t getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
std::unordered_map< Addr, std::pair< int, int > > pageDataStruct
void doSQCInvalidate(RequestPtr req, int kernId)
trigger SQCinvalidate operation in the CU
Tick scalar_req_tick_latency
int getCacheLineBits() const
std::vector< WFBarrier > wfBarrierSlots
The barrier slots for this CU.
void resetBarrier(int bar_id)
std::vector< std::vector< Addr > > lastVaddrSimd
Tick scalar_resp_tick_latency
Cycles vrf_lm_bus_latency
int numVectorSharedMemUnits
TokenManager * getTokenManager()
std::unordered_set< int > freeBarrierIds
A set used to easily retrieve a free barrier ID.
pageDataStruct pageAccesses
int cacheLineSize() const
WaitClass srfToScalarMemPipeBus
ScalarMemPipeline scalarMemoryPipe
bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg)
void sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
void sendInvL2(Addr paddr)
LdsState & getLds() const
LDSPort ldsPort
The port to access the Local Data Store. Can be connected to an LDS object.
std::vector< uint64_t > lastExecCycle
GlobalMemPipeline globalMemoryPipe
std::map< Addr, int > pagesTouched
int vrfToCoalescerBusWidth
bool sendToLds(GPUDynInstPtr gpuDynInst)
Send a general request to the LDS; make sure to look at the return value here, as your request might be...
int maxBarrierCnt(int bar_id)
void insertInPipeMap(Wavefront *w)
int numAtBarrier(int bar_id)
ScoreboardCheckToSchedule scoreboardCheckToSchedule
TODO: Update these comments once the pipe stage interface has been fully refactored.
void incNumAtBarrier(int bar_id)
void injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelMemSync, RequestPtr req=nullptr)
std::vector< int > vectorRegsReserved
Port & getPort(const std::string &if_name, PortID idx) override
Get a port with a given name and index.
std::vector< ScalarRegisterFile * > srf
std::vector< std::vector< std::vector< Addr > > > lastVaddrWF
ScoreboardCheckStage scoreboardCheckStage
std::vector< WaitClass > vectorALUs
int mapWaveToScalarMem(Wavefront *w) const
RegisterManager * registerManager
int coalescerToVrfBusWidth
void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, HSAQueueEntry *task, int bar_id, bool fetchContext=false)
EventFunctionWrapper tickEvent
TokenManager * memPortTokens
ScalarDataPort scalarDataPort
void fillKernelState(Wavefront *w, HSAQueueEntry *task)
void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg)
WaitClass vectorSharedMemUnit
int dpBypassLength() const
int loadBusLength() const
enums::PrefetchType prefetchType
void processFetchReturn(PacketPtr pkt)
int numBarrierSlots() const
int scalarPipeLength() const
std::vector< int > scalarRegsReserved
std::vector< DTLBPort > tlbPort
std::vector< std::vector< Wavefront * > > wfList
int mapWaveToScalarAluGlobalIdx(Wavefront *w) const
ScheduleToExecute scheduleToExecute
std::vector< VectorRegisterFile * > vrf
void decMaxBarrierCnt(int bar_id)
std::unordered_map< GPUDynInstPtr, Tick > headTailMap
std::vector< Addr > lastVaddrCU
int numCyclesPerStoreTransfer
const int _numBarrierSlots
WaitClass vrfToGlobalMemPipeBus
ScheduleStage scheduleStage
int storeBusLength() const
void initiateFetch(Wavefront *wavefront)
bool allAtBarrier(int bar_id)
bool isVectorAluIdle(uint32_t simdId) const
int spBypassLength() const
InstSeqNum getAndIncSeqNum()
void doFlush(GPUDynInstPtr gpuDynInst)
trigger flush operation in the cu
RequestorID requestorId()
std::vector< DataPort > memPort
The memory port for SIMD data accesses.
void deleteFromPipeMap(Wavefront *w)
void handleSQCReturn(PacketPtr pkt)
void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt)
void fetch(PacketPtr pkt, Wavefront *wavefront)
gem5::ComputeUnit::ComputeUnitStats stats
Cycles is a wrapper class for representing cycle counts, i.e.
static const FlagsType AutoDelete
void setFlags(Flags _flags)
this represents a slice of the overall LDS, intended to be associated with an individual workgroup
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
void makeResponse()
Take a request packet and modify it in place to be suitable for returning as a response to that reque...
Ports are used to interface objects to each other.
const PortID id
A numeric identifier to distinguish ports in a vector, and set to InvalidPortID in case this port is ...
const std::string name() const
Return port name (for DPRINTF).
A RequestPort is a specialisation of a Port, which implements the default protocol for the three diff...
Communication interface between Schedule and Execute stages.
Communication interface between ScoreboardCheck and Schedule stages.
Abstract superclass for simulation objects.
static const int InvalidID
void decMaxBarrierCnt()
Decrement the number of WFs that are participating in this barrier.
int numYetToReachBarrier() const
Number of WFs that have not yet reached the barrier.
void setMaxBarrierCnt(int max_barrier_cnt)
Set the maximum barrier count (i.e., the number of WFs that are participating in the barrier).
void release()
Release this barrier resource so it can be used by other WGs.
void reset()
Reset the barrier.
void incNumAtBarrier()
Mark that a WF has reached the barrier.
int _maxBarrierCnt
The maximum number of WFs that can reach this barrier.
int _numAtBarrier
The number of WFs in the WG that have reached the barrier.
bool allAtBarrier() const
Have all WFs participating in this barrier reached the barrier? If so, then the barrier is satisfied ...
int maxBarrierCnt() const
A simple distribution stat.
This is a simple scalar statistic, like a counter.
A vector of distributions.
A vector of scalar stats.
ClockedObject declaration and implementation.
virtual Port & getPort(const std::string &if_name, PortID idx=InvalidPortID)
Get a port with a given name and index.
HSAQueueEntry is the simulator's internal representation of an AQL queue entry (task).
Copyright (c) 2024 - Pranith Kumar Copyright (c) 2020 Inria All rights reserved.
std::shared_ptr< Request > RequestPtr
const PortID InvalidPortID
std::shared_ptr< GPUDynInst > GPUDynInstPtr
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
int16_t PortID
Port index/ID type, and a symbolic name for an invalid port id.
uint64_t Tick
Tick count type.
Declaration of Statistics objects.
statistics::Scalar spillReads
statistics::Scalar groupWrites
statistics::Scalar numVecOpsExecutedF64
statistics::Scalar numFailedCASOps
statistics::Scalar numVecOpsExecuted
statistics::Formula vpc_f64
statistics::Scalar instCyclesSALU
statistics::Formula vectorMemWritesPerWF
statistics::Scalar argWrites
statistics::Scalar globalReads
statistics::Scalar numCASOps
statistics::Scalar completedWGs
statistics::Distribution activeLanesPerLMemInstrDist
statistics::Formula vALUInstsPerWF
statistics::Formula vectorMemWritesPerKiloInst
statistics::Formula sALUInstsPerWF
statistics::Formula readonlyMemInsts
statistics::Formula vALUUtilization
ComputeUnitStats(statistics::Group *parent, int n_wf)
statistics::Formula privMemInsts
statistics::VectorDistribution instInterleave
statistics::Scalar flatVMemInsts
statistics::Scalar numVecOpsExecutedMAC64
statistics::Formula vpc_f16
statistics::Scalar wgBlockedDueBarrierAllocation
statistics::Scalar wgBlockedDueLdsAllocation
statistics::Scalar dynamicLMemInstrCnt
statistics::Formula flatLDSInstsPerWF
statistics::Scalar numVecOpsExecutedMFMAF16
statistics::Vector instCyclesVMemPerSimd
statistics::Formula flatVMemInstsPerWF
statistics::Scalar argReads
statistics::Scalar numVecOpsExecutedTwoOpFP
statistics::Distribution waveLevelParallelism
statistics::Scalar numVecOpsExecutedF32
statistics::Scalar numVecOpsExecutedFMA64
statistics::Scalar scalarMemWrites
statistics::Formula scalarMemInstsPerKiloInst
statistics::Distribution controlFlowDivergenceDist
statistics::Formula groupMemInsts
statistics::Scalar privReads
statistics::Scalar numVecOpsExecutedMAC16
statistics::Scalar numTimesWgBlockedDueSgprAlloc
statistics::Formula numALUInstsExecuted
statistics::Scalar completedWfs
statistics::Distribution ldsBankConflictDist
statistics::Scalar vectorMemWrites
statistics::Scalar numInstrExecuted
statistics::Scalar vectorMemReads
statistics::Formula argMemInsts
statistics::Scalar tlbCycles
statistics::Formula scalarMemWritesPerKiloInst
statistics::Scalar scalarMemReads
statistics::Scalar tlbRequests
statistics::Formula kernargMemInsts
statistics::Formula vectorMemReadsPerKiloInst
statistics::Scalar numVecOpsExecutedF16
statistics::Scalar groupReads
statistics::Scalar privWrites
statistics::Scalar kernargReads
statistics::Scalar instCyclesVALU
statistics::Formula scalarMemWritesPerWF
statistics::Scalar readonlyWrites
statistics::Scalar numVecOpsExecutedMAD64
statistics::Scalar numVecOpsExecutedMFMAF64
statistics::Formula vectorMemReadsPerWF
statistics::Scalar dynamicGMemInstrCnt
statistics::Formula vpc_f32
statistics::Scalar ldsBankAccesses
statistics::Formula tlbLatency
statistics::Scalar vALUInsts
statistics::Scalar numVecOpsExecutedFMA32
statistics::Formula scalarMemReadsPerKiloInst
statistics::Formula globalMemInsts
statistics::Formula scalarMemReadsPerWF
statistics::Scalar numVecOpsExecutedMAD16
statistics::Vector hitsPerTLBLevel
statistics::Scalar numVecOpsExecutedMAC32
statistics::Scalar numTimesWgBlockedDueVgprAlloc
statistics::Scalar threadCyclesVALU
statistics::Scalar ldsNoFlatInsts
statistics::Scalar flatLDSInsts
statistics::Scalar numVecOpsExecutedMFMAF32
statistics::Scalar numVecOpsExecutedFMA16
statistics::Scalar spillWrites
statistics::Formula ldsNoFlatInstsPerWF
statistics::Scalar numVecOpsExecutedMAD32
statistics::Formula spillMemInsts
statistics::Scalar numVecOpsExecutedMFMAI8
statistics::Vector instCyclesLdsPerSimd
statistics::Vector instCyclesScMemPerSimd
statistics::Scalar kernargWrites
statistics::Distribution pageDivergenceDist
statistics::Distribution activeLanesPerGMemInstrDist
statistics::Scalar globalWrites
statistics::Scalar dynamicFlatMemInstrCnt
statistics::Distribution headTailLatency
statistics::Scalar totalCycles
statistics::Distribution execRateDist
statistics::Formula vectorMemInstsPerKiloInst
statistics::Scalar readonlyReads
statistics::Scalar sALUInsts
statistics::Scalar numVecOpsExecutedMFMA
SenderState is information carried along with the packet throughout the TLB hierarchy.
SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
GPUDynInstPtr _gpuDynInst
SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index, Packet::SenderState *sender_state=nullptr)
ComputeUnit * computeUnit
Packet::SenderState * saved
SenderState(ComputeUnit *cu, PortID _port_index, Packet::SenderState *sender_state=nullptr)
GPUDynInstPtr _gpuDynInst
SenderState is information carried along with the packet throughout the TLB hierarchy.
SenderState(Wavefront *_wavefront)
SenderState(Wavefront *_wavefront, Packet::SenderState *sender_state=nullptr, int _kernId=-1)
Packet::SenderState * saved
SenderState(GPUDynInstPtr gpuDynInst)
GPUDynInstPtr _gpuDynInst
Packet::SenderState * saved
SenderState(GPUDynInstPtr gpuDynInst, Packet::SenderState *sender_state=nullptr)
GPUDynInstPtr _gpuDynInst
A virtual base opaque structure used to hold state associated with the packet (e.g....