32#ifndef __GPU_DYN_INST_HH__
33#define __GPU_DYN_INST_HH__
43#include "debug/GPUMem.hh"
44#include "enums/StorageClassType.hh"
64 if constexpr (
sizeof(T) == 4) {
73 fatal(
"Attempted packed atomic bf16 on non 32-bit type");
147 uint64_t instSeqNum);
182 bool isOpcode(
const std::string& opcodeStr)
const;
183 bool isOpcode(
const std::string& opcodeStr,
184 const std::string& extStr)
const;
340 return std::make_unique<AtomicOpAnd<c0>>(*reg0);
342 return std::make_unique<AtomicOpOr<c0>>(*reg0);
344 return std::make_unique<AtomicOpXor<c0>>(*reg0);
346 return std::make_unique<AtomicOpCAS<c0>>(*reg0, *reg1,
cu);
348 return std::make_unique<AtomicOpExch<c0>>(*reg0);
350 return std::make_unique<AtomicOpAdd<c0>>(*reg0);
352 return std::make_unique<AtomicOpSub<c0>>(*reg0);
354 return std::make_unique<AtomicOpInc<c0>>();
356 return std::make_unique<AtomicOpDec<c0>>();
358 return std::make_unique<AtomicOpMax<c0>>(*reg0);
360 return std::make_unique<AtomicOpMin<c0>>(*reg0);
362 return std::make_unique<AtomicOpPkAddBF16<c0>>(*reg0);
364 fatal(
"Unrecognized atomic operation");
400 assert(
statusVector.size() == TheGpuISA::NumVecElemPerVecReg);
401 for (
int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
420 assert((newVal >= 0) && (newVal <= 4));
453 for (
int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
456 DPRINTF(GPUMem,
"CU%d: WF[%d][%d]: lane: %d has %d pending "
464 DPRINTF(GPUMem,
"CU%d: WF[%d][%d]: all lanes have no pending"
474 std::string statusVec_str =
"[";
478 for (
int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
481 statusVec_str +=
"]";
483 return statusVec_str;
AtomicOpFunctor * clone()
ComputeUnit * computeUnit
AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
AtomicOpPkAddBF16(T _data)
AtomicOpFunctor * clone()
bool isFlatScratch() const
bool isKernelLaunch() const
std::unordered_map< Addr, std::vector< int > > StatusVector
std::vector< Tick > roundTripTime
bool hasDestinationSgpr() const
int numDstScalarRegOperands() const
std::map< Addr, std::vector< Tick > > lineAddressTime
void doApertureCheck(const VectorMask &mask)
void resolveFlatSegment(const VectorMask &mask)
std::vector< int > tlbHitLevel
bool isAtomicExch() const
std::vector< Tick > getRoundTripTime() const
bool isFlatGlobal() const
GPUStaticInst * _staticInst
bool hasDestinationVgpr() const
std::vector< int > statusVector
void profileLineAddressTime(Addr addr, Tick currentTime, int hopId)
void decrementStatusVector(int lane)
bool isUnconditionalJump() const
GPUStaticInst * staticInstruction()
int numSrcScalarRegOperands() const
bool isOpcode(const std::string &opcodeStr) const
GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst, uint64_t instSeqNum)
const std::map< Addr, std::vector< Tick > > & getLineAddressTime() const
bool isALU() const
Accessor methods for the attributes of the underlying GPU static instruction.
bool isReadOnlySeg() const
bool isSystemCoherent() const
bool hasSourceVgpr() const
int numDstVecRegOperands() const
StatusVector memStatusVector
bool hasSourceSgpr() const
int getLaneStatus(int lane) const
bool readsFlatScratch() const
void initiateAcc(GPUDynInstPtr gpuDynInst)
int getNumOperands() const
enums::StorageClassType executedAs()
const std::vector< OperandInfo > & dstVecRegOperands() const
void profileRoundTripTime(Tick currentTime, int hopId)
void resetStatusVector(int lane)
bool isCondBranch() const
bool writesExecMask() const
bool isPrivateSeg() const
bool isEndOfKernel() const
void resetEntireStatusVector()
const std::vector< OperandInfo > & srcVecRegOperands() const
bool isGloballyCoherent() const
bool readsExecMask() const
TheGpuISA::ScalarRegU32 srcLiteral() const
Tick getAccessTime() const
int maxSrcScalarRegOperandSize()
InstSeqNum seqNum() const
const std::vector< OperandInfo > & srcScalarRegOperands() const
const std::vector< OperandInfo > & dstScalarRegOperands() const
int numSrcVecRegOperands() const
void setRequestFlags(RequestPtr req) const
std::string printStatusVector() const
bool writesFlatScratch() const
bool allLanesZero() const
void execute(GPUDynInstPtr gpuDynInst)
AtomicOpFunctorPtr makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
bool isKernArgSeg() const
void setStatusVector(int lane, int newVal)
void setAccessTime(Tick currentTime)
int maxSrcScalarRegOpSize
bool isAtomicPkAddBF16() const
int maxSrcVecRegOperandSize()
bool isAtomicNoRet() const
const std::string & disassemble() const
void completeAcc(GPUDynInstPtr gpuDynInst)
GPUExecContext(ComputeUnit *_cu, Wavefront *_wf)
const std::vector< int > virtIndices
const int opIdx
Index of this operand within the set of its parent instruction's operand list.
const std::vector< int > physIndices
RegisterOperandInfo(int op_idx, int num_dwords, const std::vector< int > &virt_indices, const std::vector< int > &phys_indices)
const int numDWORDs
Size of this operand in DWORDs.
int virtIdx(int reg_num=0) const
We typically only need the first virtual register for the operand regardless of its size.
int numRegisters() const
The number of registers required to store this operand.
RegisterOperandInfo()=delete
@ ATOMIC_RETURN_OP
The request is an atomic that returns data.
@ ATOMIC_NO_RETURN_OP
The request is an atomic that does not return data.
@ SLC_BIT
user-policy flags
std::unique_ptr< AtomicOpFunctor > AtomicOpFunctorPtr
#define fatal(...)
This implements a cprintf-based fatal() function.
Copyright (c) 2024 Arm Limited. All rights reserved.
std::shared_ptr< Request > RequestPtr
std::shared_ptr< GPUDynInst > GPUDynInstPtr
uint64_t Addr
Address type. This will probably be moved somewhere else in the near future.
uint64_t Tick
Tick count type.
std::bitset< std::numeric_limits< unsigned long long >::digits > VectorMask