gem5 v19.0.0.0
gpu_dyn_inst.hh
/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Anthony Gutierrez
 */

#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__

#include <cstdint>
#include <string>

#include "base/amo.hh"
#include "base/logging.hh"
#include "enums/MemType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"

class GPUStaticInst;

template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    T c;
    T s;

    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
      : c(_c), s(_s), computeUnit(compute_unit) { }

    void
    execute(T *b)
    {
        computeUnit->numCASOps++;

        if (*b == c) {
            *b = s;
        } else {
            computeUnit->numFailedCASOps++;
        }

        if (computeUnit->xact_cas_mode) {
            computeUnit->xactCasLoadMap.clear();
        }
    }
    AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); }
};
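
// Illustrative sketch (not part of the original header): the memory system
// applies an AtomicOpFunctor to the target memory word through operator(),
// which TypedAtomicOpFunctor forwards to execute(). Assuming a 32-bit CAS
// built from hypothetical operand values, the effect is roughly:
//
//     uint32_t word = 1;
//     AtomicOpCAS<uint32_t> cas(1, 2, computeUnit);  // compare 1, swap in 2;
//                                                    // computeUnit assumed valid
//     cas(reinterpret_cast<uint8_t *>(&word));       // word becomes 2; on a
//                                                    // mismatch it is left
//                                                    // unchanged and
//                                                    // numFailedCASOps is bumped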

typedef enum
{
    VT_32,
    VT_64,
} vgpr_type;

class GPUDynInst : public GPUExecContext
{
  public:
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
               uint64_t instSeqNum);
    ~GPUDynInst();
    void execute(GPUDynInstPtr gpuDynInst);
    int numSrcRegOperands();
    int numDstRegOperands();
    int getNumOperands();
    bool isVectorRegister(int operandIdx);
    bool isScalarRegister(int operandIdx);
    bool isCondRegister(int operandIdx);
    int getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst);
    int getOperandSize(int operandIdx);
    bool isDstOperand(int operandIdx);
    bool isSrcOperand(int operandIdx);

    const std::string &disassemble() const;

    uint64_t seqNum() const;

    Enums::StorageClassType executedAs();

    // The address of the memory operation
    std::vector<Addr> addr;

    // The data to get written
    uint8_t *d_data;
    // Additional data (for atomics)
    uint8_t *a_data;
    // Additional data (for atomics)
    uint8_t *x_data;
    // The execution mask
    VectorMask exec_mask;

    // The memory type (M_U32, M_S32, ...)
    Enums::MemType m_type;

    // The equivalency class
    int equiv;
    // The return VGPR type (VT_32 or VT_64)
    vgpr_type v_type;
    // Number of VGPRs accessed (1, 2, or 4)
    int n_reg;
    // The return VGPR index
    int dst_reg;
    // There can be at most 4 dest regs
    int dst_reg_vec[4];
    // SIMD unit to which the WF of the memory instruction has been mapped
    int simdId;
    // Unique id of the WF to which the memory instruction belongs
    int wfDynId;
    // The kernel id of the requesting WF
    int kern_id;
    // The CU id of the requesting WF
    int cu_id;
    // HW slot id to which the WF is mapped inside a SIMD unit
    int wfSlotId;
    // Execution pipeline id where the memory instruction has been scheduled
    int pipeId;
    // The execution time of this operation
    Tick time;
    // The latency of this operation
    WaitClass latency;
    // A list of bank conflicts for the 4 cycles.
    uint32_t bc[4];

    // A pointer to ROM
    uint8_t *rom;
    // The size of the READONLY segment
    int sz_rom;

    // Initiate the specified memory operation by creating a
    // memory request and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);
    // Complete the specified memory operation by writing the value back
    // to the RF in the case of a load or atomic return; in the case of
    // a store, do nothing.
    void completeAcc(GPUDynInstPtr gpuDynInst);

    void updateStats();

    GPUStaticInst* staticInstruction() { return _staticInst; }

    bool isALU() const;
    bool isBranch() const;
    bool isNop() const;
    bool isReturn() const;
    bool isUnconditionalJump() const;
    bool isSpecialOp() const;
    bool isWaitcnt() const;

    bool isBarrier() const;
    bool isMemFence() const;
    bool isMemRef() const;
    bool isFlat() const;
    bool isLoad() const;
    bool isStore() const;

    bool isAtomic() const;
    bool isAtomicNoRet() const;
    bool isAtomicRet() const;

    bool isScalar() const;
    bool readsSCC() const;
    bool writesSCC() const;
    bool readsVCC() const;
    bool writesVCC() const;

    bool isAtomicAnd() const;
    bool isAtomicOr() const;
    bool isAtomicXor() const;
    bool isAtomicCAS() const;
    bool isAtomicExch() const;
    bool isAtomicAdd() const;
    bool isAtomicSub() const;
    bool isAtomicInc() const;
    bool isAtomicDec() const;
    bool isAtomicMax() const;
    bool isAtomicMin() const;

    bool isArgLoad() const;
    bool isGlobalMem() const;
    bool isLocalMem() const;

    bool isArgSeg() const;
    bool isGlobalSeg() const;
    bool isGroupSeg() const;
    bool isKernArgSeg() const;
    bool isPrivateSeg() const;
    bool isReadOnlySeg() const;
    bool isSpillSeg() const;

    bool isWorkitemScope() const;
    bool isWavefrontScope() const;
    bool isWorkgroupScope() const;
    bool isDeviceScope() const;
    bool isSystemScope() const;
    bool isNoScope() const;

    bool isRelaxedOrder() const;
    bool isAcquire() const;
    bool isRelease() const;
    bool isAcquireRelease() const;
    bool isNoOrder() const;

    bool isGloballyCoherent() const;
    bool isSystemCoherent() const;

    /*
     * Loads/stores/atomics may have acquire/release semantics associated
     * with them. Some protocols want to see the acquire/release as separate
     * requests from the load/store/atomic. We implement that separation
     * using continuations (i.e., a function pointer with an object associated
     * with it). When, for example, the front-end generates a store with
     * release semantics, we will first issue a normal store and set the
     * continuation in the GPUDynInst to a function that generates a
     * release request. That continuation will be called when the normal
     * store completes (in ComputeUnit::DataPort::recvTimingResponse). The
     * continuation will be called in the context of the same GPUDynInst
     * that generated the initial store.
     */
    std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;

    // when true, call execContinuation when the response arrives
    bool useContinuation;

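    /*
     * Illustrative sketch (assumed usage, not a quote of the gem5 source):
     * a pipeline issuing a store-release could split it roughly as follows;
     * 'inst' and 'issueRelease' are hypothetical names.
     *
     *     inst->initiateAcc(inst);          // the normal store goes out first
     *     inst->useContinuation = true;     // ask for the follow-up request
     *     inst->execContinuation =
     *         [](GPUStaticInst *si, GPUDynInstPtr di) {
     *             issueRelease(si, di);     // builds the release request
     *         };
     *
     * When the store's response returns,
     * ComputeUnit::DataPort::recvTimingResponse invokes the stored
     * continuation with this instruction's static instruction and its
     * GPUDynInstPtr.
     */
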
    template<typename c0> AtomicOpFunctor*
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
    {
        if (isAtomicAnd()) {
            return new AtomicOpAnd<c0>(*reg0);
        } else if (isAtomicOr()) {
            return new AtomicOpOr<c0>(*reg0);
        } else if (isAtomicXor()) {
            return new AtomicOpXor<c0>(*reg0);
        } else if (isAtomicCAS()) {
            return new AtomicOpCAS<c0>(*reg0, *reg1, cu);
        } else if (isAtomicExch()) {
            return new AtomicOpExch<c0>(*reg0);
        } else if (isAtomicAdd()) {
            return new AtomicOpAdd<c0>(*reg0);
        } else if (isAtomicSub()) {
            return new AtomicOpSub<c0>(*reg0);
        } else if (isAtomicInc()) {
            return new AtomicOpInc<c0>();
        } else if (isAtomicDec()) {
            return new AtomicOpDec<c0>();
        } else if (isAtomicMax()) {
            return new AtomicOpMax<c0>(*reg0);
        } else if (isAtomicMin()) {
            return new AtomicOpMin<c0>(*reg0);
        } else {
            fatal("Unrecognized atomic operation");
        }
    }
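
    /*
     * Illustrative sketch (assumed usage): when an atomic instruction builds
     * its memory request, the functor returned here can be handed to the
     * memory system, which applies it to the target location. The operand
     * values below are hypothetical.
     *
     *     uint32_t src0 = 5, src1 = 0;
     *     AtomicOpFunctor *amo = makeAtomicOpFunctor<uint32_t>(&src0, &src1);
     *     // attach 'amo' to the request; the memory system eventually calls
     *     // (*amo)(reinterpret_cast<uint8_t *>(&targetWord)) and, for
     *     // atomic-return ops, the old value is returned to the RF
     */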

    void
    setRequestFlags(RequestPtr req, bool setMemOrder=true)
    {
        // currently these are the easy scopes to deduce
        if (isPrivateSeg()) {
            req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
        } else if (isSpillSeg()) {
            req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
        } else if (isGlobalSeg()) {
            req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
        } else if (isReadOnlySeg()) {
            req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
        } else if (isGroupSeg()) {
            req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
        } else if (isFlat()) {
            panic("TODO: translate to correct scope");
        } else {
            fatal("%s has bad segment type\n", disassemble());
        }

        if (isWavefrontScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WAVEFRONT_SCOPE);
        } else if (isWorkgroupScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WORKGROUP_SCOPE);
        } else if (isDeviceScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::DEVICE_SCOPE);
        } else if (isSystemScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::SYSTEM_SCOPE);
        } else if (!isNoScope() && !isWorkitemScope()) {
            fatal("%s has bad scope type\n", disassemble());
        }

        if (setMemOrder) {
            // set acquire and release flags
            if (isAcquire()) {
                req->setFlags(Request::ACQUIRE);
            } else if (isRelease()) {
                req->setFlags(Request::RELEASE);
            } else if (isAcquireRelease()) {
                req->setFlags(Request::ACQUIRE | Request::RELEASE);
            } else if (!isNoOrder()) {
                fatal("%s has bad memory order\n", disassemble());
            }
        }

        // set atomic type
        // currently, the instruction generator only produces atomic return,
        // but a magic instruction can produce atomic no return
        if (isAtomicRet()) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (isAtomicNoRet()) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }
    }
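
    /*
     * Illustrative sketch (assumed usage): a memory pipeline typically builds
     * one request per active lane and lets this helper derive the segment,
     * scope, and ordering flags from the instruction. The variable names
     * (vaddr, reqSize, masterId) are hypothetical, and the exact Request
     * constructor arguments may differ.
     *
     *     RequestPtr req = std::make_shared<Request>(0, vaddr, reqSize, 0,
     *                                                masterId, 0, wfDynId);
     *     setRequestFlags(req);
     *     // req now carries the segment/scope flags and, if requested,
     *     // the acquire/release ordering flags for this instruction
     */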

    // Map the addresses satisfied by returned packets to the lanes
    // they were requested from
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;

    // Track the status of memory requests per lane, a bit per lane
    VectorMask statusBitVector;
    // for ld_v# or st_v#
    std::vector<int> statusVector;
    std::vector<int> tlbHitLevel;

  private:
    GPUStaticInst *_staticInst;
    uint64_t _seqNum;
};

#endif // __GPU_DYN_INST_HH__