release/v19-0-0-0/arch_2hsail_2insts_2mem_8hh_source.html

 /*
  * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
  * All rights reserved.
  *
  * For use for simulation and test purposes only
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * 1. Redistributions of source code must retain the above copyright notice,
  * this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright notice,
  * this list of conditions and the following disclaimer in the documentation
  * and/or other materials provided with the distribution.
  *
  * 3. Neither the name of the copyright holder nor the names of its contributors
  * may be used to endorse or promote products derived from this software
  * without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  * Author: Steve Reinhardt
  */

 #ifndef __ARCH_HSAIL_INSTS_MEM_HH__
 #define __ARCH_HSAIL_INSTS_MEM_HH__

 #include <type_traits>

 #include "arch/hsail/insts/decl.hh"
 #include "arch/hsail/insts/gpu_static_inst.hh"
 #include "arch/hsail/operand.hh"
 #include "gpu-compute/compute_unit.hh"

 namespace HsailISA
 {
     class MemInst
     {
       public:
         MemInst() : size(0), addr_operand(nullptr) { }

         MemInst(Enums::MemType m_type)
         {
             if (m_type == Enums::M_U64 ||
                 m_type == Enums::M_S64 ||
                 m_type == Enums::M_F64) {
                 size = 8;
             } else if (m_type == Enums::M_U32 ||
                        m_type == Enums::M_S32 ||
                        m_type == Enums::M_F32) {
                 size = 4;
             } else if (m_type == Enums::M_U16 ||
                        m_type == Enums::M_S16 ||
                        m_type == Enums::M_F16) {
                 size = 2;
             } else {
                 size = 1;
             }

             addr_operand = nullptr;
         }

         void
         init_addr(AddrOperandBase *_addr_operand)
         {
             addr_operand = _addr_operand;
         }

       private:
         int size;
         AddrOperandBase *addr_operand;

       public:
         int getMemOperandSize() { return size; }
         AddrOperandBase *getAddressOperand() { return addr_operand; }
     };

     template<typename DestOperandType, typename AddrOperandType>
     class LdaInstBase : public HsailGPUStaticInst
     {
       public:
         typename DestOperandType::DestOperand dest;
         AddrOperandType addr;

         LdaInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                     const char *_opcode)
            : HsailGPUStaticInst(obj, _opcode)
         {
             using namespace Brig;

             setFlag(ALU);

             unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
             dest.init(op_offs, obj);
             op_offs = obj->getOperandPtr(ib->operands, 1);
             addr.init(op_offs, obj);
         }

         int numSrcRegOperands() override
         { return(this->addr.isVectorRegister()); }
         int numDstRegOperands() override
         { return dest.isVectorRegister(); }
         bool isVectorRegister(int operandIndex) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             return((operandIndex == 0) ? dest.isVectorRegister() :
                    this->addr.isVectorRegister());
         }
         bool isCondRegister(int operandIndex) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             return((operandIndex == 0) ? dest.isCondRegister() :
                    this->addr.isCondRegister());
         }
         bool isScalarRegister(int operandIndex) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             return((operandIndex == 0) ? dest.isScalarRegister() :
                    this->addr.isScalarRegister());
         }
         bool isSrcOperand(int operandIndex) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             if (operandIndex > 0)
                 return(this->addr.isVectorRegister());
             return false;
         }
         bool isDstOperand(int operandIndex) override {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             return(operandIndex == 0);
         }
         int getOperandSize(int operandIndex) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             return((operandIndex == 0) ? dest.opSize() :
                    this->addr.opSize());
         }
         int
         getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             return((operandIndex == 0) ? dest.regIndex() :
                    this->addr.regIndex());
         }
         int getNumOperands() override
         {
             if (this->addr.isVectorRegister())
                 return 2;
             return 1;
         }
     };

     template<typename DestDataType, typename AddrOperandType>
     class LdaInst :
         public LdaInstBase<typename DestDataType::OperandType, AddrOperandType>,
         public MemInst
     {
       public:
         void generateDisassembly();

         LdaInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
                         const char *_opcode)
             : LdaInstBase<typename DestDataType::OperandType,
                           AddrOperandType>(ib, obj, _opcode)
         {
             init_addr(&this->addr);
         }

         void execute(GPUDynInstPtr gpuDynInst);
     };

     template<typename DataType>
     GPUStaticInst*
     decodeLda(const Brig::BrigInstBase *ib, const BrigObject *obj)
     {
         unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
         BrigRegOperandInfo regDataType = findRegDataType(op_offs, obj);

         if (regDataType.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
             return new LdaInst<DataType, NoRegAddrOperand>(ib, obj, "ldas");
         } else if (regDataType.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
             // V2/V4 not allowed
             switch (regDataType.regKind) {
               case Brig::BRIG_REGISTER_KIND_SINGLE:
                 return new LdaInst<DataType, SRegAddrOperand>(ib, obj, "ldas");
               case Brig::BRIG_REGISTER_KIND_DOUBLE:
                 return new LdaInst<DataType, DRegAddrOperand>(ib, obj, "ldas");
               default:
                 fatal("Bad ldas register operand type %d\n", regDataType.type);
             }
         } else {
             fatal("Bad ldas register operand kind %d\n", regDataType.kind);
         }
     }

     template<typename MemOperandType, typename DestOperandType,
              typename AddrOperandType>
     class LdInstBase : public HsailGPUStaticInst
     {
       public:
         Brig::BrigWidth8_t width;
         typename DestOperandType::DestOperand dest;
         AddrOperandType addr;

         Brig::BrigSegment segment;
         Brig::BrigMemoryOrder memoryOrder;
         Brig::BrigMemoryScope memoryScope;
         unsigned int equivClass;

         LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                    const char *_opcode)
            : HsailGPUStaticInst(obj, _opcode)
         {
             using namespace Brig;

             setFlag(MemoryRef);
             setFlag(Load);

             if (ib->opcode == BRIG_OPCODE_LD) {
                 const BrigInstMem *ldst = (const BrigInstMem*)ib;

                 segment = (BrigSegment)ldst->segment;
                 memoryOrder = BRIG_MEMORY_ORDER_NONE;
                 memoryScope = BRIG_MEMORY_SCOPE_NONE;
                 equivClass = ldst->equivClass;

                 width = ldst->width;
                 unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                 const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);
                 if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
                     dest.init(op_offs, obj);

                 op_offs = obj->getOperandPtr(ib->operands, 1);
                 addr.init(op_offs, obj);
             } else {
                 const BrigInstAtomic *at = (const BrigInstAtomic*)ib;

                 segment = (BrigSegment)at->segment;
                 memoryOrder = (BrigMemoryOrder)at->memoryOrder;
                 memoryScope = (BrigMemoryScope)at->memoryScope;
                 equivClass = 0;

                 width = BRIG_WIDTH_1;
                 unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                 const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);

                 if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
                     dest.init(op_offs, obj);

                 op_offs = obj->getOperandPtr(ib->operands,1);
                 addr.init(op_offs, obj);
             }

             switch (memoryOrder) {
               case BRIG_MEMORY_ORDER_NONE:
                 setFlag(NoOrder);
                 break;
               case BRIG_MEMORY_ORDER_RELAXED:
                 setFlag(RelaxedOrder);
                 break;
               case BRIG_MEMORY_ORDER_SC_ACQUIRE:
                 setFlag(Acquire);
                 break;
               case BRIG_MEMORY_ORDER_SC_RELEASE:
                 setFlag(Release);
                 break;
               case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
                 setFlag(AcquireRelease);
                 break;
               default:
                 fatal("LdInst has bad memory order type\n");
             }

             switch (memoryScope) {
               case BRIG_MEMORY_SCOPE_NONE:
                 setFlag(NoScope);
                 break;
               case BRIG_MEMORY_SCOPE_WORKITEM:
                 setFlag(WorkitemScope);
                 break;
               case BRIG_MEMORY_SCOPE_WORKGROUP:
                 setFlag(WorkgroupScope);
                 break;
               case BRIG_MEMORY_SCOPE_AGENT:
                 setFlag(DeviceScope);
                 break;
               case BRIG_MEMORY_SCOPE_SYSTEM:
                 setFlag(SystemScope);
                 break;
               default:
                 fatal("LdInst has bad memory scope type\n");
             }

             switch (segment) {
               case BRIG_SEGMENT_GLOBAL:
                 setFlag(GlobalSegment);
                 break;
               case BRIG_SEGMENT_GROUP:
                 setFlag(GroupSegment);
                 break;
               case BRIG_SEGMENT_PRIVATE:
                 setFlag(PrivateSegment);
                 break;
               case BRIG_SEGMENT_READONLY:
                 setFlag(ReadOnlySegment);
                 break;
               case BRIG_SEGMENT_SPILL:
                 setFlag(SpillSegment);
                 break;
               case BRIG_SEGMENT_FLAT:
                 setFlag(Flat);
                 break;
               case BRIG_SEGMENT_KERNARG:
                 setFlag(KernArgSegment);
                 break;
               case BRIG_SEGMENT_ARG:
                 setFlag(ArgSegment);
                 break;
               default:
                 panic("Ld: segment %d not supported\n", segment);
             }
         }

         int numSrcRegOperands() override
         { return(this->addr.isVectorRegister()); }
         int numDstRegOperands() override { return dest.isVectorRegister(); }
         int getNumOperands() override
         {
             if (this->addr.isVectorRegister())
                 return 2;
             else
                 return 1;
         }
         bool isVectorRegister(int operandIndex) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             return((operandIndex == 0) ? dest.isVectorRegister() :
                    this->addr.isVectorRegister());
         }
         bool isCondRegister(int operandIndex) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             return((operandIndex == 0) ? dest.isCondRegister() :
                    this->addr.isCondRegister());
         }
         bool isScalarRegister(int operandIndex) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             return((operandIndex == 0) ? dest.isScalarRegister() :
                    this->addr.isScalarRegister());
         }
         bool isSrcOperand(int operandIndex) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             if (operandIndex > 0)
                 return(this->addr.isVectorRegister());
             return false;
         }
         bool isDstOperand(int operandIndex) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             return(operandIndex == 0);
         }
         int getOperandSize(int operandIndex) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             return((operandIndex == 0) ? dest.opSize() :
                    this->addr.opSize());
         }
         int
         getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             return((operandIndex == 0) ? dest.regIndex() :
                    this->addr.regIndex());
         }
     };

     template<typename MemDataType, typename DestDataType,
              typename AddrOperandType>
     class LdInst :
         public LdInstBase<typename MemDataType::CType,
                           typename DestDataType::OperandType, AddrOperandType>,
         public MemInst
     {
         typename DestDataType::OperandType::DestOperand dest_vect[4];
         uint16_t num_dest_operands;
         void generateDisassembly() override;

       public:
         LdInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
                const char *_opcode)
             : LdInstBase<typename MemDataType::CType,
                          typename DestDataType::OperandType,
                          AddrOperandType>(ib, obj, _opcode),
               MemInst(MemDataType::memType)
         {
             init_addr(&this->addr);

             unsigned op_offs = obj->getOperandPtr(ib->operands,0);
             const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);

             if (brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
                 const Brig::BrigOperandOperandList *brigRegVecOp =
                     (const Brig::BrigOperandOperandList*)brigOp;

                 num_dest_operands =
                     *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;

                 assert(num_dest_operands <= 4);
             } else {
                 num_dest_operands = 1;
             }

             if (num_dest_operands > 1) {
                 assert(brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);

                 for (int i = 0; i < num_dest_operands; ++i) {
                     dest_vect[i].init_from_vect(op_offs, obj, i);
                 }
             }
         }

         void
         initiateAcc(GPUDynInstPtr gpuDynInst) override
         {
             typedef typename MemDataType::CType c0;

             gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

             if (num_dest_operands > 1) {
                 for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
                     if (gpuDynInst->exec_mask[i])
                         gpuDynInst->statusVector.push_back(num_dest_operands);
                     else
                         gpuDynInst->statusVector.push_back(0);
             }

             for (int k = 0; k < num_dest_operands; ++k) {

                 c0 *d = &((c0*)gpuDynInst->d_data)
                     [k * gpuDynInst->computeUnit()->wfSize()];

                 for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                     if (gpuDynInst->exec_mask[i]) {
                         Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);

                         if (this->isLocalMem()) {
                             // load from shared memory
                             *d = gpuDynInst->wavefront()->ldsChunk->
                                 read<c0>(vaddr);
                         } else {
                             RequestPtr req = std::make_shared<Request>(0,
                                 vaddr, sizeof(c0), 0,
                                 gpuDynInst->computeUnit()->masterId(),
                                 0, gpuDynInst->wfDynId);

                             gpuDynInst->setRequestFlags(req);
                             PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
                             pkt->dataStatic(d);

                             if (gpuDynInst->computeUnit()->shader->
                                 separate_acquire_release &&
                                 gpuDynInst->isAcquire()) {
                                 // if this load has acquire semantics,
                                 // set the response continuation function
                                 // to perform an Acquire request
                                 gpuDynInst->execContinuation =
                                     &GPUStaticInst::execLdAcq;

                                 gpuDynInst->useContinuation = true;
                             } else {
                                 // the request will be finished when
                                 // the load completes
                                 gpuDynInst->useContinuation = false;
                             }
                             // translation is performed in sendRequest()
                             gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
                                                                    i, pkt);
                         }
                     }
                     ++d;
                 }
             }

             gpuDynInst->updateStats();
         }

         void
         completeAcc(GPUDynInstPtr gpuDynInst) override
         {
             typedef typename MemDataType::CType c1;

             constexpr bool is_vt_32 = DestDataType::vgprType == VT_32;

             typedef typename std::conditional<is_vt_32,
                 typename std::conditional<std::is_floating_point<c1>::value,
                     float, typename std::conditional<std::is_signed<c1>::value,
                     int32_t, uint32_t>::type>::type,
                 typename std::conditional<std::is_floating_point<c1>::value,
                     double, typename std::conditional<std::is_signed<c1>::value,
                     int64_t, uint64_t>::type>::type>::type c0;


             Wavefront *w = gpuDynInst->wavefront();

             std::vector<uint32_t> regVec;
             // iterate over number of destination register operands since
             // this is a load
             for (int k = 0; k < num_dest_operands; ++k) {
                 assert((sizeof(c1) * num_dest_operands)
                        <= MAX_WIDTH_FOR_MEM_INST);

                 int dst = this->dest.regIndex() + k;
                 if (num_dest_operands > MAX_REGS_FOR_NON_VEC_MEM_INST)
                     dst = dest_vect[k].regIndex();
                 // virtual->physical VGPR mapping
                 int physVgpr = w->remap(dst, sizeof(c0), 1);
                 // save the physical VGPR index
                 regVec.push_back(physVgpr);

                 c1 *p1 =
                     &((c1*)gpuDynInst->d_data)[k * w->computeUnit->wfSize()];

                 for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                     if (gpuDynInst->exec_mask[i]) {
                         DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                                 "$%s%d <- %d global ld done (src = wavefront "
                                 "ld inst)\n", w->computeUnit->cu_id, w->simdId,
                                 w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d",
                                 dst, *p1);
                         // write the value into the physical VGPR. This is a
                         // purely functional operation. No timing is modeled.
                         w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
                                                                     *p1, i);
                     }
                     ++p1;
                 }
             }

             // Schedule the write operation of the load data on the VRF.
             // This simply models the timing aspect of the VRF write operation.
             // It does not modify the physical VGPR.
             int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
                 vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
                                      sizeof(c0), gpuDynInst->time);

             if (this->isGlobalMem()) {
                 gpuDynInst->computeUnit()->globalMemoryPipe
                     .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
             } else {
                 assert(this->isLocalMem());
                 gpuDynInst->computeUnit()->localMemoryPipe
                     .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
             }
         }

       private:
         void
         execLdAcq(GPUDynInstPtr gpuDynInst) override
         {
             // after the load has complete and if the load has acquire
             // semantics, issue an acquire request.
             if (!this->isLocalMem()) {
                 if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                     && gpuDynInst->isAcquire()) {
                     gpuDynInst->statusBitVector = VectorMask(1);
                     gpuDynInst->useContinuation = false;
                     // create request
                     RequestPtr req = std::make_shared<Request>(0, 0, 0, 0,
                                   gpuDynInst->computeUnit()->masterId(),
                                   0, gpuDynInst->wfDynId);
                     req->setFlags(Request::ACQUIRE);
                     gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
                 }
             }
         }

       public:
         bool isVectorRegister(int operandIndex) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             if ((num_dest_operands != getNumOperands()) &&
                 (operandIndex == (getNumOperands()-1)))
                 return(this->addr.isVectorRegister());
             if (num_dest_operands > 1) {
                 return dest_vect[operandIndex].isVectorRegister();
             }
             else if (num_dest_operands == 1) {
                 return LdInstBase<typename MemDataType::CType,
                        typename DestDataType::OperandType,
                        AddrOperandType>::dest.isVectorRegister();
             }
             return false;
         }
         bool isCondRegister(int operandIndex) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             if ((num_dest_operands != getNumOperands()) &&
                 (operandIndex == (getNumOperands()-1)))
                 return(this->addr.isCondRegister());
             if (num_dest_operands > 1)
                 return dest_vect[operandIndex].isCondRegister();
             else if (num_dest_operands == 1)
                 return LdInstBase<typename MemDataType::CType,
                        typename DestDataType::OperandType,
                        AddrOperandType>::dest.isCondRegister();
             return false;
         }
         bool isScalarRegister(int operandIndex) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             if ((num_dest_operands != getNumOperands()) &&
                 (operandIndex == (getNumOperands()-1)))
                 return(this->addr.isScalarRegister());
             if (num_dest_operands > 1)
                 return dest_vect[operandIndex].isScalarRegister();
             else if (num_dest_operands == 1)
                 return LdInstBase<typename MemDataType::CType,
                        typename DestDataType::OperandType,
                        AddrOperandType>::dest.isScalarRegister();
             return false;
         }
         bool isSrcOperand(int operandIndex) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             if ((num_dest_operands != getNumOperands()) &&
                 (operandIndex == (getNumOperands()-1)))
                 return(this->addr.isVectorRegister());
             return false;
         }
         bool isDstOperand(int operandIndex) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             if ((num_dest_operands != getNumOperands()) &&
                 (operandIndex == (getNumOperands()-1)))
                 return false;
             return true;
         }
         int getOperandSize(int operandIndex) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             if ((num_dest_operands != getNumOperands()) &&
                 (operandIndex == (getNumOperands()-1)))
                 return(this->addr.opSize());
             if (num_dest_operands > 1)
                 return(dest_vect[operandIndex].opSize());
             else if (num_dest_operands == 1)
                 return(LdInstBase<typename MemDataType::CType,
                        typename DestDataType::OperandType,
                        AddrOperandType>::dest.opSize());
             return 0;
         }
         int
         getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             if ((num_dest_operands != getNumOperands()) &&
                 (operandIndex == (getNumOperands()-1)))
                 return(this->addr.regIndex());
             if (num_dest_operands > 1)
                 return(dest_vect[operandIndex].regIndex());
             else if (num_dest_operands == 1)
                 return(LdInstBase<typename MemDataType::CType,
                        typename DestDataType::OperandType,
                        AddrOperandType>::dest.regIndex());
             return -1;
         }
         int getNumOperands() override
         {
             if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                 return(num_dest_operands+1);
             else
                 return(num_dest_operands);
         }
         void execute(GPUDynInstPtr gpuDynInst) override;
     };

     template<typename MemDT, typename DestDT>
     GPUStaticInst*
     decodeLd2(const Brig::BrigInstBase *ib, const BrigObject *obj)
     {
         unsigned op_offs = obj->getOperandPtr(ib->operands,1);
         BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

         if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
             return new LdInst<MemDT, DestDT, NoRegAddrOperand>(ib, obj, "ld");
         } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
                    tmp.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
             switch (tmp.regKind) {
               case Brig::BRIG_REGISTER_KIND_SINGLE:
                 return new LdInst<MemDT, DestDT,
                                   SRegAddrOperand>(ib, obj, "ld");
               case Brig::BRIG_REGISTER_KIND_DOUBLE:
                 return new LdInst<MemDT, DestDT,
                                   DRegAddrOperand>(ib, obj, "ld");
               default:
                 fatal("Bad ld register operand type %d\n", tmp.regKind);
             }
         } else {
             fatal("Bad ld register operand kind %d\n", tmp.kind);
         }
     }

     template<typename MemDT>
     GPUStaticInst*
     decodeLd(const Brig::BrigInstBase *ib, const BrigObject *obj)
     {
         unsigned op_offs = obj->getOperandPtr(ib->operands,0);
         BrigRegOperandInfo dest = findRegDataType(op_offs, obj);

         assert(dest.kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
                dest.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
         switch(dest.regKind) {
           case Brig::BRIG_REGISTER_KIND_SINGLE:
             switch (ib->type) {
               case Brig::BRIG_TYPE_B8:
               case Brig::BRIG_TYPE_B16:
               case Brig::BRIG_TYPE_B32:
                 return decodeLd2<MemDT, B32>(ib, obj);
               case Brig::BRIG_TYPE_U8:
               case Brig::BRIG_TYPE_U16:
               case Brig::BRIG_TYPE_U32:
                 return decodeLd2<MemDT, U32>(ib, obj);
               case Brig::BRIG_TYPE_S8:
               case Brig::BRIG_TYPE_S16:
               case Brig::BRIG_TYPE_S32:
                 return decodeLd2<MemDT, S32>(ib, obj);
               case Brig::BRIG_TYPE_F16:
               case Brig::BRIG_TYPE_F32:
                 return decodeLd2<MemDT, U32>(ib, obj);
               default:
                 fatal("Bad ld register operand type %d, %d\n",
                       dest.regKind, ib->type);
             };
           case Brig::BRIG_REGISTER_KIND_DOUBLE:
             switch (ib->type) {
               case Brig::BRIG_TYPE_B64:
                 return decodeLd2<MemDT, B64>(ib, obj);
               case Brig::BRIG_TYPE_U64:
                 return decodeLd2<MemDT, U64>(ib, obj);
               case Brig::BRIG_TYPE_S64:
                 return decodeLd2<MemDT, S64>(ib, obj);
               case Brig::BRIG_TYPE_F64:
                 return decodeLd2<MemDT, U64>(ib, obj);
               default:
                 fatal("Bad ld register operand type %d, %d\n",
                       dest.regKind, ib->type);
             };
           default:
             fatal("Bad ld register operand type %d, %d\n", dest.regKind,
                   ib->type);
         }
     }

     template<typename MemDataType, typename SrcOperandType,
              typename AddrOperandType>
     class StInstBase : public HsailGPUStaticInst
     {
       public:
         typename SrcOperandType::SrcOperand src;
         AddrOperandType addr;

         Brig::BrigSegment segment;
         Brig::BrigMemoryScope memoryScope;
         Brig::BrigMemoryOrder memoryOrder;
         unsigned int equivClass;

         StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                    const char *_opcode)
            : HsailGPUStaticInst(obj, _opcode)
         {
             using namespace Brig;

             setFlag(MemoryRef);
             setFlag(Store);

             if (ib->opcode == BRIG_OPCODE_ST) {
                 const BrigInstMem *ldst = (const BrigInstMem*)ib;

                 segment = (BrigSegment)ldst->segment;
                 memoryOrder = BRIG_MEMORY_ORDER_NONE;
                 memoryScope = BRIG_MEMORY_SCOPE_NONE;
                 equivClass = ldst->equivClass;

                 unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                 const BrigOperand *baseOp = obj->getOperand(op_offs);

                 if ((baseOp->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) ||
                     (baseOp->kind == BRIG_KIND_OPERAND_REGISTER)) {
                     src.init(op_offs, obj);
                 }

                 op_offs = obj->getOperandPtr(ib->operands, 1);
                 addr.init(op_offs, obj);
             } else {
                 const BrigInstAtomic *at = (const BrigInstAtomic*)ib;

                 segment = (BrigSegment)at->segment;
                 memoryScope = (BrigMemoryScope)at->memoryScope;
                 memoryOrder = (BrigMemoryOrder)at->memoryOrder;
                 equivClass = 0;

                 unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                 addr.init(op_offs, obj);

                 op_offs = obj->getOperandPtr(ib->operands, 1);
                 src.init(op_offs, obj);
             }

             switch (memoryOrder) {
               case BRIG_MEMORY_ORDER_NONE:
                 setFlag(NoOrder);
                 break;
               case BRIG_MEMORY_ORDER_RELAXED:
                 setFlag(RelaxedOrder);
                 break;
               case BRIG_MEMORY_ORDER_SC_ACQUIRE:
                 setFlag(Acquire);
                 break;
               case BRIG_MEMORY_ORDER_SC_RELEASE:
                 setFlag(Release);
                 break;
               case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
                 setFlag(AcquireRelease);
                 break;
               default:
                 fatal("StInst has bad memory order type\n");
             }

             switch (memoryScope) {
               case BRIG_MEMORY_SCOPE_NONE:
                 setFlag(NoScope);
                 break;
               case BRIG_MEMORY_SCOPE_WORKITEM:
                 setFlag(WorkitemScope);
                 break;
               case BRIG_MEMORY_SCOPE_WORKGROUP:
                 setFlag(WorkgroupScope);
                 break;
               case BRIG_MEMORY_SCOPE_AGENT:
                 setFlag(DeviceScope);
                 break;
               case BRIG_MEMORY_SCOPE_SYSTEM:
                 setFlag(SystemScope);
                 break;
               default:
                 fatal("StInst has bad memory scope type\n");
             }

             switch (segment) {
               case BRIG_SEGMENT_GLOBAL:
                 setFlag(GlobalSegment);
                 break;
               case BRIG_SEGMENT_GROUP:
                 setFlag(GroupSegment);
                 break;
               case BRIG_SEGMENT_PRIVATE:
                 setFlag(PrivateSegment);
                 break;
               case BRIG_SEGMENT_READONLY:
                 setFlag(ReadOnlySegment);
                 break;
               case BRIG_SEGMENT_SPILL:
                 setFlag(SpillSegment);
                 break;
               case BRIG_SEGMENT_FLAT:
                 setFlag(Flat);
                 break;
               case BRIG_SEGMENT_ARG:
                 setFlag(ArgSegment);
                 break;
               default:
                 panic("St: segment %d not supported\n", segment);
             }
         }

         int numDstRegOperands() override { return 0; }
         int numSrcRegOperands() override
         {
             return src.isVectorRegister() + this->addr.isVectorRegister();
         }
         int getNumOperands() override
         {
             if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                 return 2;
             else
                 return 1;
         }
         bool isVectorRegister(int operandIndex) override
         {
             assert(operandIndex >= 0 && operandIndex < getNumOperands());
             return !operandIndex ? src.isVectorRegister() :
                    this->addr.isVectorRegister();
         }
         bool isCondRegister(int operandIndex) override
         {
             assert(operandIndex >= 0 && operandIndex < getNumOperands());
             return !operandIndex ? src.isCondRegister() :
                    this->addr.isCondRegister();
         }
         bool isScalarRegister(int operandIndex) override
         {
             assert(operandIndex >= 0 && operandIndex < getNumOperands());
             return !operandIndex ? src.isScalarRegister() :
                    this->addr.isScalarRegister();
         }
         bool isSrcOperand(int operandIndex) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             return true;
         }
         bool isDstOperand(int operandIndex) override { return false; }
         int getOperandSize(int operandIndex) override
         {
             assert(operandIndex >= 0 && operandIndex < getNumOperands());
             return !operandIndex ? src.opSize() : this->addr.opSize();
         }
         int
         getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
         {
             assert(operandIndex >= 0 && operandIndex < getNumOperands());
             return !operandIndex ? src.regIndex() : this->addr.regIndex();
         }
     };


     template<typename MemDataType, typename SrcDataType,
              typename AddrOperandType>
     class StInst :
         public StInstBase<MemDataType, typename SrcDataType::OperandType,
                           AddrOperandType>,
         public MemInst
     {
       public:
         typename SrcDataType::OperandType::SrcOperand src_vect[4];
         uint16_t num_src_operands;
         void generateDisassembly() override;

         StInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
                         const char *_opcode, int srcIdx)
             : StInstBase<MemDataType, typename SrcDataType::OperandType,
                          AddrOperandType>(ib, obj, _opcode),
               MemInst(SrcDataType::memType)
         {
             init_addr(&this->addr);

             BrigRegOperandInfo rinfo;
             unsigned op_offs = obj->getOperandPtr(ib->operands,srcIdx);
             const Brig::BrigOperand *baseOp = obj->getOperand(op_offs);

             if (baseOp->kind == Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
                 const Brig::BrigOperandConstantBytes *op =
                     (Brig::BrigOperandConstantBytes*)baseOp;

                 rinfo = BrigRegOperandInfo((Brig::BrigKind16_t)op->base.kind,
                                            Brig::BRIG_TYPE_NONE);
             } else {
                 rinfo = findRegDataType(op_offs, obj);
             }

             if (baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
                 const Brig::BrigOperandOperandList *brigRegVecOp =
                     (const Brig::BrigOperandOperandList*)baseOp;

                 num_src_operands =
                     *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;

                 assert(num_src_operands <= 4);
             } else {
                 num_src_operands = 1;
             }

             if (num_src_operands > 1) {
                 assert(baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);

                 for (int i = 0; i < num_src_operands; ++i) {
                     src_vect[i].init_from_vect(op_offs, obj, i);
                 }
             }
         }

         void
         initiateAcc(GPUDynInstPtr gpuDynInst) override
         {
             // before performing a store, check if this store has
             // release semantics, and if so issue a release first
             if (!this->isLocalMem()) {
                 if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                     && gpuDynInst->isRelease()) {

                     gpuDynInst->statusBitVector = VectorMask(1);
                     gpuDynInst->execContinuation = &GPUStaticInst::execSt;
                     gpuDynInst->useContinuation = true;
                     // create request
                     RequestPtr req = std::make_shared<Request>(0, 0, 0, 0,
                                   gpuDynInst->computeUnit()->masterId(),
                                   0, gpuDynInst->wfDynId);
                     req->setFlags(Request::RELEASE);
                     gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);

                     return;
                 }
             }

             // if there is no release semantic, perform stores immediately
             execSt(gpuDynInst);
         }

         // stores don't write anything back, so there is nothing
         // to do here. we only override this method to avoid the
         // fatal in the base class implementation
         void completeAcc(GPUDynInstPtr gpuDynInst) override { }

       private:
         // execSt may be called through a continuation
         // if the store had release semantics. see comment for
         // execSt in gpu_static_inst.hh
         void
         execSt(GPUDynInstPtr gpuDynInst) override
         {
             typedef typename MemDataType::CType c0;

             gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

             if (num_src_operands > 1) {
                 for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
                     if (gpuDynInst->exec_mask[i])
                         gpuDynInst->statusVector.push_back(num_src_operands);
                     else
                         gpuDynInst->statusVector.push_back(0);
             }

             for (int k = 0; k < num_src_operands; ++k) {
                 c0 *d = &((c0*)gpuDynInst->d_data)
                     [k * gpuDynInst->computeUnit()->wfSize()];

                 for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                     if (gpuDynInst->exec_mask[i]) {
                         Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);

                         if (this->isLocalMem()) {
                             //store to shared memory
                             gpuDynInst->wavefront()->ldsChunk->write<c0>(vaddr,
                                                                          *d);
                         } else {
                             RequestPtr req = std::make_shared<Request>(
                                 0, vaddr, sizeof(c0), 0,
                                 gpuDynInst->computeUnit()->masterId(),
                                 0, gpuDynInst->wfDynId);

                             gpuDynInst->setRequestFlags(req);
                             PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
                             pkt->dataStatic<c0>(d);

                             // translation is performed in sendRequest()
                             // the request will be finished when the store completes
                             gpuDynInst->useContinuation = false;
                             gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
                                                                    i, pkt);

                         }
                     }
                     ++d;
                 }
             }

             gpuDynInst->updateStats();
         }

       public:
         bool isVectorRegister(int operandIndex) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             if (operandIndex == num_src_operands)
                 return this->addr.isVectorRegister();
             if (num_src_operands > 1)
                 return src_vect[operandIndex].isVectorRegister();
             else if (num_src_operands == 1)
                 return StInstBase<MemDataType,
                        typename SrcDataType::OperandType,
                        AddrOperandType>::src.isVectorRegister();
             return false;
         }
         bool isCondRegister(int operandIndex) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             if (operandIndex == num_src_operands)
                 return this->addr.isCondRegister();
             if (num_src_operands > 1)
                 return src_vect[operandIndex].isCondRegister();
             else if (num_src_operands == 1)
                 return StInstBase<MemDataType,
                        typename SrcDataType::OperandType,
                        AddrOperandType>::src.isCondRegister();
             return false;
         }
         bool isScalarRegister(int operandIndex) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             if (operandIndex == num_src_operands)
                 return this->addr.isScalarRegister();
             if (num_src_operands > 1)
                 return src_vect[operandIndex].isScalarRegister();
             else if (num_src_operands == 1)
                 return StInstBase<MemDataType,
                        typename SrcDataType::OperandType,
                        AddrOperandType>::src.isScalarRegister();
             return false;
         }
         bool isSrcOperand(int operandIndex) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             return true;
         }
         bool isDstOperand(int operandIndex) override { return false; }
         int getOperandSize(int operandIndex) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             if (operandIndex == num_src_operands)
                 return this->addr.opSize();
             if (num_src_operands > 1)
                 return src_vect[operandIndex].opSize();
             else if (num_src_operands == 1)
                 return StInstBase<MemDataType,
                        typename SrcDataType::OperandType,
                        AddrOperandType>::src.opSize();
             return 0;
         }
         int
         getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             if (operandIndex == num_src_operands)
                 return this->addr.regIndex();
             if (num_src_operands > 1)
                 return src_vect[operandIndex].regIndex();
             else if (num_src_operands == 1)
                 return StInstBase<MemDataType,
                        typename SrcDataType::OperandType,
                        AddrOperandType>::src.regIndex();
             return -1;
         }
         int getNumOperands() override
         {
             if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                 return num_src_operands + 1;
             else
                 return num_src_operands;
         }
         void execute(GPUDynInstPtr gpuDynInst) override;
     };

     template<typename DataType, typename SrcDataType>
     GPUStaticInst*
     decodeSt(const Brig::BrigInstBase *ib, const BrigObject *obj)
     {
         int srcIdx = 0;
         int destIdx = 1;
         if (ib->opcode == Brig::BRIG_OPCODE_ATOMIC ||
             ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET) {
             srcIdx = 1;
             destIdx = 0;
         }
         unsigned op_offs = obj->getOperandPtr(ib->operands,destIdx);

         BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

         if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
             return new StInst<DataType, SrcDataType,
                               NoRegAddrOperand>(ib, obj, "st", srcIdx);
         } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
             // V2/V4 not allowed
             switch (tmp.regKind) {
               case Brig::BRIG_REGISTER_KIND_SINGLE:
                 return new StInst<DataType, SrcDataType,
                                   SRegAddrOperand>(ib, obj, "st", srcIdx);
               case Brig::BRIG_REGISTER_KIND_DOUBLE:
                 return new StInst<DataType, SrcDataType,
                                   DRegAddrOperand>(ib, obj, "st", srcIdx);
               default:
                 fatal("Bad st register operand type %d\n", tmp.type);
             }
         } else {
             fatal("Bad st register operand kind %d\n", tmp.kind);
         }
     }

     template<typename OperandType, typename AddrOperandType, int NumSrcOperands,
              bool HasDst>
     class AtomicInstBase : public HsailGPUStaticInst
     {
       public:
         typename OperandType::DestOperand dest;
         typename OperandType::SrcOperand src[NumSrcOperands];
         AddrOperandType addr;

         Brig::BrigSegment segment;
         Brig::BrigMemoryOrder memoryOrder;
         Brig::BrigAtomicOperation atomicOperation;
         Brig::BrigMemoryScope memoryScope;
         Brig::BrigOpcode opcode;

         AtomicInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                        const char *_opcode)
            : HsailGPUStaticInst(obj, _opcode)
         {
             using namespace Brig;

             const BrigInstAtomic *at = (const BrigInstAtomic*)ib;

             segment = (BrigSegment)at->segment;
             memoryScope = (BrigMemoryScope)at->memoryScope;
             memoryOrder = (BrigMemoryOrder)at->memoryOrder;
             atomicOperation = (BrigAtomicOperation)at->atomicOperation;
             opcode = (BrigOpcode)ib->opcode;

             assert(opcode == Brig::BRIG_OPCODE_ATOMICNORET ||
                    opcode == Brig::BRIG_OPCODE_ATOMIC);

             setFlag(MemoryRef);

             if (opcode == Brig::BRIG_OPCODE_ATOMIC) {
                 setFlag(AtomicReturn);
             } else {
                 setFlag(AtomicNoReturn);
             }

             switch (memoryOrder) {
               case BRIG_MEMORY_ORDER_NONE:
                 setFlag(NoOrder);
                 break;
               case BRIG_MEMORY_ORDER_RELAXED:
                 setFlag(RelaxedOrder);
                 break;
               case BRIG_MEMORY_ORDER_SC_ACQUIRE:
                 setFlag(Acquire);
                 break;
               case BRIG_MEMORY_ORDER_SC_RELEASE:
                 setFlag(Release);
                 break;
               case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
                 setFlag(AcquireRelease);
                 break;
               default:
                 fatal("AtomicInst has bad memory order type\n");
             }

             switch (memoryScope) {
               case BRIG_MEMORY_SCOPE_NONE:
                 setFlag(NoScope);
                 break;
               case BRIG_MEMORY_SCOPE_WORKITEM:
                 setFlag(WorkitemScope);
                 break;
               case BRIG_MEMORY_SCOPE_WORKGROUP:
                 setFlag(WorkgroupScope);
                 break;
               case BRIG_MEMORY_SCOPE_AGENT:
                 setFlag(DeviceScope);
                 break;
               case BRIG_MEMORY_SCOPE_SYSTEM:
                 setFlag(SystemScope);
                 break;
               default:
                 fatal("AtomicInst has bad memory scope type\n");
             }

             switch (atomicOperation) {
               case Brig::BRIG_ATOMIC_AND:
                 setFlag(AtomicAnd);
                 break;
               case Brig::BRIG_ATOMIC_OR:
                 setFlag(AtomicOr);
                 break;
               case Brig::BRIG_ATOMIC_XOR:
                 setFlag(AtomicXor);
                 break;
               case Brig::BRIG_ATOMIC_CAS:
                 setFlag(AtomicCAS);
                 break;
               case Brig::BRIG_ATOMIC_EXCH:
                 setFlag(AtomicExch);
                 break;
               case Brig::BRIG_ATOMIC_ADD:
                 setFlag(AtomicAdd);
                 break;
               case Brig::BRIG_ATOMIC_WRAPINC:
                 setFlag(AtomicInc);
                 break;
               case Brig::BRIG_ATOMIC_WRAPDEC:
                 setFlag(AtomicDec);
                 break;
               case Brig::BRIG_ATOMIC_MIN:
                 setFlag(AtomicMin);
                 break;
               case Brig::BRIG_ATOMIC_MAX:
                 setFlag(AtomicMax);
                 break;
               case Brig::BRIG_ATOMIC_SUB:
                 setFlag(AtomicSub);
                 break;
               default:
                 fatal("Bad BrigAtomicOperation code %d\n", atomicOperation);
             }

             switch (segment) {
               case BRIG_SEGMENT_GLOBAL:
                 setFlag(GlobalSegment);
                 break;
               case BRIG_SEGMENT_GROUP:
                 setFlag(GroupSegment);
                 break;
               case BRIG_SEGMENT_FLAT:
                 setFlag(Flat);
                 break;
               default:
                 panic("Atomic: segment %d not supported\n", segment);
             }

             if (HasDst) {
                 unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                 dest.init(op_offs, obj);

                 op_offs = obj->getOperandPtr(ib->operands, 1);
                 addr.init(op_offs, obj);

                 for (int i = 0; i < NumSrcOperands; ++i) {
                     op_offs = obj->getOperandPtr(ib->operands, i + 2);
                     src[i].init(op_offs, obj);
                 }
             } else {

                 unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                 addr.init(op_offs, obj);

                 for (int i = 0; i < NumSrcOperands; ++i) {
                     op_offs = obj->getOperandPtr(ib->operands, i + 1);
                     src[i].init(op_offs, obj);
                 }
             }
         }

         int numSrcRegOperands()
         {
             int operands = 0;
             for (int i = 0; i < NumSrcOperands; i++) {
                 if (src[i].isVectorRegister()) {
                     operands++;
                 }
             }
             if (addr.isVectorRegister())
                 operands++;
             return operands;
         }
         int numDstRegOperands() { return dest.isVectorRegister(); }
         int getNumOperands()
         {
             if (addr.isVectorRegister())
                 return(NumSrcOperands + 2);
             return(NumSrcOperands + 1);
         }
         bool isVectorRegister(int operandIndex)
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             if (operandIndex < NumSrcOperands)
                 return src[operandIndex].isVectorRegister();
             else if (operandIndex == NumSrcOperands)
                 return(addr.isVectorRegister());
             else
                 return dest.isVectorRegister();
         }
         bool isCondRegister(int operandIndex)
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             if (operandIndex < NumSrcOperands)
                 return src[operandIndex].isCondRegister();
             else if (operandIndex == NumSrcOperands)
                 return(addr.isCondRegister());
             else
                 return dest.isCondRegister();
         }
         bool isScalarRegister(int operandIndex)
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             if (operandIndex < NumSrcOperands)
                 return src[operandIndex].isScalarRegister();
             else if (operandIndex == NumSrcOperands)
                 return(addr.isScalarRegister());
             else
                 return dest.isScalarRegister();
         }
         bool isSrcOperand(int operandIndex)
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             if (operandIndex < NumSrcOperands)
                 return true;
             else if (operandIndex == NumSrcOperands)
                 return(addr.isVectorRegister());
             else
                 return false;
         }
         bool isDstOperand(int operandIndex)
         {
             if (operandIndex <= NumSrcOperands)
                 return false;
             else
                 return true;
         }
         int getOperandSize(int operandIndex)
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             if (operandIndex < NumSrcOperands)
                 return(src[operandIndex].opSize());
             else if (operandIndex == NumSrcOperands)
                 return(addr.opSize());
             else
                 return(dest.opSize());
         }
         int
         getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst)
         {
             assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
             if (operandIndex < NumSrcOperands)
                 return(src[operandIndex].regIndex());
             else if (operandIndex == NumSrcOperands)
                 return(addr.regIndex());
             else
                 return(dest.regIndex());
             return -1;
         }
     };

     template<typename MemDataType, typename AddrOperandType, int NumSrcOperands,
              bool HasDst>
     class AtomicInst :
         public AtomicInstBase<typename MemDataType::OperandType,
                               AddrOperandType, NumSrcOperands, HasDst>,
         public MemInst
     {
       public:
         void generateDisassembly() override;

         AtomicInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
                    const char *_opcode)
             : AtomicInstBase<typename MemDataType::OperandType, AddrOperandType,
                              NumSrcOperands, HasDst>
                 (ib, obj, _opcode),
               MemInst(MemDataType::memType)
         {
             init_addr(&this->addr);
         }

         void
         initiateAcc(GPUDynInstPtr gpuDynInst) override
         {
             // before doing the RMW, check if this atomic has
             // release semantics, and if so issue a release first
             if (!this->isLocalMem()) {
                 if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                     && (gpuDynInst->isRelease()
                     || gpuDynInst->isAcquireRelease())) {

                     gpuDynInst->statusBitVector = VectorMask(1);

                     gpuDynInst->execContinuation = &GPUStaticInst::execAtomic;
                     gpuDynInst->useContinuation = true;

                     // create request
                     RequestPtr req = std::make_shared<Request>(0, 0, 0, 0,
                                   gpuDynInst->computeUnit()->masterId(),
                                   0, gpuDynInst->wfDynId);
                     req->setFlags(Request::RELEASE);
                     gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);

                     return;
                 }
             }

             // if there is no release semantic, execute the RMW immediately
             execAtomic(gpuDynInst);

         }

         void
         completeAcc(GPUDynInstPtr gpuDynInst) override
         {
             // if this is not an atomic return op, then we
             // have nothing more to do.
             if (this->isAtomicRet()) {
                 // the size of the src operands and the
                 // memory being operated on must match
                 // for HSAIL atomics - this assumption may
                 // not apply to all ISAs
                 typedef typename MemDataType::CType CType;

                 Wavefront *w = gpuDynInst->wavefront();
                 int dst = this->dest.regIndex();
                 std::vector<uint32_t> regVec;
                 // virtual->physical VGPR mapping
                 int physVgpr = w->remap(dst, sizeof(CType), 1);
                 regVec.push_back(physVgpr);
                 CType *p1 = &((CType*)gpuDynInst->d_data)[0];

                 for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                     if (gpuDynInst->exec_mask[i]) {
                         DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                                 "$%s%d <- %d global ld done (src = wavefront "
                                 "ld inst)\n", w->computeUnit->cu_id, w->simdId,
                                 w->wfSlotId, i, sizeof(CType) == 4 ? "s" : "d",
                                 dst, *p1);
                         // write the value into the physical VGPR. This is a
                         // purely functional operation. No timing is modeled.
                         w->computeUnit->vrf[w->simdId]->write<CType>(physVgpr, *p1, i);
                     }
                     ++p1;
                 }

                 // Schedule the write operation of the load data on the VRF.
                 // This simply models the timing aspect of the VRF write operation.
                 // It does not modify the physical VGPR.
                 int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
                     vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
                                          sizeof(CType), gpuDynInst->time);

                 if (this->isGlobalMem()) {
                     gpuDynInst->computeUnit()->globalMemoryPipe
                         .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
                 } else {
                     assert(this->isLocalMem());
                     gpuDynInst->computeUnit()->localMemoryPipe
                         .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
                 }
             }
         }

         void execute(GPUDynInstPtr gpuDynInst) override;

       private:
         // execAtomic may be called through a continuation
         // if the RMW had release semantics. see comment for
         // execContinuation in gpu_dyn_inst.hh
         void
         execAtomic(GPUDynInstPtr gpuDynInst) override
         {
             gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

             typedef typename MemDataType::CType c0;

             c0 *d = &((c0*) gpuDynInst->d_data)[0];
             c0 *e = &((c0*) gpuDynInst->a_data)[0];
             c0 *f = &((c0*) gpuDynInst->x_data)[0];

             for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                 if (gpuDynInst->exec_mask[i]) {
                     Addr vaddr = gpuDynInst->addr[i];

                     if (this->isLocalMem()) {
                         Wavefront *wavefront = gpuDynInst->wavefront();
                         *d = wavefront->ldsChunk->read<c0>(vaddr);

                         if (this->isAtomicAdd()) {
                             wavefront->ldsChunk->write<c0>(vaddr,
                             wavefront->ldsChunk->read<c0>(vaddr) + (*e));
                         } else if (this->isAtomicSub()) {
                             wavefront->ldsChunk->write<c0>(vaddr,
                             wavefront->ldsChunk->read<c0>(vaddr) - (*e));
                         } else if (this->isAtomicMax()) {
                             wavefront->ldsChunk->write<c0>(vaddr,
                             std::max(wavefront->ldsChunk->read<c0>(vaddr),
                             (*e)));
                         } else if (this->isAtomicMin()) {
                             wavefront->ldsChunk->write<c0>(vaddr,
                             std::min(wavefront->ldsChunk->read<c0>(vaddr),
                             (*e)));
                         } else if (this->isAtomicAnd()) {
                             wavefront->ldsChunk->write<c0>(vaddr,
                             wavefront->ldsChunk->read<c0>(vaddr) & (*e));
                         } else if (this->isAtomicOr()) {
                             wavefront->ldsChunk->write<c0>(vaddr,
                             wavefront->ldsChunk->read<c0>(vaddr) | (*e));
                         } else if (this->isAtomicXor()) {
                             wavefront->ldsChunk->write<c0>(vaddr,
                             wavefront->ldsChunk->read<c0>(vaddr) ^ (*e));
                         } else if (this->isAtomicInc()) {
                             wavefront->ldsChunk->write<c0>(vaddr,
                             wavefront->ldsChunk->read<c0>(vaddr) + 1);
                         } else if (this->isAtomicDec()) {
                             wavefront->ldsChunk->write<c0>(vaddr,
                             wavefront->ldsChunk->read<c0>(vaddr) - 1);
                         } else if (this->isAtomicExch()) {
                             wavefront->ldsChunk->write<c0>(vaddr, (*e));
                         } else if (this->isAtomicCAS()) {
                             wavefront->ldsChunk->write<c0>(vaddr,
                             (wavefront->ldsChunk->read<c0>(vaddr) == (*e)) ?
                             (*f) : wavefront->ldsChunk->read<c0>(vaddr));
                         } else {
                             fatal("Unrecognized or invalid HSAIL atomic op "
                                   "type.\n");
                         }
                     } else {
                         RequestPtr req =
                             std::make_shared<Request>(0, vaddr, sizeof(c0), 0,
                                         gpuDynInst->computeUnit()->masterId(),
                                         0, gpuDynInst->wfDynId,
                                         gpuDynInst->makeAtomicOpFunctor<c0>(e,
                                         f));

                         gpuDynInst->setRequestFlags(req);
                         PacketPtr pkt = new Packet(req, MemCmd::SwapReq);
                         pkt->dataStatic(d);

                         if (gpuDynInst->computeUnit()->shader->
                             separate_acquire_release &&
                             (gpuDynInst->isAcquire())) {
                             // if this atomic has acquire semantics,
                             // schedule the continuation to perform an
                             // acquire after the RMW completes
                             gpuDynInst->execContinuation =
                                 &GPUStaticInst::execAtomicAcq;

                             gpuDynInst->useContinuation = true;
                         } else {
                             // the request will be finished when the RMW completes
                             gpuDynInst->useContinuation = false;
                         }
                         // translation is performed in sendRequest()
                         gpuDynInst->computeUnit()->sendRequest(gpuDynInst, i,
                                                                pkt);
                     }
                 }

                 ++d;
                 ++e;
                 ++f;
             }

             gpuDynInst->updateStats();
         }

         // execAtomicACq will always be called through a continuation.
         // see comment for execContinuation in gpu_dyn_inst.hh
         void
         execAtomicAcq(GPUDynInstPtr gpuDynInst) override
         {
             // after performing the RMW, check to see if this instruction
             // has acquire semantics, and if so, issue an acquire
             if (!this->isLocalMem()) {
                 if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                      && gpuDynInst->isAcquire()) {
                     gpuDynInst->statusBitVector = VectorMask(1);

                     // the request will be finished when
                     // the acquire completes
                     gpuDynInst->useContinuation = false;
                     // create request
                     RequestPtr req = std::make_shared<Request>(0, 0, 0, 0,
                                   gpuDynInst->computeUnit()->masterId(),
                                   0, gpuDynInst->wfDynId);
                     req->setFlags(Request::ACQUIRE);
                     gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
                 }
             }
         }
     };

     template<typename DataType, typename AddrOperandType, int NumSrcOperands>
     GPUStaticInst*
     constructAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
     {
         const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;

         if (at->atomicOperation == Brig::BRIG_ATOMIC_LD) {
             return decodeLd<DataType>(ib, obj);
         } else if (at->atomicOperation == Brig::BRIG_ATOMIC_ST) {
             switch (ib->type) {
               case Brig::BRIG_TYPE_B8:
                 return decodeSt<S8,S8>(ib, obj);
               case Brig::BRIG_TYPE_B16:
                 return decodeSt<S16,S16>(ib, obj);
               case Brig::BRIG_TYPE_B32:
                 return decodeSt<S32,S32>(ib, obj);
               case Brig::BRIG_TYPE_B64:
                 return decodeSt<S64,S64>(ib, obj);
               default: fatal("AtomicSt: Operand type mismatch %d\n", ib->type);
             }
         } else {
             if ((Brig::BrigOpcode)ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET)
                 return new AtomicInst<DataType, AddrOperandType,
                     NumSrcOperands, false>(ib, obj, "atomicnoret");
             else
                 return new AtomicInst<DataType, AddrOperandType,
                     NumSrcOperands, true>(ib, obj, "atomic");
         }
     }

     template<typename DataType, int NumSrcOperands>
     GPUStaticInst*
     decodeAtomicHelper(const Brig::BrigInstBase *ib, const BrigObject *obj)
     {
         unsigned addrIndex = (Brig::BrigOpcode)ib->opcode ==
             Brig::BRIG_OPCODE_ATOMICNORET ? 0 : 1;

         unsigned op_offs = obj->getOperandPtr(ib->operands,addrIndex);

         BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

         if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
             return constructAtomic<DataType, NoRegAddrOperand,
                                    NumSrcOperands>(ib, obj);
         } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
             // V2/V4 not allowed
             switch (tmp.regKind) {
               case Brig::BRIG_REGISTER_KIND_SINGLE:
                   return constructAtomic<DataType, SRegAddrOperand,
                                          NumSrcOperands>(ib, obj);
               case Brig::BRIG_REGISTER_KIND_DOUBLE:
                 return constructAtomic<DataType, DRegAddrOperand,
                                        NumSrcOperands>(ib, obj);
               default:
                 fatal("Bad atomic register operand type %d\n", tmp.type);
             }
         } else {
             fatal("Bad atomic register operand kind %d\n", tmp.kind);
         }
     }


     template<typename DataType>
     GPUStaticInst*
     decodeAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
     {
         const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;

         if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) {
             return decodeAtomicHelper<DataType, 2>(ib, obj);
         } else {
             return decodeAtomicHelper<DataType, 1>(ib, obj);
         }
     }

     template<typename DataType>
     GPUStaticInst*
     decodeAtomicNoRet(const Brig::BrigInstBase *ib, const BrigObject *obj)
     {
         const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
         if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) {
             return decodeAtomicHelper<DataType, 2>(ib, obj);
         } else {
             return decodeAtomicHelper<DataType, 1>(ib, obj);
         }
     }
 } // namespace HsailISA

 #endif // __ARCH_HSAIL_INSTS_MEM_HH__
Brig::BRIG_TYPE_S8
Definition: Brig.h:1014

Brig::BRIG_TYPE_F16
Definition: Brig.h:1018

Wavefront
Definition: wavefront.hh:147

panic
#define panic(...)
This implements a cprintf based panic() function.
Definition: logging.hh:167

DPRINTF
#define DPRINTF(x,...)
Definition: trace.hh:229

HsailISA::StInstBase::memoryOrder
Brig::BrigMemoryOrder memoryOrder
Definition: mem.hh:788

HsailISA::LdInstBase::memoryOrder
Brig::BrigMemoryOrder memoryOrder
Definition: mem.hh:217

HsailISA::LdInstBase::width
Brig::BrigWidth8_t width
Definition: mem.hh:212

HsailISA::LdInst::isVectorRegister
bool isVectorRegister(int operandIndex) override
Definition: mem.hh:602

Brig::BRIG_MEMORY_SCOPE_WORKITEM
Definition: Brig.h:529

Brig::BRIG_MEMORY_ORDER_SC_ACQUIRE
Definition: Brig.h:514

Brig::BrigInstAtomic
Definition: Brig.h:1335

Brig::BRIG_ATOMIC_WRAPDEC
Definition: Brig.h:290

HsailISA::StInst::getNumOperands
int getNumOperands() override
Definition: mem.hh:1166

HsailISA::AtomicInstBase::getRegisterIndex
int getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst)
Definition: mem.hh:1443

Brig::BRIG_TYPE_S64
Definition: Brig.h:1017

Brig::BRIG_SEGMENT_SPILL
Definition: Brig.h:940

operand.hh
Defines classes encapsulating HSAIL instruction operands.

Brig::BrigInstAtomic::segment
BrigSegment8_t segment
Definition: Brig.h:1337

HsailISA::LdaInstBase::isDstOperand
bool isDstOperand(int operandIndex) override
Definition: mem.hh:139

fatal
#define fatal(...)
This implements a cprintf based fatal() function.
Definition: logging.hh:175

ComputeUnit::cu_id
int cu_id
Definition: compute_unit.hh:137

Brig::BrigInstBase::operands
BrigDataOffsetOperandList32_t operands
Definition: Brig.h:1323

Brig::BRIG_SEGMENT_FLAT
Definition: Brig.h:934

HsailISA::AtomicInstBase::isDstOperand
bool isDstOperand(int operandIndex)
Definition: mem.hh:1425

HsailISA::LdInstBase::isScalarRegister
bool isScalarRegister(int operandIndex) override
Definition: mem.hh:357

Brig::BRIG_TYPE_B8
Definition: Brig.h:1022

HsailISA::LdInstBase::getOperandSize
int getOperandSize(int operandIndex) override
Definition: mem.hh:375

BrigObject
Definition: brig_object.hh:60

Brig::BrigInstBase::type
BrigType16_t type
Definition: Brig.h:1322

HsailISA::StInstBase
Definition: mem.hh:780

Brig::BRIG_OPCODE_ATOMIC
Definition: Brig.h:653

Brig::BRIG_KIND_OPERAND_REGISTER
Definition: Brig.h:220

HsailISA::StInst::execSt
void execSt(GPUDynInstPtr gpuDynInst) override
Definition: mem.hh:1042

HsailISA::LdInst::completeAcc
void completeAcc(GPUDynInstPtr gpuDynInst) override
Definition: mem.hh:501

ArmISA::i
Bitfield< 7 > i
Definition: miscregs_types.hh:66

HsailISA::AtomicInst::completeAcc
void completeAcc(GPUDynInstPtr gpuDynInst) override
Definition: mem.hh:1508

MAX_REGS_FOR_NON_VEC_MEM_INST
static const int MAX_REGS_FOR_NON_VEC_MEM_INST
Definition: compute_unit.hh:59

HsailISA::StInst::isVectorRegister
bool isVectorRegister(int operandIndex) override
Definition: mem.hh:1094

MAX_WIDTH_FOR_MEM_INST
static const int MAX_WIDTH_FOR_MEM_INST
Definition: compute_unit.hh:60

HsailISA::LdInstBase
Definition: mem.hh:209

HsailISA::StInstBase::isVectorRegister
bool isVectorRegister(int operandIndex) override
Definition: mem.hh:912

Brig::BRIG_TYPE_U32
Definition: Brig.h:1012

Brig::BrigKind16_t
uint16_t BrigKind16_t
Definition: Brig.h:102

Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES
Definition: Brig.h:214

HsailISA::LdaInstBase::isSrcOperand
bool isSrcOperand(int operandIndex) override
Definition: mem.hh:132

HsailISA::LdaInst
Definition: mem.hh:165

Brig::BRIG_TYPE_B16
Definition: Brig.h:1023

Brig::BRIG_MEMORY_SCOPE_WORKGROUP
Definition: Brig.h:531

HsailISA::LdInstBase::getRegisterIndex
int getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
Definition: mem.hh:382

Brig::BrigInstAtomic::memoryOrder
BrigMemoryOrder8_t memoryOrder
Definition: Brig.h:1338

HsailISA::LdInstBase::isVectorRegister
bool isVectorRegister(int operandIndex) override
Definition: mem.hh:345

HsailISA::StInstBase::StInstBase
StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, const char *_opcode)
Definition: mem.hh:791

HsailISA::LdInst::initiateAcc
void initiateAcc(GPUDynInstPtr gpuDynInst) override
Definition: mem.hh:436

RequestPtr
std::shared_ptr< Request > RequestPtr
Definition: request.hh:83

HsailISA::AtomicInst::execAtomic
void execAtomic(GPUDynInstPtr gpuDynInst) override
Definition: mem.hh:1566

Brig::BrigInstBase
Definition: Brig.h:1319

HsailISA::LdaInstBase::getOperandSize
int getOperandSize(int operandIndex) override
Definition: mem.hh:143

SRegAddrOperand
RegAddrOperand< SRegOperand > SRegAddrOperand
Definition: operand.hh:703

GPUStaticInst::execLdAcq
virtual void execLdAcq(GPUDynInstPtr gpuDynInst)
Definition: gpu_static_inst.hh:225

VectorMask
std::bitset< std::numeric_limits< unsigned long long >::digits > VectorMask
Definition: misc.hh:45

addr
ip6_addr_t addr
Definition: inet.hh:335

HsailISA::StInstBase::isSrcOperand
bool isSrcOperand(int operandIndex) override
Definition: mem.hh:930

Brig::BrigInstMem::segment
BrigSegment8_t segment
Definition: Brig.h:1389

Brig::BRIG_REGISTER_KIND_DOUBLE
Definition: Brig.h:806

ArmISA::at
Bitfield< 35, 32 > at
Definition: miscregs_types.hh:153

HsailISA::StInstBase::equivClass
unsigned int equivClass
Definition: mem.hh:789

HsailISA::LdaInstBase::getNumOperands
int getNumOperands() override
Definition: mem.hh:156

Brig::BRIG_ATOMIC_MIN
Definition: Brig.h:286

Brig::BRIG_MEMORY_SCOPE_AGENT
Definition: Brig.h:532

Wavefront::simdId
int simdId
Definition: wavefront.hh:165

Brig::BrigInstMem::equivClass
uint8_t equivClass
Definition: Brig.h:1391

HsailISA::AtomicInstBase::isSrcOperand
bool isSrcOperand(int operandIndex)
Definition: mem.hh:1415

HsailISA::AtomicInstBase
Definition: mem.hh:1213

GPUStaticInst::execAtomicAcq
virtual void execAtomicAcq(GPUDynInstPtr gpuDynInst)
Definition: gpu_static_inst.hh:243

HsailISA::StInstBase::addr
AddrOperandType addr
Definition: mem.hh:784

HsailISA::StInst::isDstOperand
bool isDstOperand(int operandIndex) override
Definition: mem.hh:1138

MemCmd::WriteReq
Definition: packet.hh:90

HsailISA::AtomicInst::AtomicInst
AtomicInst(const Brig::BrigInstBase *ib, const BrigObject *obj, const char *_opcode)
Definition: mem.hh:1466

Brig::BRIG_TYPE_U64
Definition: Brig.h:1013

HsailISA::LdInstBase::numDstRegOperands
int numDstRegOperands() override
Definition: mem.hh:337

HsailISA::AtomicInstBase::isCondRegister
bool isCondRegister(int operandIndex)
Definition: mem.hh:1395

Wavefront::wfSlotId
int wfSlotId
Definition: wavefront.hh:162

Brig::BRIG_ATOMIC_ST
Definition: Brig.h:288

compute_unit.hh

HsailISA::LdaInstBase::dest
DestOperandType::DestOperand dest
Definition: mem.hh:93

HsailISA::LdInstBase::equivClass
unsigned int equivClass
Definition: mem.hh:219

Wavefront::ldsChunk
LdsChunk * ldsChunk
Definition: wavefront.hh:260

HsailISA::MemInst::MemInst
MemInst(Enums::MemType m_type)
Definition: mem.hh:53

Brig::BRIG_ATOMIC_LD
Definition: Brig.h:284

Brig::BRIG_ATOMIC_SUB
Definition: Brig.h:289

HsailISA::LdaInstBase::addr
AddrOperandType addr
Definition: mem.hh:94

Brig::BRIG_ATOMIC_EXCH
Definition: Brig.h:283

std::vector< uint32_t >

HsailISA::StInstBase::memoryScope
Brig::BrigMemoryScope memoryScope
Definition: mem.hh:787

Packet::dataStatic
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
Definition: packet.hh:1040

HsailISA::StInst::isScalarRegister
bool isScalarRegister(int operandIndex) override
Definition: mem.hh:1120

VT_32
Definition: gpu_dyn_inst.hh:83

Brig::BrigOperandOperandList
Definition: Brig.h:1521

BrigRegOperandInfo::kind
Brig::BrigKind16_t kind
Definition: operand.hh:80

HsailISA::MemInst::MemInst
MemInst()
Definition: mem.hh:51

ArmISA::f
Bitfield< 6 > f
Definition: miscregs_types.hh:67

gpu_static_inst.hh

GPUStaticInst::execSt
virtual void execSt(GPUDynInstPtr gpuDynInst)
Definition: gpu_static_inst.hh:231

Brig::BrigBase::kind
BrigKind16_t kind
Definition: Brig.h:1180

RegAddrOperand
Definition: operand.hh:600

HsailISA::LdaInstBase::getRegisterIndex
int getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
Definition: mem.hh:150

GPUDynInstPtr
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition: misc.hh:48

Brig::BRIG_MEMORY_ORDER_RELAXED
Definition: Brig.h:513

type
uint8_t type
Definition: inet.hh:333

HsailISA::StInst::num_src_operands
uint16_t num_src_operands
Definition: mem.hh:959

GPUStaticInst::execAtomic
virtual void execAtomic(GPUDynInstPtr gpuDynInst)
Definition: gpu_static_inst.hh:237

HsailISA::LdInst::execLdAcq
void execLdAcq(GPUDynInstPtr gpuDynInst) override
Definition: mem.hh:582

HsailISA::StInst::isCondRegister
bool isCondRegister(int operandIndex) override
Definition: mem.hh:1107

HsailISA::AtomicInst
Definition: mem.hh:1458

HsailISA::StInst::getRegisterIndex
int getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
Definition: mem.hh:1153

Brig::BRIG_OPCODE_ATOMICNORET
Definition: Brig.h:654

BrigObject::getOperandPtr
unsigned getOperandPtr(int offs, int index) const
Definition: brig_object.cc:122

HsailISA::LdInst::isDstOperand
bool isDstOperand(int operandIndex) override
Definition: mem.hh:654

HsailISA::LdInstBase::addr
AddrOperandType addr
Definition: mem.hh:214

Brig::BRIG_SEGMENT_READONLY
Definition: Brig.h:936

Brig::BRIG_TYPE_S16
Definition: Brig.h:1015

Brig::BrigInstAtomic::memoryScope
BrigMemoryScope8_t memoryScope
Definition: Brig.h:1339

HsailISA::StInstBase::numSrcRegOperands
int numSrcRegOperands() override
Definition: mem.hh:901

HsailISA::LdInstBase::dest
DestOperandType::DestOperand dest
Definition: mem.hh:213

NoRegAddrOperand
Definition: operand.hh:706

Brig::BrigMemoryOrder
BrigMemoryOrder
Definition: Brig.h:505

HsailISA::AtomicInstBase::atomicOperation
Brig::BrigAtomicOperation atomicOperation
Definition: mem.hh:1222

HsailISA::LdInstBase::isCondRegister
bool isCondRegister(int operandIndex) override
Definition: mem.hh:351

GPUStaticInst
Definition: gpu_static_inst.hh:60

Brig::BRIG_ATOMIC_MAX
Definition: Brig.h:285

HsailISA::MemInst::getAddressOperand
AddrOperandBase * getAddressOperand()
Definition: mem.hh:86

HsailISA::decodeLd
GPUStaticInst * decodeLd(const Brig::BrigInstBase *ib, const BrigObject *obj)
Definition: mem.hh:729

Brig::BRIG_TYPE_U8
Definition: Brig.h:1010

BrigObject::getData
const uint8_t * getData(int offs) const
Definition: brig_object.cc:110

HsailISA::LdInst
Definition: mem.hh:392

decl.hh

HsailISA::LdaInst::LdaInst
LdaInst(const Brig::BrigInstBase *ib, const BrigObject *obj, const char *_opcode)
Definition: mem.hh:172

MipsISA::k
Bitfield< 23 > k
Definition: dt_constants.hh:80

HsailISA::AtomicInstBase::opcode
Brig::BrigOpcode opcode
Definition: mem.hh:1224

ArmISA::d
Bitfield< 9 > d
Definition: miscregs_types.hh:63

HsailISA::MemInst::getMemOperandSize
int getMemOperandSize()
Definition: mem.hh:85

MipsISA::vaddr
vaddr
Definition: pra_constants.hh:277

HsailISA::decodeAtomic
GPUStaticInst * decodeAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
Definition: mem.hh:1753

Brig::BrigInstMem
Definition: Brig.h:1387

Wavefront::computeUnit
ComputeUnit * computeUnit
Definition: wavefront.hh:167

HsailISA::AtomicInstBase::segment
Brig::BrigSegment segment
Definition: mem.hh:1220

Brig::BrigMemoryScope
BrigMemoryScope
Definition: Brig.h:521

Brig::BRIG_ATOMIC_XOR
Definition: Brig.h:292

Brig::BRIG_TYPE_F64
Definition: Brig.h:1020

HsailISA::StInstBase::segment
Brig::BrigSegment segment
Definition: mem.hh:786

HsailISA::LdInstBase::segment
Brig::BrigSegment segment
Definition: mem.hh:216

HsailISA::MemInst::addr_operand
AddrOperandBase * addr_operand
Definition: mem.hh:82

ComputeUnit::wfSize
int wfSize() const
Definition: compute_unit.hh:252

HsailISA::AtomicInstBase::getOperandSize
int getOperandSize(int operandIndex)
Definition: mem.hh:1432

Brig::BRIG_MEMORY_SCOPE_NONE
Definition: Brig.h:528

HsailISA::AtomicInstBase::memoryScope
Brig::BrigMemoryScope memoryScope
Definition: mem.hh:1223

AddrOperandBase
Definition: operand.hh:580

HsailISA::StInst::initiateAcc
void initiateAcc(GPUDynInstPtr gpuDynInst) override
Definition: mem.hh:1006

Brig::BRIG_ATOMIC_CAS
Definition: Brig.h:282

HsailISA::AtomicInst::execAtomicAcq
void execAtomicAcq(GPUDynInstPtr gpuDynInst) override
Definition: mem.hh:1666

HsailISA::LdInstBase::LdInstBase
LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, const char *_opcode)
Definition: mem.hh:221

HsailISA::StInstBase::isScalarRegister
bool isScalarRegister(int operandIndex) override
Definition: mem.hh:924

LdsChunk::read
T read(const uint32_t index)
a read operation
Definition: lds_state.hh:74

HsailISA::LdaInstBase::isCondRegister
bool isCondRegister(int operandIndex) override
Definition: mem.hh:120

Brig::BrigInstAtomic::atomicOperation
BrigAtomicOperation8_t atomicOperation
Definition: Brig.h:1340

MipsISA::w
Bitfield< 0 > w
Definition: pra_constants.hh:280

Addr
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition: types.hh:142

HsailISA::AtomicInstBase::dest
OperandType::DestOperand dest
Definition: mem.hh:1216

HsailISA::LdaInstBase::numDstRegOperands
int numDstRegOperands() override
Definition: mem.hh:112

HsailISA::StInstBase::isCondRegister
bool isCondRegister(int operandIndex) override
Definition: mem.hh:918

BrigRegOperandInfo::type
Brig::BrigType type
Definition: operand.hh:81

HsailISA::LdaInstBase::isVectorRegister
bool isVectorRegister(int operandIndex) override
Definition: mem.hh:114

Packet
A Packet is used to encapsulate a transfer between two objects in the memory system (e...
Definition: packet.hh:255

Brig::BRIG_WIDTH_1
Definition: Brig.h:1125

HsailISA::AtomicInst::initiateAcc
void initiateAcc(GPUDynInstPtr gpuDynInst) override
Definition: mem.hh:1477

Brig::BRIG_SEGMENT_GROUP
Definition: Brig.h:938

HsailISA::MemInst::size
int size
Definition: mem.hh:81

HsailISA::StInstBase::isDstOperand
bool isDstOperand(int operandIndex) override
Definition: mem.hh:935

HsailISA::LdInstBase::numSrcRegOperands
int numSrcRegOperands() override
Definition: mem.hh:335

HsailISA::AtomicInstBase::AtomicInstBase
AtomicInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, const char *_opcode)
Definition: mem.hh:1226

Brig::BRIG_KIND_OPERAND_ADDRESS
Definition: Brig.h:210

Brig::BrigSegment
BrigSegment
Definition: Brig.h:925

Brig::BrigOperandConstantBytes
Definition: Brig.h:1489

HsailISA::LdaInstBase
Definition: mem.hh:90

HsailISA::LdInst::isCondRegister
bool isCondRegister(int operandIndex) override
Definition: mem.hh:618

HsailISA::StInst
Definition: mem.hh:952

Brig::BRIG_ATOMIC_ADD
Definition: Brig.h:280

Brig::BrigOperandConstantBytes::base
BrigBase base
Definition: Brig.h:1490

DRegAddrOperand
RegAddrOperand< DRegOperand > DRegAddrOperand
Definition: operand.hh:704

HsailISA::AtomicInstBase::isScalarRegister
bool isScalarRegister(int operandIndex)
Definition: mem.hh:1405

HsailISA::LdInst::num_dest_operands
uint16_t num_dest_operands
Definition: mem.hh:398

ArmISA::e
Bitfield< 9 > e
Definition: miscregs_types.hh:64

HsailISA::AtomicInstBase::addr
AddrOperandType addr
Definition: mem.hh:1218

Brig::BRIG_SEGMENT_PRIVATE
Definition: Brig.h:939

HsailISA
Definition: gpu_decoder.hh:51

HsailISA::AtomicInstBase::numDstRegOperands
int numDstRegOperands()
Definition: mem.hh:1378

BrigObject::getOperand
const Brig::BrigOperand * getOperand(int offs) const
Definition: brig_object.cc:116

Brig::BRIG_TYPE_B64
Definition: Brig.h:1025

HsailISA::StInst::completeAcc
void completeAcc(GPUDynInstPtr gpuDynInst) override
Definition: mem.hh:1035

Brig::BRIG_TYPE_S32
Definition: Brig.h:1016

Brig::BRIG_SEGMENT_GLOBAL
Definition: Brig.h:935

Brig::BRIG_KIND_OPERAND_OPERAND_LIST
Definition: Brig.h:219

HsailISA::StInstBase::getNumOperands
int getNumOperands() override
Definition: mem.hh:905

HsailISA::LdaInstBase::numSrcRegOperands
int numSrcRegOperands() override
Definition: mem.hh:110

HsailISA::StInstBase::numDstRegOperands
int numDstRegOperands() override
Definition: mem.hh:900

Request::RELEASE
The request should be marked with RELEASE.
Definition: request.hh:162

HsailISA::MemInst::init_addr
void init_addr(AddrOperandBase *_addr_operand)
Definition: mem.hh:75

MemCmd::SwapReq
Definition: packet.hh:115

Brig::BRIG_OPCODE_LD
Definition: Brig.h:651

HsailISA::StInst::isSrcOperand
bool isSrcOperand(int operandIndex) override
Definition: mem.hh:1133

MemCmd::ReadReq
Definition: packet.hh:87

HsailISA::StInstBase::src
SrcOperandType::SrcOperand src
Definition: mem.hh:783

ComputeUnit::vrf
std::vector< VectorRegisterFile * > vrf
Definition: compute_unit.hh:140

Brig::BrigOperandOperandList::elements
BrigDataOffsetOperandList32_t elements
Definition: Brig.h:1523

BrigRegOperandInfo
Definition: operand.hh:77

Brig::BRIG_REGISTER_KIND_SINGLE
Definition: Brig.h:805

Brig::BRIG_SEGMENT_KERNARG
Definition: Brig.h:937

Brig::BRIG_OPCODE_ST
Definition: Brig.h:652

Brig::BRIG_MEMORY_ORDER_SC_RELEASE
Definition: Brig.h:515

Brig::BrigAtomicOperation
BrigAtomicOperation
Definition: Brig.h:270

HsailISA::constructAtomic
GPUStaticInst * constructAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
Definition: mem.hh:1691

HsailISA::decodeAtomicHelper
GPUStaticInst * decodeAtomicHelper(const Brig::BrigInstBase *ib, const BrigObject *obj)
Definition: mem.hh:1721

BrigRegOperandInfo::regKind
Brig::BrigRegisterKind regKind
Definition: operand.hh:82

HsailISA::MemInst
Definition: mem.hh:48

HsailISA::AtomicInstBase::getNumOperands
int getNumOperands()
Definition: mem.hh:1379

HsailISA::StInstBase::getOperandSize
int getOperandSize(int operandIndex) override
Definition: mem.hh:936

HsailISA::decodeAtomicNoRet
GPUStaticInst * decodeAtomicNoRet(const Brig::BrigInstBase *ib, const BrigObject *obj)
Definition: mem.hh:1766

HsailISA::LdInst::LdInst
LdInst(const Brig::BrigInstBase *ib, const BrigObject *obj, const char *_opcode)
Definition: mem.hh:402

HsailISA::AtomicInstBase::numSrcRegOperands
int numSrcRegOperands()
Definition: mem.hh:1366

Brig::BRIG_MEMORY_ORDER_NONE
Definition: Brig.h:512

LdsChunk::write
void write(const uint32_t index, const T value)
a write operation
Definition: lds_state.hh:87

HsailISA::StInstBase::getRegisterIndex
int getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
Definition: mem.hh:942

Brig::BRIG_TYPE_F32
Definition: Brig.h:1019

Brig::BRIG_TYPE_B32
Definition: Brig.h:1024

Brig::BrigBase
Definition: Brig.h:1178

Brig
Definition: Brig.h:46

Brig::BRIG_ATOMIC_AND
Definition: Brig.h:281

findRegDataType
BrigRegOperandInfo findRegDataType(unsigned opOffset, const BrigObject *obj)
Definition: operand.cc:213

X86ISA::op
Bitfield< 4 > op
Definition: types.hh:80

HsailISA::LdInst::getNumOperands
int getNumOperands() override
Definition: mem.hh:691

Brig::BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE
Definition: Brig.h:516

HsailISA::decodeLd2
GPUStaticInst * decodeLd2(const Brig::BrigInstBase *ib, const BrigObject *obj)
Definition: mem.hh:703

HsailISA::LdInstBase::isSrcOperand
bool isSrcOperand(int operandIndex) override
Definition: mem.hh:363

Wavefront::remap
uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0)
Definition: wavefront.cc:282

HsailISA::AtomicInstBase::memoryOrder
Brig::BrigMemoryOrder memoryOrder
Definition: mem.hh:1221

HsailISA::LdInst::getOperandSize
int getOperandSize(int operandIndex) override
Definition: mem.hh:662

HsailISA::LdaInstBase::LdaInstBase
LdaInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, const char *_opcode)
Definition: mem.hh:96

HsailISA::decodeLda
GPUStaticInst * decodeLda(const Brig::BrigInstBase *ib, const BrigObject *obj)
Definition: mem.hh:185

HsailISA::LdInstBase::getNumOperands
int getNumOperands() override
Definition: mem.hh:338

HsailISA::decodeSt
GPUStaticInst * decodeSt(const Brig::BrigInstBase *ib, const BrigObject *obj)
Definition: mem.hh:1178

Brig::BRIG_MEMORY_SCOPE_SYSTEM
Definition: Brig.h:533

Brig::BRIG_TYPE_U16
Definition: Brig.h:1011

HsailISA::LdInstBase::isDstOperand
bool isDstOperand(int operandIndex) override
Definition: mem.hh:370

HsailISA::LdInstBase::memoryScope
Brig::BrigMemoryScope memoryScope
Definition: mem.hh:218

Brig::BRIG_ATOMIC_WRAPINC
Definition: Brig.h:291

HsailISA::LdInst::isSrcOperand
bool isSrcOperand(int operandIndex) override
Definition: mem.hh:646

Brig::BrigInstMem::width
BrigWidth8_t width
Definition: Brig.h:1392

HsailISA::LdInst::getRegisterIndex
int getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
Definition: mem.hh:677

Request::ACQUIRE
The request should be marked with ACQUIRE.
Definition: request.hh:160

HsailISA::AtomicInstBase::isVectorRegister
bool isVectorRegister(int operandIndex)
Definition: mem.hh:1385

HsailISA::LdaInstBase::isScalarRegister
bool isScalarRegister(int operandIndex) override
Definition: mem.hh:126

Brig::BrigWidth8_t
uint8_t BrigWidth8_t
Definition: Brig.h:146

HsailISA::LdInst::isScalarRegister
bool isScalarRegister(int operandIndex) override
Definition: mem.hh:632

Brig::BrigOpcode
BrigOpcode
Definition: Brig.h:538

HsailISA::StInst::getOperandSize
int getOperandSize(int operandIndex) override
Definition: mem.hh:1139

Brig::BRIG_TYPE_NONE
Definition: Brig.h:1009

HsailISA::StInst::StInst
StInst(const Brig::BrigInstBase *ib, const BrigObject *obj, const char *_opcode, int srcIdx)
Definition: mem.hh:962

Brig::BRIG_SEGMENT_ARG
Definition: Brig.h:941

ProbePoints::Packet
ProbePointArg< PacketInfo > Packet
Packet probe point.
Definition: mem.hh:104

HsailISA::HsailGPUStaticInst
Definition: gpu_static_inst.hh:53

Brig::BrigInstBase::opcode
BrigOpcode16_t opcode
Definition: Brig.h:1321

Brig::BRIG_ATOMIC_OR
Definition: Brig.h:287