#ifndef __ARCH_VEGA_INSTS_OP_ENCODINGS_HH__
#define __ARCH_VEGA_INSTS_OP_ENCODINGS_HH__

#include "debug/GPUExec.hh"
#include "debug/VEGA.hh"
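// Op-encoding helper classes for the VEGA GPU ISA: scalar (SMEM) address
// setup, VOP2 SDWA/DPP source handling, and per-lane initiation of LDS,
// buffer (MUBUF), and flat memory accesses.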
            initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,
            initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,
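            // Scalar (SMEM) accesses funnel through initMemReqScalarHelper,
            // templated on the scalar register type and the access width in
            // dwords (N); one call initiates the read path and one the write
            // path. The address calculations below then set the instruction's
            // scalarAddr, either directly or through a buffer resource
            // descriptor.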
            gpu_dyn_inst->scalarAddr = vaddr;
            std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
                        sizeof(rsrc_desc));
            if (!rsrc_desc.stride && offset >= rsrc_desc.numRecords) {
                clamped_offset = rsrc_desc.numRecords;
            } else if (rsrc_desc.stride && offset
                       > (rsrc_desc.stride * rsrc_desc.numRecords)) {
                clamped_offset = (rsrc_desc.stride * rsrc_desc.numRecords);
            }
            Addr vaddr = ((rsrc_desc.baseAddr + clamped_offset) & ~0x3);
            gpu_dyn_inst->scalarAddr = vaddr;
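            // The base address comes from the buffer resource descriptor,
            // the offset is clamped against the descriptor's extent, and the
            // result is aligned down to a dword. For example (assumed
            // values): baseAddr = 0x1000, stride = 0, numRecords = 0x100 and
            // offset = 0x1ff clamp the offset to 0x100 and give
            // vaddr = 0x1100.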
            origSrc0_sdwa.read();

            DPRINTF(VEGA, "Handling %s SRC SDWA. SRC0: register v[%d], "
                "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, SRC0_SEXT: "
                "%d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, SRC1_SEXT: %d, "
                "SRC1_NEG: %d, SRC1_ABS: %d\n",
                origVdst[lane] = vdst[lane];
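            // DPP (data parallel primitives) permutes src0 across lanes
            // before the ALU operation: DPP_CTRL encodes the cross-lane
            // pattern, while the bank/row masks and bound control (BC)
            // determine which lanes participate.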
            DPRINTF(VEGA, "Handling %s SRC DPP. SRC0: register v[%d], "
                "DPP_CTRL: %#x, SRC0_ABS: %d, SRC0_NEG: %d, SRC1_ABS: %d, "
                "SRC1_NEG: %d, BC: %d, BANK_MASK: %d, ROW_MASK: %d\n",
                fOpImpl(src0_sdwa, src1, vdst, wf);

                T src0_dpp = dppHelper(gpuDynInst, src1);
                fOpImpl(src0_dpp, src1, vdst, wf);

                fOpImpl(src0, src1, vdst, wf);
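        // Per-lane memory access initiation: data is staged in the
        // instruction's d_data buffer, and only lanes whose exec_mask bit is
        // set participate.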
                if (gpuDynInst->exec_mask[lane]) {
                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
                if (gpuDynInst->exec_mask[lane]) {
                    for (int i = 0; i < N; ++i) {
                            gpuDynInst->d_data))[lane * N + i]
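        // Dual accesses (the read2/write2 style helpers below) take two
        // independent offsets per lane and pack the two values at
        // d_data[lane * 2] and d_data[lane * 2 + 1].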
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
                    Addr vaddr1 = gpuDynInst->addr[lane] + offset1;

                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2]

                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2 + 1]
                if (gpuDynInst->exec_mask[lane]) {
                        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
                if (gpuDynInst->exec_mask[lane]) {
                    for (int i = 0; i < N; ++i) {
                            gpuDynInst->d_data))[lane * N + i]);
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
                    Addr vaddr1 = gpuDynInst->addr[lane] + offset1;

                        gpuDynInst->d_data))[lane * 2]);

                        gpuDynInst->d_data))[lane * 2 + 1]);
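        // Atomic access initiation: a per-lane AtomicOpFunctor is created
        // over the a_data and x_data operand buffers for each active lane,
        // and the lane's result is staged in d_data.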
                if (gpuDynInst->exec_mask[lane]) {
                        gpuDynInst->makeAtomicOpFunctor<T>(
                            &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
                            &(reinterpret_cast<T*>(gpuDynInst->x_data))[lane]);

                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
                gpuDynInst->addr.at(lane) = (Addr)addr[lane];
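        // Buffer (MUBUF) accesses temporarily clear the exec_mask bits of
        // lanes flagged out-of-bounds in oobMask, initiate the access, and
        // then restore the original mask.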
            VectorMask old_exec_mask = gpuDynInst->exec_mask;
            gpuDynInst->exec_mask &= ~oobMask;
            gpuDynInst->exec_mask = old_exec_mask;

            VectorMask old_exec_mask = gpuDynInst->exec_mask;
            gpuDynInst->exec_mask &= ~oobMask;
            gpuDynInst->exec_mask = old_exec_mask;

            VectorMask old_exec_mask = gpuDynInst->exec_mask;
            gpuDynInst->exec_mask &= ~oobMask;
            gpuDynInst->exec_mask = old_exec_mask;

            VectorMask old_exec_mask = gpuDynInst->exec_mask;
            gpuDynInst->exec_mask &= ~oobMask;
            gpuDynInst->exec_mask = old_exec_mask;
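        // Global memory fence injection: the status vector is reset to a
        // single outstanding entry, and a zero-size request tagged with the
        // wavefront's dynamic ID is flagged and handed to the compute unit.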
            gpuDynInst->resetEntireStatusVector();
            gpuDynInst->setStatusVector(0, 1);
            RequestPtr req = std::make_shared<Request>(0, 0, 0,
                                       gpuDynInst->computeUnit()->
                                       requestorId(), 0,
                                       gpuDynInst->wfDynId);
            gpuDynInst->setRequestFlags(req);
            gpuDynInst->computeUnit()->
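        // Buffer address calculation: for every active lane the final
        // address is base_addr from the buffer resource descriptor, plus the
        // scalar offset, plus a per-lane buffer_offset derived from the
        // lane's index and offset operands. Swizzled descriptors interleave
        // index and offset bits, and lanes that fall outside the
        // descriptor's extent are the ones masked off via oobMask above.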
        template<typename VOFF, typename VIDX, typename SRSRC, typename SOFF>
                 SRSRC s_rsrc_desc, SOFF s_offset, int inst_offset)
            Addr buffer_offset = 0;

            std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
                        sizeof(rsrc_desc));

            base_addr = rsrc_desc.baseAddr;

            stride = rsrc_desc.addTidEn ? ((rsrc_desc.dataFmt << 14)
                + rsrc_desc.stride) : rsrc_desc.stride;
                if (gpuDynInst->exec_mask[lane]) {
                    vaddr = base_addr + s_offset.rawData();

                    buf_idx = v_idx[lane] + (rsrc_desc.addTidEn ? lane : 0);

                    buf_off = v_off[lane] + inst_offset;
                    if (rsrc_desc.swizzleEn) {
                        Addr idx_stride = 8 << rsrc_desc.idxStride;
                        Addr elem_size = 2 << rsrc_desc.elemSize;
                        Addr idx_msb = buf_idx / idx_stride;
                        Addr idx_lsb = buf_idx % idx_stride;
                        Addr off_msb = buf_off / elem_size;
                        Addr off_lsb = buf_off % elem_size;
                        DPRINTF(VEGA, "mubuf swizzled lane %d: "
                                "idx_stride = %llx, elem_size = %llx, "
                                "idx_msb = %llx, idx_lsb = %llx, "
                                "off_msb = %llx, off_lsb = %llx\n",
                                lane, idx_stride, elem_size, idx_msb, idx_lsb,
                                off_msb, off_lsb);

                        buffer_offset = (idx_msb * stride + off_msb * elem_size)
                            * idx_stride + idx_lsb * elem_size + off_lsb;
                    } else {
                        buffer_offset = buf_off + stride * buf_idx;
                    }
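                    // Worked example for the swizzled case (assumed values):
                    // with idx_stride = 8, elem_size = 4, stride = 16,
                    // buf_idx = 10 and buf_off = 6, we get idx_msb = 1,
                    // idx_lsb = 2, off_msb = 1, off_lsb = 2, so
                    // buffer_offset = (1*16 + 1*4)*8 + 2*4 + 2 = 170,
                    // versus 6 + 16*10 = 166 for the unswizzled form.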
                    if (rsrc_desc.stride == 0 || !rsrc_desc.swizzleEn) {
                            rsrc_desc.numRecords - s_offset.rawData()) {
                            DPRINTF(VEGA, "mubuf out-of-bounds condition 1: "
                                    "lane = %d, buffer_offset = %llx, "
                                    "const_stride = %llx, "
                                    "const_num_records = %llx\n",
                                    lane, buf_off + stride * buf_idx,
                                    stride, rsrc_desc.numRecords);
                    if (rsrc_desc.stride != 0 && rsrc_desc.swizzleEn) {
                        if (buf_idx >= rsrc_desc.numRecords ||
                            DPRINTF(VEGA, "mubuf out-of-bounds condition 2: "
                                    "lane = %d, offset = %llx, "
                                    "const_num_records = %llx\n",
                                    lane, buf_off, buf_idx,
                                    rsrc_desc.numRecords);
                    vaddr += buffer_offset;

                    DPRINTF(VEGA, "Calculating mubuf address for lane %d: "
                            "vaddr = %llx, base_addr = %llx, "
                            "stride = %llx, buf_idx = %llx, buf_off = %llx\n",
                            lane, vaddr, base_addr, stride, buf_idx, buf_off);

                    gpuDynInst->addr.at(lane) = vaddr;
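        // Flat accesses dispatch on the segment the address resolved to:
        // SC_GLOBAL goes through the global memory request path, while
        // SC_GROUP moves data per lane between d_data and the wavefront's
        // LDS in the loops below.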
            if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
                    if (gpuDynInst->exec_mask[lane]) {
                        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
            if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
                    if (gpuDynInst->exec_mask[lane]) {
                        for (int i = 0; i < N; ++i) {
                                gpuDynInst->d_data))[lane * N + i]
            if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
                    if (gpuDynInst->exec_mask[lane]) {
                            (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
            if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
                    if (gpuDynInst->exec_mask[lane]) {
                        for (int i = 0; i < N; ++i) {
                                gpuDynInst->d_data))[lane * N + i]);
            if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
                Wavefront *wf = gpuDynInst->wavefront();
                    if (gpuDynInst->exec_mask[lane]) {
                            gpuDynInst->makeAtomicOpFunctor<T>(
                                &(reinterpret_cast<T*>(
                                    gpuDynInst->a_data))[lane],
                                &(reinterpret_cast<T*>(
                                    gpuDynInst->x_data))[lane]);

                        (*amo_op)(reinterpret_cast<uint8_t *>(&tmp));

                        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane] = tmp;
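        // Request issue: resolveFlatSegment decides whether the flat address
        // targets global memory or the LDS, and the request is handed to the
        // corresponding memory pipeline; any other segment is fatal.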
            gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);

            gpuDynInst->staticInstruction()->executed_as =

            if ((gpuDynInst->executedAs() == enums::SC_GLOBAL && isFlat())
                gpuDynInst->computeUnit()->globalMemoryPipe
                    .issueRequest(gpuDynInst);
            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
                gpuDynInst->computeUnit()->localMemoryPipe
                    .issueRequest(gpuDynInst);
            } else {
                fatal("Unsupported scope for flat instruction.\n");
                if (gpuDynInst->exec_mask[lane]) {
                    gpuDynInst->addr.at(lane) =
                if (gpuDynInst->exec_mask[lane]) {
                    gpuDynInst->addr.at(lane) = addr[lane] + offset;
#endif // __ARCH_VEGA_INSTS_OP_ENCODINGS_HH__