vop2.cc
1/*
2 * Copyright (c) 2024 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include "arch/amdgpu/vega/insts/instructions.hh"
33#include "arch/amdgpu/vega/insts/inst_util.hh"
34#include "debug/VEGA.hh"
35
36namespace gem5
37{
38
39namespace VegaISA
40{
41 // --- Inst_VOP2__V_CNDMASK_B32 class methods ---
42
43 Inst_VOP2__V_CNDMASK_B32::Inst_VOP2__V_CNDMASK_B32(InFmt_VOP2 *iFmt)
44 : Inst_VOP2(iFmt, "v_cndmask_b32")
45 {
46 setFlag(ALU);
47 setFlag(ReadsVCC);
48 } // Inst_VOP2__V_CNDMASK_B32
49
50 Inst_VOP2__V_CNDMASK_B32::~Inst_VOP2__V_CNDMASK_B32()
51 {
52 } // ~Inst_VOP2__V_CNDMASK_B32
53
54 // --- description from .arch file ---
55 // D.u = (VCC[i] ? S1.u : S0.u) (i = threadID in wave); VOP3: specify VCC
56 // as a scalar GPR in S2.
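// Per-lane select: VCC here is a 64-bit scalar mask holding one bit per
// lane, so e.g. if bit 5 of VCC is set, lane 5 writes src1, otherwise
// src0. The loop below extracts that bit with bits(vcc.rawData(), lane).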
57 void
58 Inst_VOP2__V_CNDMASK_B32::execute(GPUDynInstPtr gpuDynInst)
59 {
60 Wavefront *wf = gpuDynInst->wavefront();
61 ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
62 ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
63 VecOperandU32 vdst(gpuDynInst, instData.VDST);
64 ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
65
66 src0.readSrc();
67 src1.read();
68 vcc.read();
69
70 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
71 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
72
73 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
74 if (wf->execMask(lane)) {
75 vdst[lane]
76 = bits(vcc.rawData(), lane) ? src1[lane] : src0[lane];
77 }
78 }
79
80 vdst.write();
81 } // execute
82 // --- Inst_VOP2__V_ADD_F32 class methods ---
83
84 Inst_VOP2__V_ADD_F32::Inst_VOP2__V_ADD_F32(InFmt_VOP2 *iFmt)
85 : Inst_VOP2(iFmt, "v_add_f32")
86 {
87 setFlag(ALU);
88 setFlag(F32);
89 } // Inst_VOP2__V_ADD_F32
90
91 Inst_VOP2__V_ADD_F32::~Inst_VOP2__V_ADD_F32()
92 {
93 } // ~Inst_VOP2__V_ADD_F32
94
95 // --- description from .arch file ---
96 // D.f = S0.f + S1.f.
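// When the DPP encoding is used, src0 is first permuted across lanes
// (row shifts, broadcasts, etc., selected by DPP_CTRL) by processDPP()
// below; the add then operates on the permuted per-lane values.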
97 void
98 Inst_VOP2__V_ADD_F32::execute(GPUDynInstPtr gpuDynInst)
99 {
100 Wavefront *wf = gpuDynInst->wavefront();
101 ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
102 VecOperandF32 src1(gpuDynInst, instData.VSRC1);
103 VecOperandF32 vdst(gpuDynInst, instData.VDST);
104
105 src0.readSrc();
106 src1.read();
107
108 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
109
110 if (isDPPInst()) {
111 VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
112 src0_dpp.read();
113
114 DPRINTF(VEGA, "Handling V_ADD_F32 SRC DPP. SRC0: register v[%d], "
115 "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, "
116 "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, "
117 "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0,
118 extData.iFmt_VOP_DPP.DPP_CTRL,
119 extData.iFmt_VOP_DPP.SRC0_ABS,
120 extData.iFmt_VOP_DPP.SRC0_NEG,
121 extData.iFmt_VOP_DPP.SRC1_ABS,
122 extData.iFmt_VOP_DPP.SRC1_NEG,
123 extData.iFmt_VOP_DPP.BC,
124 extData.iFmt_VOP_DPP.BANK_MASK,
125 extData.iFmt_VOP_DPP.ROW_MASK);
126
127 processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);
128
129 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
130 if (wf->execMask(lane)) {
131 vdst[lane] = src0_dpp[lane] + src1[lane];
132 }
133 }
134 } else {
135 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
136 if (wf->execMask(lane)) {
137 vdst[lane] = src0[lane] + src1[lane];
138 }
139 }
140 }
141
142 vdst.write();
143 } // execute
144 // --- Inst_VOP2__V_SUB_F32 class methods ---
145
146 Inst_VOP2__V_SUB_F32::Inst_VOP2__V_SUB_F32(InFmt_VOP2 *iFmt)
147 : Inst_VOP2(iFmt, "v_sub_f32")
148 {
149 setFlag(ALU);
150 setFlag(F32);
151 } // Inst_VOP2__V_SUB_F32
152
153 Inst_VOP2__V_SUB_F32::~Inst_VOP2__V_SUB_F32()
154 {
155 } // ~Inst_VOP2__V_SUB_F32
156
157 // --- description from .arch file ---
158 // D.f = S0.f - S1.f.
159 // SQ translates to V_ADD_F32.
160 void
161 Inst_VOP2__V_SUB_F32::execute(GPUDynInstPtr gpuDynInst)
162 {
163 Wavefront *wf = gpuDynInst->wavefront();
164 ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
165 ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
166 VecOperandF32 vdst(gpuDynInst, instData.VDST);
167
168 src0.readSrc();
169 src1.read();
170
171 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
172 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
173
174 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
175 if (wf->execMask(lane)) {
176 vdst[lane] = src0[lane] - src1[lane];
177 }
178 }
179
180 vdst.write();
181 } // execute
182 // --- Inst_VOP2__V_SUBREV_F32 class methods ---
183
184 Inst_VOP2__V_SUBREV_F32::Inst_VOP2__V_SUBREV_F32(InFmt_VOP2 *iFmt)
185 : Inst_VOP2(iFmt, "v_subrev_f32")
186 {
187 setFlag(ALU);
188 setFlag(F32);
189 } // Inst_VOP2__V_SUBREV_F32
190
191 Inst_VOP2__V_SUBREV_F32::~Inst_VOP2__V_SUBREV_F32()
192 {
193 } // ~Inst_VOP2__V_SUBREV_F32
194
195 // --- description from .arch file ---
196 // D.f = S1.f - S0.f.
197 // SQ translates to V_ADD_F32.
198 void
199 Inst_VOP2__V_SUBREV_F32::execute(GPUDynInstPtr gpuDynInst)
200 {
201 Wavefront *wf = gpuDynInst->wavefront();
202 ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
203 ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
204 VecOperandF32 vdst(gpuDynInst, instData.VDST);
205
206 src0.readSrc();
207 src1.read();
208
209 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
210 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
211
212 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
213 if (wf->execMask(lane)) {
214 vdst[lane] = src1[lane] - src0[lane];
215 }
216 }
217
218 vdst.write();
219 } // execute
220 // --- Inst_VOP2__V_MUL_LEGACY_F32 class methods ---
221
222 Inst_VOP2__V_MUL_LEGACY_F32::Inst_VOP2__V_MUL_LEGACY_F32(InFmt_VOP2 *iFmt)
223 : Inst_VOP2(iFmt, "v_mul_legacy_f32")
224 {
225 setFlag(ALU);
226 setFlag(F32);
227 } // Inst_VOP2__V_MUL_LEGACY_F32
228
229 Inst_VOP2__V_MUL_LEGACY_F32::~Inst_VOP2__V_MUL_LEGACY_F32()
230 {
231 } // ~Inst_VOP2__V_MUL_LEGACY_F32
232
233 // --- description from .arch file ---
234 // D.f = S0.f * S1.f (DX9 rules, 0.0*x = 0.0).
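// Note: the loop below performs a plain IEEE multiply; the DX9 special
// case (0.0 * x == 0.0 even for x == Inf or NaN) is not modeled here.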
235 void
236 Inst_VOP2__V_MUL_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
237 {
238 Wavefront *wf = gpuDynInst->wavefront();
239 ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
240 ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
241 VecOperandF32 vdst(gpuDynInst, instData.VDST);
242
243 src0.readSrc();
244 src1.read();
245
246 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
247 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
248
249 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
250 if (wf->execMask(lane)) {
251 vdst[lane] = src0[lane] * src1[lane];
252 }
253 }
254
255 vdst.write();
256 } // execute
257 // --- Inst_VOP2__V_MUL_F32 class methods ---
258
259 Inst_VOP2__V_MUL_F32::Inst_VOP2__V_MUL_F32(InFmt_VOP2 *iFmt)
260 : Inst_VOP2(iFmt, "v_mul_f32")
261 {
262 setFlag(ALU);
263 setFlag(F32);
264 } // Inst_VOP2__V_MUL_F32
265
266 Inst_VOP2__V_MUL_F32::~Inst_VOP2__V_MUL_F32()
267 {
268 } // ~Inst_VOP2__V_MUL_F32
269
270 // --- description from .arch file ---
271 // D.f = S0.f * S1.f.
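// The lane loop below spells out the IEEE-754 edge cases explicitly:
// any NaN input produces NaN, (+/-0 or denormal) * Inf produces NaN,
// and a (+/-0 or denormal) or Inf operand against a finite value
// resolves to a correctly signed zero or infinity; only the final
// else branch is an ordinary multiply.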
272 void
273 Inst_VOP2__V_MUL_F32::execute(GPUDynInstPtr gpuDynInst)
274 {
275 Wavefront *wf = gpuDynInst->wavefront();
276 ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
277 ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
278 VecOperandF32 vdst(gpuDynInst, instData.VDST);
279
280 src0.readSrc();
281 src1.read();
282
283 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
284 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
285
286 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
287 if (wf->execMask(lane)) {
288 if (std::isnan(src0[lane]) ||
289 std::isnan(src1[lane])) {
290 vdst[lane] = NAN;
291 } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
292 std::fpclassify(src0[lane]) == FP_ZERO) &&
293 !std::signbit(src0[lane])) {
294 if (std::isinf(src1[lane])) {
295 vdst[lane] = NAN;
296 } else if (!std::signbit(src1[lane])) {
297 vdst[lane] = +0.0;
298 } else {
299 vdst[lane] = -0.0;
300 }
301 } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
302 std::fpclassify(src0[lane]) == FP_ZERO) &&
303 std::signbit(src0[lane])) {
304 if (std::isinf(src1[lane])) {
305 vdst[lane] = NAN;
306 } else if (std::signbit(src1[lane])) {
307 vdst[lane] = +0.0;
308 } else {
309 vdst[lane] = -0.0;
310 }
311 } else if (std::isinf(src0[lane]) &&
312 !std::signbit(src0[lane])) {
313 if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
314 std::fpclassify(src1[lane]) == FP_ZERO) {
315 vdst[lane] = NAN;
316 } else if (!std::signbit(src1[lane])) {
317 vdst[lane] = +INFINITY;
318 } else {
319 vdst[lane] = -INFINITY;
320 }
321 } else if (std::isinf(src0[lane]) &&
322 std::signbit(src0[lane])) {
323 if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
324 std::fpclassify(src1[lane]) == FP_ZERO) {
325 vdst[lane] = NAN;
326 } else if (std::signbit(src1[lane])) {
327 vdst[lane] = +INFINITY;
328 } else {
329 vdst[lane] = -INFINITY;
330 }
331 } else {
332 vdst[lane] = src0[lane] * src1[lane];
333 }
334 }
335 }
336
337 vdst.write();
338 } // execute
339 // --- Inst_VOP2__V_MUL_I32_I24 class methods ---
340
341 Inst_VOP2__V_MUL_I32_I24::Inst_VOP2__V_MUL_I32_I24(InFmt_VOP2 *iFmt)
342 : Inst_VOP2(iFmt, "v_mul_i32_i24")
343 {
344 setFlag(ALU);
345 } // Inst_VOP2__V_MUL_I32_I24
346
347 Inst_VOP2__V_MUL_I32_I24::~Inst_VOP2__V_MUL_I32_I24()
348 {
349 } // ~Inst_VOP2__V_MUL_I32_I24
350
351 // --- description from .arch file ---
352 // D.i = S0.i[23:0] * S1.i[23:0].
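// Worked example: src = 0x00ffffff gives bits(src, 23, 0) = 0xffffff,
// which sext<24> interprets as -1, so 0x00ffffff * 0x00000001 = -1.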
353 void
354 Inst_VOP2__V_MUL_I32_I24::execute(GPUDynInstPtr gpuDynInst)
355 {
356 Wavefront *wf = gpuDynInst->wavefront();
357 ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
358 ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
359 VecOperandI32 vdst(gpuDynInst, instData.VDST);
360
361 src0.readSrc();
362 src1.read();
363
364 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
365 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
366
367 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
368 if (wf->execMask(lane)) {
369 vdst[lane] = sext<24>(bits(src0[lane], 23, 0))
370 * sext<24>(bits(src1[lane], 23, 0));
371 }
372 }
373
374 vdst.write();
375 } // execute
376 // --- Inst_VOP2__V_MUL_HI_I32_I24 class methods ---
377
378 Inst_VOP2__V_MUL_HI_I32_I24::Inst_VOP2__V_MUL_HI_I32_I24(InFmt_VOP2 *iFmt)
379 : Inst_VOP2(iFmt, "v_mul_hi_i32_i24")
380 {
381 setFlag(ALU);
382 } // Inst_VOP2__V_MUL_HI_I32_I24
383
384 Inst_VOP2__V_MUL_HI_I32_I24::~Inst_VOP2__V_MUL_HI_I32_I24()
385 {
386 } // ~Inst_VOP2__V_MUL_HI_I32_I24
387
388 // --- description from .arch file ---
389 // D.i = (S0.i[23:0] * S1.i[23:0])>>32.
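// The operands are widened to 64 bits below so the full 48-bit signed
// product exists before the >> 32; a 32-bit multiply would discard the
// high half that this instruction returns.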
390 void
391 Inst_VOP2__V_MUL_HI_I32_I24::execute(GPUDynInstPtr gpuDynInst)
392 {
393 Wavefront *wf = gpuDynInst->wavefront();
394 ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
395 ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
396 VecOperandI32 vdst(gpuDynInst, instData.VDST);
397
398 src0.readSrc();
399 src1.read();
400
401 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
402 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
403
404 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
405 if (wf->execMask(lane)) {
406 VecElemI64 tmp_src0
407 = (VecElemI64)sext<24>(bits(src0[lane], 23, 0));
408 VecElemI64 tmp_src1
409 = (VecElemI64)sext<24>(bits(src1[lane], 23, 0));
410
411 vdst[lane] = (VecElemI32)((tmp_src0 * tmp_src1) >> 32);
412 }
413 }
414
415 vdst.write();
416 } // execute
417 // --- Inst_VOP2__V_MUL_U32_U24 class methods ---
418
419 Inst_VOP2__V_MUL_U32_U24::Inst_VOP2__V_MUL_U32_U24(InFmt_VOP2 *iFmt)
420 : Inst_VOP2(iFmt, "v_mul_u32_u24")
421 {
422 setFlag(ALU);
423 } // Inst_VOP2__V_MUL_U32_U24
424
425 Inst_VOP2__V_MUL_U32_U24::~Inst_VOP2__V_MUL_U32_U24()
426 {
427 } // ~Inst_VOP2__V_MUL_U32_U24
428
429 // --- description from .arch file ---
430 // D.u = S0.u[23:0] * S1.u[23:0].
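// This opcode uses the lambda-plus-helper pattern: opImpl holds only
// the per-lane math, and the vop2Helper call (assumed here to be the
// Inst_VOP2 helper template) reads the operands, applies any SDWA/DPP
// handling, runs opImpl, and writes vdst back.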
431 void
432 Inst_VOP2__V_MUL_U32_U24::execute(GPUDynInstPtr gpuDynInst)
433 {
434 auto opImpl = [](VecOperandU32& src0, VecOperandU32& src1,
435 VecOperandU32& vdst, Wavefront* wf) {
436 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
437 if (wf->execMask(lane)) {
438 vdst[lane] = bits(src0[lane], 23, 0) *
439 bits(src1[lane], 23, 0);
440 }
441 }
442 };
443
444 vop2Helper<VecOperandU32>(gpuDynInst, opImpl);
445 } // execute
446 // --- Inst_VOP2__V_MUL_HI_U32_U24 class methods ---
447
448 Inst_VOP2__V_MUL_HI_U32_U24::Inst_VOP2__V_MUL_HI_U32_U24(InFmt_VOP2 *iFmt)
449 : Inst_VOP2(iFmt, "v_mul_hi_u32_u24")
450 {
451 setFlag(ALU);
452 } // Inst_VOP2__V_MUL_HI_U32_U24
453
454 Inst_VOP2__V_MUL_HI_U32_U24::~Inst_VOP2__V_MUL_HI_U32_U24()
455 {
456 } // ~Inst_VOP2__V_MUL_HI_U32_U24
457
458 // --- description from .arch file ---
459 // D.i = (S0.u[23:0] * S1.u[23:0])>>32.
460 void
461 Inst_VOP2__V_MUL_HI_U32_U24::execute(GPUDynInstPtr gpuDynInst)
462 {
463 Wavefront *wf = gpuDynInst->wavefront();
464 ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
465 ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
466 VecOperandU32 vdst(gpuDynInst, instData.VDST);
467
468 src0.readSrc();
469 src1.read();
470
471 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
472 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
473
474 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
475 if (wf->execMask(lane)) {
476 VecElemU64 tmp_src0 = (VecElemU64)bits(src0[lane], 23, 0);
477 VecElemU64 tmp_src1 = (VecElemU64)bits(src1[lane], 23, 0);
478 vdst[lane] = (VecElemU32)((tmp_src0 * tmp_src1) >> 32);
479 }
480 }
481
482 vdst.write();
483 } // execute
484 // --- Inst_VOP2__V_MIN_F32 class methods ---
485
486 Inst_VOP2__V_MIN_F32::Inst_VOP2__V_MIN_F32(InFmt_VOP2 *iFmt)
487 : Inst_VOP2(iFmt, "v_min_f32")
488 {
489 setFlag(ALU);
490 setFlag(F32);
491 } // Inst_VOP2__V_MIN_F32
492
493 Inst_VOP2__V_MIN_F32::~Inst_VOP2__V_MIN_F32()
494 {
495 } // ~Inst_VOP2__V_MIN_F32
496
497 // --- description from .arch file ---
498 // D.f = (S0.f < S1.f ? S0.f : S1.f).
499 void
500 Inst_VOP2__V_MIN_F32::execute(GPUDynInstPtr gpuDynInst)
501 {
502 Wavefront *wf = gpuDynInst->wavefront();
503 ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
504 ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
505 VecOperandF32 vdst(gpuDynInst, instData.VDST);
506
507 src0.readSrc();
508 src1.read();
509
510 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
511 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
512
513 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
514 if (wf->execMask(lane)) {
515 vdst[lane] = std::fmin(src0[lane], src1[lane]);
516 }
517 }
518
519 vdst.write();
520 } // execute
521 // --- Inst_VOP2__V_MAX_F32 class methods ---
522
523 Inst_VOP2__V_MAX_F32::Inst_VOP2__V_MAX_F32(InFmt_VOP2 *iFmt)
524 : Inst_VOP2(iFmt, "v_max_f32")
525 {
526 setFlag(ALU);
527 setFlag(F32);
528 } // Inst_VOP2__V_MAX_F32
529
530 Inst_VOP2__V_MAX_F32::~Inst_VOP2__V_MAX_F32()
531 {
532 } // ~Inst_VOP2__V_MAX_F32
533
534 // --- description from .arch file ---
535 // D.f = (S0.f >= S1.f ? S0.f : S1.f).
536 void
537 Inst_VOP2__V_MAX_F32::execute(GPUDynInstPtr gpuDynInst)
538 {
539 Wavefront *wf = gpuDynInst->wavefront();
540 ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
541 ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
542 VecOperandF32 vdst(gpuDynInst, instData.VDST);
543
544 src0.readSrc();
545 src1.read();
546
547 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
548 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
549
550 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
551 if (wf->execMask(lane)) {
552 vdst[lane] = std::fmax(src0[lane], src1[lane]);
553 }
554 }
555
556 vdst.write();
557 } // execute
558 // --- Inst_VOP2__V_MIN_I32 class methods ---
559
560 Inst_VOP2__V_MIN_I32::Inst_VOP2__V_MIN_I32(InFmt_VOP2 *iFmt)
561 : Inst_VOP2(iFmt, "v_min_i32")
562 {
563 setFlag(ALU);
564 } // Inst_VOP2__V_MIN_I32
565
566 Inst_VOP2__V_MIN_I32::~Inst_VOP2__V_MIN_I32()
567 {
568 } // ~Inst_VOP2__V_MIN_I32
569
570 // --- description from .arch file ---
571 // D.i = min(S0.i, S1.i).
572 void
573 Inst_VOP2__V_MIN_I32::execute(GPUDynInstPtr gpuDynInst)
574 {
575 Wavefront *wf = gpuDynInst->wavefront();
576 ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
577 ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
578 VecOperandI32 vdst(gpuDynInst, instData.VDST);
579
580 src0.readSrc();
581 src1.read();
582
583 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
584 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
585
586 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
587 if (wf->execMask(lane)) {
588 vdst[lane] = std::min(src0[lane], src1[lane]);
589 }
590 }
591
592 vdst.write();
593 } // execute
594 // --- Inst_VOP2__V_MAX_I32 class methods ---
595
596 Inst_VOP2__V_MAX_I32::Inst_VOP2__V_MAX_I32(InFmt_VOP2 *iFmt)
597 : Inst_VOP2(iFmt, "v_max_i32")
598 {
599 setFlag(ALU);
600 } // Inst_VOP2__V_MAX_I32
601
602 Inst_VOP2__V_MAX_I32::~Inst_VOP2__V_MAX_I32()
603 {
604 } // ~Inst_VOP2__V_MAX_I32
605
606 // --- description from .arch file ---
607 // D.i = max(S0.i, S1.i).
608 void
609 Inst_VOP2__V_MAX_I32::execute(GPUDynInstPtr gpuDynInst)
610 {
611 Wavefront *wf = gpuDynInst->wavefront();
612 ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
613 ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
614 VecOperandI32 vdst(gpuDynInst, instData.VDST);
615
616 src0.readSrc();
617 src1.read();
618
619 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
620 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
621
622 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
623 if (wf->execMask(lane)) {
624 vdst[lane] = std::max(src0[lane], src1[lane]);
625 }
626 }
627
628 vdst.write();
629 } // execute
630 // --- Inst_VOP2__V_MIN_U32 class methods ---
631
632 Inst_VOP2__V_MIN_U32::Inst_VOP2__V_MIN_U32(InFmt_VOP2 *iFmt)
633 : Inst_VOP2(iFmt, "v_min_u32")
634 {
635 setFlag(ALU);
636 } // Inst_VOP2__V_MIN_U32
637
638 Inst_VOP2__V_MIN_U32::~Inst_VOP2__V_MIN_U32()
639 {
640 } // ~Inst_VOP2__V_MIN_U32
641
642 // --- description from .arch file ---
643 // D.u = min(S0.u, S1.u).
644 void
645 Inst_VOP2__V_MIN_U32::execute(GPUDynInstPtr gpuDynInst)
646 {
647 Wavefront *wf = gpuDynInst->wavefront();
648 ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
649 ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
650 VecOperandU32 vdst(gpuDynInst, instData.VDST);
651
652 src0.readSrc();
653 src1.read();
654
655 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
656 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
657
658 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
659 if (wf->execMask(lane)) {
660 vdst[lane] = std::min(src0[lane], src1[lane]);
661 }
662 }
663
664 vdst.write();
665 } // execute
666 // --- Inst_VOP2__V_MAX_U32 class methods ---
667
668 Inst_VOP2__V_MAX_U32::Inst_VOP2__V_MAX_U32(InFmt_VOP2 *iFmt)
669 : Inst_VOP2(iFmt, "v_max_u32")
670 {
671 setFlag(ALU);
672 } // Inst_VOP2__V_MAX_U32
673
674 Inst_VOP2__V_MAX_U32::~Inst_VOP2__V_MAX_U32()
675 {
676 } // ~Inst_VOP2__V_MAX_U32
677
678 // --- description from .arch file ---
679 // D.u = max(S0.u, S1.u).
680 void
681 Inst_VOP2__V_MAX_U32::execute(GPUDynInstPtr gpuDynInst)
682 {
683 Wavefront *wf = gpuDynInst->wavefront();
684 ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
685 ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
686 VecOperandU32 vdst(gpuDynInst, instData.VDST);
687
688 src0.readSrc();
689 src1.read();
690
691 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
692 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
693
694 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
695 if (wf->execMask(lane)) {
696 vdst[lane] = std::max(src0[lane], src1[lane]);
697 }
698 }
699
700 vdst.write();
701 } // execute
702 // --- Inst_VOP2__V_LSHRREV_B32 class methods ---
703
704 Inst_VOP2__V_LSHRREV_B32::Inst_VOP2__V_LSHRREV_B32(InFmt_VOP2 *iFmt)
705 : Inst_VOP2(iFmt, "v_lshrrev_b32")
706 {
707 setFlag(ALU);
708 } // Inst_VOP2__V_LSHRREV_B32
709
710 Inst_VOP2__V_LSHRREV_B32::~Inst_VOP2__V_LSHRREV_B32()
711 {
712 } // ~Inst_VOP2__V_LSHRREV_B32
713
714 // --- description from .arch file ---
715 // D.u = S1.u >> S0.u[4:0].
716 // The vacated bits are set to zero.
717 // SQ translates this to an internal SP opcode.
718 void
719 Inst_VOP2__V_LSHRREV_B32::execute(GPUDynInstPtr gpuDynInst)
720 {
721 auto opImpl = [](VecOperandU32& src0, VecOperandU32& src1,
722 VecOperandU32& vdst, Wavefront* wf) {
723 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
724 if (wf->execMask(lane)) {
725 vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0);
726 }
727 }
728 };
729
730 vop2Helper<VecOperandU32>(gpuDynInst, opImpl);
731 } // execute
732 // --- Inst_VOP2__V_ASHRREV_I32 class methods ---
733
734 Inst_VOP2__V_ASHRREV_I32::Inst_VOP2__V_ASHRREV_I32(InFmt_VOP2 *iFmt)
735 : Inst_VOP2(iFmt, "v_ashrrev_i32")
736 {
737 setFlag(ALU);
738 } // Inst_VOP2__V_ASHRREV_I32
739
740 Inst_VOP2__V_ASHRREV_I32::~Inst_VOP2__V_ASHRREV_I32()
741 {
742 } // ~Inst_VOP2__V_ASHRREV_I32
743
744 // --- description from .arch file ---
745 // D.i = signext(S1.i) >> S0.i[4:0].
746 // The vacated bits are set to the sign bit of the input value.
747 // SQ translates this to an internal SP opcode.
748 void
749 Inst_VOP2__V_ASHRREV_I32::execute(GPUDynInstPtr gpuDynInst)
750 {
751 Wavefront *wf = gpuDynInst->wavefront();
752 ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
753 ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
754 VecOperandI32 vdst(gpuDynInst, instData.VDST);
755
756 src0.readSrc();
757 src1.read();
758
759 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
760 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
761
762 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
763 if (wf->execMask(lane)) {
764 vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0);
765 }
766 }
767
768 vdst.write();
769 } // execute
770 // --- Inst_VOP2__V_LSHLREV_B32 class methods ---
771
772 Inst_VOP2__V_LSHLREV_B32::Inst_VOP2__V_LSHLREV_B32(InFmt_VOP2 *iFmt)
773 : Inst_VOP2(iFmt, "v_lshlrev_b32")
774 {
775 setFlag(ALU);
776 } // Inst_VOP2__V_LSHLREV_B32
777
778 Inst_VOP2__V_LSHLREV_B32::~Inst_VOP2__V_LSHLREV_B32()
779 {
780 } // ~Inst_VOP2__V_LSHLREV_B32
781
782 // --- description from .arch file ---
783 // D.u = S1.u << S0.u[4:0].
784 // SQ translates this to an internal SP opcode.
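// SDWA path below: src0_sdwa/origSrc0_sdwa hold the selected and
// original sub-dword views of src0; processSDWA_src() rewrites the
// source lanes per SRC0_SEL/SRC1_SEL, the shift is computed, and
// processSDWA_dst() merges the result into vdst per DST_SEL/DST_U.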
785 void
786 Inst_VOP2__V_LSHLREV_B32::execute(GPUDynInstPtr gpuDynInst)
787 {
788 Wavefront *wf = gpuDynInst->wavefront();
789 ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
790 VecOperandU32 src1(gpuDynInst, instData.VSRC1);
791 VecOperandU32 vdst(gpuDynInst, instData.VDST);
792
793 src0.readSrc();
794 src1.read();
795
796 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
797
798 if (isSDWAInst()) {
799 VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
800 // use copies of original src0, src1, and vdst during selecting
801 VecOperandU32 origSrc0_sdwa(gpuDynInst,
802 extData.iFmt_VOP_SDWA.SRC0);
803 VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
804 VecOperandU32 origVdst(gpuDynInst, instData.VDST);
805
806 src0_sdwa.read();
807 origSrc0_sdwa.read();
808 origSrc1.read();
809
810 DPRINTF(VEGA, "Handling V_LSHLREV_B32 SRC SDWA. SRC0: register "
811 "v[%d], DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: "
812 "%d, SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: "
813 "%d, SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
814 extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL,
815 extData.iFmt_VOP_SDWA.DST_U,
816 extData.iFmt_VOP_SDWA.CLMP,
817 extData.iFmt_VOP_SDWA.SRC0_SEL,
818 extData.iFmt_VOP_SDWA.SRC0_SEXT,
819 extData.iFmt_VOP_SDWA.SRC0_NEG,
820 extData.iFmt_VOP_SDWA.SRC0_ABS,
821 extData.iFmt_VOP_SDWA.SRC1_SEL,
822 extData.iFmt_VOP_SDWA.SRC1_SEXT,
823 extData.iFmt_VOP_SDWA.SRC1_NEG,
824 extData.iFmt_VOP_SDWA.SRC1_ABS);
825
826 processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
827 src1, origSrc1);
828
829 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
830 if (wf->execMask(lane)) {
831 vdst[lane] = src1[lane] << bits(src0_sdwa[lane], 4, 0);
832 origVdst[lane] = vdst[lane]; // keep copy consistent
833 }
834 }
835
836 processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
837 } else {
838 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
839 if (wf->execMask(lane)) {
840 vdst[lane] = src1[lane] << bits(src0[lane], 4, 0);
841 }
842 }
843 }
844
845 vdst.write();
846 } // execute
847 // --- Inst_VOP2__V_AND_B32 class methods ---
848
849 Inst_VOP2__V_AND_B32::Inst_VOP2__V_AND_B32(InFmt_VOP2 *iFmt)
850 : Inst_VOP2(iFmt, "v_and_b32")
851 {
852 setFlag(ALU);
853 } // Inst_VOP2__V_AND_B32
854
855 Inst_VOP2__V_AND_B32::~Inst_VOP2__V_AND_B32()
856 {
857 } // ~Inst_VOP2__V_AND_B32
858
859 // --- description from .arch file ---
860 // D.u = S0.u & S1.u.
861 // Input and output modifiers not supported.
862 void
863 Inst_VOP2__V_AND_B32::execute(GPUDynInstPtr gpuDynInst)
864 {
865 auto opImpl = [](VecOperandU32& src0, VecOperandU32& src1,
866 VecOperandU32& vdst, Wavefront* wf) {
867 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
868 if (wf->execMask(lane)) {
869 vdst[lane] = src0[lane] & src1[lane];
870 }
871 }
872 };
873
874 vop2Helper<VecOperandU32>(gpuDynInst, opImpl);
875 } // execute
876 // --- Inst_VOP2__V_OR_B32 class methods ---
877
878 Inst_VOP2__V_OR_B32::Inst_VOP2__V_OR_B32(InFmt_VOP2 *iFmt)
879 : Inst_VOP2(iFmt, "v_or_b32")
880 {
881 setFlag(ALU);
882 } // Inst_VOP2__V_OR_B32
883
884 Inst_VOP2__V_OR_B32::~Inst_VOP2__V_OR_B32()
885 {
886 } // ~Inst_VOP2__V_OR_B32
887
888 // --- description from .arch file ---
889 // D.u = S0.u | S1.u.
890 // Input and output modifiers not supported.
891 void
892 Inst_VOP2__V_OR_B32::execute(GPUDynInstPtr gpuDynInst)
893 {
894 Wavefront *wf = gpuDynInst->wavefront();
895 ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
896 VecOperandU32 src1(gpuDynInst, instData.VSRC1);
897 VecOperandU32 vdst(gpuDynInst, instData.VDST);
898
899 src0.readSrc();
900 src1.read();
901
902 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
903
904 if (isSDWAInst()) {
905 VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
906 // use copies of original src0, src1, and dest during selecting
907 VecOperandU32 origSrc0_sdwa(gpuDynInst,
908 extData.iFmt_VOP_SDWA.SRC0);
909 VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
910 VecOperandU32 origVdst(gpuDynInst, instData.VDST);
911
912 src0_sdwa.read();
913 origSrc0_sdwa.read();
914 origSrc1.read();
915
916 DPRINTF(VEGA, "Handling V_OR_B32 SRC SDWA. SRC0: register v[%d], "
917 "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, "
918 "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, "
919 "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
920 extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL,
921 extData.iFmt_VOP_SDWA.DST_U,
922 extData.iFmt_VOP_SDWA.CLMP,
923 extData.iFmt_VOP_SDWA.SRC0_SEL,
924 extData.iFmt_VOP_SDWA.SRC0_SEXT,
925 extData.iFmt_VOP_SDWA.SRC0_NEG,
926 extData.iFmt_VOP_SDWA.SRC0_ABS,
927 extData.iFmt_VOP_SDWA.SRC1_SEL,
928 extData.iFmt_VOP_SDWA.SRC1_SEXT,
929 extData.iFmt_VOP_SDWA.SRC1_NEG,
930 extData.iFmt_VOP_SDWA.SRC1_ABS);
931
932 processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
933 src1, origSrc1);
934
935 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
936 if (wf->execMask(lane)) {
937 vdst[lane] = src0_sdwa[lane] | src1[lane];
938 origVdst[lane] = vdst[lane]; // keep copy consistent
939 }
940 }
941
942 processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
943 } else {
944 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
945 if (wf->execMask(lane)) {
946 vdst[lane] = src0[lane] | src1[lane];
947 }
948 }
949 }
950
951 vdst.write();
952 } // execute
953 // --- Inst_VOP2__V_XOR_B32 class methods ---
954
955 Inst_VOP2__V_XOR_B32::Inst_VOP2__V_XOR_B32(InFmt_VOP2 *iFmt)
956 : Inst_VOP2(iFmt, "v_xor_b32")
957 {
958 setFlag(ALU);
959 } // Inst_VOP2__V_XOR_B32
960
961 Inst_VOP2__V_XOR_B32::~Inst_VOP2__V_XOR_B32()
962 {
963 } // ~Inst_VOP2__V_XOR_B32
964
965 // --- description from .arch file ---
966 // D.u = S0.u ^ S1.u.
967 // Input and output modifiers not supported.
968 void
969 Inst_VOP2__V_XOR_B32::execute(GPUDynInstPtr gpuDynInst)
970 {
971 Wavefront *wf = gpuDynInst->wavefront();
972 ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
973 ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
974 VecOperandU32 vdst(gpuDynInst, instData.VDST);
975
976 src0.readSrc();
977 src1.read();
978
979 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
980 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
981
982 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
983 if (wf->execMask(lane)) {
984 vdst[lane] = src0[lane] ^ src1[lane];
985 }
986 }
987
988 vdst.write();
989 } // execute
990 // --- Inst_VOP2__V_DOT2C_F32_BF16 class methods ---
991
992 Inst_VOP2__V_DOT2C_F32_BF16::Inst_VOP2__V_DOT2C_F32_BF16(InFmt_VOP2 *iFmt)
993 : Inst_VOP2(iFmt, "v_dot2c_f32_bf16")
994 {
995 setFlag(ALU);
996 } // Inst_VOP2__V_DOT2C_F32_BF16
997
998 Inst_VOP2__V_DOT2C_F32_BF16::~Inst_VOP2__V_DOT2C_F32_BF16()
999 {
1000 } // ~Inst_VOP2__V_DOT2C_F32_BF16
1001
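// No .arch description is given for this opcode; from the code below it
// computes D.f32 += bf16(S0[15:0]) * bf16(S1[15:0])
//                 + bf16(S0[31:16]) * bf16(S1[31:16]),
// i.e. a two-element bfloat16 dot product accumulated into the f32 dst.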
1002 void
1003 Inst_VOP2__V_DOT2C_F32_BF16::execute(GPUDynInstPtr gpuDynInst)
1004 {
1005 Wavefront *wf = gpuDynInst->wavefront();
1006 ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
1007 VecOperandU32 src1(gpuDynInst, instData.VSRC1);
1008 VecOperandF32 vdst(gpuDynInst, instData.VDST);
1009
1010 src0.readSrc();
1011 src1.read();
1012 vdst.read();
1013
1014 fatal_if(isSDWAInst(), "SDWA not supported for V_DOT2C_F32_BF16");
1015
1016 VecElemU32 src0d[NumVecElemPerVecReg]; // per-lane copy of src0 (plain or DPP-permuted)
1017 if (isDPPInst()) {
1018 VecOperandU32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
1019 src0_dpp.read();
1020
1021 processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);
1022 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1023 src0d[lane] = src0_dpp[lane];
1024 }
1025 } else {
1026 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1027 src0d[lane] = src0[lane];
1028 }
1029 }
1030
1031 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1032 if (wf->execMask(lane)) {
1033 AMDGPU::mxbfloat16 a1, a2, b1, b2;
1034 a1.data = uint16_t(bits(src0d[lane], 15, 0));
1035 a2.data = uint16_t(bits(src0d[lane], 31, 16));
1036 b1.data = uint16_t(bits(src1[lane], 15, 0));
1037 b2.data = uint16_t(bits(src1[lane], 31, 16));
1038
1039 vdst[lane] += float(a1) * float(b1);
1040 vdst[lane] += float(a2) * float(b2);
1041 }
1042 }
1043
1044 vdst.write();
1045 } // execute
1046 // --- Inst_VOP2__V_MAC_F32 class methods ---
1047
1048 Inst_VOP2__V_MAC_F32::Inst_VOP2__V_MAC_F32(InFmt_VOP2 *iFmt)
1049 : Inst_VOP2(iFmt, "v_mac_f32")
1050 {
1051 setFlag(ALU);
1052 setFlag(F32);
1053 setFlag(MAC);
1054 } // Inst_VOP2__V_MAC_F32
1055
1056 Inst_VOP2__V_MAC_F32::~Inst_VOP2__V_MAC_F32()
1057 {
1058 } // ~Inst_VOP2__V_MAC_F32
1059
1060 // --- description from .arch file ---
1061 // D.f = S0.f * S1.f + D.f.
1062 // SQ translates to V_MAD_F32.
1063 void
1064 Inst_VOP2__V_MAC_F32::execute(GPUDynInstPtr gpuDynInst)
1065 {
1066 Wavefront *wf = gpuDynInst->wavefront();
1067 ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
1068 VecOperandF32 src1(gpuDynInst, instData.VSRC1);
1069 VecOperandF32 vdst(gpuDynInst, instData.VDST);
1070
1071 src0.readSrc();
1072 src1.read();
1073 vdst.read();
1074
1075 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1076
1077 if (isDPPInst()) {
1078 VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
1079 src0_dpp.read();
1080
1081 DPRINTF(VEGA, "Handling V_MAC_F32 SRC DPP. SRC0: register v[%d], "
1082 "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, "
1083 "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, "
1084 "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0,
1085 extData.iFmt_VOP_DPP.DPP_CTRL,
1086 extData.iFmt_VOP_DPP.SRC0_ABS,
1087 extData.iFmt_VOP_DPP.SRC0_NEG,
1088 extData.iFmt_VOP_DPP.SRC1_ABS,
1089 extData.iFmt_VOP_DPP.SRC1_NEG,
1090 extData.iFmt_VOP_DPP.BC,
1091 extData.iFmt_VOP_DPP.BANK_MASK,
1092 extData.iFmt_VOP_DPP.ROW_MASK);
1093
1094 processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);
1095
1096 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1097 if (wf->execMask(lane)) {
1098 vdst[lane] = std::fma(src0_dpp[lane], src1[lane],
1099 vdst[lane]);
1100 }
1101 }
1102 } else {
1103 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1104 if (wf->execMask(lane)) {
1105 vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]);
1106 }
1107 }
1108 }
1109
1110 vdst.write();
1111 } // execute
1112 // --- Inst_VOP2__V_MADMK_F32 class methods ---
1113
1114 Inst_VOP2__V_MADMK_F32::Inst_VOP2__V_MADMK_F32(InFmt_VOP2 *iFmt)
1115 : Inst_VOP2(iFmt, "v_madmk_f32")
1116 {
1117 setFlag(ALU);
1118 setFlag(F32);
1119 setFlag(MAD);
1120 } // Inst_VOP2__V_MADMK_F32
1121
1122 Inst_VOP2__V_MADMK_F32::~Inst_VOP2__V_MADMK_F32()
1123 {
1124 } // ~Inst_VOP2__V_MADMK_F32
1125
1126 // --- description from .arch file ---
1127 // D.f = S0.f * K + S1.f; K is a 32-bit inline constant.
1128 // This opcode cannot use the VOP3 encoding and cannot use input/output
1129 // --- modifiers.
1130 // SQ translates to V_MAD_F32.
1131 void
1132 Inst_VOP2__V_MADMK_F32::execute(GPUDynInstPtr gpuDynInst)
1133 {
1134 Wavefront *wf = gpuDynInst->wavefront();
1135 ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
1136 ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
1137 VecOperandF32 vdst(gpuDynInst, instData.VDST);
1138 VecElemF32 k = extData.imm_f32;
1139
1140 src0.readSrc();
1141 src1.read();
1142
1143 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1144 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1145
1146 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1147 if (wf->execMask(lane)) {
1148 vdst[lane] = std::fma(src0[lane], k, src1[lane]);
1149 }
1150 }
1151
1152 vdst.write();
1153 } // execute
1154 // --- Inst_VOP2__V_MADAK_F32 class methods ---
1155
1156 Inst_VOP2__V_MADAK_F32::Inst_VOP2__V_MADAK_F32(InFmt_VOP2 *iFmt)
1157 : Inst_VOP2(iFmt, "v_madak_f32")
1158 {
1159 setFlag(ALU);
1160 setFlag(F32);
1161 setFlag(MAD);
1162 } // Inst_VOP2__V_MADAK_F32
1163
1164 Inst_VOP2__V_MADAK_F32::~Inst_VOP2__V_MADAK_F32()
1165 {
1166 } // ~Inst_VOP2__V_MADAK_F32
1167
1168 // --- description from .arch file ---
1169 // D.f = S0.f * S1.f + K; K is a 32-bit inline constant.
1170 // This opcode cannot use the VOP3 encoding and cannot use input/output
1171 // --- modifiers.
1172 // SQ translates to V_MAD_F32.
1173 void
1174 Inst_VOP2__V_MADAK_F32::execute(GPUDynInstPtr gpuDynInst)
1175 {
1176 Wavefront *wf = gpuDynInst->wavefront();
1177 ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
1178 ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
1179 VecOperandF32 vdst(gpuDynInst, instData.VDST);
1180 VecElemF32 k = extData.imm_f32;
1181
1182 src0.readSrc();
1183 src1.read();
1184
1185 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1186 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1187
1188 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1189 if (wf->execMask(lane)) {
1190 vdst[lane] = std::fma(src0[lane], src1[lane], k);
1191 }
1192 }
1193
1194 vdst.write();
1195 } // execute
1196 // --- Inst_VOP2__V_ADD_CO_U32 class methods ---
1197
1198 Inst_VOP2__V_ADD_CO_U32::Inst_VOP2__V_ADD_CO_U32(InFmt_VOP2 *iFmt)
1199 : Inst_VOP2(iFmt, "v_add_co_u32")
1200 {
1201 setFlag(ALU);
1202 setFlag(WritesVCC);
1203 } // Inst_VOP2__V_ADD_CO_U32
1204
1205 Inst_VOP2__V_ADD_CO_U32::~Inst_VOP2__V_ADD_CO_U32()
1206 {
1207 } // ~Inst_VOP2__V_ADD_CO_U32
1208
1209 // --- description from .arch file ---
1210 // D.u = S0.u + S1.u;
1211 // VCC[threadId] = (S0.u + S1.u >= 0x100000000ULL ? 1 : 0) is an UNSIGNED
1212 // --- overflow or carry-out for V_ADDC_U32.
1213 // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
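// Worked example: 0xffffffff + 0x00000001 wraps to 0x00000000 in vdst,
// and the widened check 0xffffffff + 1 = 0x100000000 >= 2^32 sets that
// lane's carry bit in VCC.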
1214 void
1215 Inst_VOP2__V_ADD_CO_U32::execute(GPUDynInstPtr gpuDynInst)
1216 {
1217 Wavefront *wf = gpuDynInst->wavefront();
1218 ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
1219 VecOperandU32 src1(gpuDynInst, instData.VSRC1);
1220 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1221 ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
1222
1223 src0.readSrc();
1224 src1.read();
1225
1226 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1227
1228 if (isSDWAInst()) {
1229 VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
1230 // use copies of original src0, src1, and dest during selecting
1231 VecOperandU32 origSrc0_sdwa(gpuDynInst,
1232 extData.iFmt_VOP_SDWA.SRC0);
1233 VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
1234 VecOperandU32 origVdst(gpuDynInst, instData.VDST);
1235
1236 src0_sdwa.read();
1237 origSrc0_sdwa.read();
1238 origSrc1.read();
1239
1240 DPRINTF(VEGA, "Handling V_ADD_CO_U32 SRC SDWA. SRC0: register "
1241 "v[%d], DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, "
1242 "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, "
1243 "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
1244 extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL,
1245 extData.iFmt_VOP_SDWA.DST_U,
1246 extData.iFmt_VOP_SDWA.CLMP,
1247 extData.iFmt_VOP_SDWA.SRC0_SEL,
1248 extData.iFmt_VOP_SDWA.SRC0_SEXT,
1249 extData.iFmt_VOP_SDWA.SRC0_NEG,
1250 extData.iFmt_VOP_SDWA.SRC0_ABS,
1251 extData.iFmt_VOP_SDWA.SRC1_SEL,
1252 extData.iFmt_VOP_SDWA.SRC1_SEXT,
1253 extData.iFmt_VOP_SDWA.SRC1_NEG,
1254 extData.iFmt_VOP_SDWA.SRC1_ABS);
1255
1256 processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
1257 src1, origSrc1);
1258
1259 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1260 if (wf->execMask(lane)) {
1261 vdst[lane] = src0_sdwa[lane] + src1[lane];
1262 origVdst[lane] = vdst[lane]; // keep copy consistent
1263 vcc.setBit(lane, ((VecElemU64)src0_sdwa[lane]
1264 + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0);
1265 }
1266 }
1267
1268 processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
1269 } else {
1270 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1271 if (wf->execMask(lane)) {
1272 vdst[lane] = src0[lane] + src1[lane];
1273 vcc.setBit(lane, ((VecElemU64)src0[lane]
1274 + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0);
1275 }
1276 }
1277 }
1278
1279 vcc.write();
1280 vdst.write();
1281 } // execute
1282 // --- Inst_VOP2__V_SUB_CO_U32 class methods ---
1283
1284 Inst_VOP2__V_SUB_CO_U32::Inst_VOP2__V_SUB_CO_U32(InFmt_VOP2 *iFmt)
1285 : Inst_VOP2(iFmt, "v_sub_co_u32")
1286 {
1287 setFlag(ALU);
1288 setFlag(WritesVCC);
1289 } // Inst_VOP2__V_SUB_CO_U32
1290
1291 Inst_VOP2__V_SUB_CO_U32::~Inst_VOP2__V_SUB_CO_U32()
1292 {
1293 } // ~Inst_VOP2__V_SUB_CO_U32
1294
1295 // --- description from .arch file ---
1296 // D.u = S0.u - S1.u;
1297 // VCC[threadId] = (S1.u > S0.u ? 1 : 0) is an UNSIGNED overflow or
1298 // carry-out for V_SUBB_U32.
1299 // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
1300 void
1301 Inst_VOP2__V_SUB_CO_U32::execute(GPUDynInstPtr gpuDynInst)
1302 {
1303 Wavefront *wf = gpuDynInst->wavefront();
1304 ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
1305 ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
1306 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1307 ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
1308
1309 src0.readSrc();
1310 src1.read();
1311
1312 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1313 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1314
1315 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1316 if (wf->execMask(lane)) {
1317 vdst[lane] = src0[lane] - src1[lane];
1318 vcc.setBit(lane, src1[lane] > src0[lane] ? 1 : 0);
1319 }
1320 }
1321
1322 vdst.write();
1323 vcc.write();
1324 } // execute
1325 // --- Inst_VOP2__V_SUBREV_CO_U32 class methods ---
1326
1327 Inst_VOP2__V_SUBREV_CO_U32::Inst_VOP2__V_SUBREV_CO_U32(InFmt_VOP2 *iFmt)
1328 : Inst_VOP2(iFmt, "v_subrev_co_u32")
1329 {
1330 setFlag(ALU);
1331 setFlag(WritesVCC);
1332 } // Inst_VOP2__V_SUBREV_CO_U32
1333
1334 Inst_VOP2__V_SUBREV_CO_U32::~Inst_VOP2__V_SUBREV_CO_U32()
1335 {
1336 } // ~Inst_VOP2__V_SUBREV_CO_U32
1337
1338 // --- description from .arch file ---
1339 // D.u = S1.u - S0.u;
1340 // VCC[threadId] = (S0.u > S1.u ? 1 : 0) is an UNSIGNED overflow or
1341 // carry-out for V_SUBB_U32.
1342 // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
1343 void
1344 Inst_VOP2__V_SUBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst)
1345 {
1346 Wavefront *wf = gpuDynInst->wavefront();
1347 ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
1348 ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
1349 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1350 ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
1351
1352 src0.readSrc();
1353 src1.read();
1354
1355 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1356 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1357
1358 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1359 if (wf->execMask(lane)) {
1360 vdst[lane] = src1[lane] - src0[lane];
1361 vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
1362 }
1363 }
1364
1365 vdst.write();
1366 vcc.write();
1367 } // execute
1368 // --- Inst_VOP2__V_ADDC_CO_U32 class methods ---
1369
1370 Inst_VOP2__V_ADDC_CO_U32::Inst_VOP2__V_ADDC_CO_U32(InFmt_VOP2 *iFmt)
1371 : Inst_VOP2(iFmt, "v_addc_co_u32")
1372 {
1373 setFlag(ALU);
1374 setFlag(WritesVCC);
1375 setFlag(ReadsVCC);
1376 } // Inst_VOP2__V_ADDC_CO_U32
1377
1378 Inst_VOP2__V_ADDC_CO_U32::~Inst_VOP2__V_ADDC_CO_U32()
1379 {
1380 } // ~Inst_VOP2__V_ADDC_CO_U32
1381
1382 // --- description from .arch file ---
1383 // D.u = S0.u + S1.u + VCC[threadId];
1384 // VCC[threadId] = (S0.u + S1.u + VCC[threadId] >= 0x100000000ULL ? 1 : 0)
1385 // is an UNSIGNED overflow.
1386 // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
1387 // source comes from the SGPR-pair at S2.u.
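// Together with V_ADD_CO_U32 this forms a 64-bit add: the low dwords
// go through V_ADD_CO_U32, which leaves a per-lane carry in VCC, and
// this opcode then adds the high dwords plus that carry, writing the
// carry-out back to VCC.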
1388 void
1389 Inst_VOP2__V_ADDC_CO_U32::execute(GPUDynInstPtr gpuDynInst)
1390 {
1391 Wavefront *wf = gpuDynInst->wavefront();
1392 ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
1393 ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
1394 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1395 ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
1396
1397 src0.readSrc();
1398 src1.read();
1399 vcc.read();
1400
1401 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1402 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1403
1404 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1405 if (wf->execMask(lane)) {
1406 vdst[lane] = src0[lane] + src1[lane]
1407 + bits(vcc.rawData(), lane);
1408 vcc.setBit(lane, ((VecElemU64)src0[lane]
1409 + (VecElemU64)src1[lane]
1410 + (VecElemU64)bits(vcc.rawData(), lane, lane))
1411 >= 0x100000000 ? 1 : 0);
1412 }
1413 }
1414
1415 vdst.write();
1416 vcc.write();
1417 } // execute
1418 // --- Inst_VOP2__V_SUBB_CO_U32 class methods ---
1419
1420 Inst_VOP2__V_SUBB_CO_U32::Inst_VOP2__V_SUBB_CO_U32(InFmt_VOP2 *iFmt)
1421 : Inst_VOP2(iFmt, "v_subb_co_u32")
1422 {
1423 setFlag(ALU);
1424 setFlag(WritesVCC);
1425 setFlag(ReadsVCC);
1426 } // Inst_VOP2__V_SUBB_CO_U32
1427
1428 Inst_VOP2__V_SUBB_CO_U32::~Inst_VOP2__V_SUBB_CO_U32()
1429 {
1430 } // ~Inst_VOP2__V_SUBB_CO_U32
1431
1432 // --- description from .arch file ---
1433 // D.u = S0.u - S1.u - VCC[threadId];
1434 // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED
1435 // --- overflow.
1436 // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
1437 // --- source comes from the SGPR-pair at S2.u.
1438 void
1439 Inst_VOP2__V_SUBB_CO_U32::execute(GPUDynInstPtr gpuDynInst)
1440 {
1441 Wavefront *wf = gpuDynInst->wavefront();
1442 ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
1443 ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
1444 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1445 ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
1446
1447 src0.readSrc();
1448 src1.read();
1449 vcc.read();
1450
1451 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1452 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1453
1454 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1455 if (wf->execMask(lane)) {
1456 vdst[lane]
1457 = src0[lane] - src1[lane] - bits(vcc.rawData(), lane);
1458 vcc.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane))
1459 > src0[lane] ? 1 : 0);
1460 }
1461 }
1462
1463 vdst.write();
1464 vcc.write();
1465 } // execute
1466 // --- Inst_VOP2__V_SUBBREV_CO_U32 class methods ---
1467
1468 Inst_VOP2__V_SUBBREV_CO_U32::Inst_VOP2__V_SUBBREV_CO_U32(InFmt_VOP2 *iFmt)
1469 : Inst_VOP2(iFmt, "v_subbrev_co_u32")
1470 {
1471 setFlag(ALU);
1472 setFlag(WritesVCC);
1473 setFlag(ReadsVCC);
1474 } // Inst_VOP2__V_SUBBREV_CO_U32
1475
1476 Inst_VOP2__V_SUBBREV_CO_U32::~Inst_VOP2__V_SUBBREV_CO_U32()
1477 {
1478 } // ~Inst_VOP2__V_SUBBREV_CO_U32
1479
1480 // --- description from .arch file ---
1481 // D.u = S1.u - S0.u - VCC[threadId];
1482 // VCC[threadId] = (S0.u + VCC[threadId] > S1.u ? 1 : 0) is an UNSIGNED
1483 // overflow.
1484 // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
1485 // source comes from the SGPR-pair at S2.u.
1486 // SQ translates this to V_SUBB_U32 with reversed operands.
1487 void
1488 Inst_VOP2__V_SUBBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst)
1489 {
1490 Wavefront *wf = gpuDynInst->wavefront();
1491 ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
1492 ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
1493 VecOperandU32 vdst(gpuDynInst, instData.VDST);
1494 ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
1495
1496 src0.readSrc();
1497 src1.read();
1498 vcc.read();
1499
1500 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1501 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1502
1503 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1504 if (wf->execMask(lane)) {
1505 vdst[lane]
1506 = src1[lane] - src0[lane] - bits(vcc.rawData(), lane);
1507 vcc.setBit(lane, (src0[lane] + bits(vcc.rawData(), lane))
1508 > src1[lane] ? 1 : 0);
1509 }
1510 }
1511
1512 vdst.write();
1513 vcc.write();
1514 } // execute
1515 // --- Inst_VOP2__V_ADD_F16 class methods ---
1516
1517 Inst_VOP2__V_ADD_F16::Inst_VOP2__V_ADD_F16(InFmt_VOP2 *iFmt)
1518 : Inst_VOP2(iFmt, "v_add_f16")
1519 {
1520 setFlag(ALU);
1521 setFlag(F16);
1522 } // Inst_VOP2__V_ADD_F16
1523
1524 Inst_VOP2__V_ADD_F16::~Inst_VOP2__V_ADD_F16()
1525 {
1526 } // ~Inst_VOP2__V_ADD_F16
1527
1528 // --- description from .arch file ---
1529 // D.f16 = S0.f16 + S1.f16.
1530 // Supports denormals, round mode, exception flags, saturation.
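// Note: this and the other f16 VOP2 arithmetic ops are not modeled in
// this implementation; their execute() bodies simply panic (see
// panicUnimplemented() below).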
1531 void
1532 Inst_VOP2__V_ADD_F16::execute(GPUDynInstPtr gpuDynInst)
1533 {
1534 panicUnimplemented();
1535 } // execute
1536 // --- Inst_VOP2__V_SUB_F16 class methods ---
1537
1538 Inst_VOP2__V_SUB_F16::Inst_VOP2__V_SUB_F16(InFmt_VOP2 *iFmt)
1539 : Inst_VOP2(iFmt, "v_sub_f16")
1540 {
1541 setFlag(ALU);
1542 setFlag(F16);
1543 } // Inst_VOP2__V_SUB_F16
1544
1545 Inst_VOP2__V_SUB_F16::~Inst_VOP2__V_SUB_F16()
1546 {
1547 } // ~Inst_VOP2__V_SUB_F16
1548
1549 // --- description from .arch file ---
1550 // D.f16 = S0.f16 - S1.f16.
1551 // Supports denormals, round mode, exception flags, saturation.
1552 // SQ translates to V_ADD_F16.
1553 void
1554 Inst_VOP2__V_SUB_F16::execute(GPUDynInstPtr gpuDynInst)
1555 {
1556 panicUnimplemented();
1557 } // execute
1558 // --- Inst_VOP2__V_SUBREV_F16 class methods ---
1559
1560 Inst_VOP2__V_SUBREV_F16::Inst_VOP2__V_SUBREV_F16(InFmt_VOP2 *iFmt)
1561 : Inst_VOP2(iFmt, "v_subrev_f16")
1562 {
1563 setFlag(ALU);
1564 setFlag(F16);
1565 } // Inst_VOP2__V_SUBREV_F16
1566
1567 Inst_VOP2__V_SUBREV_F16::~Inst_VOP2__V_SUBREV_F16()
1568 {
1569 } // ~Inst_VOP2__V_SUBREV_F16
1570
1571 // --- description from .arch file ---
1572 // D.f16 = S1.f16 - S0.f16.
1573 // Supports denormals, round mode, exception flags, saturation.
1574 // SQ translates to V_ADD_F16.
1575 void
1576 Inst_VOP2__V_SUBREV_F16::execute(GPUDynInstPtr gpuDynInst)
1577 {
1578 panicUnimplemented();
1579 } // execute
1580 // --- Inst_VOP2__V_MUL_F16 class methods ---
1581
1582 Inst_VOP2__V_MUL_F16::Inst_VOP2__V_MUL_F16(InFmt_VOP2 *iFmt)
1583 : Inst_VOP2(iFmt, "v_mul_f16")
1584 {
1585 setFlag(ALU);
1586 setFlag(F16);
1587 } // Inst_VOP2__V_MUL_F16
1588
1589 Inst_VOP2__V_MUL_F16::~Inst_VOP2__V_MUL_F16()
1590 {
1591 } // ~Inst_VOP2__V_MUL_F16
1592
1593 // --- description from .arch file ---
1594 // D.f16 = S0.f16 * S1.f16.
1595 // Supports denormals, round mode, exception flags, saturation.
1596 void
1597 Inst_VOP2__V_MUL_F16::execute(GPUDynInstPtr gpuDynInst)
1598 {
1599 panicUnimplemented();
1600 } // execute
1601 // --- Inst_VOP2__V_MAC_F16 class methods ---
1602
1603 Inst_VOP2__V_MAC_F16::Inst_VOP2__V_MAC_F16(InFmt_VOP2 *iFmt)
1604 : Inst_VOP2(iFmt, "v_mac_f16")
1605 {
1606 setFlag(ALU);
1607 setFlag(F16);
1608 setFlag(MAC);
1609 } // Inst_VOP2__V_MAC_F16
1610
1611 Inst_VOP2__V_MAC_F16::~Inst_VOP2__V_MAC_F16()
1612 {
1613 } // ~Inst_VOP2__V_MAC_F16
1614
1615 // --- description from .arch file ---
1616 // D.f16 = S0.f16 * S1.f16 + D.f16.
1617 // Supports round mode, exception flags, saturation.
1618 // SQ translates this to V_MAD_F16.
1619 void
1620 Inst_VOP2__V_MAC_F16::execute(GPUDynInstPtr gpuDynInst)
1621 {
1622 panicUnimplemented();
1623 } // execute
1624 // --- Inst_VOP2__V_MADMK_F16 class methods ---
1625
1626 Inst_VOP2__V_MADMK_F16::Inst_VOP2__V_MADMK_F16(InFmt_VOP2 *iFmt)
1627 : Inst_VOP2(iFmt, "v_madmk_f16")
1628 {
1629 setFlag(ALU);
1630 setFlag(F16);
1631 setFlag(MAD);
1632 } // Inst_VOP2__V_MADMK_F16
1633
1634 Inst_VOP2__V_MADMK_F16::~Inst_VOP2__V_MADMK_F16()
1635 {
1636 } // ~Inst_VOP2__V_MADMK_F16
1637
1638 // --- description from .arch file ---
1639 // D.f16 = S0.f16 * K.f16 + S1.f16; K is a 16-bit inline constant stored
1640 // in the following literal DWORD.
1641 // This opcode cannot use the VOP3 encoding and cannot use input/output
1642 // modifiers. Supports round mode, exception flags, saturation.
1643 // SQ translates this to V_MAD_F16.
1644 void
1645 Inst_VOP2__V_MADMK_F16::execute(GPUDynInstPtr gpuDynInst)
1646 {
1647 panicUnimplemented();
1648 } // execute
1649 // --- Inst_VOP2__V_MADAK_F16 class methods ---
1650
1651 Inst_VOP2__V_MADAK_F16::Inst_VOP2__V_MADAK_F16(InFmt_VOP2 *iFmt)
1652 : Inst_VOP2(iFmt, "v_madak_f16")
1653 {
1654 setFlag(ALU);
1655 setFlag(F16);
1656 setFlag(MAD);
1657 } // Inst_VOP2__V_MADAK_F16
1658
1659 Inst_VOP2__V_MADAK_F16::~Inst_VOP2__V_MADAK_F16()
1660 {
1661 } // ~Inst_VOP2__V_MADAK_F16
1662
1663 // --- description from .arch file ---
1664 // D.f16 = S0.f16 * S1.f16 + K.f16; K is a 16-bit inline constant stored
1665 // in the following literal DWORD.
1666 // This opcode cannot use the VOP3 encoding and cannot use input/output
1667 // modifiers. Supports round mode, exception flags, saturation.
1668 // SQ translates this to V_MAD_F16.
1669 void
1670 Inst_VOP2__V_MADAK_F16::execute(GPUDynInstPtr gpuDynInst)
1671 {
1672 panicUnimplemented();
1673 } // execute
1674 // --- Inst_VOP2__V_ADD_U16 class methods ---
1675
1676 Inst_VOP2__V_ADD_U16::Inst_VOP2__V_ADD_U16(InFmt_VOP2 *iFmt)
1677 : Inst_VOP2(iFmt, "v_add_u16")
1678 {
1679 setFlag(ALU);
1680 } // Inst_VOP2__V_ADD_U16
1681
1682 Inst_VOP2__V_ADD_U16::~Inst_VOP2__V_ADD_U16()
1683 {
1684 } // ~Inst_VOP2__V_ADD_U16
1685
1686 // --- description from .arch file ---
1687 // D.u16 = S0.u16 + S1.u16.
1688 // Supports saturation (unsigned 16-bit integer domain).
1689 void
1690 Inst_VOP2__V_ADD_U16::execute(GPUDynInstPtr gpuDynInst)
1691 {
1692 Wavefront *wf = gpuDynInst->wavefront();
1693 ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
1694 ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
1695 VecOperandU16 vdst(gpuDynInst, instData.VDST);
1696
1697 src0.readSrc();
1698 src1.read();
1699
1700 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1701 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1702
1703 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1704 if (wf->execMask(lane)) {
1705 vdst[lane] = src0[lane] + src1[lane];
1706 }
1707 }
1708
1709 vdst.write();
1710 } // execute
1711 // --- Inst_VOP2__V_SUB_U16 class methods ---
1712
1713 Inst_VOP2__V_SUB_U16::Inst_VOP2__V_SUB_U16(InFmt_VOP2 *iFmt)
1714 : Inst_VOP2(iFmt, "v_sub_u16")
1715 {
1716 setFlag(ALU);
1717 } // Inst_VOP2__V_SUB_U16
1718
1719 Inst_VOP2__V_SUB_U16::~Inst_VOP2__V_SUB_U16()
1720 {
1721 } // ~Inst_VOP2__V_SUB_U16
1722
1723 // --- description from .arch file ---
1724 // D.u16 = S0.u16 - S1.u16.
1725 // Supports saturation (unsigned 16-bit integer domain).
1726 void
1727 Inst_VOP2__V_SUB_U16::execute(GPUDynInstPtr gpuDynInst)
1728 {
1729 Wavefront *wf = gpuDynInst->wavefront();
1730 ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
1731 ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
1732 VecOperandU16 vdst(gpuDynInst, instData.VDST);
1733
1734 src0.readSrc();
1735 src1.read();
1736
1737 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1738 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1739
1740 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1741 if (wf->execMask(lane)) {
1742 vdst[lane] = src0[lane] - src1[lane];
1743 }
1744 }
1745
1746 vdst.write();
1747 } // execute
1748 // --- Inst_VOP2__V_SUBREV_U16 class methods ---
1749
1750 Inst_VOP2__V_SUBREV_U16::Inst_VOP2__V_SUBREV_U16(InFmt_VOP2 *iFmt)
1751 : Inst_VOP2(iFmt, "v_subrev_u16")
1752 {
1753 setFlag(ALU);
1754 } // Inst_VOP2__V_SUBREV_U16
1755
1756 Inst_VOP2__V_SUBREV_U16::~Inst_VOP2__V_SUBREV_U16()
1757 {
1758 } // ~Inst_VOP2__V_SUBREV_U16
1759
1760 // --- description from .arch file ---
1761 // D.u16 = S1.u16 - S0.u16.
1762 // Supports saturation (unsigned 16-bit integer domain).
1763 // SQ translates this to V_SUB_U16 with reversed operands.
1764 void
1765 Inst_VOP2__V_SUBREV_U16::execute(GPUDynInstPtr gpuDynInst)
1766 {
1767 Wavefront *wf = gpuDynInst->wavefront();
1768 ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
1769 ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
1770 VecOperandU16 vdst(gpuDynInst, instData.VDST);
1771
1772 src0.readSrc();
1773 src1.read();
1774
1775 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1776 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1777
1778 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1779 if (wf->execMask(lane)) {
1780 vdst[lane] = src1[lane] - src0[lane];
1781 }
1782 }
1783
1784 vdst.write();
1785 } // execute
1786 // --- Inst_VOP2__V_MUL_LO_U16 class methods ---
1787
1788 Inst_VOP2__V_MUL_LO_U16::Inst_VOP2__V_MUL_LO_U16(InFmt_VOP2 *iFmt)
1789 : Inst_VOP2(iFmt, "v_mul_lo_u16")
1790 {
1791 setFlag(ALU);
1792 } // Inst_VOP2__V_MUL_LO_U16
1793
1794 Inst_VOP2__V_MUL_LO_U16::~Inst_VOP2__V_MUL_LO_U16()
1795 {
1796 } // ~Inst_VOP2__V_MUL_LO_U16
1797
1798 // --- description from .arch file ---
1799 // D.u16 = S0.u16 * S1.u16.
1800 // Supports saturation (unsigned 16-bit integer domain).
1801 void
1802 Inst_VOP2__V_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst)
1803 {
1804 Wavefront *wf = gpuDynInst->wavefront();
1805 ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
1806 ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
1807 VecOperandU16 vdst(gpuDynInst, instData.VDST);
1808
1809 src0.readSrc();
1810 src1.read();
1811
1812 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1813 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1814
1815 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1816 if (wf->execMask(lane)) {
1817 vdst[lane] = src0[lane] * src1[lane];
1818 }
1819 }
1820
1821 vdst.write();
1822 } // execute
1823 // --- Inst_VOP2__V_LSHLREV_B16 class methods ---
1824
1825 Inst_VOP2__V_LSHLREV_B16::Inst_VOP2__V_LSHLREV_B16(InFmt_VOP2 *iFmt)
1826 : Inst_VOP2(iFmt, "v_lshlrev_b16")
1827 {
1828 setFlag(ALU);
1829 } // Inst_VOP2__V_LSHLREV_B16
1830
1831 Inst_VOP2__V_LSHLREV_B16::~Inst_VOP2__V_LSHLREV_B16()
1832 {
1833 } // ~Inst_VOP2__V_LSHLREV_B16
1834
1835 // --- description from .arch file ---
1836 // D.u[15:0] = S1.u[15:0] << S0.u[3:0].
1837 // SQ translates this to an internal SP opcode.
1838 void
1839 Inst_VOP2__V_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst)
1840 {
1841 auto opImpl = [](VecOperandU32& src0, VecOperandU32& src1,
1842 VecOperandU32& vdst, Wavefront* wf) {
1843 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1844 if (wf->execMask(lane)) {
1845 vdst[lane] = src1[lane] << bits(src0[lane], 3, 0);
1846 }
1847 }
1848 };
1849
1850 vop2Helper<VecOperandU32>(gpuDynInst, opImpl);
1851 } // execute
1852 // --- Inst_VOP2__V_LSHRREV_B16 class methods ---
1853
1854 Inst_VOP2__V_LSHRREV_B16::Inst_VOP2__V_LSHRREV_B16(InFmt_VOP2 *iFmt)
1855 : Inst_VOP2(iFmt, "v_lshrrev_b16")
1856 {
1857 setFlag(ALU);
1858 } // Inst_VOP2__V_LSHRREV_B16
1859
1860 Inst_VOP2__V_LSHRREV_B16::~Inst_VOP2__V_LSHRREV_B16()
1861 {
1862 } // ~Inst_VOP2__V_LSHRREV_B16
1863
1864 // --- description from .arch file ---
1865 // D.u[15:0] = S1.u[15:0] >> S0.u[3:0].
1866 // The vacated bits are set to zero.
1867 // SQ translates this to an internal SP opcode.
1868 void
1869 Inst_VOP2__V_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst)
1870 {
1871 Wavefront *wf = gpuDynInst->wavefront();
1872 ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
1873 ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
1874 VecOperandU16 vdst(gpuDynInst, instData.VDST);
1875
1876 src0.readSrc();
1877 src1.read();
1878
1879 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1880 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1881
1882 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1883 if (wf->execMask(lane)) {
1884 vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0);
1885 }
1886 }
1887
1888 vdst.write();
1889 } // execute
1890 // --- Inst_VOP2__V_ASHRREV_I16 class methods ---
1891
1892 Inst_VOP2__V_ASHRREV_I16::Inst_VOP2__V_ASHRREV_I16(InFmt_VOP2 *iFmt)
1893 : Inst_VOP2(iFmt, "v_ashrrev_i16")
1894 {
1895 setFlag(ALU);
1896 } // Inst_VOP2__V_ASHRREV_I16
1897
1898 Inst_VOP2__V_ASHRREV_I16::~Inst_VOP2__V_ASHRREV_I16()
1899 {
1900 } // ~Inst_VOP2__V_ASHRREV_I16
1901
1902 // --- description from .arch file ---
1903 // D.i[15:0] = signext(S1.i[15:0]) >> S0.i[3:0].
1904 // The vacated bits are set to the sign bit of the input value.
1905 // SQ translates this to an internal SP opcode.
1906 void
1907 Inst_VOP2__V_ASHRREV_I16::execute(GPUDynInstPtr gpuDynInst)
1908 {
1909 Wavefront *wf = gpuDynInst->wavefront();
1910 ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
1911 ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1);
1912 VecOperandI16 vdst(gpuDynInst, instData.VDST);
1913
1914 src0.readSrc();
1915 src1.read();
1916
1917 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1918 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1919
1920 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1921 if (wf->execMask(lane)) {
1922 vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0);
1923 }
1924 }
1925
1926 vdst.write();
1927 } // execute
1928 // --- Inst_VOP2__V_MAX_F16 class methods ---
1929
1930 Inst_VOP2__V_MAX_F16::Inst_VOP2__V_MAX_F16(InFmt_VOP2 *iFmt)
1931 : Inst_VOP2(iFmt, "v_max_f16")
1932 {
1933 setFlag(ALU);
1934 setFlag(F16);
1935 } // Inst_VOP2__V_MAX_F16
1936
1937 Inst_VOP2__V_MAX_F16::~Inst_VOP2__V_MAX_F16()
1938 {
1939 } // ~Inst_VOP2__V_MAX_F16
1940
1941 // --- description from .arch file ---
1942 // D.f16 = max(S0.f16, S1.f16).
1943 // IEEE compliant. Supports denormals, round mode, exception flags,
1944 // saturation.
1945 void
1946 Inst_VOP2__V_MAX_F16::execute(GPUDynInstPtr gpuDynInst)
1947 {
1948 panicUnimplemented();
1949 } // execute
1950 // --- Inst_VOP2__V_MIN_F16 class methods ---
1951
1952 Inst_VOP2__V_MIN_F16::Inst_VOP2__V_MIN_F16(InFmt_VOP2 *iFmt)
1953 : Inst_VOP2(iFmt, "v_min_f16")
1954 {
1955 setFlag(ALU);
1956 setFlag(F16);
1957 } // Inst_VOP2__V_MIN_F16
1958
1959 Inst_VOP2__V_MIN_F16::~Inst_VOP2__V_MIN_F16()
1960 {
1961 } // ~Inst_VOP2__V_MIN_F16
1962
1963 // --- description from .arch file ---
1964 // D.f16 = min(S0.f16, S1.f16).
1965 // IEEE compliant. Supports denormals, round mode, exception flags,
1966 // saturation.
1967 void
1968 Inst_VOP2__V_MIN_F16::execute(GPUDynInstPtr gpuDynInst)
1969 {
1970 panicUnimplemented();
1971 } // execute
1972 // --- Inst_VOP2__V_MAX_U16 class methods ---
1973
1974 Inst_VOP2__V_MAX_U16::Inst_VOP2__V_MAX_U16(InFmt_VOP2 *iFmt)
1975 : Inst_VOP2(iFmt, "v_max_u16")
1976 {
1977 setFlag(ALU);
1978 } // Inst_VOP2__V_MAX_U16
1979
1980 Inst_VOP2__V_MAX_U16::~Inst_VOP2__V_MAX_U16()
1981 {
1982 } // ~Inst_VOP2__V_MAX_U16
1983
1984 // --- description from .arch file ---
1985 // D.u[15:0] = max(S0.u[15:0], S1.u[15:0]).
1986 void
1987 Inst_VOP2__V_MAX_U16::execute(GPUDynInstPtr gpuDynInst)
1988 {
1989 Wavefront *wf = gpuDynInst->wavefront();
1990 ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
1991 ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
1992 VecOperandU16 vdst(gpuDynInst, instData.VDST);
1993
1994 src0.readSrc();
1995 src1.read();
1996
1997 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
1998 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
1999
2000 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2001 if (wf->execMask(lane)) {
2002 vdst[lane] = std::max(src0[lane], src1[lane]);
2003 }
2004 }
2005
2006 vdst.write();
2007 } // execute
2008 // --- Inst_VOP2__V_MAX_I16 class methods ---
2009
2010 Inst_VOP2__V_MAX_I16::Inst_VOP2__V_MAX_I16(InFmt_VOP2 *iFmt)
2011 : Inst_VOP2(iFmt, "v_max_i16")
2012 {
2013 setFlag(ALU);
2014 } // Inst_VOP2__V_MAX_I16
2015
2016 Inst_VOP2__V_MAX_I16::~Inst_VOP2__V_MAX_I16()
2017 {
2018 } // ~Inst_VOP2__V_MAX_I16
2019
2020 // --- description from .arch file ---
2021 // D.i[15:0] = max(S0.i[15:0], S1.i[15:0]).
2022 void
2023 Inst_VOP2__V_MAX_I16::execute(GPUDynInstPtr gpuDynInst)
2024 {
2025 Wavefront *wf = gpuDynInst->wavefront();
2026 ConstVecOperandI16 src0(gpuDynInst, instData.SRC0);
2027 ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1);
2028 VecOperandI16 vdst(gpuDynInst, instData.VDST);
2029
2030 src0.readSrc();
2031 src1.read();
2032
2033 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
2034 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2035
2036 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2037 if (wf->execMask(lane)) {
2038 vdst[lane] = std::max(src0[lane], src1[lane]);
2039 }
2040 }
2041
2042 vdst.write();
2043 } // execute
2044 // --- Inst_VOP2__V_MIN_U16 class methods ---
2045
2046 Inst_VOP2__V_MIN_U16::Inst_VOP2__V_MIN_U16(InFmt_VOP2 *iFmt)
2047 : Inst_VOP2(iFmt, "v_min_u16")
2048 {
2049 setFlag(ALU);
2050 } // Inst_VOP2__V_MIN_U16
2051
2052 Inst_VOP2__V_MIN_U16::~Inst_VOP2__V_MIN_U16()
2053 {
2054 } // ~Inst_VOP2__V_MIN_U16
2055
2056 // --- description from .arch file ---
2057 // D.u[15:0] = min(S0.u[15:0], S1.u[15:0]).
2058 void
2059 Inst_VOP2__V_MIN_U16::execute(GPUDynInstPtr gpuDynInst)
2060 {
2061 Wavefront *wf = gpuDynInst->wavefront();
2062 ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
2063 ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
2064 VecOperandU16 vdst(gpuDynInst, instData.VDST);
2065
2066 src0.readSrc();
2067 src1.read();
2068
2069 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
2070 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2071
2072 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2073 if (wf->execMask(lane)) {
2074 vdst[lane] = std::min(src0[lane], src1[lane]);
2075 }
2076 }
2077
2078 vdst.write();
2079 } // execute
2080 // --- Inst_VOP2__V_MIN_I16 class methods ---
2081
2082 Inst_VOP2__V_MIN_I16::Inst_VOP2__V_MIN_I16(InFmt_VOP2 *iFmt)
2083 : Inst_VOP2(iFmt, "v_min_i16")
2084 {
2085 setFlag(ALU);
2086 } // Inst_VOP2__V_MIN_I16
2087
2088 Inst_VOP2__V_MIN_I16::~Inst_VOP2__V_MIN_I16()
2089 {
2090 } // ~Inst_VOP2__V_MIN_I16
2091
2092 // --- description from .arch file ---
2093 // D.i[15:0] = min(S0.i[15:0], S1.i[15:0]).
2094 void
2095 Inst_VOP2__V_MIN_I16::execute(GPUDynInstPtr gpuDynInst)
2096 {
2097 Wavefront *wf = gpuDynInst->wavefront();
2098 ConstVecOperandI16 src0(gpuDynInst, instData.SRC0);
2099 ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1);
2100 VecOperandI16 vdst(gpuDynInst, instData.VDST);
2101
2102 src0.readSrc();
2103 src1.read();
2104
2105 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
2106 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2107
2108 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2109 if (wf->execMask(lane)) {
2110 vdst[lane] = std::min(src0[lane], src1[lane]);
2111 }
2112 }
2113
2114 vdst.write();
2115 } // execute
2116 // --- Inst_VOP2__V_LDEXP_F16 class methods ---
2117
2118 Inst_VOP2__V_LDEXP_F16::Inst_VOP2__V_LDEXP_F16(InFmt_VOP2 *iFmt)
2119 : Inst_VOP2(iFmt, "v_ldexp_f16")
2120 {
2121 setFlag(ALU);
2122 setFlag(F16);
2123 } // Inst_VOP2__V_LDEXP_F16
2124
2125 Inst_VOP2__V_LDEXP_F16::~Inst_VOP2__V_LDEXP_F16()
2126 {
2127 } // ~Inst_VOP2__V_LDEXP_F16
2128
2129 // --- description from .arch file ---
2130 // D.f16 = S0.f16 * (2 ** S1.i16).
2131 void
2132 Inst_VOP2__V_LDEXP_F16::execute(GPUDynInstPtr gpuDynInst)
2133 {
2134 panicUnimplemented();
2135 } // execute
2136 // --- Inst_VOP2__V_ADD_U32 class methods ---
2137
2138 Inst_VOP2__V_ADD_U32::Inst_VOP2__V_ADD_U32(InFmt_VOP2 *iFmt)
2139 : Inst_VOP2(iFmt, "v_add_u32")
2140 {
2141 setFlag(ALU);
2142 } // Inst_VOP2__V_ADD_U32
2143
2144 Inst_VOP2__V_ADD_U32::~Inst_VOP2__V_ADD_U32()
2145 {
2146 } // ~Inst_VOP2__V_ADD_U32
2147
2148 // --- description from .arch file ---
2149 // D.u = S0.u + S1.u;
2150 void
2151 Inst_VOP2__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst)
2152 {
2153 Wavefront *wf = gpuDynInst->wavefront();
2154 ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
2155 VecOperandU32 src1(gpuDynInst, instData.VSRC1);
2156 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2157
2158 src0.readSrc();
2159 src1.read();
2160
2161 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2162
2163 if (isSDWAInst()) {
2164 VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
2165 // use copies of original src0, src1, and dest during selecting
2166 VecOperandU32 origSrc0_sdwa(gpuDynInst,
2167 extData.iFmt_VOP_SDWA.SRC0);
2168 VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
2169 VecOperandU32 origVdst(gpuDynInst, instData.VDST);
2170
2171 src0_sdwa.read();
2172 origSrc0_sdwa.read();
2173 origSrc1.read();
2174
2175 DPRINTF(VEGA, "Handling V_ADD_U32 SRC SDWA. SRC0: register v[%d], "
2176 "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, "
2177 "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, "
2178 "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
2179 extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL,
2180 extData.iFmt_VOP_SDWA.DST_U,
2181 extData.iFmt_VOP_SDWA.CLMP,
2182 extData.iFmt_VOP_SDWA.SRC0_SEL,
2183 extData.iFmt_VOP_SDWA.SRC0_SEXT,
2184 extData.iFmt_VOP_SDWA.SRC0_NEG,
2185 extData.iFmt_VOP_SDWA.SRC0_ABS,
2186 extData.iFmt_VOP_SDWA.SRC1_SEL,
2187 extData.iFmt_VOP_SDWA.SRC1_SEXT,
2188 extData.iFmt_VOP_SDWA.SRC1_NEG,
2189 extData.iFmt_VOP_SDWA.SRC1_ABS);
2190
2191 processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
2192 src1, origSrc1);
2193
2194 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2195 if (wf->execMask(lane)) {
2196 vdst[lane] = src0_sdwa[lane] + src1[lane];
2197 origVdst[lane] = vdst[lane]; // keep copy consistent
2198 }
2199 }
2200
2201 processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
2202 } else {
2203 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2204 if (wf->execMask(lane)) {
2205 vdst[lane] = src0[lane] + src1[lane];
2206 }
2207 }
2208 }
2209
2210 vdst.write();
2211 } // execute
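 // The SDWA path above keeps pristine copies (origSrc0_sdwa, origSrc1,
 // origVdst) because processSDWA_src rewrites the working sources
 // lane-by-lane according to the SRCx_SEL/SEXT/NEG/ABS fields before the
 // add, and processSDWA_dst then merges the sum back into the destination
 // under DST_SEL/DST_U control. A minimal standalone sketch of the core
 // sub-dword idea (sdwaSelectByteSext is a hypothetical helper for
 // illustration, not gem5's processSDWA_src):
 //
 //   #include <cstdint>
 //   #include <cstdio>
 //
 //   // Select byte 'sel' of a 32-bit source and sign-extend it --
 //   // roughly what SRCx_SEL = BYTE_n with SRCx_SEXT = 1 requests.
 //   uint32_t sdwaSelectByteSext(uint32_t src, int sel)
 //   {
 //       int8_t b = int8_t((src >> (8 * sel)) & 0xff);
 //       return uint32_t(int32_t(b)); // sign-extend to 32 bits
 //   }
 //
 //   int main()
 //   {
 //       // byte 1 of 0x1234ff00 is 0xff -> sign-extends to 0xffffffff
 //       std::printf("0x%08x\n", unsigned(sdwaSelectByteSext(0x1234ff00u, 1)));
 //       return 0;
 //   }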
2212 // --- Inst_VOP2__V_SUB_U32 class methods ---
2213
2214 Inst_VOP2__V_SUB_U32::Inst_VOP2__V_SUB_U32(InFmt_VOP2 *iFmt)
2215 : Inst_VOP2(iFmt, "v_sub_u32")
2216 {
2217 setFlag(ALU);
2218 } // Inst_VOP2__V_SUB_U32
2219
2220 Inst_VOP2__V_SUB_U32::~Inst_VOP2__V_SUB_U32()
2221 {
2222 } // ~Inst_VOP2__V_SUB_U32
2223
2224 // --- description from .arch file ---
2225 // D.u = S0.u - S1.u;
2226 void
2227 Inst_VOP2__V_SUB_U32::execute(GPUDynInstPtr gpuDynInst)
2228 {
2229 Wavefront *wf = gpuDynInst->wavefront();
2230 ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
2231 ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
2232 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2233
2234 src0.readSrc();
2235 src1.read();
2236
2237 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
2238 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2239
2240 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2241 if (wf->execMask(lane)) {
2242 vdst[lane] = src0[lane] - src1[lane];
2243 }
2244 }
2245
2246 vdst.write();
2247 } // execute
2248 // --- Inst_VOP2__V_SUBREV_U32 class methods ---
2249
2250 Inst_VOP2__V_SUBREV_U32::Inst_VOP2__V_SUBREV_U32(InFmt_VOP2 *iFmt)
2251 : Inst_VOP2(iFmt, "v_subrev_u32")
2252 {
2253 setFlag(ALU);
2254 } // Inst_VOP2__V_SUBREV_U32
2255
2256 Inst_VOP2__V_SUBREV_U32::~Inst_VOP2__V_SUBREV_U32()
2257 {
2258 } // ~Inst_VOP2__V_SUBREV_U32
2259
2260 // --- description from .arch file ---
2261 // D.u = S1.u - S0.u;
2262 void
2263 Inst_VOP2__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst)
2264 {
2265 Wavefront *wf = gpuDynInst->wavefront();
2266 ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
2267 ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
2268 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2269
2270 src0.readSrc();
2271 src1.read();
2272
2273 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
2274 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2275
2276 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2277 if (wf->execMask(lane)) {
2278 vdst[lane] = src1[lane] - src0[lane];
2279 }
2280 }
2281
2282 vdst.write();
2283 } // execute
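 // v_subrev_u32 is v_sub_u32 with the operands swapped (D = S1 - S0).
 // In the VOP2 encoding only SRC0 can name a scalar register or inline
 // constant, so the *rev form is what makes "vector minus constant"
 // expressible in a single VOP2 instruction. A standalone sketch of one
 // lane, including the unsigned wraparound both ops share:
 //
 //   #include <cstdint>
 //   #include <cstdio>
 //
 //   int main()
 //   {
 //       uint32_t s0 = 10, s1 = 3; // s0 could be an inline constant
 //       std::printf("v_sub_u32:    %u\n", s0 - s1); // 7
 //       std::printf("v_subrev_u32: %u\n", s1 - s0); // 4294967289 (wraps)
 //       return 0;
 //   }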
2284 // --- Inst_VOP2__V_FMAC_F32 class methods ---
2285
2286 Inst_VOP2__V_FMAC_F32::Inst_VOP2__V_FMAC_F32(InFmt_VOP2 *iFmt)
2287 : Inst_VOP2(iFmt, "v_fmac_f32")
2288 {
2289 setFlag(ALU);
2290 } // Inst_VOP2__V_FMAC_F32
2291
2292 Inst_VOP2__V_FMAC_F32::~Inst_VOP2__V_FMAC_F32()
2293 {
2294 } // ~Inst_VOP2__V_FMAC_F32
2295
2296 // --- description from .arch file ---
2297 // D.f = S0.f * S1.f + D.f.
2298 void
2299 Inst_VOP2__V_FMAC_F32::execute(GPUDynInstPtr gpuDynInst)
2300 {
2301 Wavefront *wf = gpuDynInst->wavefront();
2302 ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
2303 ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
2304 VecOperandF32 vdst(gpuDynInst, instData.VDST);
2305
2306 src0.readSrc();
2307 src1.read();
2308 vdst.read();
2309
2310 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
2311 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2312
2313 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2314 if (wf->execMask(lane)) {
2315 vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]);
2316 }
2317 }
2318
2319 vdst.write();
2320 } // execute
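 // Note the vdst.read() above: FMAC treats the destination as a third
 // source (an accumulator), and std::fma applies a single rounding to
 // a*b + c. A standalone sketch of why fused and unfused accumulation
 // differ (assumes the compiler is not already contracting a*b + c into
 // an fma itself, e.g. built with -ffp-contract=off):
 //
 //   #include <cmath>
 //   #include <cstdio>
 //
 //   int main()
 //   {
 //       float a = 1e8f, b = 1.0f + 1e-7f, c = -1e8f;
 //       float unfused = a * b + c;         // rounds the product first
 //       float fused   = std::fma(a, b, c); // rounds once at the end
 //       std::printf("%g vs %g\n", unfused, fused); // the values differ
 //       return 0;
 //   }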
2321 // --- Inst_VOP2__V_FMAC_F64 class methods ---
2322
2323 Inst_VOP2__V_FMAC_F64::Inst_VOP2__V_FMAC_F64(InFmt_VOP2 *iFmt)
2324 : Inst_VOP2(iFmt, "v_fmac_f64")
2325 {
2326 setFlag(ALU);
2327 } // Inst_VOP2__V_FMAC_F64
2328
2329 Inst_VOP2__V_FMAC_F64::~Inst_VOP2__V_FMAC_F64()
2330 {
2331 } // ~Inst_VOP2__V_FMAC_F64
2332
2333 // --- description from .arch file ---
2334 // D0.f64 = fma(S0.f64, S1.f64, D0.f64)
2335 void
2336 Inst_VOP2__V_FMAC_F64::execute(GPUDynInstPtr gpuDynInst)
2337 {
2338 Wavefront *wf = gpuDynInst->wavefront();
2339 ConstVecOperandF64 src0(gpuDynInst, instData.SRC0);
2340 ConstVecOperandF64 src1(gpuDynInst, instData.VSRC1);
2341 VecOperandF64 vdst(gpuDynInst, instData.VDST);
2342
2343 src0.readSrc();
2344 src1.read();
2345 vdst.read();
2346
2347 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
2348 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2349
2350 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2351 if (wf->execMask(lane)) {
2352 vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]);
2353 }
2354 }
2355
2356 vdst.write();
2357 } // execute
2358 // --- Inst_VOP2__V_XNOR_B32 class methods ---
2359
2360 Inst_VOP2__V_XNOR_B32::Inst_VOP2__V_XNOR_B32(InFmt_VOP2 *iFmt)
2361 : Inst_VOP2(iFmt, "v_xnor_b32")
2362 {
2363 setFlag(ALU);
2364 } // Inst_VOP2__V_XNOR_B32
2365
2366 Inst_VOP2__V_XNOR_B32::~Inst_VOP2__V_XNOR_B32()
2367 {
2368 } // ~Inst_VOP2__V_XNOR_B32
2369
2370 // --- description from .arch file ---
2371 // D.u = ~(S0.u ^ S1.u).
2372 void
2374 {
2375 Wavefront *wf = gpuDynInst->wavefront();
2376 ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
2377 ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
2378 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2379
2380 src0.readSrc();
2381 src1.read();
2382 vdst.read();
2383
2384 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
2385 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2386
2387 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2388 if (wf->execMask(lane)) {
2389 vdst[lane] = ~(src0[lane] ^ src1[lane]);
2390 }
2391 }
2392
2393 vdst.write();
2394 } // execute
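 // XNOR produces a per-bit equality mask: each result bit is set exactly
 // where S0 and S1 agree. A one-lane standalone sketch:
 //
 //   #include <cstdint>
 //   #include <cstdio>
 //
 //   int main()
 //   {
 //       uint32_t s0 = 0b1010, s1 = 0b0110;
 //       std::printf("0x%08x\n", ~(s0 ^ s1)); // 0xfffffff3
 //       return 0;
 //   }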
2395} // namespace VegaISA
2396} // namespace gem5