gem5 [DEVELOP-FOR-25.1]
Loading...
Searching...
No Matches
vop3p.cc
Go to the documentation of this file.
1/*
2 * Copyright (c) 2023 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
33
36
37namespace gem5
38{
39
40namespace VegaISA
41{
42
43using half = uint16_t;
44
45// Helper functions
// Saturate a signed accumulator to the range of an N-bit two's-complement
// integer, [-2^(N-1), 2^(N-1) - 1]. When the clamp flag is clear the value
// is passed through unchanged.
template<int N>
int32_t
dotClampI(int32_t value, bool clamp)
{
    // The shifts below are only well-defined for sub-word widths.
    static_assert(N < 32);

    if (!clamp) {
        return value;
    }

    const int32_t lo = -(1 << (N - 1));
    const int32_t hi = (1 << (N - 1)) - 1;
    return std::clamp<int32_t>(value, lo, hi);
}
61
// Saturate an unsigned accumulator to the range of an N-bit unsigned
// integer, [0, 2^N - 1]. When the clamp flag is clear the value is passed
// through unchanged.
//
// NOTE: the clamping must be performed in unsigned arithmetic. The previous
// implementation used std::clamp<int32_t>, which misinterprets values above
// INT32_MAX (e.g. a 16-bit x 16-bit product such as 0xFFFF * 0xFFFF =
// 0xFFFE0001) as negative and clamps them to 0 instead of the unsigned
// maximum. It also returned the unclamped value through an int32_t cast.
template<int N>
uint32_t
dotClampU(uint32_t value, bool clamp)
{
    // Only valid for N < 32
    static_assert(N < 32);

    if (!clamp) {
        return value;
    }

    constexpr uint32_t min = 0;
    constexpr uint32_t max = (1u << N) - 1;
    return std::clamp<uint32_t>(value, min, max);
}
77
// Saturate a 32-bit signed value to the int16_t range when the clamp flag
// is set; otherwise truncate the value to 16 bits.
int16_t
clampI16(int32_t value, bool clamp)
{
    if (clamp) {
        constexpr int32_t lo = std::numeric_limits<int16_t>::min();
        constexpr int32_t hi = std::numeric_limits<int16_t>::max();
        return std::clamp(value, lo, hi);
    }

    return static_cast<int16_t>(value);
}
89
// Saturate a 32-bit unsigned value to the uint16_t range when the clamp
// flag is set; otherwise truncate the value to 16 bits.
uint16_t
clampU16(uint32_t value, bool clamp)
{
    if (clamp) {
        constexpr uint32_t lo = std::numeric_limits<uint16_t>::min();
        constexpr uint32_t hi = std::numeric_limits<uint16_t>::max();
        return std::clamp(value, lo, hi);
    }

    return static_cast<uint16_t>(value);
}
101
102uint16_t
103clampF16(uint16_t value, bool clamp)
104{
105 if (!clamp) {
106 return value;
107 }
108
109 // Values of one and zero in fp16.
110 constexpr uint16_t one = 0x3c00;
111 constexpr uint16_t zero = 0x0;
112 ArmISA::FPSCR fpscr1, fpscr2;
113
114 // If value > one, set to one, then if value < zero set to zero.
115 uint16_t imm = fplibMin(value, one, fpscr1);
116 return fplibMax(imm, zero, fpscr2);
117}
118
// Clamp a 32-bit float to [0.0, 1.0] when the clamp flag is set; otherwise
// return the value unmodified.
float
clampF32(float value, bool clamp)
{
    return clamp ? std::clamp(value, 0.0f, 1.0f) : value;
}
128
129
130
131
132// Begin instruction execute definitions
134{
135 auto opImpl =
136 [](int16_t S0, int16_t S1, int16_t S2, bool clamp) -> int16_t
137 {
138 return clampI16(S0 * S1 + S2, clamp);
139 };
140
141 vop3pHelper<int16_t>(gpuDynInst, opImpl);
142}
143
144void
146{
147 auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t
148 {
149 // Only return lower 16 bits of result - This operation cannot clamp.
150 uint32_t D = S0 * S1;
151 uint16_t Dh = D & 0xFFFF;
152 return Dh;
153 };
154
155 vop3pHelper<uint16_t>(gpuDynInst, opImpl);
156}
157
159{
160 auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t
161 {
162 return clampI16(S0 + S1, clamp);
163 };
164
165 vop3pHelper<int16_t>(gpuDynInst, opImpl);
166}
167
169{
170 auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t
171 {
172 return clampI16(S0 - S1, clamp);
173 };
174
175 vop3pHelper<int16_t>(gpuDynInst, opImpl);
176}
177
179{
180 auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t
181 {
182 unsigned shift_val = bits(S0, 3, 0);
183
184 // Shift does not clamp
185 return S1 << shift_val;
186 };
187
188 vop3pHelper<uint16_t>(gpuDynInst, opImpl);
189}
190
192{
193 auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t
194 {
195 unsigned shift_val = bits(S0, 3, 0);
196
197 return S1 >> shift_val;
198 };
199
200 vop3pHelper<uint16_t>(gpuDynInst, opImpl);
201}
202
204{
205 auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t
206 {
207 // Sign extend to larger type to ensure we don't lose sign bits when
208 // shifting.
209 int32_t S1e = S1;
210 unsigned shift_val = bits(S0, 3, 0);
211
212 return S1e >> shift_val;
213 };
214
215 vop3pHelper<int16_t>(gpuDynInst, opImpl);
216}
217
219{
220 auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t
221 {
222 return clampI16((S0 >= S1) ? S0 : S1, clamp);
223 };
224
225 vop3pHelper<int16_t>(gpuDynInst, opImpl);
226}
227
229{
230 auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t
231 {
232 return clampI16((S0 < S1) ? S0 : S1, clamp);
233 };
234
235 vop3pHelper<int16_t>(gpuDynInst, opImpl);
236}
237
239{
240 auto opImpl =
241 [](uint16_t S0, uint16_t S1, uint16_t S2, bool clamp) -> uint16_t
242 {
243 return clampU16(S0 * S1 + S2, clamp);
244 };
245
246 vop3pHelper<uint16_t>(gpuDynInst, opImpl);
247}
248
250{
251 auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t
252 {
253 return clampU16(S0 + S1, clamp);
254 };
255
256 vop3pHelper<uint16_t>(gpuDynInst, opImpl);
257}
258
260{
261 auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t
262 {
263 return clampU16(S0 - S1, clamp);
264 };
265
266 vop3pHelper<uint16_t>(gpuDynInst, opImpl);
267}
268
270{
271 auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t
272 {
273 return clampU16((S0 >= S1) ? S0 : S1, clamp);
274 };
275
276 vop3pHelper<uint16_t>(gpuDynInst, opImpl);
277}
278
280{
281 auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t
282 {
283 return clampU16((S0 < S1) ? S0 : S1, clamp);
284 };
285
286 vop3pHelper<uint16_t>(gpuDynInst, opImpl);
287}
288
290{
291 auto opImpl = [](half S0, half S1, half S2, bool clamp) -> half
292 {
293 ArmISA::FPSCR fpscr;
294 return clampF16(fplibMulAdd(S2, S0, S1, fpscr), clamp);
295 };
296
297 vop3pHelper<half>(gpuDynInst, opImpl);
298}
299
301{
302 auto opImpl = [](half S0, half S1, bool clamp) -> half
303 {
304 ArmISA::FPSCR fpscr;
305 return clampF16(fplibAdd(S0, S1, fpscr), clamp);
306 };
307
308 vop3pHelper<half>(gpuDynInst, opImpl);
309}
310
312{
313 auto opImpl = [](half S0, half S1, bool clamp) -> half
314 {
315 ArmISA::FPSCR fpscr;
316 return clampF16(fplibMul(S0, S1, fpscr), clamp);
317 };
318
319 vop3pHelper<half>(gpuDynInst, opImpl);
320}
321
323{
324 auto opImpl = [](half S0, half S1, bool clamp) -> half
325 {
326 ArmISA::FPSCR fpscr;
327 return clampF16(fplibMin(S0, S1, fpscr), clamp);
328 };
329
330 vop3pHelper<half>(gpuDynInst, opImpl);
331}
332
334{
335 auto opImpl = [](half S0, half S1, bool clamp) -> half
336 {
337 ArmISA::FPSCR fpscr;
338 return clampF16(fplibMax(S0, S1, fpscr), clamp);
339 };
340
341 vop3pHelper<half>(gpuDynInst, opImpl);
342}
343
345{
346 auto opImpl =
347 [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t
348 {
349 constexpr unsigned INBITS = 16;
350
351 constexpr unsigned elems = 32 / INBITS;
352 half S0[elems];
353 half S1[elems];
354
355 for (int i = 0; i < elems; ++i) {
356 S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
357 S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
358 }
359
360 float S2 = *reinterpret_cast<float*>(&S2r);
361
362 // Compute components individually to prevent overflow across packing
363 half C[elems];
364 float Csum = 0.0f;
365
366 for (int i = 0; i < elems; ++i) {
367 ArmISA::FPSCR fpscr;
368 C[i] = fplibMul(S0[i], S1[i], fpscr);
369 uint32_t conv =
371 C[i], ArmISA::FPRounding_TIEEVEN, fpscr);
372 Csum += clampF32(*reinterpret_cast<float*>(&conv), clamp);
373 }
374
375 Csum += S2;
376 uint32_t rv = *reinterpret_cast<uint32_t*>(&Csum);
377
378 return rv;
379 };
380
381 dotHelper(gpuDynInst, opImpl);
382}
383
385{
386 // Do not use dotHelper here as OPSEL is ignored for this instruction.
387 Wavefront *wf = gpuDynInst->wavefront();
388 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
389 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
390 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
391 VecOperandU32 vdst(gpuDynInst, instData.VDST);
392
393 src0.readSrc();
394 src1.readSrc();
395 src2.readSrc();
396 vdst.read();
397
398 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
399 if (wf->execMask(lane)) {
401 a1.data = uint16_t(bits(src0[lane], 15, 0));
402 a2.data = uint16_t(bits(src0[lane], 31, 16));
403 b1.data = uint16_t(bits(src1[lane], 15, 0));
404 b2.data = uint16_t(bits(src1[lane], 31, 16));
405
406 if (instData.NEG_HI & 0x1) a2 = -a2;
407 if (instData.NEG_HI & 0x2) b2 = -b2;
408 if (extData.NEG & 0x1) a1 = -a1;
409 if (extData.NEG & 0x2) b1 = -b1;
410
411 vdst[lane] += float(a1) * float(b1);
412 vdst[lane] += float(a2) * float(b2);
413 vdst[lane] += src2[lane];
414
415 clampF32(vdst[lane], (bool)instData.CLMP);
416 }
417 }
418
419 vdst.write();
420}
421
423{
424 auto opImpl =
425 [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t
426 {
427 constexpr unsigned INBITS = 16;
428
429 constexpr unsigned elems = 32 / INBITS;
430 uint32_t S0[elems];
431 uint32_t S1[elems];
432
433 for (int i = 0; i < elems; ++i) {
434 S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
435 S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
436 }
437
438 int32_t S2 = *reinterpret_cast<int32_t*>(&S2r);
439
440 // Compute components individually to prevent overflow across packing
441 int32_t C[elems];
442 int32_t Csum = 0;
443
444 for (int i = 0; i < elems; ++i) {
445 C[i] = sext<INBITS>(S0[i]) * sext<INBITS>(S1[i]);
446 C[i] = sext<INBITS>(dotClampI<INBITS>(C[i], clamp) & mask(INBITS));
447 Csum += C[i];
448 }
449
450 Csum += S2;
451 uint32_t rv = *reinterpret_cast<uint32_t*>(&Csum);
452
453 return rv;
454 };
455
456 dotHelper(gpuDynInst, opImpl);
457}
458
460{
461 auto opImpl =
462 [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t
463 {
464 constexpr unsigned INBITS = 16;
465
466 constexpr unsigned elems = 32 / INBITS;
467 uint32_t S0[elems];
468 uint32_t S1[elems];
469
470 for (int i = 0; i < elems; ++i) {
471 S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
472 S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
473 }
474
475 // Compute components individually to prevent overflow across packing
476 uint32_t C[elems];
477 uint32_t Csum = 0;
478
479 for (int i = 0; i < elems; ++i) {
480 C[i] = S0[i] * S1[i];
481 C[i] = dotClampU<INBITS>(C[i], clamp);
482 Csum += C[i];
483 }
484
485 Csum += S2;
486
487 return Csum;
488 };
489
490 dotHelper(gpuDynInst, opImpl);
491}
492
494{
495 auto opImpl =
496 [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t
497 {
498 constexpr unsigned INBITS = 8;
499
500 constexpr unsigned elems = 32 / INBITS;
501 uint32_t S0[elems];
502 uint32_t S1[elems];
503
504 for (int i = 0; i < elems; ++i) {
505 S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
506 S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
507 }
508
509 int32_t S2 = *reinterpret_cast<int32_t*>(&S2r);
510
511 // Compute components individually to prevent overflow across packing
512 int32_t C[elems];
513 int32_t Csum = 0;
514
515 for (int i = 0; i < elems; ++i) {
516 C[i] = sext<INBITS>(S0[i]) * sext<INBITS>(S1[i]);
517 C[i] = sext<INBITS>(dotClampI<INBITS>(C[i], clamp) & mask(INBITS));
518 Csum += C[i];
519 }
520
521 Csum += S2;
522 uint32_t rv = *reinterpret_cast<uint32_t*>(&Csum);
523
524 return rv;
525 };
526
527 dotHelper(gpuDynInst, opImpl);
528}
529
531{
532 auto opImpl =
533 [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t
534 {
535 constexpr unsigned INBITS = 8;
536
537 constexpr unsigned elems = 32 / INBITS;
538 uint32_t S0[elems];
539 uint32_t S1[elems];
540
541 for (int i = 0; i < elems; ++i) {
542 S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
543 S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
544 }
545
546 // Compute components individually to prevent overflow across packing
547 uint32_t C[elems];
548 uint32_t Csum = 0;
549
550 for (int i = 0; i < elems; ++i) {
551 C[i] = S0[i] * S1[i];
552 C[i] = dotClampU<INBITS>(C[i], clamp);
553 Csum += C[i];
554 }
555
556 Csum += S2;
557
558 return Csum;
559 };
560
561 dotHelper(gpuDynInst, opImpl);
562}
563
565{
566 auto opImpl =
567 [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t
568 {
569 constexpr unsigned INBITS = 4;
570
571 constexpr unsigned elems = 32 / INBITS;
572 uint32_t S0[elems];
573 uint32_t S1[elems];
574
575 for (int i = 0; i < elems; ++i) {
576 S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
577 S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
578 }
579
580 int32_t S2 = *reinterpret_cast<int32_t*>(&S2r);
581
582 // Compute components individually to prevent overflow across packing
583 int32_t C[elems];
584 int32_t Csum = 0;
585
586 for (int i = 0; i < elems; ++i) {
587 C[i] = sext<INBITS>(S0[i]) * sext<INBITS>(S1[i]);
588 C[i] = sext<INBITS>(dotClampI<INBITS>(C[i], clamp) & mask(INBITS));
589 Csum += C[i];
590 }
591
592 Csum += S2;
593 uint32_t rv = *reinterpret_cast<uint32_t*>(&Csum);
594
595 return rv;
596 };
597
598 dotHelper(gpuDynInst, opImpl);
599}
600
602{
603 auto opImpl =
604 [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t
605 {
606 constexpr unsigned INBITS = 4;
607
608 constexpr unsigned elems = 32 / INBITS;
609 uint32_t S0[elems];
610 uint32_t S1[elems];
611
612 for (int i = 0; i < elems; ++i) {
613 S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
614 S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
615 }
616
617 // Compute components individually to prevent overflow across packing
618 uint32_t C[elems];
619 uint32_t Csum = 0;
620
621 for (int i = 0; i < elems; ++i) {
622 C[i] = S0[i] * S1[i];
623 C[i] = dotClampU<INBITS>(C[i], clamp);
624 Csum += C[i];
625 }
626
627 Csum += S2;
628
629 return Csum;
630 };
631
632 dotHelper(gpuDynInst, opImpl);
633}
634
636{
637 Wavefront *wf = gpuDynInst->wavefront();
638 unsigned accum_offset = wf->accumOffset;
639
640 ConstVecOperandU32 src(gpuDynInst, extData.SRC0+accum_offset);
641 VecOperandU32 vdst(gpuDynInst, instData.VDST);
642
643 src.readSrc();
644
645 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
646 if (wf->execMask(lane)) {
647 vdst[lane] = src[lane];
648 }
649 }
650
651 vdst.write();
652}
653
655{
656 Wavefront *wf = gpuDynInst->wavefront();
657 unsigned accum_offset = wf->accumOffset;
658
659 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
660 VecOperandU32 vdst(gpuDynInst, instData.VDST+accum_offset);
661
662 src.readSrc();
663
664 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
665 if (wf->execMask(lane)) {
666 vdst[lane] = src[lane];
667 }
668 }
669
670 vdst.write();
671}
672
673// --- Inst_VOP3P__V_PK_FMA_F32 class methods ---
674
676 : Inst_VOP3P(iFmt, "v_pk_fma_f32")
677{
678 setFlag(ALU);
679} // Inst_VOP3P__V_PK_FMA_F32
680
682{
683} // ~Inst_VOP3P__V_PK_FMA_F32
684
685// D.f[63:32] = S0.f[63:32] * S1.f[63:32] + S2.f[63:32] . D.f[31:0] =
686// S0.f[31:0] * S1.f[31:0] + S2.f[31:0] .
687void
689{
690 // This is a special case of packed instructions which operates on
691 // 64-bit inputs/outputs and not 32-bit. U64 is used here as float
692 // values cannot use bitwise operations. Consider the U64 to imply
693 // untyped 64-bits of data.
694 Wavefront *wf = gpuDynInst->wavefront();
695 ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
696 ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
697 ConstVecOperandU64 src2(gpuDynInst, extData.SRC2);
698 VecOperandU64 vdst(gpuDynInst, instData.VDST);
699
700 src0.readSrc();
701 src1.readSrc();
702 src2.readSrc();
703
704 int opsel = instData.OPSEL;
705 int opsel_hi = extData.OPSEL_HI | (instData.OPSEL_HI2 << 2);
706
707 int neg = extData.NEG;
708 int neg_hi = instData.NEG_HI;
709
710 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
711 if (wf->execMask(lane)) {
712 uint32_t s0l = (opsel & 1) ? bits(src0[lane], 63, 32)
713 : bits(src0[lane], 31, 0);
714 uint32_t s1l = (opsel & 2) ? bits(src1[lane], 63, 32)
715 : bits(src1[lane], 31, 0);
716 uint32_t s2l = (opsel & 4) ? bits(src2[lane], 63, 32)
717 : bits(src2[lane], 31, 0);
718
719 float s0lf = *reinterpret_cast<float*>(&s0l);
720 float s1lf = *reinterpret_cast<float*>(&s1l);
721 float s2lf = *reinterpret_cast<float*>(&s2l);
722
723 if (neg & 1) s0lf = -s0lf;
724 if (neg & 1) s1lf = -s1lf;
725 if (neg & 1) s2lf = -s2lf;
726
727 float dword1 = std::fma(s0lf, s1lf, s2lf);
728
729 uint32_t s0h = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
730 : bits(src0[lane], 31, 0);
731 uint32_t s1h = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
732 : bits(src1[lane], 31, 0);
733 uint32_t s2h = (opsel_hi & 4) ? bits(src2[lane], 63, 32)
734 : bits(src2[lane], 31, 0);
735
736 float s0hf = *reinterpret_cast<float*>(&s0h);
737 float s1hf = *reinterpret_cast<float*>(&s1h);
738 float s2hf = *reinterpret_cast<float*>(&s2h);
739
740 if (neg_hi & 1) s0hf = -s0hf;
741 if (neg_hi & 1) s1hf = -s1hf;
742 if (neg_hi & 1) s2hf = -s2hf;
743
744 float dword2 = std::fma(s0hf, s1hf, s2hf);
745
746 uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
747 uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);
748
749 vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1;
750 }
751 }
752
753 vdst.write();
754} // execute
755// --- Inst_VOP3P__V_PK_MUL_F32 class methods ---
756
758 : Inst_VOP3P(iFmt, "v_pk_mul_f32")
759{
760 setFlag(ALU);
761} // Inst_VOP3P__V_PK_MUL_F32
762
764{
765} // ~Inst_VOP3P__V_PK_MUL_F32
766
767// D.f[63:32] = S0.f[63:32] * S1.f[63:32] . D.f[31:0] = S0.f[31:0] *
768// S1.f[31:0]
769void
771{
772 // This is a special case of packed instructions which operates on
773 // 64-bit inputs/outputs and not 32-bit. U64 is used here as float
774 // values cannot use bitwise operations. Consider the U64 to imply
775 // untyped 64-bits of data.
776 Wavefront *wf = gpuDynInst->wavefront();
777 ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
778 ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
779 VecOperandU64 vdst(gpuDynInst, instData.VDST);
780
781 src0.readSrc();
782 src1.readSrc();
783
784 int opsel = instData.OPSEL;
785 int opsel_hi = extData.OPSEL_HI;
786
787 int neg = extData.NEG;
788 int neg_hi = instData.NEG_HI;
789
790 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
791 if (wf->execMask(lane)) {
792 uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
793 : bits(src0[lane], 31, 0);
794 uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
795 : bits(src1[lane], 31, 0);
796
797 float ldwordf = *reinterpret_cast<float*>(&lower_dword);
798 float udwordf = *reinterpret_cast<float*>(&upper_dword);
799
800 if (neg & 1) ldwordf = -ldwordf;
801 if (neg & 2) udwordf = -udwordf;
802
803 float dword1 = ldwordf * udwordf;
804
805 lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
806 : bits(src0[lane], 31, 0);
807 upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
808 : bits(src1[lane], 31, 0);
809
810 ldwordf = *reinterpret_cast<float*>(&lower_dword);
811 udwordf = *reinterpret_cast<float*>(&upper_dword);
812
813 if (neg_hi & 1) ldwordf = -ldwordf;
814 if (neg_hi & 2) udwordf = -udwordf;
815
816 float dword2 = ldwordf * udwordf;
817
818 uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
819 uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);
820
821 vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1;
822 }
823 }
824
825 vdst.write();
826} // execute
827// --- Inst_VOP3P__V_PK_ADD_F32 class methods ---
828
830 : Inst_VOP3P(iFmt, "v_pk_add_f32")
831{
832 setFlag(ALU);
833} // Inst_VOP3P__V_PK_ADD_F32
834
836{
837} // ~Inst_VOP3P__V_PK_ADD_F32
838
839// D.f[63:32] = S0.f[63:32] + S1.f[63:32] . D.f[31:0] = S0.f[31:0] +
840// S1.f[31:0]
841void
843{
844 // This is a special case of packed instructions which operates on
845 // 64-bit inputs/outputs and not 32-bit. U64 is used here as float
846 // values cannot use bitwise operations. Consider the U64 to imply
847 // untyped 64-bits of data.
848 Wavefront *wf = gpuDynInst->wavefront();
849 ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
850 ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
851 VecOperandU64 vdst(gpuDynInst, instData.VDST);
852
853 src0.readSrc();
854 src1.readSrc();
855
856 panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
857 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
858
859 int opsel = instData.OPSEL;
860 int opsel_hi = extData.OPSEL_HI;
861
862 int neg = extData.NEG;
863 int neg_hi = instData.NEG_HI;
864
865 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
866 if (wf->execMask(lane)) {
867 uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
868 : bits(src0[lane], 31, 0);
869 uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
870 : bits(src1[lane], 31, 0);
871
872 float ldwordf = *reinterpret_cast<float*>(&lower_dword);
873 float udwordf = *reinterpret_cast<float*>(&upper_dword);
874
875 if (neg & 1) ldwordf = -ldwordf;
876 if (neg & 2) udwordf = -udwordf;
877
878 float dword1 = ldwordf + udwordf;
879
880 lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
881 : bits(src0[lane], 31, 0);
882 upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
883 : bits(src1[lane], 31, 0);
884
885 ldwordf = *reinterpret_cast<float*>(&lower_dword);
886 udwordf = *reinterpret_cast<float*>(&upper_dword);
887
888 if (neg_hi & 1) ldwordf = -ldwordf;
889 if (neg_hi & 2) udwordf = -udwordf;
890
891 float dword2 = ldwordf + udwordf;
892
893 uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
894 uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);
895
896 vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1;
897 }
898 }
899
900 vdst.write();
901} // execute
902// --- Inst_VOP3P__V_PK_MOV_B32 class methods ---
903
905 : Inst_VOP3P(iFmt, "v_pk_mov_b32")
906{
907 setFlag(ALU);
908} // Inst_VOP3P__V_PK_MOV_B32
909
911{
912} // ~Inst_VOP3P__V_PK_MOV_B32
913
914// D.u[63:32] = S1.u[31:0]; D.u[31:0] = S0.u[31:0].
915void
917{
918 // This is a special case of packed instructions which operates on
919 // 64-bit inputs/outputs and not 32-bit.
920 Wavefront *wf = gpuDynInst->wavefront();
921 ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
922 ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
923 VecOperandU64 vdst(gpuDynInst, instData.VDST);
924
925 src0.readSrc();
926 src1.readSrc();
927
928 // Only OPSEL[1:0] are used
929 // OPSEL[0] 0/1: Lower dest dword = lower/upper dword of src0
930 int opsel = instData.OPSEL;
931
932 warn_if(instData.NEG_HI || extData.NEG,
933 "Negative modifier undefined for %s", _opcode);
934
935 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
936 if (wf->execMask(lane)) {
937 // OPSEL[1] 0/1: Lower dest dword = lower/upper dword of src1
938 uint64_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
939 : bits(src0[lane], 31, 0);
940 uint64_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
941 : bits(src1[lane], 31, 0);
942
943 vdst[lane] = upper_dword << 32 | lower_dword;
944 }
945 }
946
947 vdst.write();
948} // execute
949// --- Inst_VOP3P__V_MFMA_LOAD_SCALE class methods ---
950
952 : Inst_VOP3P(iFmt, "v_mfma_load_scale")
953{
954 setFlag(ALU);
955} // Inst_VOP3P__V_MFMA_LOAD_SCALE
956
959
960void
962{
963 // This is implemented differently in gem5 to avoid needing to change a
964 // large amount of code to handle a 4-dword instruction. Instead, we
965 // implement a fake VOP3P instruction which is assumed to come before an
966 // MFMA instruction.
967 //
968 // See https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/
969 // instruction-set-architectures/
970 // amd-instinct-cdna4-instruction-set-architecture.pdf
971 // section 7.2.1 for details.
972 Wavefront *wf = gpuDynInst->wavefront();
973 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
974 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
975
976 src0.readSrc();
977 src1.readSrc();
978
979 if (isVectorReg(extData.SRC0)) {
980 int opsel = ((extData.OPSEL_HI & 1) << 1) | (instData.OPSEL & 1);
981
982 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
983 wf->setMfmaAScale(lane,
984 bits(src0[lane], opsel * 8 + 7, opsel * 8));
985 }
986 } else {
987 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
988 wf->setMfmaAScale(lane, bits(src0[lane], 30, 23));
989 }
990 }
991
992 if (isVectorReg(extData.SRC1)) {
993 int opsel = ((extData.OPSEL_HI & 2) << 1) | (instData.OPSEL & 2);
994
995 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
996 wf->setMfmaBScale(lane,
997 bits(src1[lane], opsel * 8 + 7, opsel * 8));
998 }
999 } else {
1000 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1001 wf->setMfmaBScale(lane, bits(src1[lane], 30, 23));
1002 }
1003 }
1004}
1005
1006} // namespace VegaISA
1007} // namespace gem5
uint32_t data
Definition mxfp.hh:112
void setFlag(Flags flag)
const std::string _opcode
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:635
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:654
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:384
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:344
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:422
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:459
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:493
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:530
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:564
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:601
void execute(GPUDynInstPtr) override
Definition vop3p.cc:961
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:300
void execute(GPUDynInstPtr) override
Definition vop3p.cc:842
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:158
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:249
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:203
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:289
void execute(GPUDynInstPtr) override
Definition vop3p.cc:688
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:178
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:191
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:133
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:238
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:333
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:218
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:269
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:322
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:228
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:279
void execute(GPUDynInstPtr) override
Definition vop3p.cc:916
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:311
void execute(GPUDynInstPtr) override
Definition vop3p.cc:770
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:145
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:168
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:259
void dotHelper(GPUDynInstPtr gpuDynInst, uint32_t(*fOpImpl)(uint32_t, uint32_t, uint32_t, bool))
void vop3pHelper(GPUDynInstPtr gpuDynInst, T(*fOpImpl)(T, T, bool))
Inst_VOP3P(InFmt_VOP3P *, const std::string &opcode)
void read() override
read from the vrf.
Definition operand.hh:148
void readSrc()
certain vector operands can read from the vrf/srf or constants.
Definition operand.hh:132
void write() override
write to the vrf.
Definition operand.hh:203
uint32_t accumOffset
Definition wavefront.hh:138
void setMfmaBScale(int idx, uint8_t value)
VectorMask & execMask()
void setMfmaAScale(int idx, uint8_t value)
Floating-point library code, which will gradually replace vfp.hh.
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
Definition bitfield.hh:79
constexpr uint64_t sext(uint64_t val)
Sign-extend an N-bit value to 64 bits.
Definition bitfield.hh:129
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Definition logging.hh:246
#define warn_if(cond,...)
Conditional warning macro that checks the supplied condition and only prints a warning if the conditi...
Definition logging.hh:315
mxfp< fp16_e8m7_info > mxbfloat16
Definition mxfp_types.hh:49
Bitfield< 22 > a1
Bitfield< 3, 0 > mask
Definition pcstate.hh:63
Bitfield< 7, 0 > imm
Definition types.hh:132
uint16_t fplibConvert(uint32_t op, FPRounding rounding, FPSCR &fpscr, FPCR fpcr)
Definition fplib.cc:3597
uint16_t fplibMul(uint16_t op1, uint16_t op2, FPSCR &fpscr, FPCR fpcr)
Definition fplib.cc:4820
uint16_t fplibMulAdd(uint16_t addend, uint16_t op1, uint16_t op2, FPSCR &fpscr, FPCR fpcr)
Definition fplib.cc:3876
Bitfield< 7 > i
Definition misc_types.hh:67
uint16_t fplibMax(uint16_t op1, uint16_t op2, FPSCR &fpscr, FPCR fpcr)
Definition fplib.cc:4646
uint16_t fplibMin(uint16_t op1, uint16_t op2, FPSCR &fpscr, FPCR fpcr)
Definition fplib.cc:4733
uint16_t fplibAdd(uint16_t op1, uint16_t op2, FPSCR &fpscr, FPCR fpcr)
Definition fplib.cc:3346
@ FPRounding_TIEEVEN
Definition fplib.hh:62
Bitfield< 7, 4 > b1
Definition qarma.hh:65
Bitfield< 11, 8 > b2
Definition qarma.hh:64
classes that represnt vector/scalar operands in VEGA ISA.
Definition faults.cc:39
int16_t clampI16(int32_t value, bool clamp)
Definition vop3p.cc:79
bool isVectorReg(int opIdx)
int32_t dotClampI(int32_t value, bool clamp)
Definition vop3p.cc:48
float clampF32(float value, bool clamp)
Definition vop3p.cc:120
VecOperand< VecElemU32, false > VecOperandU32
Definition operand.hh:829
VecOperand< VecElemU32, true > ConstVecOperandU32
Definition operand.hh:844
uint16_t clampF16(uint16_t value, bool clamp)
Definition vop3p.cc:103
uint16_t half
Definition vop3p.cc:43
const int NumVecElemPerVecReg(64)
VecOperand< VecElemU64, false > VecOperandU64
Definition operand.hh:832
uint32_t dotClampU(uint32_t value, bool clamp)
Definition vop3p.cc:64
VecOperand< VecElemU64, true > ConstVecOperandU64
Definition operand.hh:847
uint16_t clampU16(uint32_t value, bool clamp)
Definition vop3p.cc:91
Copyright (c) 2024 Arm Limited All rights reserved.
Definition binary32.hh:36
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition misc.hh:49

Generated on Mon Oct 27 2025 04:12:51 for gem5 by doxygen 1.14.0