release/v23-0-0-1/gcn3_2insts_2inst__util_8hh_source.html

/*

 * Copyright (c) 2015-2021 Advanced Micro Devices, Inc.

 * All rights reserved.

 *

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions are met:

 *

 * 1. Redistributions of source code must retain the above copyright notice,

 * this list of conditions and the following disclaimer.

 *

 * 2. Redistributions in binary form must reproduce the above copyright notice,

 * this list of conditions and the following disclaimer in the documentation

 * and/or other materials provided with the distribution.

 *

 * 3. Neither the name of the copyright holder nor the names of its

 * contributors may be used to endorse or promote products derived from this

 * software without specific prior written permission.

 *

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE

 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

 * POSSIBILITY OF SUCH DAMAGE.

 */


#ifndef __ARCH_GCN3_INSTS_INST_UTIL_HH__

#define __ARCH_GCN3_INSTS_INST_UTIL_HH__


#include <cmath>


#include "arch/amdgpu/gcn3/gpu_registers.hh"


namespace gem5

{


// Values of the SDWA (sub-dword addressing) select fields.  Each value names
// the byte, word, or full dword of a 32-bit value that an SDWA source or
// destination selection refers to.

enum SDWASelVals : int

{

    SDWA_BYTE_0 = 0, /* select data[7:0] */

    SDWA_BYTE_1 = 1, /* select data[15:8] */

    SDWA_BYTE_2 = 2, /* select data[23:16] */

    SDWA_BYTE_3 = 3, /* select data[31:24] */

    SDWA_WORD_0 = 4, /* select data[15:0] */

    SDWA_WORD_1 = 5, /* select data[31:16] */

    SDWA_DWORD  = 6  /* select data[31:0] */

};


// Values of the SDWA DST_UNUSED field: how the destination bits that are not
// covered by the destination select are filled in.

enum SDWADstVals : int

{

    SDWA_UNUSED_PAD      = 0, /* Pad all unused bits with 0 */

    SDWA_UNUSED_SEXT     = 1, /* Sign-extend upper bits; pad lower bits w/ 0 */

    SDWA_UNUSED_PRESERVE = 2  /* Preserve the unused bits; sdwaInstDstImpl_helper returns the value unchanged */

};


// Values of the DPP_CTRL field for DPP (data parallel primitive) operations.
// Only the first and last member of each contiguous range is named here;
// dppInstImpl decodes the ranges by comparing against these bounds.

enum SqDPPVals : int

{

    SQ_DPP_QUAD_PERM_MAX   = 0xFF,  /* 0x00-0xFF: DPP_QUAD_PERM{00:FF} */

    SQ_DPP_RESERVED        = 0x100, /* reserved; dppInstImpl panics on it */

    SQ_DPP_ROW_SL1         = 0x101, /* first of DPP_ROW_SL{1:15} */

    SQ_DPP_ROW_SL15        = 0x10F, /* last of DPP_ROW_SL{1:15} */

    SQ_DPP_ROW_SR1         = 0x111, /* first of DPP_ROW_SR{1:15} */

    SQ_DPP_ROW_SR15        = 0x11F, /* last of DPP_ROW_SR{1:15} */

    SQ_DPP_ROW_RR1         = 0x121, /* first of DPP_ROW_RR{1:15} */

    SQ_DPP_ROW_RR15        = 0x12F, /* last of DPP_ROW_RR{1:15} */

    SQ_DPP_WF_SL1          = 0x130, /* DPP_WF_SL1 */

    SQ_DPP_WF_RL1          = 0x134, /* DPP_WF_RL1 */

    SQ_DPP_WF_SR1          = 0x138, /* DPP_WF_SR1 */

    SQ_DPP_WF_RR1          = 0x13C, /* DPP_WF_RR1 */

    SQ_DPP_ROW_MIRROR      = 0x140, /* DPP_ROW_MIRROR */

    SQ_DPP_ROW_HALF_MIRROR = 0x141, /* DPP_ROW_HALF_MIRROR */

    SQ_DPP_ROW_BCAST15     = 0x142, /* DPP_ROW_BCAST15 */

    SQ_DPP_ROW_BCAST31     = 0x143  /* DPP_ROW_BCAST31 */

};

// Number of lanes in one DPP row.  processDPP splits a lane index into
// lane / ROW_SIZE (row number) and lane % ROW_SIZE (offset within the row).
static const int ROW_SIZE = 16; /* 16 registers per row */

// processDPP divides the row offset (0..ROW_SIZE-1) by this value to get the
// bank index that is tested against the instruction's BANK_MASK.
static const int NUM_BANKS = 4; /* divisor used to derive the bank index */


namespace Gcn3ISA

{

    /**
     * Expand a mask to whole 4-bit groups: every group of four bits in val
     * that has at least one bit set becomes all ones in the result, and every
     * all-zero group stays zero.
     *
     * @param val Input mask.
     * @return The mask with each non-empty 4-bit group fully set.
     */
    template<typename T>
    inline T
    wholeQuadMode(T val)
    {
        T result = 0;
        T nibble = 0xF;

        // Walk the 4-bit groups from least to most significant; the walk ends
        // once the group mask has been shifted out of T.
        while (nibble != 0) {
            if ((val & nibble) != 0) {
                result |= nibble;
            }
            nibble <<= 4;
        }

        return result;
    }


    /**
     * Reduce a mask to one bit per 4-bit group: bit i of the result is set
     * when any bit of the i-th group of four bits in val is set.
     *
     * @param val Input mask.
     * @return One bit per 4-bit group of val, packed from bit 0 upwards.
     */
    template<typename T>
    inline T
    quadMask(T val)
    {
        T result = 0;
        T nibble = 0xF;
        T quad_bit = 0x1;

        // The group mask moves four bits per step while the output bit moves
        // one; the walk ends once the group mask has been shifted out of T.
        while (nibble != 0) {
            if ((val & nibble) != 0) {
                result |= quad_bit;
            }
            nibble <<= 4;
            quad_bit <<= 1;
        }

        return result;
    }


    /**
     * Count the zero bits in val: the number of value bits in T minus the
     * number of set bits reported by popCount().
     */
    template<typename T>
    inline ScalarRegI32
    countZeroBits(T val)
    {
        return std::numeric_limits<T>::digits - popCount(val);
    }


    /**
     * Find the least-significant zero bit of val.
     *
     * @return The position findLsbSet() reports for ~val, or -1 when val
     *         compares equal to ~T(0) (no zero bit).
     */
    template<typename T>
    inline ScalarRegI32
    findFirstZero(T val)
    {
        return (val == ~T(0)) ? -1 : findLsbSet(~val);
    }


    /**
     * Find the least-significant set bit of val.
     *
     * @return The position reported by findLsbSet(), or -1 when val is zero.
     */
    template<typename T>
    inline ScalarRegI32
    findFirstOne(T val)
    {
        return val ? findLsbSet(val) : -1;
    }


    /**
     * Find the most-significant set bit of val.
     *
     * @return The position reported by findMsbSet(), or -1 when val is zero.
     */
    template<typename T>
    inline ScalarRegI32
    findFirstOneMsb(T val)
    {
        return val ? findMsbSet(val) : -1;
    }


    /**
     * Count the zero bits above the most-significant set bit of val.
     *
     * @return (value bits in T) - 1 - findMsbSet(val), or -1 when val is
     *         zero.
     */
    template<typename T>
    inline ScalarRegI32
    countZeroBitsMsb(T val)
    {
        if (val) {
            const int top_bit = std::numeric_limits<T>::digits - 1;
            return top_bit - findMsbSet(val);
        }

        return -1;
    }


    /**
     * Scan a 32-bit value from bit 31 downwards and report how many bits
     * were passed before reaching the first bit that differs from the sign
     * bit.
     *
     * @param val Value to scan.
     * @return Distance (in bits, counted from bit 31) of the first bit that
     *         differs from the sign bit, or -1 when val is 0 or -1 (every
     *         bit equals the sign bit).
     */
    inline ScalarRegI32
    firstOppositeSignBit(ScalarRegI32 val)
    {
        if (val == 0 || val == -1) {
            return -1;
        }

        const bool negative = (val & 0x80000000) != 0;
        const int width = std::numeric_limits<ScalarRegU32>::digits;

        for (int pos = 0; pos < width; ++pos) {
            // isolate the bit that is pos places below the sign bit
            const ScalarRegU32 probe = val & (0x80000000 >> pos);
            const bool bit_set = (probe != 0);

            if (bit_set != negative) {
                return pos;
            }
        }

        return -1;
    }


    /**
     * Scan a 64-bit value from bit 63 downwards and report how many bits
     * were passed before reaching the first bit that differs from the sign
     * bit.
     *
     * @param val Value to scan.
     * @return Distance (in bits, counted from bit 63) of the first bit that
     *         differs from the sign bit, or -1 when val is 0 or -1 (every
     *         bit equals the sign bit).
     */
    inline ScalarRegI32
    firstOppositeSignBit(ScalarRegI64 val)
    {
        if (val == 0 || val == -1) {
            return -1;
        }

        const bool negative = (val & 0x8000000000000000ULL) != 0;
        const int width = std::numeric_limits<ScalarRegU64>::digits;

        for (int pos = 0; pos < width; ++pos) {
            // isolate the bit that is pos places below the sign bit
            const ScalarRegU64 probe = val & (0x8000000000000000ULL >> pos);
            const bool bit_set = (probe != 0);

            if (bit_set != negative) {
                return pos;
            }
        }

        return -1;
    }


    /**
     * Return the median (middle value) of three values.
     *
     * Floating-point types go through std::fmax/std::fmin; all other types
     * use std::max/std::min.  The branch is chosen at compile time so that
     * the floating-point helpers are never instantiated for integer types.
     *
     * @param val_0 First value.
     * @param val_1 Second value.
     * @param val_2 Third value.
     * @return The value that is neither the smallest nor the largest.
     */
    template<typename T>
    inline T
    median(T val_0, T val_1, T val_2)
    {
        if constexpr (std::is_floating_point_v<T>) {
            return std::fmax(std::fmin(val_0, val_1),
                std::fmin(std::fmax(val_0, val_1), val_2));
        } else {
            return std::max(std::min(val_0, val_1),
                std::min(std::max(val_0, val_1), val_2));
        }
    }


    /**
     * Round a floating-point value to the nearest integer, with exact
     * halves going to the even neighbour (2.5 -> 2, 3.5 -> 4, -2.5 -> -2).
     *
     * The result does not depend on the current floating-point rounding
     * mode and is valid over the whole range of T; no conversion to int is
     * involved.
     *
     * @param val Value to round (T is expected to be a floating-point type).
     * @return The rounded value; NaN and +/-infinity are returned unchanged.
     */
    template <typename T>
    inline T roundNearestEven(T val)
    {
        // NaN and infinities have no fractional part to round away.
        if (!std::isfinite(val)) {
            return val;
        }

        // std::remainder(val, 1) is exactly val - n, where n is the integer
        // nearest to val with ties resolved to the even integer, so
        // subtracting it from val leaves n.
        return val - std::remainder(val, T(1));
    }


    /**
     * Unsigned multiply-add: dst = val_0 * val_1 + val_2, evaluated in 128
     * bits so nothing is lost before the result is truncated to 64 bits.
     *
     * @param dst   Receives the low 64 bits of the result.
     * @param val_0 First 32-bit factor.
     * @param val_1 Second 32-bit factor.
     * @param val_2 64-bit addend.
     * @return 1 when the low 32 bits of (result >> 64) are non-zero, i.e. the
     *         result did not fit in 64 bits; 0 otherwise.
     */
    inline VecElemU32
    muladd(VecElemU64 &dst, VecElemU32 val_0, VecElemU32 val_1,
        VecElemU64 val_2)
    {
        const __uint128_t product = (__uint128_t)val_0 * (__uint128_t)val_1;
        const __uint128_t wide = product + (__uint128_t)val_2;

        dst = (VecElemU64)wide;

        const VecElemU32 upper = (VecElemU32)(wide >> 64);
        return upper ? 1 : 0;
    }


    /**
     * Signed multiply-add: dst = val_0 * val_1 + val_2, evaluated in 128
     * bits before the result is truncated to 64 bits.
     *
     * @param dst   Receives the low 64 bits of the result.
     * @param val_0 First 32-bit factor.
     * @param val_1 Second 32-bit factor.
     * @param val_2 64-bit addend.
     * @return 1 when the low 32 bits of (result >> 64) are non-zero, 0
     *         otherwise.
     *
     * NOTE(review): for a negative result the shifted value is all ones, so
     * this returns 1 even when the result fits in 64 bits -- confirm whether
     * that is the intended carry/overflow definition.
     */
    inline VecElemU32
    muladd(VecElemI64 &dst, VecElemI32 val_0, VecElemI32 val_1,
        VecElemI64 val_2)
    {
        const __int128_t product = (__int128_t)val_0 * (__int128_t)val_1;
        const __int128_t wide = product + (__int128_t)val_2;

        dst = (VecElemI64)wide;

        const VecElemU32 upper = (VecElemU32)(wide >> 64);
        return upper ? 1 : 0;
    }


    /**
     * Compute the lane that a DPP control value selects for currLane.
     *
     * Shift/rotate controls use a signed lane displacement: "left" controls
     * (SL/RL) move the selection toward higher lane numbers, "right"
     * controls (SR/RR) toward lower lane numbers, matching the WF_SL1/WF_SR1
     * cases.
     *
     * @param dppCtrl     DPP_CTRL value of the instruction.
     * @param currLane    Lane being processed.
     * @param rowNum      Index of the ROW_SIZE-lane row that holds currLane
     *                    (processDPP passes currLane / ROW_SIZE).
     * @param rowOffset   Position of currLane inside its row
     *                    (processDPP passes currLane % ROW_SIZE).
     * @param outOfBounds Output flag: true when a non-rotating shift moves
     *                    the selection outside the row (ROW_SL/ROW_SR) or
     *                    outside the register (WF_SL1/WF_SR1), false
     *                    otherwise.  It is always written.
     * @return The selected lane.  When outOfBounds is set, or the control
     *         leaves the lane untouched, currLane is returned.
     *
     * Panics on the reserved control value and on values that match no
     * implemented control.
     */
    inline int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum,
                           int rowOffset, bool & outOfBounds)
    {
        // newLane will be the same as the input lane unless swizzling happens
        int newLane = currLane;
        // signed displacement for shift/rotate controls; positive values
        // are LEFT shifts/rotates
        int count = 0;
        // first lane of the row that currLane belongs to
        const int rowBase = rowNum * ROW_SIZE;

        // the flag is an output: never leak a value from a previous lane
        outOfBounds = false;

        if (dppCtrl <= SQ_DPP_QUAD_PERM_MAX) { // DPP_QUAD_PERM{00:FF}
            // each lane of a quad picks the 2-bit field indexed by its own
            // position inside the quad
            const int quadBase = (currLane & ~(3));
            const int quadPos = (currLane & 3);
            const int quadPix = ((dppCtrl >> (2 * quadPos)) & 3);
            newLane = (quadBase | quadPix);
        } else if (dppCtrl == SQ_DPP_RESERVED) {
            panic("ERROR: instruction using reserved DPP_CTRL value\n");
        } else if ((dppCtrl >= SQ_DPP_ROW_SL1) &&
                   (dppCtrl <= SQ_DPP_ROW_SL15)) { // DPP_ROW_SL{1:15}
            count = (dppCtrl - SQ_DPP_ROW_SL1 + 1);
            const int shifted = rowOffset + count;
            if ((shifted >= 0) && (shifted < ROW_SIZE)) {
                newLane = rowBase + shifted;
            } else {
                outOfBounds = true;
            }
        } else if ((dppCtrl >= SQ_DPP_ROW_SR1) &&
                   (dppCtrl <= SQ_DPP_ROW_SR15)) { // DPP_ROW_SR{1:15}
            count = -(dppCtrl - SQ_DPP_ROW_SR1 + 1);
            const int shifted = rowOffset + count;
            if ((shifted >= 0) && (shifted < ROW_SIZE)) {
                newLane = rowBase + shifted;
            } else {
                outOfBounds = true;
            }
        } else if ((dppCtrl >= SQ_DPP_ROW_RR1) &&
                   (dppCtrl <= SQ_DPP_ROW_RR15)) { // DPP_ROW_RR{1:15}
            count = -(dppCtrl - SQ_DPP_ROW_RR1 + 1);
            // adding ROW_SIZE keeps the dividend non-negative
            newLane = rowBase + ((rowOffset + count + ROW_SIZE) % ROW_SIZE);
        } else if (dppCtrl == SQ_DPP_WF_SL1) { // DPP_WF_SL1
            count = 1;
            // bounds-check the shifted lane, not the current one
            int currVal = (currLane + count);
            if ((currVal >= 0) && (currVal < NumVecElemPerVecReg)) {
                newLane += count;
            } else {
                outOfBounds = true;
            }
        } else if (dppCtrl == SQ_DPP_WF_RL1) { // DPP_WF_RL1
            count = 1;
            newLane = (currLane + count + NumVecElemPerVecReg) %
                      NumVecElemPerVecReg;
        } else if (dppCtrl == SQ_DPP_WF_SR1) { // DPP_WF_SR1
            count = -1;
            int currVal = (currLane + count);
            if ((currVal >= 0) && (currVal < NumVecElemPerVecReg)) {
                newLane += count;
            } else {
                outOfBounds = true;
            }
        } else if (dppCtrl == SQ_DPP_WF_RR1) { // DPP_WF_RR1
            count = -1;
            newLane = (currLane + count + NumVecElemPerVecReg) %
                      NumVecElemPerVecReg;
        } else if (dppCtrl == SQ_DPP_ROW_MIRROR) { // DPP_ROW_MIRROR
            // reverse the lane order inside the row
            newLane = rowBase + ((ROW_SIZE - 1) - rowOffset);
        } else if (dppCtrl == SQ_DPP_ROW_HALF_MIRROR) { // DPP_ROW_HALF_MIRROR
            // reverse the lane order inside each group of 8 lanes
            const int halfRowBase = (currLane & ~0x7);
            const int halfRowOffset = (currLane & 0x7);
            newLane = (halfRowBase | (7 - halfRowOffset));
        } else if (dppCtrl == SQ_DPP_ROW_BCAST15) { // DPP_ROW_BCAST15
            // lanes beyond the first row select the last lane of the
            // previous row
            count = 15;
            if (currLane > count) {
                newLane = (currLane & ~count) - 1;
            }
        } else if (dppCtrl == SQ_DPP_ROW_BCAST31) { // DPP_ROW_BCAST31
            // lanes beyond the first 32 select lane 31
            count = 31;
            if (currLane > count) {
                newLane = (currLane & ~count) - 1;
            }
        } else {
            panic("Unimplemented DPP control operation: %d\n", dppCtrl);
        }

        return newLane;
    }


    /**
     * Apply the DPP (data parallel primitive) modifiers of an instruction
     * to its first source operand, in place.
     *
     * Steps:
     *   1. Apply the NEG and then the ABS modifier to src0 when requested.
     *   2. For every lane, skip it when its row or bank is cleared in
     *      ROW_MASK / BANK_MASK (the lane keeps its value).
     *   3. Ask dppInstImpl which lane the DPP control selects for this lane.
     *   4. When the selection is out of bounds, or the lane is not set in
     *      the execution mask, either zero the lane (BOUND_CTRL == 1) or
     *      leave it untouched; otherwise load the lane from the selected
     *      lane of the pre-swizzle data.
     *
     * All per-lane state is local to one loop iteration, so one lane's
     * outcome cannot affect the lanes after it.
     *
     * @param gpuDynInst Dynamic instruction; only exec_mask is read.
     * @param dppInst    DPP encoding fields of the instruction.
     * @param src0       Operand to swizzle.  T must be copyable, provide
     *                   operator[] per lane and negModifier()/absModifier().
     *
     * NOTE(review): the execution-mask test uses the lane being written; if
     * the selected (source) lane should be tested instead, change the index
     * to outLane -- confirm against the ISA description of BOUND_CTRL.
     */
    template<typename T>
    void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst,
                    T & src0)
    {
        const SqDPPVals dppCtrl = (SqDPPVals)dppInst.DPP_CTRL;
        const int boundCtrl = dppInst.BOUND_CTRL;
        const int bankMask = dppInst.BANK_MASK;
        const int rowMask = dppInst.ROW_MASK;

        if (dppInst.SRC0_NEG) {
            src0.negModifier();
        }

        if (dppInst.SRC0_ABS) {
            src0.absModifier();
        }

        // Lanes are rewritten below, so read the swizzled data from a
        // snapshot taken before any lane has been modified.
        T src0_copy = src0;

        // iterate over all register lanes, performing steps 2-4
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            const int rowNum = (lane / ROW_SIZE);
            const int rowOffset = (lane % ROW_SIZE);
            const int bankNum = (rowOffset / NUM_BANKS);

            if (((rowMask & (0x1 << rowNum)) == 0)   /* row mask */   ||
                ((bankMask & (0x1 << bankNum)) == 0) /* bank mask */) {
                // lane is disabled by the masks: keep its value
                continue;
            }

            // fresh flag for every lane
            bool outOfBounds = false;
            const int outLane = dppInstImpl(dppCtrl, lane, rowNum, rowOffset,
                                            outOfBounds);

            if (outOfBounds || !gpuDynInst->exec_mask[lane]) {
                if (boundCtrl == 1) {
                    src0[lane] = 0;
                }
                // otherwise the lane is left untouched
                continue;
            }

            src0[lane] = src0_copy[outLane];
        }
    }


    /**
     * Two-source variant of processDPP.  src1 only receives the optional
     * NEG and ABS modifiers (in that order); everything else is done by the
     * single-source overload on src0.
     *
     * @param gpuDynInst Dynamic instruction, forwarded unchanged.
     * @param dppInst    DPP encoding fields of the instruction.
     * @param src0       Operand handled by the single-source overload.
     * @param src1       Operand that only gets the NEG/ABS modifiers.
     */
    template<typename T>
    void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst,
                    T & src0, T & src1)
    {
        // modifiers on the second source
        if (dppInst.SRC1_NEG) {
            src1.negModifier();
        }
        if (dppInst.SRC1_ABS) {
            src1.absModifier();
        }

        // src0 handling is shared with the single-source form
        processDPP(gpuDynInst, dppInst, src0);
    }


    /**
     * Per-lane SDWA source selection: extract the byte or word of
     * currOperVal named by sel, move it to the low bits, and either
     * zero-extend or sign-extend it.
     *
     * @param currOperVal Current lane value of the operand.
     * @param origOperVal Original lane value; only used to cross-check the
     *                    top bit of the selected field.
     * @param sel         Which byte/word/dword to select.
     * @param signExt     Sign-extend the selected field when its top bit is
     *                    set; otherwise the upper bits are zero.
     * @return currOperVal unchanged for SDWA_DWORD; otherwise the selected
     *         field in the low bits, extended as requested.
     *
     * Calls fatal_if when the top bit of the selected field differs between
     * currOperVal and origOperVal, and panics on an unknown select value.
     */
    template<typename T>
    T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal,
                             const SDWASelVals sel, const bool signExt)
    {
        // local variables
        int low_bit = 0, high_bit = 0;
        bool signExt_local = signExt;
        T retVal = 0;

        // if we're preserving all of the bits, then we can immediately return
        if (sel == SDWA_DWORD) {
            return currOperVal;
        }

        if (sel < SDWA_WORD_0) { // we are selecting 1 byte
            /*
              Move the selected byte down to byte 0; the upper bits are then
              either sign extended or left as zero.
            */
            low_bit = (sel * Gcn3ISA::BITS_PER_BYTE);
            high_bit = low_bit + Gcn3ISA::MSB_PER_BYTE;
            retVal = bits(currOperVal, high_bit, low_bit);

            // make sure update propagated, since used next
            fatal_if(bits(retVal, Gcn3ISA::MSB_PER_BYTE) !=
                     bits(origOperVal, high_bit),
                     "ERROR: SDWA byte update not propagated: retVal: %d, "
                     "orig: %d\n", bits(retVal, Gcn3ISA::MSB_PER_BYTE),
                     bits(origOperVal, high_bit));
            // sign extended value depends on upper-most bit of the new byte 0
            signExt_local = (signExt &&
                             (bits(retVal, Gcn3ISA::MSB_PER_BYTE, 0) & 0x80));

            // process all other bytes -- if sign extending, make them 1, else
            // all 0's so leave as is
            if (signExt_local) {
                // sext<N> extends an N-bit value (sign bit at N - 1), so the
                // width of the field is passed, not the index of its MSB
                retVal = (uint32_t)sext<Gcn3ISA::BITS_PER_BYTE>(retVal);
            }
        } else if (sel < SDWA_DWORD) { // we are selecting 1 word
            /*
              Move the selected word down to word 0; the upper bits are then
              either sign extended or left as zero.
            */
            low_bit = (sel & 1) * Gcn3ISA::BITS_PER_WORD;
            high_bit = low_bit + Gcn3ISA::MSB_PER_WORD;
            retVal = bits(currOperVal, high_bit, low_bit);

            // make sure update propagated, since used next
            fatal_if(bits(retVal, Gcn3ISA::MSB_PER_WORD) !=
                     bits(origOperVal, high_bit),
                     "ERROR: SDWA word update not propagated: retVal: %d, "
                     "orig: %d\n",
                     bits(retVal, Gcn3ISA::MSB_PER_WORD),
                     bits(origOperVal, high_bit));
            // sign extended value depends on upper-most bit of the new word 0
            signExt_local = (signExt &&
                             (bits(retVal, Gcn3ISA::MSB_PER_WORD, 0) &
                              0x8000));

            // process other word -- if sign extending, make them 1, else all
            // 0's so leave as is
            if (signExt_local) {
                // see the byte case: pass the field width to sext
                retVal = (uint32_t)sext<Gcn3ISA::BITS_PER_WORD>(retVal);
            }
        } else {
            assert(sel != SDWA_DWORD); // should have returned earlier
            panic("Unimplemented SDWA select operation: %d\n", sel);
        }

        return retVal;
    }


    /**
     * Apply the SDWA source selection to every lane of an operand by
     * running sdwaInstSrcImpl_helper on each lane and storing the result
     * back into currOper.
     *
     * @param currOper     Operand to update in place.
     * @param origCurrOper Original operand, forwarded lane by lane.
     * @param sel          Byte/word/dword select.
     * @param signExt      Whether the selected field is sign extended.
     */
    template<typename T>
    void sdwaInstSrcImpl(T & currOper, T & origCurrOper,
                         const SDWASelVals sel, const bool signExt)
    {
        int lane = 0;

        // one helper call per lane
        while (lane < NumVecElemPerVecReg) {
            currOper[lane] = sdwaInstSrcImpl_helper(
                currOper[lane], origCurrOper[lane], sel, signExt);
            ++lane;
        }
    }


    /**
     * Per-lane SDWA destination formatting: assemble the destination value
     * byte by byte (or word by word) from the selected field and the
     * requested fill for the remaining bits.
     *
     * @param currDstVal        Current lane value of the destination.
     * @param origDstVal        Original lane value; supplies the bits of the
     *                          selected byte/word.
     * @param clamp             Not used by this function.
     * @param sel               Which byte/word/dword is selected.
     * @param unusedBits_format How bits outside the selection are filled.
     * @return currDstVal unchanged for SDWA_UNUSED_PRESERVE or SDWA_DWORD;
     *         otherwise the assembled value.
     *
     * Asserts that SDWA_UNUSED_PRESERVE is only combined with SDWA_DWORD and
     * panics on an unknown select value.
     */
    template<typename T>
    T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal,
                             const bool clamp, const SDWASelVals sel,
                             const SDWADstVals unusedBits_format)
    {
        // cases where the whole value is kept as is
        if (unusedBits_format == SDWA_UNUSED_PRESERVE) {
            assert(sel == SDWA_DWORD);
            return currDstVal;
        }
        if (sel == SDWA_DWORD) {
            // NOTE: the unused-bits format is ignored for a full dword
            return currDstVal;
        }

        const bool keepCurrent = (unusedBits_format == SDWA_UNUSED_PRESERVE);
        bool extendSign = (unusedBits_format == SDWA_UNUSED_SEXT);
        T result = 0;

        if (sel < SDWA_WORD_0) { // byte select
            // sign extension only happens when bit 7 of the value is set
            extendSign = (extendSign &&
                          (bits(currDstVal, Gcn3ISA::MSB_PER_WORD, 0) & 0x80));

            for (int byte = 0; byte < 4; ++byte) {
                const int lo = byte * Gcn3ISA::BITS_PER_BYTE;
                const int hi = lo + Gcn3ISA::MSB_PER_BYTE;
                T field = 0;

                if (byte == sel) {
                    // the selected byte takes the original bits
                    field = bits(origDstVal, hi, lo);
                } else if (keepCurrent) {
                    // preserve format: keep the current bits
                    field = bits(currDstVal, hi, lo);
                } else if ((byte > sel) && extendSign) {
                    // bytes above the selection are filled with ones
                    field = 0xff;
                } else {
                    field = 0;
                }

                result = insertBits(result, hi, lo, field);
            }
        } else if (sel < SDWA_DWORD) { // word select
            // sign extension only happens when bit 15 of the value is set
            extendSign = (extendSign &&
                          (bits(currDstVal, Gcn3ISA::MSB_PER_WORD, 0) &
                           0x8000));

            for (int word = 0; word < 2; ++word) {
                const int lo = word * Gcn3ISA::BITS_PER_WORD;
                const int hi = lo + Gcn3ISA::MSB_PER_WORD;
                T field = 0;

                if (word == (sel & 0x1)) {
                    // the selected word takes the original bits
                    field = bits(origDstVal, hi, lo);
                } else if (keepCurrent) {
                    // preserve format: keep the current bits
                    field = bits(currDstVal, hi, lo);
                } else if ((word > (sel & 0x1)) && extendSign) {
                    // the word above the selection is filled with ones
                    field = 0xffff;
                } else {
                    field = 0;
                }

                result = insertBits(result, hi, lo, field);
            }
        } else {
            assert(sel != SDWA_DWORD); // should have returned earlier
            panic("Unimplemented SDWA select operation: %d\n", sel);
        }

        return result;
    }


    /**
     * Apply the SDWA destination formatting to every lane of an operand by
     * running sdwaInstDstImpl_helper on each lane and storing the result
     * back into dstOper.
     *
     * @param dstOper           Destination operand, updated in place.
     * @param origDstOper       Original destination, forwarded lane by lane.
     * @param clamp             Forwarded to the helper.
     * @param sel               Byte/word/dword select.
     * @param unusedBits_format Fill format for bits outside the selection.
     */
    template<typename T>
    void sdwaInstDstImpl(T & dstOper, T & origDstOper, const bool clamp,
                         const SDWASelVals sel,
                         const SDWADstVals unusedBits_format)
    {
        int lane = 0;

        // one helper call per lane
        while (lane < NumVecElemPerVecReg) {
            dstOper[lane] = sdwaInstDstImpl_helper(
                dstOper[lane], origDstOper[lane], clamp, sel,
                unusedBits_format);
            ++lane;
        }
    }


    /**
     * Apply the SDWA modifiers to one source operand: optional negation,
     * then optional absolute value, then the byte/word selection.
     *
     * @param currSrc     Source operand, updated in place.
     * @param origCurrSrc Original source operand, forwarded to the
     *                    selection step.
     * @param src_sel     Byte/word/dword select.
     * @param src_signExt Whether the selected field is sign extended.
     * @param src_abs     Apply the absolute-value modifier.
     * @param src_neg     Apply the negation modifier.
     */
    template<typename T>
    void processSDWA_src_helper(T & currSrc, T & origCurrSrc,
                                const SDWASelVals src_sel,
                                const bool src_signExt, const bool src_abs,
                                const bool src_neg)
    {
        // modifiers first: negation, then absolute value
        if (src_neg) {
            currSrc.negModifier();
        }
        if (src_abs) {
            currSrc.absModifier();
        }

        // then the per-lane selection
        sdwaInstSrcImpl(currSrc, origCurrSrc, src_sel, src_signExt);
    }


    /**
     * SDWA source processing for instructions with a single source operand.
     * The SRC1 modifier fields must be clear (asserted); the SRC0 fields are
     * decoded and handed to processSDWA_src_helper.
     *
     * @param sdwaInst SDWA encoding fields of the instruction.
     * @param src0     Source operand, updated in place.
     * @param origSrc0 Original source operand.
     */
    template<typename T>
    void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0)
    {
        // there is no src1 operand here, so its fields must not be set
        assert(!sdwaInst.SRC1_SEXT);
        assert(!sdwaInst.SRC1_NEG);
        assert(!sdwaInst.SRC1_ABS);

        const SDWASelVals sel = (SDWASelVals)sdwaInst.SRC0_SEL;
        const bool sign_extend = sdwaInst.SRC0_SEXT;
        const bool take_abs = sdwaInst.SRC0_ABS;
        const bool negate = sdwaInst.SRC0_NEG;

        processSDWA_src_helper(src0, origSrc0, sel, sign_extend, take_abs,
                               negate);
    }


    /**
     * SDWA source processing for instructions with two source operands.
     * Each operand is handled independently by processSDWA_src_helper using
     * its own SEL/SEXT/ABS/NEG fields, src0 first and then src1.
     *
     * @param sdwaInst SDWA encoding fields of the instruction.
     * @param src0     First source operand, updated in place.
     * @param origSrc0 Original first source operand.
     * @param src1     Second source operand, updated in place.
     * @param origSrc1 Original second source operand.
     */
    template<typename T>
    void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0,
                         T & src1, T & origSrc1)
    {
        // first source
        const SDWASelVals sel0 = (SDWASelVals)sdwaInst.SRC0_SEL;
        const bool sign_extend0 = sdwaInst.SRC0_SEXT;
        const bool take_abs0 = sdwaInst.SRC0_ABS;
        const bool negate0 = sdwaInst.SRC0_NEG;

        // second source
        const SDWASelVals sel1 = (SDWASelVals)sdwaInst.SRC1_SEL;
        const bool sign_extend1 = sdwaInst.SRC1_SEXT;
        const bool take_abs1 = sdwaInst.SRC1_ABS;
        const bool negate1 = sdwaInst.SRC1_NEG;

        processSDWA_src_helper(src0, origSrc0, sel0, sign_extend0, take_abs0,
                               negate0);
        processSDWA_src_helper(src1, origSrc1, sel1, sign_extend1, take_abs1,
                               negate1);
    }


    /**
     * SDWA destination processing: decode the DST_SEL, DST_UNUSED and CLAMP
     * fields and apply them to every lane of the destination through
     * sdwaInstDstImpl.
     *
     * @param sdwaInst SDWA encoding fields of the instruction.
     * @param dst      Destination operand, updated in place.
     * @param origDst  Original destination operand.
     */
    template<typename T>
    void processSDWA_dst(InFmt_VOP_SDWA sdwaInst, T & dst, T & origDst)
    {
        const bool clamp = sdwaInst.CLAMP;
        const SDWASelVals sel = (SDWASelVals)sdwaInst.DST_SEL;
        const SDWADstVals unused_format = (SDWADstVals)sdwaInst.DST_UNUSED;

        sdwaInstDstImpl(dst, origDst, clamp, sel, unused_format);
    }

} // namespace Gcn3ISA

} // namespace gem5


#endif // __ARCH_GCN3_INSTS_INST_UTIL_HH__

gpu_registers.hh

gem5::findMsbSet
constexpr int findMsbSet(uint64_t val)
Returns the bit position of the MSB that is set in the input.
Definition bitfield.hh:276

gem5::bits
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
Definition bitfield.hh:76

gem5::popCount
constexpr int popCount(uint64_t val)
Returns the number of set ones in the provided value.
Definition bitfield.hh:350

gem5::insertBits
constexpr T insertBits(T val, unsigned first, unsigned last, B bit_val)
Returns val with bits first to last set to the LSBs of bit_val.
Definition bitfield.hh:182

gem5::findLsbSet
constexpr int findLsbSet(uint64_t val)
Returns the bit position of the LSB that is set in the input.
Definition bitfield.hh:312

panic
#define panic(...)
This implements a cprintf based panic() function.
Definition logging.hh:188

fatal_if
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
Definition logging.hh:236

gem5::ArmISA::mask
Bitfield< 3, 0 > mask
Definition pcstate.hh:63

gem5::ArmISA::i
Bitfield< 7 > i
Definition misc_types.hh:67

gem5::ArmISA::NumVecElemPerVecReg
constexpr unsigned NumVecElemPerVecReg
Definition vec.hh:61

gem5::ArmISA::sel
sel
Definition misc_types.hh:708

gem5::Gcn3ISA::sdwaInstDstImpl_helper
T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal, const bool clamp, const SDWASelVals sel, const SDWADstVals unusedBits_format)
sdwaInstDstImpl_helper contains the per-lane code for selecting the appropriate bytes/words of the la...
Definition inst_util.hh:666

gem5::Gcn3ISA::BITS_PER_BYTE
const int BITS_PER_BYTE
Definition gpu_registers.hh:143

gem5::Gcn3ISA::BITS_PER_WORD
const int BITS_PER_WORD
Definition gpu_registers.hh:144

gem5::Gcn3ISA::countZeroBits
ScalarRegI32 countZeroBits(T val)
Definition inst_util.hh:120

gem5::Gcn3ISA::VecElemU64
uint64_t VecElemU64
Definition gpu_registers.hh:168

gem5::Gcn3ISA::muladd
VecElemU32 muladd(VecElemU64 &dst, VecElemU32 val_0, VecElemU32 val_1, VecElemU64 val_2)
Definition inst_util.hh:271

gem5::Gcn3ISA::countZeroBitsMsb
ScalarRegI32 countZeroBitsMsb(T val)
Definition inst_util.hh:163

gem5::Gcn3ISA::processSDWA_src_helper
void processSDWA_src_helper(T &currSrc, T &origCurrSrc, const SDWASelVals src_sel, const bool src_signExt, const bool src_abs, const bool src_neg)
processSDWA_srcHelper is a helper function for implementing sub d-word addressing instructions for th...
Definition inst_util.hh:788

gem5::Gcn3ISA::ScalarRegU32
uint32_t ScalarRegU32
Definition gpu_registers.hh:153

gem5::Gcn3ISA::quadMask
T quadMask(T val)
Definition inst_util.hh:103

gem5::Gcn3ISA::sdwaInstSrcImpl
void sdwaInstSrcImpl(T &currOper, T &origCurrOper, const SDWASelVals sel, const bool signExt)
sdwaInstSrcImpl is a helper function that selects the appropriate bits/bytes for each lane of the inp...
Definition inst_util.hh:647

gem5::Gcn3ISA::median
T median(T val_0, T val_1, T val_2)
Definition inst_util.hh:246

gem5::Gcn3ISA::firstOppositeSignBit
ScalarRegI32 firstOppositeSignBit(ScalarRegI32 val)
Definition inst_util.hh:173

gem5::Gcn3ISA::ScalarRegU64
uint64_t ScalarRegU64
Definition gpu_registers.hh:156

gem5::Gcn3ISA::VecElemI32
int32_t VecElemI32
Definition gpu_registers.hh:166

gem5::Gcn3ISA::wholeQuadMode
T wholeQuadMode(T val)
Definition inst_util.hh:89

gem5::Gcn3ISA::sdwaInstSrcImpl_helper
T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal, const SDWASelVals sel, const bool signExt)
sdwaInstSrcImpl_helper contains the per-lane code for selecting the appropriate bytes/words of the la...
Definition inst_util.hh:554

gem5::Gcn3ISA::processSDWA_src
void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T &src0, T &origSrc0)
processSDWA_src is a helper function for implementing sub d-word addressing instructions for the src ...
Definition inst_util.hh:823

gem5::Gcn3ISA::sdwaInstDstImpl
void sdwaInstDstImpl(T &dstOper, T &origDstOper, const bool clamp, const SDWASelVals sel, const SDWADstVals unusedBits_format)
sdwaInstDestImpl is a helper function that selects the appropriate bits/bytes for the inputted dest o...
Definition inst_util.hh:767

gem5::Gcn3ISA::VecElemI64
int64_t VecElemI64
Definition gpu_registers.hh:169

gem5::Gcn3ISA::ScalarRegI64
int64_t ScalarRegI64
Definition gpu_registers.hh:157

gem5::Gcn3ISA::processDPP
void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst, T &src0)
processDPP is a helper function for implementing Data Parallel Primitive instructions.
Definition inst_util.hh:415

gem5::Gcn3ISA::dppInstImpl
int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum, int rowOffset, bool &outOfBounds)
dppInstImpl is a helper function that performs the inputted operation on the inputted vector register...
Definition inst_util.hh:318

gem5::Gcn3ISA::VecElemU32
uint32_t VecElemU32
Definition gpu_registers.hh:165

gem5::Gcn3ISA::findFirstZero
ScalarRegI32 findFirstZero(T val)
Definition inst_util.hh:130

gem5::Gcn3ISA::ScalarRegI32
int32_t ScalarRegI32
Definition gpu_registers.hh:154

gem5::Gcn3ISA::MSB_PER_WORD
const int MSB_PER_WORD
Definition gpu_registers.hh:146

gem5::Gcn3ISA::processSDWA_dst
void processSDWA_dst(InFmt_VOP_SDWA sdwaInst, T &dst, T &origDst)
processSDWA_dst is a helper function for implementing sub d-word addressing instructions for the dst ...
Definition inst_util.hh:879

gem5::Gcn3ISA::roundNearestEven
T roundNearestEven(T val)
Definition inst_util.hh:258

gem5::Gcn3ISA::MSB_PER_BYTE
const int MSB_PER_BYTE
Definition gpu_registers.hh:145

gem5::Gcn3ISA::findFirstOneMsb
ScalarRegI32 findFirstOneMsb(T val)
Definition inst_util.hh:152

gem5::Gcn3ISA::findFirstOne
ScalarRegI32 findFirstOne(T val)
Definition inst_util.hh:141

gem5::X86ISA::count
count
Definition misc.hh:710

gem5::X86ISA::val
Bitfield< 63 > val
Definition misc.hh:776

gem5
Reference material can be found at the JEDEC website: UFS standard http://www.jedec.org/...
Definition gpu_translation_state.hh:38

gem5::NUM_BANKS
static const int NUM_BANKS
Definition inst_util.hh:83

gem5::GPUDynInstPtr
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition misc.hh:49

gem5::ROW_SIZE
static const int ROW_SIZE
Definition inst_util.hh:82

gem5::SDWADstVals
SDWADstVals
Definition inst_util.hh:56

gem5::SDWA_UNUSED_PRESERVE
@ SDWA_UNUSED_PRESERVE
Definition inst_util.hh:59

gem5::SDWA_UNUSED_PAD
@ SDWA_UNUSED_PAD
Definition inst_util.hh:57

gem5::SDWA_UNUSED_SEXT
@ SDWA_UNUSED_SEXT
Definition inst_util.hh:58

gem5::SDWASelVals
SDWASelVals
Definition inst_util.hh:44

gem5::SDWA_BYTE_1
@ SDWA_BYTE_1
Definition inst_util.hh:46

gem5::SDWA_BYTE_3
@ SDWA_BYTE_3
Definition inst_util.hh:48

gem5::SDWA_DWORD
@ SDWA_DWORD
Definition inst_util.hh:51

gem5::SDWA_WORD_1
@ SDWA_WORD_1
Definition inst_util.hh:50

gem5::SDWA_BYTE_2
@ SDWA_BYTE_2
Definition inst_util.hh:47

gem5::SDWA_WORD_0
@ SDWA_WORD_0
Definition inst_util.hh:49

gem5::SDWA_BYTE_0
@ SDWA_BYTE_0
Definition inst_util.hh:45

gem5::SqDPPVals
SqDPPVals
Definition inst_util.hh:64

gem5::SQ_DPP_WF_RL1
@ SQ_DPP_WF_RL1
Definition inst_util.hh:74

gem5::SQ_DPP_ROW_SR1
@ SQ_DPP_ROW_SR1
Definition inst_util.hh:69

gem5::SQ_DPP_ROW_BCAST31
@ SQ_DPP_ROW_BCAST31
Definition inst_util.hh:80

gem5::SQ_DPP_ROW_SL15
@ SQ_DPP_ROW_SL15
Definition inst_util.hh:68

gem5::SQ_DPP_ROW_HALF_MIRROR
@ SQ_DPP_ROW_HALF_MIRROR
Definition inst_util.hh:78

gem5::SQ_DPP_QUAD_PERM_MAX
@ SQ_DPP_QUAD_PERM_MAX
Definition inst_util.hh:65

gem5::SQ_DPP_ROW_SL1
@ SQ_DPP_ROW_SL1
Definition inst_util.hh:67

gem5::SQ_DPP_ROW_MIRROR
@ SQ_DPP_ROW_MIRROR
Definition inst_util.hh:77

gem5::SQ_DPP_RESERVED
@ SQ_DPP_RESERVED
Definition inst_util.hh:66

gem5::SQ_DPP_ROW_BCAST15
@ SQ_DPP_ROW_BCAST15
Definition inst_util.hh:79

gem5::SQ_DPP_ROW_RR1
@ SQ_DPP_ROW_RR1
Definition inst_util.hh:71

gem5::SQ_DPP_ROW_SR15
@ SQ_DPP_ROW_SR15
Definition inst_util.hh:70

gem5::SQ_DPP_ROW_RR15
@ SQ_DPP_ROW_RR15
Definition inst_util.hh:72

gem5::SQ_DPP_WF_RR1
@ SQ_DPP_WF_RR1
Definition inst_util.hh:76

gem5::SQ_DPP_WF_SL1
@ SQ_DPP_WF_SL1
Definition inst_util.hh:73

gem5::SQ_DPP_WF_SR1
@ SQ_DPP_WF_SR1
Definition inst_util.hh:75

gem5::Gcn3ISA::InFmt_VOP_DPP
Definition gpu_decoder.hh:1608

gem5::Gcn3ISA::InFmt_VOP_DPP::BOUND_CTRL
unsigned int BOUND_CTRL
Definition gpu_decoder.hh:1612

gem5::Gcn3ISA::InFmt_VOP_DPP::BANK_MASK
unsigned int BANK_MASK
Definition gpu_decoder.hh:1617

gem5::Gcn3ISA::InFmt_VOP_DPP::SRC0_NEG
unsigned int SRC0_NEG
Definition gpu_decoder.hh:1613

gem5::Gcn3ISA::InFmt_VOP_DPP::DPP_CTRL
unsigned int DPP_CTRL
Definition gpu_decoder.hh:1610

gem5::Gcn3ISA::InFmt_VOP_DPP::SRC1_NEG
unsigned int SRC1_NEG
Definition gpu_decoder.hh:1615

gem5::Gcn3ISA::InFmt_VOP_DPP::SRC1_ABS
unsigned int SRC1_ABS
Definition gpu_decoder.hh:1616

gem5::Gcn3ISA::InFmt_VOP_DPP::SRC0_ABS
unsigned int SRC0_ABS
Definition gpu_decoder.hh:1614

gem5::Gcn3ISA::InFmt_VOP_DPP::ROW_MASK
unsigned int ROW_MASK
Definition gpu_decoder.hh:1618

gem5::Gcn3ISA::InFmt_VOP_SDWA
Definition gpu_decoder.hh:1622

gem5::Gcn3ISA::InFmt_VOP_SDWA::SRC0_SEL
unsigned int SRC0_SEL
Definition gpu_decoder.hh:1628

gem5::Gcn3ISA::InFmt_VOP_SDWA::SRC0_SEXT
unsigned int SRC0_SEXT
Definition gpu_decoder.hh:1629

gem5::Gcn3ISA::InFmt_VOP_SDWA::DST_SEL
unsigned int DST_SEL
Definition gpu_decoder.hh:1624

gem5::Gcn3ISA::InFmt_VOP_SDWA::SRC0_NEG
unsigned int SRC0_NEG
Definition gpu_decoder.hh:1630

gem5::Gcn3ISA::InFmt_VOP_SDWA::SRC1_ABS
unsigned int SRC1_ABS
Definition gpu_decoder.hh:1636

gem5::Gcn3ISA::InFmt_VOP_SDWA::SRC1_SEXT
unsigned int SRC1_SEXT
Definition gpu_decoder.hh:1634

gem5::Gcn3ISA::InFmt_VOP_SDWA::CLAMP
unsigned int CLAMP
Definition gpu_decoder.hh:1626

gem5::Gcn3ISA::InFmt_VOP_SDWA::SRC1_SEL
unsigned int SRC1_SEL
Definition gpu_decoder.hh:1633

gem5::Gcn3ISA::InFmt_VOP_SDWA::SRC1_NEG
unsigned int SRC1_NEG
Definition gpu_decoder.hh:1635

gem5::Gcn3ISA::InFmt_VOP_SDWA::DST_UNUSED
unsigned int DST_UNUSED
Definition gpu_decoder.hh:1625

gem5::Gcn3ISA::InFmt_VOP_SDWA::SRC0_ABS
unsigned int SRC0_ABS
Definition gpu_decoder.hh:1631