// release/v21-2-1-0/gpu__dyn__inst_8hh_source.html

/*

 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.

 * All rights reserved.

 *

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions are met:

 *

 * 1. Redistributions of source code must retain the above copyright notice,

 * this list of conditions and the following disclaimer.

 *

 * 2. Redistributions in binary form must reproduce the above copyright notice,

 * this list of conditions and the following disclaimer in the documentation

 * and/or other materials provided with the distribution.

 *

 * 3. Neither the name of the copyright holder nor the names of its

 * contributors may be used to endorse or promote products derived from this

 * software without specific prior written permission.

 *

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE

 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

 * POSSIBILITY OF SUCH DAMAGE.

 */


#ifndef __GPU_DYN_INST_HH__

#define __GPU_DYN_INST_HH__


#include <cstdint>

#include <memory>

#include <string>


#include "base/amo.hh"

#include "base/logging.hh"

#include "base/trace.hh"

#include "debug/GPUMem.hh"

#include "enums/StorageClassType.hh"

#include "gpu-compute/compute_unit.hh"

#include "gpu-compute/gpu_exec_context.hh"

#include "gpu-compute/operand_info.hh"


namespace gem5

{


// Forward declaration: this header only refers to GPUStaticInst through
// pointers, so the complete type is not needed here.
class GPUStaticInst;


/**
 * Compare-and-swap atomic functor.
 *
 * Executing the functor on a location overwrites the stored value with
 * the swap value (s) only when the stored value equals the compare
 * value (c). Each execution bumps numCASOps on the associated compute
 * unit; an execution whose comparison fails also bumps numFailedCASOps.
 */
template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    // value the target location is compared against
    T c;
    // value stored to the target location when the comparison matches
    T s;

    // compute unit whose CAS statistics are updated by execute()
    ComputeUnit *computeUnit;

    AtomicOpCAS(T cmp_val, T swap_val, ComputeUnit *compute_unit)
      : c(cmp_val), s(swap_val), computeUnit(compute_unit)
    {
    }

    /**
     * Perform the compare-and-swap on the value pointed to by b.
     *
     * @param b location to compare and, on a match, overwrite
     */
    void
    execute(T *b)
    {
        // every attempt is counted, whether or not it succeeds
        computeUnit->stats.numCASOps++;

        if (!(*b == c)) {
            // mismatch: leave memory untouched and record the failure
            computeUnit->stats.numFailedCASOps++;
            return;
        }

        *b = s;
    }

    /**
     * Create a heap-allocated copy of this functor carrying the same
     * compare value, swap value and compute unit. The returned raw
     * pointer is handed to the caller.
     */
    AtomicOpFunctor*
    clone()
    {
        return new AtomicOpCAS<T>(c, s, computeUnit);
    }
};


/**
 * Immutable record describing one register operand: the operand index,
 * the operand size in DWORDs, and the lists of virtual and physical
 * register indices supplied at construction. All fields are const, so
 * an instance cannot be modified after it has been built.
 */
class RegisterOperandInfo
{
  public:
    // an operand description must always be fully specified
    RegisterOperandInfo() = delete;

    /**
     * @param op_idx index of the operand
     * @param num_dwords size of the operand in DWORDs
     * @param virt_indices virtual register indices (copied)
     * @param phys_indices physical register indices (copied)
     */
    RegisterOperandInfo(int op_idx, int num_dwords,
                        const std::vector<int> &virt_indices,
                        const std::vector<int> &phys_indices)
        : opIdx(op_idx),
          numDWORDs(num_dwords),
          virtIndices(virt_indices),
          physIndices(phys_indices)
    {
    }

    /**
     * Operand size expressed in registers: the DWORD count divided by
     * TheGpuISA::RegSizeDWords.
     */
    int
    numRegisters() const
    {
        return numDWORDs / TheGpuISA::RegSizeDWords;
    }

    // index of the operand this record describes
    int
    operandIdx() const
    {
        return opIdx;
    }

    /**
     * Virtual register index of the reg_num-th register of the operand.
     * Uses bounds-checked access, so an out-of-range reg_num throws
     * std::out_of_range.
     */
    int
    virtIdx(int reg_num=0) const
    {
        return virtIndices.at(reg_num);
    }

  private:
    const int opIdx;
    const int numDWORDs;
    const std::vector<int> virtIndices;
    const std::vector<int> physIndices;
};


class GPUDynInst : public GPUExecContext

{

  public:

    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,

               uint64_t instSeqNum);

    ~GPUDynInst();

    void execute(GPUDynInstPtr gpuDynInst);


    const std::vector<OperandInfo>& srcVecRegOperands() const;

    const std::vector<OperandInfo>& dstVecRegOperands() const;

    const std::vector<OperandInfo>& srcScalarRegOperands() const;

    const std::vector<OperandInfo>& dstScalarRegOperands() const;


    int numSrcRegOperands();

    int numDstRegOperands();


    int numSrcVecRegOperands() const;

    int numDstVecRegOperands() const;

    int maxSrcVecRegOperandSize();

    int numSrcVecDWords();

    int numDstVecDWords();


    int numSrcScalarRegOperands() const;

    int numDstScalarRegOperands() const;

    int maxSrcScalarRegOperandSize();

    int numSrcScalarDWords();

    int numDstScalarDWords();


    int maxOperandSize();


    int getNumOperands() const;


    bool hasSourceSgpr() const;

    bool hasDestinationSgpr() const;

    bool hasSourceVgpr() const;

    bool hasDestinationVgpr() const;


    // returns true if the string "opcodeStr" is found in the

    // opcode of the instruction

    bool isOpcode(const std::string& opcodeStr) const;

    bool isOpcode(const std::string& opcodeStr,

                  const std::string& extStr) const;


    const std::string &disassemble() const;


    InstSeqNum seqNum() const;


    Addr pc();

    void pc(Addr _pc);


    enums::StorageClassType executedAs();


    // virtual address for scalar memory operations

    Addr scalarAddr;

    // virtual addressies for vector memory operations

    std::vector<Addr> addr;

    Addr pAddr;


    // vector data to get written

    uint8_t *d_data;

    // scalar data to be transferred

    uint8_t *scalar_data;

    // Additional data (for atomics)

    uint8_t *a_data;

    // Additional data (for atomics)

    uint8_t *x_data;

    // The execution mask

    VectorMask exec_mask;


    // SIMD where the WF of the memory instruction has been mapped to

    int simdId;

    // unique id of the WF where the memory instruction belongs to

    int wfDynId;

    // The kernel id of the requesting wf

    int kern_id;

    // The CU id of the requesting wf

    int cu_id;

    // The workgroup id of the requesting wf

    int wg_id;

    // HW slot id where the WF is mapped to inside a SIMD unit

    int wfSlotId;

    // execution pipeline id where the memory instruction has been scheduled

    int execUnitId;

    // The execution time of this operation

    Tick time;

    // The latency of this operation

    WaitClass latency;


    // Initiate the specified memory operation, by creating a

    // memory request and sending it off to the memory system.

    void initiateAcc(GPUDynInstPtr gpuDynInst);

    // Complete the specified memory operation, by writing

    // value back to the RF in the case of a load or atomic

    // return or, in the case of a store, we do nothing

    void completeAcc(GPUDynInstPtr gpuDynInst);


    void updateStats();


    GPUStaticInst* staticInstruction() { return _staticInst; }


    TheGpuISA::ScalarRegU32 srcLiteral() const;


    bool isALU() const;

    bool isBranch() const;

    bool isCondBranch() const;

    bool isNop() const;

    bool isReturn() const;

    bool isEndOfKernel() const;

    bool isKernelLaunch() const;

    bool isSDWAInst() const;

    bool isDPPInst() const;

    bool isUnconditionalJump() const;

    bool isSpecialOp() const;

    bool isWaitcnt() const;

    bool isSleep() const;


    bool isBarrier() const;

    bool isMemSync() const;

    bool isMemRef() const;

    bool isFlat() const;

    bool isFlatGlobal() const;

    bool isLoad() const;

    bool isStore() const;


    bool isAtomic() const;

    bool isAtomicNoRet() const;

    bool isAtomicRet() const;


    bool isScalar() const;

    bool isVector() const;

    bool readsSCC() const;

    bool writesSCC() const;

    bool readsVCC() const;

    bool writesVCC() const;

    bool readsExec() const;

    bool writesExec() const;

    bool readsMode() const;

    bool writesMode() const;

    bool ignoreExec() const;

    bool readsFlatScratch() const;

    bool writesFlatScratch() const;

    bool readsExecMask() const;

    bool writesExecMask() const;


    bool isAtomicAnd() const;

    bool isAtomicOr() const;

    bool isAtomicXor() const;

    bool isAtomicCAS() const;

    bool isAtomicExch() const;

    bool isAtomicAdd() const;

    bool isAtomicSub() const;

    bool isAtomicInc() const;

    bool isAtomicDec() const;

    bool isAtomicMax() const;

    bool isAtomicMin() const;


    bool isArgLoad() const;

    bool isGlobalMem() const;

    bool isLocalMem() const;


    bool isArgSeg() const;

    bool isGlobalSeg() const;

    bool isGroupSeg() const;

    bool isKernArgSeg() const;

    bool isPrivateSeg() const;

    bool isReadOnlySeg() const;

    bool isSpillSeg() const;


    bool isGloballyCoherent() const;

    bool isSystemCoherent() const;


    bool isF16() const;

    bool isF32() const;

    bool isF64() const;


    bool isFMA() const;

    bool isMAC() const;

    bool isMAD() const;


    // for FLAT memory ops. check the segment address

    // against the APE registers to see if it falls

    // within one of the APE ranges for LDS/SCRATCH/GPUVM.

    // if it does not fall into one of the three APEs, it

    // will be a regular global access.

    void doApertureCheck(const VectorMask &mask);

    // Function to resolve a flat accesses during execution stage.

    void resolveFlatSegment(const VectorMask &mask);


    template<typename c0> AtomicOpFunctorPtr

    makeAtomicOpFunctor(c0 *reg0, c0 *reg1)

    {

        if (isAtomicAnd()) {

            return std::make_unique<AtomicOpAnd<c0>>(*reg0);

        } else if (isAtomicOr()) {

            return std::make_unique<AtomicOpOr<c0>>(*reg0);

        } else if (isAtomicXor()) {

            return std::make_unique<AtomicOpXor<c0>>(*reg0);

        } else if (isAtomicCAS()) {

            return std::make_unique<AtomicOpCAS<c0>>(*reg0, *reg1, cu);

        } else if (isAtomicExch()) {

            return std::make_unique<AtomicOpExch<c0>>(*reg0);

        } else if (isAtomicAdd()) {

            return std::make_unique<AtomicOpAdd<c0>>(*reg0);

        } else if (isAtomicSub()) {

            return std::make_unique<AtomicOpSub<c0>>(*reg0);

        } else if (isAtomicInc()) {

            return std::make_unique<AtomicOpInc<c0>>();

        } else if (isAtomicDec()) {

            return std::make_unique<AtomicOpDec<c0>>();

        } else if (isAtomicMax()) {

            return std::make_unique<AtomicOpMax<c0>>(*reg0);

        } else if (isAtomicMin()) {

            return std::make_unique<AtomicOpMin<c0>>(*reg0);

        } else {

            fatal("Unrecognized atomic operation");

        }

    }


    void

    setRequestFlags(RequestPtr req) const

    {

        if (isGloballyCoherent()) {

            req->setCacheCoherenceFlags(Request::GLC_BIT);

        }


        if (isSystemCoherent()) {

            req->setCacheCoherenceFlags(Request::SLC_BIT);

        }


        if (isAtomicRet()) {

            req->setFlags(Request::ATOMIC_RETURN_OP);

        } else if (isAtomicNoRet()) {

            req->setFlags(Request::ATOMIC_NO_RETURN_OP);

        }


        if (isMemSync()) {

            // the path for kernel launch and kernel end is different

            // from non-kernel mem sync.

            assert(!isKernelLaunch());

            assert(!isEndOfKernel());


            // must be wbinv inst if not kernel launch/end

            req->setCacheCoherenceFlags(Request::INV_L1);

        }

    }


    // reset the number of pending memory requests for all lanes

    void

    resetEntireStatusVector()

    {

        assert(statusVector.size() == TheGpuISA::NumVecElemPerVecReg);

        for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {

            resetStatusVector(lane);

        }

    }


    // reset the number of pending memory requests for the inputted lane

    void

    resetStatusVector(int lane)

    {

        setStatusVector(lane, 0);

    }


    // set the number of pending memory requests for the inputted lane

    void

    setStatusVector(int lane, int newVal)

    {

        // currently we can have up to 2 memory requests per lane (if the

        // lane's request goes across multiple cache lines)

        assert((newVal >= 0) && (newVal <= 2));

        statusVector[lane] = newVal;

    }


    // subtracts the number of pending memory requests for the inputted lane

    // by 1

    void

    decrementStatusVector(int lane)

    {

        // this lane may have multiple requests, so only subtract one for

        // this request

        assert(statusVector[lane] >= 1);

        statusVector[lane]--;

    }


    // return the current number of pending memory requests for the inputted

    // lane

    int

    getLaneStatus(int lane) const

    {

        return statusVector[lane];

    }


    // returns true if all memory requests from all lanes have been received,

    // else returns false

    bool

    allLanesZero() const

    {

        // local variables

        bool allZero = true;


        // iterate over all lanes, checking the number of pending memory

        // requests they have

        for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {

            // if any lane still has pending requests, return false

            if (statusVector[lane] > 0) {

                DPRINTF(GPUMem, "CU%d: WF[%d][%d]: lane: %d has %d pending "

                        "request(s) for %#x\n", cu_id, simdId, wfSlotId, lane,

                        statusVector[lane], addr[lane]);

                allZero = false;

            }

        }


        if (allZero) {

            DPRINTF(GPUMem, "CU%d: WF[%d][%d]: all lanes have no pending"

                    " requests for %#x\n", cu_id, simdId, wfSlotId, addr[0]);

        }

        return allZero;

    }


    // returns a string representing the current state of the statusVector

    std::string

    printStatusVector() const

    {

        std::string statusVec_str = "[";


        // iterate over all lanes, adding the current number of pending

        // requests for this lane to the string

        for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {

            statusVec_str += std::to_string(statusVector[lane]);

        }

        statusVec_str += "]";


        return statusVec_str;

    }


    // Map returned packets and the addresses they satisfy with which lane they

    // were requested from

    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;

    StatusVector memStatusVector;


    // Track the status of memory requests per lane, an int per lane to allow

    // unaligned accesses

    std::vector<int> statusVector;

    // for ld_v# or st_v#

    std::vector<int> tlbHitLevel;


    // for misaligned scalar ops we track the number

    // of outstanding reqs here

    int numScalarReqs;


    Tick getAccessTime() const { return accessTime; }


    void setAccessTime(Tick currentTime) { accessTime = currentTime; }


    void profileRoundTripTime(Tick currentTime, int hopId);

    std::vector<Tick> getRoundTripTime() const { return roundTripTime; }


    void profileLineAddressTime(Addr addr, Tick currentTime, int hopId);

    const std::map<Addr, std::vector<Tick>>& getLineAddressTime() const

    { return lineAddressTime; }


    // inst used to save/restore a wavefront context

    bool isSaveRestore;

  private:

    GPUStaticInst *_staticInst;

    const InstSeqNum _seqNum;

    int maxSrcVecRegOpSize;

    int maxSrcScalarRegOpSize;


    // the time the request was started

    Tick accessTime = -1;


    // hold the tick when the instruction arrives at certain hop points

    // on it's way to main memory

    std::vector<Tick> roundTripTime;


    // hold each cache block address for the instruction and a vector

    // to hold the tick when the block arrives at certain hop points

    std::map<Addr, std::vector<Tick>> lineAddressTime;

};


} // namespace gem5


#endif // __GPU_DYN_INST_HH__