develop/gpu__dyn__inst_8cc_source.html

/*

 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.

 * All rights reserved.

 *

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions are met:

 *

 * 1. Redistributions of source code must retain the above copyright notice,

 * this list of conditions and the following disclaimer.

 *

 * 2. Redistributions in binary form must reproduce the above copyright notice,

 * this list of conditions and the following disclaimer in the documentation

 * and/or other materials provided with the distribution.

 *

 * 3. Neither the name of the copyright holder nor the names of its

 * contributors may be used to endorse or promote products derived from this

 * software without specific prior written permission.

 *

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE

 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

 * POSSIBILITY OF SUCH DAMAGE.

 */


#include "gpu-compute/gpu_dyn_inst.hh"


#include "debug/GPUInst.hh"

#include "debug/GPUMem.hh"

#include "gpu-compute/gpu_static_inst.hh"

#include "gpu-compute/scalar_register_file.hh"

#include "gpu-compute/shader.hh"

#include "gpu-compute/wavefront.hh"


namespace gem5

{


// Builds a dynamic instruction that wraps static_inst for the given compute
// unit and wavefront. The constructor:
//   - asks the static instruction to initialise its operand info,
//   - allocates the d_data / a_data / x_data / scalar_data byte buffers with
//     every byte set to zero,
//   - copies the wavefront's identifiers (simdId, wfDynId, kernId, wgId,
//     wfSlotId), or stores -1 for each of them when _wf is null,
//   - asks the static instruction to initialise its dynamic operand info.
GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
                       GPUStaticInst *static_inst, InstSeqNum instSeqNum)
    : GPUExecContext(_cu, _wf), scalarAddr(0), addr(computeUnit()->wfSize(),
      (Addr)0), numScalarReqs(0), isSaveRestore(false),
      _staticInst(static_inst), _seqNum(instSeqNum),
      maxSrcVecRegOpSize(-1), maxSrcScalarRegOpSize(-1)
{
    _staticInst->initOperandInfo();

    // the wavefront size is used for every per-lane allocation below
    const int wf_size = computeUnit()->wfSize();

    statusVector.assign(TheGpuISA::NumVecElemPerVecReg, 0);
    tlbHitLevel.assign(wf_size, -1);

    // The trailing "()" value-initializes each array, so all bytes start at
    // zero without separate fill loops (which also compared a signed index
    // against an unsigned size).

    // vector instructions can have up to 4 source/destination operands
    d_data = new uint8_t[wf_size * 4 * sizeof(double)]();
    a_data = new uint8_t[wf_size * 8]();
    x_data = new uint8_t[wf_size * 8]();
    // scalar loads can read up to 16 Dwords of data (see publicly
    // available Vega ISA manual)
    scalar_data = new uint8_t[16 * sizeof(uint32_t)]();

    time = 0;

    cu_id = _cu->cu_id;
    if (_wf) {
        simdId = _wf->simdId;
        wfDynId = _wf->wfDynId;
        kern_id = _wf->kernId;
        wg_id = _wf->wgId;
        wfSlotId = _wf->wfSlotId;
    } else {
        // no wavefront: mark every wavefront-derived id as invalid
        simdId = -1;
        wfDynId = -1;
        kern_id = -1;
        wg_id = -1;
        wfSlotId = -1;
    }

    DPRINTF(GPUInst, "%s: generating operand info for %d operands\n",
            disassemble(), getNumOperands());

    _staticInst->initDynOperandInfo(wavefront(), computeUnit());
}


// Frees the four byte buffers (array delete) and deletes the static
// instruction object this dynamic instruction points to.
GPUDynInst::~GPUDynInst()

{

    delete[] d_data;

    delete[] a_data;

    delete[] x_data;

    delete[] scalar_data;

    delete _staticInst;

}


// Executes the instruction by forwarding gpuDynInst to the underlying
// static instruction's execute().
void


GPUDynInst::execute(GPUDynInstPtr gpuDynInst)

{

    _staticInst->execute(gpuDynInst);

}


// Returns the static instruction's list of source vector register operands.
const std::vector<OperandInfo>&


GPUDynInst::srcVecRegOperands() const

{

    return _staticInst->srcVecRegOperands();

}


// Returns the static instruction's list of destination vector register
// operands.
const std::vector<OperandInfo>&


GPUDynInst::dstVecRegOperands() const

{

    return _staticInst->dstVecRegOperands();

}


// Returns the static instruction's list of source scalar register operands.
const std::vector<OperandInfo>&


GPUDynInst::srcScalarRegOperands() const

{

    return _staticInst->srcScalarRegOperands();

}


// Returns the static instruction's list of destination scalar register
// operands.
const std::vector<OperandInfo>&


GPUDynInst::dstScalarRegOperands() const

{

    return _staticInst->dstScalarRegOperands();

}


// Delegates to the underlying static instruction's numSrcRegOperands().
int


GPUDynInst::numSrcRegOperands()

{

    return _staticInst->numSrcRegOperands();

}


// Delegates to the underlying static instruction's numDstRegOperands().
int


GPUDynInst::numDstRegOperands()

{

    return _staticInst->numDstRegOperands();

}


// Delegates to the underlying static instruction's numSrcVecOperands()
// (note the static-side name has no "Reg").
int


GPUDynInst::numSrcVecRegOperands() const

{

    return _staticInst->numSrcVecOperands();

}


// Delegates to the underlying static instruction's numDstVecOperands()
// (note the static-side name has no "Reg").
int


GPUDynInst::numDstVecRegOperands() const

{

    return _staticInst->numDstVecOperands();

}


// Returns the largest sizeInDWords() among the source vector register
// operands (0 when there are none). The result is computed on the first
// call and cached in maxSrcVecRegOpSize, where -1 means "not computed yet".
int
GPUDynInst::maxSrcVecRegOperandSize()
{
    if (maxSrcVecRegOpSize == -1) {
        // first call: scan every source vector operand once
        maxSrcVecRegOpSize = 0;
        for (const auto &operand : srcVecRegOperands()) {
            if (operand.sizeInDWords() > maxSrcVecRegOpSize) {
                maxSrcVecRegOpSize = operand.sizeInDWords();
            }
        }
    }

    return maxSrcVecRegOpSize;
}


// Delegates to the underlying static instruction's numSrcVecDWords().
int


GPUDynInst::numSrcVecDWords()

{

    return _staticInst->numSrcVecDWords();

}


// Delegates to the underlying static instruction's numDstVecDWords().
int


GPUDynInst::numDstVecDWords()

{

    return _staticInst->numDstVecDWords();

}


// Delegates to the underlying static instruction's numSrcScalarOperands()
// (note the static-side name has no "Reg").
int


GPUDynInst::numSrcScalarRegOperands() const

{

    return _staticInst->numSrcScalarOperands();

}


// Delegates to the underlying static instruction's numDstScalarOperands()
// (note the static-side name has no "Reg").
int


GPUDynInst::numDstScalarRegOperands() const

{

    return _staticInst->numDstScalarOperands();

}


// Returns the largest sizeInDWords() among the source scalar register
// operands (0 when there are none). The result is computed on the first
// call and cached in maxSrcScalarRegOpSize, where -1 means "not computed
// yet".
int
GPUDynInst::maxSrcScalarRegOperandSize()
{
    if (maxSrcScalarRegOpSize == -1) {
        // first call: scan every source scalar operand once
        maxSrcScalarRegOpSize = 0;
        for (const auto &operand : srcScalarRegOperands()) {
            if (operand.sizeInDWords() > maxSrcScalarRegOpSize) {
                maxSrcScalarRegOpSize = operand.sizeInDWords();
            }
        }
    }

    return maxSrcScalarRegOpSize;
}


// Delegates to the underlying static instruction's numSrcScalarDWords().
int


GPUDynInst::numSrcScalarDWords()

{

    return _staticInst->numSrcScalarDWords();

}


// Delegates to the underlying static instruction's numDstScalarDWords().
int


GPUDynInst::numDstScalarDWords()

{

    return _staticInst->numDstScalarDWords();

}


// Delegates to the underlying static instruction's maxOperandSize().
int


GPUDynInst::maxOperandSize()

{

    return _staticInst->maxOperandSize();

}


// Delegates to the underlying static instruction's getNumOperands().
int


GPUDynInst::getNumOperands() const

{

    return _staticInst->getNumOperands();

}


// True when the list returned by srcVecRegOperands() is non-empty.
bool


GPUDynInst::hasSourceVgpr() const

{

    return !srcVecRegOperands().empty();

}


// True when the list returned by dstVecRegOperands() is non-empty.
bool


GPUDynInst::hasDestinationVgpr() const

{

    return !dstVecRegOperands().empty();

}


// True when the list returned by srcScalarRegOperands() is non-empty.
bool


GPUDynInst::hasSourceSgpr() const

{

    return !srcScalarRegOperands().empty();

}


// True when the list returned by dstScalarRegOperands() is non-empty.
bool


GPUDynInst::hasDestinationSgpr() const

{

    return !dstScalarRegOperands().empty();

}


// True when the static instruction's opcode string contains both opcodeStr
// and extStr as substrings (in any position and order).
bool
GPUDynInst::isOpcode(const std::string& opcodeStr,
                     const std::string& extStr) const
{
    // the base opcode has to match before the extension is even considered
    if (_staticInst->opcode().find(opcodeStr) == std::string::npos) {
        return false;
    }

    return _staticInst->opcode().find(extStr) != std::string::npos;
}


// True when the static instruction's opcode string contains opcodeStr as a
// substring.
bool


GPUDynInst::isOpcode(const std::string& opcodeStr) const

{

    return _staticInst->opcode().find(opcodeStr) != std::string::npos;

}


// Returns the string produced by the underlying static instruction's
// disassemble().
const std::string&


GPUDynInst::disassemble() const

{

    return _staticInst->disassemble();

}


// Returns the sequence number supplied to the constructor.
InstSeqNum


GPUDynInst::seqNum() const

{

    return _seqNum;

}


// Returns the PC currently reported by this instruction's wavefront.
Addr


GPUDynInst::pc()

{

    return wavefront()->pc();

}


// Sets the PC of this instruction's wavefront to _pc.
void


GPUDynInst::pc(Addr _pc)

{

    wavefront()->pc(_pc);

}


// Returns the executed_as field stored on the underlying static instruction.
enums::StorageClassType


GPUDynInst::executedAs()

{

    return _staticInst->executed_as;

}


// Process a memory instruction and (if necessary) submit timing request
// Emits a GPUMem debug trace with the CU id, SIMD id, wavefront slot and
// exec_mask, then forwards gpuDynInst to the static instruction's
// initiateAcc().

void


GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst)

{

    DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=%#x\n",

            cu->cu_id, simdId, wfSlotId, exec_mask);


    _staticInst->initiateAcc(gpuDynInst);

}


// Emits a GPUMem debug trace with the CU id, SIMD id, wavefront slot and
// exec_mask, then forwards gpuDynInst to the static instruction's
// completeAcc().
void


GPUDynInst::completeAcc(GPUDynInstPtr gpuDynInst)

{

    DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector="

            "%#x complete\n",

            cu->cu_id, simdId, wfSlotId, exec_mask);


    _staticInst->completeAcc(gpuDynInst);

}


// Delegates to the underlying static instruction's isALU().
bool


GPUDynInst::isALU() const

{

    return _staticInst->isALU();

}


// Delegates to the underlying static instruction's isBranch().
bool


GPUDynInst::isBranch() const

{

    return _staticInst->isBranch();

}


// Delegates to the underlying static instruction's isCondBranch().
bool


GPUDynInst::isCondBranch() const

{

    return _staticInst->isCondBranch();

}


// Delegates to the underlying static instruction's isNop().
bool


GPUDynInst::isNop() const

{

    return _staticInst->isNop();

}


// Delegates to the underlying static instruction's isEndOfKernel().
bool


GPUDynInst::isEndOfKernel() const

{

    return _staticInst->isEndOfKernel();

}


// Delegates to the underlying static instruction's isKernelLaunch().
bool


GPUDynInst::isKernelLaunch() const

{

    return _staticInst->isKernelLaunch();

}


// Delegates to the underlying static instruction's isSDWAInst().
bool


GPUDynInst::isSDWAInst() const

{

    return _staticInst->isSDWAInst();

}


// Delegates to the underlying static instruction's isDPPInst().
bool


GPUDynInst::isDPPInst() const

{

    return _staticInst->isDPPInst();

}


// Delegates to the underlying static instruction's isReturn().
bool


GPUDynInst::isReturn() const

{

    return _staticInst->isReturn();

}


// Delegates to the underlying static instruction's isUnconditionalJump().
bool


GPUDynInst::isUnconditionalJump() const

{

    return _staticInst->isUnconditionalJump();

}


// Delegates to the underlying static instruction's isSpecialOp().
bool


GPUDynInst::isSpecialOp() const

{

    return _staticInst->isSpecialOp();

}


// Delegates to the underlying static instruction's isWaitcnt().
bool


GPUDynInst::isWaitcnt() const

{

    return _staticInst->isWaitcnt();

}


// Delegates to the underlying static instruction's isSleep().
bool


GPUDynInst::isSleep() const

{

    return _staticInst->isSleep();

}


// Delegates to the underlying static instruction's isBarrier().
bool


GPUDynInst::isBarrier() const

{

    return _staticInst->isBarrier();

}


// Delegates to the underlying static instruction's isMemSync().
bool


GPUDynInst::isMemSync() const

{

    return _staticInst->isMemSync();

}


// Delegates to the underlying static instruction's isMemRef().
bool


GPUDynInst::isMemRef() const

{

    return _staticInst->isMemRef();

}


// Delegates to the underlying static instruction's isFlat().
bool


GPUDynInst::isFlat() const

{

    return _staticInst->isFlat();

}


// Delegates to the underlying static instruction's isFlatGlobal().
bool


GPUDynInst::isFlatGlobal() const

{

    return _staticInst->isFlatGlobal();

}


// Delegates to the underlying static instruction's isFlatScratch().
bool


GPUDynInst::isFlatScratch() const

{

    return _staticInst->isFlatScratch();

}


// Delegates to the underlying static instruction's isLoad().
bool


GPUDynInst::isLoad() const

{

    return _staticInst->isLoad();

}


// Delegates to the underlying static instruction's isStore().
bool


GPUDynInst::isStore() const

{

    return _staticInst->isStore();

}


// Delegates to the underlying static instruction's isAtomic().
bool


GPUDynInst::isAtomic() const

{

    return _staticInst->isAtomic();

}


// Delegates to the underlying static instruction's isAtomicNoRet().
bool


GPUDynInst::isAtomicNoRet() const

{

    return _staticInst->isAtomicNoRet();

}


// Delegates to the underlying static instruction's isAtomicRet().
bool


GPUDynInst::isAtomicRet() const

{

    return _staticInst->isAtomicRet();

}


// True when the underlying static instruction's isScalar() is false; there
// is no separate vector query, this is simply the negation.
bool


GPUDynInst::isVector() const

{

    return !_staticInst->isScalar();

}


// Delegates to the underlying static instruction's isScalar().
bool


GPUDynInst::isScalar() const

{

    return _staticInst->isScalar();

}


// Delegates to the underlying static instruction's readsSCC().
bool


GPUDynInst::readsSCC() const

{

    return _staticInst->readsSCC();

}


// Delegates to the underlying static instruction's writesSCC().
bool


GPUDynInst::writesSCC() const

{

    return _staticInst->writesSCC();

}


// True when any source operand of the static instruction reports isVcc(),
// or, if none does, when the static instruction's own readsVCC() is true.
bool
GPUDynInst::readsVCC() const
{
    const auto &sources = _staticInst->srcOperands();

    // stop scanning as soon as one explicit VCC operand is seen
    bool explicitVcc = false;
    for (auto op = sources.begin(); op != sources.end() && !explicitVcc;
         ++op) {
        explicitVcc = op->isVcc();
    }

    return explicitVcc || _staticInst->readsVCC();
}


// True when any destination operand of the static instruction reports
// isVcc(), or, if none does, when the static instruction's own writesVCC()
// is true.
bool
GPUDynInst::writesVCC() const
{
    const auto &dests = _staticInst->dstOperands();

    // stop scanning as soon as one explicit VCC operand is seen
    bool explicitVcc = false;
    for (auto op = dests.begin(); op != dests.end() && !explicitVcc; ++op) {
        explicitVcc = op->isVcc();
    }

    return explicitVcc || _staticInst->writesVCC();
}


// Delegates to the underlying static instruction's readsMode().
bool


GPUDynInst::readsMode() const

{

    return _staticInst->readsMode();

}


// Delegates to the underlying static instruction's writesMode().
bool


GPUDynInst::writesMode() const

{

    return _staticInst->writesMode();

}


// Delegates to the underlying static instruction's readsEXEC(); unlike
// readsExecMask(), the operand list is not inspected here.
bool


GPUDynInst::readsExec() const

{

    return _staticInst->readsEXEC();

}


// Delegates to the underlying static instruction's writesEXEC(); unlike
// writesExecMask(), the operand list is not inspected here.
bool


GPUDynInst::writesExec() const

{

    return _staticInst->writesEXEC();

}


// Delegates to the underlying static instruction's ignoreExec().
bool


GPUDynInst::ignoreExec() const

{

    return _staticInst->ignoreExec();

}


// True when any destination operand of the static instruction reports
// isExec(), or, if none does, when the static instruction's own
// writesEXEC() is true.
bool
GPUDynInst::writesExecMask() const
{
    const auto &dests = _staticInst->dstOperands();

    // stop scanning as soon as one explicit EXEC operand is seen
    bool explicitExec = false;
    for (auto op = dests.begin(); op != dests.end() && !explicitExec; ++op) {
        explicitExec = op->isExec();
    }

    return explicitExec || _staticInst->writesEXEC();
}


// True when any source operand of the static instruction reports isExec(),
// or, if none does, when the static instruction's own readsEXEC() is true.
bool
GPUDynInst::readsExecMask() const
{
    const auto &sources = _staticInst->srcOperands();

    // stop scanning as soon as one explicit EXEC operand is seen
    bool explicitExec = false;
    for (auto op = sources.begin(); op != sources.end() && !explicitExec;
         ++op) {
        explicitExec = op->isExec();
    }

    return explicitExec || _staticInst->readsEXEC();
}


// True when at least one destination scalar register operand reports
// isFlatScratch(); false otherwise (including when there are no operands).
bool
GPUDynInst::writesFlatScratch() const
{
    const auto &dests = dstScalarRegOperands();

    // stop scanning at the first flat-scratch operand
    bool found = false;
    for (auto op = dests.begin(); op != dests.end() && !found; ++op) {
        found = op->isFlatScratch();
    }

    return found;
}


// True when at least one source scalar register operand reports
// isFlatScratch(); false otherwise (including when there are no operands).
bool
GPUDynInst::readsFlatScratch() const
{
    const auto &sources = srcScalarRegOperands();

    // stop scanning at the first flat-scratch operand
    bool found = false;
    for (auto op = sources.begin(); op != sources.end() && !found; ++op) {
        found = op->isFlatScratch();
    }

    return found;
}


// True when the instruction is a global-memory access or any of the flat
// variants (isFlat, isFlatGlobal, isFlatScratch).
bool


GPUDynInst::needsToken() const

{

    return isGlobalMem() || isFlat() || isFlatGlobal() || isFlatScratch();

}


// Delegates to the underlying static instruction's isAtomicAnd().
bool


GPUDynInst::isAtomicAnd() const

{

    return _staticInst->isAtomicAnd();

}


// Delegates to the underlying static instruction's isAtomicOr().
bool


GPUDynInst::isAtomicOr() const

{

    return _staticInst->isAtomicOr();

}


// Delegates to the underlying static instruction's isAtomicXor().
bool


GPUDynInst::isAtomicXor() const

{

    return _staticInst->isAtomicXor();

}


// Delegates to the underlying static instruction's isAtomicCAS().
bool


GPUDynInst::isAtomicCAS() const

{

    return _staticInst->isAtomicCAS();

}


// Delegates to the underlying static instruction's isAtomicExch().
bool GPUDynInst::isAtomicExch() const

{

    return _staticInst->isAtomicExch();

}


// Delegates to the underlying static instruction's isAtomicAdd().
bool


GPUDynInst::isAtomicAdd() const

{

    return _staticInst->isAtomicAdd();

}


// Delegates to the underlying static instruction's isAtomicSub().
bool


GPUDynInst::isAtomicSub() const

{

    return _staticInst->isAtomicSub();

}


// Delegates to the underlying static instruction's isAtomicInc().
bool


GPUDynInst::isAtomicInc() const

{

    return _staticInst->isAtomicInc();

}


// Delegates to the underlying static instruction's isAtomicDec().
bool


GPUDynInst::isAtomicDec() const

{

    return _staticInst->isAtomicDec();

}


// Delegates to the underlying static instruction's isAtomicMax().
bool


GPUDynInst::isAtomicMax() const

{

    return _staticInst->isAtomicMax();

}


// Delegates to the underlying static instruction's isAtomicMin().
bool


GPUDynInst::isAtomicMin() const

{

    return _staticInst->isAtomicMin();

}


// Delegates to the underlying static instruction's isAtomicPkAddBF16().
bool


GPUDynInst::isAtomicPkAddBF16() const

{

    return _staticInst->isAtomicPkAddBF16();

}


// Delegates to the underlying static instruction's isArgLoad().
bool


GPUDynInst::isArgLoad() const

{

    return _staticInst->isArgLoad();

}


// Delegates to the underlying static instruction's isGlobalMem().
bool


GPUDynInst::isGlobalMem() const

{

    return _staticInst->isGlobalMem();

}


// Delegates to the underlying static instruction's isLocalMem().
bool


GPUDynInst::isLocalMem() const

{

    return _staticInst->isLocalMem();

}


// Delegates to the underlying static instruction's isArgSeg().
bool


GPUDynInst::isArgSeg() const

{

    return _staticInst->isArgSeg();

}


// Delegates to the underlying static instruction's isGlobalSeg().
bool


GPUDynInst::isGlobalSeg() const

{

    return _staticInst->isGlobalSeg();

}


// Delegates to the underlying static instruction's isGroupSeg().
bool


GPUDynInst::isGroupSeg() const

{

    return _staticInst->isGroupSeg();

}


// Delegates to the underlying static instruction's isKernArgSeg().
bool


GPUDynInst::isKernArgSeg() const

{

    return _staticInst->isKernArgSeg();

}


// Delegates to the underlying static instruction's isPrivateSeg().
bool


GPUDynInst::isPrivateSeg() const

{

    return _staticInst->isPrivateSeg();

}


// Delegates to the underlying static instruction's isReadOnlySeg().
bool


GPUDynInst::isReadOnlySeg() const

{

    return _staticInst->isReadOnlySeg();

}


// Delegates to the underlying static instruction's isSpillSeg().
bool


GPUDynInst::isSpillSeg() const

{

    return _staticInst->isSpillSeg();

}


// Delegates to the underlying static instruction's isGloballyCoherent().
bool


GPUDynInst::isGloballyCoherent() const

{

    return _staticInst->isGloballyCoherent();

}


// Delegates to the underlying static instruction's isSystemCoherent().
bool


GPUDynInst::isSystemCoherent() const

{

    return _staticInst->isSystemCoherent();

}


// Delegates to the underlying static instruction's isI8().
bool


GPUDynInst::isI8() const

{

    return _staticInst->isI8();

}


// Delegates to the underlying static instruction's isF16().
bool


GPUDynInst::isF16() const

{

    return _staticInst->isF16();

}


// Delegates to the underlying static instruction's isF32().
bool


GPUDynInst::isF32() const

{

    return _staticInst->isF32();

}


// Delegates to the underlying static instruction's isF64().
bool


GPUDynInst::isF64() const

{

    return _staticInst->isF64();

}


// Delegates to the underlying static instruction's isFMA().
bool


GPUDynInst::isFMA() const

{

    return _staticInst->isFMA();

}


// Delegates to the underlying static instruction's isMAC().
bool


GPUDynInst::isMAC() const

{

    return _staticInst->isMAC();

}


// Delegates to the underlying static instruction's isMAD().
bool


GPUDynInst::isMAD() const

{

    return _staticInst->isMAD();

}


// Delegates to the underlying static instruction's isMFMA().
bool


GPUDynInst::isMFMA() const

{

    return _staticInst->isMFMA();

}


void


GPUDynInst::doApertureCheck(const VectorMask &mask)

{

    assert(mask.any());

    // find the segment of the first active address, after

    // that we check that all other active addresses also

    // fall within the same APE

    for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {

        if (mask[lane]) {

            if (computeUnit()->shader->isLdsApe(addr[lane])) {

                // group segment

                staticInstruction()->executed_as = enums::SC_GROUP;

                break;

            } else if (computeUnit()->shader->isScratchApe(addr[lane])) {

                // private segment

                staticInstruction()->executed_as = enums::SC_PRIVATE;

                break;

            } else if (computeUnit()->shader->isGpuVmApe(addr[lane])) {

                // we won't support GPUVM

                fatal("flat access is in GPUVM APE\n");

            } else if (bits(addr[lane], 63, 47) != 0x1FFFF &&

                       bits(addr[lane], 63, 47)) {

                // we are in the "hole", this is a memory violation

                fatal("flat access at addr %#x has a memory violation\n",

                      addr[lane]);

            } else {

                // global memory segment

                staticInstruction()->executed_as = enums::SC_GLOBAL;

                break;

            }

        }

    }


    // we should have found the segment

    assert(executedAs() != enums::SC_NONE);


    // flat accesses should not straddle multiple APEs so we

    // must check that all addresses fall within the same APE

    if (executedAs() == enums::SC_GROUP) {

        for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {

            if (mask[lane]) {

                // if the first valid addr we found above was LDS,

                // all the rest should be

                assert(computeUnit()->shader->isLdsApe(addr[lane]));

            }

        }

    } else if (executedAs() == enums::SC_PRIVATE) {

        for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {

            if (mask[lane]) {

                // if the first valid addr we found above was private,

                // all the rest should be

                assert(computeUnit()->shader->isScratchApe(addr[lane]));

            }

        }

    } else {

        for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {

            if (mask[lane]) {

                // if the first valid addr we found above was global,

                // all the rest should be. because we don't have an

                // explicit range of the global segment, we just make

                // sure that the address fall in no other APE and that

                // it is not a memory violation

                assert(!computeUnit()->shader->isLdsApe(addr[lane]));

                assert(!computeUnit()->shader->isScratchApe(addr[lane]));

                assert(!computeUnit()->shader->isGpuVmApe(addr[lane]));

                assert(!(bits(addr[lane], 63, 47) != 0x1FFFF

                       && bits(addr[lane], 63, 47)));

            }

        }

    }

}


// Resolves a flat access to a concrete segment and fixes up the wavefront's
// bookkeeping accordingly. Runs doApertureCheck(mask) first, then branches
// on executedAs():
//   - SC_GLOBAL:  addresses are left unchanged; execUnitId is set to the
//                 wavefront's flatGmUnitId and the local-memory (Lm) request
//                 counters are decremented.
//   - SC_GROUP:   each active lane's address has the LDS aperture base
//                 subtracted; execUnitId is set to flatLmUnitId, the VMEM
//                 issue tracking is undone and the global-memory (Gm) request
//                 counters are decremented.
//   - SC_PRIVATE: each active lane's address is rewritten (see the two
//                 gfx-version branches); execUnitId is set to flatLmUnitId
//                 and, only when isFlat(), the LGKM issue tracking is undone
//                 and the Lm request counters are decremented.
//   - otherwise:  panics on the first active lane.
// Only lanes set in mask have their address modified.
void


GPUDynInst::resolveFlatSegment(const VectorMask &mask)

{

    doApertureCheck(mask);


    // Now that we know the aperture, do the following:

    // 1. Transform the flat address to its segmented equivalent.

    // 2. Set the execUnitId based on the aperture check.

    // 3. Decrement any extra resources that were reserved. Other

    //    resources are released as normal, below.

    if (executedAs() == enums::SC_GLOBAL) {

        // no transformation for global segment

        wavefront()->execUnitId =  wavefront()->flatGmUnitId;

        if (isLoad()) {

            wavefront()->rdLmReqsInPipe--;

        } else if (isStore()) {

            wavefront()->wrLmReqsInPipe--;

        } else if (isAtomic() || isMemSync()) {

            wavefront()->wrLmReqsInPipe--;

            wavefront()->rdLmReqsInPipe--;

        } else {

            panic("Invalid memory operation!\n");

        }

    } else if (executedAs() == enums::SC_GROUP) {

        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {

            if (mask[lane]) {

                // flat address calculation goes here.

                // addr[lane] = segmented address

                addr[lane] = addr[lane] -

                    wavefront()->computeUnit->shader->ldsApe().base;

                // the rebased address must fit inside the LDS address range

                assert(addr[lane] <

                  wavefront()->computeUnit->getLds().getAddrRange().size());

            }

        }

        wavefront()->execUnitId =  wavefront()->flatLmUnitId;

        wavefront()->decVMemInstsIssued();

        wavefront()->vmemIssued.erase(seqNum());

        if (isLoad()) {

            wavefront()->rdGmReqsInPipe--;

        } else if (isStore()) {

            wavefront()->wrGmReqsInPipe--;

        } else if (isAtomic() || isMemSync()) {

            wavefront()->rdGmReqsInPipe--;

            wavefront()->wrGmReqsInPipe--;

        } else {

            panic("Invalid memory operation!\n");

        }

    } else if (executedAs() == enums::SC_PRIVATE) {


        // NOTE: this local deliberately-or-not shadows the member named cu
        // for the rest of this branch.

        ComputeUnit *cu = wavefront()->computeUnit;


        if (wavefront()->gfxVersion == GfxVersion::gfx942 ||

            wavefront()->gfxVersion == GfxVersion::gfx950) {

            // Architected flat scratch base address is in a dedicated hardware

            // register.

            for (int lane = 0; lane < cu->wfSize(); ++lane) {

                if (mask[lane]) {

                    // The scratch base is added for other gfx versions,

                    // otherwise this would simply add the register base.

                    addr[lane] = addr[lane] - cu->shader->getScratchBase()

                        + wavefront()->archFlatScratchAddr;

                }

            }

        } else {

            // In absolute flat scratch the program needs to place scratch

            // address in SGPRn-3,4.

            // The value read from SGPR (maxSgprs - 4) is used as the offset
            // and the value read from SGPR (maxSgprs - 3) as the per-lane
            // size in the address computation below.

            uint32_t numSgprs = wavefront()->maxSgprs;

            uint32_t physSgprIdx =

                cu->registerManager->mapSgpr(wavefront(), numSgprs - 4);

            uint32_t offset = cu->srf[simdId]->read(physSgprIdx);

            physSgprIdx =

                cu->registerManager->mapSgpr(wavefront(), numSgprs - 3);

            uint32_t size = cu->srf[simdId]->read(physSgprIdx);


            for (int lane = 0; lane < cu->wfSize(); ++lane) {

                if (mask[lane]) {

                    addr[lane] = addr[lane] + lane * size + offset +

                        cu->shader->getHiddenPrivateBase() -

                        cu->shader->getScratchBase();

                }

            }

        }


        wavefront()->execUnitId = wavefront()->flatLmUnitId;


        // For FLAT the local memory pipe counters are incremented, but they

        // are not incremented for explicit scratch_* instructions. Only

        // decrement these counters if we are explicitly a FLAT instruction.

        if (isFlat()) {

            wavefront()->decLGKMInstsIssued();

            wavefront()->lgkmIssued.erase(seqNum());

            if (isLoad()) {

                wavefront()->rdLmReqsInPipe--;

            } else if (isStore()) {

                wavefront()->wrLmReqsInPipe--;

            } else if (isAtomic() || isMemSync()) {

                wavefront()->wrLmReqsInPipe--;

                wavefront()->rdLmReqsInPipe--;

            } else {

                panic("Invalid memory operation!\n");

            }

        }

    } else {

        // unexpected segment: report the first active lane's address

        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {

            if (mask[lane]) {

                panic("flat addr %#llx maps to bad segment %d\n",

                      addr[lane], executedAs());

            }

        }

    }

}


// Delegates to the underlying static instruction's srcLiteral().
TheGpuISA::ScalarRegU32


GPUDynInst::srcLiteral() const

{

    return _staticInst->srcLiteral();

}


void


GPUDynInst::updateStats()

{

    if (_staticInst->isLocalMem()) {

        // access to LDS (shared) memory

        cu->stats.dynamicLMemInstrCnt++;

    } else if (_staticInst->isFlat()) {

        cu->stats.dynamicFlatMemInstrCnt++;

    } else {

        // access to global memory


        // update PageDivergence histogram

        int number_pages_touched = cu->pagesTouched.size();

        assert(number_pages_touched);

        cu->stats.pageDivergenceDist.sample(number_pages_touched);


        std::pair<ComputeUnit::pageDataStruct::iterator, bool> ret;


        for (auto it : cu->pagesTouched) {

            // see if this page has been touched before. if not, this also

            // inserts the page into the table.

            ret = cu->pageAccesses

                .insert(ComputeUnit::pageDataStruct::value_type(it.first,

                        std::make_pair(1, it.second)));


            // if yes, then update the stats

            if (!ret.second) {

                ret.first->second.first++;

                ret.first->second.second += it.second;

            }

        }


        cu->pagesTouched.clear();


        // total number of memory instructions (dynamic)

        // Atomics are counted as a single memory instruction.

        // this is # memory instructions per wavefronts, not per workitem

        cu->stats.dynamicGMemInstrCnt++;

    }

}


// Appends currentTime to roundTripTime unless the vector already holds more
// than hopId entries; in the case of coalescing this keeps only the first
// measurement for a hop.
void
GPUDynInst::profileRoundTripTime(Tick currentTime, int hopId)
{
    const bool alreadyRecorded = roundTripTime.size() > hopId;

    if (!alreadyRecorded) {
        roundTripTime.push_back(currentTime);
    }
}


// Records currentTime for the line address addr in lineAddressTime.
//   - addr already tracked: append currentTime unless its vector already
//     holds more than hopId entries (only the first measurement per hop is
//     kept);
//   - addr not tracked: start a new one-element vector, but only when
//     hopId is 0; other hops for an untracked address are ignored.
// Note that the addr parameter hides the addr member inside this function.
void
GPUDynInst::profileLineAddressTime(Addr addr, Tick currentTime, int hopId)
{
    // single lookup instead of count() followed by repeated operator[]
    auto entry = lineAddressTime.find(addr);

    if (entry != lineAddressTime.end()) {
        auto &hopTimes = entry->second;
        if (hopTimes.size() > hopId) {
            return;
        }

        hopTimes.push_back(currentTime);
    } else if (hopId == 0) {
        lineAddressTime.emplace(addr, std::vector<Tick> { currentTime });
    }
}


} // namespace gem5

DPRINTF
#define DPRINTF(x,...)
Definition trace.hh:209

gem5::ComputeUnit
Definition compute_unit.hh:204

gem5::ComputeUnit::wfSize
int wfSize() const
Definition compute_unit.hh:412

gem5::ComputeUnit::cu_id
int cu_id
Definition compute_unit.hh:294

gem5::ComputeUnit::shader
Shader * shader
Definition compute_unit.hh:359

gem5::GPUDynInst::isFlatScratch
bool isFlatScratch() const
Definition gpu_dyn_inst.cc:436

gem5::GPUDynInst::isKernelLaunch
bool isKernelLaunch() const
Definition gpu_dyn_inst.cc:358

gem5::GPUDynInst::isAtomicCAS
bool isAtomicCAS() const
Definition gpu_dyn_inst.cc:610

gem5::GPUDynInst::isSpecialOp
bool isSpecialOp() const
Definition gpu_dyn_inst.cc:388

gem5::GPUDynInst::roundTripTime
std::vector< Tick > roundTripTime
Definition gpu_dyn_inst.hh:530

gem5::GPUDynInst::isI8
bool isI8() const
Definition gpu_dyn_inst.cc:735

gem5::GPUDynInst::isLocalMem
bool isLocalMem() const
Definition gpu_dyn_inst.cc:675

gem5::GPUDynInst::hasDestinationSgpr
bool hasDestinationSgpr() const
Definition gpu_dyn_inst.cc:254

gem5::GPUDynInst::writesVCC
bool writesVCC() const
Definition gpu_dyn_inst.cc:506

gem5::GPUDynInst::isAtomicDec
bool isAtomicDec() const
Definition gpu_dyn_inst.cc:639

gem5::GPUDynInst::numDstScalarDWords
int numDstScalarDWords()
Definition gpu_dyn_inst.cc:218

gem5::GPUDynInst::readsVCC
bool readsVCC() const
Definition gpu_dyn_inst.cc:496

gem5::GPUDynInst::isNop
bool isNop() const
Definition gpu_dyn_inst.cc:346

gem5::GPUDynInst::isF16
bool isF16() const
Definition gpu_dyn_inst.cc:741

gem5::GPUDynInst::numDstScalarRegOperands
int numDstScalarRegOperands() const
Definition gpu_dyn_inst.cc:192

gem5::GPUDynInst::lineAddressTime
std::map< Addr, std::vector< Tick > > lineAddressTime
Definition gpu_dyn_inst.hh:534

gem5::GPUDynInst::doApertureCheck
void doApertureCheck(const VectorMask &mask)
Definition gpu_dyn_inst.cc:783

gem5::GPUDynInst::isAtomicRet
bool isAtomicRet() const
Definition gpu_dyn_inst.cc:466

gem5::GPUDynInst::resolveFlatSegment
void resolveFlatSegment(const VectorMask &mask)
Definition gpu_dyn_inst.cc:855

gem5::GPUDynInst::isSaveRestore
bool isSaveRestore
Definition gpu_dyn_inst.hh:513

gem5::GPUDynInst::tlbHitLevel
std::vector< int > tlbHitLevel
Definition gpu_dyn_inst.hh:495

gem5::GPUDynInst::wfDynId
int wfDynId
Definition gpu_dyn_inst.hh:215

gem5::GPUDynInst::isGlobalMem
bool isGlobalMem() const
Definition gpu_dyn_inst.cc:669

gem5::GPUDynInst::isAtomicMin
bool isAtomicMin() const
Definition gpu_dyn_inst.cc:651

gem5::GPUDynInst::isAtomicExch
bool isAtomicExch() const
Definition gpu_dyn_inst.cc:615

gem5::GPUDynInst::numDstRegOperands
int numDstRegOperands()
Definition gpu_dyn_inst.cc:142

gem5::GPUDynInst::isFlatGlobal
bool isFlatGlobal() const
Definition gpu_dyn_inst.cc:430

gem5::GPUDynInst::isBranch
bool isBranch() const
Definition gpu_dyn_inst.cc:334

gem5::GPUDynInst::isF32
bool isF32() const
Definition gpu_dyn_inst.cc:747

gem5::GPUDynInst::isAtomicSub
bool isAtomicSub() const
Definition gpu_dyn_inst.cc:627

gem5::GPUDynInst::simdId
int simdId
Definition gpu_dyn_inst.hh:213

gem5::GPUDynInst::_staticInst
GPUStaticInst * _staticInst
Definition gpu_dyn_inst.hh:519

gem5::GPUDynInst::hasDestinationVgpr
bool hasDestinationVgpr() const
Definition gpu_dyn_inst.cc:242

gem5::GPUDynInst::statusVector
std::vector< int > statusVector
Definition gpu_dyn_inst.hh:493

gem5::GPUDynInst::profileLineAddressTime
void profileLineAddressTime(Addr addr, Tick currentTime, int hopId)
Definition gpu_dyn_inst.cc:1054

gem5::GPUDynInst::isUnconditionalJump
bool isUnconditionalJump() const
Definition gpu_dyn_inst.cc:382

gem5::GPUDynInst::maxSrcVecRegOpSize
int maxSrcVecRegOpSize
Definition gpu_dyn_inst.hh:521

gem5::GPUDynInst::staticInstruction
GPUStaticInst * staticInstruction()
Definition gpu_dyn_inst.hh:241

gem5::GPUDynInst::numSrcScalarRegOperands
int numSrcScalarRegOperands() const
Definition gpu_dyn_inst.cc:186

gem5::GPUDynInst::isOpcode
bool isOpcode(const std::string &opcodeStr) const
Definition gpu_dyn_inst.cc:268

gem5::GPUDynInst::a_data
uint8_t * a_data
Definition gpu_dyn_inst.hh:206

gem5::GPUDynInst::numScalarReqs
int numScalarReqs
Definition gpu_dyn_inst.hh:499

gem5::GPUDynInst::GPUDynInst
GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst, uint64_t instSeqNum)
Definition gpu_dyn_inst.cc:44

gem5::GPUDynInst::isAtomicXor
bool isAtomicXor() const
Definition gpu_dyn_inst.cc:604

gem5::GPUDynInst::isALU
bool isALU() const
accessor methods for the attributes of the underlying GPU static instruction
Definition gpu_dyn_inst.cc:328

gem5::GPUDynInst::isReadOnlySeg
bool isReadOnlySeg() const
Definition gpu_dyn_inst.cc:711

gem5::GPUDynInst::isSystemCoherent
bool isSystemCoherent() const
Definition gpu_dyn_inst.cc:729

gem5::GPUDynInst::isMemRef
bool isMemRef() const
Definition gpu_dyn_inst.cc:418

gem5::GPUDynInst::isAtomicAnd
bool isAtomicAnd() const
Definition gpu_dyn_inst.cc:592

gem5::GPUDynInst::isStore
bool isStore() const
Definition gpu_dyn_inst.cc:448

gem5::GPUDynInst::isDPPInst
bool isDPPInst() const
Definition gpu_dyn_inst.cc:370

gem5::GPUDynInst::isSleep
bool isSleep() const
Definition gpu_dyn_inst.cc:400

gem5::GPUDynInst::wg_id
int wg_id
Definition gpu_dyn_inst.hh:221

gem5::GPUDynInst::numSrcScalarDWords
int numSrcScalarDWords()
Definition gpu_dyn_inst.cc:212

gem5::GPUDynInst::exec_mask
VectorMask exec_mask
Definition gpu_dyn_inst.hh:210

gem5::GPUDynInst::isMemSync
bool isMemSync() const
Definition gpu_dyn_inst.cc:412

gem5::GPUDynInst::needsToken
bool needsToken() const
Definition gpu_dyn_inst.cc:586

gem5::GPUDynInst::writesSCC
bool writesSCC() const
Definition gpu_dyn_inst.cc:490

gem5::GPUDynInst::numDstVecDWords
int numDstVecDWords()
Definition gpu_dyn_inst.cc:180

gem5::GPUDynInst::hasSourceVgpr
bool hasSourceVgpr() const
Definition gpu_dyn_inst.cc:236

gem5::GPUDynInst::numDstVecRegOperands
int numDstVecRegOperands() const
Definition gpu_dyn_inst.cc:154

gem5::GPUDynInst::ignoreExec
bool ignoreExec() const
Definition gpu_dyn_inst.cc:540

gem5::GPUDynInst::hasSourceSgpr
bool hasSourceSgpr() const
Definition gpu_dyn_inst.cc:248

gem5::GPUDynInst::scalar_data
uint8_t * scalar_data
Definition gpu_dyn_inst.hh:204

gem5::GPUDynInst::isReturn
bool isReturn() const
Definition gpu_dyn_inst.cc:376

gem5::GPUDynInst::readsSCC
bool readsSCC() const
Definition gpu_dyn_inst.cc:484

gem5::GPUDynInst::isMAD
bool isMAD() const
Definition gpu_dyn_inst.cc:771

gem5::GPUDynInst::readsFlatScratch
bool readsFlatScratch() const
Definition gpu_dyn_inst.cc:576

gem5::GPUDynInst::initiateAcc
void initiateAcc(GPUDynInstPtr gpuDynInst)
Definition gpu_dyn_inst.cc:305

gem5::GPUDynInst::cu_id
int cu_id
Definition gpu_dyn_inst.hh:219

gem5::GPUDynInst::getNumOperands
int getNumOperands() const
Definition gpu_dyn_inst.cc:230

gem5::GPUDynInst::writesExec
bool writesExec() const
Definition gpu_dyn_inst.cc:534

gem5::GPUDynInst::isSDWAInst
bool isSDWAInst() const
Definition gpu_dyn_inst.cc:364

gem5::GPUDynInst::isWaitcnt
bool isWaitcnt() const
Definition gpu_dyn_inst.cc:394

gem5::GPUDynInst::numSrcVecDWords
int numSrcVecDWords()
Definition gpu_dyn_inst.cc:174

gem5::GPUDynInst::writesMode
bool writesMode() const
Definition gpu_dyn_inst.cc:522

gem5::GPUDynInst::executedAs
enums::StorageClassType executedAs()
Definition gpu_dyn_inst.cc:298

gem5::GPUDynInst::scalarAddr
Addr scalarAddr
Definition gpu_dyn_inst.hh:196

gem5::GPUDynInst::isFlat
bool isFlat() const
Definition gpu_dyn_inst.cc:424

gem5::GPUDynInst::dstVecRegOperands
const std::vector< OperandInfo > & dstVecRegOperands() const
Definition gpu_dyn_inst.cc:118

gem5::GPUDynInst::profileRoundTripTime
void profileRoundTripTime(Tick currentTime, int hopId)
Definition gpu_dyn_inst.cc:1044

gem5::GPUDynInst::isCondBranch
bool isCondBranch() const
Definition gpu_dyn_inst.cc:340

gem5::GPUDynInst::writesExecMask
bool writesExecMask() const
Definition gpu_dyn_inst.cc:546

gem5::GPUDynInst::isPrivateSeg
bool isPrivateSeg() const
Definition gpu_dyn_inst.cc:705

gem5::GPUDynInst::isEndOfKernel
bool isEndOfKernel() const
Definition gpu_dyn_inst.cc:352

gem5::GPUDynInst::srcVecRegOperands
const std::vector< OperandInfo > & srcVecRegOperands() const
Definition gpu_dyn_inst.cc:112

gem5::GPUDynInst::isAtomicInc
bool isAtomicInc() const
Definition gpu_dyn_inst.cc:633

gem5::GPUDynInst::isGloballyCoherent
bool isGloballyCoherent() const
Definition gpu_dyn_inst.cc:723

gem5::GPUDynInst::x_data
uint8_t * x_data
Definition gpu_dyn_inst.hh:208

gem5::GPUDynInst::readsExecMask
bool readsExecMask() const
Definition gpu_dyn_inst.cc:556

gem5::GPUDynInst::isGroupSeg
bool isGroupSeg() const
Definition gpu_dyn_inst.cc:693

gem5::GPUDynInst::wfSlotId
int wfSlotId
Definition gpu_dyn_inst.hh:223

gem5::GPUDynInst::srcLiteral
TheGpuISA::ScalarRegU32 srcLiteral() const
Definition gpu_dyn_inst.cc:997

gem5::GPUDynInst::readsExec
bool readsExec() const
Definition gpu_dyn_inst.cc:528

gem5::GPUDynInst::maxSrcScalarRegOperandSize
int maxSrcScalarRegOperandSize()
Definition gpu_dyn_inst.cc:198

gem5::GPUDynInst::isScalar
bool isScalar() const
Definition gpu_dyn_inst.cc:478

gem5::GPUDynInst::isVector
bool isVector() const
Definition gpu_dyn_inst.cc:472

gem5::GPUDynInst::seqNum
InstSeqNum seqNum() const
Definition gpu_dyn_inst.cc:280

gem5::GPUDynInst::~GPUDynInst
~GPUDynInst()
Definition gpu_dyn_inst.cc:96

gem5::GPUDynInst::maxOperandSize
int maxOperandSize()
Definition gpu_dyn_inst.cc:224

gem5::GPUDynInst::isFMA
bool isFMA() const
Definition gpu_dyn_inst.cc:759

gem5::GPUDynInst::srcScalarRegOperands
const std::vector< OperandInfo > & srcScalarRegOperands() const
Definition gpu_dyn_inst.cc:124

gem5::GPUDynInst::isMFMA
bool isMFMA() const
Definition gpu_dyn_inst.cc:777

gem5::GPUDynInst::isAtomicAdd
bool isAtomicAdd() const
Definition gpu_dyn_inst.cc:621

gem5::GPUDynInst::dstScalarRegOperands
const std::vector< OperandInfo > & dstScalarRegOperands() const
Definition gpu_dyn_inst.cc:130

gem5::GPUDynInst::numSrcVecRegOperands
int numSrcVecRegOperands() const
Definition gpu_dyn_inst.cc:148

gem5::GPUDynInst::kern_id
int kern_id
Definition gpu_dyn_inst.hh:217

gem5::GPUDynInst::numSrcRegOperands
int numSrcRegOperands()
Definition gpu_dyn_inst.cc:136

gem5::GPUDynInst::d_data
uint8_t * d_data
Definition gpu_dyn_inst.hh:202

gem5::GPUDynInst::updateStats
void updateStats()
Definition gpu_dyn_inst.cc:1003

gem5::GPUDynInst::_seqNum
const InstSeqNum _seqNum
Definition gpu_dyn_inst.hh:520

gem5::GPUDynInst::isBarrier
bool isBarrier() const
Definition gpu_dyn_inst.cc:406

gem5::GPUDynInst::isLoad
bool isLoad() const
Definition gpu_dyn_inst.cc:442

gem5::GPUDynInst::addr
std::vector< Addr > addr
Definition gpu_dyn_inst.hh:198

gem5::GPUDynInst::writesFlatScratch
bool writesFlatScratch() const
Definition gpu_dyn_inst.cc:566

gem5::GPUDynInst::readsMode
bool readsMode() const
Definition gpu_dyn_inst.cc:516

gem5::GPUDynInst::execute
void execute(GPUDynInstPtr gpuDynInst)
Definition gpu_dyn_inst.cc:106

gem5::GPUDynInst::isMAC
bool isMAC() const
Definition gpu_dyn_inst.cc:765

gem5::GPUDynInst::isKernArgSeg
bool isKernArgSeg() const
Definition gpu_dyn_inst.cc:699

gem5::GPUDynInst::isArgLoad
bool isArgLoad() const
Definition gpu_dyn_inst.cc:663

gem5::GPUDynInst::maxSrcScalarRegOpSize
int maxSrcScalarRegOpSize
Definition gpu_dyn_inst.hh:522

gem5::GPUDynInst::time
Tick time
Definition gpu_dyn_inst.hh:227

gem5::GPUDynInst::pc
Addr pc()
Definition gpu_dyn_inst.cc:286

gem5::GPUDynInst::isGlobalSeg
bool isGlobalSeg() const
Definition gpu_dyn_inst.cc:687

gem5::GPUDynInst::isArgSeg
bool isArgSeg() const
Definition gpu_dyn_inst.cc:681

gem5::GPUDynInst::isAtomic
bool isAtomic() const
Definition gpu_dyn_inst.cc:454

gem5::GPUDynInst::isAtomicOr
bool isAtomicOr() const
Definition gpu_dyn_inst.cc:598

gem5::GPUDynInst::isAtomicPkAddBF16
bool isAtomicPkAddBF16() const
Definition gpu_dyn_inst.cc:657

gem5::GPUDynInst::maxSrcVecRegOperandSize
int maxSrcVecRegOperandSize()
Definition gpu_dyn_inst.cc:160

gem5::GPUDynInst::isAtomicNoRet
bool isAtomicNoRet() const
Definition gpu_dyn_inst.cc:460

gem5::GPUDynInst::isSpillSeg
bool isSpillSeg() const
Definition gpu_dyn_inst.cc:717

gem5::GPUDynInst::disassemble
const std::string & disassemble() const
Definition gpu_dyn_inst.cc:274

gem5::GPUDynInst::completeAcc
void completeAcc(GPUDynInstPtr gpuDynInst)
Definition gpu_dyn_inst.cc:314

gem5::GPUDynInst::isF64
bool isF64() const
Definition gpu_dyn_inst.cc:753

gem5::GPUDynInst::isAtomicMax
bool isAtomicMax() const
Definition gpu_dyn_inst.cc:645

gem5::GPUExecContext::cu
ComputeUnit * cu
Definition gpu_exec_context.hh:62

gem5::GPUExecContext::computeUnit
ComputeUnit * computeUnit()
Definition gpu_exec_context.cc:44

gem5::GPUExecContext::wavefront
Wavefront * wavefront()
Definition gpu_exec_context.cc:50

gem5::GPUExecContext::GPUExecContext
GPUExecContext(ComputeUnit *_cu, Wavefront *_wf)
Definition gpu_exec_context.cc:38

gem5::GPUStaticInst
Definition gpu_static_inst.hh:62

gem5::GPUStaticInst::executed_as
enums::StorageClassType executed_as
Definition gpu_static_inst.hh:249

gem5::Shader::ldsApe
const ApertureRegister & ldsApe() const
Definition shader.hh:146

gem5::Wavefront
Definition wavefront.hh:62

gem5::Wavefront::maxSgprs
uint32_t maxSgprs
Definition wavefront.hh:136

gem5::Wavefront::flatGmUnitId
int flatGmUnitId
Definition wavefront.hh:107

gem5::Wavefront::flatLmUnitId
int flatLmUnitId
Definition wavefront.hh:106

gem5::Wavefront::pc
Addr pc() const
Definition wavefront.cc:1569

gem5::Wavefront::rdLmReqsInPipe
int rdLmReqsInPipe
Definition wavefront.hh:190

gem5::Wavefront::wgId
uint32_t wgId
Definition wavefront.hh:166

gem5::Wavefront::simdId
const int simdId
Definition wavefront.hh:102

gem5::Wavefront::kernId
int kernId
Definition wavefront.hh:100

gem5::Wavefront::lgkmIssued
std::set< InstSeqNum > lgkmIssued
Definition wavefront.hh:292

gem5::Wavefront::computeUnit
ComputeUnit * computeUnit
Definition wavefront.hh:109

gem5::Wavefront::execUnitId
int execUnitId
Definition wavefront.hh:105

gem5::Wavefront::decVMemInstsIssued
void decVMemInstsIssued()
Definition wavefront.cc:1452

gem5::Wavefront::wfSlotId
const int wfSlotId
Definition wavefront.hh:99

gem5::Wavefront::decLGKMInstsIssued
void decLGKMInstsIssued()
Definition wavefront.cc:1464

gem5::Wavefront::wrLmReqsInPipe
int wrLmReqsInPipe
Definition wavefront.hh:192

gem5::Wavefront::archFlatScratchAddr
Addr archFlatScratchAddr
Definition wavefront.hh:211

gem5::Wavefront::vmemIssued
std::set< InstSeqNum > vmemIssued
Definition wavefront.hh:291

gem5::Wavefront::wrGmReqsInPipe
int wrGmReqsInPipe
Definition wavefront.hh:193

gem5::Wavefront::rdGmReqsInPipe
int rdGmReqsInPipe
Definition wavefront.hh:191

gem5::Wavefront::wfDynId
uint64_t wfDynId
Definition wavefront.hh:235

std::pair
STL pair class.
Definition stl.hh:58

std::vector
STL vector class.
Definition stl.hh:37

gpu_static_inst.hh

gpu_dyn_inst.hh

gem5::bits
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
Definition bitfield.hh:79

panic
#define panic(...)
This implements a cprintf based panic() function.
Definition logging.hh:220

fatal
#define fatal(...)
This implements a cprintf based fatal() function.
Definition logging.hh:232

gem5::ArmISA::mask
Bitfield< 3, 0 > mask
Definition pcstate.hh:63

gem5::ArmISA::i
Bitfield< 7 > i
Definition misc_types.hh:67

gem5::ArmISA::offset
Bitfield< 23, 0 > offset
Definition types.hh:144

gem5
Copyright (c) 2024 Arm Limited All rights reserved.
Definition binary32.hh:36

gem5::GPUDynInstPtr
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition misc.hh:49

gem5::Addr
uint64_t Addr
Address type. This will probably be moved somewhere else in the near future.
Definition types.hh:147

gem5::Tick
uint64_t Tick
Tick count type.
Definition types.hh:58

gem5::VectorMask
std::bitset< std::numeric_limits< unsigned long long >::digits > VectorMask
Definition misc.hh:48

gem5::InstSeqNum
uint64_t InstSeqNum
Definition inst_seq.hh:40

scalar_register_file.hh

shader.hh

gem5::ApertureRegister::base
Addr base
Definition shader.hh:76

wavefront.hh