#include "debug/GPUAgentDisp.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUShader.hh"
#include "debug/GPUWgLatency.hh"
// Shader constructor (excerpt): the member initializer list pulls most of the
// configuration from the Params object, then the body registers this shader
// with the command processor and the dispatcher.
    timingSim(p.timing), hsail_mode(SIMT),
    impl_kern_launch_acq(p.impl_kern_launch_acq),
    impl_kern_end_rel(p.impl_kern_end_rel),
    trace_vgpr_all(1), n_cu((p.CUs).size()), n_wf(p.n_wf),
    n_cu_per_sqc(p.cu_per_sqc),
    globalMemSize(p.globalmem),
    nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc),
    _dispatcher(*p.dispatcher), systemHub(p.system_hub),
    max_valu_insts(p.max_valu_insts), total_valu_insts(0),
    progressInterval(p.progress_interval),
    stats(this, p.CUs[0]->wfSize())
{
    gpuCmdProc.setShader(this);
    _dispatcher.setShader(this);
    // Aperture setup: each aperture's limit keeps the high-order bits of its
    // base and saturates the low-order offset bits.
    _gpuVmApe.base = ((Addr)1 << 61) + 0x1000000000000L;
    _gpuVmApe.limit = (_gpuVmApe.base & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;

    _ldsApe.base = 0x1000000000000;
    _ldsApe.limit = (_ldsApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;

    _scratchApe.base = 0x2000000000000;
    _scratchApe.limit = (_scratchApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;

    shHiddenPrivateBaseVmid = 0;
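// The masking above makes the aperture sizes explicit: 40 offset bits for the
// GPUVM aperture (1 TiB) and 32 offset bits for the LDS and scratch apertures
// (4 GiB each). A minimal standalone sketch of that arithmetic follows; the
// variable names are illustrative and not taken from the source.
#include <cassert>
#include <cstdint>

int main()
{
    uint64_t gpuvm_base = (uint64_t{1} << 61) + 0x1000000000000ULL;
    uint64_t gpuvm_limit =
        (gpuvm_base & 0xFFFFFF0000000000ULL) | 0xFFFFFFFFFFULL;
    // 40 low-order offset bits -> a 1 TiB aperture.
    assert(gpuvm_limit - (gpuvm_base & 0xFFFFFF0000000000ULL) + 1 ==
           (uint64_t{1} << 40));

    uint64_t lds_base = 0x1000000000000ULL;
    uint64_t lds_limit = (lds_base & 0xFFFFFFFF00000000ULL) | 0xFFFFFFFFULL;
    // 32 low-order offset bits -> a 4 GiB aperture.
    assert(lds_limit - (lds_base & 0xFFFFFFFF00000000ULL) + 1 ==
           (uint64_t{1} << 32));
    return 0;
}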
    panic_if(n_wf <= 0, "Must have at least 1 WF Slot per SIMD");

    for (int i = 0; i < n_cu; ++i) {
        cuList[i] = p.CUs[i];
        assert(i == cuList[i]->cu_id);
        cuList[i]->shader = this;
        cuList[i]->idleCUTimeout = p.idlecu_timeout;
    }
    // Shader::mmap (excerpt): carve 'length' bytes out of the process' mmap
    // region, in whichever direction that region grows.
    if (proc->mmapGrowsDown()) {
        DPRINTF(GPUShader, "GROWS DOWN");
        start = mem_state->getMmapEnd() - length;
        mem_state->setMmapEnd(start);
    } else {
        DPRINTF(GPUShader, "GROWS UP");
        start = mem_state->getMmapEnd();
        mem_state->setMmapEnd(start + length);

        // Make sure the region does not run into the stack, which grows down
        // from its base.
        assert(mem_state->getStackBase() - mem_state->getMaxStackSize() >
               mem_state->getMmapEnd());
    }

    DPRINTF(GPUShader, "Shader::mmap start= %#x, %#x\n", start, length);
    for (int j = 0; j < n_cu; ++j)
        // ...

    // Scheduled-add handling (excerpt): apply any pending scheduled adds, then
    // either schedule the next shader wakeup or let the shader go to sleep.
    for (int i = 0; i < sa_n; ++i) {
        // ...
    }

    Tick shader_wakeup = *std::max_element(sa_when.begin(), sa_when.end());
    DPRINTF(GPUDisp, "Scheduling shader wakeup at %lu\n", shader_wakeup);
    // ...
    DPRINTF(GPUDisp, "sa_when empty, shader going to sleep!\n");
    // Invalidate preparation (excerpt): for each CU, build the requests used
    // to invalidate the TCC and the SQC, then reset the CU's register pool.
    for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
        // ...
        auto tcc_req = std::make_shared<Request>(0, 0, 0,
                                                 cuList[i_cu]->requestorId(),
                                                 /* ... */);
        // ...
        auto sqc_req = std::make_shared<Request>(0, 0, 0,
                                                 cuList[i_cu]->requestorId(),
                                                 /* ... */);
        // ...
        cuList[i_cu]->resetRegisterPool();
    }

    // Flush preparation (excerpt): the kernel must have no outstanding
    // writebacks when the flush is handed to the CU.
    int kernId = gpuDynInst->kern_id;
    // ...
    assert(_dispatcher.getOutstandingWbs(kernId) == 0);
    // ...
    cuList[i_cu]->doFlush(gpuDynInst);
    // Workgroup dispatch (excerpt): probe the CUs for dispatch resources and
    // hand the workgroup to the first CU that can take it.
    bool scheduledSomething = false;
    // ...
    while (cuCount < n_cu) {
        // ...
        int num_wfs_in_wg = 0;
        bool can_disp = cuList[curCu]->hasDispResources(task, num_wfs_in_wg);
        // ...
            scheduledSomething = true;
            DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n",
                    /* ... */);
            DPRINTF(GPUAgentDisp, "Dispatching a workgroup to CU %d: WG %d\n",
                    /* ... */);
            DPRINTF(GPUWgLatency, "WG Begin cycle:%d wg:%d cu:%d\n",
                    /* ... */);
            // ...
            panic_if(/* ... */, "Invalid activeCu size\n");
            cuList[curCu]->dispWorkgroup(task, num_wfs_in_wg);
    // ...
    DPRINTF(GPUWgLatency, "Shader Dispatched %d Wgs\n", disp_count);

    return scheduledSomething;
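// The loop above probes one CU per iteration (cuCount bounds the number of
// probes), and the starting point rotates via nextSchedCu, which is
// initialized in the constructor above, so successive workgroups tend to be
// spread round-robin across compute units. A standalone sketch of such a
// rotation follows; the names and the has_resources table are illustrative,
// not taken from the source.
#include <cstdio>
#include <vector>

int main()
{
    const int n_cu = 4;
    int next_sched_cu = 2;                       // persists between dispatches
    std::vector<bool> has_resources = {true, true, false, true};

    // Probe every CU at most once, starting at next_sched_cu.
    for (int cu_count = 0; cu_count < n_cu; ++cu_count) {
        int cur_cu = (next_sched_cu + cu_count) % n_cu;
        if (has_resources[cur_cu]) {
            std::printf("dispatching a workgroup to CU %d\n", cur_cu);
            next_sched_cu = (cur_cu + 1) % n_cu;  // rotate the starting point
            break;
        }
    }
    return 0;
}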
// Functional access (excerpt): an access that straddles a cache-line boundary
// is split on that boundary and sent as two functional packets.
void
Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
                           bool suppress_func_errors, int cu_id)
{
    int block_size = cuList.at(cu_id)->cacheLineSize();
    unsigned size = req->getSize();
    // ...
        fatal("unexpected MemCmd\n");

    tmp_addr = req->getVaddr();
    Addr split_addr = roundDown(tmp_addr + size - 1, block_size);

    assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);

    if (split_addr > tmp_addr) {
        // ...
        req->splitOnVaddr(split_addr, req1, req2);
        // ...
        if (suppress_func_errors) {
            // ...
        }
        cuList[0]->memPort[0].sendFunctional(new_pkt1);
        cuList[0]->memPort[0].sendFunctional(new_pkt2);
    } else {
        // ...
        if (suppress_func_errors) {
            // ...
        }
        cuList[0]->memPort[0].sendFunctional(new_pkt);
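// The split check above rounds the address of the access' last byte down to a
// cache-line boundary; if that boundary lies above the start address, the
// access straddles two lines and is issued as two packets. A standalone sketch
// of the address math follows; round_down mirrors the roundDown helper used
// above, and the concrete addresses are illustrative.
#include <cassert>
#include <cstdint>

static constexpr uint64_t round_down(uint64_t val, uint64_t align)
{
    // Same semantics as the simulator's roundDown helper.
    return val - (val % align);
}

int main()
{
    const uint64_t block_size = 64;

    // 8 bytes at 0x1003c: the last byte (0x10043) rounds down to 0x10040,
    // which is above the start address, so the access spans two lines.
    uint64_t addr = 0x1003c, size = 8;
    uint64_t split_addr = round_down(addr + size - 1, block_size);
    assert(split_addr > addr);

    // 8 bytes at 0x10040 stay within one line: split_addr equals the start.
    addr = 0x10040;
    split_addr = round_down(addr + size - 1, block_size);
    assert(split_addr == addr);
    return 0;
}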
    // ScheduleAdd (excerpt): a new scheduled add either schedules the shader
    // wakeup event or reuses one that is already pending.
    DPRINTF(GPUDisp, "New scheduled add; scheduling shader wakeup at "
            /* ... */);
    // ...
    DPRINTF(GPUDisp, "New scheduled add; wakeup already scheduled at "
            /* ... */);
void
Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                  MemCmd cmd, bool suppress_func_errors)
{
    uint8_t *data_buf = (uint8_t*)ptr;

    // Walk the range in cache-line-sized chunks; each chunk becomes its own
    // functional request.
    for (ChunkGenerator gen(/* ... */); !gen.done(); gen.next()) {
        // ...
            gen.addr(), gen.size(), 0,
            cuList[0]->requestorId(), 0, 0, nullptr);
        // ...
        data_buf += gen.size();
    }
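// The chunking above keeps every functional request within a single cache
// line; the first and last chunks may be shorter than a full line. A
// standalone sketch of that chunking follows; the names and sizes are
// illustrative, not taken from the source.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main()
{
    const uint64_t line = 64;      // assumed cache-line size
    uint64_t addr = 0x1003c;       // start of the buffer
    uint64_t remaining = 100;      // bytes left to access

    while (remaining > 0) {
        // The distance to the next line boundary bounds the chunk size.
        uint64_t to_boundary = line - (addr % line);
        uint64_t chunk = std::min(remaining, to_boundary);
        std::printf("chunk at 0x%llx, %llu bytes\n",
                    (unsigned long long)addr, (unsigned long long)chunk);
        addr += chunk;
        remaining -= chunk;
    }
    return 0;
}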
    // Shader::ReadMem and Shader::WriteMem (excerpts): both take a trailing
    // suppress_func_errors flag and forward it to AccessMem.
                    bool suppress_func_errors)
        // ...
                  suppress_func_errors);

                    bool suppress_func_errors)
        // ...
                  suppress_func_errors);
    cuList[cu_id]->tlbPort[0].sendFunctional(pkt);
    // Store access-time sample (excerpt).
    stats.storeLatencyDist.sample(accessTime);
    stats.allLatencyDist.sample(accessTime);

    // Load access-time sample (excerpt).
    stats.loadLatencyDist.sample(accessTime);
    stats.allLatencyDist.sample(accessTime);
    // Per-instruction round-trip sample (excerpt): the five timestamps bracket
    // the instrumented stages of a vector memory instruction, and adjacent
    // differences are sampled into the latency distributions.
    Tick t1 = roundTripTime[0];
    Tick t2 = roundTripTime[1];
    Tick t3 = roundTripTime[2];
    Tick t4 = roundTripTime[3];
    Tick t5 = roundTripTime[4];

    stats.initToCoalesceLatency.sample(t2 - t1);
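// The stat descriptions registered later in this listing name the stages those
// timestamps delimit: initiateAcc, coalescer issue, coalescer hit callback, GM
// pipe enqueue, and departure from the GM pipe's ordered response buffer. A
// standalone sketch of turning such a timestamp vector into per-stage deltas
// follows; the concrete tick values are illustrative.
#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
    using Tick = uint64_t;
    // Hypothetical timestamps for the five instrumented points, in ticks.
    std::vector<Tick> round_trip_time = {1000, 1400, 5200, 5300, 6100};
    assert(round_trip_time.size() == 5);

    // Adjacent differences are the per-stage latencies that get sampled,
    // e.g. round_trip_time[1] - round_trip_time[0] feeds the
    // init-to-coalesce distribution.
    std::vector<Tick> stage_latency;
    for (size_t i = 1; i < round_trip_time.size(); ++i)
        stage_latency.push_back(round_trip_time[i] - round_trip_time[i - 1]);

    assert(stage_latency.size() == 4);
    return 0;
}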
    // Per-line round-trip sample (excerpt): count the coalesced lines, compute
    // per-line network times, then sample them in sorted order.
    stats.coalsrLineAddresses.sample(lineMap.size());
    // ...
    for (auto& it : lineMap) {
        // ...
        if (timeVec.size() == 2) {
            netTimes.push_back(timeVec[1] - timeVec[0]);
        // ...
    }

    std::sort(netTimes.begin(), netTimes.end());
    // ...
    for (auto& time : netTimes) {
        stats.cacheBlockRoundTrip[idx].sample(time);
544 "Invalid activeCu size\n");
567 gpuCmdProc.submitDispatchPkt(std::get<0>(dispatch),
568 std::get<1>(dispatch),
569 std::get<2>(dispatch));
580 std::make_tuple(raw_pkt, queue_id, host_pkt_addr));
604 "Ticks from vmem inst initiateAcc to coalescer issue"),
606 "Ticks from coalescer issue to coalescer hit callback"),
608 "Ticks from coalescer hit callback to GM pipe enqueue"),
610 "Ticks queued in GM pipes ordered response buffer"),
612 "Number of cache lines for coalesced request"),
614 "Total ticks that any CU attached to this shader is active"),
616 "vector instruction source operand distribution"),
618 "vector instruction destination operand distribution")
621 .init(0, 1600000-1, 10000)
625 .init(0, 1600000-1, 10000)
629 .init(0, 1600000-1, 10000)
633 .init(0, 1600000-1, 10000)
637 .init(0, 1600000-1, 10000)
641 .init(0, 1600000-1, 10000)
645 .init(0, 1600000-1, 10000)
656 for (
int idx = 0; idx < wf_size; ++idx) {
657 std::stringstream namestr;
658 ccprintf(namestr,
"%s.cacheBlockRoundTrip%d",
661 .init(0, 1600000-1, 10000)
663 .desc(
"Coalsr-to-coalsr time for the Nth cache block in an inst")