gem5 [DEVELOP-FOR-25.0]
gpu_command_processor.cc
1/*
2 * Copyright (c) 2018 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include "gpu-compute/gpu_command_processor.hh"
33
34#include <cassert>
35
38#include "debug/GPUCommandProc.hh"
39#include "debug/GPUDisp.hh"
40#include "debug/GPUInitAbi.hh"
41#include "debug/GPUKernelInfo.hh"
45#include "gpu-compute/shader.hh"
46#include "mem/abstract_mem.hh"
47#include "mem/packet_access.hh"
50#include "params/GPUCommandProcessor.hh"
51#include "sim/full_system.hh"
52#include "sim/process.hh"
53#include "sim/proxy_ptr.hh"
54#include "sim/sim_exit.hh"
56
57namespace gem5
58{
59
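// Overview: the GPUCommandProcessor (CP) accepts commands from the HSA
// packet processor in the form of AQL packets, fetches and sanity checks
// the kernel descriptor (AMDKernelCode), sets up the HSA ABI state for the
// dispatch, and hands the resulting HSAQueueEntry to the GPUDispatcher. It
// also implements the completion-signal protocol used to notify the host
// when a kernel finishes.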
62 walker(p.walker), hsaPP(p.hsapp),
64{
65 assert(hsaPP);
66 hsaPP->setDevice(this);
67 dispatcher.setCommandProcessor(this);
68}
69
70HSAPacketProcessor&
71GPUCommandProcessor::hsaPacketProc()
72{
73 return *hsaPP;
74}
75
79RequestorID
80GPUCommandProcessor::vramRequestorId()
81{
82 return gpuDevice->vramRequestorId();
83}
84
85TranslationGenPtr
86GPUCommandProcessor::translate(Addr vaddr, Addr size)
87{
88 if (!FullSystem) {
89 // Grab the process and try to translate the virtual address with it;
90 // with new extensions, it will likely be wrong to just arbitrarily
91 // grab context zero.
92 auto process = sys->threads[0]->getProcessPtr();
93
94 return process->pTable->translateRange(vaddr, size);
95 }
96
97 // In full system use the page tables setup by the kernel driver rather
98 // than the CPU page tables.
99 return TranslationGenPtr(
101 1 /* vmid */, vaddr, size));
102}
103
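// A rough sketch of how callers are expected to consume the generator
// returned by translate() above (illustrative only; the Range field names
// are assumptions based on gem5's TranslationGen interface, not code from
// this file):
//
//     TranslationGenPtr gen = cp.translate(vaddr, size);
//     for (const auto &range : *gen) {
//         // range.vaddr, range.paddr and range.size describe one
//         // contiguously translated piece; range.fault is set if the
//         // walk failed for that piece.
//     }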
104void
105GPUCommandProcessor::performTimingRead(PacketPtr pkt, int dispType)
106{
107 // Use the shader to access the CUs and issue the read request through
108 // the SQC port. The kernel dispatch is submitted from the SQC port's
109 // timing-response handler once this read completes. The timing read is
110 // scheduled for the current tick.
111 ComputeUnit *cu = shader()->cuList[0];
112 auto *sender_state = new ComputeUnit::SQCPort::SenderState(
113 cu->wfList[0][0], true);
114
115 pkt->senderState = sender_state;
116 sender_state->dispatchType = dispType;
117 ComputeUnit::SQCPort &sqc_port = cu->sqcPort;
118
119 if (!sqc_port.sendTimingReq(pkt)) {
120 sqc_port.retries.push_back(
121 std::make_pair(pkt, cu->wfList[0][0])
122 );
123 }
124}
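// The read above is issued on CU0's SQC (instruction fetch) port so that
// fetching the kernel object or preload arguments consumes simulated time.
// If the port is blocked, the packet is parked on sqc_port.retries and is
// presumably resent when the SQC port receives a retry from the memory
// system; the dispatch then resumes from the port's timing-response path.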
125
126void
128{
129 struct KernelDispatchData dispatchData = kernelDispatchList.front();
130 kernelDispatchList.pop_front();
131 delete dispatchData.readPkt;
132
133 // Only one of the following can be outstanding at any time from a single
134 // CP. Figure out what initiated the timing read and call the appropriate function.
135 if (kernelDispatchList.size() == 0) {
136 switch (dispType) {
137 case ComputeUnit::SQCPort::SenderState::DISPATCH_KERNEL_OBJECT:
138 dispatchKernelObject(dispatchData.akc, dispatchData.raw_pkt,
139 dispatchData.queue_id, dispatchData.host_pkt_addr);
140 break;
141 case ComputeUnit::SQCPort::SenderState::DISPATCH_PRELOAD_ARG:
142 initPreload(dispatchData.akc, dispatchData.task);
143 break;
144 }
145 }
146}
147
165void
166GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
167 Addr host_pkt_addr)
168{
169 _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t *)raw_pkt;
170 // The kernel object should be aligned to a 64B boundary, but not
171 // necessarily a cache line boundary.
172 unsigned akc_alignment_granularity = 64;
173 assert(!(disp_pkt->kernel_object & (akc_alignment_granularity - 1)));
174
182 if (shader()->getNumOutstandingInvL2s() > 0) {
183 DPRINTF(GPUCommandProc,
184 "Deferring kernel launch due to outstanding L2 invalidates\n");
185 shader()->addDeferredDispatch(raw_pkt, queue_id, host_pkt_addr);
186
187 return;
188 }
189
194 AMDKernelCode *akc = new AMDKernelCode;
195
210 if (!FullSystem) {
215 auto *tc = sys->threads[0];
216 SETranslatingPortProxy virt_proxy(tc);
217
218 DPRINTF(GPUCommandProc, "reading kernel_object using proxy\n");
219 virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)akc,
220 sizeof(AMDKernelCode));
221
222 dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr);
223 } else {
230 bool is_system_page = true;
231 Addr phys_addr = disp_pkt->kernel_object;
232
239 int vmid = 1;
240 unsigned tmp_bytes;
241 walker->startFunctional(gpuDevice->getVM().getPageTableBase(vmid),
242 phys_addr, tmp_bytes, BaseMMU::Mode::Read,
243 is_system_page);
244
245 DPRINTF(GPUCommandProc, "kernel_object vaddr %#lx paddr %#lx size %d"
246 " s:%d\n", disp_pkt->kernel_object, phys_addr,
247 sizeof(AMDKernelCode), is_system_page);
248
253 if (is_system_page) {
254 DPRINTF(GPUCommandProc,
255 "sending system DMA read for kernel_object\n");
256
257 auto dma_callback = new DmaVirtCallback<uint32_t>(
258 [=](const uint32_t&) {
259 dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr);
260 });
261
262 dmaReadVirt(disp_pkt->kernel_object, sizeof(AMDKernelCode),
263 dma_callback, (void *)akc);
264 } else {
265 DPRINTF(GPUCommandProc,
266 "kernel_object in device, using device mem\n");
267
268 // Read from GPU memory manager one cache line at a time to prevent
269 // rare cases where the AKC spans two memory pages.
270 ChunkGenerator gen(disp_pkt->kernel_object, sizeof(AMDKernelCode),
271 akc_alignment_granularity);
272 for (; !gen.done(); gen.next()) {
273 Addr chunk_addr = gen.addr();
274 int vmid = 1;
275 unsigned dummy;
276 walker->startFunctional(
277 gpuDevice->getVM().getPageTableBase(vmid), chunk_addr,
278 dummy, BaseMMU::Mode::Read, is_system_page);
279
280 Request::Flags flags = Request::PHYSICAL;
281 RequestPtr request = std::make_shared<Request>(chunk_addr,
282 akc_alignment_granularity, flags,
283 walker->getDevRequestor());
284 PacketPtr readPkt = new Packet(request, MemCmd::ReadReq);
285 readPkt->dataStatic((uint8_t *)akc + gen.complete());
286 // If the request spans two device memories, the device memory
287 // returned will be null.
288 assert(system()->getDeviceMemory(readPkt) != nullptr);
289 struct KernelDispatchData dispatchData;
290 dispatchData.akc = akc;
291 dispatchData.raw_pkt = raw_pkt;
292 dispatchData.queue_id = queue_id;
293 dispatchData.host_pkt_addr = host_pkt_addr;
294 dispatchData.readPkt = readPkt;
295 kernelDispatchList.push_back(dispatchData);
296 performTimingRead(readPkt,
297 ComputeUnit::SQCPort::SenderState::DISPATCH_KERNEL_OBJECT);
298 }
299 }
300 }
301}
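// Summary of the three ways the AMDKernelCode (AKC) descriptor is fetched
// above: (1) in SE mode it is read directly through a translating port
// proxy; (2) in FS mode, if the GPU page table says the object lives in
// system memory, it is fetched with a single virtual DMA read; (3) if it
// lives in device memory, it is read 64 bytes at a time as timing requests
// so that an AKC spanning a page boundary is still handled correctly.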
302
303void
304GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
305 uint32_t queue_id, Addr host_pkt_addr)
306{
307 _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t *)raw_pkt;
308
315 if (akc->kernarg_preload_spec_length != 0) {
317 }
318
319 sanityCheckAKC(akc);
320
321 DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
322 "kernel object\n", akc->kernel_code_entry_byte_offset);
323
324 Addr machine_code_addr = (Addr)disp_pkt->kernel_object
325 + akc->kernel_code_entry_byte_offset;
326
327 DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n",
328 machine_code_addr);
329
330 std::string kernel_name;
331
341 bool is_blit_kernel;
342 if (!disp_pkt->completion_signal) {
343 kernel_name = "Some kernel";
344 is_blit_kernel = false;
345 } else {
346 kernel_name = "Blit kernel";
347 is_blit_kernel = true;
348 }
349
350 DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());
351
352 GfxVersion gfxVersion = FullSystem ? gpuDevice->getGfxVersion()
353 : driver()->getGfxVersion();
354 HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id,
355 dynamic_task_id, raw_pkt, akc, host_pkt_addr, machine_code_addr,
356 gfxVersion);
357
358 // The driver expects the start time to be in ns
359 Tick start_ts = curTick() / sim_clock::as_int::ns;
360 dispatchStartTime.insert({disp_pkt->completion_signal, start_ts});
361
362 // Potentially skip a non-blit kernel
363 if (!is_blit_kernel && (non_blit_kernel_id < target_non_blit_kernel_id)) {
364 DPRINTF(GPUCommandProc, "Skipping non-blit kernel %i (Task ID: %i)\n",
365 non_blit_kernel_id, dynamic_task_id);
366
367 // Notify the HSA PP that this kernel is complete
368 hsaPacketProc().finishPkt(task->dispPktPtr(), task->queueId());
369 if (task->completionSignal()) {
370 DPRINTF(GPUDisp, "HSA AQL Kernel Complete with completion "
371 "signal! Addr: %d\n", task->completionSignal());
372
374 } else {
375 DPRINTF(GPUDisp, "HSA AQL Kernel Complete! No completion "
376 "signal\n");
377 }
378
381
382 delete akc;
383
384 // Notify the run script that a kernel has been skipped
385 exitSimLoop("Skipping GPU Kernel");
386
387 return;
388 }
389
390 DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
391 "grid size (%dx%dx%d) kernarg addr: %#x, completion "
392 "signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x,
393 disp_pkt->workgroup_size_y, disp_pkt->workgroup_size_z,
394 disp_pkt->grid_size_x, disp_pkt->grid_size_y,
395 disp_pkt->grid_size_z, disp_pkt->kernarg_address,
396 disp_pkt->completion_signal);
397
398 DPRINTF(GPUCommandProc, "Extracted code object: %s (num vector regs: %d, "
399 "num scalar regs: %d, code addr: %#x, kernarg size: %d, "
400 "LDS size: %d)\n", kernel_name, task->numVectorRegs(),
401 task->numScalarRegs(), task->codeAddr(), 0, 0);
402
403 if (akc->kernarg_preload_spec_length == 0) {
404 initABI(task);
405
406 delete akc;
407 } else {
408 readPreload(akc, task);
409 }
410
412 if (!is_blit_kernel) ++non_blit_kernel_id;
413}
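// Kernel skipping: non-blit kernels are counted as they arrive
// (non_blit_kernel_id) and, while that count is below
// target_non_blit_kernel_id, the kernel is not dispatched. Its HSA packet
// is finished immediately, the completion-signal case is handled as on a
// normal completion, and exitSimLoop("Skipping GPU Kernel") returns control
// to the run script, which can use this to fast-forward to a kernel of
// interest.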
414
415void
416GPUCommandProcessor::sendCompletionSignal(Addr signal_handle)
417{
418 // Originally the completion signal was read functionally and written
419 // with a timing DMA. This can cause issues in FullSystem mode and
420 // cause translation failures. Therefore, in FullSystem mode everything
421 // is done in timing mode.
422
423 if (!FullSystem) {
430 uint64_t signal_value = functionalReadHsaSignal(signal_handle);
431
432 updateHsaSignal(signal_handle, signal_value - 1);
433 } else {
434 // The semantics of the HSA signal is to decrement the current
435 // signal value by one. Do this asynchronously via DMAs and
436 // callbacks as we can safely continue with this function
437 // while waiting for the next packet from the host.
438 updateHsaSignalAsync(signal_handle, -1);
439 }
440}
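// In FS mode the decrement is carried out entirely with DMAs. The callback
// chain rooted here is, roughly:
//   updateHsaSignalAsync  -> read the signal's mailbox pointer
//   updateHsaMailboxData  -> interruptible signal: read the event ID;
//                            otherwise write start/end timestamps
//   updateHsaEventData    -> write the event value back to the mailbox and
//                            write the timestamps
//   updateHsaEventTs      -> read the current signal value
//   updateHsaSignalData   -> write back value + diff (diff is -1 here)
//   updateHsaSignalDone   -> free the temporary buffer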
441
442void
443GPUCommandProcessor::updateHsaSignalAsync(Addr signal_handle, int64_t diff)
444{
445 Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);
446 uint64_t *mailboxValue = new uint64_t;
447 auto cb2 = new DmaVirtCallback<uint64_t>(
448 [ = ] (const uint64_t &)
449 { updateHsaMailboxData(signal_handle, mailboxValue); });
450 dmaReadVirt(mailbox_addr, sizeof(uint64_t), cb2, (void *)mailboxValue);
451 DPRINTF(GPUCommandProc, "updateHsaSignalAsync reading mailbox addr %lx\n",
452 mailbox_addr);
453}
454
455void
456GPUCommandProcessor::updateHsaMailboxData(Addr signal_handle,
457 uint64_t *mailbox_value)
458{
459 Addr event_addr = getHsaSignalEventAddr(signal_handle);
460
461 DPRINTF(GPUCommandProc, "updateHsaMailboxData read %ld\n", *mailbox_value);
462 if (*mailbox_value != 0) {
463 // This is an interruptible signal. Now, read the
464 // event ID and directly communicate with the driver
465 // about that event notification.
466 auto cb = new DmaVirtCallback<uint64_t>(
467 [ = ] (const uint64_t &)
468 { updateHsaEventData(signal_handle, mailbox_value); });
469 dmaReadVirt(event_addr, sizeof(uint64_t), cb, (void *)mailbox_value);
470 } else {
471 delete mailbox_value;
472
473 Addr ts_addr = signal_handle + offsetof(amd_signal_t, start_ts);
474
475 amd_event_t *event_ts = new amd_event_t;
476 event_ts->start_ts = dispatchStartTime[signal_handle];
477 event_ts->end_ts = curTick() / sim_clock::as_int::ns;
478 auto cb = new DmaVirtCallback<uint64_t>(
479 [ = ] (const uint64_t &)
480 { updateHsaEventTs(signal_handle, event_ts); });
481 dmaWriteVirt(ts_addr, sizeof(amd_event_t), cb, (void *)event_ts);
482 DPRINTF(GPUCommandProc, "updateHsaMailboxData reading timestamp addr "
483 "%lx\n", ts_addr);
484
485 dispatchStartTime.erase(signal_handle);
486 }
487}
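// A non-zero mailbox word marks the signal as interruptible: the event ID
// is read and the driver is notified via the mailbox write issued from
// updateHsaEventData(). For a default (non-interruptible) signal the
// mailbox value is discarded and only the amd_event_t start/end timestamps,
// taken from dispatchStartTime and curTick(), are written back before the
// signal value itself is decremented.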
488
489void
490GPUCommandProcessor::updateHsaEventData(Addr signal_handle,
491 uint64_t *event_value)
492{
493 Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);
494
495 DPRINTF(GPUCommandProc, "updateHsaEventData read %ld\n", *event_value);
496 // Write *event_value to the mailbox to clear the event
497 auto cb = new DmaVirtCallback<uint64_t>(
498 [ = ] (const uint64_t &)
499 { updateHsaSignalDone(event_value); }, *event_value);
500 dmaWriteVirt(mailbox_addr, sizeof(uint64_t), cb, &cb->dmaBuffer, 0);
501
502 Addr ts_addr = signal_handle + offsetof(amd_signal_t, start_ts);
503
504 amd_event_t *event_ts = new amd_event_t;
505 event_ts->start_ts = dispatchStartTime[signal_handle];
506 event_ts->end_ts = curTick() / sim_clock::as_int::ns;
507 auto cb2 = new DmaVirtCallback<uint64_t>(
508 [ = ] (const uint64_t &)
509 { updateHsaEventTs(signal_handle, event_ts); });
510 dmaWriteVirt(ts_addr, sizeof(amd_event_t), cb2, (void *)event_ts);
511 DPRINTF(GPUCommandProc, "updateHsaEventData reading timestamp addr %lx\n",
512 ts_addr);
513
514 dispatchStartTime.erase(signal_handle);
515}
516
517void
518GPUCommandProcessor::updateHsaEventTs(Addr signal_handle,
519 amd_event_t *ts)
520{
521 delete ts;
522
523 Addr value_addr = getHsaSignalValueAddr(signal_handle);
524 int64_t diff = -1;
525
526 uint64_t *signalValue = new uint64_t;
527 auto cb = new DmaVirtCallback<uint64_t>(
528 [ = ] (const uint64_t &)
529 { updateHsaSignalData(value_addr, diff, signalValue); });
530 dmaReadVirt(value_addr, sizeof(uint64_t), cb, (void *)signalValue);
531 DPRINTF(GPUCommandProc, "updateHsaSignalAsync reading value addr %lx\n",
532 value_addr);
533}
534
535void
536GPUCommandProcessor::updateHsaSignalData(Addr value_addr, int64_t diff,
537 uint64_t *prev_value)
538{
539 // Reuse the value allocated for the read
540 DPRINTF(GPUCommandProc, "updateHsaSignalData read %ld, writing %ld\n",
541 *prev_value, *prev_value + diff);
542 *prev_value += diff;
543 auto cb = new DmaVirtCallback<uint64_t>(
544 [ = ] (const uint64_t &)
545 { updateHsaSignalDone(prev_value); });
546 dmaWriteVirt(value_addr, sizeof(uint64_t), cb, (void *)prev_value);
547}
548
549void
550GPUCommandProcessor::updateHsaSignalDone(uint64_t *signal_value)
551{
552 delete signal_value;
553}
554
555uint64_t
556GPUCommandProcessor::functionalReadHsaSignal(Addr signal_handle)
557{
558 Addr value_addr = getHsaSignalValueAddr(signal_handle);
559 auto tc = system()->threads[0];
560 ConstVPtr<Addr> prev_value(value_addr, tc);
561 return *prev_value;
562}
563
564void
565GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value,
566 HsaSignalCallbackFunction function)
567{
568 // The signal value is aligned 8 bytes from
569 // the actual handle in the runtime
570 Addr value_addr = getHsaSignalValueAddr(signal_handle);
571 Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);
572 Addr event_addr = getHsaSignalEventAddr(signal_handle);
573 DPRINTF(GPUCommandProc, "Triggering completion signal: %x!\n", value_addr);
574
575 auto cb = new DmaVirtCallback<uint64_t>(function, signal_value);
576
577 dmaWriteVirt(value_addr, sizeof(Addr), cb, &cb->dmaBuffer, 0);
578
579 auto tc = system()->threads[0];
580 ConstVPtr<uint64_t> mailbox_ptr(mailbox_addr, tc);
581
582 // Notifying an event with its mailbox pointer is
583 // not supported in the current implementation. Just use
584 // mailbox pointer to distinguish between interruptible
585 // and default signal. Interruptible signal will have
586 // a valid mailbox pointer.
587 if (*mailbox_ptr != 0) {
588 // This is an interruptible signal. Now, read the
589 // event ID and directly communicate with the driver
590 // about that event notification.
591 ConstVPtr<uint32_t> event_val(event_addr, tc);
592
593 DPRINTF(GPUCommandProc, "Calling signal wakeup event on "
594 "signal event value %d\n", *event_val);
595
596 // The mailbox/wakeup signal uses the SE mode proxy port to write
597 // the event value. This is not available in full system mode so
598 // instead we need to issue a DMA write to the address. The value of
599 // *event_val clears the event.
600 if (FullSystem) {
601 auto cb = new DmaVirtCallback<uint64_t>(function, *event_val);
602 dmaWriteVirt(mailbox_addr, sizeof(Addr), cb, &cb->dmaBuffer, 0);
603 } else {
604 signalWakeupEvent(*event_val);
605 }
606 }
607}
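// updateHsaSignal() is the older, partly functional path: the signal value
// is written with a DMA, but the mailbox and event words are read through
// the SE-mode proxy (ConstVPtr). That proxy is not usable in full system,
// so the event notification becomes a DMA write there, while SE mode calls
// straight into the driver's signalWakeupEvent().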
608
609void
610GPUCommandProcessor::attachDriver(GPUComputeDriver *gpu_driver)
611{
612 fatal_if(_driver, "Should not overwrite driver.");
613 // TODO: GPU Driver inheritance hierarchy doesn't really make sense.
614 // Should get rid of the base class.
615 _driver = gpu_driver;
616 assert(_driver);
617}
618
619GPUComputeDriver*
620GPUCommandProcessor::driver()
621{
622 return _driver;
623}
624
632
643void
644GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id,
645 Addr host_pkt_addr)
646{
647 auto vendor_pkt = (_hsa_generic_vendor_pkt *)raw_pkt;
648
649 if (vendor_pkt->completion_signal) {
650 sendCompletionSignal(vendor_pkt->completion_signal);
651 }
652
653 warn("Ignoring vendor packet\n");
654
655 hsaPP->finishPkt(raw_pkt, queue_id);
656}
657
665void
666GPUCommandProcessor::submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id,
667 Addr host_pkt_addr)
668{
669 //Parse the Packet, see what it wants us to do
670 _hsa_agent_dispatch_packet_t * agent_pkt =
671 (_hsa_agent_dispatch_packet_t *)raw_pkt;
672
673 if (agent_pkt->type == AgentCmd::Nop) {
674 DPRINTF(GPUCommandProc, "Agent Dispatch Packet NOP\n");
675 } else if (agent_pkt->type == AgentCmd::Steal) {
676 //This is where we steal the HSA Task's completion signal
677 int kid = agent_pkt->arg[0];
678 DPRINTF(GPUCommandProc,
679 "Agent Dispatch Packet Stealing signal handle for kernel %d\n",
680 kid);
681
682 HSAQueueEntry *task = dispatcher.hsaTask(kid);
683 uint64_t signal_addr = task->completionSignal();// + sizeof(uint64_t);
684
685 uint64_t return_address = agent_pkt->return_address;
686 DPRINTF(GPUCommandProc, "Return Addr: %p\n",return_address);
687 //*return_address = signal_addr;
688 Addr *new_signal_addr = new Addr;
689 *new_signal_addr = (Addr)signal_addr;
690 dmaWriteVirt(return_address, sizeof(Addr), nullptr, new_signal_addr, 0);
691
692 DPRINTF(GPUCommandProc,
693 "Agent Dispatch Packet Stealing signal handle from kid %d :" \
694 "(%x:%x) writing into %x\n",
695 kid,signal_addr,new_signal_addr,return_address);
696
697 } else
698 {
699 panic("The agent dispatch packet provided an unknown argument in "
700 "arg[0]; currently only 0 (nop) or 1 (return kernel signal) is accepted");
701 }
702
703 hsaPP->finishPkt(raw_pkt, queue_id);
704}
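// For AgentCmd::Steal the CP looks up the kernel's HSAQueueEntry, takes the
// address of its completion signal, and DMA-writes that address to the
// packet's return_address, presumably so the host can wait on the original
// kernel's signal directly rather than on the agent packet itself.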
705
712void
713GPUCommandProcessor::dispatchPkt(HSAQueueEntry *task)
714{
715 dispatcher.dispatch(task);
716}
717
718void
719GPUCommandProcessor::signalWakeupEvent(uint32_t event_id)
720{
721 _driver->signalWakeupEvent(event_id);
722}
723
724void
725GPUCommandProcessor::readPreload(AMDKernelCode *akc, HSAQueueEntry *task)
726{
727 _hsa_dispatch_packet_t *disp_pkt =
728 (_hsa_dispatch_packet_t *)task->dispPktPtr();
729
730 // Data preloaded is copied from the kernarg segment. Preloading starts at
731 // the dword offset specified by kernarg_preload_spec_offset.
732 Addr preload_addr = (Addr)disp_pkt->kernarg_address
734
735 DPRINTF(GPUCommandProc, "Kernarg preload starts at addr: %#x\n",
736 preload_addr);
737
744 bool is_system_page = true;
745 Addr phys_addr = preload_addr;
746
753 int vmid = 1;
754 unsigned tmp_bytes;
755 walker->startFunctional(gpuDevice->getVM().getPageTableBase(vmid),
756 phys_addr, tmp_bytes, BaseMMU::Mode::Read,
757 is_system_page);
758
759 DPRINTF(GPUCommandProc, "Kernarg preload data is in %s memory\n",
760 is_system_page ? "host" : "device");
761
766 if (is_system_page) {
767 // Unclear if this is even possible as the point of kernarg preload
768 // is to avoid loads from host memory by explicitly placing them in
769 // device memory. It is not difficult to implement so issue a warning
770 // for now to indicate a possible place to debug if something goes
771 // wrong and this warning is seen.
772 warn("Preload kernarg from host untested!\n");
773
774 auto cb = new DmaVirtCallback<uint32_t>(
775 [ = ] (const uint32_t&) {
776 initPreload(akc, task);
777 });
778
779 dmaReadVirt(preload_addr,
780 sizeof(uint32_t) * akc->kernarg_preload_spec_length,
781 cb, task->preloadArgs());
782 } else {
783 // Read from GPU memory manager one cache line at a time to prevent
784 // rare cases where the preload data spans two memory pages.
785 constexpr unsigned alignment_granularity = 64;
786 ChunkGenerator gen(preload_addr,
787 sizeof(uint32_t) * akc->kernarg_preload_spec_length,
788 alignment_granularity);
789
790 for (; !gen.done(); gen.next()) {
791 Addr chunk_addr = gen.addr();
792 int vmid = 1;
793 unsigned dummy;
794 walker->startFunctional(
795 gpuDevice->getVM().getPageTableBase(vmid), chunk_addr,
796 dummy, BaseMMU::Mode::Read, is_system_page);
797
798 Request::Flags flags = Request::PHYSICAL;
799 RequestPtr request = std::make_shared<Request>(chunk_addr,
800 alignment_granularity, flags,
801 walker->getDevRequestor());
802
803 PacketPtr readPkt = new Packet(request, MemCmd::ReadReq);
804 readPkt->dataStatic((uint8_t *)task->preloadArgs()
805 + gen.complete());
806
807 struct KernelDispatchData dispatchData;
808 dispatchData.akc = akc;
809 dispatchData.task = task;
810 dispatchData.readPkt = readPkt;
811 kernelDispatchList.push_back(dispatchData);
812 performTimingRead(readPkt,
813 ComputeUnit::SQCPort::SenderState::DISPATCH_PRELOAD_ARG);
814 }
815 }
816}
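// As with the kernel object fetch in submitDispatchPkt(), the preload data
// is brought in either by one virtual DMA read (host memory, untested per
// the warning above) or by 64-byte timing reads through the SQC port when
// it resides in device memory. In both cases the bytes land in
// task->preloadArgs(), and initPreload() runs once the reads complete.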
817
818void
819GPUCommandProcessor::initPreload(AMDKernelCode *akc, HSAQueueEntry *task)
820{
821 // Fill in SGPRs
822 int num_sgprs = akc->kernarg_preload_spec_length;
823
824 task->preloadLength(num_sgprs);
825 for (int i = 0; i < num_sgprs; ++i) {
826 DPRINTF(GPUCommandProc, "Task preload user SGPR[%d] = %x\n",
827 i, task->preloadArgs()[i]);
828 }
829
830 delete akc;
831
832 initABI(task);
833}
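// The preloaded kernarg dwords are recorded on the task (preloadLength()
// and preloadArgs()) and are later copied into user SGPRs when wavefronts
// are initialized; here the CP only logs them, frees its copy of the AKC,
// and falls through to the normal initABI() path.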
834
841void
842GPUCommandProcessor::initABI(HSAQueueEntry *task)
843{
844 auto cb = new DmaVirtCallback<uint32_t>(
845 [ = ] (const uint32_t &readDispIdOffset)
846 { ReadDispIdOffsetDmaEvent(task, readDispIdOffset); }, 0);
847
848 Addr hostReadIdxPtr
849 = hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr;
850
851 dmaReadVirt(hostReadIdxPtr + sizeof(hostReadIdxPtr),
852 sizeof(uint32_t), cb, &cb->dmaBuffer);
853}
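// initABI() begins ABI setup by DMA-reading the 32-bit dispatch-id offset
// that is stored immediately after the queue's hostReadIndexPtr (hence the
// "+ sizeof(hostReadIdxPtr)"); the ReadDispIdOffsetDmaEvent callback
// continues filling in the task state once that value arrives.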
854
855void
856GPUCommandProcessor::sanityCheckAKC(AMDKernelCode *akc)
857{
858 DPRINTF(GPUInitAbi, "group_segment_fixed_size: %d\n",
859 akc->group_segment_fixed_size);
860 DPRINTF(GPUInitAbi, "private_segment_fixed_size: %d\n",
861 akc->private_segment_fixed_size);
862 DPRINTF(GPUInitAbi, "kernarg_size: %d\n", akc->kernarg_size);
863 DPRINTF(GPUInitAbi, "kernel_code_entry_byte_offset: %d\n",
864 akc->kernel_code_entry_byte_offset);
865 DPRINTF(GPUInitAbi, "accum_offset: %d\n", akc->accum_offset);
866 DPRINTF(GPUInitAbi, "tg_split: %d\n", akc->tg_split);
867 DPRINTF(GPUInitAbi, "granulated_workitem_vgpr_count: %d\n",
868 akc->granulated_workitem_vgpr_count);
869 DPRINTF(GPUInitAbi, "granulated_wavefront_sgpr_count: %d\n",
870 akc->granulated_wavefront_sgpr_count);
871 DPRINTF(GPUInitAbi, "priority: %d\n", akc->priority);
872 DPRINTF(GPUInitAbi, "float_mode_round_32: %d\n", akc->float_mode_round_32);
873 DPRINTF(GPUInitAbi, "float_mode_round_16_64: %d\n",
874 akc->float_mode_round_16_64);
875 DPRINTF(GPUInitAbi, "float_mode_denorm_32: %d\n",
876 akc->float_mode_denorm_32);
877 DPRINTF(GPUInitAbi, "float_mode_denorm_16_64: %d\n",
878 akc->float_mode_denorm_16_64);
879 DPRINTF(GPUInitAbi, "priv: %d\n", akc->priv);
880 DPRINTF(GPUInitAbi, "enable_dx10_clamp: %d\n", akc->enable_dx10_clamp);
881 DPRINTF(GPUInitAbi, "debug_mode: %d\n", akc->debug_mode);
882 DPRINTF(GPUInitAbi, "enable_ieee_mode: %d\n", akc->enable_ieee_mode);
883 DPRINTF(GPUInitAbi, "bulky: %d\n", akc->bulky);
884 DPRINTF(GPUInitAbi, "cdbg_user: %d\n", akc->cdbg_user);
885 DPRINTF(GPUInitAbi, "fp16_ovfl: %d\n", akc->fp16_ovfl);
886 DPRINTF(GPUInitAbi, "wgp_mode: %d\n", akc->wgp_mode);
887 DPRINTF(GPUInitAbi, "mem_ordered: %d\n", akc->mem_ordered);
888 DPRINTF(GPUInitAbi, "fwd_progress: %d\n", akc->fwd_progress);
889 DPRINTF(GPUInitAbi, "enable_private_segment: %d\n",
890 akc->enable_private_segment);
891 DPRINTF(GPUInitAbi, "user_sgpr_count: %d\n", akc->user_sgpr_count);
892 DPRINTF(GPUInitAbi, "enable_trap_handler: %d\n", akc->enable_trap_handler);
893 DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_x: %d\n",
894 akc->enable_sgpr_workgroup_id_x);
895 DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_y: %d\n",
896 akc->enable_sgpr_workgroup_id_y);
897 DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_z: %d\n",
898 akc->enable_sgpr_workgroup_id_z);
899 DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_info: %d\n",
900 akc->enable_sgpr_workgroup_info);
901 DPRINTF(GPUInitAbi, "enable_vgpr_workitem_id: %d\n",
902 akc->enable_vgpr_workitem_id);
903 DPRINTF(GPUInitAbi, "enable_exception_address_watch: %d\n",
904 akc->enable_exception_address_watch);
905 DPRINTF(GPUInitAbi, "enable_exception_memory: %d\n",
906 akc->enable_exception_memory);
907 DPRINTF(GPUInitAbi, "granulated_lds_size: %d\n", akc->granulated_lds_size);
908 DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_invalid_operation: %d\n",
909 akc->enable_exception_ieee_754_fp_invalid_operation);
910 DPRINTF(GPUInitAbi, "enable_exception_fp_denormal_source: %d\n",
911 akc->enable_exception_fp_denormal_source);
912 DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_division_by_zero: %d\n",
913 akc->enable_exception_ieee_754_fp_division_by_zero);
914 DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_overflow: %d\n",
915 akc->enable_exception_ieee_754_fp_overflow);
916 DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_underflow: %d\n",
917 akc->enable_exception_ieee_754_fp_underflow);
918 DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_inexact: %d\n",
919 akc->enable_exception_ieee_754_fp_inexact);
920 DPRINTF(GPUInitAbi, "enable_exception_int_divide_by_zero: %d\n",
921 akc->enable_exception_int_divide_by_zero);
922 DPRINTF(GPUInitAbi, "enable_sgpr_private_segment_buffer: %d\n",
923 akc->enable_sgpr_private_segment_buffer);
924 DPRINTF(GPUInitAbi, "enable_sgpr_dispatch_ptr: %d\n",
925 akc->enable_sgpr_dispatch_ptr);
926 DPRINTF(GPUInitAbi, "enable_sgpr_queue_ptr: %d\n",
927 akc->enable_sgpr_queue_ptr);
928 DPRINTF(GPUInitAbi, "enable_sgpr_kernarg_segment_ptr: %d\n",
929 akc->enable_sgpr_kernarg_segment_ptr);
930 DPRINTF(GPUInitAbi, "enable_sgpr_dispatch_id: %d\n",
931 akc->enable_sgpr_dispatch_id);
932 DPRINTF(GPUInitAbi, "enable_sgpr_flat_scratch_init: %d\n",
933 akc->enable_sgpr_flat_scratch_init);
934 DPRINTF(GPUInitAbi, "enable_sgpr_private_segment_size: %d\n",
935 akc->enable_sgpr_private_segment_size);
936 DPRINTF(GPUInitAbi, "enable_wavefront_size32: %d\n",
937 akc->enable_wavefront_size32);
938 DPRINTF(GPUInitAbi, "use_dynamic_stack: %d\n", akc->use_dynamic_stack);
939 DPRINTF(GPUInitAbi, "kernarg_preload_spec_length: %d\n",
940 akc->kernarg_preload_spec_length);
941 DPRINTF(GPUInitAbi, "kernarg_preload_spec_offset: %d\n",
942 akc->kernarg_preload_spec_offset);
943
944
945 // Check for features not implemented in gem5
946 fatal_if(akc->wgp_mode, "WGP mode not supported\n");
947 fatal_if(akc->mem_ordered, "Memory ordering control not supported\n");
948 fatal_if(akc->fwd_progress, "Fwd_progress mode not supported\n");
949
950
951 // Warn on features that gem5 will ignore
952 warn_if(akc->fp16_ovfl, "FP16 clamp control bit ignored\n");
953 warn_if(akc->bulky, "Bulky code object bit ignored\n");
954 // TODO: All the IEEE bits
955
956 warn_if(akc->tg_split, "TG split not implemented\n");
957}
958
959System*
960GPUCommandProcessor::system()
961{
962 return sys;
963}
964
965AddrRangeList
966GPUCommandProcessor::getAddrRanges() const
967{
968 AddrRangeList ranges;
969 return ranges;
970}
971
972void
973GPUCommandProcessor::setGPUDevice(AMDGPUDevice *gpu_device)
974{
975 gpuDevice = gpu_device;
976 walker->setDevRequestor(gpuDevice->vramRequestorId());
977}
978
979void
980GPUCommandProcessor::setShader(Shader *shader)
981{
982 _shader = shader;
983}
984
985Shader*
986GPUCommandProcessor::shader()
987{
988 return _shader;
989}
990
991GfxVersion
992GPUCommandProcessor::getGfxVersion() const
993{
994 return FullSystem ? gpuDevice->getGfxVersion() : _driver->getGfxVersion();
995}
996
997} // namespace gem5