develop/amdgpu__device_8cc_source.html

/*

 * Copyright (c) 2021 Advanced Micro Devices, Inc.

 * All rights reserved.

 *

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions are met:

 *

 * 1. Redistributions of source code must retain the above copyright notice,

 * this list of conditions and the following disclaimer.

 *

 * 2. Redistributions in binary form must reproduce the above copyright notice,

 * this list of conditions and the following disclaimer in the documentation

 * and/or other materials provided with the distribution.

 *

 * 3. Neither the name of the copyright holder nor the names of its

 * contributors may be used to endorse or promote products derived from this

 * software without specific prior written permission.

 *

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE

 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

 * POSSIBILITY OF SUCH DAMAGE.

 */


#include "dev/amdgpu/amdgpu_device.hh"

#include <cstring>
#include <fstream>

#include "debug/AMDGPUDevice.hh"
#include "dev/amdgpu/amdgpu_nbio.hh"
#include "dev/amdgpu/amdgpu_vm.hh"
#include "dev/amdgpu/interrupt_handler.hh"
#include "dev/amdgpu/pm4_packet_processor.hh"
#include "dev/amdgpu/sdma_engine.hh"
#include "dev/hsa/hw_scheduler.hh"
#include "gpu-compute/gpu_command_processor.hh"
#include "gpu-compute/shader.hh"
#include "mem/abstract_mem.hh"
#include "mem/packet.hh"
#include "mem/packet_access.hh"
#include "params/AMDGPUDevice.hh"
#include "sim/byteswap.hh"
#include "sim/sim_exit.hh"


namespace gem5

{


/**
 * Construct the GPU device model from its parameters.
 *
 * The constructor registers every device memory with the system, selects
 * the gfx version from the device name, wires the SDMA engines, PM4 packet
 * processors, interrupt handler and command processor back to this device,
 * programs the MMIO apertures and the framebuffer location / memory size
 * registers, and optionally loads an IP discovery table from a file.
 *
 * @param p Device parameters. Panics if p.device_name is not one of the
 *          names handled below; exits with a fatal error if two PM4
 *          packet processors share an IP ID, if none has IP ID 0, or if
 *          an IP discovery file was given but cannot be opened.
 */
AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
    : PciEndpoint(p),
      gpuMemMgr(p.memory_manager),
      deviceIH(p.device_ih),
      cp(p.cp),
      checkpoint_before_mmios(p.checkpoint_before_mmios),
      init_interrupt_count(0),
      _lastVMID(0),
      deviceMem(name() + ".deviceMem", p.memories, false, "", false),
      system(p.system),
      gpuId(p.gpu_id)
{
    uint64_t vram_size = 0;

    // System pointer needs to be explicitly set for device memory since
    // DRAMCtrl uses it to get (1) cache line size and (2) the mem mode.
    // Note this means the cache line size is system wide.
    for (auto& m : p.memories) {
        m->system(p.system);

        // Add to system's device memory map.
        p.system->addDeviceMemory(gpuMemMgr->getRequestorID(), m);

        vram_size += m->getAddrRange().size();
    }

    vramSize = vram_size;

    // Use the expansion ROM address from the PCI config when it is
    // non-zero, otherwise fall back to the default VGA ROM address.
    if (config().expansionROM) {
        romRange = RangeSize(config().expansionROM, ROM_SIZE);
    } else {
        romRange = RangeSize(VGA_ROM_DEFAULT, ROM_SIZE);
    }

    if (p.device_name == "Vega10") {
        gfx_version = GfxVersion::gfx900;
    } else if (p.device_name == "MI100") {
        gfx_version = GfxVersion::gfx908;
    } else if (p.device_name == "MI200") {
        gfx_version = GfxVersion::gfx90a;
    } else if (p.device_name == "MI300X") {
        gfx_version = GfxVersion::gfx942;
    } else if (p.device_name == "MI355X") {
        gfx_version = GfxVersion::gfx950;
    } else {
        panic("Unknown GPU device %s\n", p.device_name);
    }

    // SDMA engines are numbered in the order they appear in the params.
    int sdma_id = 0;
    for (auto& s : p.sdmas) {
        s->setGPUDevice(this);
        s->setId(sdma_id);
        sdmaIds.insert({sdma_id, s});
        sdmaMmios.insert({sdma_id,
                          RangeSize(s->getMmioBase(), s->getMmioSize())});
        DPRINTF(AMDGPUDevice, "SDMA%d has MMIO range %s\n", sdma_id,
                sdmaMmios[sdma_id].to_string().c_str());
        sdma_id++;
    }

    // Map SDMA MMIO addresses to functions
    sdmaFunc.insert({0x81, &SDMAEngine::setGfxBaseLo});
    sdmaFunc.insert({0x82, &SDMAEngine::setGfxBaseHi});
    sdmaFunc.insert({0x88, &SDMAEngine::setGfxRptrHi});
    sdmaFunc.insert({0x89, &SDMAEngine::setGfxRptrLo});
    sdmaFunc.insert({0x92, &SDMAEngine::setGfxDoorbellLo});
    sdmaFunc.insert({0xab, &SDMAEngine::setGfxDoorbellOffsetLo});
    sdmaFunc.insert({0x80, &SDMAEngine::setGfxSize});
    sdmaFunc.insert({0xb2, &SDMAEngine::setGfxWptrLo});
    sdmaFunc.insert({0xb3, &SDMAEngine::setGfxWptrHi});
    if (p.device_name == "Vega10") {
        sdmaFunc.insert({0xe1, &SDMAEngine::setPageBaseLo});
        sdmaFunc.insert({0xe9, &SDMAEngine::setPageRptrLo});
        sdmaFunc.insert({0xe8, &SDMAEngine::setPageRptrHi});
        sdmaFunc.insert({0xf2, &SDMAEngine::setPageDoorbellLo});
        sdmaFunc.insert({0x10b, &SDMAEngine::setPageDoorbellOffsetLo});
        sdmaFunc.insert({0xe0, &SDMAEngine::setPageSize});
        sdmaFunc.insert({0x113, &SDMAEngine::setPageWptrLo});
    } else if (p.device_name == "MI100" || p.device_name == "MI200" ||
               p.device_name == "MI300X" || p.device_name == "MI355X") {
        sdmaFunc.insert({0xd9, &SDMAEngine::setPageBaseLo});
        sdmaFunc.insert({0xe1, &SDMAEngine::setPageRptrLo});
        sdmaFunc.insert({0xe0, &SDMAEngine::setPageRptrHi});
        sdmaFunc.insert({0xea, &SDMAEngine::setPageDoorbellLo});
        sdmaFunc.insert({0xd8, &SDMAEngine::setPageDoorbellOffsetLo});
        sdmaFunc.insert({0x10b, &SDMAEngine::setPageWptrLo});
    } else {
        panic("Unknown GPU device %s\n", p.device_name);
    }

    // Setup PM4 packet processors and sanity check IDs
    std::set<int> pm4_ids;
    for (auto& pm4 : p.pm4_pkt_procs) {
        pm4->setGPUDevice(this);
        fatal_if(pm4_ids.count(pm4->getIpId()),
                "Two PM4s with same IP IDs is not allowed");
        pm4_ids.insert(pm4->getIpId());
        pm4PktProcs.insert({pm4->getIpId(), pm4});

        pm4Ranges.insert({pm4->getMMIORange(), pm4});
    }

    // There should be at least one PM4 packet processor with ID 0
    fatal_if(!pm4PktProcs.count(0), "No default PM4 processor found");

    deviceIH->setGPUDevice(this);
    cp->hsaPacketProc().setGPUDevice(this);
    cp->setGPUDevice(this);
    nbio.setGPUDevice(this);
    gpuvm.setGPUDevice(this);
    smu.setGPUDevice(this);

    // Address aperture for device memory. We tell this to the driver and
    // could possibly be anything, but these are the values used by hardware.
    uint64_t mmhubBase = 0x8000ULL << 24;
    uint64_t mmhubTop = 0x83ffULL << 24;
    uint64_t mmio_mem_size = vram_size / 0x100000;

    // The driver adds + 1 to MMIO value to reduce the number of bits required
    // to represent max memory size. Subtract one here before writing MMIO.
    mmio_mem_size -= 0x1;

    gpuvm.setMMHUBBase(mmhubBase);
    gpuvm.setMMHUBTop(mmhubTop);

    // Map other MMIO apertures based on gfx version. For MI300X+ these come
    // from the ip discovery table (see ip_discovery_header struct in
    // include/discovery.h in amdgpu driver). Common values for MI200 - MI350:
    // NBIO               0x0     - 0x4280
    // IH                 0x4280  - 0x4980
    // GRBM               0x8000  - 0xC000
    // GFX                0x28000 - 0x3F000
    // MMHUB              0x68000 - 0x6a120 (MI200)
    // MMHUB              0x60D00 - 0x62E20 (MI3xx)
    // SMU                0x5a000 - 0x5ace4
    //
    // This must be done before any calls to get/setRegVal.
    gpuvm.setMMIOAperture(NBIO_MMIO_RANGE, AddrRange(0x0, 0x4280));
    gpuvm.setMMIOAperture(IH_MMIO_RANGE,   AddrRange(0x4280, 0x4980));
    gpuvm.setMMIOAperture(GRBM_MMIO_RANGE, AddrRange(0x8000, 0xC000));
    gpuvm.setMMIOAperture(GFX_MMIO_RANGE,  AddrRange(0x28000, 0x3F000));
    if (getGfxVersion() == GfxVersion::gfx942 ||
        getGfxVersion() == GfxVersion::gfx950) {
        gpuvm.setMMIOAperture(MMHUB_MMIO_RANGE,  AddrRange(0x60D00, 0x62E20));
    } else {
        gpuvm.setMMIOAperture(MMHUB_MMIO_RANGE,  AddrRange(0x68000, 0x6A120));
    }
    gpuvm.setMMIOAperture(SMU_MMIO_RANGE, AddrRange(0x5A000, 0x5ACE4));

    // These are hardcoded register values to return what the driver expects
    setRegVal(AMDGPU_MP0_SMN_C2PMSG_33, 0x80000000);

    // There are different registers for different GPUs, so we set the value
    // based on the GPU type specified by the user.
    if (p.device_name == "Vega10") {
        setRegVal(VEGA10_FB_LOCATION_BASE, mmhubBase >> 24);
        setRegVal(VEGA10_FB_LOCATION_TOP, mmhubTop >> 24);
    } else if (p.device_name == "MI100") {
        setRegVal(MI100_FB_LOCATION_BASE, mmhubBase >> 24);
        setRegVal(MI100_FB_LOCATION_TOP, mmhubTop >> 24);
        setRegVal(MI100_MEM_SIZE_REG, mmio_mem_size);
    } else if (p.device_name == "MI200") {
        // This device can have either 64GB or 128GB of device memory.
        // This limits to 16GB for simulation.
        setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24);
        setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24);
        setRegVal(MI200_MEM_SIZE_REG, mmio_mem_size);
    } else if (p.device_name == "MI300X" || p.device_name == "MI355X") {
        // The MMIO addresses are the same in MI300X and MI355X
        // VRAM size in MB (shifted right by 20 bits)
        setRegVal(MI300X_FB_LOCATION_BASE, mmhubBase >> 24);
        setRegVal(MI300X_FB_LOCATION_TOP, mmhubTop >> 24);
        setRegVal(MI300X_MEM_SIZE_REG, mmio_mem_size);
    } else {
        panic("Unknown GPU device %s\n", p.device_name);
    }

    // IP discovery from VRAM for MI300X+. If ipt_binary is None, then assume
    // the driver is being loaded using discovery=2 to read from the disk.
    // In that case gem5 does not have to do anything special.
    bool use_ip_discovery = false;

    if (getGfxVersion() == GfxVersion::gfx942 ||
        getGfxVersion() == GfxVersion::gfx950) {
        use_ip_discovery = true;

        if (p.ipt_binary == "") {
            DPRINTF(AMDGPUDevice, "Assuming discovery=2 for IP discovery\n");
        }
    }

    if (use_ip_discovery && p.ipt_binary != "") {
        // From ROCk driver: amdgpu/amdgpu_discovery.h:
        constexpr uint64_t DISCOVERY_TMR_OFFSET = (64 << 10);
        constexpr int IPT_SIZE_DW = 0xa00;
        constexpr std::streamsize IPT_SIZE_BYTES =
            IPT_SIZE_DW * sizeof(uint32_t);
        uint64_t ip_table_base = (mmio_mem_size << 20) - DISCOVERY_TMR_OFFSET;

        DPRINTF(AMDGPUDevice, "Using IP discovery file %s\n", p.ipt_binary);

        std::ifstream iptBin;
        iptBin.open(p.ipt_binary, std::ios::binary);
        fatal_if(!iptBin.is_open(), "Unable to open IP discovery file %s",
                 p.ipt_binary);

        // Zero-fill the table so that any dwords a short file does not
        // provide are written as 0 instead of indeterminate stack contents.
        std::array<uint32_t, IPT_SIZE_DW> ipTable{};
        iptBin.read(reinterpret_cast<char *>(ipTable.data()), IPT_SIZE_BYTES);
        if (iptBin.gcount() != IPT_SIZE_BYTES) {
            warn("IP discovery file %s provided %d of %d bytes; the "
                 "remaining table entries are zero\n", p.ipt_binary,
                 iptBin.gcount(), IPT_SIZE_BYTES);
        }
        iptBin.close();

        // Write every dword of the table through setRegVal, starting at
        // ip_table_base.
        for (int ipt_dword = 0x0; ipt_dword < IPT_SIZE_DW; ipt_dword++) {
            Addr ipt_addr = ip_table_base + ipt_dword*4;

            // The driver uses part of the address for something that is not
            // an address bit. Build the fixed-up address by moving bits 31
            // and above of ipt_addr up to bit 32, keeping the lower 31 bits
            // in place and forcing bit 31 (0x80000000) to one.
            Addr ipt_addr_hi = ipt_addr >> 31;
            Addr fixup_addr = (ipt_addr_hi << 32) | (ipt_addr & 0x7fffffff)
                            | 0x80000000;

            setRegVal(fixup_addr, ipTable[ipt_dword]);
            DPRINTF(AMDGPUDevice, "IPTable wrote dword %d (%x) to %lx\n",
                    ipt_dword, ipTable[ipt_dword], fixup_addr);
        }
    }
}


/**
 * Service a read of the expansion ROM by reading the same offset from the
 * VGA ROM region in physical memory.
 *
 * @param pkt Incoming read packet; its data is overwritten with the value
 *            read at VGA_ROM_DEFAULT plus the ROM offset.
 */
void
AMDGPUDevice::readROM(PacketPtr pkt)
{
    Addr rom_offset = pkt->getAddr() & (ROM_SIZE - 1);

    // Read directly from the VGA ROM region. For multiple GPUs, this means
    // every GPU must be the same type. However, this allows for one less
    // input file as the GPU VBIOS is already part of the gem5 resources disk
    // image and loaded at the VGA_ROM_DEFAULT address as part of readfile.
    RequestPtr request = std::make_shared<Request>(
        VGA_ROM_DEFAULT + rom_offset, pkt->getSize(), 0, vramRequestorId());

    auto readPkt = new Packet(request, MemCmd::ReadReq);
    readPkt->allocate();

    system->getPhysMem().access(readPkt);

    const uint64_t rom_data = readPkt->getUintX(ByteOrder::little);

    // The helper packet is only needed for this access. Delete it here so
    // it is not leaked on every ROM read.
    delete readPkt;

    DPRINTF(AMDGPUDevice, "Read from VGA ROM offset %#x returned %#x\n",
            rom_offset, rom_data);

    pkt->setUintX(rom_data, ByteOrder::little);
}


/**
 * Service a write to the expansion ROM by writing the value to the same
 * offset of the VGA ROM region in physical memory.
 *
 * @param pkt Incoming write packet; its address must lie in the ROM range.
 */
void
AMDGPUDevice::writeROM(PacketPtr pkt)
{
    assert(isROM(pkt->getAddr()));

    // Write directly to the VGA ROM region at VGA_ROM_DEFAULT address.
    Addr rom_offset = pkt->getAddr() - romRange.start();
    uint64_t rom_data = pkt->getUintX(ByteOrder::little);

    RequestPtr request = std::make_shared<Request>(
        VGA_ROM_DEFAULT + rom_offset, pkt->getSize(), 0, vramRequestorId());

    auto writePkt = new Packet(request, MemCmd::WriteReq);
    writePkt->allocate();
    writePkt->setUintX(rom_data, ByteOrder::little);

    system->getPhysMem().access(writePkt);

    DPRINTF(AMDGPUDevice, "Wrote to VGA ROM offset %#x value %#x\n",
            rom_offset, writePkt->getUintX(ByteOrder::little));

    // The helper packet is only needed for this access. Delete it here so
    // it is not leaked on every ROM write.
    delete writePkt;
}


/**
 * Report the address ranges this device responds to: the ROM range plus
 * every range reported by PciEndpoint that does not start at address zero.
 */
AddrRangeList
AMDGPUDevice::getAddrRanges() const
{
    AddrRangeList visible_ranges;
    visible_ranges.push_back(romRange);

    // A range starting at zero is assumed to be not yet assigned by the OS.
    // Such ranges are left out because they will surely overlap with another
    // range, which causes the I/O crossbar to fatal.
    for (const auto &bar_range : PciEndpoint::getAddrRanges()) {
        if (bar_range.start() == 0) {
            continue;
        }

        visible_ranges.push_back(bar_range);
    }

    return visible_ranges;
}


/**
 * Handle a PCI configuration space read.
 *
 * Offsets below PCI_DEVICE_SPECIFIC are forwarded to PciEndpoint. Offsets
 * inside the PXCAP structure are served from pxcap.data. Any other offset
 * only produces a warning. When checkpoint_before_mmios is set, three
 * consecutive reads of PCI_INTERRUPT_PIN schedule a "checkpoint" exit.
 *
 * @param pkt Configuration read packet.
 * @return configDelay.
 */
Tick
AMDGPUDevice::readConfig(PacketPtr pkt)
{
    int offset = pkt->getAddr() & PCI_CONFIG_SIZE;

    if (offset < PCI_DEVICE_SPECIFIC) {
        PciEndpoint::readConfig(pkt);
    } else {
        if (offset >= PXCAP_BASE && offset < (PXCAP_BASE + sizeof(PXCAP))) {
            int pxcap_offset = offset - PXCAP_BASE;

            // NOTE(review): only the first byte of the access is range
            // checked above; a multi-byte read that starts near the end of
            // PXCAP is not rejected here.
            switch (pkt->getSize()) {
                case sizeof(uint8_t):
                    pkt->setLE<uint8_t>(pxcap.data[pxcap_offset]);
                    DPRINTF(AMDGPUDevice,
                            "Read PXCAP:  dev %#x func %#x reg %#x 1 bytes: "
                            "data = %#x\n",
                            _devAddr.dev, _devAddr.func, pxcap_offset,
                            (uint32_t)pkt->getLE<uint8_t>());
                    break;
                case sizeof(uint16_t): {
                    // Copy the bytes out rather than casting the byte
                    // pointer, which is neither alignment- nor
                    // aliasing-safe.
                    uint16_t value16 = 0;
                    std::memcpy(&value16, &pxcap.data[pxcap_offset],
                                sizeof(value16));
                    pkt->setLE<uint16_t>(value16);
                    DPRINTF(AMDGPUDevice,
                            "Read PXCAP:  dev %#x func %#x reg %#x 2 bytes: "
                            "data = %#x\n",
                            _devAddr.dev, _devAddr.func, pxcap_offset,
                            (uint32_t)pkt->getLE<uint16_t>());
                    break;
                }
                case sizeof(uint32_t): {
                    uint32_t value32 = 0;
                    std::memcpy(&value32, &pxcap.data[pxcap_offset],
                                sizeof(value32));
                    pkt->setLE<uint32_t>(value32);
                    DPRINTF(AMDGPUDevice,
                            "Read PXCAP:  dev %#x func %#x reg %#x 4 bytes: "
                            "data = %#x\n",
                            _devAddr.dev, _devAddr.func, pxcap_offset,
                            (uint32_t)pkt->getLE<uint32_t>());
                    break;
                }
                default:
                    panic("Invalid access size (%d) for amdgpu PXCAP %#x\n",
                          pkt->getSize(), pxcap_offset);
            }
            pkt->makeAtomicResponse();
        } else {
            warn("Device specific offset %d not implemented!\n", offset);
        }
    }

    // Before sending MMIOs the driver sends three interrupts in a row.
    // Use this to trigger creating a checkpoint to restore in timing mode.
    // This is only necessary until we can create a "hole" in the KVM VM
    // around the VGA ROM region such that KVM exits and sends requests to
    // this device rather than the KVM VM.
    if (checkpoint_before_mmios) {
        if (offset == PCI_INTERRUPT_PIN) {
            if (++init_interrupt_count == 3) {
                DPRINTF(AMDGPUDevice, "Checkpointing before first MMIO\n");
                exitSimLoop("checkpoint", 0, curTick() + configDelay + 1);
            }
        } else {
            // Any other offset breaks the run of consecutive reads.
            init_interrupt_count = 0;
        }
    }

    return configDelay;
}


/**
 * Handle a PCI configuration space write.
 *
 * Writes below PCI_DEVICE_SPECIFIC go to PciEndpoint, except for the
 * expansion ROM size probe which is intercepted. Writes that fall inside
 * the PXCAP structure are copied into pxcap.data.
 *
 * @param pkt Configuration write packet.
 * @return The delay from PciEndpoint::writeConfig when the write is
 *         forwarded, configDelay otherwise.
 */
Tick
AMDGPUDevice::writeConfig(PacketPtr pkt)
{
    const int offset = pkt->getAddr() & PCI_CONFIG_SIZE;
    DPRINTF(AMDGPUDevice, "Write Config: from offset: %#x size: %#x "
            "data: %#x\n", offset, pkt->getSize(),
            pkt->getUintX(ByteOrder::little));

    if (offset < PCI_DEVICE_SPECIFIC) {
        // For the Expansion ROM BAR, Linux will write ~0x7ff before reading
        // the ROM bar size. If we simply return the written value, the ROM
        // size is only 0x800 which is too small for the GPU VBIOS. Here we
        // override the default PciDevice behavior and set the next read to
        // return 4kiB size. This is enough to load the *used* portions of
        // the VBIOS. See how PCI_ROM_ADDRESS is handled in the function:
        // github.com/torvalds/linux/blob/master/drivers/pci/probe.c#L176
        const bool rom_size_probe = offset == PCI0_ROM_BASE_ADDR &&
            letoh(pkt->getLE<uint32_t>()) == 0xfffff800;

        // Everything other than the size probe takes the default path.
        if (!rom_size_probe) {
            return PciEndpoint::writeConfig(pkt);
        }

        DPRINTF(AMDGPUDevice, "Setting expansion ROM size to 0x1000\n");

        config().expansionROM = 0xfffff000;
    }

    const bool targets_pxcap =
        offset >= PXCAP_BASE && offset < (PXCAP_BASE + sizeof(PXCAP));

    if (targets_pxcap) {
        const int pxcap_offset = offset - PXCAP_BASE;
        uint8_t *pxcap_bytes = &(pxcap.data[0]);

        DPRINTF(AMDGPUDevice, "Writing PXCAP offset %d size %d\n",
                pxcap_offset, pkt->getSize());

        // Store the written bytes at the matching offset inside PXCAP.
        memcpy(pxcap_bytes + pxcap_offset, pkt->getConstPtr<void>(),
               pkt->getSize());
    }

    pkt->makeAtomicResponse();

    return configDelay;
}


/**
 * Trace a completed device access and turn the packet into an atomic
 * response.
 *
 * @param pkt  Packet that was just serviced.
 * @param read True for a read access, false for a write; only used for
 *             the trace message.
 */
void
AMDGPUDevice::dispatchAccess(PacketPtr pkt, bool read)
{
    [[maybe_unused]] const char *access_kind = read ? "Read" : "Write";

    DPRINTF(AMDGPUDevice, "%s from addr %#x size: %#x data: %#x\n",
            access_kind, pkt->getAddr(), pkt->getSize(),
            pkt->getUintX(ByteOrder::little));

    pkt->makeAtomicResponse();
}


/**
 * Service a read from the framebuffer BAR.
 *
 * NBIO gets the first chance to answer. Otherwise the value is read
 * functionally through the first compute unit's memory port and, if that
 * reports a functional read error, directly from the device memory that
 * backs the address.
 *
 * @param pkt    Incoming read packet; receives the value that was read.
 * @param offset Offset of the access within the framebuffer BAR.
 */
void
AMDGPUDevice::readFrame(PacketPtr pkt, Addr offset)
{
    DPRINTF(AMDGPUDevice, "Read framebuffer address %#lx\n", offset);

    /*
     * Return data for frame reads in priority order: (1) Special addresses
     * first, ignoring any writes from driver. (2) Any other address from
     * device backing store / abstract memory class functionally.
     */
    if (nbio.readFrame(pkt, offset)) {
        return;
    }

    /*
     * Read the value from device memory. This must be done functionally
     * because this method is called by the PCIEndpoint::read method which
     * is a non-timing read.
     */
    RequestPtr req = std::make_shared<Request>(
            offset, pkt->getSize(), 0, vramRequestorId());

    PacketPtr readPkt = new Packet(req, MemCmd::ReadReq);
    uint8_t *dataPtr = new uint8_t[pkt->getSize()];
    // dataDynamic() hands the buffer to the packet, which releases it when
    // the packet is deleted. The buffer must never be deleted separately.
    readPkt->dataDynamic(dataPtr);
    readPkt->req->setGPUFuncAccess(true);
    readPkt->setSuppressFuncError();
    cp->shader()->cuList[0]->memPort[0].sendFunctional(readPkt);

    if (readPkt->cmd == MemCmd::FunctionalReadError) {
        // The functional path could not provide the data. Drop the failed
        // packet (and with it its buffer) and retry against the device
        // memory. The outer variables are reused, not shadowed, so the
        // code after this block sees the retry packet rather than the
        // deleted one.
        delete readPkt;

        req = std::make_shared<Request>(offset, pkt->getSize(), 0,
                                        vramRequestorId());
        readPkt = Packet::createRead(req);
        dataPtr = new uint8_t[pkt->getSize()];
        readPkt->dataDynamic(dataPtr);

        auto gpu_system = cp->shader()->gpuCmdProc.system();
        gpu_system->getDeviceMemory(readPkt)->access(readPkt);
    }

    pkt->setUintX(readPkt->getUintX(ByteOrder::little), ByteOrder::little);
    delete readPkt;
}


/**
 * Service a read from the doorbell BAR using the MMIO trace reader.
 *
 * @param pkt    Incoming read packet.
 * @param offset Offset of the access within the doorbell BAR.
 */
void
AMDGPUDevice::readDoorbell(PacketPtr pkt, Addr offset)
{
    DPRINTF(AMDGPUDevice, "Read doorbell %#lx\n", offset);

    // Doorbell reads have no dedicated model; the trace reader handles them.
    mmioReader.readFromTrace(pkt, DOORBELL_BAR, offset);
}


/**
 * Service a read from the MMIO BAR.
 *
 * The packet is first passed to the MMIO trace reader. The handler for
 * the aperture containing the offset (NBIO, GRBM, GFX, MMHUB or SMU) is
 * then given the packet as well.
 *
 * @param pkt    Incoming read packet.
 * @param offset Offset of the access within the MMIO BAR.
 */
void
AMDGPUDevice::readMMIO(PacketPtr pkt, Addr offset)
{
    const AddrRange aperture = gpuvm.getMMIOAperture(offset);
    const Addr aperture_offset = offset - aperture.start();

    // True when the aperture found above is the one registered under id.
    auto aperture_is = [&](auto range_id) {
        return aperture == gpuvm.getMMIORange(range_id);
    };

    // By default read from MMIO trace. Overwrite the packet for a select
    // few more dynamic MMIOs.
    DPRINTF(AMDGPUDevice, "Read MMIO %#lx\n", offset);
    mmioReader.readFromTrace(pkt, MMIO_BAR, offset);

    if (aperture_is(NBIO_MMIO_RANGE)) {
        DPRINTF(AMDGPUDevice, "NBIO base\n");
        nbio.readMMIO(pkt, aperture_offset);
        return;
    }

    if (aperture_is(GRBM_MMIO_RANGE)) {
        DPRINTF(AMDGPUDevice, "GRBM base\n");
        gpuvm.readMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT);
        return;
    }

    if (aperture_is(GFX_MMIO_RANGE)) {
        DPRINTF(AMDGPUDevice, "GFX base\n");
        gfx.readMMIO(pkt, aperture_offset);
        return;
    }

    if (aperture_is(MMHUB_MMIO_RANGE)) {
        DPRINTF(AMDGPUDevice, "MMHUB base\n");
        gpuvm.readMMIO(pkt, aperture_offset >> MMHUB_OFFSET_SHIFT);
        return;
    }

    if (aperture_is(SMU_MMIO_RANGE)) {
        DPRINTF(AMDGPUDevice, "SMU base\n");
        smu.readMMIO(pkt, aperture_offset >> SMU_OFFSET_SHIFT);
        return;
    }

    DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for read %#x\n", offset);
}


/**
 * Service a write to the framebuffer BAR.
 *
 * Every compute unit is sent an L2 invalidate for the cache line holding
 * the address, writes into the GART aperture are recorded in the GART
 * table, NBIO is notified, and the data is written to the device memory
 * backing the address.
 *
 * @param pkt    Incoming write packet.
 * @param offset Offset of the access within the framebuffer BAR.
 */
void
AMDGPUDevice::writeFrame(PacketPtr pkt, Addr offset)
{
    DPRINTF(AMDGPUDevice, "Wrote framebuffer address %#lx (size %d)\n", offset,
            pkt->getSize());

    // The line address is identical for every CU, so compute it once. The
    // mask is built in Addr width so the upper bits of offset are kept
    // whatever integer type getCacheLineSize() returns.
    const Addr line_mask =
        ~(static_cast<Addr>(gpuMemMgr->getCacheLineSize()) - 1);
    const Addr aligned_addr = offset & line_mask;

    for (auto& cu: CP()->shader()->cuList) {
        cu->sendInvL2(aligned_addr);
    }

    Addr aperture = gpuvm.getFrameAperture(offset);
    Addr aperture_offset = offset - aperture;

    // Record the value
    if (aperture == gpuvm.gartBase()) {
        gpuvm.gartTable[aperture_offset] = pkt->getUintX(ByteOrder::little);
        DPRINTF(AMDGPUDevice, "GART translation %p -> %p\n", aperture_offset,
                gpuvm.gartTable[aperture_offset]);
    }

    nbio.writeFrame(pkt, offset);

    /*
     * Write the value to device memory. This must be done functionally
     * because this method is called by the PCIEndpoint::write method which
     * is a non-timing write.
     */
    RequestPtr req = std::make_shared<Request>(offset, pkt->getSize(), 0,
                                               vramRequestorId());
    PacketPtr writePkt = Packet::createWrite(req);
    uint8_t *dataPtr = new uint8_t[pkt->getSize()];
    std::memcpy(dataPtr, pkt->getPtr<uint8_t>(), pkt->getSize());
    // The packet takes the buffer over and frees it when it is deleted.
    writePkt->dataDynamic(dataPtr);

    auto gpu_system = cp->shader()->gpuCmdProc.system();

    // If for some reason no device memory is found for this address, ignore
    // the packet. This is an extremely rare situation and seems to only
    // happen with one address that is not important, therefore warn only.
    auto *backing_mem = gpu_system->getDeviceMemory(writePkt);
    if (backing_mem) {
        backing_mem->access(writePkt);
    } else {
        warn("Unable to find device memory for address %#lx\n", offset);
    }

    delete writePkt;
}


/**
 * Service a write to the doorbell BAR.
 *
 * A doorbell with a registered queue type is forwarded to the matching
 * PM4 packet processor, SDMA engine, hardware scheduler or interrupt
 * handler. A doorbell that is not registered yet is copied and parked in
 * pendingDoorbellPkts so it can be replayed later.
 *
 * @param pkt    Incoming write packet holding the doorbell value.
 * @param offset Offset of the doorbell within the doorbell BAR.
 */
void
AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset)
{
    DPRINTF(AMDGPUDevice, "Wrote doorbell %#lx\n", offset);

    auto doorbell = doorbells.find(offset);

    if (doorbell == doorbells.end()) {
        warn("Unknown doorbell offset: %lx. Saving to pending doorbells.\n",
             offset);

        // We have to ACK the PCI packet immediately, so create a copy of the
        // packet here to send again. The packet data contains the value of
        // the doorbell to write so we need to copy that as the original
        // packet gets deleted after the PCI write() method returns.
        RequestPtr pending_req(pkt->req);
        PacketPtr pending_pkt = Packet::createWrite(pending_req);
        uint8_t *pending_data = new uint8_t[pkt->getSize()];
        memcpy(pending_data, pkt->getPtr<uint8_t>(), pkt->getSize());
        pending_pkt->dataDynamic(pending_data);

        // emplace() keeps an entry that is already parked for this offset.
        // In that case the fresh copy was not stored anywhere, so free it
        // (the packet owns pending_data) instead of leaking it.
        // NOTE(review): the first pending write still wins, as before;
        // confirm whether the newest doorbell value should replace it.
        if (!pendingDoorbellPkts.emplace(offset, pending_pkt).second) {
            delete pending_pkt;
        }

        return;
    }

    // Copy the doorbell info before dispatching so nothing below depends
    // on the map entry staying in place.
    const QueueType q_type = doorbell->second.qtype;
    const int ip_id = doorbell->second.ip_id;
    DPRINTF(AMDGPUDevice, "Doorbell offset %p queue: %d\n",
                          offset, q_type);

    switch (q_type) {
      case Compute: {
        assert(pm4PktProcs.count(ip_id));
        auto *pm4 = pm4PktProcs[ip_id];
        pm4->process(pm4->getQueue(offset), pkt->getLE<uint64_t>());
      } break;
      case Gfx: {
        assert(pm4PktProcs.count(ip_id));
        auto *pm4 = pm4PktProcs[ip_id];
        pm4->process(pm4->getQueue(offset, true), pkt->getLE<uint64_t>());
      } break;
      case SDMAGfx: {
        SDMAEngine *sdmaEng = getSDMAEngine(offset);
        sdmaEng->processGfx(pkt->getLE<uint64_t>());
      } break;
      case SDMAPage: {
        SDMAEngine *sdmaEng = getSDMAEngine(offset);
        sdmaEng->processPage(pkt->getLE<uint64_t>());
      } break;
      case ComputeAQL: {
        assert(pm4PktProcs.count(ip_id));
        // Both consumers are given the written value plus one.
        cp->hsaPacketProc().hwScheduler()->write(offset,
            pkt->getLE<uint64_t>() + 1);
        pm4PktProcs[ip_id]->updateReadIndex(offset,
            pkt->getLE<uint64_t>() + 1);
      } break;
      case InterruptHandler:
        deviceIH->updateRptr(pkt->getLE<uint32_t>());
        break;
      case RLC: {
        SDMAEngine *sdmaEng = getSDMAEngine(offset);
        sdmaEng->processRLC(offset, pkt->getLE<uint64_t>());
      } break;
      default:
        panic("Write to unkown queue type!");
    }
}


/**
 * Service a write to the MMIO BAR.
 *
 * The SDMA register ranges are checked first, then the PM4 packet
 * processor ranges, and finally the GRBM, IH, NBIO, GFX and SMU
 * apertures. The first match handles the write.
 *
 * @param pkt    Incoming write packet.
 * @param offset Offset of the access within the MMIO BAR.
 */
void
AMDGPUDevice::writeMMIO(PacketPtr pkt, Addr offset)
{
    AddrRange aperture = gpuvm.getMMIOAperture(offset);
    Addr aperture_offset = offset - aperture.start();

    DPRINTF(AMDGPUDevice, "Wrote MMIO %#lx\n", offset);

    // Check SDMA functions first, then fallback to MMIO ranges. The count
    // is converted once so the loop compares two signed values.
    const int num_sdmas = static_cast<int>(sdmaIds.size());
    for (int idx = 0; idx < num_sdmas; ++idx) {
        const AddrRange &sdma_range = sdmaMmios[idx];
        if (!sdma_range.contains(offset)) {
            continue;
        }

        // SDMA handlers are keyed by dword offset into the engine's range.
        Addr sdma_offset = (offset - sdma_range.start()) >> 2;
        auto handler = sdmaFunc.find(sdma_offset);
        if (handler != sdmaFunc.end()) {
            DPRINTF(AMDGPUDevice, "Calling SDMA%d MMIO function %lx\n",
                    idx, sdma_offset);
            sdmaFuncPtr mptr = handler->second;
            (getSDMAById(idx)->*mptr)(pkt->getLE<uint32_t>());
        } else {
            DPRINTF(AMDGPUDevice, "Unknown SDMA%d MMIO: %#lx\n", idx,
                    sdma_offset);
        }

        return;
    }

    // Check PM4s next, returning to avoid duplicate writes.
    for (auto& [range, pm4_proc] : pm4Ranges) {
        if (range.contains(offset)) {
            // PM4 MMIOs are offset based on the MMIO range start
            Addr ip_offset = offset - range.start();
            pm4_proc->writeMMIO(pkt, ip_offset >> GRBM_OFFSET_SHIFT);

            return;
        }
    }

    if (aperture == gpuvm.getMMIORange(GRBM_MMIO_RANGE)) {
        DPRINTF(AMDGPUDevice, "GRBM base\n");
        gpuvm.writeMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT);
    } else if (aperture == gpuvm.getMMIORange(IH_MMIO_RANGE)) {
        DPRINTF(AMDGPUDevice, "IH base\n");
        deviceIH->writeMMIO(pkt, aperture_offset >> IH_OFFSET_SHIFT);
    } else if (aperture == gpuvm.getMMIORange(NBIO_MMIO_RANGE)) {
        DPRINTF(AMDGPUDevice, "NBIO base\n");
        nbio.writeMMIO(pkt, aperture_offset);
    } else if (aperture == gpuvm.getMMIORange(GFX_MMIO_RANGE)) {
        DPRINTF(AMDGPUDevice, "GFX base\n");
        gfx.writeMMIO(pkt, aperture_offset);
    } else if (aperture == gpuvm.getMMIORange(SMU_MMIO_RANGE)) {
        DPRINTF(AMDGPUDevice, "SMU base\n");
        smu.writeMMIO(pkt, aperture_offset >> SMU_OFFSET_SHIFT);
    } else {
        DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for write %#x\n", offset);
    }
}


/**
 * Entry point for reads addressed to the device: the ROM range or one of
 * the framebuffer, doorbell and MMIO BARs.
 *
 * @param pkt Incoming read packet.
 * @return pioDelay.
 */
Tick
AMDGPUDevice::readDevice(PacketPtr pkt)
{
    // ROM reads are not BAR relative, so handle them up front.
    if (isROM(pkt->getAddr())) {
        readROM(pkt);

        dispatchAccess(pkt, true);

        return pioDelay;
    }

    int bar_index = -1;
    Addr bar_offset = 0;
    getBAR(pkt->getAddr(), bar_index, bar_offset);

    switch (bar_index) {
      case FRAMEBUFFER_BAR:
        readFrame(pkt, bar_offset);
        break;
      case DOORBELL_BAR:
        readDoorbell(pkt, bar_offset);
        break;
      case MMIO_BAR:
        readMMIO(pkt, bar_offset);
        break;
      default:
        panic("Request with address out of mapped range!");
    }

    dispatchAccess(pkt, true);

    return pioDelay;
}


/**
 * Entry point for writes addressed to the device: the ROM range or one of
 * the framebuffer, doorbell and MMIO BARs.
 *
 * @param pkt Incoming write packet.
 * @return pioDelay.
 */
Tick
AMDGPUDevice::writeDevice(PacketPtr pkt)
{
    if (!isROM(pkt->getAddr())) {
        int bar_index = -1;
        Addr bar_offset = 0;
        getBAR(pkt->getAddr(), bar_index, bar_offset);

        switch (bar_index) {
          case FRAMEBUFFER_BAR:
            writeFrame(pkt, bar_offset);
            break;
          case DOORBELL_BAR:
            writeDoorbell(pkt, bar_offset);
            break;
          case MMIO_BAR:
            writeMMIO(pkt, bar_offset);
            break;
          default:
            panic("Request with address out of mapped range!");
        }

        // The written value is read back here for the trace message only.
        [[maybe_unused]] uint64_t written =
            pkt->getUintX(ByteOrder::little);

        DPRINTF(AMDGPUDevice, "PCI Write to %#lx data %#lx\n",
                                pkt->getAddr(), written);
    } else {
        writeROM(pkt);
    }

    dispatchAccess(pkt, false);

    return pioDelay;
}


void


AMDGPUDevice::processPendingDoorbells(uint32_t offset)

{

    if (pendingDoorbellPkts.count(offset)) {

        DPRINTF(AMDGPUDevice, "Sending pending doorbell %x\n", offset);

        writeDoorbell(pendingDoorbellPkts[offset], offset);

        delete pendingDoorbellPkts[offset];

        pendingDoorbellPkts.erase(offset);

    }

}


uint32_t


AMDGPUDevice::getRegVal(uint64_t addr)

{

    // This is somewhat of a guess based on amdgpu_device_mm_access

    // in amdgpu_device.c in the ROCk driver. If bit 32 is 1 then

    // assume VRAM and use full address, otherwise assume register

    // address and only user lower 31 bits.

    Addr fixup_addr = bits(addr, 31, 31) ? addr : addr & 0x7fffffff;


    uint32_t pkt_data = 0;

    RequestPtr request = std::make_shared<Request>(fixup_addr,

            sizeof(uint32_t), 0 /* flags */, vramRequestorId());

    PacketPtr pkt = Packet::createRead(request);

    pkt->dataStatic((uint8_t *)&pkt_data);

    readMMIO(pkt, addr);

    DPRINTF(AMDGPUDevice, "Getting register 0x%lx = %x\n",

            fixup_addr, pkt->getLE<uint32_t>());


    pkt_data = pkt->getLE<uint32_t>();

    delete pkt;


    return pkt_data;

}


/**
 * Write a 32-bit value through the MMIO write path.
 *
 * A temporary write packet backed by a local copy of @p value is created
 * and handed to writeMMIO(); the packet is deleted afterwards.
 *
 * @param addr  Address used both for the request and as the MMIO offset.
 * @param value 32-bit value carried by the packet.
 */
void
AMDGPUDevice::setRegVal(uint64_t addr, uint32_t value)
{
    DPRINTF(AMDGPUDevice, "Setting register 0x%lx to %x\n",
            addr, value);

    // Backing storage for the packet; must outlive the packet.
    uint32_t payload = value;
    RequestPtr request = std::make_shared<Request>(
            addr, sizeof(uint32_t), 0 /* flags */, vramRequestorId());

    PacketPtr pkt = Packet::createWrite(request);
    pkt->dataStatic(reinterpret_cast<uint8_t *>(&payload));

    writeMMIO(pkt, addr);

    delete pkt;
}


/**
 * Record the queue type and IP id associated with a doorbell offset.
 *
 * The doorbells entry for @p offset is created if it does not exist yet
 * and both of its fields are overwritten.
 *
 * @param offset Doorbell offset used as the key into doorbells.
 * @param qt     Queue type stored in the entry.
 * @param ip_id  IP id stored in the entry.
 */
void
AMDGPUDevice::setDoorbellType(uint32_t offset, QueueType qt, int ip_id)
{
    DPRINTF(AMDGPUDevice, "Setting doorbell type for %x\n", offset);

    // One lookup (inserting a default entry when absent), two field writes.
    auto &entry = doorbells[offset];
    entry.qtype = qt;
    entry.ip_id = ip_id;
}


/**
 * Remove the doorbells entry for @p offset, if one exists.
 *
 * @param offset Doorbell offset used as the key into doorbells.
 */
void
AMDGPUDevice::unsetDoorbell(uint32_t offset)
{
    auto entry = doorbells.find(offset);
    if (entry != doorbells.end()) {
        doorbells.erase(entry);
    }
}


/**
 * Associate an SDMA engine with an offset in sdmaEngs.
 *
 * An existing association for the same offset is replaced.
 *
 * @param offset Key under which the engine is stored.
 * @param eng    Engine pointer to store; not owned by the map.
 */
void
AMDGPUDevice::setSDMAEngine(Addr offset, SDMAEngine *eng)
{
    sdmaEngs.insert_or_assign(offset, eng);
}


/**
 * Look up an SDMA engine by its id in sdmaIds.
 *
 * The id is asserted to be present in the map.
 *
 * @param id Engine id used as the key into sdmaIds.
 * @return The engine pointer stored for @p id.
 */
SDMAEngine*
AMDGPUDevice::getSDMAById(int id)
{
    assert(sdmaIds.count(id));

    SDMAEngine *engine = sdmaIds[id];
    return engine;
}


/**
 * Look up the SDMA engine registered for an offset in sdmaEngs.
 *
 * The lookup never modifies sdmaEngs. Indexing the map with operator[]
 * would insert a null engine pointer for an unknown offset, and both
 * serialize() and deallocateAllQueues() dereference every entry of
 * sdmaEngs, so such a stray entry would later be dereferenced.
 *
 * @param offset Key under which the engine was stored.
 * @return The engine stored for @p offset, or nullptr when none is.
 */
SDMAEngine*
AMDGPUDevice::getSDMAEngine(Addr offset)
{
    auto entry = sdmaEngs.find(offset);
    return entry != sdmaEngs.end() ? entry->second : nullptr;
}


void


AMDGPUDevice::intrPost()

{

    PciEndpoint::intrPost();

}


/**
 * Write the device state to a checkpoint.
 *
 * After the PciEndpoint base state, the following are written:
 *  - the doorbells map as three parallel arrays (offset, queue type,
 *    IP id);
 *  - the sdmaEngs map as two parallel arrays (offset, engine id);
 *  - the usedVMIDs map as an array of VMIDs, an array with the number of
 *    queue ids per VMID, and one flat array holding all queue ids in the
 *    same VMID order;
 *  - the "deviceMem" and "GPUVM" sections.
 *
 * The local variable names passed to the SERIALIZE_* macros must match
 * the ones used in unserialize().
 *
 * @param cp Checkpoint output stream.
 */
void
AMDGPUDevice::serialize(CheckpointOut &cp) const
{
    // Base class state first.
    PciEndpoint::serialize(cp);

    // Element counts of the maps being flattened.
    uint64_t doorbells_size = doorbells.size();
    uint64_t sdma_engs_size = sdmaEngs.size();
    uint64_t used_vmid_map_size = usedVMIDs.size();

    SERIALIZE_SCALAR(doorbells_size);
    SERIALIZE_SCALAR(sdma_engs_size);
    // Number of VMIDs in use.
    SERIALIZE_SCALAR(used_vmid_map_size);

    // Flat arrays that receive the map contents.
    auto doorbells_offset = std::make_unique<uint32_t[]>(doorbells_size);
    auto doorbells_queues = std::make_unique<QueueType[]>(doorbells_size);
    auto doorbells_ip_ids = std::make_unique<int[]>(doorbells_size);
    auto sdma_engs_offset = std::make_unique<uint32_t[]>(sdma_engs_size);
    auto sdma_engs = std::make_unique<int[]>(sdma_engs_size);
    auto used_vmids = std::make_unique<int[]>(used_vmid_map_size);
    auto used_queue_id_sizes = std::make_unique<int[]>(used_vmid_map_size);

    int slot = 0;
    for (const auto &[offset, info] : doorbells) {
        doorbells_offset[slot] = offset;
        doorbells_queues[slot] = info.qtype;
        doorbells_ip_ids[slot] = info.ip_id;
        ++slot;
    }

    slot = 0;
    for (const auto &[offset, engine] : sdmaEngs) {
        sdma_engs_offset[slot] = offset;
        sdma_engs[slot] = engine->getId();
        ++slot;
    }

    // Queue ids of all VMIDs, concatenated in map iteration order.
    std::vector<int> flat_queue_ids;
    slot = 0;
    for (const auto &[vmid, queue_ids] : usedVMIDs) {
        used_vmids[slot] = vmid;
        used_queue_id_sizes[slot] = queue_ids.size();
        flat_queue_ids.insert(flat_queue_ids.end(),
                queue_ids.begin(), queue_ids.end());
        ++slot;
    }

    int num_queue_id = flat_queue_ids.size();
    auto vmid_array = std::make_unique<int[]>(num_queue_id);
    std::copy(flat_queue_ids.begin(), flat_queue_ids.end(),
            vmid_array.get());

    SERIALIZE_UNIQUE_PTR_ARRAY(doorbells_offset, doorbells_size);
    SERIALIZE_UNIQUE_PTR_ARRAY(doorbells_queues, doorbells_size);
    SERIALIZE_UNIQUE_PTR_ARRAY(doorbells_ip_ids, doorbells_size);
    SERIALIZE_UNIQUE_PTR_ARRAY(sdma_engs_offset, sdma_engs_size);
    SERIALIZE_UNIQUE_PTR_ARRAY(sdma_engs, sdma_engs_size);
    // The VMIDs in use.
    SERIALIZE_UNIQUE_PTR_ARRAY(used_vmids, used_vmid_map_size);
    // Size of the queue id set mapped to each VMID.
    SERIALIZE_UNIQUE_PTR_ARRAY(used_queue_id_sizes, used_vmid_map_size);
    // All queue ids of all VMIDs.
    SERIALIZE_UNIQUE_PTR_ARRAY(vmid_array, num_queue_id);
    // Total number of queue ids.
    SERIALIZE_SCALAR(num_queue_id);

    // Device memory and GPUVM state go into their own sections.
    deviceMem.serializeSection(cp, "deviceMem");
    gpuvm.serializeSection(cp, "GPUVM");
}


/**
 * Restore the device state from a checkpoint written by serialize().
 *
 * After the PciEndpoint base state, the doorbells, sdmaEngs and usedVMIDs
 * maps are rebuilt from the flat arrays stored in the checkpoint, followed
 * by the "deviceMem" and "GPUVM" sections. SDMA engines are restored by
 * looking up the stored engine ids in sdmaIds, so sdmaIds must already be
 * populated when this runs.
 *
 * The local variable names passed to the UNSERIALIZE_* macros must match
 * the ones used in serialize().
 *
 * @param cp Checkpoint to read from.
 */
void
AMDGPUDevice::unserialize(CheckpointIn &cp)
{
    // Base class state first.
    PciEndpoint::unserialize(cp);

    uint64_t doorbells_size = 0;
    uint64_t sdma_engs_size = 0;
    uint64_t used_vmid_map_size = 0;

    UNSERIALIZE_SCALAR(doorbells_size);
    UNSERIALIZE_SCALAR(sdma_engs_size);
    UNSERIALIZE_SCALAR(used_vmid_map_size);

    if (doorbells_size > 0) {
        auto doorbells_offset = std::make_unique<uint32_t[]>(doorbells_size);
        auto doorbells_queues = std::make_unique<QueueType[]>(doorbells_size);
        auto doorbells_ip_ids = std::make_unique<int[]>(doorbells_size);

        UNSERIALIZE_UNIQUE_PTR_ARRAY(doorbells_offset, doorbells_size);
        UNSERIALIZE_UNIQUE_PTR_ARRAY(doorbells_queues, doorbells_size);
        UNSERIALIZE_UNIQUE_PTR_ARRAY(doorbells_ip_ids, doorbells_size);

        for (uint64_t idx = 0; idx < doorbells_size; ++idx) {
            auto &info = doorbells[doorbells_offset[idx]];
            info.qtype = doorbells_queues[idx];
            info.ip_id = doorbells_ip_ids[idx];
        }
    }

    if (sdma_engs_size > 0) {
        auto sdma_engs_offset = std::make_unique<uint32_t[]>(sdma_engs_size);
        auto sdma_engs = std::make_unique<int[]>(sdma_engs_size);

        UNSERIALIZE_UNIQUE_PTR_ARRAY(sdma_engs_offset, sdma_engs_size);
        UNSERIALIZE_UNIQUE_PTR_ARRAY(sdma_engs, sdma_engs_size);

        for (uint64_t idx = 0; idx < sdma_engs_size; ++idx) {
            // Engines are stored by id; map the id back to the object.
            int sdma_id = sdma_engs[idx];
            assert(sdmaIds.count(sdma_id));
            SDMAEngine *sdma = sdmaIds[sdma_id];
            sdmaEngs.insert(std::make_pair(sdma_engs_offset[idx], sdma));
        }
    }

    if (used_vmid_map_size > 0) {
        auto used_vmids = std::make_unique<int[]>(used_vmid_map_size);
        auto used_queue_id_sizes = std::make_unique<int[]>(used_vmid_map_size);
        int num_queue_id = 0;

        // Extract the total number of queue ids used
        UNSERIALIZE_SCALAR(num_queue_id);
        assert(num_queue_id >= 0);
        auto vmid_array = std::make_unique<int[]>(num_queue_id);

        // Extract the vmids used
        UNSERIALIZE_UNIQUE_PTR_ARRAY(used_vmids, used_vmid_map_size);
        // Extract the size of the queue id set for each vmid
        UNSERIALIZE_UNIQUE_PTR_ARRAY(used_queue_id_sizes, used_vmid_map_size);
        // Extract all the queue ids used
        UNSERIALIZE_UNIQUE_PTR_ARRAY(vmid_array, num_queue_id);

        // Populate the usedVMIDs map with the queue ids per vmid. idx is
        // the start of the current vmid's slice in the flat vmid_array.
        int idx = 0;
        for (uint64_t it = 0; it < used_vmid_map_size; it++) {
            int vmid = used_vmids[it];
            int vmid_set_size = used_queue_id_sizes[it];

            // The per-vmid sizes must stay within the flat array.
            assert(vmid_set_size >= 0);
            assert(idx + vmid_set_size <= num_queue_id);

            // Create the entry even when its queue id set is empty:
            // allocateVMID() marks a VMID as taken with an empty set, and
            // that VMID must remain taken after a restore.
            auto &queue_ids = usedVMIDs[vmid];
            for (int j = 0; j < vmid_set_size; j++) {
                queue_ids.insert(vmid_array[idx + j]);
            }
            idx += vmid_set_size;
        }
    }

    // Unserialize the device memory
    deviceMem.unserializeSection(cp, "deviceMem");
    gpuvm.unserializeSection(cp, "GPUVM");
}


/**
 * Reserve the lowest VMID that has no entry in usedVMIDs.
 *
 * VMIDs are searched from 1 up to (but excluding) AMDGPU_VM_COUNT. The
 * chosen VMID gets an empty queue id set in usedVMIDs, is recorded in
 * idMap under @p pasid (an existing idMap entry for the pasid is left
 * untouched), and is remembered in _lastVMID. Panics when every VMID in
 * the range already has an entry.
 *
 * @param pasid Key under which the VMID is recorded in idMap.
 * @return The VMID that was reserved.
 */
uint16_t
AMDGPUDevice::allocateVMID(uint16_t pasid)
{
    for (uint16_t vmid = 1; vmid < AMDGPU_VM_COUNT; vmid++) {
        // Already taken, try the next one.
        if (usedVMIDs.count(vmid) != 0) {
            continue;
        }

        idMap.emplace(pasid, vmid);
        // Mark the VMID as taken with an empty set of queue ids.
        usedVMIDs.try_emplace(vmid);
        _lastVMID = vmid;

        return vmid;
    }

    panic("All VMIDs have been assigned");
}


/**
 * Drop the usedVMIDs entry for @p vmid, if one exists.
 *
 * idMap is not touched by this call.
 *
 * @param vmid VMID whose entry is removed.
 */
void
AMDGPUDevice::deallocateVmid(uint16_t vmid)
{
    auto entry = usedVMIDs.find(vmid);
    if (entry != usedVMIDs.end()) {
        usedVMIDs.erase(entry);
    }
}


/**
 * Remove the idMap entry for @p pasid and the usedVMIDs entry of the VMID
 * it maps to.
 *
 * The pasid is asserted to be present in idMap; when asserts are compiled
 * out an unknown pasid is silently ignored.
 *
 * @param pasid Key into idMap.
 */
void
AMDGPUDevice::deallocatePasid(uint16_t pasid)
{
    auto result = idMap.find(pasid);
    assert(result != idMap.end());

    if (result != idMap.end()) {
        // Read the VMID before the iterator is invalidated by erase().
        const uint16_t vmid = result->second;

        idMap.erase(result);
        usedVMIDs.erase(vmid);
    }
}


void


AMDGPUDevice::deallocateAllQueues(bool unmap_static)

{

    idMap.erase(idMap.begin(), idMap.end());

    usedVMIDs.erase(usedVMIDs.begin(), usedVMIDs.end());


    for (auto& it : sdmaEngs) {

        it.second->deallocateRLCQueues(unmap_static);

    }


    // "All" queues implicitly refers to all user queues. User queues begin at

    // doorbell address 0x4000, so unmap any queue at or above that address.

    for (auto [offset, vmid] : doorbellVMIDMap) {

        if (offset >= 0x4000) {

            doorbells.erase(offset);

        }

    }

}


/**
 * Record which VMID a doorbell belongs to in doorbellVMIDMap.
 *
 * An existing mapping for the same doorbell is replaced.
 *
 * @param doorbell Doorbell address used as the key.
 * @param vmid     VMID stored for the doorbell.
 */
void
AMDGPUDevice::mapDoorbellToVMID(Addr doorbell, uint16_t vmid)
{
    doorbellVMIDMap.insert_or_assign(doorbell, vmid);
}


std::unordered_map<uint16_t, std::set<int>>&


AMDGPUDevice::getUsedVMIDs()

{

    return usedVMIDs;

}


/**
 * Add a queue id to the set kept for a VMID in usedVMIDs.
 *
 * The entry for @p vmid is created if it does not exist yet.
 *
 * @param vmid VMID whose queue id set is extended.
 * @param id   Queue id to add.
 */
void
AMDGPUDevice::insertQId(uint16_t vmid, int id)
{
    std::set<int> &queue_ids = usedVMIDs[vmid];
    queue_ids.insert(id);
}


} // namespace gem5

abstract_mem.hh
AbstractMemory declaration.

amdgpu_device.hh

amdgpu_nbio.hh

AMDGPU_MP0_SMN_C2PMSG_33
#define AMDGPU_MP0_SMN_C2PMSG_33
Definition amdgpu_nbio.hh:68

amdgpu_vm.hh

VEGA10_FB_LOCATION_BASE
#define VEGA10_FB_LOCATION_BASE
Definition amdgpu_vm.hh:93

VEGA10_FB_LOCATION_TOP
#define VEGA10_FB_LOCATION_TOP
Definition amdgpu_vm.hh:94

MI200_MEM_SIZE_REG
#define MI200_MEM_SIZE_REG
Definition amdgpu_vm.hh:100

MI200_FB_LOCATION_TOP
#define MI200_FB_LOCATION_TOP
Definition amdgpu_vm.hh:102

MI300X_FB_LOCATION_TOP
#define MI300X_FB_LOCATION_TOP
Definition amdgpu_vm.hh:106

MI100_FB_LOCATION_BASE
#define MI100_FB_LOCATION_BASE
Definition amdgpu_vm.hh:97

MI200_FB_LOCATION_BASE
#define MI200_FB_LOCATION_BASE
Definition amdgpu_vm.hh:101

MI300X_MEM_SIZE_REG
#define MI300X_MEM_SIZE_REG
Definition amdgpu_vm.hh:104

MI100_FB_LOCATION_TOP
#define MI100_FB_LOCATION_TOP
Definition amdgpu_vm.hh:98

MI300X_FB_LOCATION_BASE
#define MI300X_FB_LOCATION_BASE
Definition amdgpu_vm.hh:105

MI100_MEM_SIZE_REG
#define MI100_MEM_SIZE_REG
Definition amdgpu_vm.hh:96

DPRINTF
#define DPRINTF(x,...)
Definition trace.hh:209

byteswap.hh

data
const char data[]
Definition circlebuf.test.cc:48

gem5::AMDGPUDevice::_lastVMID
uint16_t _lastVMID
Definition amdgpu_device.hh:154

gem5::AMDGPUDevice::insertQId
void insertQId(uint16_t vmid, int id)
Definition amdgpu_device.cc:1095

gem5::AMDGPUDevice::system
System * system
Definition amdgpu_device.hh:164

gem5::AMDGPUDevice::pm4Ranges
std::unordered_map< AddrRange, PM4PacketProcessor *, AddrRangeHasher > pm4Ranges
Definition amdgpu_device.hh:128

gem5::AMDGPUDevice::deallocateAllQueues
void deallocateAllQueues(bool unmap_static)
Definition amdgpu_device.cc:1064

gem5::AMDGPUDevice::doorbellVMIDMap
std::unordered_map< Addr, uint16_t > doorbellVMIDMap
Definition amdgpu_device.hh:150

gem5::AMDGPUDevice::idMap
std::unordered_map< uint16_t, uint16_t > idMap
Definition amdgpu_device.hh:148

gem5::AMDGPUDevice::readMMIO
void readMMIO(PacketPtr pkt, Addr offset)
Definition amdgpu_device.cc:514

gem5::AMDGPUDevice::serialize
void serialize(CheckpointOut &cp) const override
Checkpoint support.
Definition amdgpu_device.cc:882

gem5::AMDGPUDevice::processPendingDoorbells
void processPendingDoorbells(uint32_t offset)
Definition amdgpu_device.cc:788

gem5::AMDGPUDevice::getAddrRanges
AddrRangeList getAddrRanges() const override
Every PIO device is obliged to provide an implementation that returns the address ranges the device r...
Definition amdgpu_device.cc:324

gem5::AMDGPUDevice::unserialize
void unserialize(CheckpointIn &cp) override
Unserialize an object.
Definition amdgpu_device.cc:955

gem5::AMDGPUDevice::writeMMIO
void writeMMIO(PacketPtr pkt, Addr offset)
Definition amdgpu_device.cc:661

gem5::AMDGPUDevice::cp
GPUCommandProcessor * cp
Definition amdgpu_device.hh:117

gem5::AMDGPUDevice::setDoorbellType
void setDoorbellType(uint32_t offset, QueueType qt, int ip_id=0)
Set handles to GPU blocks.
Definition amdgpu_device.cc:838

gem5::AMDGPUDevice::writeDevice
Tick writeDevice(PacketPtr pkt) override
Write to the PCI device.
Definition amdgpu_device.cc:747

gem5::AMDGPUDevice::smu
AMDGPUSmu smu
Definition amdgpu_device.hh:116

gem5::AMDGPUDevice::gpuvm
AMDGPUVM gpuvm
Definition amdgpu_device.hh:115

gem5::AMDGPUDevice::gfx
AMDGPUGfx gfx
Definition amdgpu_device.hh:112

gem5::AMDGPUDevice::readROM
void readROM(PacketPtr pkt)
Definition amdgpu_device.cc:279

gem5::AMDGPUDevice::romRange
AddrRange romRange
VGA ROM methods.
Definition amdgpu_device.hh:98

gem5::AMDGPUDevice::doorbells
std::unordered_map< uint32_t, DoorbellInfo > doorbells
Structures to hold registers, doorbells, and some frame memory.
Definition amdgpu_device.hh:92

gem5::AMDGPUDevice::getUsedVMIDs
std::unordered_map< uint16_t, std::set< int > > & getUsedVMIDs()
Definition amdgpu_device.cc:1089

gem5::AMDGPUDevice::isROM
bool isROM(Addr addr) const
Definition amdgpu_device.hh:99

gem5::AMDGPUDevice::unsetDoorbell
void unsetDoorbell(uint32_t offset)
Definition amdgpu_device.cc:846

gem5::AMDGPUDevice::pendingDoorbellPkts
std::unordered_map< uint32_t, PacketPtr > pendingDoorbellPkts
Definition amdgpu_device.hh:93

gem5::AMDGPUDevice::setRegVal
void setRegVal(uint64_t addr, uint32_t value)
Definition amdgpu_device.cc:823

gem5::AMDGPUDevice::sdmaMmios
std::unordered_map< uint32_t, AddrRange > sdmaMmios
Definition amdgpu_device.hh:135

gem5::AMDGPUDevice::readDevice
Tick readDevice(PacketPtr pkt) override
Read from the PCI device.
Definition amdgpu_device.cc:718

gem5::AMDGPUDevice::sdmaFuncPtr
void(SDMAEngine::* sdmaFuncPtr)(uint32_t)
Definition amdgpu_device.hh:137

gem5::AMDGPUDevice::getSDMAEngine
SDMAEngine * getSDMAEngine(Addr offset)
Definition amdgpu_device.cc:870

gem5::AMDGPUDevice::gpuMemMgr
AMDGPUMemoryManager * gpuMemMgr
Definition amdgpu_device.hh:113

gem5::AMDGPUDevice::AMDGPUDevice
AMDGPUDevice(const AMDGPUDeviceParams &p)
Definition amdgpu_device.cc:55

gem5::AMDGPUDevice::readDoorbell
void readDoorbell(PacketPtr pkt, Addr offset)
Definition amdgpu_device.cc:507

gem5::AMDGPUDevice::nbio
AMDGPUNbio nbio
Blocks of the GPU.
Definition amdgpu_device.hh:111

gem5::AMDGPUDevice::readConfig
Tick readConfig(PacketPtr pkt) override
Read from the PCI config space data that is stored locally.
Definition amdgpu_device.cc:343

gem5::AMDGPUDevice::sdmaFunc
std::unordered_map< uint32_t, sdmaFuncPtr > sdmaFunc
Definition amdgpu_device.hh:138

gem5::AMDGPUDevice::usedVMIDs
std::unordered_map< uint16_t, std::set< int > > usedVMIDs
Definition amdgpu_device.hh:152

gem5::AMDGPUDevice::deviceIH
AMDGPUInterruptHandler * deviceIH
Definition amdgpu_device.hh:114

gem5::AMDGPUDevice::writeConfig
Tick writeConfig(PacketPtr pkt) override
Methods inherited from PciEndpoint.
Definition amdgpu_device.cc:410

gem5::AMDGPUDevice::mmioReader
AMDMMIOReader mmioReader
MMIO reader to populate device registers map.
Definition amdgpu_device.hh:106

gem5::AMDGPUDevice::checkpoint_before_mmios
bool checkpoint_before_mmios
Initial checkpoint support variables.
Definition amdgpu_device.hh:143

gem5::AMDGPUDevice::dispatchAccess
void dispatchAccess(PacketPtr pkt, bool read)
Convert a PCI packet into a response.
Definition amdgpu_device.cc:452

gem5::AMDGPUDevice::getRegVal
uint32_t getRegVal(uint64_t addr)
Register value getter/setter.
Definition amdgpu_device.cc:799

gem5::AMDGPUDevice::deallocateVmid
void deallocateVmid(uint16_t vmid)
Definition amdgpu_device.cc:1046

gem5::AMDGPUDevice::mapDoorbellToVMID
void mapDoorbellToVMID(Addr doorbell, uint16_t vmid)
Definition amdgpu_device.cc:1083

gem5::AMDGPUDevice::intrPost
void intrPost()
Methods inherited from PciEndpoint.
Definition amdgpu_device.cc:876

gem5::AMDGPUDevice::readFrame
void readFrame(PacketPtr pkt, Addr offset)
Helper methods to handle specific BAR read/writes.
Definition amdgpu_device.cc:462

gem5::AMDGPUDevice::writeROM
void writeROM(PacketPtr pkt)
Definition amdgpu_device.cc:302

gem5::AMDGPUDevice::vramSize
Addr vramSize
Definition amdgpu_device.hh:169

gem5::AMDGPUDevice::writeDoorbell
void writeDoorbell(PacketPtr pkt, Addr offset)
Definition amdgpu_device.cc:595

gem5::AMDGPUDevice::vramRequestorId
RequestorID vramRequestorId()
Methods related to translations and system/device memory.
Definition amdgpu_device.hh:225

gem5::AMDGPUDevice::sdmaIds
std::unordered_map< uint32_t, SDMAEngine * > sdmaIds
Definition amdgpu_device.hh:133

gem5::AMDGPUDevice::allocateVMID
uint16_t allocateVMID(uint16_t pasid)
Definition amdgpu_device.cc:1031

gem5::AMDGPUDevice::pm4PktProcs
std::unordered_map< int, PM4PacketProcessor * > pm4PktProcs
Definition amdgpu_device.hh:126

gem5::AMDGPUDevice::deallocatePasid
void deallocatePasid(uint16_t pasid)
Definition amdgpu_device.cc:1052

gem5::AMDGPUDevice::gpuId
const int gpuId
Definition amdgpu_device.hh:168

gem5::AMDGPUDevice::init_interrupt_count
int init_interrupt_count
Definition amdgpu_device.hh:144

gem5::AMDGPUDevice::getSDMAById
SDMAEngine * getSDMAById(int id)
Definition amdgpu_device.cc:858

gem5::AMDGPUDevice::writeFrame
void writeFrame(PacketPtr pkt, Addr offset)
Definition amdgpu_device.cc:545

gem5::AMDGPUDevice::setSDMAEngine
void setSDMAEngine(Addr offset, SDMAEngine *eng)
Definition amdgpu_device.cc:852

gem5::AMDGPUDevice::deviceMem
memory::PhysicalMemory deviceMem
Definition amdgpu_device.hh:159

gem5::AMDGPUDevice::sdmaEngs
std::unordered_map< uint32_t, SDMAEngine * > sdmaEngs
Definition amdgpu_device.hh:131

gem5::AMDGPUDevice::CP
GPUCommandProcessor * CP()
Definition amdgpu_device.hh:205

gem5::AddrRange
The AddrRange class encapsulates an address range, and supports a number of tests to check if two ran...
Definition addr_range.hh:82

gem5::CheckpointIn
Definition serialize.hh:69

gem5::MemCmd::FunctionalReadError
@ FunctionalReadError
Definition packet.hh:139

gem5::MemCmd::WriteReq
@ WriteReq
Definition packet.hh:90

gem5::MemCmd::ReadReq
@ ReadReq
Definition packet.hh:87

gem5::Packet
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
Definition packet.hh:295

gem5::Packet::getAddr
Addr getAddr() const
Definition packet.hh:807

gem5::Packet::setUintX
void setUintX(uint64_t w, ByteOrder endian)
Set the value in the word w after truncating it to the length of the packet and then byteswapping it ...
Definition packet.cc:361

gem5::Packet::setLE
void setLE(T v)
Set the value in the data pointer to v as little endian.
Definition packet_access.hh:108

gem5::Packet::createWrite
static PacketPtr createWrite(const RequestPtr &req)
Definition packet.hh:1044

gem5::Packet::dataStatic
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
Definition packet.hh:1175

gem5::Packet::getPtr
T * getPtr()
get a pointer to the data ptr.
Definition packet.hh:1225

gem5::Packet::createRead
static PacketPtr createRead(const RequestPtr &req)
Constructor-like methods that return Packets based on Request objects.
Definition packet.hh:1038

gem5::Packet::req
RequestPtr req
A pointer to the original request.
Definition packet.hh:377

gem5::Packet::getSize
unsigned getSize() const
Definition packet.hh:817

gem5::Packet::getUintX
uint64_t getUintX(ByteOrder endian) const
Get the data in the packet byte swapped from the specified endianness and zero-extended to 64 bits.
Definition packet.cc:352

gem5::Packet::getConstPtr
const T * getConstPtr() const
Definition packet.hh:1234

gem5::Packet::dataDynamic
void dataDynamic(T *p)
Set the data pointer to a value that should have delete [] called on it.
Definition packet.hh:1213

gem5::Packet::makeAtomicResponse
void makeAtomicResponse()
Definition packet.hh:1074

gem5::Packet::cmd
MemCmd cmd
The command field of the packet.
Definition packet.hh:372

gem5::Packet::getLE
T getLE() const
Get the data in the packet byte swapped from little endian to host endian.
Definition packet_access.hh:78

gem5::Packet::setSuppressFuncError
void setSuppressFuncError()
Definition packet.hh:757

gem5::PciDevice::pxcap
PXCAP pxcap
Definition device.hh:339

gem5::PciDevice::serialize
void serialize(CheckpointOut &cp) const override
Serialize this object to the given output stream.
Definition device.cc:412

gem5::PciDevice::read
Tick read(PacketPtr pkt) final
Final implementation of read access from PioDevice.
Definition device.cc:264

gem5::PciDevice::getBAR
bool getBAR(Addr addr, int &num, Addr &offs)
Which base address register (if any) maps the given address?
Definition device.hh:358

gem5::PciDevice::pioDelay
Tick pioDelay
Definition device.hh:424

gem5::PciDevice::readConfig
virtual Tick readConfig(PacketPtr pkt)
Read from the PCI config space data that is stored locally.
Definition device.cc:207

gem5::PciDevice::intrPost
void intrPost()
Definition device.hh:435

gem5::PciDevice::PXCAP_BASE
const int PXCAP_BASE
Definition device.hh:338

gem5::PciDevice::configDelay
Tick configDelay
Definition device.hh:425

gem5::PciDevice::_devAddr
const PciDevAddr _devAddr
Definition device.hh:313

gem5::PciEndpoint::config
PCIConfigType0 & config()
Definition device.hh:505

gem5::PciEndpoint::unserialize
void unserialize(CheckpointIn &cp) override
Reconstruct the state of this object from a checkpoint.
Definition device.cc:703

gem5::PciEndpoint::PciEndpoint
PciEndpoint(const PciEndpointParams &params)
Constructor for PCI Dev.
Definition device.cc:603

gem5::PciEndpoint::writeConfig
Tick writeConfig(PacketPtr pkt) override
Write to the PCI config space data that is stored locally.
Definition device.cc:625

gem5::PioDevice::getAddrRanges
virtual AddrRangeList getAddrRanges() const =0
Every PIO device is obliged to provide an implementation that returns the address ranges the device r...

gem5::SDMAEngine
System DMA Engine class for AMD dGPU.
Definition sdma_engine.hh:49

gem5::SDMAEngine::setGfxRptrLo
void setGfxRptrLo(uint32_t data)
Definition sdma_engine.cc:1574

gem5::SDMAEngine::setGfxWptrLo
void setGfxWptrLo(uint32_t data)
Definition sdma_engine.cc:1630

gem5::SDMAEngine::setGfxRptrHi
void setGfxRptrHi(uint32_t data)
Definition sdma_engine.cc:1582

gem5::SDMAEngine::processRLC
void processRLC(Addr doorbellOffset, Addr wptrOffset)
Definition sdma_engine.cc:316

gem5::SDMAEngine::setGfxSize
void setGfxSize(uint32_t data)
Definition sdma_engine.cc:1622

gem5::SDMAEngine::setGfxBaseLo
void setGfxBaseLo(uint32_t data)
Definition sdma_engine.cc:1558

gem5::SDMAEngine::processGfx
void processGfx(Addr wptrOffset)
Given a new write ptr offset, communicated to the GPU through a doorbell write, the SDMA engine proce...
Definition sdma_engine.cc:294

gem5::SDMAEngine::setGfxWptrHi
void setGfxWptrHi(uint32_t data)
Definition sdma_engine.cc:1637

gem5::SDMAEngine::setGfxDoorbellOffsetLo
void setGfxDoorbellOffsetLo(uint32_t data)
Definition sdma_engine.cc:1604

gem5::SDMAEngine::processPage
void processPage(Addr wptrOffset)
Definition sdma_engine.cc:305

gem5::SDMAEngine::setGfxDoorbellLo
void setGfxDoorbellLo(uint32_t data)
Definition sdma_engine.cc:1590

gem5::SDMAEngine::setGfxBaseHi
void setGfxBaseHi(uint32_t data)
Definition sdma_engine.cc:1566

std::vector
STL vector class.
Definition stl.hh:37

gpu_command_processor.hh
The GPUCommandProcessor (CP) is responsible for accepting commands, in the form of HSA AQL packets,...

gem5::RangeSize
AddrRange RangeSize(Addr start, Addr size)
Definition addr_range.hh:858

gem5::AddrRangeList
std::list< AddrRange > AddrRangeList
Convenience typedef for a collection of address ranges.
Definition addr_range.hh:64

gem5::AddrRange::start
Addr start() const
Get the start address of the range.
Definition addr_range.hh:343

gem5::bits
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
Definition bitfield.hh:79

panic
#define panic(...)
This implements a cprintf based panic() function.
Definition logging.hh:220

fatal_if
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
Definition logging.hh:268

UNSERIALIZE_UNIQUE_PTR_ARRAY
#define UNSERIALIZE_UNIQUE_PTR_ARRAY(member, size)
Definition serialize.hh:634

SERIALIZE_UNIQUE_PTR_ARRAY
#define SERIALIZE_UNIQUE_PTR_ARRAY(member, size)
Definition serialize.hh:626

hw_scheduler.hh

interrupt_handler.hh

warn
#define warn(...)
Definition logging.hh:288

gem5::ArmISA::s
Bitfield< 4 > s
Definition misc_types.hh:675

gem5::ArmISA::offset
Bitfield< 23, 0 > offset
Definition types.hh:144

gem5::ArmISA::id
Bitfield< 33 > id
Definition misc_types.hh:334

gem5::ArmISA::m
Bitfield< 0 > m
Definition misc_types.hh:482

gem5::MipsISA::r
r
Definition pra_constants.hh:98

gem5::MipsISA::p
Bitfield< 0 > p
Definition pra_constants.hh:326

gem5::VegaISA::p
Bitfield< 54 > p
Definition pagetable.hh:70

gem5::X86ISA::addr
Bitfield< 3 > addr
Definition types.hh:84

gem5
Copyright (c) 2024 Arm Limited All rights reserved.
Definition binary32.hh:36

gem5::letoh
T letoh(T value)
Definition byteswap.hh:173

gem5::RequestPtr
std::shared_ptr< Request > RequestPtr
Definition request.hh:94

gem5::curTick
Tick curTick()
The universal simulation clock.
Definition cur_tick.hh:46

gem5::MMIO_BAR
constexpr int MMIO_BAR
Definition amdgpu_defines.hh:68

gem5::CheckpointOut
std::ostream CheckpointOut
Definition serialize.hh:66

gem5::QueueType
QueueType
Definition amdgpu_defines.hh:42

gem5::SDMAGfx
@ SDMAGfx
Definition amdgpu_defines.hh:45

gem5::Compute
@ Compute
Definition amdgpu_defines.hh:43

gem5::RLC
@ RLC
Definition amdgpu_defines.hh:49

gem5::InterruptHandler
@ InterruptHandler
Definition amdgpu_defines.hh:48

gem5::Gfx
@ Gfx
Definition amdgpu_defines.hh:44

gem5::ComputeAQL
@ ComputeAQL
Definition amdgpu_defines.hh:47

gem5::SDMAPage
@ SDMAPage
Definition amdgpu_defines.hh:46

gem5::Addr
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition types.hh:147

gem5::exitSimLoop
void exitSimLoop(const std::string &message, int exit_code, Tick when, Tick repeat, bool serialize)
The "old style" exitSimLoop functions.
Definition sim_events.cc:111

gem5::pasid
Bitfield< 10 > pasid
Definition x86_cpu.cc:129

gem5::Tick
uint64_t Tick
Tick count type.
Definition types.hh:58

gem5::ROM_SIZE
constexpr uint32_t ROM_SIZE
Definition amdgpu_defines.hh:72

gem5::IH_OFFSET_SHIFT
static constexpr uint32_t IH_OFFSET_SHIFT
Definition amdgpu_defines.hh:75

gem5::PacketPtr
Packet * PacketPtr
Definition thread_context.hh:70

gem5::MMHUB_OFFSET_SHIFT
static constexpr uint32_t MMHUB_OFFSET_SHIFT
Definition amdgpu_defines.hh:77

gem5::AMDGPU_VM_COUNT
static constexpr int AMDGPU_VM_COUNT
Definition amdgpu_defines.hh:63

gem5::FRAMEBUFFER_BAR
constexpr int FRAMEBUFFER_BAR
Definition amdgpu_defines.hh:66

gem5::GRBM_MMIO_RANGE
@ GRBM_MMIO_RANGE
Definition amdgpu_vm.hh:128

gem5::GFX_MMIO_RANGE
@ GFX_MMIO_RANGE
Definition amdgpu_vm.hh:127

gem5::IH_MMIO_RANGE
@ IH_MMIO_RANGE
Definition amdgpu_vm.hh:129

gem5::MMHUB_MMIO_RANGE
@ MMHUB_MMIO_RANGE
Definition amdgpu_vm.hh:126

gem5::SMU_MMIO_RANGE
@ SMU_MMIO_RANGE
Definition amdgpu_vm.hh:130

gem5::NBIO_MMIO_RANGE
@ NBIO_MMIO_RANGE
Definition amdgpu_vm.hh:125

gem5::SMU_OFFSET_SHIFT
static constexpr uint32_t SMU_OFFSET_SHIFT
Definition amdgpu_defines.hh:78

gem5::DOORBELL_BAR
constexpr int DOORBELL_BAR
Definition amdgpu_defines.hh:67

gem5::VGA_ROM_DEFAULT
constexpr uint32_t VGA_ROM_DEFAULT
Definition amdgpu_defines.hh:71

gem5::GRBM_OFFSET_SHIFT
static constexpr uint32_t GRBM_OFFSET_SHIFT
Definition amdgpu_defines.hh:76

packet.hh
Declaration of the Packet class.

packet_access.hh

PCI0_ROM_BASE_ADDR
#define PCI0_ROM_BASE_ADDR
Definition pcireg.h:207

PCI_INTERRUPT_PIN
#define PCI_INTERRUPT_PIN
Definition pcireg.h:190

PCI_DEVICE_SPECIFIC
#define PCI_DEVICE_SPECIFIC
Definition pcireg.h:54

PCI_CONFIG_SIZE
#define PCI_CONFIG_SIZE
Definition pcireg.h:55

pm4_packet_processor.hh

sdma_engine.hh

UNSERIALIZE_SCALAR
#define UNSERIALIZE_SCALAR(scalar)
Definition serialize.hh:575

SERIALIZE_SCALAR
#define SERIALIZE_SCALAR(scalar)
Definition serialize.hh:568

shader.hh

sim_exit.hh

name
const std::string & name()
Definition trace.cc:48

PXCAP
Defines the PCI Express capability register and its associated bitfields for a PCIe device.
Definition pcireg.h:410