36#include "debug/AMDGPUDevice.hh"
48#include "params/AMDGPUDevice.hh"
56 :
PciDevice(
p), gpuMemMgr(
p.memory_manager), deviceIH(
p.device_ih),
57 cp(
p.cp), checkpoint_before_mmios(
p.checkpoint_before_mmios),
58 init_interrupt_count(0), _lastVMID(0),
59 deviceMem(
name() +
".deviceMem",
p.memories, false,
"", false)
64 for (
auto&
m :
p.memories) {
77 if (
p.device_name ==
"Vega10") {
79 }
else if (
p.device_name ==
"MI100") {
81 }
else if (
p.device_name ==
"MI200") {
83 }
else if (
p.device_name ==
"MI300X") {
86 panic(
"Unknown GPU device %s\n",
p.device_name);
90 for (
auto&
s :
p.sdmas) {
91 s->setGPUDevice(
this);
111 if (
p.device_name ==
"Vega10") {
119 }
else if (
p.device_name ==
"MI100" ||
p.device_name ==
"MI200"
120 ||
p.device_name ==
"MI300X") {
128 panic(
"Unknown GPU device %s\n",
p.device_name);
132 std::set<int> pm4_ids;
133 for (
auto& pm4 :
p.pm4_pkt_procs) {
134 pm4->setGPUDevice(
this);
135 fatal_if(pm4_ids.count(pm4->getIpId()),
136 "Two PM4s with same IP IDs is not allowed");
137 pm4_ids.insert(pm4->getIpId());
140 pm4Ranges.insert({pm4->getMMIORange(), pm4});
153 uint64_t mmhubBase = 0x8000ULL << 24;
154 uint64_t mmhubTop = 0x83ffULL << 24;
155 uint64_t mem_size = 0x3ff0;
178 if (
p.device_name ==
"Vega10") {
181 }
else if (
p.device_name ==
"MI100") {
185 }
else if (
p.device_name ==
"MI200") {
191 }
else if (
p.device_name ==
"MI300X") {
196 panic(
"Unknown GPU device %s\n",
p.device_name);
204 uint64_t rom_data = 0;
206 memcpy(&rom_data,
rom.data() + rom_offset, pkt->
getSize());
207 pkt->
setUintX(rom_data, ByteOrder::little);
210 pkt->
getAddr(), rom_offset, rom_data);
219 uint64_t rom_data = pkt->
getUintX(ByteOrder::little);
221 memcpy(
rom.data() + rom_offset, &rom_data, pkt->
getSize());
224 pkt->
getAddr(), rom_offset, rom_data);
237 for (
auto &
r : ranges) {
238 if (
r.start() != 0) {
239 ret_ranges.push_back(
r);
258 case sizeof(uint8_t):
261 "Read PXCAP: dev %#x func %#x reg %#x 1 bytes: data "
263 (uint32_t)pkt->
getLE<uint8_t>());
265 case sizeof(uint16_t):
266 pkt->
setLE<uint16_t>(
269 "Read PXCAP: dev %#x func %#x reg %#x 2 bytes: data "
271 (uint32_t)pkt->
getLE<uint16_t>());
273 case sizeof(uint32_t):
274 pkt->
setLE<uint32_t>(
277 "Read PXCAP: dev %#x func %#x reg %#x 4 bytes: data "
279 (uint32_t)pkt->
getLE<uint32_t>());
282 panic(
"Invalid access size (%d) for amdgpu PXCAP %#x\n",
287 warn(
"Device specific offset %d not implemented!\n",
offset);
329 memcpy(pxcap_data + pxcap_offset, pkt->
getConstPtr<
void>(),
371 uint8_t *dataPtr =
new uint8_t[pkt->
getSize()];
373 readPkt->
req->setGPUFuncAccess(
true);
382 uint8_t *dataPtr =
new uint8_t[pkt->
getSize()];
435 cu->sendInvL2(aligned_addr);
458 uint8_t *dataPtr =
new uint8_t[pkt->
getSize()];
459 std::memcpy(dataPtr, pkt->
getPtr<uint8_t>(),
460 pkt->
getSize() *
sizeof(uint8_t));
484 pkt->
getLE<uint64_t>());
490 pkt->
getLE<uint64_t>());
503 pkt->
getLE<uint64_t>() + 1);
505 pkt->
getLE<uint64_t>() + 1);
515 panic(
"Write to unkown queue type!");
518 warn(
"Unknown doorbell offset: %lx. Saving to pending doorbells.\n",
527 uint8_t *pending_data =
new uint8_t[pkt->
getSize()];
528 memcpy(pending_data, pkt->
getPtr<uint8_t>(), pkt->
getSize());
544 for (
int idx = 0; idx <
sdmaIds.size(); ++idx) {
562 for (
auto& [range, pm4_proc] :
pm4Ranges) {
563 if (range.contains(
offset)) {
610 panic(
"Request with address out of mapped range!");
644 panic(
"Request with address out of mapped range!");
679 uint32_t pkt_data = 0;
680 RequestPtr request = std::make_shared<Request>(fixup_addr,
686 fixup_addr, pkt->
getLE<uint32_t>());
688 pkt_data = pkt->
getLE<uint32_t>();
700 uint32_t pkt_data = value;
759 uint64_t doorbells_size =
doorbells.size();
760 uint64_t sdma_engs_size =
sdmaEngs.size();
761 uint64_t used_vmid_map_size =
usedVMIDs.size();
769 uint32_t doorbells_offset[doorbells_size];
770 QueueType doorbells_queues[doorbells_size];
771 int doorbells_ip_ids[doorbells_size];
772 uint32_t sdma_engs_offset[sdma_engs_size];
773 int sdma_engs[sdma_engs_size];
774 int used_vmids[used_vmid_map_size];
775 int used_queue_id_sizes[used_vmid_map_size];
780 doorbells_offset[idx] = it.first;
781 doorbells_queues[idx] = it.second.qtype;
782 doorbells_ip_ids[idx] = it.second.ip_id;
788 sdma_engs_offset[idx] = it.first;
789 sdma_engs[idx] = it.second->getId();
795 used_vmids[idx] = it.first;
796 used_queue_id_sizes[idx] = it.second.size();
798 used_vmid_sets.insert(used_vmid_sets.end(),
799 set_vector.begin(), set_vector.end());
803 int num_queue_id = used_vmid_sets.size();
804 int* vmid_array =
new int[num_queue_id];
805 std::copy(used_vmid_sets.begin(), used_vmid_sets.end(), vmid_array);
808 sizeof(doorbells_offset[0]));
810 sizeof(doorbells_queues[0]));
812 sizeof(doorbells_ip_ids[0]));
814 sizeof(sdma_engs_offset[0]));
820 sizeof(used_queue_id_sizes)/
sizeof(used_queue_id_sizes[0]));
839 uint64_t doorbells_size = 0;
840 uint64_t sdma_engs_size = 0;
841 uint64_t used_vmid_map_size = 0;
848 if (doorbells_size > 0) {
849 uint32_t doorbells_offset[doorbells_size];
850 QueueType doorbells_queues[doorbells_size];
851 int doorbells_ip_ids[doorbells_size];
854 sizeof(doorbells_offset[0]));
856 sizeof(doorbells_queues[0]));
858 sizeof(doorbells_ip_ids[0]));
860 for (
int idx = 0; idx < doorbells_size; ++idx) {
861 doorbells[doorbells_offset[idx]].qtype = doorbells_queues[idx];
862 doorbells[doorbells_offset[idx]].ip_id = doorbells_ip_ids[idx];
866 if (sdma_engs_size > 0) {
867 uint32_t sdma_engs_offset[sdma_engs_size];
868 int sdma_engs[sdma_engs_size];
871 sizeof(sdma_engs_offset[0]));
874 for (
int idx = 0; idx < sdma_engs_size; ++idx) {
875 int sdma_id = sdma_engs[idx];
876 assert(
sdmaIds.count(sdma_id));
878 sdmaEngs.insert(std::make_pair(sdma_engs_offset[idx], sdma));
882 if (used_vmid_map_size > 0) {
883 int used_vmids[used_vmid_map_size];
884 int used_queue_id_sizes[used_vmid_map_size];
885 int num_queue_id = 0;
889 int* vmid_array =
new int[num_queue_id];
898 for (
int it = 0; it < used_vmid_map_size; it++) {
899 int vmid = used_vmids[it];
900 int vmid_set_size = used_queue_id_sizes[it];
901 for (
int j = 0; j < vmid_set_size; j++) {
902 usedVMIDs[vmid].insert(vmid_array[idx + j]);
904 idx += vmid_set_size;
926 panic(
"All VMIDs have been assigned");
939 assert(result !=
idMap.end());
940 if (result ==
idMap.end())
return;
941 uint16_t vmid = result->second;
954 it.second->deallocateRLCQueues(unmap_static);
972std::unordered_map<uint16_t, std::set<int>>&
AbstractMemory declaration.
#define AMDGPU_MP0_SMN_C2PMSG_33
#define VEGA10_FB_LOCATION_BASE
#define VEGA10_FB_LOCATION_TOP
#define MI200_MEM_SIZE_REG
#define MI200_FB_LOCATION_TOP
#define MI100_FB_LOCATION_BASE
#define MI200_FB_LOCATION_BASE
#define MI100_FB_LOCATION_TOP
#define MI100_MEM_SIZE_REG
Device model for an AMD GPU.
void insertQId(uint16_t vmid, int id)
std::unordered_map< AddrRange, PM4PacketProcessor *, AddrRangeHasher > pm4Ranges
void deallocateAllQueues(bool unmap_static)
std::unordered_map< Addr, uint16_t > doorbellVMIDMap
std::unordered_map< uint16_t, uint16_t > idMap
void readMMIO(PacketPtr pkt, Addr offset)
void serialize(CheckpointOut &cp) const override
Checkpoint support.
void processPendingDoorbells(uint32_t offset)
AddrRangeList getAddrRanges() const override
Every PIO device is obliged to provide an implementation that returns the address ranges the device r...
void unserialize(CheckpointIn &cp) override
Unserialize an object.
void writeMMIO(PacketPtr pkt, Addr offset)
void setDoorbellType(uint32_t offset, QueueType qt, int ip_id=0)
Set handles to GPU blocks.
Tick write(PacketPtr pkt) override
Pure virtual function that the device must implement.
void readROM(PacketPtr pkt)
AddrRange romRange
VGA ROM methods.
std::unordered_map< uint32_t, DoorbellInfo > doorbells
Structures to hold registers, doorbells, and some frame memory.
std::unordered_map< uint16_t, std::set< int > > & getUsedVMIDs()
std::array< uint8_t, ROM_SIZE > rom
bool isROM(Addr addr) const
void unsetDoorbell(uint32_t offset)
std::unordered_map< uint32_t, PacketPtr > pendingDoorbellPkts
void setRegVal(uint64_t addr, uint32_t value)
std::unordered_map< uint32_t, AddrRange > sdmaMmios
void(SDMAEngine::* sdmaFuncPtr)(uint32_t)
SDMAEngine * getSDMAEngine(Addr offset)
AMDGPUMemoryManager * gpuMemMgr
AMDGPUDevice(const AMDGPUDeviceParams &p)
void readDoorbell(PacketPtr pkt, Addr offset)
AMDGPUNbio nbio
Blocks of the GPU.
Tick readConfig(PacketPtr pkt) override
Read from the PCI config space data that is stored locally.
std::unordered_map< uint32_t, sdmaFuncPtr > sdmaFunc
std::unordered_map< uint16_t, std::set< int > > usedVMIDs
AMDGPUInterruptHandler * deviceIH
Tick writeConfig(PacketPtr pkt) override
Write to the PCI config space data that is stored locally.
AMDMMIOReader mmioReader
MMIO reader to populate device registers map.
Tick read(PacketPtr pkt) override
Pure virtual function that the device must implement.
bool checkpoint_before_mmios
Initial checkpoint support variables.
void dispatchAccess(PacketPtr pkt, bool read)
Convert a PCI packet into a response.
uint32_t getRegVal(uint64_t addr)
Register value getter/setter.
void deallocateVmid(uint16_t vmid)
void mapDoorbellToVMID(Addr doorbell, uint16_t vmid)
void intrPost()
Methods inherited from PciDevice.
void readFrame(PacketPtr pkt, Addr offset)
Helper methods to handle specific BAR read/writes.
void writeROM(PacketPtr pkt)
void writeDoorbell(PacketPtr pkt, Addr offset)
RequestorID vramRequestorId()
Methods related to translations and system/device memory.
std::unordered_map< uint32_t, SDMAEngine * > sdmaIds
uint16_t allocateVMID(uint16_t pasid)
std::unordered_map< int, PM4PacketProcessor * > pm4PktProcs
void deallocatePasid(uint16_t pasid)
SDMAEngine * getSDMAById(int id)
void writeFrame(PacketPtr pkt, Addr offset)
void setSDMAEngine(Addr offset, SDMAEngine *eng)
memory::PhysicalMemory deviceMem
std::unordered_map< uint32_t, SDMAEngine * > sdmaEngs
GPUCommandProcessor * CP()
void readMMIO(PacketPtr pkt, Addr offset)
void writeMMIO(PacketPtr pkt, Addr offset)
void setGPUDevice(AMDGPUDevice *gpu_device)
void updateRptr(const uint32_t &data)
void writeMMIO(PacketPtr pkt, Addr mmio_offset)
Methods for setting the values of interrupt handler MMIO registers.
RequestorID getRequestorID() const
Get the requestorID for the memory manager.
Addr getCacheLineSize() const
void readMMIO(PacketPtr pkt, Addr offset)
void writeMMIO(PacketPtr pkt, Addr offset)
bool readFrame(PacketPtr pkt, Addr offset)
void writeFrame(PacketPtr pkt, Addr offset)
void setGPUDevice(AMDGPUDevice *gpu_device)
void setMMIOAperture(mmio_range_t mmio_aperture, AddrRange range)
void setMMHUBBase(Addr base)
AddrRange getMMIORange(mmio_range_t mmio_aperture)
std::unordered_map< uint64_t, uint64_t > gartTable
Copy of GART table.
void readMMIO(PacketPtr pkt, Addr offset)
const AddrRange & getMMIOAperture(Addr addr)
void writeMMIO(PacketPtr pkt, Addr offset)
Addr getFrameAperture(Addr addr)
Addr gartBase()
Return base address of GART table in framebuffer.
void setMMHUBTop(Addr top)
void readFromTrace(PacketPtr pkt, int barnum, Addr offset)
Get the next MMIO read from the trace file to an offset in a BAR and write the value to the packet pr...
The AddrRange class encapsulates an address range, and supports a number of tests to check if two ran...
void setGPUDevice(AMDGPUDevice *gpu_device)
HSAPacketProcessor & hsaPacketProc()
HWScheduler * hwScheduler()
void setGPUDevice(AMDGPUDevice *gpu_device)
void write(Addr db_addr, uint64_t doorbell_reg)
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
void setUintX(uint64_t w, ByteOrder endian)
Set the value in the word w after truncating it to the length of the packet and then byteswapping it ...
void setLE(T v)
Set the value in the data pointer to v as little endian.
static PacketPtr createWrite(const RequestPtr &req)
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
T * getPtr()
Get a pointer to the data ptr.
static PacketPtr createRead(const RequestPtr &req)
Constructor-like methods that return Packets based on Request objects.
RequestPtr req
A pointer to the original request.
uint64_t getUintX(ByteOrder endian) const
Get the data in the packet byte swapped from the specified endianness and zero-extended to 64 bits.
const T * getConstPtr() const
void dataDynamic(T *p)
Set the data pointer to a value that should have delete [] called on it.
void makeAtomicResponse()
MemCmd cmd
The command field of the packet.
T getLE() const
Get the data in the packet byte swapped from little endian to host endian.
void setSuppressFuncError()
PCI device, base implementation is only config space.
PCIConfig config
The current config space.
void unserialize(CheckpointIn &cp) override
Reconstruct the state of this object from a checkpoint.
void serialize(CheckpointOut &cp) const override
Serialize this object to the given output stream.
bool getBAR(Addr addr, int &num, Addr &offs)
Which base address register (if any) maps the given address?
AddrRangeList getAddrRanges() const override
Determine the address ranges that this device responds to.
const PciBusAddr _busAddr
virtual Tick readConfig(PacketPtr pkt)
Read from the PCI config space data that is stored locally.
virtual Tick writeConfig(PacketPtr pkt)
Write to the PCI config space data that is stored locally.
System DMA Engine class for AMD dGPU.
void setPageRptrLo(uint32_t data)
void setGfxRptrLo(uint32_t data)
void setGfxWptrLo(uint32_t data)
void setGfxRptrHi(uint32_t data)
void processRLC(Addr doorbellOffset, Addr wptrOffset)
void setGfxSize(uint32_t data)
void setGfxBaseLo(uint32_t data)
void processGfx(Addr wptrOffset)
Given a new write ptr offset, communicated to the GPU through a doorbell write, the SDMA engine proce...
void setGfxWptrHi(uint32_t data)
void setGfxDoorbellOffsetLo(uint32_t data)
void processPage(Addr wptrOffset)
void setPageDoorbellOffsetLo(uint32_t data)
void setPageWptrLo(uint32_t data)
void setGfxDoorbellLo(uint32_t data)
void setPageDoorbellLo(uint32_t data)
void setPageSize(uint32_t data)
void setPageBaseLo(uint32_t data)
void setGfxBaseHi(uint32_t data)
void setPageRptrHi(uint32_t data)
std::vector< ComputeUnit * > cuList
GPUCommandProcessor & gpuCmdProc
memory::AbstractMemory * getDeviceMemory(const PacketPtr &pkt) const
Return a pointer to the device memory.
void access(PacketPtr pkt)
Perform an untimed memory access and update all the state (e.g. ...
The GPUCommandProcessor (CP) is responsible for accepting commands, in the form of HSA AQL packets,...
AddrRange RangeSize(Addr start, Addr size)
Addr start() const
Get the start address of the range.
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
#define panic(...)
This implements a cprintf based panic() function.
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
void serializeSection(CheckpointOut &cp, const char *name) const
Serialize an object into a new section.
#define UNSERIALIZE_ARRAY(member, size)
#define SERIALIZE_ARRAY(member, size)
void unserializeSection(CheckpointIn &cp, const char *name)
Unserialize a child object.
Copyright (c) 2024 Arm Limited All rights reserved.
std::shared_ptr< Request > RequestPtr
Tick curTick()
The universal simulation clock.
std::ostream CheckpointOut
uint64_t Addr
Address type. This will probably be moved somewhere else in the near future.
uint64_t Tick
Tick count type.
constexpr uint32_t ROM_SIZE
void exitSimLoop(const std::string &message, int exit_code, Tick when, Tick repeat, bool serialize)
Schedule an event to exit the simulation loop (returning to Python) at the end of the current cycle (...
static constexpr uint32_t IH_OFFSET_SHIFT
static constexpr uint32_t MMHUB_OFFSET_SHIFT
static constexpr int AMDGPU_VM_COUNT
constexpr int FRAMEBUFFER_BAR
constexpr int DOORBELL_BAR
constexpr uint32_t VGA_ROM_DEFAULT
static constexpr uint32_t GRBM_OFFSET_SHIFT
Declaration of the Packet class.
#define PCI0_INTERRUPT_PIN
#define PCI_DEVICE_SPECIFIC
#define UNSERIALIZE_SCALAR(scalar)
#define SERIALIZE_SCALAR(scalar)
const std::string & name()
Defines the PCI Express capability register and its associated bitfields for a PCIe device.