36#include "debug/AMDGPUDevice.hh"
48#include "params/AMDGPUDevice.hh"
56 :
PciDevice(
p), gpuMemMgr(
p.memory_manager), deviceIH(
p.device_ih),
57 cp(
p.cp), checkpoint_before_mmios(
p.checkpoint_before_mmios),
58 init_interrupt_count(0), _lastVMID(0),
59 deviceMem(
name() +
".deviceMem",
p.memories, false,
"", false)
63 romBin.open(
p.rom_binary, std::ios::binary);
70 for (
auto&
m :
p.memories) {
83 if (
p.device_name ==
"Vega10") {
85 }
else if (
p.device_name ==
"MI100") {
87 }
else if (
p.device_name ==
"MI200") {
89 }
else if (
p.device_name ==
"MI300X") {
92 panic(
"Unknown GPU device %s\n",
p.device_name);
95 if (
p.trace_file !=
"") {
100 for (
auto&
s :
p.sdmas) {
101 s->setGPUDevice(
this);
121 if (
p.device_name ==
"Vega10") {
129 }
else if (
p.device_name ==
"MI100" ||
p.device_name ==
"MI200"
130 ||
p.device_name ==
"MI300X") {
138 panic(
"Unknown GPU device %s\n",
p.device_name);
142 std::set<int> pm4_ids;
143 for (
auto& pm4 :
p.pm4_pkt_procs) {
144 pm4->setGPUDevice(
this);
145 fatal_if(pm4_ids.count(pm4->getIpId()),
146 "Two PM4s with same IP IDs is not allowed");
147 pm4_ids.insert(pm4->getIpId());
150 pm4Ranges.insert({pm4->getMMIORange(), pm4});
163 uint64_t mmhubBase = 0x8000ULL << 24;
164 uint64_t mmhubTop = 0x83ffULL << 24;
165 uint64_t mem_size = 0x3ff0;
188 if (
p.device_name ==
"Vega10") {
191 }
else if (
p.device_name ==
"MI100") {
195 }
else if (
p.device_name ==
"MI200") {
201 }
else if (
p.device_name ==
"MI300X") {
206 panic(
"Unknown GPU device %s\n",
p.device_name);
214 uint64_t rom_data = 0;
216 memcpy(&rom_data,
rom.data() + rom_offset, pkt->
getSize());
217 pkt->
setUintX(rom_data, ByteOrder::little);
220 pkt->
getAddr(), rom_offset, rom_data);
229 uint64_t rom_data = pkt->
getUintX(ByteOrder::little);
231 memcpy(
rom.data() + rom_offset, &rom_data, pkt->
getSize());
234 pkt->
getAddr(), rom_offset, rom_data);
247 for (
auto &
r : ranges) {
248 if (
r.start() != 0) {
249 ret_ranges.push_back(
r);
268 case sizeof(uint8_t):
271 "Read PXCAP: dev %#x func %#x reg %#x 1 bytes: data "
273 (uint32_t)pkt->
getLE<uint8_t>());
275 case sizeof(uint16_t):
276 pkt->
setLE<uint16_t>(
279 "Read PXCAP: dev %#x func %#x reg %#x 2 bytes: data "
281 (uint32_t)pkt->
getLE<uint16_t>());
283 case sizeof(uint32_t):
284 pkt->
setLE<uint32_t>(
287 "Read PXCAP: dev %#x func %#x reg %#x 4 bytes: data "
289 (uint32_t)pkt->
getLE<uint32_t>());
292 panic(
"Invalid access size (%d) for amdgpu PXCAP %#x\n",
297 warn(
"Device specific offset %d not implemented!\n",
offset);
339 memcpy(pxcap_data + pxcap_offset, pkt->
getConstPtr<
void>(),
380 uint8_t *dataPtr =
new uint8_t[pkt->
getSize()];
433 cu->sendInvL2(aligned_addr);
456 uint8_t *dataPtr =
new uint8_t[pkt->
getSize()];
457 std::memcpy(dataPtr, pkt->
getPtr<uint8_t>(),
458 pkt->
getSize() *
sizeof(uint8_t));
482 pkt->
getLE<uint64_t>());
488 pkt->
getLE<uint64_t>());
501 pkt->
getLE<uint64_t>() + 1);
503 pkt->
getLE<uint64_t>() + 1);
513 panic(
"Write to unkown queue type!");
516 warn(
"Unknown doorbell offset: %lx. Saving to pending doorbells.\n",
525 uint8_t *pending_data =
new uint8_t[pkt->
getSize()];
526 memcpy(pending_data, pkt->
getPtr<uint8_t>(), pkt->
getSize());
542 for (
int idx = 0; idx <
sdmaIds.size(); ++idx) {
560 for (
auto& [range, pm4_proc] :
pm4Ranges) {
561 if (range.contains(
offset)) {
608 panic(
"Request with address out of mapped range!");
642 panic(
"Request with address out of mapped range!");
677 uint32_t pkt_data = 0;
678 RequestPtr request = std::make_shared<Request>(fixup_addr,
684 fixup_addr, pkt->
getLE<uint32_t>());
686 pkt_data = pkt->
getLE<uint32_t>();
698 uint32_t pkt_data = value;
757 uint64_t doorbells_size =
doorbells.size();
758 uint64_t sdma_engs_size =
sdmaEngs.size();
759 uint64_t used_vmid_map_size =
usedVMIDs.size();
767 uint32_t doorbells_offset[doorbells_size];
768 QueueType doorbells_queues[doorbells_size];
769 int doorbells_ip_ids[doorbells_size];
770 uint32_t sdma_engs_offset[sdma_engs_size];
771 int sdma_engs[sdma_engs_size];
772 int used_vmids[used_vmid_map_size];
773 int used_queue_id_sizes[used_vmid_map_size];
778 doorbells_offset[idx] = it.first;
779 doorbells_queues[idx] = it.second.qtype;
780 doorbells_ip_ids[idx] = it.second.ip_id;
786 sdma_engs_offset[idx] = it.first;
787 sdma_engs[idx] = it.second->getId();
793 used_vmids[idx] = it.first;
794 used_queue_id_sizes[idx] = it.second.size();
796 used_vmid_sets.insert(used_vmid_sets.end(),
797 set_vector.begin(), set_vector.end());
801 int num_queue_id = used_vmid_sets.size();
802 int* vmid_array =
new int[num_queue_id];
803 std::copy(used_vmid_sets.begin(), used_vmid_sets.end(), vmid_array);
806 sizeof(doorbells_offset[0]));
808 sizeof(doorbells_queues[0]));
810 sizeof(doorbells_ip_ids[0]));
812 sizeof(sdma_engs_offset[0]));
818 sizeof(used_queue_id_sizes)/
sizeof(used_queue_id_sizes[0]));
837 uint64_t doorbells_size = 0;
838 uint64_t sdma_engs_size = 0;
839 uint64_t used_vmid_map_size = 0;
846 if (doorbells_size > 0) {
847 uint32_t doorbells_offset[doorbells_size];
848 QueueType doorbells_queues[doorbells_size];
849 int doorbells_ip_ids[doorbells_size];
852 sizeof(doorbells_offset[0]));
854 sizeof(doorbells_queues[0]));
856 sizeof(doorbells_ip_ids[0]));
858 for (
int idx = 0; idx < doorbells_size; ++idx) {
859 doorbells[doorbells_offset[idx]].qtype = doorbells_queues[idx];
860 doorbells[doorbells_offset[idx]].ip_id = doorbells_ip_ids[idx];
864 if (sdma_engs_size > 0) {
865 uint32_t sdma_engs_offset[sdma_engs_size];
866 int sdma_engs[sdma_engs_size];
869 sizeof(sdma_engs_offset[0]));
872 for (
int idx = 0; idx < sdma_engs_size; ++idx) {
873 int sdma_id = sdma_engs[idx];
874 assert(
sdmaIds.count(sdma_id));
876 sdmaEngs.insert(std::make_pair(sdma_engs_offset[idx], sdma));
880 if (used_vmid_map_size > 0) {
881 int used_vmids[used_vmid_map_size];
882 int used_queue_id_sizes[used_vmid_map_size];
883 int num_queue_id = 0;
887 int* vmid_array =
new int[num_queue_id];
896 for (
int it = 0; it < used_vmid_map_size; it++) {
897 int vmid = used_vmids[it];
898 int vmid_set_size = used_queue_id_sizes[it];
899 for (
int j = 0; j < vmid_set_size; j++) {
900 usedVMIDs[vmid].insert(vmid_array[idx + j]);
902 idx += vmid_set_size;
924 panic(
"All VMIDs have been assigned");
937 assert(result !=
idMap.end());
938 if (result ==
idMap.end())
return;
939 uint16_t vmid = result->second;
952 it.second->deallocateRLCQueues();
970std::unordered_map<uint16_t, std::set<int>>&
AbstractMemory declaration.
#define AMDGPU_MP0_SMN_C2PMSG_33
#define VEGA10_FB_LOCATION_BASE
#define VEGA10_FB_LOCATION_TOP
#define MI200_MEM_SIZE_REG
#define MI200_FB_LOCATION_TOP
#define MI100_FB_LOCATION_BASE
#define MI200_FB_LOCATION_BASE
#define MI100_FB_LOCATION_TOP
#define MI100_MEM_SIZE_REG
Device model for an AMD GPU.
void insertQId(uint16_t vmid, int id)
std::unordered_map< AddrRange, PM4PacketProcessor *, AddrRangeHasher > pm4Ranges
std::unordered_map< Addr, uint16_t > doorbellVMIDMap
std::unordered_map< uint16_t, uint16_t > idMap
void readMMIO(PacketPtr pkt, Addr offset)
void serialize(CheckpointOut &cp) const override
Checkpoint support.
void processPendingDoorbells(uint32_t offset)
AddrRangeList getAddrRanges() const override
Every PIO device is obliged to provide an implementation that returns the address ranges the device r...
void unserialize(CheckpointIn &cp) override
Unserialize an object.
void writeMMIO(PacketPtr pkt, Addr offset)
void setDoorbellType(uint32_t offset, QueueType qt, int ip_id=0)
Set handles to GPU blocks.
Tick write(PacketPtr pkt) override
Pure virtual function that the device must implement.
void deallocateAllQueues()
void readROM(PacketPtr pkt)
AddrRange romRange
VGA ROM methods.
std::unordered_map< uint32_t, DoorbellInfo > doorbells
Structures to hold registers, doorbells, and some frame memory.
std::unordered_map< uint16_t, std::set< int > > & getUsedVMIDs()
std::array< uint8_t, ROM_SIZE > rom
bool isROM(Addr addr) const
void unsetDoorbell(uint32_t offset)
std::unordered_map< uint32_t, PacketPtr > pendingDoorbellPkts
void setRegVal(uint64_t addr, uint32_t value)
std::unordered_map< uint32_t, AddrRange > sdmaMmios
void(SDMAEngine::* sdmaFuncPtr)(uint32_t)
SDMAEngine * getSDMAEngine(Addr offset)
AMDGPUMemoryManager * gpuMemMgr
AMDGPUDevice(const AMDGPUDeviceParams &p)
void readDoorbell(PacketPtr pkt, Addr offset)
AMDGPUNbio nbio
Blocks of the GPU.
Tick readConfig(PacketPtr pkt) override
Read from the PCI config space data that is stored locally.
std::unordered_map< uint32_t, sdmaFuncPtr > sdmaFunc
std::unordered_map< uint16_t, std::set< int > > usedVMIDs
AMDGPUInterruptHandler * deviceIH
Tick writeConfig(PacketPtr pkt) override
Write to the PCI config space data that is stored locally.
AMDMMIOReader mmioReader
MMIO reader to populate device registers map.
Tick read(PacketPtr pkt) override
Pure virtual function that the device must implement.
bool checkpoint_before_mmios
Initial checkpoint support variables.
void dispatchAccess(PacketPtr pkt, bool read)
Convert a PCI packet into a response.
uint32_t getRegVal(uint64_t addr)
Register value getter/setter.
void deallocateVmid(uint16_t vmid)
void mapDoorbellToVMID(Addr doorbell, uint16_t vmid)
void intrPost()
Methods inherited from PciDevice.
void readFrame(PacketPtr pkt, Addr offset)
Helper methods to handle specific BAR read/writes.
void writeROM(PacketPtr pkt)
void writeDoorbell(PacketPtr pkt, Addr offset)
RequestorID vramRequestorId()
Methods related to translations and system/device memory.
std::unordered_map< uint32_t, SDMAEngine * > sdmaIds
uint16_t allocateVMID(uint16_t pasid)
std::unordered_map< int, PM4PacketProcessor * > pm4PktProcs
void deallocatePasid(uint16_t pasid)
SDMAEngine * getSDMAById(int id)
void writeFrame(PacketPtr pkt, Addr offset)
void setSDMAEngine(Addr offset, SDMAEngine *eng)
memory::PhysicalMemory deviceMem
std::unordered_map< uint32_t, SDMAEngine * > sdmaEngs
GPUCommandProcessor * CP()
void readMMIO(PacketPtr pkt, Addr offset)
void writeMMIO(PacketPtr pkt, Addr offset)
void setGPUDevice(AMDGPUDevice *gpu_device)
void updateRptr(const uint32_t &data)
void writeMMIO(PacketPtr pkt, Addr mmio_offset)
Methods for setting the values of interrupt handler MMIO registers.
RequestorID getRequestorID() const
Get the requestorID for the memory manager.
void readMMIO(PacketPtr pkt, Addr offset)
void writeMMIO(PacketPtr pkt, Addr offset)
bool readFrame(PacketPtr pkt, Addr offset)
void writeFrame(PacketPtr pkt, Addr offset)
void setGPUDevice(AMDGPUDevice *gpu_device)
void setMMIOAperture(mmio_range_t mmio_aperture, AddrRange range)
void setMMHUBBase(Addr base)
AddrRange getMMIORange(mmio_range_t mmio_aperture)
std::unordered_map< uint64_t, uint64_t > gartTable
Copy of GART table.
void readMMIO(PacketPtr pkt, Addr offset)
const AddrRange & getMMIOAperture(Addr addr)
void writeMMIO(PacketPtr pkt, Addr offset)
Addr getFrameAperture(Addr addr)
Addr gartBase()
Return base address of GART table in framebuffer.
void setMMHUBTop(Addr top)
void readMMIOTrace(std::string trace_file)
Read an MMIO trace gathered from a real system and place the MMIO values read and written into the MM...
void readFromTrace(PacketPtr pkt, int barnum, Addr offset)
Get the next MMIO read from the trace file to an offset in a BAR and write the value to the packet pr...
The AddrRange class encapsulates an address range, and supports a number of tests to check if two ran...
void setGPUDevice(AMDGPUDevice *gpu_device)
HSAPacketProcessor & hsaPacketProc()
HWScheduler * hwScheduler()
void setGPUDevice(AMDGPUDevice *gpu_device)
void write(Addr db_addr, uint64_t doorbell_reg)
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
void setUintX(uint64_t w, ByteOrder endian)
Set the value in the word w after truncating it to the length of the packet and then byteswapping it ...
void setLE(T v)
Set the value in the data pointer to v as little endian.
static PacketPtr createWrite(const RequestPtr &req)
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
T * getPtr()
get a pointer to the data ptr.
static PacketPtr createRead(const RequestPtr &req)
Constructor-like methods that return Packets based on Request objects.
RequestPtr req
A pointer to the original request.
uint64_t getUintX(ByteOrder endian) const
Get the data in the packet byte swapped from the specified endianness and zero-extended to 64 bits.
const T * getConstPtr() const
void dataDynamic(T *p)
Set the data pointer to a value that should have delete [] called on it.
void makeAtomicResponse()
T getLE() const
Get the data in the packet byte swapped from little endian to host endian.
PCI device, base implementation is only config space.
PCIConfig config
The current config space.
void unserialize(CheckpointIn &cp) override
Reconstruct the state of this object from a checkpoint.
void serialize(CheckpointOut &cp) const override
Serialize this object to the given output stream.
bool getBAR(Addr addr, int &num, Addr &offs)
Which base address register (if any) maps the given address?
AddrRangeList getAddrRanges() const override
Determine the address ranges that this device responds to.
const PciBusAddr _busAddr
virtual Tick readConfig(PacketPtr pkt)
Read from the PCI config space data that is stored locally.
virtual Tick writeConfig(PacketPtr pkt)
Write to the PCI config space data that is stored locally.
virtual Tick read(PacketPtr pkt)=0
Pure virtual function that the device must implement.
System DMA Engine class for AMD dGPU.
void setPageRptrLo(uint32_t data)
void setGfxRptrLo(uint32_t data)
void setGfxWptrLo(uint32_t data)
void setGfxRptrHi(uint32_t data)
void processRLC(Addr doorbellOffset, Addr wptrOffset)
void setGfxSize(uint32_t data)
void setGfxBaseLo(uint32_t data)
void processGfx(Addr wptrOffset)
Given a new write ptr offset, communicated to the GPU through a doorbell write, the SDMA engine proce...
void setGfxWptrHi(uint32_t data)
void setGfxDoorbellOffsetLo(uint32_t data)
void processPage(Addr wptrOffset)
void setPageDoorbellOffsetLo(uint32_t data)
void setPageWptrLo(uint32_t data)
void setGfxDoorbellLo(uint32_t data)
void setPageDoorbellLo(uint32_t data)
void setPageSize(uint32_t data)
void setPageBaseLo(uint32_t data)
void setGfxBaseHi(uint32_t data)
void setPageRptrHi(uint32_t data)
std::vector< ComputeUnit * > cuList
GPUCommandProcessor & gpuCmdProc
memory::AbstractMemory * getDeviceMemory(const PacketPtr &pkt) const
Return a pointer to the device memory.
void access(PacketPtr pkt)
Perform an untimed memory access and update all the state (e.g.
The GPUCommandProcessor (CP) is responsible for accepting commands, in the form of HSA AQL packets,...
AddrRange RangeSize(Addr start, Addr size)
Addr start() const
Get the start address of the range.
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
#define panic(...)
This implements a cprintf based panic() function.
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
void serializeSection(CheckpointOut &cp, const char *name) const
Serialize an object into a new section.
#define UNSERIALIZE_ARRAY(member, size)
#define SERIALIZE_ARRAY(member, size)
void unserializeSection(CheckpointIn &cp, const char *name)
Unserialize an a child object.
Copyright (c) 2024 - Pranith Kumar Copyright (c) 2020 Inria All rights reserved.
std::shared_ptr< Request > RequestPtr
Tick curTick()
The universal simulation clock.
std::ostream CheckpointOut
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
uint64_t Tick
Tick count type.
constexpr uint32_t ROM_SIZE
void exitSimLoop(const std::string &message, int exit_code, Tick when, Tick repeat, bool serialize)
Schedule an event to exit the simulation loop (returning to Python) at the end of the current cycle (...
static constexpr uint32_t IH_OFFSET_SHIFT
static constexpr uint32_t MMHUB_OFFSET_SHIFT
static constexpr int AMDGPU_VM_COUNT
constexpr int FRAMEBUFFER_BAR
constexpr int DOORBELL_BAR
constexpr uint32_t VGA_ROM_DEFAULT
static constexpr uint32_t GRBM_OFFSET_SHIFT
Declaration of the Packet class.
#define PCI0_INTERRUPT_PIN
#define PCI_DEVICE_SPECIFIC
#define UNSERIALIZE_SCALAR(scalar)
#define SERIALIZE_SCALAR(scalar)
const std::string & name()
Defines the PCI Express capability register and its associated bitfields for a PCIe device.