#include "debug/AMDGPUDevice.hh"
#include "params/AMDGPUDevice.hh"
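    // Register each device memory range with the system so accesses from the
    // GPU memory manager's requestor ID are routed to device (VRAM) memory.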
    for (auto& m : p.memories) {
        p.system->addDeviceMemory(gpuMemMgr->getRequestorID(), m);
    }
    if (config().expansionROM) {
        romRange = RangeSize(config().expansionROM, ROM_SIZE);
    } else {
        romRange = RangeSize(VGA_ROM_DEFAULT, ROM_SIZE);
    }
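    // Determine the GFX IP version from the configured device name.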
    if (p.device_name == "Vega10") {
        gfx_version = GfxVersion::gfx900;
    } else if (p.device_name == "MI100") {
        gfx_version = GfxVersion::gfx908;
    } else if (p.device_name == "MI200") {
        gfx_version = GfxVersion::gfx90a;
    } else if (p.device_name == "MI300X") {
        gfx_version = GfxVersion::gfx942;
    } else {
        panic("Unknown GPU device %s\n", p.device_name);
    }
    for (auto& s : p.sdmas) {
        s->setGPUDevice(this);
        sdmaIds.insert({sdma_id, s});
        sdmaMmios.insert({sdma_id,
                          RangeSize(s->getMmioBase(), s->getMmioSize())});
    }
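    // The page-queue register offsets within the SDMA MMIO range differ
    // between Vega10 and the MI-series devices, so the offset-to-handler map
    // is built per device.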
    if (p.device_name == "Vega10") {
        sdmaFunc.insert({0xe1, &SDMAEngine::setPageBaseLo});
        sdmaFunc.insert({0xe9, &SDMAEngine::setPageRptrLo});
        sdmaFunc.insert({0xe8, &SDMAEngine::setPageRptrHi});
        sdmaFunc.insert({0xf2, &SDMAEngine::setPageDoorbellLo});
        sdmaFunc.insert({0x10b, &SDMAEngine::setPageDoorbellOffsetLo});
        sdmaFunc.insert({0xe0, &SDMAEngine::setPageSize});
        sdmaFunc.insert({0x113, &SDMAEngine::setPageWptrLo});
    } else if (p.device_name == "MI100" || p.device_name == "MI200"
               || p.device_name == "MI300X") {
        sdmaFunc.insert({0xd9, &SDMAEngine::setPageBaseLo});
        sdmaFunc.insert({0xe1, &SDMAEngine::setPageRptrLo});
        sdmaFunc.insert({0xe0, &SDMAEngine::setPageRptrHi});
        sdmaFunc.insert({0xea, &SDMAEngine::setPageDoorbellLo});
        sdmaFunc.insert({0xd8, &SDMAEngine::setPageDoorbellOffsetLo});
        sdmaFunc.insert({0x10b, &SDMAEngine::setPageWptrLo});
    } else {
        panic("Unknown GPU device %s\n", p.device_name);
    }
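    // Register the PM4 packet processors; each must have a unique IP ID and
    // its own MMIO range, and IP ID 0 must exist as the default processor.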
    std::set<int> pm4_ids;
    for (auto& pm4 : p.pm4_pkt_procs) {
        pm4->setGPUDevice(this);
        fatal_if(pm4_ids.count(pm4->getIpId()),
                 "Two PM4s with same IP IDs is not allowed");
        pm4_ids.insert(pm4->getIpId());
        pm4PktProcs.insert({pm4->getIpId(), pm4});

        pm4Ranges.insert({pm4->getMMIORange(), pm4});
    }

    fatal_if(!pm4PktProcs.count(0), "No default PM4 processor found");
    deviceIH->setGPUDevice(this);
    cp->hsaPacketProc().setGPUDevice(this);
    cp->setGPUDevice(this);
    nbio.setGPUDevice(this);
    gpuvm.setGPUDevice(this);
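    // MMHUB aperture bounds for device memory; mem_size is in MiB (it is
    // shifted left by 20 bits where it is used below).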
    uint64_t mmhubBase = 0x8000ULL << 24;
    uint64_t mmhubTop = 0x83ffULL << 24;
    uint64_t mem_size = 0x3ff0;

    gpuvm.setMMHUBBase(mmhubBase);
    gpuvm.setMMHUBTop(mmhubTop);
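    // Fixed MMIO apertures for the NBIO, IH, GRBM, and GFX blocks; the MMHUB
    // aperture is located differently on gfx942 (MI300X).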
    gpuvm.setMMIOAperture(NBIO_MMIO_RANGE, AddrRange(0x0, 0x4280));
    gpuvm.setMMIOAperture(IH_MMIO_RANGE, AddrRange(0x4280, 0x4980));
    gpuvm.setMMIOAperture(GRBM_MMIO_RANGE, AddrRange(0x8000, 0xC000));
    gpuvm.setMMIOAperture(GFX_MMIO_RANGE, AddrRange(0x28000, 0x3F000));
    if (getGfxVersion() == GfxVersion::gfx942) {
        gpuvm.setMMIOAperture(MMHUB_MMIO_RANGE, AddrRange(0x60D00, 0x62E20));
    } else {
        gpuvm.setMMIOAperture(MMHUB_MMIO_RANGE, AddrRange(0x68000, 0x6A120));
    }
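    // Device-specific register defaults (framebuffer location and memory
    // size registers) are programmed per device name below.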
    if (p.device_name == "Vega10") {
        // ...
    } else if (p.device_name == "MI100") {
        // ...
    } else if (p.device_name == "MI200") {
        // ...
    } else if (p.device_name == "MI300X") {
        // ...
    } else {
        panic("Unknown GPU device %s\n", p.device_name);
    }
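    // On gfx942 (MI300X), an IP discovery table binary can be loaded and
    // written into the top of framebuffer memory, one dword at a time,
    // through setRegVal().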
    if (getGfxVersion() == GfxVersion::gfx942 && p.ipt_binary != "") {
        constexpr uint64_t DISCOVERY_TMR_OFFSET = (64 << 10);
        constexpr int IPT_SIZE_DW = 0xa00;
        uint64_t ip_table_base = (mem_size << 20) - DISCOVERY_TMR_OFFSET;

        std::ifstream iptBin;
        std::array<uint32_t, IPT_SIZE_DW> ipTable;
        iptBin.open(p.ipt_binary, std::ios::binary);
        iptBin.read((char *)ipTable.data(), IPT_SIZE_DW * 4);

        for (int ipt_dword = 0x0; ipt_dword < IPT_SIZE_DW; ipt_dword++) {
            Addr ipt_addr = ip_table_base + ipt_dword * 4;

            Addr ipt_addr_hi = ipt_addr >> 31;
            Addr fixup_addr = (ipt_addr_hi << 32) | (ipt_addr & 0x7fffffff);

            setRegVal(fixup_addr, ipTable[ipt_dword]);
            DPRINTF(AMDGPUDevice, "IPTable wrote dword %d (%x) to %lx\n",
                    ipt_dword, ipTable[ipt_dword], fixup_addr);
        }
    }
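    // readROM(): the requested bytes are copied out of the locally stored ROM
    // image and returned in the packet, little endian.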
    uint64_t rom_data = 0;

    memcpy(&rom_data, rom.data() + rom_offset, pkt->getSize());
    pkt->setUintX(rom_data, ByteOrder::little);

            pkt->getAddr(), rom_offset, rom_data);
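    // writeROM(): the write path mirrors the read path, copying the packet
    // payload into the stored ROM image.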
    uint64_t rom_data = pkt->getUintX(ByteOrder::little);

    memcpy(rom.data() + rom_offset, &rom_data, pkt->getSize());

            pkt->getAddr(), rom_offset, rom_data);
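    // getAddrRanges(): only BARs that have been assigned a non-zero base
    // address are reported to the system.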
    for (auto &r : ranges) {
        if (r.start() != 0) {
            ret_ranges.push_back(r);
        }
    }
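    // readConfig(): reads that fall in the PCIe capability (PXCAP) region are
    // serviced from the locally stored capability bytes, switched on access
    // size; other device-specific offsets are not implemented.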
      case sizeof(uint8_t):
        pkt->setLE<uint8_t>(pxcap.data[pxcap_offset]);
                "Read PXCAP: dev %#x func %#x reg %#x 1 bytes: data "
                (uint32_t)pkt->getLE<uint8_t>());
      case sizeof(uint16_t):
        pkt->setLE<uint16_t>(*(uint16_t*)&pxcap.data[pxcap_offset]);
                "Read PXCAP: dev %#x func %#x reg %#x 2 bytes: data "
                (uint32_t)pkt->getLE<uint16_t>());
      case sizeof(uint32_t):
        pkt->setLE<uint32_t>(*(uint32_t*)&pxcap.data[pxcap_offset]);
                "Read PXCAP: dev %#x func %#x reg %#x 4 bytes: data "
                (uint32_t)pkt->getLE<uint32_t>());
      default:
        panic("Invalid access size (%d) for amdgpu PXCAP %#x\n",

    warn("Device specific offset %d not implemented!\n", offset);
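    // writeConfig(): writes to the PXCAP region update the locally stored
    // capability bytes directly.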
    uint8_t *pxcap_data = &(pxcap.data[0]);

    memcpy(pxcap_data + pxcap_offset, pkt->getConstPtr<void>(),
           pkt->getSize());
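    // Frame (VRAM) reads are performed functionally: either through a compute
    // unit's memory port, flagged as a GPU functional access, or directly
    // against the system's device memory.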
    uint8_t *dataPtr = new uint8_t[pkt->getSize()];

    readPkt->req->setGPUFuncAccess(true);

    cp->shader()->cuList[0]->memPort[0].sendFunctional(readPkt);
    uint8_t *dataPtr = new uint8_t[pkt->getSize()];

    auto system = cp->shader()->gpuCmdProc.system();
    system->getDeviceMemory(readPkt)->access(readPkt);
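    // readMMIO(): reads are forwarded to the block (NBIO, GFX, ...) that owns
    // the aperture the offset falls in.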
        nbio.readMMIO(pkt, aperture_offset);

        gfx.readMMIO(pkt, aperture_offset);
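    // An L2 invalidate for the affected (aligned) address is broadcast to
    // every compute unit.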
    for (auto& cu : CP()->shader()->cuList) {
        cu->sendInvL2(aligned_addr);
    }
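    // Writes that target the GART aperture update the shadow GART table held
    // by gpuvm.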
    if (aperture == gpuvm.gartBase()) {
        gpuvm.gartTable[aperture_offset] = pkt->getUintX(ByteOrder::little);

                gpuvm.gartTable[aperture_offset]);
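    // writeFrame(): the packet payload is copied into a new buffer and
    // written to the system's device memory.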
    uint8_t *dataPtr = new uint8_t[pkt->getSize()];
    std::memcpy(dataPtr, pkt->getPtr<uint8_t>(),
                pkt->getSize() * sizeof(uint8_t));

    auto system = cp->shader()->gpuCmdProc.system();
    system->getDeviceMemory(writePkt)->access(writePkt);
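    // writeDoorbell(): doorbell writes are dispatched based on the queue type
    // registered for the offset; the HSA packet processor path writes the
    // doorbell value plus one to the hardware scheduler. Writes to offsets
    // with no registered queue are saved as pending doorbells.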
                pkt->getLE<uint64_t>());
                pkt->getLE<uint64_t>());
        cp->hsaPacketProc().hwScheduler()->write(offset,
                pkt->getLE<uint64_t>() + 1);
                pkt->getLE<uint64_t>() + 1);
        panic("Write to unknown queue type!");

    warn("Unknown doorbell offset: %lx. Saving to pending doorbells.\n",
    uint8_t *pending_data = new uint8_t[pkt->getSize()];
    memcpy(pending_data, pkt->getPtr<uint8_t>(), pkt->getSize());
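    // writeMMIO(): writes are matched against the SDMA engines' and PM4
    // packet processors' MMIO ranges before falling through to the NBIO and
    // GFX handlers.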
    for (int idx = 0; idx < sdmaIds.size(); ++idx) {
    for (auto& [range, pm4_proc] : pm4Ranges) {
        if (range.contains(offset)) {
        nbio.writeMMIO(pkt, aperture_offset);

        gfx.writeMMIO(pkt, aperture_offset);
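    // The PIO read()/write() entry points panic if the request address does
    // not fall within any mapped range.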
    panic("Request with address out of mapped range!");

    panic("Request with address out of mapped range!");
    uint32_t pkt_data = 0;
    RequestPtr request = std::make_shared<Request>(fixup_addr,

            fixup_addr, pkt->getLE<uint32_t>());
    pkt_data = pkt->getLE<uint32_t>();
    uint32_t pkt_data = value;
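    // serialize(): the doorbell, SDMA engine, and used-VMID maps are
    // flattened into fixed-size arrays before being written to the
    // checkpoint, and the GPUVM state is serialized as its own section.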
    uint64_t doorbells_size = doorbells.size();
    uint64_t sdma_engs_size = sdmaEngs.size();
    uint64_t used_vmid_map_size = usedVMIDs.size();

    auto doorbells_offset = std::make_unique<uint32_t[]>(doorbells_size);
    auto doorbells_queues = std::make_unique<QueueType[]>(doorbells_size);
    auto doorbells_ip_ids = std::make_unique<int[]>(doorbells_size);
    auto sdma_engs_offset = std::make_unique<uint32_t[]>(sdma_engs_size);
    auto sdma_engs = std::make_unique<int[]>(sdma_engs_size);
    auto used_vmids = std::make_unique<int[]>(used_vmid_map_size);
    auto used_queue_id_sizes = std::make_unique<int[]>(used_vmid_map_size);

        doorbells_offset[idx] = it.first;
        doorbells_queues[idx] = it.second.qtype;
        doorbells_ip_ids[idx] = it.second.ip_id;

        sdma_engs_offset[idx] = it.first;
        sdma_engs[idx] = it.second->getId();

        used_vmids[idx] = it.first;
        used_queue_id_sizes[idx] = it.second.size();

        used_vmid_sets.insert(used_vmid_sets.end(),
                              set_vector.begin(), set_vector.end());

    int num_queue_id = used_vmid_sets.size();
    auto vmid_array = std::make_unique<int[]>(num_queue_id);
    std::copy(used_vmid_sets.begin(), used_vmid_sets.end(), vmid_array.get());

    gpuvm.serializeSection(cp, "GPUVM");
    uint64_t doorbells_size = 0;
    uint64_t sdma_engs_size = 0;
    uint64_t used_vmid_map_size = 0;

    if (doorbells_size > 0) {
        auto doorbells_offset = std::make_unique<uint32_t[]>(doorbells_size);
        auto doorbells_queues = std::make_unique<QueueType[]>(doorbells_size);
        auto doorbells_ip_ids = std::make_unique<int[]>(doorbells_size);

        for (int idx = 0; idx < doorbells_size; ++idx) {
            doorbells[doorbells_offset[idx]].qtype = doorbells_queues[idx];
            doorbells[doorbells_offset[idx]].ip_id = doorbells_ip_ids[idx];
        }
    }

    if (sdma_engs_size > 0) {
        auto sdma_engs_offset = std::make_unique<uint32_t[]>(sdma_engs_size);
        auto sdma_engs = std::make_unique<int[]>(sdma_engs_size);

        for (int idx = 0; idx < sdma_engs_size; ++idx) {
            int sdma_id = sdma_engs[idx];
            assert(sdmaIds.count(sdma_id));
            sdmaEngs.insert(std::make_pair(sdma_engs_offset[idx], sdma));
        }
    }

    if (used_vmid_map_size > 0) {
        auto used_vmids = std::make_unique<int[]>(used_vmid_map_size);
        auto used_queue_id_sizes = std::make_unique<int[]>(used_vmid_map_size);
        int num_queue_id = 0;

        auto vmid_array = std::make_unique<int[]>(num_queue_id);

        for (int it = 0; it < used_vmid_map_size; it++) {
            int vmid = used_vmids[it];
            int vmid_set_size = used_queue_id_sizes[it];
            for (int j = 0; j < vmid_set_size; j++) {
                usedVMIDs[vmid].insert(vmid_array[idx + j]);
            }
            idx += vmid_set_size;
        }
    }

    gpuvm.unserializeSection(cp, "GPUVM");
    panic("All VMIDs have been assigned");

    assert(result != idMap.end());
    if (result == idMap.end())
        return;
    uint16_t vmid = result->second;

    it.second->deallocateRLCQueues(unmap_static);
std::unordered_map<uint16_t, std::set<int>>&
AMDGPUDevice::getUsedVMIDs()