36#include "debug/AMDGPUDevice.hh"
48#include "params/AMDGPUDevice.hh"
67 uint64_t vram_size = 0;
72 for (
auto&
m :
p.memories) {
76 p.system->addDeviceMemory(gpuMemMgr->getRequestorID(), m);
78 vram_size += m->getAddrRange().size();
83 if (
config().expansionROM) {
89 if (
p.device_name ==
"Vega10") {
90 gfx_version = GfxVersion::gfx900;
91 }
else if (
p.device_name ==
"MI100") {
92 gfx_version = GfxVersion::gfx908;
93 }
else if (
p.device_name ==
"MI200") {
94 gfx_version = GfxVersion::gfx90a;
95 }
else if (
p.device_name ==
"MI300X") {
96 gfx_version = GfxVersion::gfx942;
97 }
else if (
p.device_name ==
"MI355X") {
98 gfx_version = GfxVersion::gfx950;
100 panic(
"Unknown GPU device %s\n", p.device_name);
104 for (
auto&
s :
p.sdmas) {
105 s->setGPUDevice(this);
107 sdmaIds.insert({sdma_id, s});
109 RangeSize(s->getMmioBase(), s->getMmioSize())});
125 if (
p.device_name ==
"Vega10") {
126 sdmaFunc.insert({0xe1, &SDMAEngine::setPageBaseLo});
127 sdmaFunc.insert({0xe9, &SDMAEngine::setPageRptrLo});
128 sdmaFunc.insert({0xe8, &SDMAEngine::setPageRptrHi});
129 sdmaFunc.insert({0xf2, &SDMAEngine::setPageDoorbellLo});
130 sdmaFunc.insert({0x10b, &SDMAEngine::setPageDoorbellOffsetLo});
131 sdmaFunc.insert({0xe0, &SDMAEngine::setPageSize});
132 sdmaFunc.insert({0x113, &SDMAEngine::setPageWptrLo});
133 }
else if (
p.device_name ==
"MI100" ||
p.device_name ==
"MI200" ||
134 p.device_name ==
"MI300X" ||
p.device_name ==
"MI355X") {
135 sdmaFunc.insert({0xd9, &SDMAEngine::setPageBaseLo});
136 sdmaFunc.insert({0xe1, &SDMAEngine::setPageRptrLo});
137 sdmaFunc.insert({0xe0, &SDMAEngine::setPageRptrHi});
138 sdmaFunc.insert({0xea, &SDMAEngine::setPageDoorbellLo});
139 sdmaFunc.insert({0xd8, &SDMAEngine::setPageDoorbellOffsetLo});
140 sdmaFunc.insert({0x10b, &SDMAEngine::setPageWptrLo});
142 panic(
"Unknown GPU device %s\n",
p.device_name);
146 std::set<int> pm4_ids;
147 for (
auto& pm4 :
p.pm4_pkt_procs) {
148 pm4->setGPUDevice(this);
149 fatal_if(pm4_ids.count(pm4->getIpId()),
150 "Two PM4s with same IP IDs is not allowed");
151 pm4_ids.insert(pm4->getIpId());
152 pm4PktProcs.insert({pm4->getIpId(), pm4});
154 pm4Ranges.insert({pm4->getMMIORange(), pm4});
158 fatal_if(!pm4PktProcs.count(0),
"No default PM4 processor found");
160 deviceIH->setGPUDevice(
this);
161 cp->hsaPacketProc().setGPUDevice(
this);
162 cp->setGPUDevice(
this);
163 nbio.setGPUDevice(
this);
164 gpuvm.setGPUDevice(
this);
165 smu.setGPUDevice(
this);
169 uint64_t mmhubBase = 0x8000ULL << 24;
170 uint64_t mmhubTop = 0x83ffULL << 24;
171 uint64_t mmio_mem_size = vram_size / 0x100000;
175 mmio_mem_size -= 0x1;
177 gpuvm.setMMHUBBase(mmhubBase);
178 gpuvm.setMMHUBTop(mmhubTop);
192 gpuvm.setMMIOAperture(NBIO_MMIO_RANGE, AddrRange(0x0, 0x4280));
193 gpuvm.setMMIOAperture(IH_MMIO_RANGE, AddrRange(0x4280, 0x4980));
194 gpuvm.setMMIOAperture(GRBM_MMIO_RANGE, AddrRange(0x8000, 0xC000));
195 gpuvm.setMMIOAperture(GFX_MMIO_RANGE, AddrRange(0x28000, 0x3F000));
196 if (getGfxVersion() == GfxVersion::gfx942 ||
197 getGfxVersion() == GfxVersion::gfx950) {
198 gpuvm.setMMIOAperture(MMHUB_MMIO_RANGE, AddrRange(0x60D00, 0x62E20));
200 gpuvm.setMMIOAperture(MMHUB_MMIO_RANGE, AddrRange(0x68000, 0x6A120));
202 gpuvm.setMMIOAperture(SMU_MMIO_RANGE, AddrRange(0x5A000, 0x5ACE4));
209 if (
p.device_name ==
"Vega10") {
212 }
else if (
p.device_name ==
"MI100") {
216 }
else if (
p.device_name ==
"MI200") {
222 }
else if (
p.device_name ==
"MI300X" ||
p.device_name ==
"MI355X") {
229 panic(
"Unknown GPU device %s\n",
p.device_name);
235 bool use_ip_discovery =
false;
237 if (getGfxVersion() == GfxVersion::gfx942 ||
238 getGfxVersion() == GfxVersion::gfx950) {
239 use_ip_discovery =
true;
241 if (
p.ipt_binary ==
"") {
242 DPRINTF(AMDGPUDevice,
"Assuming discovery=2 for IP discovery\n");
246 if (use_ip_discovery &&
p.ipt_binary !=
"") {
248 constexpr uint64_t DISCOVERY_TMR_OFFSET = (64 << 10);
249 constexpr int IPT_SIZE_DW = 0xa00;
250 uint64_t ip_table_base = (mmio_mem_size << 20) - DISCOVERY_TMR_OFFSET;
252 DPRINTF(AMDGPUDevice,
"Using IP discovery file %s\n",
p.ipt_binary);
254 std::ifstream iptBin;
255 std::array<uint32_t, IPT_SIZE_DW> ipTable;
256 iptBin.open(
p.ipt_binary, std::ios::binary);
257 iptBin.read((
char *)ipTable.data(), IPT_SIZE_DW*4);
261 for (
int ipt_dword = 0x0; ipt_dword < IPT_SIZE_DW; ipt_dword++) {
262 Addr ipt_addr = ip_table_base + ipt_dword*4;
267 Addr ipt_addr_hi = ipt_addr >> 31;
268 Addr fixup_addr = (ipt_addr_hi << 32) | (ipt_addr & 0x7fffffff)
271 setRegVal(fixup_addr, ipTable[ipt_dword]);
272 DPRINTF(AMDGPUDevice,
"IPTable wrote dword %d (%x) to %lx\n",
273 ipt_dword, ipTable[ipt_dword], fixup_addr);
287 RequestPtr request = std::make_shared<Request>(
293 system->getPhysMem().access(readPkt);
296 rom_offset, readPkt->getUintX(ByteOrder::little));
298 pkt->
setUintX(readPkt->getUintX(ByteOrder::little), ByteOrder::little);
308 uint64_t rom_data = pkt->
getUintX(ByteOrder::little);
310 RequestPtr request = std::make_shared<Request>(
314 writePkt->allocate();
315 writePkt->setUintX(rom_data, ByteOrder::little);
317 system->getPhysMem().access(writePkt);
320 rom_offset, writePkt->getUintX(ByteOrder::little));
333 for (
auto &
r : ranges) {
334 if (
r.start() != 0) {
335 ret_ranges.push_back(
r);
354 case sizeof(uint8_t):
355 pkt->
setLE<uint8_t>(
pxcap.data[pxcap_offset]);
357 "Read PXCAP: dev %#x func %#x reg %#x 1 bytes: "
360 (uint32_t)pkt->
getLE<uint8_t>());
362 case sizeof(uint16_t):
363 pkt->
setLE<uint16_t>(
364 *(uint16_t*)&
pxcap.data[pxcap_offset]);
366 "Read PXCAP: dev %#x func %#x reg %#x 2 bytes: "
369 (uint32_t)pkt->
getLE<uint16_t>());
371 case sizeof(uint32_t):
372 pkt->
setLE<uint32_t>(
373 *(uint32_t*)&
pxcap.data[pxcap_offset]);
375 "Read PXCAP: dev %#x func %#x reg %#x 4 bytes: "
378 (uint32_t)pkt->
getLE<uint32_t>());
381 panic(
"Invalid access size (%d) for amdgpu PXCAP %#x\n",
386 warn(
"Device specific offset %d not implemented!\n",
offset);
426 letoh(pkt->
getLE<uint32_t>()) == 0xfffff800) {
429 config().expansionROM = 0xfffff000;
436 uint8_t *pxcap_data = &(
pxcap.data[0]);
442 memcpy(pxcap_data + pxcap_offset, pkt->
getConstPtr<
void>(),
484 uint8_t *dataPtr =
new uint8_t[pkt->
getSize()];
486 readPkt->
req->setGPUFuncAccess(
true);
488 cp->shader()->cuList[0]->memPort[0].sendFunctional(readPkt);
495 uint8_t *dataPtr =
new uint8_t[pkt->
getSize()];
498 auto system =
cp->shader()->gpuCmdProc.system();
499 system->getDeviceMemory(readPkt)->access(readPkt);
526 nbio.readMMIO(pkt, aperture_offset);
532 gfx.readMMIO(pkt, aperture_offset);
550 for (
auto& cu:
CP()->shader()->cuList) {
552 cu->sendInvL2(aligned_addr);
559 if (aperture ==
gpuvm.gartBase()) {
560 gpuvm.gartTable[aperture_offset] = pkt->
getUintX(ByteOrder::little);
562 gpuvm.gartTable[aperture_offset]);
575 uint8_t *dataPtr =
new uint8_t[pkt->
getSize()];
576 std::memcpy(dataPtr, pkt->
getPtr<uint8_t>(),
577 pkt->
getSize() *
sizeof(uint8_t));
580 auto system =
cp->shader()->gpuCmdProc.system();
585 if (
system->getDeviceMemory(writePkt)) {
586 system->getDeviceMemory(writePkt)->access(writePkt);
588 warn(
"Unable to find device memory for address %#lx\n",
offset);
609 pkt->
getLE<uint64_t>());
615 pkt->
getLE<uint64_t>());
627 cp->hsaPacketProc().hwScheduler()->write(
offset,
628 pkt->
getLE<uint64_t>() + 1);
630 pkt->
getLE<uint64_t>() + 1);
640 panic(
"Write to unkown queue type!");
643 warn(
"Unknown doorbell offset: %lx. Saving to pending doorbells.\n",
652 uint8_t *pending_data =
new uint8_t[pkt->
getSize()];
653 memcpy(pending_data, pkt->
getPtr<uint8_t>(), pkt->
getSize());
669 for (
int idx = 0; idx <
sdmaIds.size(); ++idx) {
687 for (
auto& [range, pm4_proc] :
pm4Ranges) {
688 if (range.contains(
offset)) {
705 nbio.writeMMIO(pkt, aperture_offset);
708 gfx.writeMMIO(pkt, aperture_offset);
738 panic(
"Request with address out of mapped range!");
772 panic(
"Request with address out of mapped range!");
807 uint32_t pkt_data = 0;
808 RequestPtr request = std::make_shared<Request>(fixup_addr,
814 fixup_addr, pkt->
getLE<uint32_t>());
816 pkt_data = pkt->
getLE<uint32_t>();
828 uint32_t pkt_data = value;
887 uint64_t doorbells_size =
doorbells.size();
888 uint64_t sdma_engs_size =
sdmaEngs.size();
889 uint64_t used_vmid_map_size =
usedVMIDs.size();
897 auto doorbells_offset = std::make_unique<uint32_t[]>(doorbells_size);
898 auto doorbells_queues = std::make_unique<QueueType[]>(doorbells_size);
899 auto doorbells_ip_ids = std::make_unique<int[]>(doorbells_size);
900 auto sdma_engs_offset = std::make_unique<uint32_t[]>(sdma_engs_size);
901 auto sdma_engs = std::make_unique<int[]>(sdma_engs_size);
902 auto used_vmids = std::make_unique<int[]>(used_vmid_map_size);
903 auto used_queue_id_sizes = std::make_unique<int[]>(used_vmid_map_size);
908 doorbells_offset[idx] = it.first;
909 doorbells_queues[idx] = it.second.qtype;
910 doorbells_ip_ids[idx] = it.second.ip_id;
916 sdma_engs_offset[idx] = it.first;
917 sdma_engs[idx] = it.second->getId();
923 used_vmids[idx] = it.first;
924 used_queue_id_sizes[idx] = it.second.size();
926 used_vmid_sets.insert(used_vmid_sets.end(),
927 set_vector.begin(), set_vector.end());
931 int num_queue_id = used_vmid_sets.size();
932 auto vmid_array = std::make_unique<int[]>(num_queue_id);
933 std::copy(used_vmid_sets.begin(), used_vmid_sets.end(), vmid_array.get());
951 gpuvm.serializeSection(
cp,
"GPUVM");
960 uint64_t doorbells_size = 0;
961 uint64_t sdma_engs_size = 0;
962 uint64_t used_vmid_map_size = 0;
969 if (doorbells_size > 0) {
970 auto doorbells_offset = std::make_unique<uint32_t[]>(doorbells_size);
971 auto doorbells_queues = std::make_unique<QueueType[]>(doorbells_size);
972 auto doorbells_ip_ids = std::make_unique<int[]>(doorbells_size);
978 for (
int idx = 0; idx < doorbells_size; ++idx) {
979 doorbells[doorbells_offset[idx]].qtype = doorbells_queues[idx];
980 doorbells[doorbells_offset[idx]].ip_id = doorbells_ip_ids[idx];
984 if (sdma_engs_size > 0) {
985 auto sdma_engs_offset = std::make_unique<uint32_t[]>(sdma_engs_size);
986 auto sdma_engs = std::make_unique<int[]>(sdma_engs_size);
991 for (
int idx = 0; idx < sdma_engs_size; ++idx) {
992 int sdma_id = sdma_engs[idx];
993 assert(
sdmaIds.count(sdma_id));
995 sdmaEngs.insert(std::make_pair(sdma_engs_offset[idx], sdma));
999 if (used_vmid_map_size > 0) {
1000 auto used_vmids = std::make_unique<int[]>(used_vmid_map_size);
1001 auto used_queue_id_sizes = std::make_unique<int[]>(used_vmid_map_size);
1002 int num_queue_id = 0;
1006 auto vmid_array = std::make_unique<int[]>(num_queue_id);
1015 for (
int it = 0; it < used_vmid_map_size; it++) {
1016 int vmid = used_vmids[it];
1017 int vmid_set_size = used_queue_id_sizes[it];
1018 for (
int j = 0; j < vmid_set_size; j++) {
1019 usedVMIDs[vmid].insert(vmid_array[idx + j]);
1021 idx += vmid_set_size;
1027 gpuvm.unserializeSection(
cp,
"GPUVM");
1042 panic(
"All VMIDs have been assigned");
1055 assert(result !=
idMap.end());
1056 if (result ==
idMap.end())
return;
1057 uint16_t vmid = result->second;
1059 idMap.erase(result);
1070 it.second->deallocateRLCQueues(unmap_static);
1088std::unordered_map<uint16_t, std::set<int>>&
AbstractMemory declaration.
#define AMDGPU_MP0_SMN_C2PMSG_33
#define VEGA10_FB_LOCATION_BASE
#define VEGA10_FB_LOCATION_TOP
#define MI200_MEM_SIZE_REG
#define MI200_FB_LOCATION_TOP
#define MI300X_FB_LOCATION_TOP
#define MI100_FB_LOCATION_BASE
#define MI200_FB_LOCATION_BASE
#define MI300X_MEM_SIZE_REG
#define MI100_FB_LOCATION_TOP
#define MI300X_FB_LOCATION_BASE
#define MI100_MEM_SIZE_REG
void insertQId(uint16_t vmid, int id)
std::unordered_map< AddrRange, PM4PacketProcessor *, AddrRangeHasher > pm4Ranges
void deallocateAllQueues(bool unmap_static)
std::unordered_map< Addr, uint16_t > doorbellVMIDMap
std::unordered_map< uint16_t, uint16_t > idMap
void readMMIO(PacketPtr pkt, Addr offset)
void serialize(CheckpointOut &cp) const override
Checkpoint support.
void processPendingDoorbells(uint32_t offset)
AddrRangeList getAddrRanges() const override
Every PIO device is obliged to provide an implementation that returns the address ranges the device responds to.
void unserialize(CheckpointIn &cp) override
Unserialize an object.
void writeMMIO(PacketPtr pkt, Addr offset)
void setDoorbellType(uint32_t offset, QueueType qt, int ip_id=0)
Set handles to GPU blocks.
Tick writeDevice(PacketPtr pkt) override
Write to the PCI device.
void readROM(PacketPtr pkt)
AddrRange romRange
VGA ROM methods.
std::unordered_map< uint32_t, DoorbellInfo > doorbells
Structures to hold registers, doorbells, and some frame memory.
std::unordered_map< uint16_t, std::set< int > > & getUsedVMIDs()
bool isROM(Addr addr) const
void unsetDoorbell(uint32_t offset)
std::unordered_map< uint32_t, PacketPtr > pendingDoorbellPkts
void setRegVal(uint64_t addr, uint32_t value)
std::unordered_map< uint32_t, AddrRange > sdmaMmios
Tick readDevice(PacketPtr pkt) override
Read from the PCI device.
void(SDMAEngine::* sdmaFuncPtr)(uint32_t)
SDMAEngine * getSDMAEngine(Addr offset)
AMDGPUMemoryManager * gpuMemMgr
AMDGPUDevice(const AMDGPUDeviceParams &p)
void readDoorbell(PacketPtr pkt, Addr offset)
AMDGPUNbio nbio
Blocks of the GPU.
Tick readConfig(PacketPtr pkt) override
Read from the PCI config space data that is stored locally.
std::unordered_map< uint32_t, sdmaFuncPtr > sdmaFunc
std::unordered_map< uint16_t, std::set< int > > usedVMIDs
AMDGPUInterruptHandler * deviceIH
Tick writeConfig(PacketPtr pkt) override
Methods inherited from PciEndpoint.
AMDMMIOReader mmioReader
MMIO reader to populate device registers map.
bool checkpoint_before_mmios
Initial checkpoint support variables.
void dispatchAccess(PacketPtr pkt, bool read)
Convert a PCI packet into a response.
uint32_t getRegVal(uint64_t addr)
Register value getter/setter.
void deallocateVmid(uint16_t vmid)
void mapDoorbellToVMID(Addr doorbell, uint16_t vmid)
void intrPost()
Methods inherited from PciEndpoint.
void readFrame(PacketPtr pkt, Addr offset)
Helper methods to handle specific BAR read/writes.
void writeROM(PacketPtr pkt)
void writeDoorbell(PacketPtr pkt, Addr offset)
RequestorID vramRequestorId()
Methods related to translations and system/device memory.
std::unordered_map< uint32_t, SDMAEngine * > sdmaIds
uint16_t allocateVMID(uint16_t pasid)
std::unordered_map< int, PM4PacketProcessor * > pm4PktProcs
void deallocatePasid(uint16_t pasid)
SDMAEngine * getSDMAById(int id)
void writeFrame(PacketPtr pkt, Addr offset)
void setSDMAEngine(Addr offset, SDMAEngine *eng)
memory::PhysicalMemory deviceMem
std::unordered_map< uint32_t, SDMAEngine * > sdmaEngs
GPUCommandProcessor * CP()
The AddrRange class encapsulates an address range, and supports a number of tests to check if two ranges intersect, if a range contains a specific address etc.
A Packet is used to encapsulate a transfer between two objects in the memory system (e.g., the L1 and L2 cache).
void setUintX(uint64_t w, ByteOrder endian)
Set the value in the word w after truncating it to the length of the packet and then byteswapping it ...
void setLE(T v)
Set the value in the data pointer to v as little endian.
static PacketPtr createWrite(const RequestPtr &req)
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
T * getPtr()
get a pointer to the data ptr.
static PacketPtr createRead(const RequestPtr &req)
Constructor-like methods that return Packets based on Request objects.
RequestPtr req
A pointer to the original request.
uint64_t getUintX(ByteOrder endian) const
Get the data in the packet byte swapped from the specified endianness and zero-extended to 64 bits.
const T * getConstPtr() const
void dataDynamic(T *p)
Set the data pointer to a value that should have delete [] called on it.
void makeAtomicResponse()
MemCmd cmd
The command field of the packet.
T getLE() const
Get the data in the packet byte swapped from little endian to host endian.
void setSuppressFuncError()
void serialize(CheckpointOut &cp) const override
Serialize this object to the given output stream.
Tick read(PacketPtr pkt) final
Final implementation of read access from PioDevice.
bool getBAR(Addr addr, int &num, Addr &offs)
Which base address register (if any) maps the given address?
virtual Tick readConfig(PacketPtr pkt)
Read from the PCI config space data that is stored locally.
const PciDevAddr _devAddr
PCIConfigType0 & config()
void unserialize(CheckpointIn &cp) override
Reconstruct the state of this object from a checkpoint.
PciEndpoint(const PciEndpointParams &params)
Constructor for PCI Dev.
Tick writeConfig(PacketPtr pkt) override
Write to the PCI config space data that is stored locally.
virtual AddrRangeList getAddrRanges() const =0
Every PIO device is obliged to provide an implementation that returns the address ranges the device responds to.
System DMA Engine class for AMD dGPU.
void setGfxRptrLo(uint32_t data)
void setGfxWptrLo(uint32_t data)
void setGfxRptrHi(uint32_t data)
void processRLC(Addr doorbellOffset, Addr wptrOffset)
void setGfxSize(uint32_t data)
void setGfxBaseLo(uint32_t data)
void processGfx(Addr wptrOffset)
Given a new write ptr offset, communicated to the GPU through a doorbell write, the SDMA engine proce...
void setGfxWptrHi(uint32_t data)
void setGfxDoorbellOffsetLo(uint32_t data)
void processPage(Addr wptrOffset)
void setGfxDoorbellLo(uint32_t data)
void setGfxBaseHi(uint32_t data)
The GPUCommandProcessor (CP) is responsible for accepting commands, in the form of HSA AQL packets,...
AddrRange RangeSize(Addr start, Addr size)
std::list< AddrRange > AddrRangeList
Convenience typedef for a collection of address ranges.
Addr start() const
Get the start address of the range.
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
#define panic(...)
This implements a cprintf based panic() function.
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
#define UNSERIALIZE_UNIQUE_PTR_ARRAY(member, size)
#define SERIALIZE_UNIQUE_PTR_ARRAY(member, size)
Copyright (c) 2024 Arm Limited All rights reserved.
std::shared_ptr< Request > RequestPtr
Tick curTick()
The universal simulation clock.
std::ostream CheckpointOut
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
void exitSimLoop(const std::string &message, int exit_code, Tick when, Tick repeat, bool serialize)
The "old style" exitSimLoop functions.
uint64_t Tick
Tick count type.
constexpr uint32_t ROM_SIZE
static constexpr uint32_t IH_OFFSET_SHIFT
static constexpr uint32_t MMHUB_OFFSET_SHIFT
static constexpr int AMDGPU_VM_COUNT
constexpr int FRAMEBUFFER_BAR
static constexpr uint32_t SMU_OFFSET_SHIFT
constexpr int DOORBELL_BAR
constexpr uint32_t VGA_ROM_DEFAULT
static constexpr uint32_t GRBM_OFFSET_SHIFT
Declaration of the Packet class.
#define PCI0_ROM_BASE_ADDR
#define PCI_INTERRUPT_PIN
#define PCI_DEVICE_SPECIFIC
#define UNSERIALIZE_SCALAR(scalar)
#define SERIALIZE_SCALAR(scalar)
const std::string & name()
Defines the PCI Express capability register and its associated bitfields for a PCIe device.