| gem5 v23.0.0.1
    | 
#include <gpu_compute_driver.hh>
 
  
| Classes | |
| class | DriverWakeupEvent | 
| class | EventList | 
| class | EventTableEntry | 
| Public Types | |
| typedef GPUComputeDriverParams | Params | 
| typedef class EventTableEntry | ETEntry | 
|  Public Types inherited from gem5::SimObject | |
| typedef SimObjectParams | Params | 
| Public Member Functions | |
| GPUComputeDriver (const Params &p) | |
| int | ioctl (ThreadContext *tc, unsigned req, Addr ioc_buf) override | 
| Abstract method, invoked when the user program calls ioctl() on the file descriptor returned by a previous open(). | |
| int | open (ThreadContext *tc, int mode, int flags) override | 
| Create an FD entry for the KFD inside of the owning process. | |
| Addr | mmap (ThreadContext *tc, Addr start, uint64_t length, int prot, int tgt_flags, int tgt_fd, off_t offset) override | 
| Currently, mmap() will simply set up a mapping for the associated device's packet processor's doorbells and create the event page. | |
| virtual void | signalWakeupEvent (uint32_t event_id) | 
| void | sleepCPU (ThreadContext *tc, uint32_t milliSecTimeout) | 
| void | setMtype (RequestPtr req) | 
| Called by the compute units right before a request is issued to ruby. | |
| int | doorbellSize () | 
| GfxVersion | getGfxVersion () const | 
|  Public Member Functions inherited from gem5::EmulatedDriver | |
| EmulatedDriver (const EmulatedDriverParams &p) | |
| bool | match (const std::string &s) const | 
| Check for a match with this driver's filename. | |
| virtual int | open (ThreadContext *tc, int mode, int flags)=0 | 
| Abstract method, invoked when the user program calls open() on the device driver. | |
| virtual int | ioctl (ThreadContext *tc, unsigned req, Addr buf)=0 | 
| Abstract method, invoked when the user program calls ioctl() on the file descriptor returned by a previous open(). | |
| virtual Addr | mmap (ThreadContext *tc, Addr start, uint64_t length, int prot, int tgtFlags, int tgtFd, off_t offset) | 
| Virtual method, invoked when the user program calls mmap() on the file descriptor returned by a previous open(). | |
|  Public Member Functions inherited from gem5::SimObject | |
| const Params & | params () const | 
| SimObject (const Params &p) | |
| virtual | ~SimObject () | 
| virtual void | init () | 
| init() is called after all C++ SimObjects have been created and all ports are connected. | |
| virtual void | loadState (CheckpointIn &cp) | 
| loadState() is called on each SimObject when restoring from a checkpoint. | |
| virtual void | initState () | 
| initState() is called on each SimObject when not restoring from a checkpoint. | |
| virtual void | regProbePoints () | 
| Register probe points for this object. | |
| virtual void | regProbeListeners () | 
| Register probe listeners for this object. | |
| ProbeManager * | getProbeManager () | 
| Get the probe manager for this object. | |
| virtual Port & | getPort (const std::string &if_name, PortID idx=InvalidPortID) | 
| Get a port with a given name and index. | |
| virtual void | startup () | 
| startup() is the final initialization call before simulation. | |
| DrainState | drain () override | 
| Provide a default implementation of the drain interface for objects that don't need draining. | |
| virtual void | memWriteback () | 
| Write back dirty buffers to memory using functional writes. | |
| virtual void | memInvalidate () | 
| Invalidate the contents of memory buffers. | |
| void | serialize (CheckpointOut &cp) const override | 
| Serialize an object. | |
| void | unserialize (CheckpointIn &cp) override | 
| Unserialize an object. | |
|  Public Member Functions inherited from gem5::EventManager | |
| EventQueue * | eventQueue () const | 
| void | schedule (Event &event, Tick when) | 
| void | deschedule (Event &event) | 
| void | reschedule (Event &event, Tick when, bool always=false) | 
| void | schedule (Event *event, Tick when) | 
| void | deschedule (Event *event) | 
| void | reschedule (Event *event, Tick when, bool always=false) | 
| void | wakeupEventQueue (Tick when=(Tick) -1) | 
| This function is not needed by the usual gem5 event loop but may be necessary in derived EventQueues which host gem5 on other schedulers. | |
| void | setCurTick (Tick newVal) | 
| EventManager (EventManager &em) | |
| Event manager manages events in the event queue. | |
| EventManager (EventManager *em) | |
| EventManager (EventQueue *eq) | |
|  Public Member Functions inherited from gem5::Serializable | |
| Serializable () | |
| virtual | ~Serializable () | 
| virtual void | serialize (CheckpointOut &cp) const =0 | 
| Serialize an object. | |
| virtual void | unserialize (CheckpointIn &cp)=0 | 
| Unserialize an object. | |
| void | serializeSection (CheckpointOut &cp, const char *name) const | 
| Serialize an object into a new section. | |
| void | serializeSection (CheckpointOut &cp, const std::string &name) const | 
| void | unserializeSection (CheckpointIn &cp, const char *name) | 
| Unserialize a child object. | |
| void | unserializeSection (CheckpointIn &cp, const std::string &name) | 
|  Public Member Functions inherited from gem5::Drainable | |
| DrainState | drainState () const | 
| Return the current drain state of an object. | |
| virtual void | notifyFork () | 
| Notify a child process of a fork. | |
|  Public Member Functions inherited from gem5::statistics::Group | |
| Group (Group *parent, const char *name=nullptr) | |
| Construct a new statistics group. | |
| virtual | ~Group () | 
| virtual void | regStats () | 
| Callback to set stat parameters. | |
| virtual void | resetStats () | 
| Callback to reset stats. | |
| virtual void | preDumpStats () | 
| Callback before stats are dumped. | |
| void | addStat (statistics::Info *info) | 
| Register a stat with this group. | |
| const std::map< std::string, Group * > & | getStatGroups () const | 
| Get all child groups associated with this object. | |
| const std::vector< Info * > & | getStats () const | 
| Get all stats associated with this object. | |
| void | addStatGroup (const char *name, Group *block) | 
| Add a stat block as a child of this block. | |
| const Info * | resolveStat (std::string name) const | 
| Resolve a stat by its name within this group. | |
| void | mergeStatGroup (Group *block) | 
| Merge the contents (stats & children) of a block to this block. | |
| Group (const Group &)=delete | |
| Group & | operator= (const Group &)=delete | 
|  Public Member Functions inherited from gem5::Named | |
| Named (const std::string &name_) | |
| virtual | ~Named ()=default | 
| virtual std::string | name () const | 
| Private Types | |
| enum | MtypeFlags { SHARED = 0 , READ_WRITE = 1 , CACHED = 2 , NUM_MTYPE_BITS } | 
| Mtype bits {Cached, Read Write, Shared} for caches.  More... | |
| Private Member Functions | |
| void | registerUncacheableMemory (Addr start, Addr length) | 
| Register a region of host memory as uncacheable from the perspective of the dGPU. | |
| Addr | gpuVmApeBase (int gpuNum) const | 
| The aperture (APE) base/limit pairs are set statically at startup by the real KFD. | |
| Addr | gpuVmApeLimit (Addr apeBase) const | 
| Addr | scratchApeBase (int gpuNum) const | 
| Addr | scratchApeBaseV9 () const | 
| Addr | scratchApeLimit (Addr apeBase) const | 
| Addr | ldsApeBase (int gpuNum) const | 
| Addr | ldsApeBaseV9 () const | 
| Addr | ldsApeLimit (Addr apeBase) const | 
| void | allocateGpuVma (Request::CacheCoherenceFlags mtype, Addr start, Addr length) | 
| Allocate/deallocate GPUVM VMAs for tracking virtual address allocations and properties on DGPUs. | |
| Addr | deallocateGpuVma (Addr start) | 
| void | allocateQueue (PortProxy &mem_proxy, Addr ioc_buf_addr) | 
| Forward relevant parameters to packet processor; queueId is used to link doorbell. | |
| Private Attributes | |
| GPUCommandProcessor * | device | 
| GPU that is controlled by this driver. | |
| uint32_t | queueId | 
| bool | isdGPU | 
| GfxVersion | gfxVersion | 
| int | dGPUPoolID | 
| Addr | eventPage | 
| uint32_t | eventSlotIndex | 
| std::unordered_map< uint32_t, ETEntry > | ETable | 
| AddrRangeMap< Request::CacheCoherenceFlags, 1 > | gpuVmas | 
| VMA structures for GPUVM memory. | |
| Request::CacheCoherenceFlags | defaultMtype | 
| std::unordered_map< ThreadContext *, EventList > | TCEvents | 
| Additional Inherited Members | |
|  Static Public Member Functions inherited from gem5::SimObject | |
| static void | serializeAll (const std::string &cpt_dir) | 
| Create a checkpoint by serializing all SimObjects in the system. | |
| static SimObject * | find (const char *name) | 
| Find the SimObject with the given name and return a pointer to it. | |
| static void | setSimObjectResolver (SimObjectResolver *resolver) | 
| There is a single object name resolver, and it is only set when simulation is restoring from checkpoints. | |
| static SimObjectResolver * | getSimObjectResolver () | 
| There is a single object name resolver, and it is only set when simulation is restoring from checkpoints. | |
|  Static Public Member Functions inherited from gem5::Serializable | |
| static const std::string & | currentSection () | 
| Gets the fully-qualified name of the active section. | |
| static void | generateCheckpointOut (const std::string &cpt_dir, std::ofstream &outstream) | 
| Generate a checkpoint file so that the serialization can be routed to it. | |
|  Protected Member Functions inherited from gem5::Drainable | |
| Drainable () | |
| virtual | ~Drainable () | 
| virtual DrainState | drain ()=0 | 
| Draining is the process of clearing out the states of SimObjects. These are the SimObjects that are partially executed or are partially in flight. | |
| virtual void | drainResume () | 
| Resume execution after a successful drain. | |
| void | signalDrainDone () const | 
| Signal that an object is drained. | |
|  Protected Attributes inherited from gem5::EmulatedDriver | |
| const std::string & | filename | 
| filename for opening this driver (under /dev) | |
|  Protected Attributes inherited from gem5::SimObject | |
| const SimObjectParams & | _params | 
| Cached copy of the object parameters. | |
|  Protected Attributes inherited from gem5::EventManager | |
| EventQueue * | eventq | 
| A pointer to this object's event queue. | |
Definition at line 62 of file gpu_compute_driver.hh.
| typedef class EventTableEntry gem5::GPUComputeDriver::ETEntry | 
Definition at line 143 of file gpu_compute_driver.hh.
| typedef GPUComputeDriverParams gem5::GPUComputeDriver::Params | 
Definition at line 65 of file gpu_compute_driver.hh.
| 
 | private | 
Mtype bits {Cached, Read Write, Shared} for caches.
| Enumerator | |
|---|---|
| SHARED | |
| READ_WRITE | |
| CACHED | |
| NUM_MTYPE_BITS | |
Definition at line 169 of file gpu_compute_driver.hh.
| gem5::GPUComputeDriver::GPUComputeDriver | ( | const Params & | p | ) | 
Definition at line 60 of file gpu_compute_driver.cc.
References gem5::GPUCommandProcessor::attachDriver(), CACHED, gem5::Request::CACHED, defaultMtype, device, DPRINTF, gem5::MipsISA::p, READ_WRITE, gem5::Request::READ_WRITE, gem5::Flags< T >::set(), SHARED, and gem5::Request::SHARED.
| 
 | private | 
Allocate/deallocate GPUVM VMAs for tracking virtual address allocations and properties on DGPUs.
For now, we use these to track MTYPE and to be able to select which pages to unmap when the user provides us with a handle during the free ioctl.
Definition at line 997 of file gpu_compute_driver.cc.
References DPRINTF, gem5::AddrRange::end(), fatal_if, gpuVmas, and gem5::AddrRange::start().
Referenced by ioctl().
Forward relevant parameters to packet processor; queueId is used to link doorbell.
The queueIDs are not re-used in the current implementation, and we allocate only one page (4096 bytes) for doorbells, so check if this queueID can be mapped into that page.
Definition at line 159 of file gpu_compute_driver.cc.
References gem5::BaseBufferArg::copyIn(), gem5::BaseBufferArg::copyOut(), device, doorbellSize(), fatal, gfxVersion, gem5::GPUCommandProcessor::hsaPacketProc(), KFD_MMAP_GPU_ID, KFD_MMAP_TYPE_DOORBELL, gem5::Named::name(), PAGE_SHIFT, queueId, and gem5::HSAPacketProcessor::setDeviceQueueDesc().
Referenced by ioctl().
Definition at line 1009 of file gpu_compute_driver.cc.
References DPRINTF, and gpuVmas.
Referenced by ioctl().
| 
 | inline | 
Definition at line 86 of file gpu_compute_driver.hh.
References fatal, and gfxVersion.
Referenced by allocateQueue(), ioctl(), and gem5::HSAPacketProcessor::write().
| 
 | inline | 
Definition at line 145 of file gpu_compute_driver.hh.
References gfxVersion.
Referenced by gem5::GPUCommandProcessor::submitDispatchPkt().
| 
 | private | 
The aperture (APE) base/limit pairs are set statically at startup by the real KFD.
AMD x86_64 CPUs only use the areas in the 64b address space where VA[63:47] == 0x1ffff or VA[63:47] == 0. These methods generate the APE base/limit pairs in exactly the same way as the real KFD does, which ensures these APEs do not fall into the CPU's address space.
See the macros in the KFD driver in the ROCm Linux kernel source:
drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
Definition at line 945 of file gpu_compute_driver.cc.
Referenced by ioctl().
Definition at line 951 of file gpu_compute_driver.cc.
Referenced by ioctl().
| 
 | overridevirtual | 
Abstract method, invoked when the user program calls ioctl() on the file descriptor returned by a previous open().
The parameters are the same as those passed in to ioctlFunc() (q.v.).
This is where the runtime requests MTYPE from an aperture. Basically, the global memory aperture is divided up into a default aperture and an alternate aperture, each of which has its own MTYPE policies. This is done to mark a small piece of the global memory as uncacheable. Host memory mappings will be carved out of this uncacheable aperture, which is how they implement 'coherent' host/device memory on dGPUs.
TODO: Need to reflect per-aperture MTYPE policies based on this call.
Derive all clock counters based on the tick. All device clocks are identical and perfectly in sync.
Set the GPUVM/LDS/Scratch APEs exactly as they are in the real driver, see the KFD driver in the ROCm Linux kernel source: drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
While the GPU node numbers start at 0, we add 1 to force the count to start at 1. This is to ensure that the base/limit addresses are calculated correctly.
The CPU's 64b address space can only use the areas with VA[63:47] == 0x1ffff or VA[63:47] == 0, therefore we must ensure that the apertures do not fall in the CPU's address space.
In real hardware, this IOCTL maps host memory, dGPU memory, or dGPU doorbells into GPUVM space. Essentially, ROCm implements SVM by carving out a region of free VA space that both the host and GPUVM can agree upon. The entire GPU VA space is reserved on the host using a fixed mmap at a low VA range that is also directly accessible by the GPU's limited number of VA bits. When we actually call memory allocation later in the program, this IOCTL is invoked to create BOs/VMAs in the driver and bind them to physical memory/doorbells.
For gem5, we don't need to carve out any GPUVM space here (we don't support GPUVM and use host page tables on the GPU directly). We can just use the existing host SVM region. We comment on each memory type separately.
Called to map an already allocated region of memory to this GPU's GPUVM VA space. We don't need to implement this in the simulator since we only have a single VM system. If the region has already been allocated somewhere like the CPU, then it's already visible to the device.
Implements gem5::EmulatedDriver.
Definition at line 229 of file gpu_compute_driver.cc.
References allocateGpuVma(), allocateQueue(), AMDKFD_IOC_ACQUIRE_VM, AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, AMDKFD_IOC_ALLOC_QUEUE_GWS, AMDKFD_IOC_CREATE_EVENT, AMDKFD_IOC_CREATE_QUEUE, AMDKFD_IOC_DBG_ADDRESS_WATCH, AMDKFD_IOC_DBG_REGISTER, AMDKFD_IOC_DBG_UNREGISTER, AMDKFD_IOC_DBG_WAVE_CONTROL, AMDKFD_IOC_DESTROY_EVENT, AMDKFD_IOC_DESTROY_QUEUE, AMDKFD_IOC_FREE_MEMORY_OF_GPU, AMDKFD_IOC_GET_CLOCK_COUNTERS, AMDKFD_IOC_GET_DMABUF_INFO, AMDKFD_IOC_GET_PROCESS_APERTURES, AMDKFD_IOC_GET_PROCESS_APERTURES_NEW, AMDKFD_IOC_GET_QUEUE_WAVE_STATE, AMDKFD_IOC_GET_TILE_CONFIG, AMDKFD_IOC_GET_VERSION, AMDKFD_IOC_IMPORT_DMABUF, AMDKFD_IOC_MAP_MEMORY_TO_GPU, AMDKFD_IOC_RESET_EVENT, AMDKFD_IOC_SET_CU_MASK, AMDKFD_IOC_SET_EVENT, AMDKFD_IOC_SET_MEMORY_POLICY, AMDKFD_IOC_SET_SCRATCH_BACKING_VA, AMDKFD_IOC_SET_TRAP_HANDLER, AMDKFD_IOC_SMI_EVENTS, AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, AMDKFD_IOC_UPDATE_QUEUE, AMDKFD_IOC_WAIT_EVENTS, gem5::Flags< T >::clear(), gem5::BaseBufferArg::copyIn(), gem5::BaseBufferArg::copyOut(), gem5::ThreadContext::cpuId(), gem5::curTick(), deallocateGpuVma(), defaultMtype, device, dGPUPoolID, gem5::divCeil(), doorbellSize(), DPRINTF, ETable, eventSlotIndex, fatal, fatal_if, gem5::BaseMMU::flushAll(), gem5::FullSystem, gem5::ThreadContext::getMMUPtr(), gem5::ThreadContext::getProcessPtr(), gfxVersion, gpuVmApeBase(), gpuVmApeLimit(), gem5::GPUCommandProcessor::hsaPacketProc(), gem5::ArmISA::i, isdGPU, KFD_IOC_ALLOC_MEM_FLAGS_COHERENT, KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL, KFD_IOC_ALLOC_MEM_FLAGS_GTT, KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, KFD_IOC_ALLOC_MEM_FLAGS_VRAM, KFD_IOC_EVENT_SIGNAL, KFD_IOCTL_MAJOR_VERSION, KFD_IOCTL_MINOR_VERSION, KFD_MMAP_TYPE_EVENTS, ldsApeBase(), ldsApeBaseV9(), ldsApeLimit(), gem5::Process::memState, gem5::sim_clock::as_int::ns, PAGE_SHIFT, gem5::X86ISA::PageBytes, panic_if, gem5::HSAPacketProcessor::pioAddr, queueId, scratchApeBase(), scratchApeBaseV9(), scratchApeLimit(), signalWakeupEvent(), gem5::BaseBufferArg::size, sleepCPU(), SLOTS_PER_PAGE, 
TCEvents, gem5::HSAPacketProcessor::unsetDeviceQueueDesc(), and warn.
| 
 | private | 
Definition at line 977 of file gpu_compute_driver.cc.
Referenced by ioctl().
| 
 | private | 
Definition at line 985 of file gpu_compute_driver.cc.
Referenced by ioctl().
Definition at line 991 of file gpu_compute_driver.cc.
Referenced by ioctl().
| 
 | overridevirtual | 
Currently, mmap() will simply set up a mapping for the associated device's packet processor's doorbells and create the event page.
We don't actually access these pages. We just need to reserve some VA space. See commit id 5ce8abce for details on how events are currently implemented.
Reimplemented from gem5::EmulatedDriver.
Definition at line 108 of file gpu_compute_driver.cc.
References device, DPRINTF, eventPage, gem5::ThreadContext::getProcessPtr(), gem5::GPUCommandProcessor::hsaPacketProc(), KFD_MMAP_TYPE_DOORBELL, KFD_MMAP_TYPE_EVENTS, KFD_MMAP_TYPE_MASK, KFD_SIGNAL_EVENT_LIMIT, gem5::Process::memState, gem5::ArmISA::offset, PAGE_SHIFT, panic_if, gem5::HSAPacketProcessor::pioAddr, and warn_once.
| 
 | overridevirtual | 
Create an FD entry for the KFD inside of the owning process.
Implements gem5::EmulatedDriver.
Definition at line 94 of file gpu_compute_driver.cc.
References DPRINTF, gem5::EmulatedDriver::filename, and gem5::ThreadContext::getProcessPtr().
Register a region of host memory as uncacheable from the perspective of the dGPU.
| 
 | private | 
Definition at line 957 of file gpu_compute_driver.cc.
Referenced by ioctl().
| 
 | private | 
Definition at line 965 of file gpu_compute_driver.cc.
Referenced by ioctl().
Definition at line 971 of file gpu_compute_driver.cc.
Referenced by ioctl().
| void gem5::GPUComputeDriver::setMtype | ( | RequestPtr | req | ) | 
Called by the compute units right before a request is issued to ruby.
This uses our VMAs to correctly set the MTYPE on a per-request basis. In real hardware, this is actually done through PTE bits in GPUVM. Since we are running a single VM (x86 PT) system, the MTYPE bits aren't available. Adding GPUVM specific bits to x86 page tables probably isn't the best way to proceed. For now we just have the driver set these until we implement a proper dual PT system.
Definition at line 1022 of file gpu_compute_driver.cc.
References defaultMtype, DPRINTF, gem5::AddrRange::end(), gem5::FullSystem, gpuVmas, isdGPU, gem5::RangeSize(), and gem5::AddrRange::start().
Referenced by gem5::ComputeUnit::sendRequest().
| 
 | virtual | 
Definition at line 191 of file gpu_compute_driver.cc.
References DPRINTF, ETable, eventSlotIndex, panic_if, and TCEvents.
Referenced by ioctl(), and gem5::GPUCommandProcessor::signalWakeupEvent().
| void gem5::GPUComputeDriver::sleepCPU | ( | ThreadContext * | tc, | 
| uint32_t | milliSecTimeout | ||
| ) | 
Definition at line 933 of file gpu_compute_driver.cc.
References gem5::ThreadContext::cpuId(), DPRINTF, gem5::ThreadContext::suspend(), and TCEvents.
Referenced by ioctl().
| 
 | private | 
Definition at line 177 of file gpu_compute_driver.hh.
Referenced by GPUComputeDriver(), ioctl(), and setMtype().
| 
 | private | 
GPU that is controlled by this driver.
Definition at line 151 of file gpu_compute_driver.hh.
Referenced by allocateQueue(), GPUComputeDriver(), ioctl(), and mmap().
| 
 | private | 
Definition at line 155 of file gpu_compute_driver.hh.
Referenced by ioctl().
| 
 | private | 
Definition at line 159 of file gpu_compute_driver.hh.
Referenced by gem5::GPUComputeDriver::EventList::clearEvents(), ioctl(), and signalWakeupEvent().
| 
 | private | 
Definition at line 156 of file gpu_compute_driver.hh.
Referenced by mmap().
| 
 | private | 
Definition at line 157 of file gpu_compute_driver.hh.
Referenced by gem5::GPUComputeDriver::EventList::clearEvents(), ioctl(), and signalWakeupEvent().
| 
 | private | 
Definition at line 154 of file gpu_compute_driver.hh.
Referenced by allocateQueue(), doorbellSize(), getGfxVersion(), and ioctl().
| 
 | private | 
VMA structures for GPUVM memory.
Definition at line 164 of file gpu_compute_driver.hh.
Referenced by allocateGpuVma(), deallocateGpuVma(), and setMtype().
| 
 | private | 
Definition at line 153 of file gpu_compute_driver.hh.
Referenced by ioctl(), and setMtype().
| 
 | private | 
Definition at line 152 of file gpu_compute_driver.hh.
Referenced by allocateQueue(), and ioctl().
| 
 | private | 
Definition at line 208 of file gpu_compute_driver.hh.
Referenced by ioctl(), signalWakeupEvent(), and sleepCPU().