gem5 v24.1.0.1
|
#include <gpu_command_processor.hh>
Classes | |
struct | KernelDispatchData |
Public Types | |
enum | AgentCmd { Nop = 0 , Steal = 1 } |
typedef GPUCommandProcessorParams | Params |
typedef std::function< void(const uint64_t &)> | HsaSignalCallbackFunction |
Public Types inherited from gem5::DmaVirtDevice | |
typedef void(DmaDevice::* | DmaFnPtr) (Addr, int, Event *, uint8_t *, Tick) |
Public Types inherited from gem5::DmaDevice | |
typedef DmaDeviceParams | Params |
Public Types inherited from gem5::PioDevice | |
using | Params = PioDeviceParams |
Public Types inherited from gem5::ClockedObject | |
using | Params = ClockedObjectParams |
Parameters of ClockedObject. | |
Public Types inherited from gem5::SimObject | |
typedef SimObjectParams | Params |
Public Member Functions | |
GPUCommandProcessor ()=delete | |
GPUCommandProcessor (const Params &p) | |
HSAPacketProcessor & | hsaPacketProc () |
RequestorID | vramRequestorId () |
Forward the VRAM requestor ID needed for device memory from GPU device. | |
void | setGPUDevice (AMDGPUDevice *gpu_device) |
void | setShader (Shader *shader) |
Shader * | shader () |
GPUComputeDriver * | driver () |
void | performTimingRead (PacketPtr pkt) |
void | completeTimingRead () |
void | submitAgentDispatchPkt (void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr) |
submitAgentDispatchPkt() is for accepting agent dispatch packets. | |
void | submitDispatchPkt (void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr) |
submitDispatchPkt() is the entry point into the CP from the HSAPP and is only meant to be used with AQL kernel dispatch packets. | |
void | submitVendorPkt (void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr) |
submitVendorPkt() is for accepting vendor-specific packets from the HSAPP. | |
void | attachDriver (GPUComputeDriver *driver) |
void | dispatchKernelObject (AMDKernelCode *akc, void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr) |
void | dispatchPkt (HSAQueueEntry *task) |
Once the CP has finished extracting all relevant information about a task and has initialized the ABI state, we send a description of the task to the dispatcher. | |
void | signalWakeupEvent (uint32_t event_id) |
Tick | write (PacketPtr pkt) override |
Pure virtual function that the device must implement. | |
Tick | read (PacketPtr pkt) override |
Pure virtual function that the device must implement. | |
AddrRangeList | getAddrRanges () const override |
Every PIO device is obliged to provide an implementation that returns the address ranges the device responds to. | |
System * | system () |
void | sendCompletionSignal (Addr signal_handle) |
void | updateHsaSignal (Addr signal_handle, uint64_t signal_value, HsaSignalCallbackFunction function=[](const uint64_t &) { }) |
void | updateHsaSignalAsync (Addr signal_handle, int64_t diff) |
void | updateHsaSignalData (Addr value_addr, int64_t diff, uint64_t *prev_value) |
void | updateHsaSignalDone (uint64_t *signal_value) |
void | updateHsaMailboxData (Addr signal_handle, uint64_t *mailbox_value) |
void | updateHsaEventData (Addr signal_handle, uint64_t *event_value) |
void | updateHsaEventTs (Addr signal_handle, amd_event_t *event_value) |
uint64_t | functionalReadHsaSignal (Addr signal_handle) |
Addr | getHsaSignalValueAddr (Addr signal_handle) |
Addr | getHsaSignalMailboxAddr (Addr signal_handle) |
Addr | getHsaSignalEventAddr (Addr signal_handle) |
Public Member Functions inherited from gem5::DmaVirtDevice | |
DmaVirtDevice (const Params &p) | |
virtual | ~DmaVirtDevice () |
void | dmaReadVirt (Addr host_addr, unsigned size, DmaCallback *cb, void *data, Tick delay=0) |
Initiate a DMA read from virtual address host_addr. | |
void | dmaWriteVirt (Addr host_addr, unsigned size, DmaCallback *b, void *data, Tick delay=0) |
Initiate a DMA write from virtual address host_addr. | |
void | dmaVirt (DmaFnPtr dmaFn, Addr host_addr, unsigned size, DmaCallback *cb, void *data, Tick delay=0) |
Initiate a call to DmaDevice using DmaFnPtr to do a DMA starting from virtual address host_addr for size number of bytes on the data. | |
Public Member Functions inherited from gem5::DmaDevice | |
DmaDevice (const Params &p) | |
virtual | ~DmaDevice ()=default |
void | dmaWrite (Addr addr, int size, Event *event, uint8_t *data, uint32_t sid, uint32_t ssid, Tick delay=0) |
void | dmaWrite (Addr addr, int size, Event *event, uint8_t *data, Tick delay=0) |
void | dmaRead (Addr addr, int size, Event *event, uint8_t *data, uint32_t sid, uint32_t ssid, Tick delay=0) |
void | dmaRead (Addr addr, int size, Event *event, uint8_t *data, Tick delay=0) |
bool | dmaPending () const |
void | init () override |
init() is called after all C++ SimObjects have been created and all ports are connected. | |
Addr | cacheBlockSize () const |
Port & | getPort (const std::string &if_name, PortID idx=InvalidPortID) override |
Get a port with a given name and index. | |
Public Member Functions inherited from gem5::PioDevice | |
PioDevice (const Params &p) | |
virtual | ~PioDevice () |
void | init () override |
init() is called after all C++ SimObjects have been created and all ports are connected. | |
Port & | getPort (const std::string &if_name, PortID idx=InvalidPortID) override |
Get a port with a given name and index. | |
Public Member Functions inherited from gem5::ClockedObject | |
ClockedObject (const ClockedObjectParams &p) | |
void | serialize (CheckpointOut &cp) const override |
Serialize an object. | |
void | unserialize (CheckpointIn &cp) override |
Unserialize an object. | |
Public Member Functions inherited from gem5::SimObject | |
const Params & | params () const |
SimObject (const Params &p) | |
virtual | ~SimObject () |
virtual void | loadState (CheckpointIn &cp) |
loadState() is called on each SimObject when restoring from a checkpoint. | |
virtual void | initState () |
initState() is called on each SimObject when not restoring from a checkpoint. | |
virtual void | regProbePoints () |
Register probe points for this object. | |
virtual void | regProbeListeners () |
Register probe listeners for this object. | |
ProbeManager * | getProbeManager () |
Get the probe manager for this object. | |
virtual void | startup () |
startup() is the final initialization call before simulation. | |
DrainState | drain () override |
Provide a default implementation of the drain interface for objects that don't need draining. | |
virtual void | memWriteback () |
Write back dirty buffers to memory using functional writes. | |
virtual void | memInvalidate () |
Invalidate the contents of memory buffers. | |
void | serialize (CheckpointOut &cp) const override |
Serialize an object. | |
void | unserialize (CheckpointIn &cp) override |
Unserialize an object. | |
Public Member Functions inherited from gem5::EventManager | |
EventQueue * | eventQueue () const |
void | schedule (Event &event, Tick when) |
void | deschedule (Event &event) |
void | reschedule (Event &event, Tick when, bool always=false) |
void | schedule (Event *event, Tick when) |
void | deschedule (Event *event) |
void | reschedule (Event *event, Tick when, bool always=false) |
void | wakeupEventQueue (Tick when=(Tick) -1) |
This function is not needed by the usual gem5 event loop but may be necessary in derived EventQueues which host gem5 on other schedulers. | |
void | setCurTick (Tick newVal) |
EventManager (EventManager &em) | |
Event manager manages events in the event queue. | |
EventManager (EventManager *em) | |
EventManager (EventQueue *eq) | |
Public Member Functions inherited from gem5::Serializable | |
Serializable () | |
virtual | ~Serializable () |
void | serializeSection (CheckpointOut &cp, const char *name) const |
Serialize an object into a new section. | |
void | serializeSection (CheckpointOut &cp, const std::string &name) const |
void | unserializeSection (CheckpointIn &cp, const char *name) |
Unserialize a child object. | |
void | unserializeSection (CheckpointIn &cp, const std::string &name) |
Public Member Functions inherited from gem5::Drainable | |
DrainState | drainState () const |
Return the current drain state of an object. | |
virtual void | notifyFork () |
Notify a child process of a fork. | |
Public Member Functions inherited from gem5::statistics::Group | |
Group (Group *parent, const char *name=nullptr) | |
Construct a new statistics group. | |
virtual | ~Group () |
virtual void | regStats () |
Callback to set stat parameters. | |
virtual void | resetStats () |
Callback to reset stats. | |
virtual void | preDumpStats () |
Callback before stats are dumped. | |
void | addStat (statistics::Info *info) |
Register a stat with this group. | |
const std::map< std::string, Group * > & | getStatGroups () const |
Get all child groups associated with this object. | |
const std::vector< Info * > & | getStats () const |
Get all stats associated with this object. | |
void | addStatGroup (const char *name, Group *block) |
Add a stat block as a child of this block. | |
const Info * | resolveStat (std::string name) const |
Resolve a stat by its name within this group. | |
void | mergeStatGroup (Group *block) |
Merge the contents (stats & children) of a block to this block. | |
Group (const Group &)=delete | |
Group & | operator= (const Group &)=delete |
Public Member Functions inherited from gem5::Named | |
Named (const std::string &name_) | |
virtual | ~Named ()=default |
virtual std::string | name () const |
Public Member Functions inherited from gem5::Clocked | |
void | updateClockPeriod () |
Update the tick to the current tick. | |
Tick | clockEdge (Cycles cycles=Cycles(0)) const |
Determine the tick when a cycle begins, by default the current one, but the argument also enables the caller to determine a future cycle. | |
Cycles | curCycle () const |
Determine the current cycle, corresponding to a tick aligned to a clock edge. | |
Tick | nextCycle () const |
Based on the clock of the object, determine the start tick of the first cycle that is at least one cycle in the future. | |
uint64_t | frequency () const |
Tick | clockPeriod () const |
double | voltage () const |
Cycles | ticksToCycles (Tick t) const |
Tick | cyclesToTicks (Cycles c) const |
Public Attributes | |
std::list< struct KernelDispatchData > | kernelDispatchList |
Public Attributes inherited from gem5::ClockedObject | |
PowerState * | powerState |
Private Types | |
typedef void(DmaDevice::* | DmaFnPtr) (Addr, int, Event *, uint8_t *, Tick) |
Private Member Functions | |
void | initABI (HSAQueueEntry *task) |
The CP is responsible for traversing all HSA-ABI-related data structures from memory and initializing the ABI state. | |
void | sanityCheckAKC (AMDKernelCode *akc) |
TranslationGenPtr | translate (Addr vaddr, Addr size) override |
Function used to translate a range of addresses from virtual to physical addresses. | |
void | ReadDispIdOffsetDmaEvent (HSAQueueEntry *task, const uint32_t &readDispIdOffset) |
Perform a DMA read of the read_dispatch_id_field_base_byte_offset field, which follows directly after the read_dispatch_id (the read pointer) in the amd_hsa_queue_t struct (aka memory queue descriptor (MQD)), to find the base address of the MQD. | |
void | MQDDmaEvent (HSAQueueEntry *task) |
Perform a DMA read of the MQD that corresponds to a hardware queue descriptor (HQD). | |
void | WaitScratchDmaEvent (HSAQueueEntry *task, const uint64_t &dmaBuffer) |
Poll on queue_inactive signal until the runtime can get around to taking care of our lack of scratch space. | |
Private Attributes | |
Shader * | _shader |
GPUDispatcher & | dispatcher |
GPUComputeDriver * | _driver |
AMDGPUDevice * | gpuDevice |
VegaISA::Walker * | walker |
HSAPacketProcessor * | hsaPP |
int | dynamic_task_id = 0 |
int | non_blit_kernel_id = 0 |
int | target_non_blit_kernel_id = 0 |
std::unordered_map< Addr, Tick > | dispatchStartTime |
Additional Inherited Members | |
Static Public Member Functions inherited from gem5::SimObject | |
static void | serializeAll (const std::string &cpt_dir) |
Create a checkpoint by serializing all SimObjects in the system. | |
static SimObject * | find (const char *name) |
Find the SimObject with the given name and return a pointer to it. | |
static void | setSimObjectResolver (SimObjectResolver *resolver) |
There is a single object name resolver, and it is only set when simulation is restoring from checkpoints. | |
static SimObjectResolver * | getSimObjectResolver () |
There is a single object name resolver, and it is only set when simulation is restoring from checkpoints. | |
Static Public Member Functions inherited from gem5::Serializable | |
static const std::string & | currentSection () |
Gets the fully-qualified name of the active section. | |
static void | generateCheckpointOut (const std::string &cpt_dir, std::ofstream &outstream) |
Generate a checkpoint file so that the serialization can be routed to it. | |
Protected Member Functions inherited from gem5::Drainable | |
Drainable () | |
virtual | ~Drainable () |
virtual void | drainResume () |
Resume execution after a successful drain. | |
void | signalDrainDone () const |
Signal that an object is drained. | |
Protected Member Functions inherited from gem5::Clocked | |
Clocked (ClockDomain &clk_domain) | |
Create a clocked object and set the clock domain based on the parameters. | |
Clocked (Clocked &)=delete | |
Clocked & | operator= (Clocked &)=delete |
virtual | ~Clocked () |
Virtual destructor due to inheritance. | |
void | resetClock () const |
Reset the object's clock using the current global tick value. | |
virtual void | clockPeriodUpdated () |
A hook subclasses can implement so they can do any extra work that's needed when the clock rate is changed. | |
Protected Attributes inherited from gem5::DmaDevice | |
DmaPort | dmaPort |
Protected Attributes inherited from gem5::PioDevice | |
System * | sys |
PioPort< PioDevice > | pioPort |
The pioPort that handles the requests for us and provides us requests that it sees. | |
Protected Attributes inherited from gem5::SimObject | |
const SimObjectParams & | _params |
Cached copy of the object parameters. | |
Protected Attributes inherited from gem5::EventManager | |
EventQueue * | eventq |
A pointer to this object's event queue. | |
Definition at line 71 of file gpu_command_processor.hh.
|
private |
Definition at line 164 of file gpu_command_processor.hh.
typedef std::function<void(const uint64_t &)> gem5::GPUCommandProcessor::HsaSignalCallbackFunction |
Definition at line 75 of file gpu_command_processor.hh.
typedef GPUCommandProcessorParams gem5::GPUCommandProcessor::Params |
Definition at line 74 of file gpu_command_processor.hh.
Enumerator | |
---|---|
Nop | |
Steal |
Definition at line 99 of file gpu_command_processor.hh.
|
delete |
gem5::GPUCommandProcessor::GPUCommandProcessor | ( | const Params & | p | ) |
Definition at line 60 of file gpu_command_processor.cc.
References dispatcher, hsaPP, gem5::GPUDispatcher::setCommandProcessor(), and gem5::HSAPacketProcessor::setDevice().
void gem5::GPUCommandProcessor::attachDriver | ( | GPUComputeDriver * | driver | ) |
Definition at line 581 of file gpu_command_processor.cc.
References _driver, and fatal_if.
Referenced by gem5::GPUComputeDriver::GPUComputeDriver().
void gem5::GPUCommandProcessor::completeTimingRead | ( | ) |
Definition at line 125 of file gpu_command_processor.cc.
References gem5::GPUCommandProcessor::KernelDispatchData::akc, dispatchKernelObject(), gem5::GPUCommandProcessor::KernelDispatchData::host_pkt_addr, kernelDispatchList, gem5::GPUCommandProcessor::KernelDispatchData::queue_id, gem5::GPUCommandProcessor::KernelDispatchData::raw_pkt, and gem5::GPUCommandProcessor::KernelDispatchData::readPkt.
void gem5::GPUCommandProcessor::dispatchKernelObject | ( | AMDKernelCode * | akc, |
void * | raw_pkt, | ||
uint32_t | queue_id, | ||
Addr | host_pkt_addr | ||
) |
BLIT kernels don't have symbol names. BLIT kernels are built-in compute kernels issued by ROCm to handle DMAs for dGPUs when the SDMA hardware engines are unavailable or explicitly disabled. They can also be used to do copies that ROCm thinks would be better performed by the shader than the SDMA engines. They are also sometimes used on APUs to implement asynchronous memcopy operations from 2 pointers in host memory. I have no idea what BLIT stands for.
Definition at line 290 of file gpu_command_processor.cc.
References gem5::HSAQueueEntry::codeAddr(), gem5::_hsa_dispatch_packet_t::completion_signal, gem5::HSAQueueEntry::completionSignal(), gem5::curTick(), dispatchStartTime, gem5::HSAQueueEntry::dispPktPtr(), DPRINTF, driver(), dynamic_task_id, gem5::exitSimLoop(), gem5::HSAPacketProcessor::finishPkt(), gem5::FullSystem, gem5::AMDGPUDevice::getGfxVersion(), gem5::GPUComputeDriver::getGfxVersion(), gpuDevice, gem5::_hsa_dispatch_packet_t::grid_size_x, gem5::_hsa_dispatch_packet_t::grid_size_y, gem5::_hsa_dispatch_packet_t::grid_size_z, hsaPacketProc(), initABI(), gem5::_hsa_dispatch_packet_t::kernarg_address, gem5::GEM5_PACKED::kernel_code_entry_byte_offset, gem5::_hsa_dispatch_packet_t::kernel_object, non_blit_kernel_id, gem5::sim_clock::as_int::ns, gem5::HSAQueueEntry::numScalarRegs(), gem5::HSAQueueEntry::numVectorRegs(), gem5::HSAQueueEntry::queueId(), sanityCheckAKC(), sendCompletionSignal(), target_non_blit_kernel_id, gem5::_hsa_dispatch_packet_t::workgroup_size_x, gem5::_hsa_dispatch_packet_t::workgroup_size_y, and gem5::_hsa_dispatch_packet_t::workgroup_size_z.
Referenced by completeTimingRead(), and submitDispatchPkt().
void gem5::GPUCommandProcessor::dispatchPkt | ( | HSAQueueEntry * | task | ) |
Once the CP has finished extracting all relevant information about a task and has initialized the ABI state, we send a description of the task to the dispatcher.
The dispatcher will create and dispatch WGs to the CUs.
Definition at line 684 of file gpu_command_processor.cc.
References gem5::GPUDispatcher::dispatch(), and dispatcher.
Referenced by MQDDmaEvent().
GPUComputeDriver * gem5::GPUCommandProcessor::driver | ( | ) |
Definition at line 591 of file gpu_command_processor.cc.
References _driver.
Referenced by dispatchKernelObject(), gem5::ComputeUnit::sendRequest(), and gem5::HSAPacketProcessor::write().
uint64_t gem5::GPUCommandProcessor::functionalReadHsaSignal | ( | Addr | signal_handle | ) |
Definition at line 527 of file gpu_command_processor.cc.
References getHsaSignalValueAddr(), system(), and gem5::System::threads.
Referenced by sendCompletionSignal().
|
overridevirtual |
Every PIO device is obliged to provide an implementation that returns the address ranges the device responds to.
Implements gem5::PioDevice.
Definition at line 829 of file gpu_command_processor.cc.
Definition at line 151 of file gpu_command_processor.hh.
Referenced by updateHsaMailboxData(), and updateHsaSignal().
Definition at line 146 of file gpu_command_processor.hh.
Referenced by updateHsaEventData(), updateHsaSignal(), and updateHsaSignalAsync().
Definition at line 141 of file gpu_command_processor.hh.
Referenced by functionalReadHsaSignal(), updateHsaEventTs(), updateHsaSignal(), and WaitScratchDmaEvent().
HSAPacketProcessor & gem5::GPUCommandProcessor::hsaPacketProc | ( | ) |
Definition at line 71 of file gpu_command_processor.cc.
References hsaPP.
Referenced by gem5::GPUComputeDriver::allocateQueue(), gem5::AMDGPUDevice::AMDGPUDevice(), dispatchKernelObject(), gem5::GPUComputeDriver::ioctl(), gem5::GPUComputeDriver::mmap(), gem5::GPUDispatcher::notifyWgCompl(), gem5::PM4PacketProcessor::processMQD(), gem5::PM4PacketProcessor::unmapAllQueues(), gem5::PM4PacketProcessor::unserialize(), and gem5::AMDGPUDevice::writeDoorbell().
|
private |
The CP is responsible for traversing all HSA-ABI-related data structures from memory and initializing the ABI state.
Information provided by the MQD, AQL packet, and code object metadata will be used to initialize register file state.
Definition at line 702 of file gpu_command_processor.cc.
References gem5::DmaVirtDevice::DmaVirtCallback< T >::dmaBuffer, gem5::DmaVirtDevice::dmaReadVirt(), gem5::HSAPacketProcessor::getQueueDesc(), hsaPP, gem5::HSAQueueEntry::queueId(), and ReadDispIdOffsetDmaEvent().
Referenced by dispatchKernelObject().
|
inlineprivate |
Perform a DMA read of the MQD that corresponds to a hardware queue descriptor (HQD).
We store a copy of the MQD in the HSAQueueEntry object so we can send a copy of it along with a dispatch packet, which is needed to initialize register state.
dGPUs on any version of ROCm and APUs starting with ROCm 2.2 can perform lazy allocation of private segment (scratch) memory, where the runtime will intentionally underallocate scratch resources to save framebuffer (or system on APU) memory. If we don't have enough scratch memory to launch this kernel, we need to raise a recoverable error code to the runtime by asserting queue_inactive_signal for the queue. The runtime will then try to allocate more scratch and reset this signal. When the signal is reset we should check that the runtime was successful and then proceed to launch the kernel.
Definition at line 227 of file gpu_command_processor.hh.
References gem5::HSAQueueEntry::amdQueue, gem5::_amd_queue_t::compute_tmpring_size_wavesize, dispatchPkt(), DPRINTF, fatal_if, gem5::_hsa_signal_t::handle, hsaPP, gem5::HSAPacketProcessor::inFlightPkts(), gem5::VegaISA::NumVecElemPerVecReg(), gem5::HSAQueueEntry::privMemPerItem(), gem5::_amd_queue_t::queue_inactive_signal, gem5::HSAQueueEntry::queueId(), and updateHsaSignal().
Referenced by ReadDispIdOffsetDmaEvent(), and WaitScratchDmaEvent().
void gem5::GPUCommandProcessor::performTimingRead | ( | PacketPtr | pkt | ) |
Definition at line 105 of file gpu_command_processor.cc.
References gem5::Shader::cuList, gem5::ComputeUnit::SQCPort::retries, gem5::Packet::senderState, gem5::RequestPort::sendTimingReq(), shader(), gem5::ComputeUnit::sqcPort, gem5::ComputeUnit::SQCPort::SenderState::wavefront, and gem5::ComputeUnit::wfList.
Referenced by submitDispatchPkt().
Pure virtual function that the device must implement.
Called when a read command is received by the port.
pkt | Packet describing this request |
Implements gem5::PioDevice.
Definition at line 123 of file gpu_command_processor.hh.
|
inlineprivate |
Perform a DMA read of the read_dispatch_id_field_base_byte_offset field, which follows directly after the read_dispatch_id (the read pointer) in the amd_hsa_queue_t struct (aka memory queue descriptor (MQD)), to find the base address of the MQD.
The MQD is the runtime's soft representation of a HW queue descriptor (HQD).
Any fields below the read dispatch ID in the amd_hsa_queue_t should not change according to the HSA standard, therefore we should be able to get them based on their known relative position to the read dispatch ID.
Now that the read pointer's offset from the base of the MQD is known, we can use that to calculate the address of the MQD itself; the dispatcher will DMA that into the HSAQueueEntry when a kernel is launched.
DMA a copy of the MQD into the task. Some fields of the MQD will be used to initialize register state in VI.
Definition at line 195 of file gpu_command_processor.hh.
References gem5::HSAQueueEntry::amdQueue, gem5::DmaVirtDevice::dmaReadVirt(), gem5::HSAPacketProcessor::getQueueDesc(), gem5::HSAQueueEntry::hostAMDQueueAddr, gem5::HSAQueueDescriptor::hostReadIndexPtr, hsaPP, MQDDmaEvent(), and gem5::HSAQueueEntry::queueId().
Referenced by initABI().
|
private |
Definition at line 716 of file gpu_command_processor.cc.
References gem5::GEM5_PACKED::accum_offset, gem5::GEM5_PACKED::bulky, gem5::GEM5_PACKED::cdbg_user, gem5::GEM5_PACKED::debug_mode, DPRINTF, gem5::GEM5_PACKED::enable_dx10_clamp, gem5::GEM5_PACKED::enable_exception_address_watch, gem5::GEM5_PACKED::enable_exception_fp_denormal_source, gem5::GEM5_PACKED::enable_exception_ieee_754_fp_division_by_zero, gem5::GEM5_PACKED::enable_exception_ieee_754_fp_inexact, gem5::GEM5_PACKED::enable_exception_ieee_754_fp_invalid_operation, gem5::GEM5_PACKED::enable_exception_ieee_754_fp_overflow, gem5::GEM5_PACKED::enable_exception_ieee_754_fp_underflow, gem5::GEM5_PACKED::enable_exception_int_divide_by_zero, gem5::GEM5_PACKED::enable_exception_memory, gem5::GEM5_PACKED::enable_ieee_mode, gem5::GEM5_PACKED::enable_private_segment, gem5::GEM5_PACKED::enable_sgpr_dispatch_id, gem5::GEM5_PACKED::enable_sgpr_dispatch_ptr, gem5::GEM5_PACKED::enable_sgpr_flat_scratch_init, gem5::GEM5_PACKED::enable_sgpr_kernarg_segment_ptr, gem5::GEM5_PACKED::enable_sgpr_private_segment_buffer, gem5::GEM5_PACKED::enable_sgpr_private_segment_size, gem5::GEM5_PACKED::enable_sgpr_queue_ptr, gem5::GEM5_PACKED::enable_sgpr_workgroup_id_x, gem5::GEM5_PACKED::enable_sgpr_workgroup_id_y, gem5::GEM5_PACKED::enable_sgpr_workgroup_id_z, gem5::GEM5_PACKED::enable_sgpr_workgroup_info, gem5::GEM5_PACKED::enable_trap_handler, gem5::GEM5_PACKED::enable_vgpr_workitem_id, gem5::GEM5_PACKED::enable_wavefront_size32, fatal_if, gem5::GEM5_PACKED::float_mode_denorm_16_64, gem5::GEM5_PACKED::float_mode_denorm_32, gem5::GEM5_PACKED::float_mode_round_16_64, gem5::GEM5_PACKED::float_mode_round_32, gem5::GEM5_PACKED::fp16_ovfl, gem5::GEM5_PACKED::fwd_progress, gem5::GEM5_PACKED::granulated_lds_size, gem5::GEM5_PACKED::granulated_wavefront_sgpr_count, gem5::GEM5_PACKED::granulated_workitem_vgpr_count, gem5::GEM5_PACKED::group_segment_fixed_size, gem5::GEM5_PACKED::kernarg_preload_spec_length, gem5::GEM5_PACKED::kernarg_preload_spec_offset, gem5::GEM5_PACKED::kernarg_size, 
gem5::GEM5_PACKED::kernel_code_entry_byte_offset, gem5::GEM5_PACKED::mem_ordered, gem5::GEM5_PACKED::priority, gem5::GEM5_PACKED::priv, gem5::GEM5_PACKED::private_segment_fixed_size, gem5::GEM5_PACKED::tg_split, gem5::GEM5_PACKED::use_dynamic_stack, gem5::GEM5_PACKED::user_sgpr_count, warn_if, and gem5::GEM5_PACKED::wgp_mode.
Referenced by dispatchKernelObject().
void gem5::GPUCommandProcessor::sendCompletionSignal | ( | Addr | signal_handle | ) |
HACK: The semantics of the HSA signal is to decrement the current signal value. We cheat here and read out the value from main memory using functional access and then just DMA the decremented value.
Definition at line 387 of file gpu_command_processor.cc.
References gem5::FullSystem, functionalReadHsaSignal(), updateHsaSignal(), and updateHsaSignalAsync().
Referenced by dispatchKernelObject(), gem5::GPUDispatcher::notifyWgCompl(), gem5::HSAPacketProcessor::processPkt(), and submitVendorPkt().
void gem5::GPUCommandProcessor::setGPUDevice | ( | AMDGPUDevice * | gpu_device | ) |
Definition at line 836 of file gpu_command_processor.cc.
References gpuDevice, gem5::VegaISA::Walker::setDevRequestor(), gem5::AMDGPUDevice::vramRequestorId(), and walker.
Referenced by gem5::AMDGPUDevice::AMDGPUDevice().
void gem5::GPUCommandProcessor::setShader | ( | Shader * | shader | ) |
Definition at line 843 of file gpu_command_processor.cc.
Shader * gem5::GPUCommandProcessor::shader | ( | ) |
Definition at line 849 of file gpu_command_processor.cc.
References _shader.
Referenced by gem5::PM4PacketProcessor::mapProcess(), performTimingRead(), gem5::AMDGPUDevice::readFrame(), setShader(), submitDispatchPkt(), and gem5::AMDGPUDevice::writeFrame().
void gem5::GPUCommandProcessor::signalWakeupEvent | ( | uint32_t | event_id | ) |
Definition at line 690 of file gpu_command_processor.cc.
References _driver, and gem5::GPUComputeDriver::signalWakeupEvent().
Referenced by updateHsaSignal().
void gem5::GPUCommandProcessor::submitAgentDispatchPkt | ( | void * | raw_pkt, |
uint32_t | queue_id, | ||
Addr | host_pkt_addr | ||
) |
submitAgentDispatchPkt() is for accepting agent dispatch packets.
These packets will control the dispatch of Wg on the device, and inform the host when a specified number of Wg have been executed on the device.
For now it simply finishes the pkt.
Definition at line 637 of file gpu_command_processor.cc.
References gem5::_hsa_agent_dispatch_packet_t::arg, gem5::HSAQueueEntry::completionSignal(), dispatcher, gem5::DmaVirtDevice::dmaWriteVirt(), DPRINTF, gem5::HSAPacketProcessor::finishPkt(), hsaPP, gem5::GPUDispatcher::hsaTask(), Nop, panic, gem5::_hsa_agent_dispatch_packet_t::return_address, Steal, and gem5::_hsa_agent_dispatch_packet_t::type.
Referenced by gem5::HSAPacketProcessor::processPkt().
void gem5::GPUCommandProcessor::submitDispatchPkt | ( | void * | raw_pkt, |
uint32_t | queue_id, | ||
Addr | host_pkt_addr | ||
) |
submitDispatchPkt() is the entry point into the CP from the HSAPP and is only meant to be used with AQL kernel dispatch packets.
After the HSAPP receives and extracts an AQL packet, it sends it to the CP, which is responsible for gathering all relevant information about a task, initializing CU state, and sending it to the dispatcher for WG creation and dispatch.
First we need to capture all information from the AQL pkt and the code object, then store it in an HSAQueueEntry. Once the packet and code are extracted, we extract information from the queue descriptor that the CP needs to perform state initialization on the CU. Finally we call dispatch() to send the task to the dispatcher. When the task completely finishes, we call finishPkt() on the HSA packet processor in order to remove the packet from the queue, and notify the runtime that the task has completed.
Make sure there is not a race condition with invalidates in the L2 cache. The full system driver may write directly to memory using large BAR while the L2 cache is allowed to keep data in the valid state between kernel launches. This is a rare event but is required for correctness.
Need to use a raw pointer for DmaVirtDevice API. This is deleted in the dispatchKernelObject method.
The kernel_object is a pointer to the machine code, whose entry point is an 'amd_kernel_code_t' type, which is included in the kernel binary, and describes various aspects of the kernel. The desired entry is the 'kernel_code_entry_byte_offset' field, which provides the byte offset (positive or negative) from the address of the amd_kernel_code_t to the start of the machine instructions.
For SE mode we can read from the port proxy. In FS mode, we may need to wait for the guest OS to setup translations, especially when using the KVM CPU, so it is preferred to read the code object using a timing DMA request.
We need to read a pointer in the application's address space to pull out the kernel code descriptor.
In full system mode, the page table entry may point to a system page or a device page. System pages use the proxy as normal, but a device page needs to be read from device memory. Check what type it is here.
Full system currently only supports running on single VMID (one virtual memory space), i.e., one application running on GPU at a time. Because of this, for now we know the VMID is always 1. Later the VMID would have to be passed on to the command processor.
System objects use DMA device. Device objects need to use device memory.
Definition at line 153 of file gpu_command_processor.cc.
References gem5::Shader::addDeferredDispatch(), gem5::ChunkGenerator::addr(), gem5::GPUCommandProcessor::KernelDispatchData::akc, gem5::ChunkGenerator::complete(), gem5::Packet::dataStatic(), dispatchKernelObject(), gem5::DmaVirtDevice::dmaReadVirt(), gem5::ChunkGenerator::done(), DPRINTF, flags, gem5::FullSystem, gem5::VegaISA::Walker::getDevRequestor(), gem5::AMDGPUVM::getPageTableBase(), gem5::AMDGPUDevice::getVM(), gpuDevice, gem5::GPUCommandProcessor::KernelDispatchData::host_pkt_addr, gem5::_hsa_dispatch_packet_t::kernel_object, kernelDispatchList, gem5::ChunkGenerator::next(), performTimingRead(), gem5::Request::PHYSICAL, gem5::GPUCommandProcessor::KernelDispatchData::queue_id, gem5::GPUCommandProcessor::KernelDispatchData::raw_pkt, gem5::BaseMMU::Read, gem5::PortProxy::readBlob(), gem5::GPUCommandProcessor::KernelDispatchData::readPkt, gem5::MemCmd::ReadReq, shader(), gem5::VegaISA::Walker::startFunctional(), gem5::PioDevice::sys, system(), gem5::System::threads, and walker.
Referenced by gem5::Shader::decNumOutstandingInvL2s(), and gem5::HSAPacketProcessor::processPkt().
void gem5::GPUCommandProcessor::submitVendorPkt | ( | void * | raw_pkt, |
uint32_t | queue_id, | ||
Addr | host_pkt_addr | ||
) |
submitVendorPkt() is for accepting vendor-specific packets from the HSAPP.
Vendor-specific packets may be used by the runtime to send commands to the HSA device that are specific to a particular vendor. The vendor-specific packets should be defined by the vendor in the runtime. TODO: For now we simply tell the HSAPP to finish the packet and write a completion signal, if any. However, in the future proper handling may be required for vendor-specific packets.
In the version of ROCm that is currently supported the runtime will send packets that direct the CP to invalidate the GPU caches. We do this automatically on each kernel launch in the CU, so that situation is safe for now.
Definition at line 615 of file gpu_command_processor.cc.
References gem5::HSAPacketProcessor::finishPkt(), hsaPP, sendCompletionSignal(), and warn.
Referenced by gem5::HSAPacketProcessor::processPkt().
System * gem5::GPUCommandProcessor::system | ( | ) |
Definition at line 823 of file gpu_command_processor.cc.
References gem5::PioDevice::sys.
Referenced by gem5::SDMAEngine::copyDone(), functionalReadHsaSignal(), gem5::SDMAEngine::ptePdeDone(), gem5::AMDGPUDevice::readFrame(), submitDispatchPkt(), updateHsaSignal(), gem5::SDMAEngine::writeDone(), and gem5::AMDGPUDevice::writeFrame().
|
override private virtual |
Function used to translate a range of addresses from virtual to physical addresses.
All classes inheriting from DmaVirtDevice must define this.
vaddr | Virtual address of the start of the range |
size | Size of the range in bytes |
Implements gem5::DmaVirtDevice.
Definition at line 86 of file gpu_command_processor.cc.
References gem5::FullSystem, gem5::AMDGPUDevice::getVM(), gpuDevice, gem5::PioDevice::sys, gem5::System::threads, gem5::MipsISA::vaddr, and walker.
void gem5::GPUCommandProcessor::updateHsaEventData | ( | Addr | signal_handle, |
uint64_t * | event_value | ||
) |
Definition at line 461 of file gpu_command_processor.cc.
References gem5::curTick(), dispatchStartTime, gem5::DmaVirtDevice::dmaWriteVirt(), DPRINTF, gem5::amd_event_t::end_ts, getHsaSignalMailboxAddr(), gem5::sim_clock::as_int::ns, gem5::amd_event_t::start_ts, updateHsaEventTs(), and updateHsaSignalDone().
Referenced by updateHsaMailboxData().
void gem5::GPUCommandProcessor::updateHsaEventTs | ( | Addr | signal_handle, |
amd_event_t * | event_value | ||
) |
Definition at line 489 of file gpu_command_processor.cc.
References gem5::DmaVirtDevice::dmaReadVirt(), DPRINTF, getHsaSignalValueAddr(), gem5::ArmISA::ts, and updateHsaSignalData().
Referenced by updateHsaEventData(), and updateHsaMailboxData().
void gem5::GPUCommandProcessor::updateHsaMailboxData | ( | Addr | signal_handle, |
uint64_t * | mailbox_value | ||
) |
Definition at line 427 of file gpu_command_processor.cc.
References gem5::curTick(), dispatchStartTime, gem5::DmaVirtDevice::dmaReadVirt(), gem5::DmaVirtDevice::dmaWriteVirt(), DPRINTF, gem5::amd_event_t::end_ts, getHsaSignalEventAddr(), gem5::sim_clock::as_int::ns, gem5::amd_event_t::start_ts, updateHsaEventData(), and updateHsaEventTs().
Referenced by updateHsaSignalAsync().
void gem5::GPUCommandProcessor::updateHsaSignal | ( | Addr | signal_handle, |
uint64_t | signal_value, | ||
HsaSignalCallbackFunction | function = [] (const uint64_t &) { } |
||
) |
Definition at line 536 of file gpu_command_processor.cc.
References gem5::DmaVirtDevice::dmaWriteVirt(), DPRINTF, gem5::FullSystem, getHsaSignalEventAddr(), getHsaSignalMailboxAddr(), getHsaSignalValueAddr(), signalWakeupEvent(), system(), and gem5::System::threads.
Referenced by MQDDmaEvent(), and sendCompletionSignal().
void gem5::GPUCommandProcessor::updateHsaSignalAsync | ( | Addr | signal_handle, |
int64_t | diff | ||
) |
Definition at line 414 of file gpu_command_processor.cc.
References gem5::DmaVirtDevice::dmaReadVirt(), DPRINTF, getHsaSignalMailboxAddr(), and updateHsaMailboxData().
Referenced by sendCompletionSignal().
void gem5::GPUCommandProcessor::updateHsaSignalData | ( | Addr | value_addr, |
int64_t | diff, | ||
uint64_t * | prev_value | ||
) |
Definition at line 507 of file gpu_command_processor.cc.
References gem5::DmaVirtDevice::dmaWriteVirt(), DPRINTF, and updateHsaSignalDone().
Referenced by updateHsaEventTs().
void gem5::GPUCommandProcessor::updateHsaSignalDone | ( | uint64_t * | signal_value | ) |
Definition at line 521 of file gpu_command_processor.cc.
Referenced by updateHsaEventData(), and updateHsaSignalData().
RequestorID gem5::GPUCommandProcessor::vramRequestorId | ( | ) |
Forward the VRAM requestor ID needed for device memory from GPU device.
Definition at line 80 of file gpu_command_processor.cc.
References gpuDevice, and gem5::AMDGPUDevice::vramRequestorId().
Referenced by gem5::Shader::vramRequestorId().
|
inline private |
Poll on queue_inactive signal until the runtime can get around to taking care of our lack of scratch space.
Runtime will have updated the MQD to give us more scratch space. Read it out and continue to pester the runtime until we get all that we need to launch.
TODO: Technically we only need to update the private segment fields, because the other MQD entries won't have changed since we last read them.
Poll until runtime signals us that scratch space has been allocated.
Delay for a large number of ticks to give the CPU time to set up the scratch space. The delay should be non-zero since this method calls itself back and can cause an infinite loop in the event queue if the allocation is not completed by the first time this is called.
Definition at line 281 of file gpu_command_processor.hh.
References gem5::HSAQueueEntry::amdQueue, gem5::DmaVirtDevice::dmaReadVirt(), DPRINTF, getHsaSignalValueAddr(), gem5::_hsa_signal_t::handle, gem5::HSAQueueEntry::hostAMDQueueAddr, MQDDmaEvent(), gem5::_amd_queue_t::queue_inactive_signal, and WaitScratchDmaEvent().
Referenced by WaitScratchDmaEvent().
Pure virtual function that the device must implement.
Called when a write command is received by the port.
pkt | Packet describing this request |
Implements gem5::PioDevice.
Definition at line 122 of file gpu_command_processor.hh.
|
private |
Definition at line 159 of file gpu_command_processor.hh.
Referenced by attachDriver(), driver(), and signalWakeupEvent().
|
private |
Definition at line 157 of file gpu_command_processor.hh.
Referenced by setShader(), and shader().
|
private |
Definition at line 158 of file gpu_command_processor.hh.
Referenced by dispatchPkt(), GPUCommandProcessor(), and submitAgentDispatchPkt().
Definition at line 180 of file gpu_command_processor.hh.
Referenced by dispatchKernelObject(), updateHsaEventData(), and updateHsaMailboxData().
|
private |
Definition at line 171 of file gpu_command_processor.hh.
Referenced by dispatchKernelObject().
|
private |
Definition at line 160 of file gpu_command_processor.hh.
Referenced by dispatchKernelObject(), setGPUDevice(), submitDispatchPkt(), translate(), and vramRequestorId().
|
private |
Definition at line 167 of file gpu_command_processor.hh.
Referenced by GPUCommandProcessor(), hsaPacketProc(), initABI(), MQDDmaEvent(), ReadDispIdOffsetDmaEvent(), submitAgentDispatchPkt(), and submitVendorPkt().
std::list<struct KernelDispatchData> gem5::GPUCommandProcessor::kernelDispatchList |
Definition at line 97 of file gpu_command_processor.hh.
Referenced by completeTimingRead(), and submitDispatchPkt().
|
private |
Definition at line 174 of file gpu_command_processor.hh.
Referenced by dispatchKernelObject().
|
private |
Definition at line 177 of file gpu_command_processor.hh.
Referenced by dispatchKernelObject().
|
private |
Definition at line 161 of file gpu_command_processor.hh.
Referenced by setGPUDevice(), submitDispatchPkt(), and translate().