38#include "debug/GPUCommandProc.hh"
39#include "debug/GPUDisp.hh"
40#include "debug/GPUInitAbi.hh"
41#include "debug/GPUKernelInfo.hh"
50#include "params/GPUCommandProcessor.hh"
62 walker(
p.walker), hsaPP(
p.hsapp),
63 target_non_blit_kernel_id(
p.target_non_blit_kernel_id)
92 auto process =
sys->
threads[0]->getProcessPtr();
94 return process->pTable->translateRange(
vaddr, size);
115 safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->
senderState);
159 unsigned akc_alignment_granularity = 64;
160 assert(!(disp_pkt->
kernel_object & (akc_alignment_granularity - 1)));
169 if (
shader()->getNumOutstandingInvL2s() > 0) {
171 "Deferring kernel launch due to outstanding L2 invalidates\n");
205 DPRINTF(GPUCommandProc,
"reading kernel_object using proxy\n");
217 bool is_system_page =
true;
232 DPRINTF(GPUCommandProc,
"kernel_object vaddr %#lx paddr %#lx size %d"
240 if (is_system_page) {
242 "sending system DMA read for kernel_object\n");
245 [=](
const uint32_t&) {
250 dma_callback, (
void *)akc);
253 "kernel_object in device, using device mem\n");
258 akc_alignment_granularity);
268 RequestPtr request = std::make_shared<Request>(chunk_addr,
269 akc_alignment_granularity,
flags,
275 assert(
system()->getDeviceMemory(readPkt) !=
nullptr);
277 dispatchData.
akc = akc;
278 dispatchData.
raw_pkt = raw_pkt;
281 dispatchData.
readPkt = readPkt;
291 uint32_t queue_id,
Addr host_pkt_addr)
297 DPRINTF(GPUCommandProc,
"GPU machine code is %lli bytes from start of the "
303 DPRINTF(GPUCommandProc,
"Machine code starts at addr: %#x\n",
306 std::string kernel_name;
319 kernel_name =
"Some kernel";
320 is_blit_kernel =
false;
322 kernel_name =
"Blit kernel";
323 is_blit_kernel =
true;
326 DPRINTF(GPUKernelInfo,
"Kernel name: %s\n", kernel_name.c_str());
340 DPRINTF(GPUCommandProc,
"Skipping non-blit kernel %i (Task ID: %i)\n",
346 DPRINTF(GPUDisp,
"HSA AQL Kernel Complete with completion "
351 DPRINTF(GPUDisp,
"HSA AQL Kernel Complete! No completion "
366 DPRINTF(GPUCommandProc,
"Task ID: %i Got AQL: wg size (%dx%dx%d), "
367 "grid size (%dx%dx%d) kernarg addr: %#x, completion "
374 DPRINTF(GPUCommandProc,
"Extracted code object: %s (num vector regs: %d, "
375 "num scalar regs: %d, code addr: %#x, kernarg size: %d, "
417 uint64_t *mailboxValue =
new uint64_t;
419 [ = ] (
const uint64_t &)
421 dmaReadVirt(mailbox_addr,
sizeof(uint64_t), cb2, (
void *)mailboxValue);
422 DPRINTF(GPUCommandProc,
"updateHsaSignalAsync reading mailbox addr %lx\n",
428 uint64_t *mailbox_value)
432 DPRINTF(GPUCommandProc,
"updateHsaMailboxData read %ld\n", *mailbox_value);
433 if (*mailbox_value != 0) {
438 [ = ] (
const uint64_t &)
440 dmaReadVirt(event_addr,
sizeof(uint64_t), cb, (
void *)mailbox_value);
442 delete mailbox_value;
450 [ = ] (
const uint64_t &)
453 DPRINTF(GPUCommandProc,
"updateHsaMailboxData reading timestamp addr "
462 uint64_t *event_value)
466 DPRINTF(GPUCommandProc,
"updateHsaEventData read %ld\n", *event_value);
469 [ = ] (
const uint64_t &)
471 dmaWriteVirt(mailbox_addr,
sizeof(uint64_t), cb, &cb->dmaBuffer, 0);
479 [ = ] (
const uint64_t &)
482 DPRINTF(GPUCommandProc,
"updateHsaEventData reading timestamp addr %lx\n",
497 uint64_t *signalValue =
new uint64_t;
499 [ = ] (
const uint64_t &)
501 dmaReadVirt(value_addr,
sizeof(uint64_t), cb, (
void *)signalValue);
502 DPRINTF(GPUCommandProc,
"updateHsaSignalAsync reading value addr %lx\n",
508 uint64_t *prev_value)
511 DPRINTF(GPUCommandProc,
"updateHsaSignalData read %ld, writing %ld\n",
512 *prev_value, *prev_value + diff);
515 [ = ] (
const uint64_t &)
517 dmaWriteVirt(value_addr,
sizeof(uint64_t), cb, (
void *)prev_value);
544 DPRINTF(GPUCommandProc,
"Triggering completion signal: %x!\n", value_addr);
558 if (*mailbox_ptr != 0) {
564 DPRINTF(GPUCommandProc,
"Calling signal wakeup event on "
565 "signal event value %d\n", *event_val);
620 if (vendor_pkt->completion_signal) {
624 warn(
"Ignoring vendor packet\n");
645 DPRINTF(GPUCommandProc,
"Agent Dispatch Packet NOP\n");
648 int kid = agent_pkt->
arg[0];
650 "Agent Dispatch Packet Stealing signal handle for kernel %d\n",
657 DPRINTF(GPUCommandProc,
"Return Addr: %p\n",return_address);
660 *new_signal_addr = (
Addr)signal_addr;
664 "Agent Dispatch Packet Stealing signal handle from kid %d :" \
665 "(%x:%x) writing into %x\n",
666 kid,signal_addr,new_signal_addr,return_address);
670 panic(
"The agent dispatch packet provided an unknown argument in" \
671 "arg[0],currently only 0(nop) or 1(return kernel signal) is accepted");
705 [ = ] (
const uint32_t &readDispIdOffset)
711 dmaReadVirt(hostReadIdxPtr +
sizeof(hostReadIdxPtr),
718 DPRINTF(GPUInitAbi,
"group_segment_fixed_size: %d\n",
720 DPRINTF(GPUInitAbi,
"private_segment_fixed_size: %d\n",
723 DPRINTF(GPUInitAbi,
"kernel_code_entry_byte_offset: %d\n",
727 DPRINTF(GPUInitAbi,
"granulated_workitem_vgpr_count: %d\n",
729 DPRINTF(GPUInitAbi,
"granulated_wavefront_sgpr_count: %d\n",
733 DPRINTF(GPUInitAbi,
"float_mode_round_16_64: %d\n",
735 DPRINTF(GPUInitAbi,
"float_mode_denorm_32: %d\n",
737 DPRINTF(GPUInitAbi,
"float_mode_denorm_16_64: %d\n",
749 DPRINTF(GPUInitAbi,
"enable_private_segment: %d\n",
753 DPRINTF(GPUInitAbi,
"enable_sgpr_workgroup_id_x: %d\n",
755 DPRINTF(GPUInitAbi,
"enable_sgpr_workgroup_id_y: %d\n",
757 DPRINTF(GPUInitAbi,
"enable_sgpr_workgroup_id_z: %d\n",
759 DPRINTF(GPUInitAbi,
"enable_sgpr_workgroup_info: %d\n",
761 DPRINTF(GPUInitAbi,
"enable_vgpr_workitem_id: %d\n",
763 DPRINTF(GPUInitAbi,
"enable_exception_address_watch: %d\n",
765 DPRINTF(GPUInitAbi,
"enable_exception_memory: %d\n",
768 DPRINTF(GPUInitAbi,
"enable_exception_ieee_754_fp_invalid_operation: %d\n",
770 DPRINTF(GPUInitAbi,
"enable_exception_fp_denormal_source: %d\n",
772 DPRINTF(GPUInitAbi,
"enable_exception_ieee_754_fp_division_by_zero: %d\n",
774 DPRINTF(GPUInitAbi,
"enable_exception_ieee_754_fp_overflow: %d\n",
776 DPRINTF(GPUInitAbi,
"enable_exception_ieee_754_fp_underflow: %d\n",
778 DPRINTF(GPUInitAbi,
"enable_exception_ieee_754_fp_inexact: %d\n",
780 DPRINTF(GPUInitAbi,
"enable_exception_int_divide_by_zero: %d\n",
782 DPRINTF(GPUInitAbi,
"enable_sgpr_private_segment_buffer: %d\n",
784 DPRINTF(GPUInitAbi,
"enable_sgpr_dispatch_ptr: %d\n",
786 DPRINTF(GPUInitAbi,
"enable_sgpr_queue_ptr: %d\n",
788 DPRINTF(GPUInitAbi,
"enable_sgpr_kernarg_segment_ptr: %d\n",
790 DPRINTF(GPUInitAbi,
"enable_sgpr_dispatch_id: %d\n",
792 DPRINTF(GPUInitAbi,
"enable_sgpr_flat_scratch_init: %d\n",
794 DPRINTF(GPUInitAbi,
"enable_sgpr_private_segment_size: %d\n",
796 DPRINTF(GPUInitAbi,
"enable_wavefront_size32: %d\n",
799 DPRINTF(GPUInitAbi,
"kernarg_preload_spec_length: %d\n",
801 DPRINTF(GPUInitAbi,
"kernarg_preload_spec_offset: %d\n",
818 "Kernarg preload not implemented\n");
AbstractMemory declaration.
Declaration and inline definition of ChunkGenerator object.
Device model for an AMD GPU.
GfxVersion getGfxVersion() const
RequestorID vramRequestorId()
Methods related to translations and system/device memory.
Addr getPageTableBase(uint16_t vmid)
This class takes an arbitrary memory region (address/length pair) and generates a series of appropria...
std::deque< std::pair< PacketPtr, Wavefront * > > retries
std::vector< std::vector< Wavefront * > > wfList
Wraps a std::function object in a DmaCallback.
void dmaReadVirt(Addr host_addr, unsigned size, DmaCallback *cb, void *data, Tick delay=0)
Initiate a DMA read from virtual address host_addr.
void dmaWriteVirt(Addr host_addr, unsigned size, DmaCallback *b, void *data, Tick delay=0)
Initiate a DMA write from virtual address host_addr.
void completeTimingRead()
void sendCompletionSignal(Addr signal_handle)
void submitDispatchPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitDispatchPkt() is the entry point into the CP from the HSAPP and is only meant to be used with A...
void ReadDispIdOffsetDmaEvent(HSAQueueEntry *task, const uint32_t &readDispIdOffset)
Perform a DMA read of the read_dispatch_id_field_base_byte_offset field, which follows directly after...
RequestorID vramRequestorId()
Forward the VRAM requestor ID needed for device memory from GPU device.
GPUComputeDriver * driver()
Addr getHsaSignalMailboxAddr(Addr signal_handle)
GPUCommandProcessor()=delete
void setGPUDevice(AMDGPUDevice *gpu_device)
TranslationGenPtr translate(Addr vaddr, Addr size) override
Function used to translate a range of addresses from virtual to physical addresses.
void signalWakeupEvent(uint32_t event_id)
void updateHsaSignal(Addr signal_handle, uint64_t signal_value, HsaSignalCallbackFunction function=[](const uint64_t &) { })
void updateHsaSignalDone(uint64_t *signal_value)
GPUComputeDriver * _driver
void setShader(Shader *shader)
HSAPacketProcessor & hsaPacketProc()
int target_non_blit_kernel_id
void submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitAgentDispatchPkt() is for accepting agent dispatch packets.
std::list< struct KernelDispatchData > kernelDispatchList
Addr getHsaSignalValueAddr(Addr signal_handle)
void updateHsaEventTs(Addr signal_handle, amd_event_t *event_value)
HSAPacketProcessor * hsaPP
void dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
void attachDriver(GPUComputeDriver *driver)
void initABI(HSAQueueEntry *task)
The CP is responsible for traversing all HSA-ABI-related data structures from memory and initializing...
void updateHsaSignalAsync(Addr signal_handle, int64_t diff)
std::unordered_map< Addr, Tick > dispatchStartTime
Addr getHsaSignalEventAddr(Addr signal_handle)
AddrRangeList getAddrRanges() const override
Every PIO device is obliged to provide an implementation that returns the address ranges the device r...
void submitVendorPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitVendorPkt() is for accepting vendor-specific packets from the HSAPP.
void sanityCheckAKC(AMDKernelCode *akc)
GPUDispatcher & dispatcher
GPUCommandProcessorParams Params
void dispatchPkt(HSAQueueEntry *task)
Once the CP has finished extracting all relevant information about a task and has initialized the ABI...
void updateHsaMailboxData(Addr signal_handle, uint64_t *mailbox_value)
void performTimingRead(PacketPtr pkt)
void updateHsaEventData(Addr signal_handle, uint64_t *event_value)
std::function< void(const uint64_t &)> HsaSignalCallbackFunction
uint64_t functionalReadHsaSignal(Addr signal_handle)
void updateHsaSignalData(Addr value_addr, int64_t diff, uint64_t *prev_value)
GfxVersion getGfxVersion() const
virtual void signalWakeupEvent(uint32_t event_id)
void dispatch(HSAQueueEntry *task)
After all relevant HSA data structures have been traversed/extracted from memory by the CP,...
HSAQueueEntry * hsaTask(int disp_id)
void setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc)
HSAQueueDescriptor * getQueueDesc(uint32_t queId)
void finishPkt(void *pkt, uint32_t rl_idx)
void setDevice(GPUCommandProcessor *dev)
int numVectorRegs() const
Addr completionSignal() const
int numScalarRegs() const
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
SenderState * senderState
This packet's sender state.
void readBlob(Addr addr, void *p, uint64_t size) const
Higher level interfaces based on the above.
bool sendTimingReq(PacketPtr pkt)
Attempt to send a timing request to the responder port by calling its corresponding receive function.
@ PHYSICAL
The virtual address is also the physical address.
std::vector< ComputeUnit * > cuList
void addDeferredDispatch(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
void setDevRequestor(RequestorID mid)
Fault startFunctional(Addr base, Addr vaddr, PageTableEntry &pte, unsigned &logBytes, BaseMMU::Mode mode)
RequestorID getDevRequestor() const
The GPUDispatcher is the component of the shader that is responsible for creating and dispatching WGs...
The GPUCommandProcessor (CP) is responsible for accepting commands, in the form of HSA AQL packets,...
Addr addr() const
Return starting address of current chunk.
Addr complete() const
Number of bytes we have already chunked up.
bool done() const
Are we done? That is, did the last call to next() advance past the end of the region?
bool next()
Advance generator to next chunk.
#define panic(...)
This implements a cprintf based panic() function.
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
#define warn_if(cond,...)
Conditional warning macro that checks the supplied condition and only prints a warning if the conditi...
Copyright (c) 2024 Arm Limited All rights reserved.
struct gem5::GEM5_PACKED AMDKernelCode
std::shared_ptr< Request > RequestPtr
Tick curTick()
The universal simulation clock.
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
bool FullSystem
The FullSystem variable can be used to determine the current mode of simulation.
uint64_t Tick
Tick count type.
void exitSimLoop(const std::string &message, int exit_code, Tick when, Tick repeat, bool serialize)
Schedule an event to exit the simulation loop (returning to Python) at the end of the current cycle (...
std::unique_ptr< TranslationGen > TranslationGenPtr
uint32_t enable_sgpr_flat_scratch_init
uint32_t enable_sgpr_queue_ptr
uint32_t enable_exception_memory
uint32_t enable_exception_fp_denormal_source
uint32_t enable_exception_address_watch
uint32_t enable_private_segment
uint32_t enable_sgpr_workgroup_id_y
uint32_t enable_exception_ieee_754_fp_inexact
uint32_t enable_ieee_mode
uint32_t group_segment_fixed_size
uint32_t float_mode_round_32
uint32_t enable_exception_int_divide_by_zero
uint32_t granulated_wavefront_sgpr_count
uint32_t granulated_workitem_vgpr_count
uint32_t kernarg_preload_spec_length
uint32_t enable_sgpr_dispatch_ptr
uint32_t use_dynamic_stack
uint32_t granulated_lds_size
uint32_t float_mode_round_16_64
uint32_t float_mode_denorm_16_64
uint32_t float_mode_denorm_32
uint32_t kernarg_preload_spec_offset
uint32_t enable_sgpr_workgroup_id_z
uint32_t enable_sgpr_dispatch_id
uint32_t enable_dx10_clamp
uint32_t enable_exception_ieee_754_fp_overflow
int64_t kernel_code_entry_byte_offset
uint32_t enable_exception_ieee_754_fp_underflow
uint32_t enable_vgpr_workitem_id
uint32_t enable_sgpr_private_segment_size
uint32_t enable_sgpr_kernarg_segment_ptr
uint32_t enable_sgpr_private_segment_buffer
uint32_t enable_sgpr_workgroup_id_x
uint32_t enable_exception_ieee_754_fp_division_by_zero
uint32_t private_segment_fixed_size
uint32_t enable_trap_handler
uint32_t enable_sgpr_workgroup_info
uint32_t enable_exception_ieee_754_fp_invalid_operation
uint32_t enable_wavefront_size32
uint64_t completion_signal
uint16_t workgroup_size_y
uint16_t workgroup_size_z
uint16_t workgroup_size_x
This file defines buffer classes used to handle pointer arguments in emulated syscalls.