#include "debug/GPUCommandProc.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUInitAbi.hh"
#include "debug/GPUKernelInfo.hh"
...
#include "params/GPUCommandProcessor.hh"

hsaPP->setDevice(this);

// GPUCommandProcessor::translate(): in SE mode, address translation goes
// through the CPU process's page table.
auto process = sys->threads[0]->getProcessPtr();

return process->pTable->translateRange(vaddr, size);

// Timing-read completions are told apart by the SQC port sender state:
// reads of the kernel code descriptor vs. reads of preloaded kernel args.
case ComputeUnit::SQCPort::SenderState::DISPATCH_KERNEL_OBJECT:
case ComputeUnit::SQCPort::SenderState::DISPATCH_PRELOAD_ARG:

// GPUCommandProcessor::submitDispatchPkt(): the kernel code descriptor
// (AMDKernelCode) referenced by the dispatch packet must be 64-byte aligned.
unsigned akc_alignment_granularity = 64;
assert(!(disp_pkt->kernel_object & (akc_alignment_granularity - 1)));

// If L2 invalidates are still outstanding, defer the launch and replay it
// once they drain.
if (shader()->getNumOutstandingInvL2s() > 0) {
    DPRINTF(GPUCommandProc,
            "Deferring kernel launch due to outstanding L2 invalidates\n");

// Fetch the kernel code descriptor (AMDKernelCode) named by the dispatch
// packet. In SE mode it can be read directly through the thread's port
// proxy; in FS mode the CP first works out whether the descriptor sits in
// host or device memory.
auto *tc = sys->threads[0];
DPRINTF(GPUCommandProc, "reading kernel_object using proxy\n");

bool is_system_page = true;
...
DPRINTF(GPUCommandProc, "kernel_object vaddr %#lx paddr %#lx size %d" ...

if (is_system_page) {
    DPRINTF(GPUCommandProc, "sending system DMA read for kernel_object\n");
    // Host memory: issue a virtual DMA read; the callback resumes the
    // dispatch once the descriptor buffer has been filled in.
    auto dma_callback = new DmaVirtCallback<uint32_t>(
        [=](const uint32_t &) { ... });
    dmaReadVirt(..., dma_callback, (void *)akc);
266 "kernel_object in device, using device mem\n");
271 akc_alignment_granularity);
277 gpuDevice->getVM().getPageTableBase(vmid), chunk_addr,
281 RequestPtr request = std::make_shared<Request>(chunk_addr,
282 akc_alignment_granularity, flags,
283 walker->getDevRequestor());
288 assert(
system()->getDeviceMemory(readPkt) !=
nullptr);
290 dispatchData.
akc = akc;
291 dispatchData.
raw_pkt = raw_pkt;
294 dispatchData.
readPkt = readPkt;
297 ComputeUnit::SQCPort::SenderState::DISPATCH_KERNEL_OBJECT);
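
Both branches above hand the descriptor fetch off asynchronously and resume the dispatch from a callback once the data has landed in the akc buffer. A minimal sketch of that continuation style; AsyncReader and its readVirt/drain methods are hypothetical stand-ins, not the gem5 DMA interface:

    // Sketch only: callback-continuation style for an asynchronous read.
    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <functional>
    #include <vector>

    struct KernelDescriptor { uint32_t codeOffset; uint32_t kernargSize; };

    class AsyncReader
    {
      public:
        // Record the request; real hardware would complete it cycles later.
        void readVirt(const void *src, size_t size, void *dst,
                      std::function<void()> onDone)
        {
            pending.push_back([=]() {
                std::memcpy(dst, src, size);   // data lands in caller's buffer
                onDone();                      // then the continuation runs
            });
        }

        void drain() { for (auto &req : pending) req(); pending.clear(); }

      private:
        std::vector<std::function<void()>> pending;
    };

    int main()
    {
        KernelDescriptor inMemory{0x100, 64};
        auto *akc = new KernelDescriptor;   // owned until the callback fires

        AsyncReader reader;
        reader.readVirt(&inMemory, sizeof(*akc), akc, [akc]() {
            std::printf("dispatching: code offset %#x, kernarg size %u\n",
                        akc->codeOffset, akc->kernargSize);
            delete akc;
        });

        reader.drain();   // models the DMA completing at a later tick
    }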

void
GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
                                          uint32_t queue_id, Addr host_pkt_addr)
{
    ...
    DPRINTF(GPUCommandProc,
            "GPU machine code is %lli bytes from start of the " ...

    DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n", ...);

    // Pick a human-readable name for the kernel; blit kernels are told
    // apart from application kernels.
    std::string kernel_name;
    if (...) {
        kernel_name = "Some kernel";
        is_blit_kernel = false;
    } else {
        kernel_name = "Blit kernel";
        is_blit_kernel = true;
    }
    DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());

// A kernel can also be skipped and completed immediately instead of being
// dispatched; completion is reported with or without a completion signal.
DPRINTF(GPUCommandProc, "Skipping non-blit kernel %i (Task ID: %i)\n", ...);

DPRINTF(GPUDisp, "HSA AQL Kernel Complete with completion " ...

DPRINTF(GPUDisp, "HSA AQL Kernel Complete! No completion " ...

DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
        "grid size (%dx%dx%d) kernarg addr: %#x, completion " ...

DPRINTF(GPUCommandProc, "Extracted code object: %s (num vector regs: %d, "
        "num scalar regs: %d, code addr: %#x, kernarg size: %d, " ...

// GPUCommandProcessor::updateHsaSignalAsync(): read the signal's mailbox
// word first; the callback continues in updateHsaMailboxData().
uint64_t *mailboxValue = new uint64_t;
auto cb2 = new DmaVirtCallback<uint64_t>(
    [ = ] (const uint64_t &) { ... });
dmaReadVirt(mailbox_addr, sizeof(uint64_t), cb2, (void *)mailboxValue);
DPRINTF(GPUCommandProc, "updateHsaSignalAsync reading mailbox addr %lx\n",
        mailbox_addr);

void
GPUCommandProcessor::updateHsaMailboxData(Addr signal_handle,
                                          uint64_t *mailbox_value)
{
    ...
    DPRINTF(GPUCommandProc, "updateHsaMailboxData read %ld\n",
            *mailbox_value);

    if (*mailbox_value != 0) {
        // Non-zero mailbox: an interrupt-style signal. Read the event word
        // next; the callback continues in updateHsaEventData().
        auto cb = new DmaVirtCallback<uint64_t>(
            [ = ] (const uint64_t &) { ... });
        dmaReadVirt(event_addr, sizeof(uint64_t), cb, (void *)mailbox_value);
    } else {
        delete mailbox_value;
        ...
        ... new DmaVirtCallback<uint64_t>(
            [ = ] (const uint64_t &) { ... });
        DPRINTF(GPUCommandProc,
                "updateHsaMailboxData reading timestamp addr " ...

void
GPUCommandProcessor::updateHsaEventData(Addr signal_handle,
                                        uint64_t *event_value)
{
    ...
    DPRINTF(GPUCommandProc, "updateHsaEventData read %ld\n", *event_value);

    // Write back to the signal's mailbox, then touch the timestamp field.
    auto cb = new DmaVirtCallback<uint64_t>(
        [ = ] (const uint64_t &) { ... });
    dmaWriteVirt(mailbox_addr, sizeof(uint64_t), cb, &cb->dmaBuffer, 0);
    ...
    ... new DmaVirtCallback<uint64_t>(
        [ = ] (const uint64_t &) { ... });
    DPRINTF(GPUCommandProc,
            "updateHsaEventData reading timestamp addr %lx\n", ...);

// Read the signal's current value; the callback continues in
// updateHsaSignalData() with the buffer just filled in.
uint64_t *signalValue = new uint64_t;
auto cb = new DmaVirtCallback<uint64_t>(
    [ = ] (const uint64_t &) { ... });
dmaReadVirt(value_addr, sizeof(uint64_t), cb, (void *)signalValue);
DPRINTF(GPUCommandProc, "updateHsaSignalAsync reading value addr %lx\n",
        value_addr);

void
GPUCommandProcessor::updateHsaSignalData(Addr value_addr, int64_t diff,
                                         uint64_t *prev_value)
{
    // Apply the diff to the value just read and write it back.
    DPRINTF(GPUCommandProc, "updateHsaSignalData read %ld, writing %ld\n",
            *prev_value, *prev_value + diff);
    ...
    auto cb = new DmaVirtCallback<uint64_t>(
        [ = ] (const uint64_t &) { ... });
    dmaWriteVirt(value_addr, sizeof(uint64_t), cb, (void *)prev_value);
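
The value update above is a read-modify-write split across two asynchronous steps: read the current signal value, then write back the value plus diff. A minimal sketch of that flow; AsyncMem and updateSignal are hypothetical stand-ins that complete synchronously for illustration:

    // Sketch only: chained read-then-write update of a signal value.
    #include <cstdint>
    #include <cstdio>
    #include <functional>
    #include <map>

    class AsyncMem
    {
      public:
        void read(uint64_t addr, std::function<void(uint64_t)> cb)
        {
            cb(mem[addr]);   // completes "immediately" in this sketch
        }

        void write(uint64_t addr, uint64_t val, std::function<void()> cb)
        {
            mem[addr] = val;
            cb();
        }

        std::map<uint64_t, uint64_t> mem;
    };

    // Decrement-style update: completion of the read triggers the write.
    void updateSignal(AsyncMem &m, uint64_t value_addr, int64_t diff,
                      std::function<void(uint64_t)> done)
    {
        m.read(value_addr, [&m, value_addr, diff, done](uint64_t prev) {
            m.write(value_addr, prev + diff, [prev, done]() { done(prev); });
        });
    }

    int main()
    {
        AsyncMem m;
        m.mem[0x1000] = 1;   // one kernel outstanding on this signal

        updateSignal(m, 0x1000, -1, [](uint64_t prev) {
            std::printf("signal was %llu, now %llu\n",
                        (unsigned long long)prev,
                        (unsigned long long)(prev - 1));
        });
    }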

DPRINTF(GPUCommandProc, "Triggering completion signal: %x!\n", value_addr);

// If the signal has a mailbox armed, wake the driver event associated
// with it.
if (*mailbox_ptr != 0) {
    ...
    DPRINTF(GPUCommandProc, "Calling signal wakeup event on "
            "signal event value %d\n", *event_val);

// GPUCommandProcessor::submitVendorPkt(): vendor-specific packets are not
// interpreted beyond servicing their completion signal.
if (vendor_pkt->completion_signal) {
    ...
}

warn("Ignoring vendor packet\n");

hsaPP->finishPkt(raw_pkt, queue_id);

// GPUCommandProcessor::submitAgentDispatchPkt(): an agent dispatch packet is
// either a NOP or a request to hand back ("steal") a kernel's completion
// signal handle through the packet's return address.
DPRINTF(GPUCommandProc, "Agent Dispatch Packet NOP\n");
...
int kid = agent_pkt->arg[0];
DPRINTF(GPUCommandProc,
        "Agent Dispatch Packet Stealing signal handle for kernel %d\n", kid);
...
DPRINTF(GPUCommandProc, "Return Addr: %p\n", return_address);
...
*new_signal_addr = (Addr)signal_addr;
DPRINTF(GPUCommandProc,
        "Agent Dispatch Packet Stealing signal handle from kid %d :"
        "(%x:%x) writing into %x\n",
        kid, signal_addr, new_signal_addr, return_address);
...
panic("The agent dispatch packet provided an unknown argument in"
      "arg[0],currently only 0(nop) or 1(return kernel signal) is accepted");

hsaPP->finishPkt(raw_pkt, queue_id);
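
A minimal sketch of the agent-dispatch decode shown above: one command is a NOP, the other hands a kernel's completion-signal handle back through the packet's return address, and anything else is fatal. The struct layout and names here are simplified stand-ins, not the gem5 packet definition:

    // Sketch only: decoding a simplified agent dispatch packet.
    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>
    #include <map>

    struct AgentPacket
    {
        uint64_t type;              // 0 = NOP, 1 = return a signal handle
        uint64_t arg[4];            // arg[0]: kernel id for the request
        uint64_t *return_address;   // where the requested handle is written
    };

    std::map<int, uint64_t> kernelSignals = {{7, 0xdead1000}};

    void handleAgentPacket(AgentPacket &pkt)
    {
        switch (pkt.type) {
          case 0:
            // NOP: nothing to do, the packet is simply retired.
            break;
          case 1: {
            int kid = static_cast<int>(pkt.arg[0]);
            *pkt.return_address = kernelSignals.at(kid);
            break;
          }
          default:
            std::fprintf(stderr, "unknown agent dispatch command\n");
            std::abort();
        }
    }

    int main()
    {
        uint64_t result = 0;
        AgentPacket pkt{1, {7, 0, 0, 0}, &result};
        handleAgentPacket(pkt);
        std::printf("stolen signal handle: %#llx\n",
                    (unsigned long long)result);
    }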

_driver->signalWakeupEvent(event_id);

// Kernarg preload: the first bytes of the kernarg segment are fetched so
// they can be preloaded into user SGPRs at dispatch time.
DPRINTF(GPUCommandProc, "Kernarg preload starts at addr: %#x\n", ...);

bool is_system_page = true;
Addr phys_addr = preload_addr;
...
DPRINTF(GPUCommandProc, "Kernarg preload data is in %s memory\n",
        is_system_page ? "host" : "device");

if (is_system_page) {
    ...
    warn("Preload kernarg from host untested!\n");
    ... new DmaVirtCallback<uint32_t>(
        [ = ] (const uint32_t &) { ... });
} else {
    // Device memory: read the preload region in aligned chunks, exactly as
    // for the kernel code descriptor above.
    constexpr unsigned alignment_granularity = 64;
    for (ChunkGenerator gen(..., alignment_granularity);
         !gen.done(); gen.next()) {
        ... gpuDevice->getVM().getPageTableBase(vmid), chunk_addr, ...

        RequestPtr request = std::make_shared<Request>(chunk_addr,
            alignment_granularity, flags, walker->getDevRequestor());
        ...
        dispatchData.akc = akc;
        dispatchData.task = task;
        dispatchData.readPkt = readPkt;

        performTimingRead(readPkt,
            ComputeUnit::SQCPort::SenderState::DISPATCH_PRELOAD_ARG);
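
Both device-memory paths above (kernel descriptor and kernarg preload) iterate over the buffer with a ChunkGenerator so that no single read crosses the alignment granularity. A minimal re-implementation of that iteration, shown here only to illustrate the chunking arithmetic (ChunkIter is not the gem5 class):

    // Sketch only: splitting a region into alignment-bounded chunks.
    #include <cstdint>
    #include <cstdio>

    class ChunkIter
    {
      public:
        ChunkIter(uint64_t base, uint64_t len, uint64_t chunk)
            : cur(base), end(base + len), chunkSize(chunk) {}

        bool done() const { return cur >= end; }
        uint64_t addr() const { return cur; }

        // Size of the current chunk, clipped so we never run past the region
        // and never cross a chunk-alignment boundary.
        uint64_t size() const
        {
            uint64_t toBoundary = chunkSize - (cur % chunkSize);
            uint64_t toEnd = end - cur;
            return toBoundary < toEnd ? toBoundary : toEnd;
        }

        void next() { cur += size(); }

      private:
        uint64_t cur, end, chunkSize;
    };

    int main()
    {
        // Walk a 200-byte region starting 16 bytes past a 64-byte boundary.
        for (ChunkIter gen(0x1010, 200, 64); !gen.done(); gen.next())
            std::printf("read chunk at %#llx, %llu bytes\n",
                        (unsigned long long)gen.addr(),
                        (unsigned long long)gen.size());
    }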

for (int i = 0; i < num_sgprs; ++i) {
    DPRINTF(GPUCommandProc, "Task preload user SGPR[%d] = %x\n", ...);

// GPUCommandProcessor::initABI(): ABI setup starts with a DMA read of the
// read_dispatch_id_field_base_byte_offset field, which sits just past the
// queue's host read index pointer; the callback continues in
// ReadDispIdOffsetDmaEvent().
... new DmaVirtCallback<uint32_t>(
    [ = ] (const uint32_t &readDispIdOffset) { ... });

Addr hostReadIdxPtr
    = hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr;

dmaReadVirt(hostReadIdxPtr + sizeof(hostReadIdxPtr), ...
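
The read above is the first of a pair of dependent reads: the offset fetched here determines the address of the next read. A minimal sketch of that dependent-read chaining; QueueMem and its methods are hypothetical stand-ins that complete synchronously:

    // Sketch only: second read's address depends on the first read's result.
    #include <cstdint>
    #include <cstdio>
    #include <functional>
    #include <map>

    struct QueueMem
    {
        void read32(uint64_t addr, std::function<void(uint32_t)> cb)
        {
            cb(static_cast<uint32_t>(words.at(addr)));
        }
        void read64(uint64_t addr, std::function<void(uint64_t)> cb)
        {
            cb(words.at(addr));
        }
        std::map<uint64_t, uint64_t> words;
    };

    int main()
    {
        QueueMem mem;
        mem.words[0x2008] = 0x40;          // offset field after the base
        mem.words[0x2000 + 0x40] = 1234;   // the value we actually want

        uint64_t queue_base = 0x2000;

        // First read fetches the offset; the second uses it as an address.
        mem.read32(queue_base + 8, [&](uint32_t offset) {
            mem.read64(queue_base + offset, [](uint64_t disp_id) {
                std::printf("dispatch id = %llu\n",
                            (unsigned long long)disp_id);
            });
        });
    }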

// Dump of the AMD kernel code descriptor (AMDKernelCode) fields read from
// memory, printed before the descriptor is sanity-checked.
DPRINTF(GPUInitAbi, "group_segment_fixed_size: %d\n",
        akc->group_segment_fixed_size);
DPRINTF(GPUInitAbi, "private_segment_fixed_size: %d\n",
        akc->private_segment_fixed_size);
DPRINTF(GPUInitAbi, "kernel_code_entry_byte_offset: %d\n",
        akc->kernel_code_entry_byte_offset);
DPRINTF(GPUInitAbi, "granulated_workitem_vgpr_count: %d\n",
        akc->granulated_workitem_vgpr_count);
DPRINTF(GPUInitAbi, "granulated_wavefront_sgpr_count: %d\n",
        akc->granulated_wavefront_sgpr_count);
DPRINTF(GPUInitAbi, "float_mode_round_16_64: %d\n",
        akc->float_mode_round_16_64);
DPRINTF(GPUInitAbi, "float_mode_denorm_32: %d\n",
        akc->float_mode_denorm_32);
DPRINTF(GPUInitAbi, "float_mode_denorm_16_64: %d\n",
        akc->float_mode_denorm_16_64);
...
DPRINTF(GPUInitAbi, "enable_private_segment: %d\n",
        akc->enable_private_segment);
DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_x: %d\n",
        akc->enable_sgpr_workgroup_id_x);
DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_y: %d\n",
        akc->enable_sgpr_workgroup_id_y);
DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_z: %d\n",
        akc->enable_sgpr_workgroup_id_z);
DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_info: %d\n",
        akc->enable_sgpr_workgroup_info);
DPRINTF(GPUInitAbi, "enable_vgpr_workitem_id: %d\n",
        akc->enable_vgpr_workitem_id);
DPRINTF(GPUInitAbi, "enable_exception_address_watch: %d\n",
        akc->enable_exception_address_watch);
DPRINTF(GPUInitAbi, "enable_exception_memory: %d\n",
        akc->enable_exception_memory);
DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_invalid_operation: %d\n",
        akc->enable_exception_ieee_754_fp_invalid_operation);
DPRINTF(GPUInitAbi, "enable_exception_fp_denormal_source: %d\n",
        akc->enable_exception_fp_denormal_source);
DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_division_by_zero: %d\n",
        akc->enable_exception_ieee_754_fp_division_by_zero);
DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_overflow: %d\n",
        akc->enable_exception_ieee_754_fp_overflow);
DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_underflow: %d\n",
        akc->enable_exception_ieee_754_fp_underflow);
DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_inexact: %d\n",
        akc->enable_exception_ieee_754_fp_inexact);
DPRINTF(GPUInitAbi, "enable_exception_int_divide_by_zero: %d\n",
        akc->enable_exception_int_divide_by_zero);
DPRINTF(GPUInitAbi, "enable_sgpr_private_segment_buffer: %d\n",
        akc->enable_sgpr_private_segment_buffer);
DPRINTF(GPUInitAbi, "enable_sgpr_dispatch_ptr: %d\n",
        akc->enable_sgpr_dispatch_ptr);
DPRINTF(GPUInitAbi, "enable_sgpr_queue_ptr: %d\n",
        akc->enable_sgpr_queue_ptr);
DPRINTF(GPUInitAbi, "enable_sgpr_kernarg_segment_ptr: %d\n",
        akc->enable_sgpr_kernarg_segment_ptr);
DPRINTF(GPUInitAbi, "enable_sgpr_dispatch_id: %d\n",
        akc->enable_sgpr_dispatch_id);
DPRINTF(GPUInitAbi, "enable_sgpr_flat_scratch_init: %d\n",
        akc->enable_sgpr_flat_scratch_init);
DPRINTF(GPUInitAbi, "enable_sgpr_private_segment_size: %d\n",
        akc->enable_sgpr_private_segment_size);
DPRINTF(GPUInitAbi, "enable_wavefront_size32: %d\n",
        akc->enable_wavefront_size32);
...
DPRINTF(GPUInitAbi, "kernarg_preload_spec_length: %d\n",
        akc->kernarg_preload_spec_length);
DPRINTF(GPUInitAbi, "kernarg_preload_spec_offset: %d\n",
        akc->kernarg_preload_spec_offset);