38#include "debug/GPUCommandProc.hh"
39#include "debug/GPUDisp.hh"
40#include "debug/GPUInitAbi.hh"
41#include "debug/GPUKernelInfo.hh"
49#include "params/GPUCommandProcessor.hh"
61 walker(
p.walker), hsaPP(
p.hsapp),
62 target_non_blit_kernel_id(
p.target_non_blit_kernel_id)
91 auto process =
sys->
threads[0]->getProcessPtr();
93 return process->pTable->translateRange(
vaddr, size);
127 unsigned akc_alignment_granularity = 64;
128 assert(!(disp_pkt->
kernel_object & (akc_alignment_granularity - 1)));
137 if (
shader()->getNumOutstandingInvL2s() > 0) {
139 "Deferring kernel launch due to outstanding L2 invalidates\n");
173 DPRINTF(GPUCommandProc,
"reading kernel_object using proxy\n");
185 bool is_system_page =
true;
200 DPRINTF(GPUCommandProc,
"kernel_object vaddr %#lx paddr %#lx size %d"
208 if (is_system_page) {
210 "sending system DMA read for kernel_object\n");
213 [=](
const uint32_t&) {
218 dma_callback, (
void *)akc);
221 "kernel_object in device, using device mem\n");
226 akc_alignment_granularity);
236 RequestPtr request = std::make_shared<Request>(chunk_addr,
237 akc_alignment_granularity,
flags,
243 assert(
system()->getDeviceMemory(readPkt) !=
nullptr);
255 uint32_t queue_id,
Addr host_pkt_addr)
261 DPRINTF(GPUCommandProc,
"GPU machine code is %lli bytes from start of the "
267 DPRINTF(GPUCommandProc,
"Machine code starts at addr: %#x\n",
270 std::string kernel_name;
283 kernel_name =
"Some kernel";
284 is_blit_kernel =
false;
286 kernel_name =
"Blit kernel";
287 is_blit_kernel =
true;
290 DPRINTF(GPUKernelInfo,
"Kernel name: %s\n", kernel_name.c_str());
304 DPRINTF(GPUCommandProc,
"Skipping non-blit kernel %i (Task ID: %i)\n",
310 DPRINTF(GPUDisp,
"HSA AQL Kernel Complete with completion "
315 DPRINTF(GPUDisp,
"HSA AQL Kernel Complete! No completion "
330 DPRINTF(GPUCommandProc,
"Task ID: %i Got AQL: wg size (%dx%dx%d), "
331 "grid size (%dx%dx%d) kernarg addr: %#x, completion "
338 DPRINTF(GPUCommandProc,
"Extracted code object: %s (num vector regs: %d, "
339 "num scalar regs: %d, code addr: %#x, kernarg size: %d, "
381 uint64_t *mailboxValue =
new uint64_t;
383 [ = ] (
const uint64_t &)
385 dmaReadVirt(mailbox_addr,
sizeof(uint64_t), cb2, (
void *)mailboxValue);
386 DPRINTF(GPUCommandProc,
"updateHsaSignalAsync reading mailbox addr %lx\n",
392 uint64_t *mailbox_value)
396 DPRINTF(GPUCommandProc,
"updateHsaMailboxData read %ld\n", *mailbox_value);
397 if (*mailbox_value != 0) {
402 [ = ] (
const uint64_t &)
404 dmaReadVirt(event_addr,
sizeof(uint64_t), cb, (
void *)mailbox_value);
406 delete mailbox_value;
414 [ = ] (
const uint64_t &)
417 DPRINTF(GPUCommandProc,
"updateHsaMailboxData reading timestamp addr "
426 uint64_t *event_value)
430 DPRINTF(GPUCommandProc,
"updateHsaEventData read %ld\n", *event_value);
433 [ = ] (
const uint64_t &)
435 dmaWriteVirt(mailbox_addr,
sizeof(uint64_t), cb, &cb->dmaBuffer, 0);
443 [ = ] (
const uint64_t &)
446 DPRINTF(GPUCommandProc,
"updateHsaEventData reading timestamp addr %lx\n",
461 uint64_t *signalValue =
new uint64_t;
463 [ = ] (
const uint64_t &)
465 dmaReadVirt(value_addr,
sizeof(uint64_t), cb, (
void *)signalValue);
466 DPRINTF(GPUCommandProc,
"updateHsaSignalAsync reading value addr %lx\n",
472 uint64_t *prev_value)
475 DPRINTF(GPUCommandProc,
"updateHsaSignalData read %ld, writing %ld\n",
476 *prev_value, *prev_value + diff);
479 [ = ] (
const uint64_t &)
481 dmaWriteVirt(value_addr,
sizeof(uint64_t), cb, (
void *)prev_value);
508 DPRINTF(GPUCommandProc,
"Triggering completion signal: %x!\n", value_addr);
522 if (*mailbox_ptr != 0) {
528 DPRINTF(GPUCommandProc,
"Calling signal wakeup event on "
529 "signal event value %d\n", *event_val);
584 if (vendor_pkt->completion_signal) {
588 warn(
"Ignoring vendor packet\n");
609 DPRINTF(GPUCommandProc,
"Agent Dispatch Packet NOP\n");
612 int kid = agent_pkt->
arg[0];
614 "Agent Dispatch Packet Stealing signal handle for kernel %d\n",
621 DPRINTF(GPUCommandProc,
"Return Addr: %p\n",return_address);
624 *new_signal_addr = (
Addr)signal_addr;
628 "Agent Dispatch Packet Stealing signal handle from kid %d :" \
629 "(%x:%x) writing into %x\n",
630 kid,signal_addr,new_signal_addr,return_address);
634 panic(
"The agent dispatch packet provided an unknown argument in" \
635 "arg[0],currently only 0(nop) or 1(return kernel signal) is accepted");
669 [ = ] (
const uint32_t &readDispIdOffset)
675 dmaReadVirt(hostReadIdxPtr +
sizeof(hostReadIdxPtr),
682 DPRINTF(GPUInitAbi,
"group_segment_fixed_size: %d\n",
684 DPRINTF(GPUInitAbi,
"private_segment_fixed_size: %d\n",
687 DPRINTF(GPUInitAbi,
"kernel_code_entry_byte_offset: %d\n",
691 DPRINTF(GPUInitAbi,
"granulated_workitem_vgpr_count: %d\n",
693 DPRINTF(GPUInitAbi,
"granulated_wavefront_sgpr_count: %d\n",
697 DPRINTF(GPUInitAbi,
"float_mode_round_16_64: %d\n",
699 DPRINTF(GPUInitAbi,
"float_mode_denorm_32: %d\n",
701 DPRINTF(GPUInitAbi,
"float_mode_denorm_16_64: %d\n",
713 DPRINTF(GPUInitAbi,
"enable_private_segment: %d\n",
717 DPRINTF(GPUInitAbi,
"enable_sgpr_workgroup_id_x: %d\n",
719 DPRINTF(GPUInitAbi,
"enable_sgpr_workgroup_id_y: %d\n",
721 DPRINTF(GPUInitAbi,
"enable_sgpr_workgroup_id_z: %d\n",
723 DPRINTF(GPUInitAbi,
"enable_sgpr_workgroup_info: %d\n",
725 DPRINTF(GPUInitAbi,
"enable_vgpr_workitem_id: %d\n",
727 DPRINTF(GPUInitAbi,
"enable_exception_address_watch: %d\n",
729 DPRINTF(GPUInitAbi,
"enable_exception_memory: %d\n",
732 DPRINTF(GPUInitAbi,
"enable_exception_ieee_754_fp_invalid_operation: %d\n",
734 DPRINTF(GPUInitAbi,
"enable_exception_fp_denormal_source: %d\n",
736 DPRINTF(GPUInitAbi,
"enable_exception_ieee_754_fp_division_by_zero: %d\n",
738 DPRINTF(GPUInitAbi,
"enable_exception_ieee_754_fp_overflow: %d\n",
740 DPRINTF(GPUInitAbi,
"enable_exception_ieee_754_fp_underflow: %d\n",
742 DPRINTF(GPUInitAbi,
"enable_exception_ieee_754_fp_inexact: %d\n",
744 DPRINTF(GPUInitAbi,
"enable_exception_int_divide_by_zero: %d\n",
746 DPRINTF(GPUInitAbi,
"enable_sgpr_private_segment_buffer: %d\n",
748 DPRINTF(GPUInitAbi,
"enable_sgpr_dispatch_ptr: %d\n",
750 DPRINTF(GPUInitAbi,
"enable_sgpr_queue_ptr: %d\n",
752 DPRINTF(GPUInitAbi,
"enable_sgpr_kernarg_segment_ptr: %d\n",
754 DPRINTF(GPUInitAbi,
"enable_sgpr_dispatch_id: %d\n",
756 DPRINTF(GPUInitAbi,
"enable_sgpr_flat_scratch_init: %d\n",
758 DPRINTF(GPUInitAbi,
"enable_sgpr_private_segment_size: %d\n",
760 DPRINTF(GPUInitAbi,
"enable_wavefront_size32: %d\n",
763 DPRINTF(GPUInitAbi,
"kernarg_preload_spec_length: %d\n",
765 DPRINTF(GPUInitAbi,
"kernarg_preload_spec_offset: %d\n",
782 "Kernarg preload not implemented\n");
AbstractMemory declaration.
Declaration and inline definition of ChunkGenerator object.
Device model for an AMD GPU.
GfxVersion getGfxVersion() const
RequestorID vramRequestorId()
Methods related to translations and system/device memory.
Addr getPageTableBase(uint16_t vmid)
This class takes an arbitrary memory region (address/length pair) and generates a series of appropria...
Wraps a std::function object in a DmaCallback.
void dmaReadVirt(Addr host_addr, unsigned size, DmaCallback *cb, void *data, Tick delay=0)
Initiate a DMA read from virtual address host_addr.
void dmaWriteVirt(Addr host_addr, unsigned size, DmaCallback *b, void *data, Tick delay=0)
Initiate a DMA write from virtual address host_addr.
void sendCompletionSignal(Addr signal_handle)
void submitDispatchPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitDispatchPkt() is the entry point into the CP from the HSAPP and is only meant to be used with A...
void ReadDispIdOffsetDmaEvent(HSAQueueEntry *task, const uint32_t &readDispIdOffset)
Perform a DMA read of the read_dispatch_id_field_base_byte_offset field, which follows directly after...
RequestorID vramRequestorId()
Forward the VRAM requestor ID needed for device memory from GPU device.
GPUComputeDriver * driver()
Addr getHsaSignalMailboxAddr(Addr signal_handle)
GPUCommandProcessor()=delete
void setGPUDevice(AMDGPUDevice *gpu_device)
TranslationGenPtr translate(Addr vaddr, Addr size) override
Function used to translate a range of addresses from virtual to physical addresses.
void signalWakeupEvent(uint32_t event_id)
void updateHsaSignal(Addr signal_handle, uint64_t signal_value, HsaSignalCallbackFunction function=[](const uint64_t &) { })
void updateHsaSignalDone(uint64_t *signal_value)
GPUComputeDriver * _driver
void setShader(Shader *shader)
HSAPacketProcessor & hsaPacketProc()
int target_non_blit_kernel_id
void submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitAgentDispatchPkt() is for accepting agent dispatch packets.
Addr getHsaSignalValueAddr(Addr signal_handle)
void updateHsaEventTs(Addr signal_handle, amd_event_t *event_value)
HSAPacketProcessor * hsaPP
void dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
void attachDriver(GPUComputeDriver *driver)
void initABI(HSAQueueEntry *task)
The CP is responsible for traversing all HSA-ABI-related data structures from memory and initializing...
void updateHsaSignalAsync(Addr signal_handle, int64_t diff)
std::unordered_map< Addr, Tick > dispatchStartTime
Addr getHsaSignalEventAddr(Addr signal_handle)
AddrRangeList getAddrRanges() const override
Every PIO device is obliged to provide an implementation that returns the address ranges the device r...
void submitVendorPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitVendorPkt() is for accepting vendor-specific packets from the HSAPP.
void sanityCheckAKC(AMDKernelCode *akc)
GPUDispatcher & dispatcher
GPUCommandProcessorParams Params
void dispatchPkt(HSAQueueEntry *task)
Once the CP has finished extracting all relevant information about a task and has initialized the ABI...
void updateHsaMailboxData(Addr signal_handle, uint64_t *mailbox_value)
void updateHsaEventData(Addr signal_handle, uint64_t *event_value)
std::function< void(const uint64_t &)> HsaSignalCallbackFunction
uint64_t functionalReadHsaSignal(Addr signal_handle)
void updateHsaSignalData(Addr value_addr, int64_t diff, uint64_t *prev_value)
GfxVersion getGfxVersion() const
virtual void signalWakeupEvent(uint32_t event_id)
void dispatch(HSAQueueEntry *task)
After all relevant HSA data structures have been traversed/extracted from memory by the CP,...
HSAQueueEntry * hsaTask(int disp_id)
void setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc)
HSAQueueDescriptor * getQueueDesc(uint32_t queId)
void finishPkt(void *pkt, uint32_t rl_idx)
void setDevice(GPUCommandProcessor *dev)
int numVectorRegs() const
Addr completionSignal() const
int numScalarRegs() const
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
void readBlob(Addr addr, void *p, uint64_t size) const
Higher level interfaces based on the above.
@ PHYSICAL
The virtual address is also the physical address.
void addDeferredDispatch(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
memory::AbstractMemory * getDeviceMemory(const PacketPtr &pkt) const
Return a pointer to the device memory.
void setDevRequestor(RequestorID mid)
Fault startFunctional(Addr base, Addr vaddr, PageTableEntry &pte, unsigned &logBytes, BaseMMU::Mode mode)
RequestorID getDevRequestor() const
void access(PacketPtr pkt)
Perform an untimed memory access and update all the state (e.g.
The GPUDispatcher is the component of the shader that is responsible for creating and dispatching WGs...
The GPUCommandProcessor (CP) is responsible for accepting commands, in the form of HSA AQL packets,...
Addr addr() const
Return starting address of current chunk.
Addr complete() const
Number of bytes we have already chunked up.
bool done() const
Are we done? That is, did the last call to next() advance past the end of the region?
bool next()
Advance generator to next chunk.
#define panic(...)
This implements a cprintf based panic() function.
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
#define warn_if(cond,...)
Conditional warning macro that checks the supplied condition and only prints a warning if the conditi...
Copyright (c) 2024 - Pranith Kumar Copyright (c) 2020 Inria All rights reserved.
struct gem5::GEM5_PACKED AMDKernelCode
std::shared_ptr< Request > RequestPtr
Tick curTick()
The universal simulation clock.
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
bool FullSystem
The FullSystem variable can be used to determine the current mode of simulation.
uint64_t Tick
Tick count type.
void exitSimLoop(const std::string &message, int exit_code, Tick when, Tick repeat, bool serialize)
Schedule an event to exit the simulation loop (returning to Python) at the end of the current cycle (...
std::unique_ptr< TranslationGen > TranslationGenPtr
uint32_t enable_sgpr_flat_scratch_init
uint32_t enable_sgpr_queue_ptr
uint32_t enable_exception_memory
uint32_t enable_exception_fp_denormal_source
uint32_t enable_exception_address_watch
uint32_t enable_private_segment
uint32_t enable_sgpr_workgroup_id_y
uint32_t enable_exception_ieee_754_fp_inexact
uint32_t enable_ieee_mode
uint32_t group_segment_fixed_size
uint32_t float_mode_round_32
uint32_t enable_exception_int_divide_by_zero
uint32_t granulated_wavefront_sgpr_count
uint32_t granulated_workitem_vgpr_count
uint32_t kernarg_preload_spec_length
uint32_t enable_sgpr_dispatch_ptr
uint32_t use_dynamic_stack
uint32_t granulated_lds_size
uint32_t float_mode_round_16_64
uint32_t float_mode_denorm_16_64
uint32_t float_mode_denorm_32
uint32_t kernarg_preload_spec_offset
uint32_t enable_sgpr_workgroup_id_z
uint32_t enable_sgpr_dispatch_id
uint32_t enable_dx10_clamp
uint32_t enable_exception_ieee_754_fp_overflow
int64_t kernel_code_entry_byte_offset
uint32_t enable_exception_ieee_754_fp_underflow
uint32_t enable_vgpr_workitem_id
uint32_t enable_sgpr_private_segment_size
uint32_t enable_sgpr_kernarg_segment_ptr
uint32_t enable_sgpr_private_segment_buffer
uint32_t enable_sgpr_workgroup_id_x
uint32_t enable_exception_ieee_754_fp_division_by_zero
uint32_t private_segment_fixed_size
uint32_t enable_trap_handler
uint32_t enable_sgpr_workgroup_info
uint32_t enable_exception_ieee_754_fp_invalid_operation
uint32_t enable_wavefront_size32
uint64_t completion_signal
uint16_t workgroup_size_y
uint16_t workgroup_size_z
uint16_t workgroup_size_x
This file defines buffer classes used to handle pointer arguments in emulated syscalls.