41#include "debug/HSAPacketProcessor.hh"
46#include "enums/GfxVersion.hh"
55#define HSAPP_EVENT_DESCRIPTION_GENERATOR(XEVENT) \
57 HSAPacketProcessor::XEVENT::description() const \
62#define PKT_TYPE(PKT) ((hsa_packet_type_t)(((PKT->header) >> \
63 HSA_PACKET_HEADER_TYPE) & mask(HSA_PACKET_HEADER_WIDTH_TYPE)))
67#define IS_BARRIER(PKT) ((hsa_packet_header_t)(((PKT->header) >> \
68 HSA_PACKET_HEADER_BARRIER) & \
69 mask(HSA_PACKET_HEADER_WIDTH_BARRIER)))
78 numHWQueues(
p.numHWQueues), pioAddr(
p.pioAddr),
79 pioSize(
PAGE_SIZE), pioDelay(10), pktProcessDelay(
p.pktProcessDelay)
83 regdQList.resize(numHWQueues);
84 for (
int i = 0;
i < numHWQueues;
i++) {
113 uint64_t basePointer,
115 uint32_t size,
int doorbellSize,
116 GfxVersion gfxVersion,
120 "%s:base = %p, qID = %d, ze = %d\n", __FUNCTION__,
121 (
void *)basePointer, queue_id, size);
123 basePointer, queue_id, size, doorbellSize,
124 gfxVersion,
offset, rd_idx);
148 "%s: write of size %d to reg-offset %d (0x%x)\n",
149 __FUNCTION__, pkt->
getSize(), daddr, daddr);
153 uint64_t doorbell_reg(0);
155 doorbell_reg = pkt->
getLE<uint64_t>() + 1;
157 doorbell_reg = pkt->
getLE<uint32_t>();
159 fatal(
"invalid db size");
162 "%s: write data 0x%x to offset %d (0x%x)\n",
163 __FUNCTION__, doorbell_reg, daddr, daddr);
184 auto process =
sys->
threads[0]->getProcessPtr();
186 return process->pTable->translateRange(
vaddr, size);
216 "%s: read-pointer offset [0x%x]\n", __FUNCTION__, aqlbuf->
rdIdx());
222 "%s: rd-ptr offset [0x%x], wr-ptr offset [0x%x], space used = %d," \
223 " q size = %d, is_empty = %s, active list ID = %d\n", __FUNCTION__,
233 bool isRead, uint32_t ix_start,
unsigned num_pkts,
236 uint32_t rl_idx = series_ctx->
rl_idx;
242 " pktsRemaining = %d, active list ID = %d\n", __FUNCTION__,
243 ix_start, num_pkts, series_ctx->
pkts_2_go,
251 "%s: schedule Qwakeup next cycle, rdIdx %d, wrIdx %d," \
252 " dispIdx %d, active list ID = %d\n",
253 __FUNCTION__, aqlRingBuffer->
rdIdx(),
254 aqlRingBuffer->
wrIdx(), aqlRingBuffer->
dispIdx(), rl_idx);
297 regdQList[rl_idx]->compltnPending() > 0) {
302 " list ID = %d\n", __FUNCTION__, rl_idx);
308 " active list ID = %d\n", __FUNCTION__, rl_idx);
314 " active list ID = %d\n", __FUNCTION__, rl_idx);
325 " list ID = %d\n", __FUNCTION__, rl_idx);
330 " active list ID = %d\n", __FUNCTION__, rl_idx);
337 if (bar_and_pkt->dep_signal[
i]) {
340 uint64_t signal_addr =
341 (uint64_t) (((uint64_t *) bar_and_pkt->dep_signal[
i]) + 1);
345 " , sig addr %x, value %d active list ID = %d\n",
346 __FUNCTION__,
i, signal_addr,
347 *signal_val, rl_idx);
351 if (*signal_val != 0) {
356 [ = ] (
const uint32_t &dma_data)
362 " active list %d\n", __FUNCTION__,
369 [ = ] (
const uint32_t &dma_data)
375 " active list %d\n", __FUNCTION__,
383 " active list ID = %d\n", __FUNCTION__, rl_idx);
391 if (bar_and_pkt->completion_signal != 0) {
397 " completion signal! Addr: %x\n",
398 bar_and_pkt->completion_signal);
401 bar_and_pkt->completion_signal);
406 dep_sgnl_rd_st->
allRead =
false;
410 fatal(
"Unsupported packet type HSA_PACKET_TYPE_BARRIER_OR");
412 fatal(
"Unsupported packet type HSA_PACKET_TYPE_INVALID");
415 " active list ID = %d\n", __FUNCTION__, rl_idx);
418 (
void *)disp_pkt, rl_idx, host_pkt_addr);
422 fatal(
"Unsupported packet type %d\n", pkt_type);
435 "%s: Qwakeup , rdIdx %d, wrIdx %d," \
436 " dispIdx %d, active list ID = %d\n",
437 __FUNCTION__, aqlRingBuffer->
rdIdx(),
443 "Dummy wakeup with barrier bit for rdIdx %d\n",
rqIdx);
451 void *pkt = aqlRingBuffer->
ptr(aqlRingBuffer->
dispIdx());
453 __FUNCTION__, aqlRingBuffer->
dispIdx());
459 __FUNCTION__, aqlRingBuffer->
dispIdx());
477 panic(
"Unknown queue state\n");
485 assert(pendingReads > 0);
487 if (pendingReads == 0) {
502 "%s: read-pointer offset[0x%x], write-pointer offset[0x%x]"
503 " doorbell(%d)[0x%x] \n",
516 uint32_t ttl_aql_buf = aqlRingBuffer->
numObjs();
519 uint32_t got_aql_buf = aqlRingBuffer->
allocEntry(num_umq);
521 uint32_t dma_start_ix = (aqlRingBuffer->
wrIdx() - got_aql_buf) %
526 "dma_start_ix = %d, num_umq = %d\n", __FUNCTION__, umq_nxt,
527 ttl_aql_buf, dma_start_ix, num_umq);
529 if (got_aql_buf == 0) {
537 uint32_t dma_b4_wrap = ttl_aql_buf - dma_start_ix;
538 while (got_aql_buf != 0 && num_umq != 0) {
539 uint32_t umq_b4_wrap = qDesc->
numObjs() -
542 = std::min({umq_b4_wrap, dma_b4_wrap, num_umq, got_aql_buf});
546 dma_start_ix, rl_idx);
549 void *aql_buf = aqlRingBuffer->
ptr(dma_start_ix);
551 [ = ] (
const uint32_t &dma_data)
553 num_2_xfer, series_ctx, aql_buf); }, 0);
561 "%s: aql_buf = %p, umq_nxt = %d, dma_ix = %d, num2xfer = %d\n",
562 __FUNCTION__, aql_buf, umq_nxt, dma_start_ix, num_2_xfer);
564 num_umq -= num_2_xfer;
565 got_aql_buf -= num_2_xfer;
566 dma_start_ix = (dma_start_ix + num_2_xfer) % ttl_aql_buf;
567 umq_nxt = (umq_nxt + num_2_xfer) % qDesc->
numObjs();
568 if (got_aql_buf == 0 && num_umq != 0) {
583 "%s: pid[%d], basePointer[0x%lx], dBPointer[0x%lx], "
584 "writeIndex[0x%x], readIndex[0x%x], size(bytes)[0x%x]\n",
591 const std::string
name)
592 :
_name(
name), _wrIdx(0), _rdIdx(0), _dispIdx(0)
626 " # free entries = %d, wrIdx = %d, rdIdx = %d\n", __FUNCTION__,
632 uint32_t old_rdIdx =
rdIdx();
638 return (old_rdIdx !=
rdIdx());
656 if (nBufReq >
nFree())
662 __FUNCTION__, nBufReq,
wrIdx());
674 if (
regdQList[rl_idx]->getBarrierBit() &&
675 regdQList[rl_idx]->isLastOutstandingPkt()) {
677 "Unset barrier bit for active list ID %d\n", rl_idx);
682 "Rescheduling active list ID %d after unsetting barrier "
689 if (
regdQList[rl_idx]->qCntxt.aqlBuf->freeEntry(pvPkt))
692 "%s: rd-ptr offset [0x%x], wr-ptr offset [0x%x], space used = %d," \
693 " q size = %d, stalled = %s, empty = %s, active list ID = %d\n",
697 qDesc->
isEmpty()?
"true" :
"false", rl_idx);
713 uint64_t signal_addr =
714 (uint64_t) (((uint64_t *)agent_pkt->completion_signal) + 1);
716 " completion signal: %x!\n", signal_addr);
741 uint64_t signal_addr = (uint64_t) (((uint64_t *)signal) + 1);
Declaration and inline definition of ChunkGenerator object.
Device model for an AMD GPU.
RequestorID vramRequestorId()
Methods related to translations and system/device memory.
Internal ring buffer which is used to prefetch/store copies of the in-memory HSA ring buffer.
void setRdIdx(uint64_t value)
std::vector< bool > _aqlComplete
int allocEntry(uint32_t nBufReq)
void incDispIdx(uint64_t value)
void setDispIdx(uint64_t value)
void saveHostDispAddr(Addr host_pkt_addr, int num_pkts, int ix)
the kernel may try to read from the dispatch packet, so we need to keep the host address that corresp...
void setWrIdx(uint64_t value)
Addr hostDispAddr() const
AQLRingBuffer(uint32_t size, const std::string name)
std::vector< hsa_kernel_dispatch_packet_t > _aqlBuf
void incWrIdx(uint64_t value)
std::vector< Addr > _hostDispAddresses
bool freeEntry(void *pkt)
void incRdIdx(uint64_t value)
Wraps a std::function object in a DmaCallback.
void dmaReadVirt(Addr host_addr, unsigned size, DmaCallback *cb, void *data, Tick delay=0)
Initiate a DMA read from virtual address host_addr.
void dmaWriteVirt(Addr host_addr, unsigned size, DmaCallback *b, void *data, Tick delay=0)
Initiate a DMA write from virtual address host_addr.
void sendCompletionSignal(Addr signal_handle)
void submitDispatchPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitDispatchPkt() is the entry point into the CP from the HSAPP and is only meant to be used with A...
GPUComputeDriver * driver()
void submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitAgentDispatchPkt() is for accepting agent dispatch packets.
void submitVendorPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitVendorPkt() is for accepting vendor-specific packets from the HSAPP.
HSAPacketProcessor * hsaPP
QueueProcessEvent aqlProcessEvent
std::vector< hsa_signal_value_t > values
void sendAgentDispatchCompletionSignal(void *pkt, hsa_signal_value_t signal)
std::vector< class RQLEntry * > regdQList
void updateReadIndex(int, uint32_t)
virtual Tick write(Packet *) override
void cmdQueueCmdDma(HSAPacketProcessor *hsaPP, int pid, bool isRead, uint32_t ix_start, unsigned num_pkts, dma_series_ctx *series_ctx, void *dest_4debug)
void sendCompletionSignal(hsa_signal_value_t signal)
GPUCommandProcessor * gpu_device
void updateReadDispIdDma()
this event is used to update the read_disp_id field (the read pointer) of the MQD,...
void setGPUDevice(AMDGPUDevice *gpu_device)
HSAPacketProcessorParams Params
void getCommandsFromHost(int pid, uint32_t rl_idx)
TranslationGenPtr translate(Addr vaddr, Addr size) override
Function used to translate a range of addresses from virtual to physical addresses.
void setDeviceQueueDesc(uint64_t hostReadIndexPointer, uint64_t basePointer, uint64_t queue_id, uint32_t size, int doorbellSize, GfxVersion gfxVersion, Addr offset=0, uint64_t rd_idx=0)
void displayQueueDescriptor(int pid, uint32_t rl_idx)
Q_STATE processPkt(void *pkt, uint32_t rl_idx, Addr host_pkt_addr)
void finishPkt(void *pkt, uint32_t rl_idx)
virtual AddrRangeList getAddrRanges() const override
Every PIO device is obliged to provide an implementation that returns the address ranges the device r...
void unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize)
void schedAQLProcessing(uint32_t rl_idx)
void setDevice(GPUCommandProcessor *dev)
const Tick pktProcessDelay
virtual Tick read(Packet *) override
uint64_t ptr(uint64_t ix)
uint64_t hostReadIndexPtr
bool stalledOnDmaBufAvailability
void unregisterQueue(uint64_t queue_id, int doorbellSize)
void registerNewQueue(uint64_t hostReadIndexPointer, uint64_t basePointer, uint64_t queue_id, uint32_t size, int doorbellSize, GfxVersion gfxVersion, Addr offset=0, uint64_t rd_idx=0)
void write(Addr db_addr, uint64_t doorbell_reg)
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
void makeAtomicResponse()
T getLE() const
Get the data in the packet byte swapped from little endian to host endian.
void setDevRequestor(RequestorID mid)
The GPUCommandProcessor (CP) is responsible for accepting commands, in the form of HSA AQL packets,...
AddrRange RangeSize(Addr start, Addr size)
bool scheduled() const
Determine if the current event is scheduled.
void schedule(Event &event, Tick when)
#define panic(...)
This implements a cprintf based panic() function.
#define fatal(...)
This implements a cprintf based fatal() function.
hsa_packet_type_t
Packet type.
@ HSA_PACKET_TYPE_BARRIER_AND
Packet used by agents to delay processing of subsequent packets, and to express complex dependencies ...
@ HSA_PACKET_TYPE_BARRIER_OR
Packet used by agents to delay processing of subsequent packets, and to express complex dependencies ...
@ HSA_PACKET_TYPE_VENDOR_SPECIFIC
Vendor-specific packet.
@ HSA_PACKET_TYPE_INVALID
The packet has been processed in the past, but has not been reassigned to the packet processor.
@ HSA_PACKET_TYPE_KERNEL_DISPATCH
Packet used by agents for dispatching jobs to kernel agents.
@ HSA_PACKET_TYPE_AGENT_DISPATCH
Packet used by agents for dispatching jobs to agents.
int32_t hsa_signal_value_t
Signal value.
#define HSAPP_EVENT_DESCRIPTION_GENERATOR(XEVENT)
#define NumSignalsPerBarrier
Copyright (c) 2024 - Pranith Kumar Copyright (c) 2020 Inria All rights reserved.
Tick curTick()
The universal simulation clock.
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
bool FullSystem
The FullSystem variable can be used to determine the current mode of simulation.
uint64_t Tick
Tick count type.
std::unique_ptr< TranslationGen > TranslationGenPtr
Declarations of a non-full system Page Table.
Calls getCurrentEntry once the queueEntry has been dmaRead.
AQL kernel dispatch packet.
const std::string & name()