gem5 [DEVELOP-FOR-25.0]
Loading...
Searching...
No Matches
hsa_packet_processor.cc
Go to the documentation of this file.
1/*
2 * Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
33
34#include <cassert>
35#include <cstring>
36
38#include "base/compiler.hh"
39#include "base/logging.hh"
40#include "base/trace.hh"
41#include "debug/HSAPacketProcessor.hh"
43#include "dev/dma_device.hh"
44#include "dev/hsa/hsa_packet.hh"
46#include "enums/GfxVersion.hh"
48#include "mem/packet_access.hh"
49#include "mem/page_table.hh"
50#include "sim/full_system.hh"
51#include "sim/process.hh"
52#include "sim/proxy_ptr.hh"
53#include "sim/system.hh"
54
// Generates the out-of-line definition of
// HSAPacketProcessor::XEVENT::description(), which returns the event class
// name (the stringified macro argument).
55#define HSAPP_EVENT_DESCRIPTION_GENERATOR(XEVENT) \
56 const char* \
57 HSAPacketProcessor::XEVENT::description() const \
58 { \
59 return #XEVENT; \
60 }
61
// Extracts the packet-type field from PKT->header: shift the field down to
// the LSB, mask it to its width and cast the result to hsa_packet_type_t.
// NOTE(review): PKT is not parenthesized in the expansion, so only a plain
// pointer expression should be passed.
62#define PKT_TYPE(PKT) ((hsa_packet_type_t)(((PKT->header) >> \
63 HSA_PACKET_HEADER_TYPE) & mask(HSA_PACKET_HEADER_WIDTH_TYPE)))
64
65// checks if the barrier bit is set in the header -- shift the barrier bit
66// to LSB, then bitwise "and" to mask off all other bits
// The result is used as a truth value by callers in this file.
// NOTE(review): PKT is not parenthesized here either.
67#define IS_BARRIER(PKT) ((hsa_packet_header_t)(((PKT->header) >> \
68 HSA_PACKET_HEADER_BARRIER) & \
69 mask(HSA_PACKET_HEADER_WIDTH_BARRIER)))
70
71namespace gem5
72{
73
// Defines HSAPacketProcessor::QueueProcessEvent::description(), which
// returns the string "QueueProcessEvent".
74HSAPP_EVENT_DESCRIPTION_GENERATOR(QueueProcessEvent)
75
// Set-up body (presumably the HSAPacketProcessor constructor; `p` is the
// parameter object): creates the hardware scheduler and one registered-queue
// entry per hardware queue.
80{
81 DPRINTF(HSAPacketProcessor, "%s:\n", __FUNCTION__);
// The scheduler receives a back-pointer to this object and p.wakeupDelay.
82 hwSchdlr = new HWScheduler(this, p.wakeupDelay);
// Entry i of regdQList is an RQLEntry constructed with index i.
83 regdQList.resize(numHWQueues);
84 for (int i = 0; i < numHWQueues; i++) {
85 regdQList[i] = new RQLEntry(this, i);
86 }
87}
88
// Tear-down body (presumably the destructor): deletes every RQLEntry held in
// regdQList.
// NOTE(review): hwSchdlr is allocated with `new` in this file but is not
// deleted here -- confirm it is released elsewhere.
90{
91 for (auto &queue : regdQList) {
92 delete queue;
93 }
94}
95
// Configures the page-table walker with the GPU device's VRAM requestor ID.
// The walker must already be set (asserted).
96void
98{
100
101 assert(walker);
102 walker->setDevRequestor(gpuDevice->vramRequestorId());
103}
104
105void
106HSAPacketProcessor::unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize)
107{
108 hwSchdlr->unregisterQueue(queue_id, doorbellSize);
109}
110
111void
112HSAPacketProcessor::setDeviceQueueDesc(uint64_t hostReadIndexPointer,
113 uint64_t basePointer,
114 uint64_t queue_id,
115 uint32_t size, int doorbellSize,
116 GfxVersion gfxVersion,
117 Addr offset, uint64_t rd_idx)
118{
120 "%s:base = %p, qID = %d, ze = %d\n", __FUNCTION__,
121 (void *)basePointer, queue_id, size);
122 hwSchdlr->registerNewQueue(hostReadIndexPointer,
123 basePointer, queue_id, size, doorbellSize,
124 gfxVersion, offset, rd_idx);
125}
126
// Returns a list containing the single PIO address range
// [pioAddr, pioAddr + pioSize). pioSize must be non-zero (asserted).
129{
130 assert(pioSize != 0);
131
132 AddrRangeList ranges;
133 ranges.push_back(RangeSize(pioAddr, pioSize));
134
135 return ranges;
136}
137
138// Basically only processes writes to the queue doorbell register.
// Decodes the doorbell value carried by pkt, forwards it together with its
// offset in the PIO window to the hardware scheduler, turns pkt into an
// atomic response and returns pioDelay.
139Tick
141{
// The access must fall inside this device's PIO window.
142 assert(pkt->getAddr() >= pioAddr && pkt->getAddr() < pioAddr + pioSize);
143
144 // TODO: How to get pid??
// Byte offset of the access from the start of the PIO window.
145 [[maybe_unused]] Addr daddr = pkt->getAddr() - pioAddr;
146
148 "%s: write of size %d to reg-offset %d (0x%x)\n",
149 __FUNCTION__, pkt->getSize(), daddr, daddr);
150
// The write size must equal the doorbell size reported by the driver.
151 assert(gpu_device->driver()->doorbellSize() == pkt->getSize());
152
// Only 8-byte and 4-byte doorbells are accepted; any other size is fatal.
// NOTE(review): the 8-byte value is incremented by one while the 4-byte
// value is used as-is -- confirm this asymmetry is intended.
153 uint64_t doorbell_reg(0);
154 if (pkt->getSize() == 8)
155 doorbell_reg = pkt->getLE<uint64_t>() + 1;
156 else if (pkt->getSize() == 4)
157 doorbell_reg = pkt->getLE<uint32_t>();
158 else
159 fatal("invalid db size");
160
162 "%s: write data 0x%x to offset %d (0x%x)\n",
163 __FUNCTION__, doorbell_reg, daddr, daddr);
164 hwSchdlr->write(daddr, doorbell_reg);
165 pkt->makeAtomicResponse();
166 return pioDelay;
167}
168
// Reads are not serviced: the packet is turned into an atomic response
// flagged as a bad address, and pioDelay is returned.
169Tick
171{
172 pkt->makeAtomicResponse();
173 pkt->setBadAddress();
174 return pioDelay;
175}
176
// Produces a translation generator for the virtual range [vaddr, vaddr+size).
// Outside full-system mode the page table of the process on thread context 0
// is used; in full-system mode a TranslationGenPtr is built with vmid 1.
179{
180 if (!FullSystem) {
181 // Grab the process and try to translate the virtual address with it;
182 // with new extensions, it will likely be wrong to just arbitrarily
183 // grab context zero.
184 auto process = sys->threads[0]->getProcessPtr();
185
186 return process->pTable->translateRange(vaddr, size);
187 }
188
189 // In full system use the page tables setup by the kernel driver rather
190 // than the CPU page tables.
191 return TranslationGenPtr(
193 1 /* vmid */, vaddr, size));
194}
195
201void
206
// Writes the ring buffer's current read index for queue entry rl_idx back to
// the host at qDesc->hostReadIndexPtr using a DMA write, then fetches more
// commands from the host if the descriptor's write index differs from the
// ring buffer's write index.
//   pid    - passed through to getCommandsFromHost()
//   rl_idx - index into regdQList
207void
208HSAPacketProcessor::updateReadIndex(int pid, uint32_t rl_idx)
209{
210 AQLRingBuffer* aqlbuf = regdQList[rl_idx]->qCntxt.aqlBuf;
211 HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
// The DMA callback ignores the transferred data and just calls
// updateReadDispIdDma().
// NOTE(review): the lambda takes const uint32_t& while the callback is
// instantiated for uint64_t -- confirm the mismatch is harmless.
212 auto cb = new DmaVirtCallback<uint64_t>(
213 [ = ] (const uint32_t &dma_data) { this->updateReadDispIdDma(); }, 0);
214
216 "%s: read-pointer offset [0x%x]\n", __FUNCTION__, aqlbuf->rdIdx());
217
// The DMA source is the ring buffer's own read-index storage.
218 dmaWriteVirt((Addr)qDesc->hostReadIndexPtr, sizeof(aqlbuf->rdIdx()),
219 cb, aqlbuf->rdIdxPtr());
220
222 "%s: rd-ptr offset [0x%x], wr-ptr offset [0x%x], space used = %d," \
223 " q size = %d, is_empty = %s, active list ID = %d\n", __FUNCTION__,
224 qDesc->readIndex, qDesc->writeIndex, qDesc->spaceUsed(),
225 qDesc->numElts, qDesc->isEmpty()? "true" : "false", rl_idx);
226 if (qDesc->writeIndex != aqlbuf->wrIdx()) {
227 getCommandsFromHost(pid, rl_idx);
228 }
229}
230
// Completion handler for one DMA chunk of a packet-fetch series.
// For reads, num_pkts is subtracted from series_ctx->pkts_2_go; when the
// count reaches zero the queue's dmaInProgress flag is cleared, AQL
// processing is scheduled for the queue, and series_ctx is deleted.
// Nothing is done for non-read completions beyond the trace message.
231void
233 bool isRead, uint32_t ix_start, unsigned num_pkts,
234 dma_series_ctx *series_ctx, void *dest_4debug)
235{
// The registered-queue index travels with the series context.
236 uint32_t rl_idx = series_ctx->rl_idx;
237 [[maybe_unused]] AQLRingBuffer *aqlRingBuffer =
238 hsaPP->regdQList[rl_idx]->qCntxt.aqlBuf;
239 HSAQueueDescriptor* qDesc =
240 hsaPP->regdQList[rl_idx]->qCntxt.qDesc;
241 DPRINTF(HSAPacketProcessor, ">%s, ix = %d, npkts = %d," \
242 " pktsRemaining = %d, active list ID = %d\n", __FUNCTION__,
243 ix_start, num_pkts, series_ctx->pkts_2_go,
244 rl_idx);
245 if (isRead) {
246 series_ctx->pkts_2_go -= num_pkts;
247 if (series_ctx->pkts_2_go == 0) {
248 // Mark DMA as completed
249 qDesc->dmaInProgress = false;
251 "%s: schedule Qwakeup next cycle, rdIdx %d, wrIdx %d," \
252 " dispIdx %d, active list ID = %d\n",
253 __FUNCTION__, aqlRingBuffer->rdIdx(),
254 aqlRingBuffer->wrIdx(), aqlRingBuffer->dispIdx(), rl_idx);
255 // schedule queue wakeup
256 hsaPP->schedAQLProcessing(rl_idx);
// The series context is no longer needed once its last chunk has landed.
257 delete series_ctx;
258 }
259 }
260}
261
// Schedules the AQL process event of queue entry rl_idx at
// curTick() + delay, unless that event is already scheduled, in which case
// only a trace message is emitted.
262void
264{
265 RQLEntry *queue = regdQList[rl_idx];
266 if (!queue->aqlProcessEvent.scheduled()) {
267 Tick processingTick = curTick() + delay;
268 schedule(queue->aqlProcessEvent, processingTick);
269 DPRINTF(HSAPacketProcessor, "AQL processing scheduled at tick: %d\n",
270 processingTick);
271 } else {
272 DPRINTF(HSAPacketProcessor, "AQL processing already scheduled\n");
273 }
274}
275
276void
281
// Examines the AQL packet at `pkt` for registered-queue entry rl_idx and
// either submits it to the GPU command processor, resolves a barrier-AND
// packet, or reports that the queue is blocked.
//   pkt           - packet in the local ring buffer; its header is at
//                   offset zero for every packet type
//   rl_idx        - index into regdQList
//   host_pkt_addr - passed through to the submit*Pkt() calls
// Returns UNBLOCKED when the packet was consumed, BLOCKED_BPKT while a
// barrier packet's dependency signals are outstanding, and BLOCKED_BBIT when
// the packet's barrier bit must wait for earlier packets to complete.
// BARRIER_OR, INVALID and unknown packet types end the simulation via
// fatal().
283HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
284{
285 Q_STATE is_submitted = BLOCKED_BPKT;
286 SignalState *dep_sgnl_rd_st = &(regdQList[rl_idx]->depSignalRdState);
287 // Dependency signals are not read yet. And this can only be a retry.
288 // The retry logic will schedule the packet processor wakeup
289 if (dep_sgnl_rd_st->pendingReads != 0) {
290 return BLOCKED_BPKT;
291 }
292 // `pkt` can be typecasted to any type of AQL packet since they all
293 // have header information at offset zero
294 auto disp_pkt = (_hsa_dispatch_packet_t *)pkt;
295 hsa_packet_type_t pkt_type = PKT_TYPE(disp_pkt);
296 if (IS_BARRIER(disp_pkt) &&
297 regdQList[rl_idx]->compltnPending() > 0) {
298 // If this packet is using the "barrier bit" to enforce ordering with
299 // previous packets, and if there are outstanding packets, set the
300 // barrier bit for this queue and block the queue.
301 DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for active" \
302 " list ID = %d\n", __FUNCTION__, rl_idx);
303 regdQList[rl_idx]->setBarrierBit(true);
304 return BLOCKED_BBIT;
305 }
306 if (pkt_type == HSA_PACKET_TYPE_VENDOR_SPECIFIC) {
307 DPRINTF(HSAPacketProcessor, "%s: submitting vendor specific pkt" \
308 " active list ID = %d\n", __FUNCTION__, rl_idx);
309 // Submit packet to HSA device (dispatcher)
310 gpu_device->submitVendorPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
311 is_submitted = UNBLOCKED;
312 } else if (pkt_type == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
313 DPRINTF(HSAPacketProcessor, "%s: submitting kernel dispatch pkt" \
314 " active list ID = %d\n", __FUNCTION__, rl_idx);
315 // Submit packet to HSA device (dispatcher)
316 gpu_device->submitDispatchPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
317 is_submitted = UNBLOCKED;
318 /*
319 If this packet is using the "barrier bit" to enforce ordering with
320 subsequent kernels, set the bit for this queue now, after
321 dispatching.
322 */
323 if (IS_BARRIER(disp_pkt)) {
324 DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for active" \
325 " list ID = %d\n", __FUNCTION__, rl_idx);
326 regdQList[rl_idx]->setBarrierBit(true);
327 }
328 } else if (pkt_type == HSA_PACKET_TYPE_BARRIER_AND) {
329 DPRINTF(HSAPacketProcessor, "%s: Processing barrier packet" \
330 " active list ID = %d\n", __FUNCTION__, rl_idx);
331 auto bar_and_pkt = (_hsa_barrier_and_packet_t *)pkt;
332 bool isReady = true;
333 // Loop through all the completion signals to see if this barrier
334 // packet is ready.
335 for (int i = 0; i < NumSignalsPerBarrier; i++) {
336 // dep_signal = zero imply no signal connected
337 if (bar_and_pkt->dep_signal[i]) {
338 // The signal value is aligned 8 bytes from
339 // the actual handle in the runtime
340 uint64_t signal_addr =
341 (uint64_t) (((uint64_t *) bar_and_pkt->dep_signal[i]) + 1);
// Local copy of the signal value; it is the destination of the DMA reads
// issued below.
342 hsa_signal_value_t *signal_val =
343 &(dep_sgnl_rd_st->values[i]);
344 DPRINTF(HSAPacketProcessor, "%s: Barrier pkt dep sgnl[%d]" \
345 " , sig addr %x, value %d active list ID = %d\n",
346 __FUNCTION__, i, signal_addr,
347 *signal_val, rl_idx);
348 // The if condition will be executed every time except the
349 // very first time this barrier packet is encountered.
350 if (dep_sgnl_rd_st->allRead) {
351 if (*signal_val != 0) {
352 // This signal is not yet ready, read it again
353 isReady = false;
354
// The callback only records that one outstanding read has finished.
355 auto cb = new DmaVirtCallback<int64_t>(
356 [ = ] (const uint32_t &dma_data)
357 { dep_sgnl_rd_st->handleReadDMA(); }, 0);
358 dmaReadVirt(signal_addr, sizeof(hsa_signal_value_t),
359 cb, signal_val);
360 dep_sgnl_rd_st->pendingReads++;
361 DPRINTF(HSAPacketProcessor, "%s: Pending reads %d," \
362 " active list %d\n", __FUNCTION__,
363 dep_sgnl_rd_st->pendingReads, rl_idx);
364 }
365 } else {
// The signal values have not all been read yet, so issue a read for every
// connected signal regardless of the locally stored value.
366 // This signal is not yet ready, read it again
367 isReady = false;
368 auto cb = new DmaVirtCallback<int64_t>(
369 [ = ] (const uint32_t &dma_data)
370 { dep_sgnl_rd_st->handleReadDMA(); }, 0);
371 dmaReadVirt(signal_addr, sizeof(hsa_signal_value_t),
372 cb, signal_val);
373 dep_sgnl_rd_st->pendingReads++;
374 DPRINTF(HSAPacketProcessor, "%s: Pending reads %d," \
375 " active list %d\n", __FUNCTION__,
376 dep_sgnl_rd_st->pendingReads, rl_idx);
377 }
378 }
379 }
380 if (isReady) {
// No read was issued in this pass, so none may be outstanding.
381 assert(dep_sgnl_rd_st->pendingReads == 0);
382 DPRINTF(HSAPacketProcessor, "%s: Barrier packet completed" \
383 " active list ID = %d\n", __FUNCTION__, rl_idx);
384 // TODO: Completion signal of barrier packet to be
385 // atomically decremented here
386 finishPkt((void*)bar_and_pkt, rl_idx);
387 is_submitted = UNBLOCKED;
388 // Reset signal values
389 dep_sgnl_rd_st->resetSigVals();
390 // The completion signal is connected
391 if (bar_and_pkt->completion_signal != 0) {
392 // The semantics of the HSA signal is to decrement the current
393 // signal value by one. Do this asynchronously via DMAs and
394 // callbacks as we can safely continue with this function
395 // while waiting for the next packet from the host.
396 DPRINTF(HSAPacketProcessor, "Triggering barrier packet" \
397 " completion signal! Addr: %x\n",
398 bar_and_pkt->completion_signal);
399
400 gpu_device->sendCompletionSignal(
401 bar_and_pkt->completion_signal);
402 }
403 }
404 if (dep_sgnl_rd_st->pendingReads > 0) {
405 // At least one DepSignalsReadDmaEvent is scheduled this cycle
406 dep_sgnl_rd_st->allRead = false;
407 dep_sgnl_rd_st->discardRead = false;
408 }
409 } else if (pkt_type == HSA_PACKET_TYPE_BARRIER_OR) {
410 fatal("Unsupported packet type HSA_PACKET_TYPE_BARRIER_OR");
411 } else if (pkt_type == HSA_PACKET_TYPE_INVALID) {
412 fatal("Unsupported packet type HSA_PACKET_TYPE_INVALID");
413 } else if (pkt_type == HSA_PACKET_TYPE_AGENT_DISPATCH) {
414 DPRINTF(HSAPacketProcessor, "%s: submitting agent dispatch pkt" \
415 " active list ID = %d\n", __FUNCTION__, rl_idx);
416 // Submit packet to HSA device (dispatcher)
417 gpu_device->submitAgentDispatchPkt(
418 (void *)disp_pkt, rl_idx, host_pkt_addr);
419 is_submitted = UNBLOCKED;
// The completion signal for agent dispatch packets is sent immediately
// after submission.
420 sendAgentDispatchCompletionSignal((void *)disp_pkt,0);
421 } else {
422 fatal("Unsupported packet type %d\n", pkt_type);
423 }
424 return is_submitted;
425}
426
427// Wakes up every fixed time interval (pktProcessDelay) and processes a single
428// packet from the queue that scheduled this wakeup. If there are more
429// packets in that queue, the next wakeup is scheduled.
// The queue handled is regdQList[rqIdx] of the owning packet processor
// (hsaPP).
430void
432{
433 AQLRingBuffer *aqlRingBuffer = hsaPP->regdQList[rqIdx]->qCntxt.aqlBuf;
435 "%s: Qwakeup , rdIdx %d, wrIdx %d," \
436 " dispIdx %d, active list ID = %d\n",
437 __FUNCTION__, aqlRingBuffer->rdIdx(),
438 aqlRingBuffer->wrIdx(), aqlRingBuffer->dispIdx(), rqIdx);
439 // If barrier bit is set, then this wakeup is a dummy wakeup
440 // just to model the processing time. Do nothing.
441 if (hsaPP->regdQList[rqIdx]->getBarrierBit()) {
443 "Dummy wakeup with barrier bit for rdIdx %d\n", rqIdx);
444 return;
445 }
446 // In the future, we may support batch processing of packets.
447 // Then, we can just remove the break statements and the code
448 // will support batch processing. That is why we are using a
449 // "while loop" here instead on an "if" condition.
450 while (hsaPP->regdQList[rqIdx]->dispPending()) {
// The packet to dispatch is the ring-buffer slot at the dispatch index; its
// saved host address is passed along to processPkt().
451 void *pkt = aqlRingBuffer->ptr(aqlRingBuffer->dispIdx());
452 DPRINTF(HSAPacketProcessor, "%s: Attempting dispatch @ dispIdx[%d]\n",
453 __FUNCTION__, aqlRingBuffer->dispIdx());
454 Addr host_addr = aqlRingBuffer->hostDispAddr();
455 Q_STATE q_state = hsaPP->processPkt(pkt, rqIdx, host_addr);
456 if (q_state == UNBLOCKED) {
// Packet consumed: advance the dispatch index and, if more packets are
// pending, schedule another wakeup.
457 aqlRingBuffer->incDispIdx(1);
458 DPRINTF(HSAPacketProcessor, "%s: Increment dispIdx[%d]\n",
459 __FUNCTION__, aqlRingBuffer->dispIdx());
460 if (hsaPP->regdQList[rqIdx]->dispPending()) {
461 hsaPP->schedAQLProcessing(rqIdx);
462 }
463 break;
464 } else if (q_state == BLOCKED_BPKT) {
465 // This queue is blocked by barrier packet,
466 // schedule a processing event
467 hsaPP->schedAQLProcessing(rqIdx);
468 break;
469 } else if (q_state == BLOCKED_BBIT) {
470 // This queue is blocked by barrier bit, and processing event
471 // should be scheduled from finishPkt(). However, to elapse
472 // "pktProcessDelay" processing time, let us schedule a dummy
473 // wakeup once which will just wakeup and will do nothing.
474 hsaPP->schedAQLProcessing(rqIdx);
475 break;
476 } else {
477 panic("Unknown queue state\n");
478 }
479 }
480}
481
// Accounts for one completed dependency-signal read. There must be at least
// one read pending (asserted). When the last pending read completes, allRead
// is set and, if discardRead is set, the stored signal values are reset.
482void
484{
485 assert(pendingReads > 0);
486 pendingReads--;
487 if (pendingReads == 0) {
488 allRead = true;
489 if (discardRead) {
490 resetSigVals();
491 }
492 }
493}
494
// Issues DMA reads that copy pending packets from the host queue described
// by qDesc into the local AQL ring buffer of queue entry rl_idx.
// Returns without issuing anything when a DMA series is already in progress,
// when the host queue has no used space, or when no ring-buffer entries
// could be allocated (stalledOnDmaBufAvailability is then set so the fetch
// can be retried later).
495void
497{
498 HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
499 AQLRingBuffer *aqlRingBuffer = regdQList[rl_idx]->qCntxt.aqlBuf;
500
502 "%s: read-pointer offset[0x%x], write-pointer offset[0x%x]"
503 " doorbell(%d)[0x%x] \n",
504 __FUNCTION__, qDesc->readIndex,
505 qDesc->writeIndex, pid, qDesc->doorbellPointer);
506
507 if (qDesc->dmaInProgress) {
508 // we'll try again when this dma transfer completes in updateReadIndex
509 return;
510 }
// Number of packets waiting in the host queue.
511 uint32_t num_umq = qDesc->spaceUsed();
512 if (num_umq == 0)
513 return; // nothing to be gotten
514 uint32_t umq_nxt = qDesc->readIndex;
515 // Total AQL buffer size
516 uint32_t ttl_aql_buf = aqlRingBuffer->numObjs();
517 // Available AQL buffer size. If the available buffer is less than
518 // demanded, number of available buffer is returned
519 uint32_t got_aql_buf = aqlRingBuffer->allocEntry(num_umq);
520 qDesc->readIndex += got_aql_buf;
// allocEntry() has already advanced the write index, so the first newly
// allocated slot is got_aql_buf entries behind it (modulo the buffer size).
521 uint32_t dma_start_ix = (aqlRingBuffer->wrIdx() - got_aql_buf) %
522 ttl_aql_buf;
523 dma_series_ctx *series_ctx = NULL;
524
525 DPRINTF(HSAPacketProcessor, "%s: umq_nxt = %d, ttl_aql_buf = %d, "
526 "dma_start_ix = %d, num_umq = %d\n", __FUNCTION__, umq_nxt,
527 ttl_aql_buf, dma_start_ix, num_umq);
528
529 if (got_aql_buf == 0) {
530 // we'll try again when some dma bufs are freed in freeEntry
531 qDesc->stalledOnDmaBufAvailability = true;
532 return;
533 } else {
534 qDesc->stalledOnDmaBufAvailability = false;
535 }
536
// Slots left before the local ring buffer wraps.
// NOTE(review): this is computed once and not refreshed after dma_start_ix
// advances inside the loop -- confirm that is intended.
537 uint32_t dma_b4_wrap = ttl_aql_buf - dma_start_ix;
538 while (got_aql_buf != 0 && num_umq != 0) {
// NOTE(review): the modulus here is objSize() whereas umq_nxt is wrapped by
// numObjs() at the bottom of the loop -- confirm which is intended.
539 uint32_t umq_b4_wrap = qDesc->numObjs() -
540 (umq_nxt % qDesc->objSize());
// Transfer as many packets as every one of the four limits allows.
541 uint32_t num_2_xfer
542 = std::min({umq_b4_wrap, dma_b4_wrap, num_umq, got_aql_buf});
// The series context is created once, on the first chunk, and shared by all
// chunks of this fetch.
543 if (!series_ctx) {
544 qDesc->dmaInProgress = true;
545 series_ctx = new dma_series_ctx(got_aql_buf, got_aql_buf,
546 dma_start_ix, rl_idx);
547 }
548
549 void *aql_buf = aqlRingBuffer->ptr(dma_start_ix);
550 auto cb = new DmaVirtCallback<uint64_t>(
551 [ = ] (const uint32_t &dma_data)
552 { this->cmdQueueCmdDma(this, pid, true, dma_start_ix,
553 num_2_xfer, series_ctx, aql_buf); }, 0);
554 dmaReadVirt(qDesc->ptr(umq_nxt), num_2_xfer * qDesc->objSize(),
555 cb, aql_buf);
556
// Remember the host address of each fetched packet alongside its slot.
557 aqlRingBuffer->saveHostDispAddr(qDesc->ptr(umq_nxt), num_2_xfer,
558 dma_start_ix);
559
561 "%s: aql_buf = %p, umq_nxt = %d, dma_ix = %d, num2xfer = %d\n",
562 __FUNCTION__, aql_buf, umq_nxt, dma_start_ix, num_2_xfer);
563
564 num_umq -= num_2_xfer;
565 got_aql_buf -= num_2_xfer;
566 dma_start_ix = (dma_start_ix + num_2_xfer) % ttl_aql_buf;
567 umq_nxt = (umq_nxt + num_2_xfer) % qDesc->numObjs();
568 if (got_aql_buf == 0 && num_umq != 0) {
569 // There are more packets in the queue but
570 // not enough DMA buffers. Set the stalledOnDmaBufAvailability,
571 // we will try again in freeEntry
572 qDesc->stalledOnDmaBufAvailability = true;
573 }
574 }
575}
576
// Debug helper: traces the fields of the queue descriptor belonging to queue
// entry rl_idx (base pointer, doorbell pointer, write/read index, numElts)
// together with pid. It has no other effect.
577void
579{
580 [[maybe_unused]] HSAQueueDescriptor* qDesc =
581 regdQList[rl_idx]->qCntxt.qDesc;
583 "%s: pid[%d], basePointer[0x%lx], dBPointer[0x%lx], "
584 "writeIndex[0x%x], readIndex[0x%x], size(bytes)[0x%x]\n",
585 __FUNCTION__, pid, qDesc->basePointer,
586 qDesc->doorbellPointer, qDesc->writeIndex,
587 qDesc->readIndex, qDesc->numElts);
588}
589
// Builds a ring buffer with `size` slots: the packet storage, the
// per-slot completion flags and the per-slot host addresses are all resized
// to `size`, and the write, read and dispatch indices start at zero.
591 const std::string name)
592 : _name(name), _wrIdx(0), _rdIdx(0), _dispIdx(0)
593{
594 _aqlBuf.resize(size);
595 _aqlComplete.resize(size);
596 _hostDispAddresses.resize(size);
597 // Mark all packets as invalid and incomplete
598 for (auto& it : _aqlBuf)
599 it.header = HSA_PACKET_TYPE_INVALID;
600 std::fill(_aqlComplete.begin(), _aqlComplete.end(), false);
601}
602
// Overwrites the ring buffer's read index with `value`.
603void
605{
606 _rdIdx = value;
607}
608
// Overwrites the ring buffer's write index with `value`.
609void
611{
612 _wrIdx = value;
613}
614
// Overwrites the ring buffer's dispatch index with `value`.
615void
617{
618 _dispIdx = value;
619}
620
// Marks the slot that holds `pkt` as complete, then advances the read index
// past every contiguous completed slot, clearing each flag as it goes.
// `pkt` must point into _aqlBuf: the slot index is its element distance from
// _aqlBuf.data().
// Returns true if the read index moved.
621bool
623{
624 _aqlComplete[(hsa_kernel_dispatch_packet_t *) pkt - _aqlBuf.data()] = true;
625 DPRINTF(HSAPacketProcessor, "%s: pkt_ix = %d; "\
626 " # free entries = %d, wrIdx = %d, rdIdx = %d\n", __FUNCTION__,
627 (hsa_kernel_dispatch_packet_t *) pkt - _aqlBuf.data(),
628 nFree(), wrIdx(), rdIdx());
629 // Packets can complete out-of-order. This code "retires" packets in-order
630 // by updating the read pointer in the MQD when a contiguous chunk of
631 // packets have finished.
632 uint32_t old_rdIdx = rdIdx();
// The read index is reduced modulo the slot count to find the slot.
633 while (_aqlComplete[rdIdx() % numObjs()]) {
634 _aqlComplete[rdIdx() % numObjs()] = false;
636 incRdIdx(1);
637 }
638 return (old_rdIdx != rdIdx());
639}
640
641void
646
// Reserves up to nBufReq ring-buffer entries by advancing the write index.
// The request is clamped to the number of free entries.
// Returns the number of entries actually reserved, which is 0 when nothing
// is free.
647int
649{
650 DPRINTF(HSAPacketProcessor, "%s: nReq = %d\n", __FUNCTION__, nBufReq);
651 if (nFree() == 0) {
652 DPRINTF(HSAPacketProcessor, "%s: return = %d\n", __FUNCTION__, 0);
653 return 0;
654 }
655
656 if (nBufReq > nFree())
657 nBufReq = nFree();
658
659 DPRINTF(HSAPacketProcessor, "%s: ix1stFree = %d\n", __FUNCTION__, wrIdx());
660 incWrIdx(nBufReq);
661 DPRINTF(HSAPacketProcessor, "%s: return = %d, wrIdx = %d\n",
662 __FUNCTION__, nBufReq, wrIdx());
663 return nBufReq;
664}
665
// Retires the packet `pvPkt` of queue entry rl_idx:
//  - clears the queue's barrier bit if it is set and this is the last
//    outstanding packet, rescheduling AQL processing if packets are pending;
//  - frees the packet's ring-buffer slot and, if the read index advanced,
//    writes it back to the host via updateReadIndex();
//  - retries the host fetch if the queue was stalled waiting for buffers.
666void
667HSAPacketProcessor::finishPkt(void *pvPkt, uint32_t rl_idx)
668{
669 HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
670
671 // if barrier bit was set and this is the last
672 // outstanding packet from that queue,
673 // unset it here
674 if (regdQList[rl_idx]->getBarrierBit() &&
675 regdQList[rl_idx]->isLastOutstandingPkt()) {
677 "Unset barrier bit for active list ID %d\n", rl_idx);
678 regdQList[rl_idx]->setBarrierBit(false);
679 // if pending kernels in the queue after this kernel, reschedule
680 if (regdQList[rl_idx]->dispPending()) {
682 "Rescheduling active list ID %d after unsetting barrier "
683 "bit\n", rl_idx);
684 schedAQLProcessing(rl_idx);
685 }
686 }
687
688 // If set, then blocked schedule, so need to reschedule
// freeEntry() returns true when the ring buffer's read index advanced.
689 if (regdQList[rl_idx]->qCntxt.aqlBuf->freeEntry(pvPkt))
690 updateReadIndex(0, rl_idx);
692 "%s: rd-ptr offset [0x%x], wr-ptr offset [0x%x], space used = %d," \
693 " q size = %d, stalled = %s, empty = %s, active list ID = %d\n",
694 __FUNCTION__, qDesc->readIndex, qDesc->writeIndex,
695 qDesc->spaceUsed(), qDesc->numElts,
696 qDesc->stalledOnDmaBufAvailability? "true" : "false",
697 qDesc->isEmpty()? "true" : "false", rl_idx);
698 // DMA buffer is freed, check the queue to see if there are DMA
699 // accesses blocked because of non-availability of DMA buffer
700 if (qDesc->stalledOnDmaBufAvailability) {
701 assert(!qDesc->isEmpty());
702 getCommandsFromHost(0, rl_idx); // TODO:assign correct pid
703 // when implementing
704 // multi-process support
705 }
706}
707
// Decrements the completion signal of an agent dispatch packet by one: the
// current value is read through a VPtr on thread context 0 and the
// decremented value is written back with a DMA write.
// NOTE(review): the `signal` parameter is not used in the visible body.
708void
710 void *pkt, hsa_signal_value_t signal)
711{
712 auto agent_pkt = (_hsa_agent_dispatch_packet_t *)pkt;
// The signal value lives 8 bytes (one uint64_t) past the signal handle.
713 uint64_t signal_addr =
714 (uint64_t) (((uint64_t *)agent_pkt->completion_signal) + 1);
715 DPRINTF(HSAPacketProcessor, "Triggering Agent Dispatch packet" \
716 " completion signal: %x!\n", signal_addr);
726 VPtr<uint64_t> prev_signal(signal_addr, sys->threads[0]);
727
728 DPRINTF(HSAPacketProcessor,"HSADriver: Sending signal to %lu\n",
729 (uint64_t)sys->threads[0]->cpuId());
730
731
// The new value is heap-allocated and serves as the source buffer of the
// DMA write.
// NOTE(review): no callback is supplied and nothing visible here frees
// new_signal -- confirm it is not leaked.
732 hsa_signal_value_t *new_signal = new hsa_signal_value_t;
733 *new_signal = (hsa_signal_value_t) *prev_signal - 1;
734
735 dmaWriteVirt(signal_addr, sizeof(hsa_signal_value_t), nullptr, new_signal, 0);
736}
737
// Decrements the completion signal identified by `signal` by one: the
// current value is read through a VPtr on thread context 0 and the
// decremented value is written back with a DMA write.
738void
740{
// The signal value lives 8 bytes (one uint64_t) past the signal handle.
741 uint64_t signal_addr = (uint64_t) (((uint64_t *)signal) + 1);
742 DPRINTF(HSAPacketProcessor, "Triggering completion signal: %x!\n",
743 signal_addr);
753 VPtr<uint64_t> prev_signal(signal_addr, sys->threads[0]);
754
// The new value is heap-allocated and serves as the source buffer of the
// DMA write.
// NOTE(review): no callback is supplied and nothing visible here frees
// new_signal -- confirm it is not leaked.
755 hsa_signal_value_t *new_signal = new hsa_signal_value_t;
756 *new_signal = (hsa_signal_value_t) *prev_signal - 1;
757
758 dmaWriteVirt(signal_addr, sizeof(hsa_signal_value_t), nullptr, new_signal, 0);
759}
760
761} // namespace gem5
#define DPRINTF(x,...)
Definition trace.hh:209
Declaration and inline definition of ChunkGenerator object.
Device model for an AMD GPU.
Internal ring buffer which is used to prefetch/store copies of the in-memory HSA ring buffer.
void setRdIdx(uint64_t value)
std::vector< bool > _aqlComplete
void * ptr(uint32_t ix)
int allocEntry(uint32_t nBufReq)
void incDispIdx(uint64_t value)
void setDispIdx(uint64_t value)
void saveHostDispAddr(Addr host_pkt_addr, int num_pkts, int ix)
the kernel may try to read from the dispatch packet, so we need to keep the host address that corresp...
void setWrIdx(uint64_t value)
AQLRingBuffer(uint32_t size, const std::string name)
std::vector< hsa_kernel_dispatch_packet_t > _aqlBuf
void incWrIdx(uint64_t value)
std::vector< Addr > _hostDispAddresses
void incRdIdx(uint64_t value)
Wraps a std::function object in a DmaCallback.
void dmaReadVirt(Addr host_addr, unsigned size, DmaCallback *cb, void *data, Tick delay=0)
Initiate a DMA read from virtual address host_addr.
DmaVirtDevice(const Params &p)
void dmaWriteVirt(Addr host_addr, unsigned size, DmaCallback *b, void *data, Tick delay=0)
Initiate a DMA write from virtual address host_addr.
std::vector< hsa_signal_value_t > values
void sendAgentDispatchCompletionSignal(void *pkt, hsa_signal_value_t signal)
std::vector< class RQLEntry * > regdQList
void updateReadIndex(int, uint32_t)
virtual Tick write(Packet *) override
void cmdQueueCmdDma(HSAPacketProcessor *hsaPP, int pid, bool isRead, uint32_t ix_start, unsigned num_pkts, dma_series_ctx *series_ctx, void *dest_4debug)
void sendCompletionSignal(hsa_signal_value_t signal)
GPUCommandProcessor * gpu_device
void updateReadDispIdDma()
this event is used to update the read_disp_id field (the read pointer) of the MQD,...
void setGPUDevice(AMDGPUDevice *gpu_device)
HSAPacketProcessorParams Params
void getCommandsFromHost(int pid, uint32_t rl_idx)
TranslationGenPtr translate(Addr vaddr, Addr size) override
Function used to translate a range of addresses from virtual to physical addresses.
void setDeviceQueueDesc(uint64_t hostReadIndexPointer, uint64_t basePointer, uint64_t queue_id, uint32_t size, int doorbellSize, GfxVersion gfxVersion, Addr offset=0, uint64_t rd_idx=0)
void displayQueueDescriptor(int pid, uint32_t rl_idx)
Q_STATE processPkt(void *pkt, uint32_t rl_idx, Addr host_pkt_addr)
void finishPkt(void *pkt, uint32_t rl_idx)
virtual AddrRangeList getAddrRanges() const override
Every PIO device is obliged to provide an implementation that returns the address ranges the device r...
void unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize)
void schedAQLProcessing(uint32_t rl_idx)
void setDevice(GPUCommandProcessor *dev)
virtual Tick read(Packet *) override
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
Definition packet.hh:295
void setBadAddress()
Definition packet.hh:786
Addr getAddr() const
Definition packet.hh:807
unsigned getSize() const
Definition packet.hh:817
void makeAtomicResponse()
Definition packet.hh:1074
T getLE() const
Get the data in the packet byte swapped from little endian to host endian.
#define PAGE_SIZE
Definition base.cc:60
The GPUCommandProcessor (CP) is responsible for accepting commands, in the form of HSA AQL packets,...
AddrRange RangeSize(Addr start, Addr size)
std::list< AddrRange > AddrRangeList
Convenience typedef for a collection of address ranges.
Definition addr_range.hh:64
bool scheduled() const
Determine if the current event is scheduled.
Definition eventq.hh:458
void schedule(Event &event, Tick when)
Definition eventq.hh:1012
#define panic(...)
This implements a cprintf based panic() function.
Definition logging.hh:220
#define fatal(...)
This implements a cprintf based fatal() function.
Definition logging.hh:232
hsa_packet_type_t
Packet type.
Definition hsa.h:2746
struct hsa_kernel_dispatch_packet_s hsa_kernel_dispatch_packet_t
AQL kernel dispatch packet.
@ HSA_PACKET_TYPE_BARRIER_AND
Packet used by agents to delay processing of subsequent packets, and to express complex dependencies ...
Definition hsa.h:2767
@ HSA_PACKET_TYPE_BARRIER_OR
Packet used by agents to delay processing of subsequent packets, and to express complex dependencies ...
Definition hsa.h:2778
@ HSA_PACKET_TYPE_VENDOR_SPECIFIC
Vendor-specific packet.
Definition hsa.h:2750
@ HSA_PACKET_TYPE_INVALID
The packet has been processed in the past, but has not been reassigned to the packet processor.
Definition hsa.h:2756
@ HSA_PACKET_TYPE_KERNEL_DISPATCH
Packet used by agents for dispatching jobs to kernel agents.
Definition hsa.h:2761
@ HSA_PACKET_TYPE_AGENT_DISPATCH
Packet used by agents for dispatching jobs to agents.
Definition hsa.h:2772
int32_t hsa_signal_value_t
Signal value.
Definition hsa.h:1322
#define PKT_TYPE(PKT)
#define HSAPP_EVENT_DESCRIPTION_GENERATOR(XEVENT)
#define IS_BARRIER(PKT)
#define NumSignalsPerBarrier
Bitfield< 7 > i
Definition misc_types.hh:67
Bitfield< 23, 0 > offset
Definition types.hh:144
Bitfield< 0 > p
Copyright (c) 2024 Arm Limited All rights reserved.
Definition binary32.hh:36
Tick curTick()
The universal simulation clock.
Definition cur_tick.hh:46
ProxyPtr< T, SETranslatingPortProxy > VPtr
Definition proxy_ptr.hh:400
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition types.hh:147
bool FullSystem
The FullSystem variable can be used to determine the current mode of simulation.
Definition root.cc:220
uint64_t Tick
Tick count type.
Definition types.hh:58
std::unique_ptr< TranslationGen > TranslationGenPtr
Declarations of a non-full system Page Table.
Calls getCurrentEntry once the queueEntry has been dmaRead.

Generated on Mon May 26 2025 09:19:09 for gem5 by doxygen 1.13.2