gem5 v24.0.0.0
Loading...
Searching...
No Matches
hsa_packet_processor.cc
Go to the documentation of this file.
1/*
2 * Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
33
34#include <cassert>
35#include <cstring>
36
38#include "base/compiler.hh"
39#include "base/logging.hh"
40#include "base/trace.hh"
41#include "debug/HSAPacketProcessor.hh"
43#include "dev/dma_device.hh"
44#include "dev/hsa/hsa_packet.hh"
46#include "enums/GfxVersion.hh"
48#include "mem/packet_access.hh"
49#include "mem/page_table.hh"
50#include "sim/full_system.hh"
51#include "sim/process.hh"
52#include "sim/proxy_ptr.hh"
53#include "sim/system.hh"
54
// Generates the description() method for an HSAPacketProcessor event
// subclass: it simply returns the event's class name as a C string
// (via the preprocessor stringize operator).
// NOTE(review): doxygen extraction fused the original line numbers onto
// each line below; the macro itself is otherwise intact.
55#define HSAPP_EVENT_DESCRIPTION_GENERATOR(XEVENT) \
56 const char* \
57 HSAPacketProcessor::XEVENT::description() const \
58 { \
59 return #XEVENT; \
60 }
61
// Extracts the AQL packet type field from a packet header: shift the
// type field down to the LSB, then mask it to its declared width.
62#define PKT_TYPE(PKT) ((hsa_packet_type_t)(((PKT->header) >> \
63 HSA_PACKET_HEADER_TYPE) & mask(HSA_PACKET_HEADER_WIDTH_TYPE)))
64
65// checks if the barrier bit is set in the header -- shift the barrier bit
66// to LSB, then bitwise "and" to mask off all other bits
67#define IS_BARRIER(PKT) ((hsa_packet_header_t)(((PKT->header) >> \
68 HSA_PACKET_HEADER_BARRIER) & \
69 mask(HSA_PACKET_HEADER_WIDTH_BARRIER)))
70
71namespace gem5
72{
73
74HSAPP_EVENT_DESCRIPTION_GENERATOR(QueueProcessEvent)
75
77 : DmaVirtDevice(p), walker(p.walker),
78 numHWQueues(p.numHWQueues), pioAddr(p.pioAddr),
79 pioSize(PAGE_SIZE), pioDelay(10), pktProcessDelay(p.pktProcessDelay)
80{
81 DPRINTF(HSAPacketProcessor, "%s:\n", __FUNCTION__);
82 hwSchdlr = new HWScheduler(this, p.wakeupDelay);
83 regdQList.resize(numHWQueues);
84 for (int i = 0; i < numHWQueues; i++) {
85 regdQList[i] = new RQLEntry(this, i);
86 }
87}
88
90{
91 for (auto &queue : regdQList) {
92 delete queue;
93 }
94}
95
96void
104
105void
106HSAPacketProcessor::unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize)
107{
108 hwSchdlr->unregisterQueue(queue_id, doorbellSize);
109}
110
111void
112HSAPacketProcessor::setDeviceQueueDesc(uint64_t hostReadIndexPointer,
113 uint64_t basePointer,
114 uint64_t queue_id,
115 uint32_t size, int doorbellSize,
116 GfxVersion gfxVersion,
117 Addr offset, uint64_t rd_idx)
118{
120 "%s:base = %p, qID = %d, ze = %d\n", __FUNCTION__,
121 (void *)basePointer, queue_id, size);
122 hwSchdlr->registerNewQueue(hostReadIndexPointer,
123 basePointer, queue_id, size, doorbellSize,
124 gfxVersion, offset, rd_idx);
125}
126
129{
130 assert(pioSize != 0);
131
132 AddrRangeList ranges;
133 ranges.push_back(RangeSize(pioAddr, pioSize));
134
135 return ranges;
136}
137
138// Basically only processes writes to the queue doorbell register.
139Tick
141{
142 assert(pkt->getAddr() >= pioAddr && pkt->getAddr() < pioAddr + pioSize);
143
144 // TODO: How to get pid??
145 [[maybe_unused]] Addr daddr = pkt->getAddr() - pioAddr;
146
148 "%s: write of size %d to reg-offset %d (0x%x)\n",
149 __FUNCTION__, pkt->getSize(), daddr, daddr);
150
151 assert(gpu_device->driver()->doorbellSize() == pkt->getSize());
152
153 uint64_t doorbell_reg(0);
154 if (pkt->getSize() == 8)
155 doorbell_reg = pkt->getLE<uint64_t>() + 1;
156 else if (pkt->getSize() == 4)
157 doorbell_reg = pkt->getLE<uint32_t>();
158 else
159 fatal("invalid db size");
160
162 "%s: write data 0x%x to offset %d (0x%x)\n",
163 __FUNCTION__, doorbell_reg, daddr, daddr);
164 hwSchdlr->write(daddr, doorbell_reg);
165 pkt->makeAtomicResponse();
166 return pioDelay;
167}
168
169Tick
171{
172 pkt->makeAtomicResponse();
173 pkt->setBadAddress();
174 return pioDelay;
175}
176
// NOTE(review): doxygen extraction artifact — the function signature
// (original lines 177-178) and part of the full-system return expression
// (original line 192) are missing. Per the class declaration this is:
//   TranslationGenPtr HSAPacketProcessor::translate(Addr vaddr, Addr size)
// Purpose: translate a virtual address range to physical addresses; in SE
// mode via the CPU process page table of context zero, in full-system mode
// via the kernel-driver-managed GPU page tables (presumably through the
// page-table walker member — confirm against upstream gem5 before use).
179{
180 if (!FullSystem) {
181 // Grab the process and try to translate the virtual address with it;
182 // with new extensions, it will likely be wrong to just arbitrarily
183 // grab context zero.
184 auto process = sys->threads[0]->getProcessPtr();
185
186 return process->pTable->translateRange(vaddr, size);
187 }
188
189 // In full system use the page tables setup by the kernel driver rather
190 // than the CPU page tables.
191 return TranslationGenPtr(
// NOTE(review): one line of the generator construction (original line 192)
// is missing here; only the trailing arguments survive below.
193 1 /* vmid */, vaddr, size));
194}
195
201void
206
207void
208HSAPacketProcessor::updateReadIndex(int pid, uint32_t rl_idx)
209{
210 AQLRingBuffer* aqlbuf = regdQList[rl_idx]->qCntxt.aqlBuf;
211 HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
212 auto cb = new DmaVirtCallback<uint64_t>(
213 [ = ] (const uint32_t &dma_data) { this->updateReadDispIdDma(); }, 0);
214
216 "%s: read-pointer offset [0x%x]\n", __FUNCTION__, aqlbuf->rdIdx());
217
218 dmaWriteVirt((Addr)qDesc->hostReadIndexPtr, sizeof(aqlbuf->rdIdx()),
219 cb, aqlbuf->rdIdxPtr());
220
222 "%s: rd-ptr offset [0x%x], wr-ptr offset [0x%x], space used = %d," \
223 " q size = %d, is_empty = %s, active list ID = %d\n", __FUNCTION__,
224 qDesc->readIndex, qDesc->writeIndex, qDesc->spaceUsed(),
225 qDesc->numElts, qDesc->isEmpty()? "true" : "false", rl_idx);
226 if (qDesc->writeIndex != aqlbuf->wrIdx()) {
227 getCommandsFromHost(pid, rl_idx);
228 }
229}
230
231void
233 bool isRead, uint32_t ix_start, unsigned num_pkts,
234 dma_series_ctx *series_ctx, void *dest_4debug)
235{
236 uint32_t rl_idx = series_ctx->rl_idx;
237 [[maybe_unused]] AQLRingBuffer *aqlRingBuffer =
238 hsaPP->regdQList[rl_idx]->qCntxt.aqlBuf;
239 HSAQueueDescriptor* qDesc =
240 hsaPP->regdQList[rl_idx]->qCntxt.qDesc;
241 DPRINTF(HSAPacketProcessor, ">%s, ix = %d, npkts = %d," \
242 " pktsRemaining = %d, active list ID = %d\n", __FUNCTION__,
243 ix_start, num_pkts, series_ctx->pkts_2_go,
244 rl_idx);
245 if (isRead) {
246 series_ctx->pkts_2_go -= num_pkts;
247 if (series_ctx->pkts_2_go == 0) {
248 // Mark DMA as completed
249 qDesc->dmaInProgress = false;
251 "%s: schedule Qwakeup next cycle, rdIdx %d, wrIdx %d," \
252 " dispIdx %d, active list ID = %d\n",
253 __FUNCTION__, aqlRingBuffer->rdIdx(),
254 aqlRingBuffer->wrIdx(), aqlRingBuffer->dispIdx(), rl_idx);
255 // schedule queue wakeup
256 hsaPP->schedAQLProcessing(rl_idx);
257 delete series_ctx;
258 }
259 }
260}
261
262void
264{
265 RQLEntry *queue = regdQList[rl_idx];
266 if (!queue->aqlProcessEvent.scheduled()) {
267 Tick processingTick = curTick() + delay;
268 schedule(queue->aqlProcessEvent, processingTick);
269 DPRINTF(HSAPacketProcessor, "AQL processing scheduled at tick: %d\n",
270 processingTick);
271 } else {
272 DPRINTF(HSAPacketProcessor, "AQL processing already scheduled\n");
273 }
274}
275
276void
281
283HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
284{
285 Q_STATE is_submitted = BLOCKED_BPKT;
286 SignalState *dep_sgnl_rd_st = &(regdQList[rl_idx]->depSignalRdState);
287 // Dependency signals are not read yet. And this can only be a retry.
288 // The retry logic will schedule the packet processor wakeup
289 if (dep_sgnl_rd_st->pendingReads != 0) {
290 return BLOCKED_BPKT;
291 }
292 // `pkt` can be typecasted to any type of AQL packet since they all
293 // have header information at offset zero
294 auto disp_pkt = (_hsa_dispatch_packet_t *)pkt;
295 hsa_packet_type_t pkt_type = PKT_TYPE(disp_pkt);
296 if (IS_BARRIER(disp_pkt) &&
297 regdQList[rl_idx]->compltnPending() > 0) {
298 // If this packet is using the "barrier bit" to enforce ordering with
299 // previous packets, and if there are outstanding packets, set the
300 // barrier bit for this queue and block the queue.
301 DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for active" \
302 " list ID = %d\n", __FUNCTION__, rl_idx);
303 regdQList[rl_idx]->setBarrierBit(true);
304 return BLOCKED_BBIT;
305 }
306 if (pkt_type == HSA_PACKET_TYPE_VENDOR_SPECIFIC) {
307 DPRINTF(HSAPacketProcessor, "%s: submitting vendor specific pkt" \
308 " active list ID = %d\n", __FUNCTION__, rl_idx);
309 // Submit packet to HSA device (dispatcher)
310 gpu_device->submitVendorPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
311 is_submitted = UNBLOCKED;
312 } else if (pkt_type == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
313 DPRINTF(HSAPacketProcessor, "%s: submitting kernel dispatch pkt" \
314 " active list ID = %d\n", __FUNCTION__, rl_idx);
315 // Submit packet to HSA device (dispatcher)
316 gpu_device->submitDispatchPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
317 is_submitted = UNBLOCKED;
318 /*
319 If this packet is using the "barrier bit" to enforce ordering with
320 subsequent kernels, set the bit for this queue now, after
321 dispatching.
322 */
323 if (IS_BARRIER(disp_pkt)) {
324 DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for active" \
325 " list ID = %d\n", __FUNCTION__, rl_idx);
326 regdQList[rl_idx]->setBarrierBit(true);
327 }
328 } else if (pkt_type == HSA_PACKET_TYPE_BARRIER_AND) {
329 DPRINTF(HSAPacketProcessor, "%s: Processing barrier packet" \
330 " active list ID = %d\n", __FUNCTION__, rl_idx);
331 auto bar_and_pkt = (_hsa_barrier_and_packet_t *)pkt;
332 bool isReady = true;
333 // Loop thorugh all the completion signals to see if this barrier
334 // packet is ready.
335 for (int i = 0; i < NumSignalsPerBarrier; i++) {
336 // dep_signal = zero imply no signal connected
337 if (bar_and_pkt->dep_signal[i]) {
338 // The signal value is aligned 8 bytes from
339 // the actual handle in the runtime
340 uint64_t signal_addr =
341 (uint64_t) (((uint64_t *) bar_and_pkt->dep_signal[i]) + 1);
342 hsa_signal_value_t *signal_val =
343 &(dep_sgnl_rd_st->values[i]);
344 DPRINTF(HSAPacketProcessor, "%s: Barrier pkt dep sgnl[%d]" \
345 " , sig addr %x, value %d active list ID = %d\n",
346 __FUNCTION__, i, signal_addr,
347 *signal_val, rl_idx);
348 // The if condition will be executed everytime except the
349 // very first time this barrier packet is encounteresd.
350 if (dep_sgnl_rd_st->allRead) {
351 if (*signal_val != 0) {
352 // This signal is not yet ready, read it again
353 isReady = false;
354
355 auto cb = new DmaVirtCallback<int64_t>(
356 [ = ] (const uint32_t &dma_data)
357 { dep_sgnl_rd_st->handleReadDMA(); }, 0);
358 dmaReadVirt(signal_addr, sizeof(hsa_signal_value_t),
359 cb, signal_val);
360 dep_sgnl_rd_st->pendingReads++;
361 DPRINTF(HSAPacketProcessor, "%s: Pending reads %d," \
362 " active list %d\n", __FUNCTION__,
363 dep_sgnl_rd_st->pendingReads, rl_idx);
364 }
365 } else {
366 // This signal is not yet ready, read it again
367 isReady = false;
368 auto cb = new DmaVirtCallback<int64_t>(
369 [ = ] (const uint32_t &dma_data)
370 { dep_sgnl_rd_st->handleReadDMA(); }, 0);
371 dmaReadVirt(signal_addr, sizeof(hsa_signal_value_t),
372 cb, signal_val);
373 dep_sgnl_rd_st->pendingReads++;
374 DPRINTF(HSAPacketProcessor, "%s: Pending reads %d," \
375 " active list %d\n", __FUNCTION__,
376 dep_sgnl_rd_st->pendingReads, rl_idx);
377 }
378 }
379 }
380 if (isReady) {
381 assert(dep_sgnl_rd_st->pendingReads == 0);
382 DPRINTF(HSAPacketProcessor, "%s: Barrier packet completed" \
383 " active list ID = %d\n", __FUNCTION__, rl_idx);
384 // TODO: Completion signal of barrier packet to be
385 // atomically decremented here
386 finishPkt((void*)bar_and_pkt, rl_idx);
387 is_submitted = UNBLOCKED;
388 // Reset signal values
389 dep_sgnl_rd_st->resetSigVals();
390 // The completion signal is connected
391 if (bar_and_pkt->completion_signal != 0) {
392 // The semantics of the HSA signal is to decrement the current
393 // signal value by one. Do this asynchronously via DMAs and
394 // callbacks as we can safely continue with this function
395 // while waiting for the next packet from the host.
396 DPRINTF(HSAPacketProcessor, "Triggering barrier packet" \
397 " completion signal! Addr: %x\n",
398 bar_and_pkt->completion_signal);
399
401 bar_and_pkt->completion_signal);
402 }
403 }
404 if (dep_sgnl_rd_st->pendingReads > 0) {
405 // Atleast one DepSignalsReadDmaEvent is scheduled this cycle
406 dep_sgnl_rd_st->allRead = false;
407 dep_sgnl_rd_st->discardRead = false;
408 }
409 } else if (pkt_type == HSA_PACKET_TYPE_BARRIER_OR) {
410 fatal("Unsupported packet type HSA_PACKET_TYPE_BARRIER_OR");
411 } else if (pkt_type == HSA_PACKET_TYPE_INVALID) {
412 fatal("Unsupported packet type HSA_PACKET_TYPE_INVALID");
413 } else if (pkt_type == HSA_PACKET_TYPE_AGENT_DISPATCH) {
414 DPRINTF(HSAPacketProcessor, "%s: submitting agent dispatch pkt" \
415 " active list ID = %d\n", __FUNCTION__, rl_idx);
416 // Submit packet to HSA device (dispatcher)
418 (void *)disp_pkt, rl_idx, host_pkt_addr);
419 is_submitted = UNBLOCKED;
420 sendAgentDispatchCompletionSignal((void *)disp_pkt,0);
421 } else {
422 fatal("Unsupported packet type %d\n", pkt_type);
423 }
424 return is_submitted;
425}
426
427// Wakes up every fixed time interval (pktProcessDelay) and processes a single
428// packet from the queue that scheduled this wakeup. If there are more
429// packets in that queue, the next wakeup is scheduled.
430void
432{
433 AQLRingBuffer *aqlRingBuffer = hsaPP->regdQList[rqIdx]->qCntxt.aqlBuf;
435 "%s: Qwakeup , rdIdx %d, wrIdx %d," \
436 " dispIdx %d, active list ID = %d\n",
437 __FUNCTION__, aqlRingBuffer->rdIdx(),
438 aqlRingBuffer->wrIdx(), aqlRingBuffer->dispIdx(), rqIdx);
439 // If barrier bit is set, then this wakeup is a dummy wakeup
440 // just to model the processing time. Do nothing.
441 if (hsaPP->regdQList[rqIdx]->getBarrierBit()) {
443 "Dummy wakeup with barrier bit for rdIdx %d\n", rqIdx);
444 return;
445 }
446 // In the future, we may support batch processing of packets.
447 // Then, we can just remove the break statements and the code
448 // will support batch processing. That is why we are using a
449 // "while loop" here instead on an "if" condition.
450 while (hsaPP->regdQList[rqIdx]->dispPending()) {
451 void *pkt = aqlRingBuffer->ptr(aqlRingBuffer->dispIdx());
452 DPRINTF(HSAPacketProcessor, "%s: Attempting dispatch @ dispIdx[%d]\n",
453 __FUNCTION__, aqlRingBuffer->dispIdx());
454 Addr host_addr = aqlRingBuffer->hostDispAddr();
455 Q_STATE q_state = hsaPP->processPkt(pkt, rqIdx, host_addr);
456 if (q_state == UNBLOCKED) {
457 aqlRingBuffer->incDispIdx(1);
458 DPRINTF(HSAPacketProcessor, "%s: Increment dispIdx[%d]\n",
459 __FUNCTION__, aqlRingBuffer->dispIdx());
460 if (hsaPP->regdQList[rqIdx]->dispPending()) {
462 }
463 break;
464 } else if (q_state == BLOCKED_BPKT) {
465 // This queue is blocked by barrier packet,
466 // schedule a processing event
468 break;
469 } else if (q_state == BLOCKED_BBIT) {
470 // This queue is blocked by barrier bit, and processing event
471 // should be scheduled from finishPkt(). However, to elapse
472 // "pktProcessDelay" processing time, let us schedule a dummy
473 // wakeup once which will just wakeup and will do nothing.
475 break;
476 } else {
477 panic("Unknown queue state\n");
478 }
479 }
480}
481
482void
484{
485 assert(pendingReads > 0);
486 pendingReads--;
487 if (pendingReads == 0) {
488 allRead = true;
489 if (discardRead) {
490 resetSigVals();
491 }
492 }
493}
494
495void
497{
498 HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
499 AQLRingBuffer *aqlRingBuffer = regdQList[rl_idx]->qCntxt.aqlBuf;
500
502 "%s: read-pointer offset[0x%x], write-pointer offset[0x%x]"
503 " doorbell(%d)[0x%x] \n",
504 __FUNCTION__, qDesc->readIndex,
505 qDesc->writeIndex, pid, qDesc->doorbellPointer);
506
507 if (qDesc->dmaInProgress) {
508 // we'll try again when this dma transfer completes in updateReadIndex
509 return;
510 }
511 uint32_t num_umq = qDesc->spaceUsed();
512 if (num_umq == 0)
513 return; // nothing to be gotten
514 uint32_t umq_nxt = qDesc->readIndex;
515 // Total AQL buffer size
516 uint32_t ttl_aql_buf = aqlRingBuffer->numObjs();
517 // Available AQL buffer size. If the available buffer is less than
518 // demanded, number of available buffer is returned
519 uint32_t got_aql_buf = aqlRingBuffer->allocEntry(num_umq);
520 qDesc->readIndex += got_aql_buf;
521 uint32_t dma_start_ix = (aqlRingBuffer->wrIdx() - got_aql_buf) %
522 ttl_aql_buf;
523 dma_series_ctx *series_ctx = NULL;
524
525 DPRINTF(HSAPacketProcessor, "%s: umq_nxt = %d, ttl_aql_buf = %d, "
526 "dma_start_ix = %d, num_umq = %d\n", __FUNCTION__, umq_nxt,
527 ttl_aql_buf, dma_start_ix, num_umq);
528
529 if (got_aql_buf == 0) {
530 // we'll try again when some dma bufs are freed in freeEntry
531 qDesc->stalledOnDmaBufAvailability = true;
532 return;
533 } else {
534 qDesc->stalledOnDmaBufAvailability = false;
535 }
536
537 uint32_t dma_b4_wrap = ttl_aql_buf - dma_start_ix;
538 while (got_aql_buf != 0 && num_umq != 0) {
539 uint32_t umq_b4_wrap = qDesc->numObjs() -
540 (umq_nxt % qDesc->objSize());
541 uint32_t num_2_xfer
542 = std::min({umq_b4_wrap, dma_b4_wrap, num_umq, got_aql_buf});
543 if (!series_ctx) {
544 qDesc->dmaInProgress = true;
545 series_ctx = new dma_series_ctx(got_aql_buf, got_aql_buf,
546 dma_start_ix, rl_idx);
547 }
548
549 void *aql_buf = aqlRingBuffer->ptr(dma_start_ix);
550 auto cb = new DmaVirtCallback<uint64_t>(
551 [ = ] (const uint32_t &dma_data)
552 { this->cmdQueueCmdDma(this, pid, true, dma_start_ix,
553 num_2_xfer, series_ctx, aql_buf); }, 0);
554 dmaReadVirt(qDesc->ptr(umq_nxt), num_2_xfer * qDesc->objSize(),
555 cb, aql_buf);
556
557 aqlRingBuffer->saveHostDispAddr(qDesc->ptr(umq_nxt), num_2_xfer,
558 dma_start_ix);
559
561 "%s: aql_buf = %p, umq_nxt = %d, dma_ix = %d, num2xfer = %d\n",
562 __FUNCTION__, aql_buf, umq_nxt, dma_start_ix, num_2_xfer);
563
564 num_umq -= num_2_xfer;
565 got_aql_buf -= num_2_xfer;
566 dma_start_ix = (dma_start_ix + num_2_xfer) % ttl_aql_buf;
567 umq_nxt = (umq_nxt + num_2_xfer) % qDesc->numObjs();
568 if (got_aql_buf == 0 && num_umq != 0) {
569 // There are more packets in the queue but
570 // not enough DMA buffers. Set the stalledOnDmaBufAvailability,
571 // we will try again in freeEntry
572 qDesc->stalledOnDmaBufAvailability = true;
573 }
574 }
575}
576
577void
579{
580 [[maybe_unused]] HSAQueueDescriptor* qDesc =
581 regdQList[rl_idx]->qCntxt.qDesc;
583 "%s: pid[%d], basePointer[0x%lx], dBPointer[0x%lx], "
584 "writeIndex[0x%x], readIndex[0x%x], size(bytes)[0x%x]\n",
585 __FUNCTION__, pid, qDesc->basePointer,
586 qDesc->doorbellPointer, qDesc->writeIndex,
587 qDesc->readIndex, qDesc->numElts);
588}
589
591 const std::string name)
592 : _name(name), _wrIdx(0), _rdIdx(0), _dispIdx(0)
593{
594 _aqlBuf.resize(size);
595 _aqlComplete.resize(size);
596 _hostDispAddresses.resize(size);
597 // Mark all packets as invalid and incomplete
598 for (auto& it : _aqlBuf)
599 it.header = HSA_PACKET_TYPE_INVALID;
600 std::fill(_aqlComplete.begin(), _aqlComplete.end(), false);
601}
602
603void
605{
606 _rdIdx = value;
607}
608
609void
611{
612 _wrIdx = value;
613}
614
615void
617{
618 _dispIdx = value;
619}
620
621bool
623{
624 _aqlComplete[(hsa_kernel_dispatch_packet_t *) pkt - _aqlBuf.data()] = true;
625 DPRINTF(HSAPacketProcessor, "%s: pkt_ix = %d; "\
626 " # free entries = %d, wrIdx = %d, rdIdx = %d\n", __FUNCTION__,
627 (hsa_kernel_dispatch_packet_t *) pkt - _aqlBuf.data(),
628 nFree(), wrIdx(), rdIdx());
629 // Packets can complete out-of-order. This code "retires" packets in-order
630 // by updating the read pointer in the MQD when a contiguous chunk of
631 // packets have finished.
632 uint32_t old_rdIdx = rdIdx();
633 while (_aqlComplete[rdIdx() % numObjs()]) {
634 _aqlComplete[rdIdx() % numObjs()] = false;
636 incRdIdx(1);
637 }
638 return (old_rdIdx != rdIdx());
639}
640
641void
646
647int
649{
650 DPRINTF(HSAPacketProcessor, "%s: nReq = %d\n", __FUNCTION__, nBufReq);
651 if (nFree() == 0) {
652 DPRINTF(HSAPacketProcessor, "%s: return = %d\n", __FUNCTION__, 0);
653 return 0;
654 }
655
656 if (nBufReq > nFree())
657 nBufReq = nFree();
658
659 DPRINTF(HSAPacketProcessor, "%s: ix1stFree = %d\n", __FUNCTION__, wrIdx());
660 incWrIdx(nBufReq);
661 DPRINTF(HSAPacketProcessor, "%s: return = %d, wrIdx = %d\n",
662 __FUNCTION__, nBufReq, wrIdx());
663 return nBufReq;
664}
665
666void
667HSAPacketProcessor::finishPkt(void *pvPkt, uint32_t rl_idx)
668{
669 HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
670
671 // if barrier bit was set and this is the last
672 // outstanding packet from that queue,
673 // unset it here
674 if (regdQList[rl_idx]->getBarrierBit() &&
675 regdQList[rl_idx]->isLastOutstandingPkt()) {
677 "Unset barrier bit for active list ID %d\n", rl_idx);
678 regdQList[rl_idx]->setBarrierBit(false);
679 // if pending kernels in the queue after this kernel, reschedule
680 if (regdQList[rl_idx]->dispPending()) {
682 "Rescheduling active list ID %d after unsetting barrier "
683 "bit\n", rl_idx);
684 schedAQLProcessing(rl_idx);
685 }
686 }
687
688 // If set, then blocked schedule, so need to reschedule
689 if (regdQList[rl_idx]->qCntxt.aqlBuf->freeEntry(pvPkt))
690 updateReadIndex(0, rl_idx);
692 "%s: rd-ptr offset [0x%x], wr-ptr offset [0x%x], space used = %d," \
693 " q size = %d, stalled = %s, empty = %s, active list ID = %d\n",
694 __FUNCTION__, qDesc->readIndex, qDesc->writeIndex,
695 qDesc->spaceUsed(), qDesc->numElts,
696 qDesc->stalledOnDmaBufAvailability? "true" : "false",
697 qDesc->isEmpty()? "true" : "false", rl_idx);
698 // DMA buffer is freed, check the queue to see if there are DMA
699 // accesses blocked becasue of non-availability of DMA buffer
700 if (qDesc->stalledOnDmaBufAvailability) {
701 assert(!qDesc->isEmpty());
702 getCommandsFromHost(0, rl_idx); // TODO:assign correct pid
703 // when implementing
704 // multi-process support
705 }
706}
707
708void
710 void *pkt, hsa_signal_value_t signal)
711{
712 auto agent_pkt = (_hsa_agent_dispatch_packet_t *)pkt;
713 uint64_t signal_addr =
714 (uint64_t) (((uint64_t *)agent_pkt->completion_signal) + 1);
715 DPRINTF(HSAPacketProcessor, "Triggering Agent Dispatch packet" \
716 " completion signal: %x!\n", signal_addr);
726 VPtr<uint64_t> prev_signal(signal_addr, sys->threads[0]);
727
728 DPRINTF(HSAPacketProcessor,"HSADriver: Sending signal to %lu\n",
729 (uint64_t)sys->threads[0]->cpuId());
730
731
732 hsa_signal_value_t *new_signal = new hsa_signal_value_t;
733 *new_signal = (hsa_signal_value_t) *prev_signal - 1;
734
735 dmaWriteVirt(signal_addr, sizeof(hsa_signal_value_t), nullptr, new_signal, 0);
736}
737
738void
740{
741 uint64_t signal_addr = (uint64_t) (((uint64_t *)signal) + 1);
742 DPRINTF(HSAPacketProcessor, "Triggering completion signal: %x!\n",
743 signal_addr);
753 VPtr<uint64_t> prev_signal(signal_addr, sys->threads[0]);
754
755 hsa_signal_value_t *new_signal = new hsa_signal_value_t;
756 *new_signal = (hsa_signal_value_t) *prev_signal - 1;
757
758 dmaWriteVirt(signal_addr, sizeof(hsa_signal_value_t), nullptr, new_signal, 0);
759}
760
761} // namespace gem5
#define DPRINTF(x,...)
Definition trace.hh:210
Declaration and inline definition of ChunkGenerator object.
Device model for an AMD GPU.
RequestorID vramRequestorId()
Methods related to translations and system/device memory.
Internal ring buffer which is used to prefetch/store copies of the in-memory HSA ring buffer.
void setRdIdx(uint64_t value)
std::vector< bool > _aqlComplete
void * ptr(uint32_t ix)
int allocEntry(uint32_t nBufReq)
void incDispIdx(uint64_t value)
void setDispIdx(uint64_t value)
void saveHostDispAddr(Addr host_pkt_addr, int num_pkts, int ix)
the kernel may try to read from the dispatch packet, so we need to keep the host address that corresp...
void setWrIdx(uint64_t value)
AQLRingBuffer(uint32_t size, const std::string name)
std::vector< hsa_kernel_dispatch_packet_t > _aqlBuf
void incWrIdx(uint64_t value)
std::vector< Addr > _hostDispAddresses
void incRdIdx(uint64_t value)
Wraps a std::function object in a DmaCallback.
void dmaReadVirt(Addr host_addr, unsigned size, DmaCallback *cb, void *data, Tick delay=0)
Initiate a DMA read from virtual address host_addr.
void dmaWriteVirt(Addr host_addr, unsigned size, DmaCallback *b, void *data, Tick delay=0)
Initiate a DMA write from virtual address host_addr.
void sendCompletionSignal(Addr signal_handle)
void submitDispatchPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitDispatchPkt() is the entry point into the CP from the HSAPP and is only meant to be used with A...
void submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitAgentDispatchPkt() is for accepting agent dispatch packets.
void submitVendorPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitVendorPkt() is for accepting vendor-specific packets from the HSAPP.
std::vector< hsa_signal_value_t > values
void sendAgentDispatchCompletionSignal(void *pkt, hsa_signal_value_t signal)
std::vector< class RQLEntry * > regdQList
void updateReadIndex(int, uint32_t)
virtual Tick write(Packet *) override
void cmdQueueCmdDma(HSAPacketProcessor *hsaPP, int pid, bool isRead, uint32_t ix_start, unsigned num_pkts, dma_series_ctx *series_ctx, void *dest_4debug)
void sendCompletionSignal(hsa_signal_value_t signal)
GPUCommandProcessor * gpu_device
void updateReadDispIdDma()
this event is used to update the read_disp_id field (the read pointer) of the MQD,...
void setGPUDevice(AMDGPUDevice *gpu_device)
HSAPacketProcessorParams Params
void getCommandsFromHost(int pid, uint32_t rl_idx)
TranslationGenPtr translate(Addr vaddr, Addr size) override
Function used to translate a range of addresses from virtual to physical addresses.
void setDeviceQueueDesc(uint64_t hostReadIndexPointer, uint64_t basePointer, uint64_t queue_id, uint32_t size, int doorbellSize, GfxVersion gfxVersion, Addr offset=0, uint64_t rd_idx=0)
void displayQueueDescriptor(int pid, uint32_t rl_idx)
Q_STATE processPkt(void *pkt, uint32_t rl_idx, Addr host_pkt_addr)
void finishPkt(void *pkt, uint32_t rl_idx)
virtual AddrRangeList getAddrRanges() const override
Every PIO device is obliged to provide an implementation that returns the address ranges the device r...
void unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize)
void schedAQLProcessing(uint32_t rl_idx)
void setDevice(GPUCommandProcessor *dev)
virtual Tick read(Packet *) override
void unregisterQueue(uint64_t queue_id, int doorbellSize)
void registerNewQueue(uint64_t hostReadIndexPointer, uint64_t basePointer, uint64_t queue_id, uint32_t size, int doorbellSize, GfxVersion gfxVersion, Addr offset=0, uint64_t rd_idx=0)
void write(Addr db_addr, uint64_t doorbell_reg)
const std::string _name
Definition named.hh:41
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
Definition packet.hh:295
void setBadAddress()
Definition packet.hh:786
Addr getAddr() const
Definition packet.hh:807
unsigned getSize() const
Definition packet.hh:817
void makeAtomicResponse()
Definition packet.hh:1074
T getLE() const
Get the data in the packet byte swapped from little endian to host endian.
Threads threads
Definition system.hh:310
void setDevRequestor(RequestorID mid)
#define PAGE_SIZE
Definition base.cc:60
The GPUCommandProcessor (CP) is responsible for accepting commands, in the form of HSA AQL packets,...
AddrRange RangeSize(Addr start, Addr size)
bool scheduled() const
Determine if the current event is scheduled.
Definition eventq.hh:458
void schedule(Event &event, Tick when)
Definition eventq.hh:1012
#define panic(...)
This implements a cprintf based panic() function.
Definition logging.hh:188
#define fatal(...)
This implements a cprintf based fatal() function.
Definition logging.hh:200
hsa_packet_type_t
Packet type.
Definition hsa.h:2746
@ HSA_PACKET_TYPE_BARRIER_AND
Packet used by agents to delay processing of subsequent packets, and to express complex dependencies ...
Definition hsa.h:2767
@ HSA_PACKET_TYPE_BARRIER_OR
Packet used by agents to delay processing of subsequent packets, and to express complex dependencies ...
Definition hsa.h:2778
@ HSA_PACKET_TYPE_VENDOR_SPECIFIC
Vendor-specific packet.
Definition hsa.h:2750
@ HSA_PACKET_TYPE_INVALID
The packet has been processed in the past, but has not been reassigned to the packet processor.
Definition hsa.h:2756
@ HSA_PACKET_TYPE_KERNEL_DISPATCH
Packet used by agents for dispatching jobs to kernel agents.
Definition hsa.h:2761
@ HSA_PACKET_TYPE_AGENT_DISPATCH
Packet used by agents for dispatching jobs to agents.
Definition hsa.h:2772
int32_t hsa_signal_value_t
Signal value.
Definition hsa.h:1322
#define PKT_TYPE(PKT)
#define HSAPP_EVENT_DESCRIPTION_GENERATOR(XEVENT)
#define IS_BARRIER(PKT)
#define NumSignalsPerBarrier
Bitfield< 7 > i
Definition misc_types.hh:67
Bitfield< 23, 0 > offset
Definition types.hh:144
Bitfield< 0 > p
Copyright (c) 2024 - Pranith Kumar Copyright (c) 2020 Inria All rights reserved.
Definition binary32.hh:36
Tick curTick()
The universal simulation clock.
Definition cur_tick.hh:46
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition types.hh:147
bool FullSystem
The FullSystem variable can be used to determine the current mode of simulation.
Definition root.cc:220
uint64_t Tick
Tick count type.
Definition types.hh:58
std::unique_ptr< TranslationGen > TranslationGenPtr
Declarations of a non-full system Page Table.
Calls getCurrentEntry once the queueEntry has been dmaRead.
AQL kernel dispatch packet.
Definition hsa.h:2901
const std::string & name()
Definition trace.cc:48

Generated on Tue Jun 18 2024 16:24:03 for gem5 by doxygen 1.11.0