gem5 v23.0.0.1
Loading...
Searching...
No Matches
hsa_packet_processor.cc
Go to the documentation of this file.
1/*
2 * Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
33
34#include <cassert>
35#include <cstring>
36
38#include "base/compiler.hh"
39#include "base/logging.hh"
40#include "base/trace.hh"
41#include "debug/HSAPacketProcessor.hh"
43#include "dev/dma_device.hh"
44#include "dev/hsa/hsa_packet.hh"
46#include "enums/GfxVersion.hh"
48#include "mem/packet_access.hh"
49#include "mem/page_table.hh"
50#include "sim/full_system.hh"
51#include "sim/process.hh"
52#include "sim/proxy_ptr.hh"
53#include "sim/system.hh"
54
// Generates the description() method for a packet-processor event class;
// the description is simply the stringified event class name.
#define HSAPP_EVENT_DESCRIPTION_GENERATOR(XEVENT) \
  const char*                                     \
  HSAPacketProcessor::XEVENT::description() const \
  {                                               \
      return #XEVENT;                             \
  }

// Extracts the packet-type field from an AQL packet header -- shift the
// type field to the LSBs, then mask off everything else.
#define PKT_TYPE(PKT) ((hsa_packet_type_t)(((PKT->header) >> \
    HSA_PACKET_HEADER_TYPE) & mask(HSA_PACKET_HEADER_WIDTH_TYPE)))

// checks if the barrier bit is set in the header -- shift the barrier bit
// to LSB, then bitwise "and" to mask off all other bits
#define IS_BARRIER(PKT) ((hsa_packet_header_t)(((PKT->header) >> \
    HSA_PACKET_HEADER_BARRIER) & \
    mask(HSA_PACKET_HEADER_WIDTH_BARRIER)))
70
71namespace gem5
72{
73
74HSAPP_EVENT_DESCRIPTION_GENERATOR(QueueProcessEvent)
75
77 : DmaVirtDevice(p), walker(p.walker),
78 numHWQueues(p.numHWQueues), pioAddr(p.pioAddr),
79 pioSize(PAGE_SIZE), pioDelay(10), pktProcessDelay(p.pktProcessDelay)
80{
81 DPRINTF(HSAPacketProcessor, "%s:\n", __FUNCTION__);
82 hwSchdlr = new HWScheduler(this, p.wakeupDelay);
83 regdQList.resize(numHWQueues);
84 for (int i = 0; i < numHWQueues; i++) {
85 regdQList[i] = new RQLEntry(this, i);
86 }
87}
88
90{
91 for (auto &queue : regdQList) {
92 delete queue;
93 }
94}
95
96void
98{
100
101 assert(walker);
103}
104
105void
106HSAPacketProcessor::unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize)
107{
108 hwSchdlr->unregisterQueue(queue_id, doorbellSize);
109}
110
111void
112HSAPacketProcessor::setDeviceQueueDesc(uint64_t hostReadIndexPointer,
113 uint64_t basePointer,
114 uint64_t queue_id,
115 uint32_t size, int doorbellSize,
116 GfxVersion gfxVersion,
117 Addr offset, uint64_t rd_idx)
118{
120 "%s:base = %p, qID = %d, ze = %d\n", __FUNCTION__,
121 (void *)basePointer, queue_id, size);
122 hwSchdlr->registerNewQueue(hostReadIndexPointer,
123 basePointer, queue_id, size, doorbellSize,
124 gfxVersion, offset, rd_idx);
125}
126
129{
130 assert(pioSize != 0);
131
132 AddrRangeList ranges;
133 ranges.push_back(RangeSize(pioAddr, pioSize));
134
135 return ranges;
136}
137
138// Basically only processes writes to the queue doorbell register.
139Tick
141{
142 assert(pkt->getAddr() >= pioAddr && pkt->getAddr() < pioAddr + pioSize);
143
144 // TODO: How to get pid??
145 [[maybe_unused]] Addr daddr = pkt->getAddr() - pioAddr;
146
148 "%s: write of size %d to reg-offset %d (0x%x)\n",
149 __FUNCTION__, pkt->getSize(), daddr, daddr);
150
151 assert(gpu_device->driver()->doorbellSize() == pkt->getSize());
152
153 uint64_t doorbell_reg(0);
154 if (pkt->getSize() == 8)
155 doorbell_reg = pkt->getLE<uint64_t>() + 1;
156 else if (pkt->getSize() == 4)
157 doorbell_reg = pkt->getLE<uint32_t>();
158 else
159 fatal("invalid db size");
160
162 "%s: write data 0x%x to offset %d (0x%x)\n",
163 __FUNCTION__, doorbell_reg, daddr, daddr);
164 hwSchdlr->write(daddr, doorbell_reg);
165 pkt->makeAtomicResponse();
166 return pioDelay;
167}
168
169Tick
171{
172 pkt->makeAtomicResponse();
173 pkt->setBadAddress();
174 return pioDelay;
175}
176
179{
180 if (!FullSystem) {
181 // Grab the process and try to translate the virtual address with it;
182 // with new extensions, it will likely be wrong to just arbitrarily
183 // grab context zero.
184 auto process = sys->threads[0]->getProcessPtr();
185
186 return process->pTable->translateRange(vaddr, size);
187 }
188
189 // In full system use the page tables setup by the kernel driver rather
190 // than the CPU page tables.
191 return TranslationGenPtr(
193 1 /* vmid */, vaddr, size));
194}
195
201void
203{
204 DPRINTF(HSAPacketProcessor, "updateReaddispId\n");
205}
206
207void
208HSAPacketProcessor::updateReadIndex(int pid, uint32_t rl_idx)
209{
210 AQLRingBuffer* aqlbuf = regdQList[rl_idx]->qCntxt.aqlBuf;
211 HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
212 auto cb = new DmaVirtCallback<uint64_t>(
213 [ = ] (const uint32_t &dma_data) { this->updateReadDispIdDma(); }, 0);
214
216 "%s: read-pointer offset [0x%x]\n", __FUNCTION__, aqlbuf->rdIdx());
217
218 dmaWriteVirt((Addr)qDesc->hostReadIndexPtr, sizeof(aqlbuf->rdIdx()),
219 cb, aqlbuf->rdIdxPtr());
220
222 "%s: rd-ptr offset [0x%x], wr-ptr offset [0x%x], space used = %d," \
223 " q size = %d, is_empty = %s, active list ID = %d\n", __FUNCTION__,
224 qDesc->readIndex, qDesc->writeIndex, qDesc->spaceUsed(),
225 qDesc->numElts, qDesc->isEmpty()? "true" : "false", rl_idx);
226 if (qDesc->writeIndex != aqlbuf->wrIdx()) {
227 getCommandsFromHost(pid, rl_idx);
228 }
229}
230
231void
233 bool isRead, uint32_t ix_start, unsigned num_pkts,
234 dma_series_ctx *series_ctx, void *dest_4debug)
235{
236 uint32_t rl_idx = series_ctx->rl_idx;
237 [[maybe_unused]] AQLRingBuffer *aqlRingBuffer =
238 hsaPP->regdQList[rl_idx]->qCntxt.aqlBuf;
239 HSAQueueDescriptor* qDesc =
240 hsaPP->regdQList[rl_idx]->qCntxt.qDesc;
241 DPRINTF(HSAPacketProcessor, ">%s, ix = %d, npkts = %d," \
242 " pktsRemaining = %d, active list ID = %d\n", __FUNCTION__,
243 ix_start, num_pkts, series_ctx->pkts_2_go,
244 rl_idx);
245 if (isRead) {
246 series_ctx->pkts_2_go -= num_pkts;
247 if (series_ctx->pkts_2_go == 0) {
248 // Mark DMA as completed
249 qDesc->dmaInProgress = false;
251 "%s: schedule Qwakeup next cycle, rdIdx %d, wrIdx %d," \
252 " dispIdx %d, active list ID = %d\n",
253 __FUNCTION__, aqlRingBuffer->rdIdx(),
254 aqlRingBuffer->wrIdx(), aqlRingBuffer->dispIdx(), rl_idx);
255 // schedule queue wakeup
256 hsaPP->schedAQLProcessing(rl_idx);
257 delete series_ctx;
258 }
259 }
260}
261
262void
264{
265 RQLEntry *queue = regdQList[rl_idx];
266 if (!queue->aqlProcessEvent.scheduled()) {
267 Tick processingTick = curTick() + delay;
268 schedule(queue->aqlProcessEvent, processingTick);
269 DPRINTF(HSAPacketProcessor, "AQL processing scheduled at tick: %d\n",
270 processingTick);
271 } else {
272 DPRINTF(HSAPacketProcessor, "AQL processing already scheduled\n");
273 }
274}
275
276void
278{
280}
281
283HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
284{
285 Q_STATE is_submitted = BLOCKED_BPKT;
286 SignalState *dep_sgnl_rd_st = &(regdQList[rl_idx]->depSignalRdState);
287 // Dependency signals are not read yet. And this can only be a retry.
288 // The retry logic will schedule the packet processor wakeup
289 if (dep_sgnl_rd_st->pendingReads != 0) {
290 return BLOCKED_BPKT;
291 }
292 // `pkt` can be typecasted to any type of AQL packet since they all
293 // have header information at offset zero
294 auto disp_pkt = (_hsa_dispatch_packet_t *)pkt;
295 hsa_packet_type_t pkt_type = PKT_TYPE(disp_pkt);
296 if (IS_BARRIER(disp_pkt) &&
297 regdQList[rl_idx]->compltnPending() > 0) {
298 // If this packet is using the "barrier bit" to enforce ordering with
299 // previous packets, and if there are outstanding packets, set the
300 // barrier bit for this queue and block the queue.
301 DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for active" \
302 " list ID = %d\n", __FUNCTION__, rl_idx);
303 regdQList[rl_idx]->setBarrierBit(true);
304 return BLOCKED_BBIT;
305 }
306 if (pkt_type == HSA_PACKET_TYPE_VENDOR_SPECIFIC) {
307 DPRINTF(HSAPacketProcessor, "%s: submitting vendor specific pkt" \
308 " active list ID = %d\n", __FUNCTION__, rl_idx);
309 // Submit packet to HSA device (dispatcher)
310 gpu_device->submitVendorPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
311 is_submitted = UNBLOCKED;
312 } else if (pkt_type == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
313 DPRINTF(HSAPacketProcessor, "%s: submitting kernel dispatch pkt" \
314 " active list ID = %d\n", __FUNCTION__, rl_idx);
315 // Submit packet to HSA device (dispatcher)
316 gpu_device->submitDispatchPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
317 is_submitted = UNBLOCKED;
318 /*
319 If this packet is using the "barrier bit" to enforce ordering with
320 subsequent kernels, set the bit for this queue now, after
321 dispatching.
322 */
323 if (IS_BARRIER(disp_pkt)) {
324 DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for active" \
325 " list ID = %d\n", __FUNCTION__, rl_idx);
326 regdQList[rl_idx]->setBarrierBit(true);
327 }
328 } else if (pkt_type == HSA_PACKET_TYPE_BARRIER_AND) {
329 DPRINTF(HSAPacketProcessor, "%s: Processing barrier packet" \
330 " active list ID = %d\n", __FUNCTION__, rl_idx);
331 auto bar_and_pkt = (_hsa_barrier_and_packet_t *)pkt;
332 bool isReady = true;
333 // Loop thorugh all the completion signals to see if this barrier
334 // packet is ready.
335 for (int i = 0; i < NumSignalsPerBarrier; i++) {
336 // dep_signal = zero imply no signal connected
337 if (bar_and_pkt->dep_signal[i]) {
338 // The signal value is aligned 8 bytes from
339 // the actual handle in the runtime
340 uint64_t signal_addr =
341 (uint64_t) (((uint64_t *) bar_and_pkt->dep_signal[i]) + 1);
342 hsa_signal_value_t *signal_val =
343 &(dep_sgnl_rd_st->values[i]);
344 DPRINTF(HSAPacketProcessor, "%s: Barrier pkt dep sgnl[%d]" \
345 " , sig addr %x, value %d active list ID = %d\n",
346 __FUNCTION__, i, signal_addr,
347 *signal_val, rl_idx);
348 // The if condition will be executed everytime except the
349 // very first time this barrier packet is encounteresd.
350 if (dep_sgnl_rd_st->allRead) {
351 if (*signal_val != 0) {
352 // This signal is not yet ready, read it again
353 isReady = false;
354
355 auto cb = new DmaVirtCallback<int64_t>(
356 [ = ] (const uint32_t &dma_data)
357 { dep_sgnl_rd_st->handleReadDMA(); }, 0);
358 dmaReadVirt(signal_addr, sizeof(hsa_signal_value_t),
359 cb, signal_val);
360 dep_sgnl_rd_st->pendingReads++;
361 DPRINTF(HSAPacketProcessor, "%s: Pending reads %d," \
362 " active list %d\n", __FUNCTION__,
363 dep_sgnl_rd_st->pendingReads, rl_idx);
364 }
365 } else {
366 // This signal is not yet ready, read it again
367 isReady = false;
368 auto cb = new DmaVirtCallback<int64_t>(
369 [ = ] (const uint32_t &dma_data)
370 { dep_sgnl_rd_st->handleReadDMA(); }, 0);
371 dmaReadVirt(signal_addr, sizeof(hsa_signal_value_t),
372 cb, signal_val);
373 dep_sgnl_rd_st->pendingReads++;
374 DPRINTF(HSAPacketProcessor, "%s: Pending reads %d," \
375 " active list %d\n", __FUNCTION__,
376 dep_sgnl_rd_st->pendingReads, rl_idx);
377 }
378 }
379 }
380 if (isReady) {
381 assert(dep_sgnl_rd_st->pendingReads == 0);
382 DPRINTF(HSAPacketProcessor, "%s: Barrier packet completed" \
383 " active list ID = %d\n", __FUNCTION__, rl_idx);
384 // TODO: Completion signal of barrier packet to be
385 // atomically decremented here
386 finishPkt((void*)bar_and_pkt, rl_idx);
387 is_submitted = UNBLOCKED;
388 // Reset signal values
389 dep_sgnl_rd_st->resetSigVals();
390 // The completion signal is connected
391 if (bar_and_pkt->completion_signal != 0) {
392 // HACK: The semantics of the HSA signal is to
393 // decrement the current signal value
394 // I'm going to cheat here and read out
395 // the value from main memory using functional
396 // access, and then just DMA the decremented value.
397 uint64_t signal_value = gpu_device->functionalReadHsaSignal(\
398 bar_and_pkt->completion_signal);
399
400 DPRINTF(HSAPacketProcessor, "Triggering barrier packet" \
401 " completion signal! Addr: %x\n",
402 bar_and_pkt->completion_signal);
403
404 gpu_device->updateHsaSignal(bar_and_pkt->completion_signal,
405 signal_value - 1);
406 }
407 }
408 if (dep_sgnl_rd_st->pendingReads > 0) {
409 // Atleast one DepSignalsReadDmaEvent is scheduled this cycle
410 dep_sgnl_rd_st->allRead = false;
411 dep_sgnl_rd_st->discardRead = false;
412 }
413 } else if (pkt_type == HSA_PACKET_TYPE_BARRIER_OR) {
414 fatal("Unsupported packet type HSA_PACKET_TYPE_BARRIER_OR");
415 } else if (pkt_type == HSA_PACKET_TYPE_INVALID) {
416 fatal("Unsupported packet type HSA_PACKET_TYPE_INVALID");
417 } else if (pkt_type == HSA_PACKET_TYPE_AGENT_DISPATCH) {
418 DPRINTF(HSAPacketProcessor, "%s: submitting agent dispatch pkt" \
419 " active list ID = %d\n", __FUNCTION__, rl_idx);
420 // Submit packet to HSA device (dispatcher)
422 (void *)disp_pkt, rl_idx, host_pkt_addr);
423 is_submitted = UNBLOCKED;
424 sendAgentDispatchCompletionSignal((void *)disp_pkt,0);
425 } else {
426 fatal("Unsupported packet type %d\n", pkt_type);
427 }
428 return is_submitted;
429}
430
431// Wakes up every fixed time interval (pktProcessDelay) and processes a single
432// packet from the queue that scheduled this wakeup. If there are more
433// packets in that queue, the next wakeup is scheduled.
434void
436{
437 AQLRingBuffer *aqlRingBuffer = hsaPP->regdQList[rqIdx]->qCntxt.aqlBuf;
439 "%s: Qwakeup , rdIdx %d, wrIdx %d," \
440 " dispIdx %d, active list ID = %d\n",
441 __FUNCTION__, aqlRingBuffer->rdIdx(),
442 aqlRingBuffer->wrIdx(), aqlRingBuffer->dispIdx(), rqIdx);
443 // If barrier bit is set, then this wakeup is a dummy wakeup
444 // just to model the processing time. Do nothing.
445 if (hsaPP->regdQList[rqIdx]->getBarrierBit()) {
447 "Dummy wakeup with barrier bit for rdIdx %d\n", rqIdx);
448 return;
449 }
450 // In the future, we may support batch processing of packets.
451 // Then, we can just remove the break statements and the code
452 // will support batch processing. That is why we are using a
453 // "while loop" here instead on an "if" condition.
454 while (hsaPP->regdQList[rqIdx]->dispPending()) {
455 void *pkt = aqlRingBuffer->ptr(aqlRingBuffer->dispIdx());
456 DPRINTF(HSAPacketProcessor, "%s: Attempting dispatch @ dispIdx[%d]\n",
457 __FUNCTION__, aqlRingBuffer->dispIdx());
458 Addr host_addr = aqlRingBuffer->hostDispAddr();
459 Q_STATE q_state = hsaPP->processPkt(pkt, rqIdx, host_addr);
460 if (q_state == UNBLOCKED) {
461 aqlRingBuffer->incDispIdx(1);
462 DPRINTF(HSAPacketProcessor, "%s: Increment dispIdx[%d]\n",
463 __FUNCTION__, aqlRingBuffer->dispIdx());
464 if (hsaPP->regdQList[rqIdx]->dispPending()) {
466 }
467 break;
468 } else if (q_state == BLOCKED_BPKT) {
469 // This queue is blocked by barrier packet,
470 // schedule a processing event
472 break;
473 } else if (q_state == BLOCKED_BBIT) {
474 // This queue is blocked by barrier bit, and processing event
475 // should be scheduled from finishPkt(). However, to elapse
476 // "pktProcessDelay" processing time, let us schedule a dummy
477 // wakeup once which will just wakeup and will do nothing.
479 break;
480 } else {
481 panic("Unknown queue state\n");
482 }
483 }
484}
485
486void
488{
489 assert(pendingReads > 0);
490 pendingReads--;
491 if (pendingReads == 0) {
492 allRead = true;
493 if (discardRead) {
494 resetSigVals();
495 }
496 }
497}
498
499void
501{
502 HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
503 AQLRingBuffer *aqlRingBuffer = regdQList[rl_idx]->qCntxt.aqlBuf;
504
506 "%s: read-pointer offset[0x%x], write-pointer offset[0x%x]"
507 " doorbell(%d)[0x%x] \n",
508 __FUNCTION__, qDesc->readIndex,
509 qDesc->writeIndex, pid, qDesc->doorbellPointer);
510
511 if (qDesc->dmaInProgress) {
512 // we'll try again when this dma transfer completes in updateReadIndex
513 return;
514 }
515 uint32_t num_umq = qDesc->spaceUsed();
516 if (num_umq == 0)
517 return; // nothing to be gotten
518 uint32_t umq_nxt = qDesc->readIndex;
519 // Total AQL buffer size
520 uint32_t ttl_aql_buf = aqlRingBuffer->numObjs();
521 // Available AQL buffer size. If the available buffer is less than
522 // demanded, number of available buffer is returned
523 uint32_t got_aql_buf = aqlRingBuffer->allocEntry(num_umq);
524 qDesc->readIndex += got_aql_buf;
525 uint32_t dma_start_ix = (aqlRingBuffer->wrIdx() - got_aql_buf) %
526 ttl_aql_buf;
527 dma_series_ctx *series_ctx = NULL;
528
529 DPRINTF(HSAPacketProcessor, "%s: umq_nxt = %d, ttl_aql_buf = %d, "
530 "dma_start_ix = %d, num_umq = %d\n", __FUNCTION__, umq_nxt,
531 ttl_aql_buf, dma_start_ix, num_umq);
532
533 if (got_aql_buf == 0) {
534 // we'll try again when some dma bufs are freed in freeEntry
535 qDesc->stalledOnDmaBufAvailability = true;
536 return;
537 } else {
538 qDesc->stalledOnDmaBufAvailability = false;
539 }
540
541 uint32_t dma_b4_wrap = ttl_aql_buf - dma_start_ix;
542 while (got_aql_buf != 0 && num_umq != 0) {
543 uint32_t umq_b4_wrap = qDesc->numObjs() -
544 (umq_nxt % qDesc->objSize());
545 uint32_t num_2_xfer
546 = std::min({umq_b4_wrap, dma_b4_wrap, num_umq, got_aql_buf});
547 if (!series_ctx) {
548 qDesc->dmaInProgress = true;
549 series_ctx = new dma_series_ctx(got_aql_buf, got_aql_buf,
550 dma_start_ix, rl_idx);
551 }
552
553 void *aql_buf = aqlRingBuffer->ptr(dma_start_ix);
554 auto cb = new DmaVirtCallback<uint64_t>(
555 [ = ] (const uint32_t &dma_data)
556 { this->cmdQueueCmdDma(this, pid, true, dma_start_ix,
557 num_2_xfer, series_ctx, aql_buf); }, 0);
558 dmaReadVirt(qDesc->ptr(umq_nxt), num_2_xfer * qDesc->objSize(),
559 cb, aql_buf);
560
561 aqlRingBuffer->saveHostDispAddr(qDesc->ptr(umq_nxt), num_2_xfer,
562 dma_start_ix);
563
565 "%s: aql_buf = %p, umq_nxt = %d, dma_ix = %d, num2xfer = %d\n",
566 __FUNCTION__, aql_buf, umq_nxt, dma_start_ix, num_2_xfer);
567
568 num_umq -= num_2_xfer;
569 got_aql_buf -= num_2_xfer;
570 dma_start_ix = (dma_start_ix + num_2_xfer) % ttl_aql_buf;
571 umq_nxt = (umq_nxt + num_2_xfer) % qDesc->numObjs();
572 if (got_aql_buf == 0 && num_umq != 0) {
573 // There are more packets in the queue but
574 // not enough DMA buffers. Set the stalledOnDmaBufAvailability,
575 // we will try again in freeEntry
576 qDesc->stalledOnDmaBufAvailability = true;
577 }
578 }
579}
580
581void
583{
584 [[maybe_unused]] HSAQueueDescriptor* qDesc =
585 regdQList[rl_idx]->qCntxt.qDesc;
587 "%s: pid[%d], basePointer[0x%lx], dBPointer[0x%lx], "
588 "writeIndex[0x%x], readIndex[0x%x], size(bytes)[0x%x]\n",
589 __FUNCTION__, pid, qDesc->basePointer,
590 qDesc->doorbellPointer, qDesc->writeIndex,
591 qDesc->readIndex, qDesc->numElts);
592}
593
595 const std::string name)
596 : _name(name), _wrIdx(0), _rdIdx(0), _dispIdx(0)
597{
598 _aqlBuf.resize(size);
599 _aqlComplete.resize(size);
600 _hostDispAddresses.resize(size);
601 // Mark all packets as invalid and incomplete
602 for (auto& it : _aqlBuf)
603 it.header = HSA_PACKET_TYPE_INVALID;
604 std::fill(_aqlComplete.begin(), _aqlComplete.end(), false);
605}
606
607void
609{
610 _rdIdx = value;
611}
612
613void
615{
616 _wrIdx = value;
617}
618
619void
621{
622 _dispIdx = value;
623}
624
625bool
627{
628 _aqlComplete[(hsa_kernel_dispatch_packet_t *) pkt - _aqlBuf.data()] = true;
629 DPRINTF(HSAPacketProcessor, "%s: pkt_ix = %d; "\
630 " # free entries = %d, wrIdx = %d, rdIdx = %d\n", __FUNCTION__,
631 (hsa_kernel_dispatch_packet_t *) pkt - _aqlBuf.data(),
632 nFree(), wrIdx(), rdIdx());
633 // Packets can complete out-of-order. This code "retires" packets in-order
634 // by updating the read pointer in the MQD when a contiguous chunk of
635 // packets have finished.
636 uint32_t old_rdIdx = rdIdx();
637 while (_aqlComplete[rdIdx() % numObjs()]) {
638 _aqlComplete[rdIdx() % numObjs()] = false;
640 incRdIdx(1);
641 }
642 return (old_rdIdx != rdIdx());
643}
644
645void
647{
648 this->gpu_device = dev;
649}
650
651int
653{
654 DPRINTF(HSAPacketProcessor, "%s: nReq = %d\n", __FUNCTION__, nBufReq);
655 if (nFree() == 0) {
656 DPRINTF(HSAPacketProcessor, "%s: return = %d\n", __FUNCTION__, 0);
657 return 0;
658 }
659
660 if (nBufReq > nFree())
661 nBufReq = nFree();
662
663 DPRINTF(HSAPacketProcessor, "%s: ix1stFree = %d\n", __FUNCTION__, wrIdx());
664 incWrIdx(nBufReq);
665 DPRINTF(HSAPacketProcessor, "%s: return = %d, wrIdx = %d\n",
666 __FUNCTION__, nBufReq, wrIdx());
667 return nBufReq;
668}
669
670void
671HSAPacketProcessor::finishPkt(void *pvPkt, uint32_t rl_idx)
672{
673 HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
674
675 // if barrier bit was set and this is the last
676 // outstanding packet from that queue,
677 // unset it here
678 if (regdQList[rl_idx]->getBarrierBit() &&
679 regdQList[rl_idx]->isLastOutstandingPkt()) {
681 "Unset barrier bit for active list ID %d\n", rl_idx);
682 regdQList[rl_idx]->setBarrierBit(false);
683 // if pending kernels in the queue after this kernel, reschedule
684 if (regdQList[rl_idx]->dispPending()) {
686 "Rescheduling active list ID %d after unsetting barrier "
687 "bit\n", rl_idx);
688 schedAQLProcessing(rl_idx);
689 }
690 }
691
692 // If set, then blocked schedule, so need to reschedule
693 if (regdQList[rl_idx]->qCntxt.aqlBuf->freeEntry(pvPkt))
694 updateReadIndex(0, rl_idx);
696 "%s: rd-ptr offset [0x%x], wr-ptr offset [0x%x], space used = %d," \
697 " q size = %d, stalled = %s, empty = %s, active list ID = %d\n",
698 __FUNCTION__, qDesc->readIndex, qDesc->writeIndex,
699 qDesc->spaceUsed(), qDesc->numElts,
700 qDesc->stalledOnDmaBufAvailability? "true" : "false",
701 qDesc->isEmpty()? "true" : "false", rl_idx);
702 // DMA buffer is freed, check the queue to see if there are DMA
703 // accesses blocked becasue of non-availability of DMA buffer
704 if (qDesc->stalledOnDmaBufAvailability) {
705 assert(!qDesc->isEmpty());
706 getCommandsFromHost(0, rl_idx); // TODO:assign correct pid
707 // when implementing
708 // multi-process support
709 }
710}
711
712void
714 void *pkt, hsa_signal_value_t signal)
715{
716 auto agent_pkt = (_hsa_agent_dispatch_packet_t *)pkt;
717 uint64_t signal_addr =
718 (uint64_t) (((uint64_t *)agent_pkt->completion_signal) + 1);
719 DPRINTF(HSAPacketProcessor, "Triggering Agent Dispatch packet" \
720 " completion signal: %x!\n", signal_addr);
730 VPtr<uint64_t> prev_signal(signal_addr, sys->threads[0]);
731
732 DPRINTF(HSAPacketProcessor,"HSADriver: Sending signal to %lu\n",
733 (uint64_t)sys->threads[0]->cpuId());
734
735
736 hsa_signal_value_t *new_signal = new hsa_signal_value_t;
737 *new_signal = (hsa_signal_value_t) *prev_signal - 1;
738
739 dmaWriteVirt(signal_addr, sizeof(hsa_signal_value_t), nullptr, new_signal, 0);
740}
741
742void
744{
745 uint64_t signal_addr = (uint64_t) (((uint64_t *)signal) + 1);
746 DPRINTF(HSAPacketProcessor, "Triggering completion signal: %x!\n",
747 signal_addr);
757 VPtr<uint64_t> prev_signal(signal_addr, sys->threads[0]);
758
759 hsa_signal_value_t *new_signal = new hsa_signal_value_t;
760 *new_signal = (hsa_signal_value_t) *prev_signal - 1;
761
762 dmaWriteVirt(signal_addr, sizeof(hsa_signal_value_t), nullptr, new_signal, 0);
763}
764
765} // namespace gem5
#define DPRINTF(x,...)
Definition trace.hh:210
Declaration and inline definition of ChunkGenerator object.
Device model for an AMD GPU.
RequestorID vramRequestorId()
Methods related to translations and system/device memory.
Internal ring buffer which is used to prefetch/store copies of the in-memory HSA ring buffer.
void setRdIdx(uint64_t value)
std::vector< bool > _aqlComplete
void * ptr(uint32_t ix)
int allocEntry(uint32_t nBufReq)
void incDispIdx(uint64_t value)
void setDispIdx(uint64_t value)
void saveHostDispAddr(Addr host_pkt_addr, int num_pkts, int ix)
the kernel may try to read from the dispatch packet, so we need to keep the host address that corresponds to each prefetched AQL packet.
void setWrIdx(uint64_t value)
AQLRingBuffer(uint32_t size, const std::string name)
std::vector< hsa_kernel_dispatch_packet_t > _aqlBuf
void incWrIdx(uint64_t value)
std::vector< Addr > _hostDispAddresses
void incRdIdx(uint64_t value)
Wraps a std::function object in a DmaCallback.
void dmaReadVirt(Addr host_addr, unsigned size, DmaCallback *cb, void *data, Tick delay=0)
Initiate a DMA read from virtual address host_addr.
void dmaWriteVirt(Addr host_addr, unsigned size, DmaCallback *b, void *data, Tick delay=0)
Initiate a DMA write from virtual address host_addr.
void submitDispatchPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitDispatchPkt() is the entry point into the CP from the HSAPP and is only meant to be used with A...
void updateHsaSignal(Addr signal_handle, uint64_t signal_value, HsaSignalCallbackFunction function=[](const uint64_t &) { })
void submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitAgentDispatchPkt() is for accepting agent dispatch packets.
void submitVendorPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitVendorPkt() is for accepting vendor-specific packets from the HSAPP.
uint64_t functionalReadHsaSignal(Addr signal_handle)
std::vector< hsa_signal_value_t > values
void sendAgentDispatchCompletionSignal(void *pkt, hsa_signal_value_t signal)
std::vector< class RQLEntry * > regdQList
void updateReadIndex(int, uint32_t)
virtual Tick write(Packet *) override
void cmdQueueCmdDma(HSAPacketProcessor *hsaPP, int pid, bool isRead, uint32_t ix_start, unsigned num_pkts, dma_series_ctx *series_ctx, void *dest_4debug)
void sendCompletionSignal(hsa_signal_value_t signal)
GPUCommandProcessor * gpu_device
void updateReadDispIdDma()
this event is used to update the read_disp_id field (the read pointer) of the MQD,...
void setGPUDevice(AMDGPUDevice *gpu_device)
HSAPacketProcessorParams Params
void getCommandsFromHost(int pid, uint32_t rl_idx)
TranslationGenPtr translate(Addr vaddr, Addr size) override
Function used to translate a range of addresses from virtual to physical addresses.
void setDeviceQueueDesc(uint64_t hostReadIndexPointer, uint64_t basePointer, uint64_t queue_id, uint32_t size, int doorbellSize, GfxVersion gfxVersion, Addr offset=0, uint64_t rd_idx=0)
void displayQueueDescriptor(int pid, uint32_t rl_idx)
Q_STATE processPkt(void *pkt, uint32_t rl_idx, Addr host_pkt_addr)
void finishPkt(void *pkt, uint32_t rl_idx)
virtual AddrRangeList getAddrRanges() const override
Every PIO device is obliged to provide an implementation that returns the address ranges the device r...
void unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize)
void schedAQLProcessing(uint32_t rl_idx)
void setDevice(GPUCommandProcessor *dev)
virtual Tick read(Packet *) override
void unregisterQueue(uint64_t queue_id, int doorbellSize)
void registerNewQueue(uint64_t hostReadIndexPointer, uint64_t basePointer, uint64_t queue_id, uint32_t size, int doorbellSize, GfxVersion gfxVersion, Addr offset=0, uint64_t rd_idx=0)
void write(Addr db_addr, uint64_t doorbell_reg)
const std::string _name
Definition named.hh:41
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
Definition packet.hh:295
void setBadAddress()
Definition packet.hh:786
Addr getAddr() const
Definition packet.hh:807
unsigned getSize() const
Definition packet.hh:817
void makeAtomicResponse()
Definition packet.hh:1074
T getLE() const
Get the data in the packet byte swapped from little endian to host endian.
Threads threads
Definition system.hh:310
void setDevRequestor(RequestorID mid)
#define PAGE_SIZE
Definition base.cc:60
The GPUCommandProcessor (CP) is responsible for accepting commands, in the form of HSA AQL packets,...
AddrRange RangeSize(Addr start, Addr size)
bool scheduled() const
Determine if the current event is scheduled.
Definition eventq.hh:458
void schedule(Event &event, Tick when)
Definition eventq.hh:1012
#define panic(...)
This implements a cprintf based panic() function.
Definition logging.hh:188
#define fatal(...)
This implements a cprintf based fatal() function.
Definition logging.hh:200
hsa_packet_type_t
Packet type.
Definition hsa.h:2746
@ HSA_PACKET_TYPE_BARRIER_AND
Packet used by agents to delay processing of subsequent packets, and to express complex dependencies ...
Definition hsa.h:2767
@ HSA_PACKET_TYPE_BARRIER_OR
Packet used by agents to delay processing of subsequent packets, and to express complex dependencies ...
Definition hsa.h:2778
@ HSA_PACKET_TYPE_VENDOR_SPECIFIC
Vendor-specific packet.
Definition hsa.h:2750
@ HSA_PACKET_TYPE_INVALID
The packet has been processed in the past, but has not been reassigned to the packet processor.
Definition hsa.h:2756
@ HSA_PACKET_TYPE_KERNEL_DISPATCH
Packet used by agents for dispatching jobs to kernel agents.
Definition hsa.h:2761
@ HSA_PACKET_TYPE_AGENT_DISPATCH
Packet used by agents for dispatching jobs to agents.
Definition hsa.h:2772
int32_t hsa_signal_value_t
Signal value.
Definition hsa.h:1322
#define PKT_TYPE(PKT)
#define HSAPP_EVENT_DESCRIPTION_GENERATOR(XEVENT)
#define IS_BARRIER(PKT)
#define NumSignalsPerBarrier
Bitfield< 7 > i
Definition misc_types.hh:67
Bitfield< 23, 0 > offset
Definition types.hh:144
Bitfield< 0 > p
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
Tick curTick()
The universal simulation clock.
Definition cur_tick.hh:46
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition types.hh:147
bool FullSystem
The FullSystem variable can be used to determine the current mode of simulation.
Definition root.cc:220
uint64_t Tick
Tick count type.
Definition types.hh:58
std::unique_ptr< TranslationGen > TranslationGenPtr
Declarations of a non-full system Page Table.
Calls getCurrentEntry once the queueEntry has been dmaRead.
AQL kernel dispatch packet.
Definition hsa.h:2901
const std::string & name()
Definition trace.cc:48

Generated on Mon Jul 10 2023 15:32:02 for gem5 by doxygen 1.9.7