gem5  v22.1.0.0
hsa_packet_processor.cc
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from this
17  * software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
33 
34 #include <cassert>
35 #include <cstring>
36 
37 #include "base/chunk_generator.hh"
38 #include "base/compiler.hh"
39 #include "base/logging.hh"
40 #include "base/trace.hh"
41 #include "debug/HSAPacketProcessor.hh"
43 #include "dev/dma_device.hh"
44 #include "dev/hsa/hsa_packet.hh"
45 #include "dev/hsa/hw_scheduler.hh"
46 #include "enums/GfxVersion.hh"
48 #include "mem/packet_access.hh"
49 #include "mem/page_table.hh"
50 #include "sim/full_system.hh"
51 #include "sim/process.hh"
52 #include "sim/proxy_ptr.hh"
53 #include "sim/system.hh"
54 
// Stamps out a description() method for an HSAPacketProcessor event class;
// the generated method just returns the event class name as a string.
55 #define HSAPP_EVENT_DESCRIPTION_GENERATOR(XEVENT) \
56  const char* \
57  HSAPacketProcessor::XEVENT::description() const \
58  { \
59  return #XEVENT; \
60  }
61 
// Extracts the packet-type field from an AQL packet header. Any AQL packet
// type may be passed, since the header word is always at offset zero.
62 #define PKT_TYPE(PKT) ((hsa_packet_type_t)(((PKT->header) >> \
63  HSA_PACKET_HEADER_TYPE) & mask(HSA_PACKET_HEADER_WIDTH_TYPE)))
64 
65 // checks if the barrier bit is set in the header -- shift the barrier bit
66 // to LSB, then bitwise "and" to mask off all other bits
67 #define IS_BARRIER(PKT) ((hsa_packet_header_t)(((PKT->header) >> \
68  HSA_PACKET_HEADER_BARRIER) & \
69  mask(HSA_PACKET_HEADER_WIDTH_BARRIER)))
70 
71 namespace gem5
72 {
73 
74 HSAPP_EVENT_DESCRIPTION_GENERATOR(QueueProcessEvent)
75 
// Constructor: configures the PIO window (one PAGE_SIZE page at pioAddr,
// fixed 10-tick pioDelay), creates the hardware scheduler, and allocates
// one registered-queue-list entry (RQLEntry) per hardware queue.
// NOTE(review): the doxygen extraction dropped the constructor signature
// (original line 76) -- confirm against the upstream gem5 source file.
77  : DmaVirtDevice(p), walker(p.walker),
78  numHWQueues(p.numHWQueues), pioAddr(p.pioAddr),
79  pioSize(PAGE_SIZE), pioDelay(10), pktProcessDelay(p.pktProcessDelay)
80 {
81  DPRINTF(HSAPacketProcessor, "%s:\n", __FUNCTION__);
82  hwSchdlr = new HWScheduler(this, p.wakeupDelay);
83  regdQList.resize(numHWQueues);
84  for (int i = 0; i < numHWQueues; i++) {
85  regdQList[i] = new RQLEntry(this, i);
86  }
87 }
88 
// Destructor: frees every RQLEntry allocated in the constructor.
// NOTE(review): destructor signature (original line 89) lost in extraction.
90 {
91  for (auto &queue : regdQList) {
92  delete queue;
93  }
94 }
95 
96 void
// setGPUDevice: records the owning GPU device and checks a page-table
// walker was supplied. NOTE(review): the signature and the assignment
// statement (original lines 97, 99, 102) were lost in extraction --
// confirm against upstream before relying on this body.
98 {
100 
101  assert(walker);
103 }
104 
105 void
106 HSAPacketProcessor::unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize)
107 {
108  hwSchdlr->unregisterQueue(queue_id, doorbellSize);
109 }
110 
111 void
// Registers a new HSA queue with the hardware scheduler, passing through
// the host read-index pointer, ring base, queue id/size, doorbell size,
// GFX version, doorbell offset and initial read index.
112 HSAPacketProcessor::setDeviceQueueDesc(uint64_t hostReadIndexPointer,
113  uint64_t basePointer,
114  uint64_t queue_id,
115  uint32_t size, int doorbellSize,
116  GfxVersion gfxVersion,
117  Addr offset, uint64_t rd_idx)
118 {
// NOTE(review): the DPRINTF opener (original line 119) was lost in
// extraction; the format string "ze = %d" below also looks truncated
// (presumably "size = %d") -- confirm against upstream gem5.
120  "%s:base = %p, qID = %d, ze = %d\n", __FUNCTION__,
121  (void *)basePointer, queue_id, size);
122  hwSchdlr->registerNewQueue(hostReadIndexPointer,
123  basePointer, queue_id, size, doorbellSize,
124  gfxVersion, offset, rd_idx);
125 }
126 
// getAddrRanges: reports this device's single PIO range (pioAddr, pioSize).
// NOTE(review): return type and signature (original lines 127-128) lost in
// extraction.
129 {
130  assert(pioSize != 0);
131 
132  AddrRangeList ranges;
133  ranges.push_back(RangeSize(pioAddr, pioSize));
134 
135  return ranges;
136 }
137 
138 // Basically only processes writes to the queue doorbell register.
139 Tick
// write: decodes a doorbell write and forwards the (offset, value) pair to
// the hardware scheduler. For 8-byte doorbells the written value is
// incremented by one before forwarding; 4-byte values are passed as-is;
// any other size is fatal.
// NOTE(review): signature (original line 140) and two DPRINTF openers
// (original lines 147, 161) lost in extraction.
141 {
142  assert(pkt->getAddr() >= pioAddr && pkt->getAddr() < pioAddr + pioSize);
143 
144  // TODO: How to get pid??
145  [[maybe_unused]] Addr daddr = pkt->getAddr() - pioAddr;
146 
148  "%s: write of size %d to reg-offset %d (0x%x)\n",
149  __FUNCTION__, pkt->getSize(), daddr, daddr);
150 
151  assert(gpu_device->driver()->doorbellSize() == pkt->getSize());
152 
153  uint64_t doorbell_reg(0);
154  if (pkt->getSize() == 8)
155  doorbell_reg = pkt->getLE<uint64_t>() + 1;
156  else if (pkt->getSize() == 4)
157  doorbell_reg = pkt->getLE<uint32_t>();
158  else
159  fatal("invalid db size");
160 
162  "%s: write data 0x%x to offset %d (0x%x)\n",
163  __FUNCTION__, doorbell_reg, daddr, daddr);
164  hwSchdlr->write(daddr, doorbell_reg);
165  pkt->makeAtomicResponse();
166  return pioDelay;
167 }
168 
169 Tick
// read: the doorbell window is write-only; any read is answered with a
// bad-address atomic response after pioDelay ticks.
// NOTE(review): signature (original line 170) lost in extraction.
171 {
172  pkt->makeAtomicResponse();
173  pkt->setBadAddress();
174  return pioDelay;
175 }
176 
// translate: produces a TranslationGen for [vaddr, vaddr+size). In SE mode
// it uses the page table of thread 0's process; in FS mode it uses the
// GPU-side translation path instead of the CPU page tables.
// NOTE(review): signature (original lines 177-178) and the FS-mode call
// line (original line 192, presumably gpu_device->getVM().translate(...))
// were lost in extraction -- confirm against upstream gem5.
179 {
180  if (!FullSystem) {
181  // Grab the process and try to translate the virtual address with it;
182  // with new extensions, it will likely be wrong to just arbitrarily
183  // grab context zero.
184  auto process = sys->threads[0]->getProcessPtr();
185 
186  return process->pTable->translateRange(vaddr, size);
187  }
188 
189  // In full system use the page tables setup by the kernel driver rather
190  // than the CPU page tables.
191  return TranslationGenPtr(
193  1 /* vmid */, vaddr, size));
194 }
195 
201 void
// updateReadDispIdDma: completion callback for the read-index DMA write
// issued in updateReadIndex; only traces, no state change here.
// NOTE(review): signature (original line 202) and the preceding doc
// comment (original lines 196-200) lost in extraction.
203 {
204  DPRINTF(HSAPacketProcessor, "updateReaddispId\n");
205 }
206 
207 void
// Pushes the ring buffer's current read index back to the host-visible
// read-index location via a DMA virtual write, then, if the descriptor's
// write index has advanced past the local ring's write index, pulls more
// commands from the host.
// NOTE(review): two DPRINTF openers (original lines 215, 221) were lost
// in extraction.
208 HSAPacketProcessor::updateReadIndex(int pid, uint32_t rl_idx)
209 {
210  AQLRingBuffer* aqlbuf = regdQList[rl_idx]->qCntxt.aqlBuf;
211  HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
212  auto cb = new DmaVirtCallback<uint64_t>(
213  [ = ] (const uint32_t &dma_data) { this->updateReadDispIdDma(); }, 0);
214 
216  "%s: read-pointer offset [0x%x]\n", __FUNCTION__, aqlbuf->rdIdx());
217 
218  dmaWriteVirt((Addr)qDesc->hostReadIndexPtr, sizeof(aqlbuf->rdIdx()),
219  cb, aqlbuf->rdIdxPtr());
220 
222  "%s: rd-ptr offset [0x%x], wr-ptr offset [0x%x], space used = %d," \
223  " q size = %d, is_empty = %s, active list ID = %d\n", __FUNCTION__,
224  qDesc->readIndex, qDesc->writeIndex, qDesc->spaceUsed(),
225  qDesc->numElts, qDesc->isEmpty()? "true" : "false", rl_idx);
226  if (qDesc->writeIndex != aqlbuf->wrIdx()) {
227  getCommandsFromHost(pid, rl_idx);
228  }
229 }
230 
231 void
// cmdQueueCmdDma: completion callback for one chunk of the command-queue
// read DMA series started in getCommandsFromHost. Decrements the series'
// outstanding-packet count; when the whole series has landed it clears
// dmaInProgress, schedules AQL processing for the queue, and frees the
// series context.
// NOTE(review): the first signature line (original line 232,
// "HSAPacketProcessor::cmdQueueCmdDma(HSAPacketProcessor *hsaPP, int pid,")
// and a DPRINTF opener (original line 250) were lost in extraction.
233  bool isRead, uint32_t ix_start, unsigned num_pkts,
234  dma_series_ctx *series_ctx, void *dest_4debug)
235 {
236  uint32_t rl_idx = series_ctx->rl_idx;
237  [[maybe_unused]] AQLRingBuffer *aqlRingBuffer =
238  hsaPP->regdQList[rl_idx]->qCntxt.aqlBuf;
239  HSAQueueDescriptor* qDesc =
240  hsaPP->regdQList[rl_idx]->qCntxt.qDesc;
241  DPRINTF(HSAPacketProcessor, ">%s, ix = %d, npkts = %d," \
242  " pktsRemaining = %d, active list ID = %d\n", __FUNCTION__,
243  ix_start, num_pkts, series_ctx->pkts_2_go,
244  rl_idx);
245  if (isRead) {
246  series_ctx->pkts_2_go -= num_pkts;
247  if (series_ctx->pkts_2_go == 0) {
248  // Mark DMA as completed
249  qDesc->dmaInProgress = false;
251  "%s: schedule Qwakeup next cycle, rdIdx %d, wrIdx %d," \
252  " dispIdx %d, active list ID = %d\n",
253  __FUNCTION__, aqlRingBuffer->rdIdx(),
254  aqlRingBuffer->wrIdx(), aqlRingBuffer->dispIdx(), rl_idx);
255  // schedule queue wakeup
256  hsaPP->schedAQLProcessing(rl_idx);
257  delete series_ctx;
258  }
259  }
260 }
261 
262 void
// schedAQLProcessing(rl_idx, delay): schedules the queue's AQL process
// event `delay` ticks from now, unless one is already pending.
// NOTE(review): signature (original line 263) lost in extraction.
264 {
265  RQLEntry *queue = regdQList[rl_idx];
266  if (!queue->aqlProcessEvent.scheduled()) {
267  Tick processingTick = curTick() + delay;
268  schedule(queue->aqlProcessEvent, processingTick);
269  DPRINTF(HSAPacketProcessor, "AQL processing scheduled at tick: %d\n",
270  processingTick);
271  } else {
272  DPRINTF(HSAPacketProcessor, "AQL processing already scheduled\n");
273  }
274 }
275 
276 void
// schedAQLProcessing(rl_idx): convenience overload.
// NOTE(review): both the signature (original line 277) and the one-line
// body (original line 279, presumably forwarding with pktProcessDelay)
// were lost in extraction -- confirm against upstream gem5.
278 {
280 }
281 
282 Q_STATE
// Dispatches one AQL packet to the GPU command processor and reports the
// resulting queue state:
//   - UNBLOCKED:    packet was submitted (or barrier-AND completed).
//   - BLOCKED_BPKT: queue blocked behind a barrier packet whose dependency
//                   signals are still outstanding (reads in flight).
//   - BLOCKED_BBIT: queue blocked by the packet's barrier bit while prior
//                   packets are still pending completion.
// Vendor-specific, kernel-dispatch and agent-dispatch packets are handed
// to gpu_device; barrier-AND packets poll their dependency signals via
// DMA reads until all read zero. BARRIER_OR and INVALID types are fatal.
// NOTE(review): the submitAgentDispatchPkt call opener (original line 421)
// was lost in extraction -- confirm against upstream gem5.
283 HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr)
284 {
285  Q_STATE is_submitted = BLOCKED_BPKT;
286  SignalState *dep_sgnl_rd_st = &(regdQList[rl_idx]->depSignalRdState);
287  // Dependency signals are not read yet. And this can only be a retry.
288  // The retry logic will schedule the packet processor wakeup
289  if (dep_sgnl_rd_st->pendingReads != 0) {
290  return BLOCKED_BPKT;
291  }
292  // `pkt` can be typecasted to any type of AQL packet since they all
293  // have header information at offset zero
294  auto disp_pkt = (_hsa_dispatch_packet_t *)pkt;
295  hsa_packet_type_t pkt_type = PKT_TYPE(disp_pkt);
296  if (IS_BARRIER(disp_pkt) &&
297  regdQList[rl_idx]->compltnPending() > 0) {
298  // If this packet is using the "barrier bit" to enforce ordering with
299  // previous packets, and if there are outstanding packets, set the
300  // barrier bit for this queue and block the queue.
301  DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for active" \
302  " list ID = %d\n", __FUNCTION__, rl_idx);
303  regdQList[rl_idx]->setBarrierBit(true);
304  return BLOCKED_BBIT;
305  }
306  if (pkt_type == HSA_PACKET_TYPE_VENDOR_SPECIFIC) {
307  DPRINTF(HSAPacketProcessor, "%s: submitting vendor specific pkt" \
308  " active list ID = %d\n", __FUNCTION__, rl_idx);
309  // Submit packet to HSA device (dispatcher)
310  gpu_device->submitVendorPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
311  is_submitted = UNBLOCKED;
312  } else if (pkt_type == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
313  DPRINTF(HSAPacketProcessor, "%s: submitting kernel dispatch pkt" \
314  " active list ID = %d\n", __FUNCTION__, rl_idx);
315  // Submit packet to HSA device (dispatcher)
316  gpu_device->submitDispatchPkt((void *)disp_pkt, rl_idx, host_pkt_addr);
317  is_submitted = UNBLOCKED;
318  /*
319  If this packet is using the "barrier bit" to enforce ordering with
320  subsequent kernels, set the bit for this queue now, after
321  dispatching.
322  */
323  if (IS_BARRIER(disp_pkt)) {
324  DPRINTF(HSAPacketProcessor, "%s: setting barrier bit for active" \
325  " list ID = %d\n", __FUNCTION__, rl_idx);
326  regdQList[rl_idx]->setBarrierBit(true);
327  }
328  } else if (pkt_type == HSA_PACKET_TYPE_BARRIER_AND) {
329  DPRINTF(HSAPacketProcessor, "%s: Processing barrier packet" \
330  " active list ID = %d\n", __FUNCTION__, rl_idx);
331  auto bar_and_pkt = (_hsa_barrier_and_packet_t *)pkt;
332  bool isReady = true;
333  // Loop thorugh all the completion signals to see if this barrier
334  // packet is ready.
335  for (int i = 0; i < NumSignalsPerBarrier; i++) {
336  // dep_signal = zero imply no signal connected
337  if (bar_and_pkt->dep_signal[i]) {
338  // The signal value is aligned 8 bytes from
339  // the actual handle in the runtime
340  uint64_t signal_addr =
341  (uint64_t) (((uint64_t *) bar_and_pkt->dep_signal[i]) + 1);
342  hsa_signal_value_t *signal_val =
343  &(dep_sgnl_rd_st->values[i]);
344  DPRINTF(HSAPacketProcessor, "%s: Barrier pkt dep sgnl[%d]" \
345  " , sig addr %x, value %d active list ID = %d\n",
346  __FUNCTION__, i, signal_addr,
347  *signal_val, rl_idx);
348  // The if condition will be executed everytime except the
349  // very first time this barrier packet is encounteresd.
350  if (dep_sgnl_rd_st->allRead) {
351  if (*signal_val != 0) {
352  // This signal is not yet ready, read it again
353  isReady = false;
354 
355  auto cb = new DmaVirtCallback<int64_t>(
356  [ = ] (const uint32_t &dma_data)
357  { dep_sgnl_rd_st->handleReadDMA(); }, 0);
358  dmaReadVirt(signal_addr, sizeof(hsa_signal_value_t),
359  cb, signal_val);
360  dep_sgnl_rd_st->pendingReads++;
361  DPRINTF(HSAPacketProcessor, "%s: Pending reads %d," \
362  " active list %d\n", __FUNCTION__,
363  dep_sgnl_rd_st->pendingReads, rl_idx);
364  }
365  } else {
366  // This signal is not yet ready, read it again
367  isReady = false;
368  auto cb = new DmaVirtCallback<int64_t>(
369  [ = ] (const uint32_t &dma_data)
370  { dep_sgnl_rd_st->handleReadDMA(); }, 0);
371  dmaReadVirt(signal_addr, sizeof(hsa_signal_value_t),
372  cb, signal_val);
373  dep_sgnl_rd_st->pendingReads++;
374  DPRINTF(HSAPacketProcessor, "%s: Pending reads %d," \
375  " active list %d\n", __FUNCTION__,
376  dep_sgnl_rd_st->pendingReads, rl_idx);
377  }
378  }
379  }
380  if (isReady) {
381  assert(dep_sgnl_rd_st->pendingReads == 0);
382  DPRINTF(HSAPacketProcessor, "%s: Barrier packet completed" \
383  " active list ID = %d\n", __FUNCTION__, rl_idx);
384  // TODO: Completion signal of barrier packet to be
385  // atomically decremented here
386  finishPkt((void*)bar_and_pkt, rl_idx);
387  is_submitted = UNBLOCKED;
388  // Reset signal values
389  dep_sgnl_rd_st->resetSigVals();
390  // The completion signal is connected
391  if (bar_and_pkt->completion_signal != 0) {
392  // HACK: The semantics of the HSA signal is to
393  // decrement the current signal value
394  // I'm going to cheat here and read out
395  // the value from main memory using functional
396  // access, and then just DMA the decremented value.
397  uint64_t signal_value = gpu_device->functionalReadHsaSignal(\
398  bar_and_pkt->completion_signal);
399 
400  DPRINTF(HSAPacketProcessor, "Triggering barrier packet" \
401  " completion signal! Addr: %x\n",
402  bar_and_pkt->completion_signal);
403 
404  gpu_device->updateHsaSignal(bar_and_pkt->completion_signal,
405  signal_value - 1);
406  }
407  }
408  if (dep_sgnl_rd_st->pendingReads > 0) {
409  // Atleast one DepSignalsReadDmaEvent is scheduled this cycle
410  dep_sgnl_rd_st->allRead = false;
411  dep_sgnl_rd_st->discardRead = false;
412  }
413  } else if (pkt_type == HSA_PACKET_TYPE_BARRIER_OR) {
414  fatal("Unsupported packet type HSA_PACKET_TYPE_BARRIER_OR");
415  } else if (pkt_type == HSA_PACKET_TYPE_INVALID) {
416  fatal("Unsupported packet type HSA_PACKET_TYPE_INVALID");
417  } else if (pkt_type == HSA_PACKET_TYPE_AGENT_DISPATCH) {
418  DPRINTF(HSAPacketProcessor, "%s: submitting agent dispatch pkt" \
419  " active list ID = %d\n", __FUNCTION__, rl_idx);
420  // Submit packet to HSA device (dispatcher)
422  (void *)disp_pkt, rl_idx, host_pkt_addr);
423  is_submitted = UNBLOCKED;
424  sendAgentDispatchCompletionSignal((void *)disp_pkt,0);
425  } else {
426  fatal("Unsupported packet type %d\n", pkt_type);
427  }
428  return is_submitted;
429 }
430 
431 // Wakes up every fixed time interval (pktProcessDelay) and processes a single
432 // packet from the queue that scheduled this wakeup. If there are more
433 // packets in that queue, the next wakeup is scheduled.
434 void
// QueueProcessEvent::process: per-queue wakeup handler. Skips work when the
// queue's barrier bit is set (dummy wakeup), otherwise attempts to dispatch
// the packet at dispIdx and reschedules itself based on the queue state
// returned by processPkt.
// NOTE(review): the method signature (original line 435), a DPRINTF opener
// (438, 446) and the reschedule calls (465, 471, 478, presumably
// hsaPP->schedAQLProcessing(rqIdx...)) were lost in extraction -- confirm
// against upstream gem5 before reasoning about the empty branches below.
436 {
437  AQLRingBuffer *aqlRingBuffer = hsaPP->regdQList[rqIdx]->qCntxt.aqlBuf;
439  "%s: Qwakeup , rdIdx %d, wrIdx %d," \
440  " dispIdx %d, active list ID = %d\n",
441  __FUNCTION__, aqlRingBuffer->rdIdx(),
442  aqlRingBuffer->wrIdx(), aqlRingBuffer->dispIdx(), rqIdx);
443  // If barrier bit is set, then this wakeup is a dummy wakeup
444  // just to model the processing time. Do nothing.
445  if (hsaPP->regdQList[rqIdx]->getBarrierBit()) {
447  "Dummy wakeup with barrier bit for rdIdx %d\n", rqIdx);
448  return;
449  }
450  // In the future, we may support batch processing of packets.
451  // Then, we can just remove the break statements and the code
452  // will support batch processing. That is why we are using a
453  // "while loop" here instead on an "if" condition.
454  while (hsaPP->regdQList[rqIdx]->dispPending()) {
455  void *pkt = aqlRingBuffer->ptr(aqlRingBuffer->dispIdx());
456  DPRINTF(HSAPacketProcessor, "%s: Attempting dispatch @ dispIdx[%d]\n",
457  __FUNCTION__, aqlRingBuffer->dispIdx());
458  Addr host_addr = aqlRingBuffer->hostDispAddr();
459  Q_STATE q_state = hsaPP->processPkt(pkt, rqIdx, host_addr);
460  if (q_state == UNBLOCKED) {
461  aqlRingBuffer->incDispIdx(1);
462  DPRINTF(HSAPacketProcessor, "%s: Increment dispIdx[%d]\n",
463  __FUNCTION__, aqlRingBuffer->dispIdx());
464  if (hsaPP->regdQList[rqIdx]->dispPending()) {
466  }
467  break;
468  } else if (q_state == BLOCKED_BPKT) {
469  // This queue is blocked by barrier packet,
470  // schedule a processing event
472  break;
473  } else if (q_state == BLOCKED_BBIT) {
474  // This queue is blocked by barrier bit, and processing event
475  // should be scheduled from finishPkt(). However, to elapse
476  // "pktProcessDelay" processing time, let us schedule a dummy
477  // wakeup once which will just wakeup and will do nothing.
479  break;
480  } else {
481  panic("Unknown queue state\n");
482  }
483  }
484 }
485 
486 void
// SignalState::handleReadDMA: completion callback for one dependency-signal
// DMA read. When the last outstanding read lands, marks all signals as read
// and, if a discard was requested meanwhile, resets the cached values.
// NOTE(review): signature (original line 487) lost in extraction.
488 {
489  assert(pendingReads > 0);
490  pendingReads--;
491  if (pendingReads == 0) {
492  allRead = true;
493  if (discardRead) {
494  resetSigVals();
495  }
496  }
497 }
498 
499 void
// Pulls pending AQL packets from the host-side (user-mode) queue into the
// local ring buffer via a series of chunked DMA virtual reads. Chunk size
// is limited by the host queue wrap point, the local ring wrap point, the
// packets available and the ring entries granted by allocEntry. Sets
// stalledOnDmaBufAvailability when the ring cannot hold everything; the
// retry happens from freeEntry/updateReadIndex.
// NOTE(review): the signature (original line 500) and two DPRINTF openers
// (original lines 505, 564) were lost in extraction.
501 {
502  HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
503  AQLRingBuffer *aqlRingBuffer = regdQList[rl_idx]->qCntxt.aqlBuf;
504 
506  "%s: read-pointer offset[0x%x], write-pointer offset[0x%x]"
507  " doorbell(%d)[0x%x] \n",
508  __FUNCTION__, qDesc->readIndex,
509  qDesc->writeIndex, pid, qDesc->doorbellPointer);
510 
511  if (qDesc->dmaInProgress) {
512  // we'll try again when this dma transfer completes in updateReadIndex
513  return;
514  }
515  uint32_t num_umq = qDesc->spaceUsed();
516  if (num_umq == 0)
517  return; // nothing to be gotten
518  uint32_t umq_nxt = qDesc->readIndex;
519  // Total AQL buffer size
520  uint32_t ttl_aql_buf = aqlRingBuffer->numObjs();
521  // Available AQL buffer size. If the available buffer is less than
522  // demanded, number of available buffer is returned
523  uint32_t got_aql_buf = aqlRingBuffer->allocEntry(num_umq);
524  qDesc->readIndex += got_aql_buf;
525  uint32_t dma_start_ix = (aqlRingBuffer->wrIdx() - got_aql_buf) %
526  ttl_aql_buf;
527  dma_series_ctx *series_ctx = NULL;
528 
529  DPRINTF(HSAPacketProcessor, "%s: umq_nxt = %d, ttl_aql_buf = %d, "
530  "dma_start_ix = %d, num_umq = %d\n", __FUNCTION__, umq_nxt,
531  ttl_aql_buf, dma_start_ix, num_umq);
532 
533  if (got_aql_buf == 0) {
534  // we'll try again when some dma bufs are freed in freeEntry
535  qDesc->stalledOnDmaBufAvailability = true;
536  return;
537  } else {
538  qDesc->stalledOnDmaBufAvailability = false;
539  }
540 
541  uint32_t dma_b4_wrap = ttl_aql_buf - dma_start_ix;
542  while (got_aql_buf != 0 && num_umq != 0) {
543  uint32_t umq_b4_wrap = qDesc->numObjs() -
544  (umq_nxt % qDesc->objSize());
545  uint32_t num_2_xfer
546  = std::min({umq_b4_wrap, dma_b4_wrap, num_umq, got_aql_buf});
547  if (!series_ctx) {
548  qDesc->dmaInProgress = true;
549  series_ctx = new dma_series_ctx(got_aql_buf, got_aql_buf,
550  dma_start_ix, rl_idx);
551  }
552 
553  void *aql_buf = aqlRingBuffer->ptr(dma_start_ix);
554  auto cb = new DmaVirtCallback<uint64_t>(
555  [ = ] (const uint32_t &dma_data)
556  { this->cmdQueueCmdDma(this, pid, true, dma_start_ix,
557  num_2_xfer, series_ctx, aql_buf); }, 0);
558  dmaReadVirt(qDesc->ptr(umq_nxt), num_2_xfer * qDesc->objSize(),
559  cb, aql_buf);
560 
561  aqlRingBuffer->saveHostDispAddr(qDesc->ptr(umq_nxt), num_2_xfer,
562  dma_start_ix);
563 
565  "%s: aql_buf = %p, umq_nxt = %d, dma_ix = %d, num2xfer = %d\n",
566  __FUNCTION__, aql_buf, umq_nxt, dma_start_ix, num_2_xfer);
567 
568  num_umq -= num_2_xfer;
569  got_aql_buf -= num_2_xfer;
570  dma_start_ix = (dma_start_ix + num_2_xfer) % ttl_aql_buf;
571  umq_nxt = (umq_nxt + num_2_xfer) % qDesc->numObjs();
572  if (got_aql_buf == 0 && num_umq != 0) {
573  // There are more packets in the queue but
574  // not enough DMA buffers. Set the stalledOnDmaBufAvailability,
575  // we will try again in freeEntry
576  qDesc->stalledOnDmaBufAvailability = true;
577  }
578  }
579 }
580 
581 void
// displayQueueDescriptor: debug-trace dump of a queue descriptor's base,
// doorbell, write/read indices and size. No state changes.
// NOTE(review): signature (original line 582) and the DPRINTF opener
// (original line 586) lost in extraction.
583 {
584  [[maybe_unused]] HSAQueueDescriptor* qDesc =
585  regdQList[rl_idx]->qCntxt.qDesc;
587  "%s: pid[%d], basePointer[0x%lx], dBPointer[0x%lx], "
588  "writeIndex[0x%x], readIndex[0x%x], size(bytes)[0x%x]\n",
589  __FUNCTION__, pid, qDesc->basePointer,
590  qDesc->doorbellPointer, qDesc->writeIndex,
591  qDesc->readIndex, qDesc->numElts);
592 }
593 
// AQLRingBuffer constructor: sizes the packet buffer, completion flags and
// host dispatch-address arrays, marks every slot INVALID/incomplete.
// NOTE(review): the first signature line (original line 594,
// "AQLRingBuffer::AQLRingBuffer(uint32_t size,") was lost in extraction.
595  const std::string name)
596  : _name(name), _wrIdx(0), _rdIdx(0), _dispIdx(0)
597 {
598  _aqlBuf.resize(size);
599  _aqlComplete.resize(size);
600  _hostDispAddresses.resize(size);
601  // Mark all packets as invalid and incomplete
602  for (auto& it : _aqlBuf)
603  it.header = HSA_PACKET_TYPE_INVALID;
604  std::fill(_aqlComplete.begin(), _aqlComplete.end(), false);
605 }
606 
607 void
608 AQLRingBuffer::setRdIdx(uint64_t value)
609 {
610  _rdIdx = value;
611 }
612 
613 void
614 AQLRingBuffer::setWrIdx(uint64_t value)
615 {
616  _wrIdx = value;
617 }
618 
619 void
// setDispIdx: overwrite the dispatch index.
// NOTE(review): signature (original line 620) lost in extraction.
621 {
622  _dispIdx = value;
623 }
624 
625 bool
// freeEntry: marks the slot holding `pkt` complete, then retires any
// contiguous run of completed slots by advancing the read index. Returns
// true iff the read index moved (i.e. the MQD read pointer needs updating).
// NOTE(review): signature (original line 626) and one loop line (original
// line 639) were lost in extraction.
627 {
628  _aqlComplete[(hsa_kernel_dispatch_packet_t *) pkt - _aqlBuf.data()] = true;
629  DPRINTF(HSAPacketProcessor, "%s: pkt_ix = %d; "\
630  " # free entries = %d, wrIdx = %d, rdIdx = %d\n", __FUNCTION__,
631  (hsa_kernel_dispatch_packet_t *) pkt - _aqlBuf.data(),
632  nFree(), wrIdx(), rdIdx());
633  // Packets can complete out-of-order. This code "retires" packets in-order
634  // by updating the read pointer in the MQD when a contiguous chunk of
635  // packets have finished.
636  uint32_t old_rdIdx = rdIdx();
637  while (_aqlComplete[rdIdx() % numObjs()]) {
638  _aqlComplete[rdIdx() % numObjs()] = false;
640  incRdIdx(1);
641  }
642  return (old_rdIdx != rdIdx());
643 }
644 
645 void
// setDevice: records the GPU command processor this ring buffer serves.
// NOTE(review): signature (original line 646) lost in extraction.
647 {
648  this->gpu_device = dev;
649 }
650 
651 int
652 AQLRingBuffer::allocEntry(uint32_t nBufReq)
653 {
654  DPRINTF(HSAPacketProcessor, "%s: nReq = %d\n", __FUNCTION__, nBufReq);
655  if (nFree() == 0) {
656  DPRINTF(HSAPacketProcessor, "%s: return = %d\n", __FUNCTION__, 0);
657  return 0;
658  }
659 
660  if (nBufReq > nFree())
661  nBufReq = nFree();
662 
663  DPRINTF(HSAPacketProcessor, "%s: ix1stFree = %d\n", __FUNCTION__, wrIdx());
664  incWrIdx(nBufReq);
665  DPRINTF(HSAPacketProcessor, "%s: return = %d, wrIdx = %d\n",
666  __FUNCTION__, nBufReq, wrIdx());
667  return nBufReq;
668 }
669 
670 void
// Retires a completed AQL packet: clears the queue's barrier bit when this
// is the last outstanding packet (rescheduling the queue if more packets
// are pending), frees the packet's ring slot, pushes the updated read index
// to the host when the ring's read pointer advanced, and restarts host
// command fetch if the queue was stalled waiting for DMA buffers.
// NOTE(review): three DPRINTF openers (original lines 680, 685, 695) were
// lost in extraction.
671 HSAPacketProcessor::finishPkt(void *pvPkt, uint32_t rl_idx)
672 {
673  HSAQueueDescriptor* qDesc = regdQList[rl_idx]->qCntxt.qDesc;
674 
675  // if barrier bit was set and this is the last
676  // outstanding packet from that queue,
677  // unset it here
678  if (regdQList[rl_idx]->getBarrierBit() &&
679  regdQList[rl_idx]->isLastOutstandingPkt()) {
681  "Unset barrier bit for active list ID %d\n", rl_idx);
682  regdQList[rl_idx]->setBarrierBit(false);
683  // if pending kernels in the queue after this kernel, reschedule
684  if (regdQList[rl_idx]->dispPending()) {
686  "Rescheduling active list ID %d after unsetting barrier "
687  "bit\n", rl_idx);
688  schedAQLProcessing(rl_idx);
689  }
690  }
691 
692  // If set, then blocked schedule, so need to reschedule
693  if (regdQList[rl_idx]->qCntxt.aqlBuf->freeEntry(pvPkt))
694  updateReadIndex(0, rl_idx);
696  "%s: rd-ptr offset [0x%x], wr-ptr offset [0x%x], space used = %d," \
697  " q size = %d, stalled = %s, empty = %s, active list ID = %d\n",
698  __FUNCTION__, qDesc->readIndex, qDesc->writeIndex,
699  qDesc->spaceUsed(), qDesc->numElts,
700  qDesc->stalledOnDmaBufAvailability? "true" : "false",
701  qDesc->isEmpty()? "true" : "false", rl_idx);
702  // DMA buffer is freed, check the queue to see if there are DMA
703  // accesses blocked becasue of non-availability of DMA buffer
704  if (qDesc->stalledOnDmaBufAvailability) {
705  assert(!qDesc->isEmpty());
706  getCommandsFromHost(0, rl_idx); // TODO:assign correct pid
707  // when implementing
708  // multi-process support
709  }
710 }
711 
712 void
// sendAgentDispatchCompletionSignal: reads the current value stored 8 bytes
// past the agent-dispatch packet's completion-signal handle (via a VPtr on
// thread 0), decrements it, and DMA-writes the new value back.
// NOTE(review): the signature's first line (original line 713) and original
// lines 721-729 (most likely a FullSystem/SE-mode conditional region) were
// lost in extraction -- confirm against upstream gem5.
714  void *pkt, hsa_signal_value_t signal)
715 {
716  auto agent_pkt = (_hsa_agent_dispatch_packet_t *)pkt;
717  uint64_t signal_addr =
718  (uint64_t) (((uint64_t *)agent_pkt->completion_signal) + 1);
719  DPRINTF(HSAPacketProcessor, "Triggering Agent Dispatch packet" \
720  " completion signal: %x!\n", signal_addr);
730  VPtr<uint64_t> prev_signal(signal_addr, sys->threads[0]);
731 
732  DPRINTF(HSAPacketProcessor,"HSADriver: Sending signal to %lu\n",
733  (uint64_t)sys->threads[0]->cpuId());
734 
735 
// NOTE(review): the decremented value is heap-allocated and handed to
// dmaWriteVirt with a null callback; ownership/free of new_signal is not
// visible in this chunk.
736  hsa_signal_value_t *new_signal = new hsa_signal_value_t;
737  *new_signal = (hsa_signal_value_t) *prev_signal - 1;
738 
739  dmaWriteVirt(signal_addr, sizeof(hsa_signal_value_t), nullptr, new_signal, 0);
740 }
741 
742 void
// sendCompletionSignal: same decrement-and-DMA-write pattern as above, but
// for a bare signal handle rather than an agent-dispatch packet.
// NOTE(review): signature (original line 743) and original lines 748-756
// (likely a FullSystem/SE-mode conditional region) lost in extraction.
744 {
745  uint64_t signal_addr = (uint64_t) (((uint64_t *)signal) + 1);
746  DPRINTF(HSAPacketProcessor, "Triggering completion signal: %x!\n",
747  signal_addr);
757  VPtr<uint64_t> prev_signal(signal_addr, sys->threads[0]);
758 
759  hsa_signal_value_t *new_signal = new hsa_signal_value_t;
760  *new_signal = (hsa_signal_value_t) *prev_signal - 1;
761 
762  dmaWriteVirt(signal_addr, sizeof(hsa_signal_value_t), nullptr, new_signal, 0);
763 }
764 
765 } // namespace gem5
#define DPRINTF(x,...)
Definition: trace.hh:186
Declaration and inline definition of ChunkGenerator object.
Device model for an AMD GPU.
AMDGPUVM & getVM()
RequestorID vramRequestorId()
Methods related to translations and system/device memory.
Internal ring buffer which is used to prefetch/store copies of the in-memory HSA ring buffer.
void setRdIdx(uint64_t value)
std::vector< bool > _aqlComplete
int allocEntry(uint32_t nBufReq)
void incDispIdx(uint64_t value)
void setDispIdx(uint64_t value)
void saveHostDispAddr(Addr host_pkt_addr, int num_pkts, int ix)
the kernel may try to read from the dispatch packet, so we need to keep the host address that corresp...
void setWrIdx(uint64_t value)
void * ptr(uint32_t ix)
AQLRingBuffer(uint32_t size, const std::string name)
std::vector< hsa_kernel_dispatch_packet_t > _aqlBuf
void incWrIdx(uint64_t value)
std::vector< Addr > _hostDispAddresses
void incRdIdx(uint64_t value)
DmaDeviceParams Params
Definition: dma_device.hh:209
Wraps a std::function object in a DmaCallback.
void dmaReadVirt(Addr host_addr, unsigned size, DmaCallback *cb, void *data, Tick delay=0)
Initiate a DMA read from virtual address host_addr.
void dmaWriteVirt(Addr host_addr, unsigned size, DmaCallback *b, void *data, Tick delay=0)
Initiate a DMA write from virtual address host_addr.
void submitDispatchPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitDispatchPkt() is the entry point into the CP from the HSAPP and is only meant to be used with A...
void updateHsaSignal(Addr signal_handle, uint64_t signal_value, HsaSignalCallbackFunction function=[](const uint64_t &) { })
void submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitAgentDispatchPkt() is for accepting agent dispatch packets.
void submitVendorPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitVendorPkt() is for accepting vendor-specific packets from the HSAPP.
uint64_t functionalReadHsaSignal(Addr signal_handle)
std::vector< hsa_signal_value_t > values
void sendAgentDispatchCompletionSignal(void *pkt, hsa_signal_value_t signal)
std::vector< class RQLEntry * > regdQList
void updateReadIndex(int, uint32_t)
virtual Tick write(Packet *) override
void cmdQueueCmdDma(HSAPacketProcessor *hsaPP, int pid, bool isRead, uint32_t ix_start, unsigned num_pkts, dma_series_ctx *series_ctx, void *dest_4debug)
void sendCompletionSignal(hsa_signal_value_t signal)
GPUCommandProcessor * gpu_device
void updateReadDispIdDma()
this event is used to update the read_disp_id field (the read pointer) of the MQD,...
void setGPUDevice(AMDGPUDevice *gpu_device)
void getCommandsFromHost(int pid, uint32_t rl_idx)
TranslationGenPtr translate(Addr vaddr, Addr size) override
Function used to translate a range of addresses from virtual to physical addresses.
void setDeviceQueueDesc(uint64_t hostReadIndexPointer, uint64_t basePointer, uint64_t queue_id, uint32_t size, int doorbellSize, GfxVersion gfxVersion, Addr offset=0, uint64_t rd_idx=0)
void displayQueueDescriptor(int pid, uint32_t rl_idx)
Q_STATE processPkt(void *pkt, uint32_t rl_idx, Addr host_pkt_addr)
void finishPkt(void *pkt, uint32_t rl_idx)
virtual AddrRangeList getAddrRanges() const override
Every PIO device is obliged to provide an implementation that returns the address ranges the device r...
void unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize)
void schedAQLProcessing(uint32_t rl_idx)
void setDevice(GPUCommandProcessor *dev)
virtual Tick read(Packet *) override
void unregisterQueue(uint64_t queue_id, int doorbellSize)
void registerNewQueue(uint64_t hostReadIndexPointer, uint64_t basePointer, uint64_t queue_id, uint32_t size, int doorbellSize, GfxVersion gfxVersion, Addr offset=0, uint64_t rd_idx=0)
Definition: hw_scheduler.cc:85
void write(Addr db_addr, uint64_t doorbell_reg)
const std::string _name
Definition: named.hh:41
virtual std::string name() const
Definition: named.hh:47
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
Definition: packet.hh:294
void setBadAddress()
Definition: packet.hh:784
Addr getAddr() const
Definition: packet.hh:805
unsigned getSize() const
Definition: packet.hh:815
void makeAtomicResponse()
Definition: packet.hh:1071
T getLE() const
Get the data in the packet byte swapped from little endian to host endian.
System * sys
Definition: io_device.hh:105
Threads threads
Definition: system.hh:313
void setDevRequestor(RequestorID mid)
#define PAGE_SIZE
Definition: base.cc:60
The GPUCommandProcessor (CP) is responsible for accepting commands, in the form of HSA AQL packets,...
AddrRange RangeSize(Addr start, Addr size)
Definition: addr_range.hh:815
bool scheduled() const
Determine if the current event is scheduled.
Definition: eventq.hh:465
void schedule(Event &event, Tick when)
Definition: eventq.hh:1019
#define panic(...)
This implements a cprintf based panic() function.
Definition: logging.hh:178
#define fatal(...)
This implements a cprintf based fatal() function.
Definition: logging.hh:190
hsa_packet_type_t
Packet type.
Definition: hsa.h:2746
@ HSA_PACKET_TYPE_BARRIER_AND
Packet used by agents to delay processing of subsequent packets, and to express complex dependencies ...
Definition: hsa.h:2767
@ HSA_PACKET_TYPE_BARRIER_OR
Packet used by agents to delay processing of subsequent packets, and to express complex dependencies ...
Definition: hsa.h:2778
@ HSA_PACKET_TYPE_VENDOR_SPECIFIC
Vendor-specific packet.
Definition: hsa.h:2750
@ HSA_PACKET_TYPE_INVALID
The packet has been processed in the past, but has not been reassigned to the packet processor.
Definition: hsa.h:2756
@ HSA_PACKET_TYPE_KERNEL_DISPATCH
Packet used by agents for dispatching jobs to kernel agents.
Definition: hsa.h:2761
@ HSA_PACKET_TYPE_AGENT_DISPATCH
Packet used by agents for dispatching jobs to agents.
Definition: hsa.h:2772
int32_t hsa_signal_value_t
Signal value.
Definition: hsa.h:1322
#define PKT_TYPE(PKT)
#define HSAPP_EVENT_DESCRIPTION_GENERATOR(XEVENT)
#define IS_BARRIER(PKT)
#define NumSignalsPerBarrier
Bitfield< 7 > i
Definition: misc_types.hh:67
Bitfield< 23, 0 > offset
Definition: types.hh:144
Bitfield< 54 > p
Definition: pagetable.hh:70
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
Tick curTick()
The universal simulation clock.
Definition: cur_tick.hh:46
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition: types.hh:147
bool FullSystem
The FullSystem variable can be used to determine the current mode of simulation.
Definition: root.cc:220
uint64_t Tick
Tick count type.
Definition: types.hh:58
std::unique_ptr< TranslationGen > TranslationGenPtr
Declarations of a non-full system Page Table.
Calls getCurrentEntry once the queueEntry has been dmaRead.
AQL kernel dispatch packet.
Definition: hsa.h:2901

Generated on Wed Dec 21 2022 10:22:33 for gem5 by doxygen 1.9.1