gem5  v22.1.0.0
hsa_packet_processor.hh
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from this
17  * software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #ifndef __DEV_HSA_HSA_PACKET_PROCESSOR__
33 #define __DEV_HSA_HSA_PACKET_PROCESSOR__
34 
35 #include <algorithm>
36 #include <cstdint>
37 #include <vector>
38 
39 #include "base/types.hh"
40 #include "debug/HSAPacketProcessor.hh"
41 #include "dev/dma_virt_device.hh"
42 #include "dev/hsa/hsa.h"
43 #include "dev/hsa/hsa_queue.hh"
44 #include "enums/GfxVersion.hh"
45 #include "params/HSAPacketProcessor.hh"
46 #include "sim/eventq.hh"
47 
48 #define AQL_PACKET_SIZE 64
49 #define PAGE_SIZE 4096
50 #define NUM_DMA_BUFS 16
51 #define DMA_BUF_SIZE (AQL_PACKET_SIZE * NUM_DMA_BUFS)
52 // HSA runtime supports only 5 signals per barrier packet
53 #define NumSignalsPerBarrier 5
54 
55 namespace gem5
56 {
57 
58 class AMDGPUDevice;
59 
60 // Ideally, each queue should store this status and
61 // the processPkt() should make decisions based on that
62 // status variable.
enum Q_STATE
{
    // Queue is not blocked; packets can be submitted.
    UNBLOCKED = 0,

    // Queue is blocked by a barrier bit. Packets can be submitted
    // again once the previous packet completes.
    BLOCKED_BBIT = 1,

    // Queue is blocked by a barrier packet. Packets can be submitted
    // again once the barrier packet completes.
    BLOCKED_BPKT = 2,
};
73 
74 class GPUCommandProcessor;
75 class HWScheduler;
76 
77 // Our internal representation of an HSA queue
79 {
80  public:
81  uint64_t basePointer;
82  uint64_t doorbellPointer;
83  uint64_t writeIndex;
84  uint64_t readIndex;
85  uint32_t numElts;
86  uint64_t hostReadIndexPtr;
89  GfxVersion gfxVersion;
90 
91  HSAQueueDescriptor(uint64_t base_ptr, uint64_t db_ptr,
92  uint64_t hri_ptr, uint32_t size,
93  GfxVersion gfxVersion)
94  : basePointer(base_ptr), doorbellPointer(db_ptr),
95  writeIndex(0), readIndex(0),
96  numElts(size / AQL_PACKET_SIZE), hostReadIndexPtr(hri_ptr),
99  { }
100  uint64_t spaceRemaining() { return numElts - (writeIndex - readIndex); }
101  uint64_t spaceUsed() { return writeIndex - readIndex; }
102  uint32_t objSize() { return AQL_PACKET_SIZE; }
103  uint32_t numObjs() { return numElts; }
104  bool isFull() { return spaceRemaining() == 0; }
105  bool isEmpty() { return spaceRemaining() == numElts; }
106 
107  uint64_t ptr(uint64_t ix)
108  {
109  /*
110  * Based on ROCm Documentation:
111  * - https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/
112  10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/
113  rocr/src/core/runtime/amd_aql_queue.cpp#L99
114  * - https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/
115  10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/
116  rocr/src/core/runtime/amd_aql_queue.cpp#L624
117  *
118  * GFX7 and GFX8 will allocate twice as much space for their HSA
119  * queues as they actually access (using mod operations to map the
120  * virtual addresses from the upper half of the queue to the same
121  * virtual addresses as the lower half). Thus, we need to check if
122  * the ISA is GFX8 and mod the address by half of the queue size if
123  * so.
124  */
125  uint64_t retAddr = 0ll;
126  if ((gfxVersion == GfxVersion::gfx801) ||
127  (gfxVersion == GfxVersion::gfx803)) {
128  retAddr = basePointer + ((ix % (numElts/2)) * objSize());
129  DPRINTF(HSAPacketProcessor, "ptr() gfx8: base: 0x%x, "
130  "index: 0x%x, numElts: 0x%x, numElts/2: 0x%x, "
131  "objSize: 0x%x, retAddr: 0x%x\n", basePointer, ix,
132  numElts, numElts/2, objSize(), retAddr);
133  } else {
134  retAddr = basePointer + ((ix % numElts) * objSize());
135  DPRINTF(HSAPacketProcessor, "ptr() gfx9: base: 0x%x, "
136  "index: 0x%x, numElts: 0x%x, objSize: 0x%x, "
137  "retAddr: 0x%x\n", basePointer, ix, numElts, objSize(),
138  retAddr);
139  }
140  return retAddr;
141  }
142 };
143 
157 {
158  private:
160  std::string _name;
163  uint64_t _wrIdx; // Points to next write location
164  uint64_t _rdIdx; // Read pointer of AQL buffer
165  uint64_t _dispIdx; // Dispatch pointer of AQL buffer
166 
167  public:
168  std::string name() {return _name;}
169  AQLRingBuffer(uint32_t size, const std::string name);
170  int allocEntry(uint32_t nBufReq);
171  bool freeEntry(void *pkt);
172 
182  void
183  saveHostDispAddr(Addr host_pkt_addr, int num_pkts, int ix)
184  {
185  for (int i = 0; i < num_pkts; ++i) {
186  _hostDispAddresses[ix % numObjs()] = host_pkt_addr + i * objSize();
187  ++ix;
188  }
189  }
190 
191  Addr
192  hostDispAddr() const
193  {
194  return _hostDispAddresses[dispIdx() % numObjs()];
195  }
196 
197  bool
198  dispPending() const
199  {
200  int packet_type = (_aqlBuf[_dispIdx % _aqlBuf.size()].header
202  ((1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1);
203  return (_dispIdx < _wrIdx) && packet_type != HSA_PACKET_TYPE_INVALID;
204  }
205 
214  bool
216  {
217  for (int i = _rdIdx + 1; i < _dispIdx; i++) {
218  if (!_aqlComplete[i % _aqlBuf.size()]) {
219  return false;
220  }
221  }
222  return !_aqlComplete[_rdIdx % _aqlBuf.size()] && _rdIdx != _dispIdx;
223  }
224 
225  uint32_t nFree() const { return _aqlBuf.size() - (_wrIdx - _rdIdx); }
226  void *ptr(uint32_t ix) { return _aqlBuf.data() + (ix % _aqlBuf.size()); }
227  uint32_t numObjs() const { return _aqlBuf.size(); };
228  uint32_t objSize() const { return AQL_PACKET_SIZE; }
229  uint64_t dispIdx() const { return _dispIdx; }
230  uint64_t wrIdx() const { return _wrIdx; }
231  uint64_t rdIdx() const { return _rdIdx; }
232  uint64_t* rdIdxPtr() { return &_rdIdx; }
233  void incRdIdx(uint64_t value) { _rdIdx += value; }
234  void incWrIdx(uint64_t value) { _wrIdx += value; }
235  void incDispIdx(uint64_t value) { _dispIdx += value; }
236  uint64_t compltnPending() { return (_dispIdx - _rdIdx); }
237  void setRdIdx(uint64_t value);
238  void setWrIdx(uint64_t value);
239  void setDispIdx(uint64_t value);
240 };
241 
242 struct QCntxt
243 {
246  // used for HSA packets that enforce synchronization with barrier bit
249  qDesc(q_desc), aqlBuf(aql_buf), barrierBit(false)
250  {}
251  QCntxt() : qDesc(NULL), aqlBuf(NULL), barrierBit(false) {}
252 };
253 
255 {
256  friend class HWScheduler;
257  protected:
258  typedef void (DmaDevice::*DmaFnPtr)(Addr, int, Event*, uint8_t*, Tick);
263 
264  // Structure to store the read values of dependency signals
265  // from shared memory. Also used for tracking the status of
266  // those reads while they are in progress
268  {
269  public:
271  : pendingReads(0), allRead(false), discardRead(false)
272  {
274  }
275  void handleReadDMA();
277  bool allRead;
278  // If this queue is unmapped while there are pending reads, then
279  // the pending reads have to be discarded.
281  // values stores the value of already read dependency signal
283  void
285  {
286  std::fill(values.begin(), values.end(), 1);
287  }
288  };
289 
290  class QueueProcessEvent : public Event
291  {
292  private:
294  uint32_t rqIdx;
295  public:
296  QueueProcessEvent(HSAPacketProcessor *_hsaPP, uint32_t _rqIdx)
297  : Event(Default_Pri), hsaPP(_hsaPP), rqIdx(_rqIdx)
298  {}
299  virtual void process();
300  virtual const char *description() const;
301  };
302 
303  // Registered queue list entry; each entry has one queueDescriptor and
304  // associated AQL buffer
305  class RQLEntry
306  {
307  public:
308  RQLEntry(HSAPacketProcessor *hsaPP, uint32_t rqIdx)
309  : aqlProcessEvent(hsaPP, rqIdx) {}
311  bool dispPending() { return qCntxt.aqlBuf->dispPending() > 0; }
312  uint64_t compltnPending() { return qCntxt.aqlBuf->compltnPending(); }
315  void setBarrierBit(bool set_val) { qCntxt.barrierBit = set_val; }
316  bool getBarrierBit() const { return qCntxt.barrierBit; }
317  bool isLastOutstandingPkt() const
318  {
320  }
321  };
322  // Keeps track of queueDescriptors of registered queues
324 
325  Q_STATE processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr);
326  void displayQueueDescriptor(int pid, uint32_t rl_idx);
327 
328  public:
330  getQueueDesc(uint32_t queId)
331  {
332  return regdQList.at(queId)->qCntxt.qDesc;
333  }
334  class RQLEntry*
335  getRegdListEntry(uint32_t queId)
336  {
337  return regdQList.at(queId);
338  }
339 
340  uint64_t
341  inFlightPkts(uint32_t queId)
342  {
343  auto aqlBuf = regdQList.at(queId)->qCntxt.aqlBuf;
344  return aqlBuf->dispIdx() - aqlBuf->rdIdx();
345  }
346 
352 
353  typedef HSAPacketProcessorParams Params;
354  HSAPacketProcessor(const Params &p);
356  TranslationGenPtr translate(Addr vaddr, Addr size) override;
357  void setDeviceQueueDesc(uint64_t hostReadIndexPointer,
358  uint64_t basePointer,
359  uint64_t queue_id,
360  uint32_t size, int doorbellSize,
361  GfxVersion gfxVersion,
362  Addr offset = 0, uint64_t rd_idx = 0);
363  void unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize);
364  void setDevice(GPUCommandProcessor * dev);
366  void updateReadIndex(int, uint32_t);
367  void getCommandsFromHost(int pid, uint32_t rl_idx);
369 
370  // PIO interface
371  virtual Tick read(Packet*) override;
372  virtual Tick write(Packet*) override;
373  virtual AddrRangeList getAddrRanges() const override;
374  void finishPkt(void *pkt, uint32_t rl_idx);
375  void finishPkt(void *pkt) { finishPkt(pkt, 0); }
376  void schedAQLProcessing(uint32_t rl_idx);
377  void schedAQLProcessing(uint32_t rl_idx, Tick delay);
378 
379  void sendAgentDispatchCompletionSignal(void *pkt,
380  hsa_signal_value_t signal);
382 
387  {
388  // deal with the fact dma ops can complete out of issue order
389  uint32_t pkts_ttl;
390  uint32_t pkts_2_go;
391  uint32_t start_ix;
392  uint32_t rl_idx;
393 
394  dma_series_ctx(uint32_t _pkts_ttl,
395  uint32_t _pkts_2_go,
396  uint32_t _start_ix,
397  uint32_t _rl_idx)
398  : pkts_ttl(_pkts_2_go), pkts_2_go(_pkts_2_go),
399  start_ix(_start_ix), rl_idx(_rl_idx)
400  {};
402  };
403 
404  void updateReadDispIdDma();
405  void cmdQueueCmdDma(HSAPacketProcessor *hsaPP, int pid, bool isRead,
406  uint32_t ix_start, unsigned num_pkts,
407  dma_series_ctx *series_ctx, void *dest_4debug);
409 };
410 
411 } // namespace gem5
412 
413 #endif // __DEV_HSA_HSA_PACKET_PROCESSOR__
#define DPRINTF(x,...)
Definition: trace.hh:186
Defines global host-dependent types: Counter, Tick, and (indirectly) {int,uint}{8,...
Device model for an AMD GPU.
Internal ring buffer which is used to prefetch/store copies of the in-memory HSA ring buffer.
void setRdIdx(uint64_t value)
std::vector< bool > _aqlComplete
int allocEntry(uint32_t nBufReq)
void incDispIdx(uint64_t value)
void setDispIdx(uint64_t value)
void saveHostDispAddr(Addr host_pkt_addr, int num_pkts, int ix)
the kernel may try to read from the dispatch packet, so we need to keep the host address that corresp...
void setWrIdx(uint64_t value)
bool isLastOutstandingPkt() const
Packets aren't guaranteed to be completed in-order, and we need to know when the last packet is finis...
void * ptr(uint32_t ix)
AQLRingBuffer(uint32_t size, const std::string name)
std::vector< hsa_kernel_dispatch_packet_t > _aqlBuf
void incWrIdx(uint64_t value)
std::vector< Addr > _hostDispAddresses
void incRdIdx(uint64_t value)
DmaDeviceParams Params
Definition: dma_device.hh:209
QueueProcessEvent(HSAPacketProcessor *_hsaPP, uint32_t _rqIdx)
virtual const char * description() const
Return a C string describing the event.
RQLEntry(HSAPacketProcessor *hsaPP, uint32_t rqIdx)
std::vector< hsa_signal_value_t > values
void sendAgentDispatchCompletionSignal(void *pkt, hsa_signal_value_t signal)
std::vector< class RQLEntry * > regdQList
void updateReadIndex(int, uint32_t)
virtual Tick write(Packet *) override
void cmdQueueCmdDma(HSAPacketProcessor *hsaPP, int pid, bool isRead, uint32_t ix_start, unsigned num_pkts, dma_series_ctx *series_ctx, void *dest_4debug)
void(DmaDevice::* DmaFnPtr)(Addr, int, Event *, uint8_t *, Tick)
uint64_t inFlightPkts(uint32_t queId)
void sendCompletionSignal(hsa_signal_value_t signal)
GPUCommandProcessor * gpu_device
void updateReadDispIdDma()
this event is used to update the read_disp_id field (the read pointer) of the MQD,...
void setGPUDevice(AMDGPUDevice *gpu_device)
HSAPacketProcessorParams Params
void getCommandsFromHost(int pid, uint32_t rl_idx)
TranslationGenPtr translate(Addr vaddr, Addr size) override
Function used to translate a range of addresses from virtual to physical addresses.
void setDeviceQueueDesc(uint64_t hostReadIndexPointer, uint64_t basePointer, uint64_t queue_id, uint32_t size, int doorbellSize, GfxVersion gfxVersion, Addr offset=0, uint64_t rd_idx=0)
void displayQueueDescriptor(int pid, uint32_t rl_idx)
Q_STATE processPkt(void *pkt, uint32_t rl_idx, Addr host_pkt_addr)
HSAQueueDescriptor * getQueueDesc(uint32_t queId)
void finishPkt(void *pkt, uint32_t rl_idx)
virtual AddrRangeList getAddrRanges() const override
Every PIO device is obliged to provide an implementation that returns the address ranges the device r...
void unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize)
class RQLEntry * getRegdListEntry(uint32_t queId)
void schedAQLProcessing(uint32_t rl_idx)
void setDevice(GPUCommandProcessor *dev)
virtual Tick read(Packet *) override
HSAPacketProcessor(const Params &p)
HSAQueueDescriptor(uint64_t base_ptr, uint64_t db_ptr, uint64_t hri_ptr, uint32_t size, GfxVersion gfxVersion)
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
Definition: packet.hh:294
STL vector class.
Definition: stl.hh:37
static const Priority Default_Pri
Default is zero for historical reasons.
Definition: eventq.hh:179
@ HSA_PACKET_HEADER_TYPE
Packet type.
Definition: hsa.h:2816
@ HSA_PACKET_TYPE_INVALID
The packet has been processed in the past, but has not been reassigned to the packet processor.
Definition: hsa.h:2756
@ HSA_PACKET_HEADER_WIDTH_TYPE
Definition: hsa.h:2858
int32_t hsa_signal_value_t
Signal value.
Definition: hsa.h:1322
#define NumSignalsPerBarrier
#define AQL_PACKET_SIZE
Bitfield< 7 > i
Definition: misc_types.hh:67
Bitfield< 23, 0 > offset
Definition: types.hh:144
Bitfield< 54 > p
Definition: pagetable.hh:70
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition: types.hh:147
uint64_t Tick
Tick count type.
Definition: types.hh:58
std::unique_ptr< TranslationGen > TranslationGenPtr
Calls getCurrentEntry once the queueEntry has been dmaRead.
dma_series_ctx(uint32_t _pkts_ttl, uint32_t _pkts_2_go, uint32_t _start_ix, uint32_t _rl_idx)
QCntxt(HSAQueueDescriptor *q_desc, AQLRingBuffer *aql_buf)
AQLRingBuffer * aqlBuf
HSAQueueDescriptor * qDesc

Generated on Wed Dec 21 2022 10:22:33 for gem5 by doxygen 1.9.1