gem5 v24.0.0.0
hsa_packet_processor.hh
/*
 * Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __DEV_HSA_HSA_PACKET_PROCESSOR__
#define __DEV_HSA_HSA_PACKET_PROCESSOR__

#include <algorithm>
#include <cstdint>
#include <vector>

#include "base/types.hh"
#include "debug/HSAPacketProcessor.hh"
#include "dev/dma_virt_device.hh"
#include "dev/hsa/hsa.h"
#include "dev/hsa/hsa_queue.hh"
#include "enums/GfxVersion.hh"
#include "params/HSAPacketProcessor.hh"
#include "sim/eventq.hh"

#define AQL_PACKET_SIZE 64
#define PAGE_SIZE 4096
#define NUM_DMA_BUFS 16
#define DMA_BUF_SIZE (AQL_PACKET_SIZE * NUM_DMA_BUFS)
// HSA runtime supports only 5 signals per barrier packet
#define NumSignalsPerBarrier 5
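// For intuition: with 64-byte AQL packets, one DMA buffer spans
// DMA_BUF_SIZE == 64 * 16 == 1024 bytes, i.e. up to NUM_DMA_BUFS (16)
// packets can be fetched per DMA buffer.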

namespace gem5
{

class AMDGPUDevice;

// Ideally, each queue should store this status and
// the processPkt() should make decisions based on that
// status variable.
enum Q_STATE
{
    UNBLOCKED = 0, // Unblocked queue, can submit packets.
    BLOCKED_BBIT,  // Queue blocked by barrier bit.
                   // Can submit packets after
                   // previous packet completes.
    BLOCKED_BPKT,  // Queue blocked by barrier packet.
                   // Can submit packets after
                   // barrier packet completes.
};

class GPUCommandProcessor;
class HWScheduler;

// Our internal representation of an HSA queue
class HSAQueueDescriptor
{
  public:
    uint64_t     basePointer;
    uint64_t     doorbellPointer;
    uint64_t     writeIndex;
    uint64_t     readIndex;
    uint32_t     numElts;
    uint64_t     hostReadIndexPtr;
    bool         stalledOnDmaBufAvailability;
    bool         dmaInProgress;
    GfxVersion   gfxVersion;

    HSAQueueDescriptor(uint64_t base_ptr, uint64_t db_ptr,
                       uint64_t hri_ptr, uint32_t size,
                       GfxVersion gfxVersion)
      : basePointer(base_ptr), doorbellPointer(db_ptr),
        writeIndex(0), readIndex(0),
        numElts(size / AQL_PACKET_SIZE), hostReadIndexPtr(hri_ptr),
        stalledOnDmaBufAvailability(false),
        dmaInProgress(false), gfxVersion(gfxVersion)
    { }
    uint64_t spaceRemaining() { return numElts - (writeIndex - readIndex); }
    uint64_t spaceUsed() { return writeIndex - readIndex; }
    uint32_t objSize() { return AQL_PACKET_SIZE; }
    uint32_t numObjs() { return numElts; }
    bool isFull() { return spaceRemaining() == 0; }
    bool isEmpty() { return spaceRemaining() == numElts; }

    uint64_t ptr(uint64_t ix)
    {
        /*
         * Based on ROCm Documentation:
         * - https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/
             10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/
             rocr/src/core/runtime/amd_aql_queue.cpp#L99
         * - https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/
             10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/
             rocr/src/core/runtime/amd_aql_queue.cpp#L624
         *
         */
        uint64_t retAddr = 0ll;
        retAddr = basePointer + ((ix % numElts) * objSize());
        DPRINTF(HSAPacketProcessor, "ptr() gfx9: base: 0x%x, "
                "index: 0x%x, numElts: 0x%x, objSize: 0x%x, "
                "retAddr: 0x%x\n", basePointer, ix, numElts, objSize(),
                retAddr);
        return retAddr;
    }
};
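// For intuition (illustrative values, assuming size is given in bytes):
// writeIndex and readIndex count packets and only wrap when ptr() turns
// them into an address. A queue created with size == 16384 has
// numElts == 16384 / AQL_PACKET_SIZE == 256; with writeIndex == 300 and
// readIndex == 260, spaceUsed() == 40, spaceRemaining() == 216, and
// ptr(300) == basePointer + (300 % 256) * 64 == basePointer + 0xB00.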

/**
 * Internal ring buffer which is used to prefetch/store copies of the
 * in-memory HSA ring buffer.
 */
class AQLRingBuffer
{
  private:
    std::vector<hsa_kernel_dispatch_packet_t> _aqlBuf;
    std::string _name;
    std::vector<Addr> _hostDispAddresses;
    std::vector<bool> _aqlComplete;
    uint64_t _wrIdx;    // Points to next write location
    uint64_t _rdIdx;    // Read pointer of AQL buffer
    uint64_t _dispIdx;  // Dispatch pointer of AQL buffer

  public:
    std::string name() { return _name; }
    AQLRingBuffer(uint32_t size, const std::string name);
    int allocEntry(uint32_t nBufReq);
    bool freeEntry(void *pkt);

    /**
     * the kernel may try to read from the dispatch packet, so we need to
     * keep the host address that corresponds to each of the dispatch
     * packets this AQL buffer is storing.
     */
    void
    saveHostDispAddr(Addr host_pkt_addr, int num_pkts, int ix)
    {
        for (int i = 0; i < num_pkts; ++i) {
            _hostDispAddresses[ix % numObjs()] = host_pkt_addr + i * objSize();
            ++ix;
        }
    }

    Addr
    hostDispAddr() const
    {
        return _hostDispAddresses[dispIdx() % numObjs()];
    }

    bool
    dispPending() const
    {
        int packet_type = (_aqlBuf[_dispIdx % _aqlBuf.size()].header
                           >> HSA_PACKET_HEADER_TYPE) &
                          ((1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1);
        return (_dispIdx < _wrIdx) && packet_type != HSA_PACKET_TYPE_INVALID;
    }

    /**
     * Packets aren't guaranteed to be completed in-order, and we need to
     * know when the last outstanding packet is finished.
     */
    bool
    isLastOutstandingPkt() const
    {
        for (int i = _rdIdx + 1; i < _dispIdx; i++) {
            if (!_aqlComplete[i % _aqlBuf.size()]) {
                return false;
            }
        }
        return !_aqlComplete[_rdIdx % _aqlBuf.size()] && _rdIdx != _dispIdx;
    }

    uint32_t nFree() const { return _aqlBuf.size() - (_wrIdx - _rdIdx); }
    void *ptr(uint32_t ix) { return _aqlBuf.data() + (ix % _aqlBuf.size()); }
    uint32_t numObjs() const { return _aqlBuf.size(); };
    uint32_t objSize() const { return AQL_PACKET_SIZE; }
    uint64_t dispIdx() const { return _dispIdx; }
    uint64_t wrIdx() const { return _wrIdx; }
    uint64_t rdIdx() const { return _rdIdx; }
    uint64_t* rdIdxPtr() { return &_rdIdx; }
    void incRdIdx(uint64_t value) { _rdIdx += value; }
    void incWrIdx(uint64_t value) { _wrIdx += value; }
    void incDispIdx(uint64_t value) { _dispIdx += value; }
    uint64_t compltnPending() { return (_dispIdx - _rdIdx); }
    void setRdIdx(uint64_t value);
    void setWrIdx(uint64_t value);
    void setDispIdx(uint64_t value);
};
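// For intuition (illustrative values): the indices grow monotonically and
// are reduced modulo the buffer size only when a vector is accessed, so
// _rdIdx <= _dispIdx <= _wrIdx. With a 16-entry buffer, _wrIdx == 20 and
// _rdIdx == 8 give nFree() == 16 - (20 - 8) == 4, and _dispIdx == 15 gives
// compltnPending() == 15 - 8 == 7 packets dispatched but not yet retired.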

struct QCntxt
{
    HSAQueueDescriptor *qDesc;
    AQLRingBuffer *aqlBuf;
    // used for HSA packets that enforce synchronization with barrier bit
    bool barrierBit;
    QCntxt(HSAQueueDescriptor *q_desc, AQLRingBuffer *aql_buf) :
        qDesc(q_desc), aqlBuf(aql_buf), barrierBit(false)
    {}
    QCntxt() : qDesc(NULL), aqlBuf(NULL), barrierBit(false) {}
};

class HSAPacketProcessor : public DmaVirtDevice
{
    friend class HWScheduler;
  protected:
    typedef void (DmaDevice::*DmaFnPtr)(Addr, int, Event*, uint8_t*, Tick);
    GPUCommandProcessor *gpu_device;

    // Structure to store the read values of dependency signals
    // from shared memory. Also used for tracking the status of
    // those reads while they are in progress
    class SignalState
    {
      public:
        SignalState()
            : pendingReads(0), allRead(false), discardRead(false)
        {
            values.resize(NumSignalsPerBarrier);
        }
        void handleReadDMA();
        int pendingReads;
        bool allRead;
        // If this queue is unmapped when there are pending reads, then
        // the pending reads have to be discarded.
        bool discardRead;
        // values stores the value of already read dependency signal
        std::vector<hsa_signal_value_t> values;
        void
        resetSigVals()
        {
            std::fill(values.begin(), values.end(), 1);
        }
    };
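    // A plausible reading of this interface: SignalState buffers the values
    // of a barrier packet's dependency signals (up to NumSignalsPerBarrier)
    // as they are DMA-read from memory. resetSigVals() seeds every slot with
    // a non-zero value, so a dependency is only considered satisfied after
    // its real value has actually been fetched; pendingReads/allRead track
    // the progress of those reads, and discardRead drops results that arrive
    // after the queue has been unmapped.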

    class QueueProcessEvent : public Event
    {
      private:
        HSAPacketProcessor *hsaPP;
        uint32_t rqIdx;
      public:
        QueueProcessEvent(HSAPacketProcessor *_hsaPP, uint32_t _rqIdx)
            : Event(Default_Pri), hsaPP(_hsaPP), rqIdx(_rqIdx)
        {}
        virtual void process();
        virtual const char *description() const;
    };

    // Registered queue list entry; each entry has one queueDescriptor and
    // associated AQL buffer
    class RQLEntry
    {
      public:
        RQLEntry(HSAPacketProcessor *hsaPP, uint32_t rqIdx)
            : aqlProcessEvent(hsaPP, rqIdx) {}
        QCntxt qCntxt;
        bool dispPending() { return qCntxt.aqlBuf->dispPending() > 0; }
        uint64_t compltnPending() { return qCntxt.aqlBuf->compltnPending(); }
        SignalState depSignalRdState;
        QueueProcessEvent aqlProcessEvent;
        void setBarrierBit(bool set_val) { qCntxt.barrierBit = set_val; }
        bool getBarrierBit() const { return qCntxt.barrierBit; }
        bool isLastOutstandingPkt() const
        {
            return qCntxt.aqlBuf->isLastOutstandingPkt();
        }
    };
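    // A plausible flow for the barrier bit tracked in QCntxt: when
    // processPkt() encounters a packet with the barrier bit set, the packet
    // processor calls setBarrierBit(true) on the queue's RQLEntry and the
    // queue moves to BLOCKED_BBIT; once isLastOutstandingPkt() reports that
    // the final outstanding packet has completed, the bit can be cleared and
    // the queue becomes UNBLOCKED again.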
    // Keeps track of queueDescriptors of registered queues
    std::vector<class RQLEntry *> regdQList;

    Q_STATE processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr);
    void displayQueueDescriptor(int pid, uint32_t rl_idx);

  public:
    HSAQueueDescriptor*
    getQueueDesc(uint32_t queId)
    {
        return regdQList.at(queId)->qCntxt.qDesc;
    }
    class RQLEntry*
    getRegdListEntry(uint32_t queId)
    {
        return regdQList.at(queId);
    }

    uint64_t
    inFlightPkts(uint32_t queId)
    {
        auto aqlBuf = regdQList.at(queId)->qCntxt.aqlBuf;
        return aqlBuf->dispIdx() - aqlBuf->rdIdx();
    }

    typedef HSAPacketProcessorParams Params;
    HSAPacketProcessor(const Params &p);
    TranslationGenPtr translate(Addr vaddr, Addr size) override;
    void setDeviceQueueDesc(uint64_t hostReadIndexPointer,
                            uint64_t basePointer,
                            uint64_t queue_id,
                            uint32_t size, int doorbellSize,
                            GfxVersion gfxVersion,
                            Addr offset = 0, uint64_t rd_idx = 0);
    void unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize);
    void setDevice(GPUCommandProcessor * dev);
    void setGPUDevice(AMDGPUDevice *gpu_device);
    void updateReadIndex(int, uint32_t);
    void getCommandsFromHost(int pid, uint32_t rl_idx);
    HWScheduler *hwSchdlr;

    // PIO interface
    virtual Tick read(Packet*) override;
    virtual Tick write(Packet*) override;
    virtual AddrRangeList getAddrRanges() const override;
    void finishPkt(void *pkt, uint32_t rl_idx);
    void finishPkt(void *pkt) { finishPkt(pkt, 0); }
    void schedAQLProcessing(uint32_t rl_idx);
    void schedAQLProcessing(uint32_t rl_idx, Tick delay);

    void sendAgentDispatchCompletionSignal(void *pkt,
                                           hsa_signal_value_t signal);
    void sendCompletionSignal(hsa_signal_value_t signal);

    /**
     * Calls getCurrentEntry once the queueEntry has been dmaRead.
     */
    struct dma_series_ctx
    {
        // deal with the fact dma ops can complete out of issue order
        uint32_t pkts_ttl;
        uint32_t pkts_2_go;
        uint32_t start_ix;
        uint32_t rl_idx;

        dma_series_ctx(uint32_t _pkts_ttl,
                       uint32_t _pkts_2_go,
                       uint32_t _start_ix,
                       uint32_t _rl_idx)
            : pkts_ttl(_pkts_ttl), pkts_2_go(_pkts_2_go),
              start_ix(_start_ix), rl_idx(_rl_idx)
        {};
    };
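    // A plausible reading of this context object: cmdQueueCmdDma() below
    // handles completion of a series of DMA reads of AQL packets, and since
    // the individual reads may finish out of issue order, the shared
    // dma_series_ctx remembers the total size of the series (pkts_ttl), how
    // many reads are still outstanding (pkts_2_go), and where the series
    // started (start_ix, rl_idx), so queue state is only advanced once every
    // read in the series has returned.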

    void updateReadDispIdDma();
    void cmdQueueCmdDma(HSAPacketProcessor *hsaPP, int pid, bool isRead,
                        uint32_t ix_start, unsigned num_pkts,
                        dma_series_ctx *series_ctx, void *dest_4debug);
};

} // namespace gem5

#endif // __DEV_HSA_HSA_PACKET_PROCESSOR__