gem5 v23.0.0.1
Loading...
Searching...
No Matches
hsa_packet_processor.hh
Go to the documentation of this file.
1/*
2 * Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#ifndef __DEV_HSA_HSA_PACKET_PROCESSOR__
33#define __DEV_HSA_HSA_PACKET_PROCESSOR__
34
35#include <algorithm>
36#include <cstdint>
37#include <vector>
38
39#include "base/types.hh"
40#include "debug/HSAPacketProcessor.hh"
42#include "dev/hsa/hsa.h"
43#include "dev/hsa/hsa_queue.hh"
44#include "enums/GfxVersion.hh"
45#include "params/HSAPacketProcessor.hh"
46#include "sim/eventq.hh"
47
48#define AQL_PACKET_SIZE 64
49#define PAGE_SIZE 4096
50#define NUM_DMA_BUFS 16
51#define DMA_BUF_SIZE (AQL_PACKET_SIZE * NUM_DMA_BUFS)
52// HSA runtime supports only 5 signals per barrier packet
53#define NumSignalsPerBarrier 5
54
55namespace gem5
56{
57
58class AMDGPUDevice;
59
60// Ideally, each queue should store this status and
61// the processPkt() should make decisions based on that
62// status variable.
64{
65 UNBLOCKED = 0, // Unblocked queue, can submit packets.
66 BLOCKED_BBIT, // Queue blocked by barrier bit.
67 // Can submit packets after
68 // previous packet completes.
69 BLOCKED_BPKT, // Queue blocked by barrier packet.
70 // Can submit packets after
71 // barrier packet completes.
72};
73
74class GPUCommandProcessor;
75class HWScheduler;
76
77// Our internal representation of an HSA queue
79{
80 public:
81 uint64_t basePointer;
83 uint64_t writeIndex;
84 uint64_t readIndex;
85 uint32_t numElts;
89 GfxVersion gfxVersion;
90
91 HSAQueueDescriptor(uint64_t base_ptr, uint64_t db_ptr,
92 uint64_t hri_ptr, uint32_t size,
93 GfxVersion gfxVersion)
94 : basePointer(base_ptr), doorbellPointer(db_ptr),
95 writeIndex(0), readIndex(0),
99 { }
100 uint64_t spaceRemaining() { return numElts - (writeIndex - readIndex); }
101 uint64_t spaceUsed() { return writeIndex - readIndex; }
102 uint32_t objSize() { return AQL_PACKET_SIZE; }
103 uint32_t numObjs() { return numElts; }
104 bool isFull() { return spaceRemaining() == 0; }
105 bool isEmpty() { return spaceRemaining() == numElts; }
106
107 uint64_t ptr(uint64_t ix)
108 {
109 /*
110 * Based on ROCm Documentation:
111 * - https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/
112 10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/
113 rocr/src/core/runtime/amd_aql_queue.cpp#L99
114 * - https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/
115 10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/
116 rocr/src/core/runtime/amd_aql_queue.cpp#L624
117 *
118 * GFX7 and GFX8 will allocate twice as much space for their HSA
119 * queues as they actually access (using mod operations to map the
120 * virtual addresses from the upper half of the queue to the same
121 * virtual addresses as the lower half). Thus, we need to check if
122 * the ISA is GFX8 and mod the address by half of the queue size if
123 * so.
124 */
125 uint64_t retAddr = 0ll;
126 if ((gfxVersion == GfxVersion::gfx801) ||
127 (gfxVersion == GfxVersion::gfx803)) {
128 retAddr = basePointer + ((ix % (numElts/2)) * objSize());
129 DPRINTF(HSAPacketProcessor, "ptr() gfx8: base: 0x%x, "
130 "index: 0x%x, numElts: 0x%x, numElts/2: 0x%x, "
131 "objSize: 0x%x, retAddr: 0x%x\n", basePointer, ix,
132 numElts, numElts/2, objSize(), retAddr);
133 } else {
134 retAddr = basePointer + ((ix % numElts) * objSize());
135 DPRINTF(HSAPacketProcessor, "ptr() gfx9: base: 0x%x, "
136 "index: 0x%x, numElts: 0x%x, objSize: 0x%x, "
137 "retAddr: 0x%x\n", basePointer, ix, numElts, objSize(),
138 retAddr);
139 }
140 return retAddr;
141 }
142};
143
157{
158 private:
160 std::string _name;
163 uint64_t _wrIdx; // Points to next write location
164 uint64_t _rdIdx; // Read pointer of AQL buffer
165 uint64_t _dispIdx; // Dispatch pointer of AQL buffer
166
167 public:
168 std::string name() {return _name;}
169 AQLRingBuffer(uint32_t size, const std::string name);
170 int allocEntry(uint32_t nBufReq);
171 bool freeEntry(void *pkt);
172
182 void
183 saveHostDispAddr(Addr host_pkt_addr, int num_pkts, int ix)
184 {
185 for (int i = 0; i < num_pkts; ++i) {
186 _hostDispAddresses[ix % numObjs()] = host_pkt_addr + i * objSize();
187 ++ix;
188 }
189 }
190
191 Addr
193 {
194 return _hostDispAddresses[dispIdx() % numObjs()];
195 }
196
197 bool
199 {
200 int packet_type = (_aqlBuf[_dispIdx % _aqlBuf.size()].header
202 ((1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1);
203 return (_dispIdx < _wrIdx) && packet_type != HSA_PACKET_TYPE_INVALID;
204 }
205
214 bool
216 {
217 for (int i = _rdIdx + 1; i < _dispIdx; i++) {
218 if (!_aqlComplete[i % _aqlBuf.size()]) {
219 return false;
220 }
221 }
222 return !_aqlComplete[_rdIdx % _aqlBuf.size()] && _rdIdx != _dispIdx;
223 }
224
225 uint32_t nFree() const { return _aqlBuf.size() - (_wrIdx - _rdIdx); }
226 void *ptr(uint32_t ix) { return _aqlBuf.data() + (ix % _aqlBuf.size()); }
227 uint32_t numObjs() const { return _aqlBuf.size(); };
228 uint32_t objSize() const { return AQL_PACKET_SIZE; }
229 uint64_t dispIdx() const { return _dispIdx; }
230 uint64_t wrIdx() const { return _wrIdx; }
231 uint64_t rdIdx() const { return _rdIdx; }
232 uint64_t* rdIdxPtr() { return &_rdIdx; }
233 void incRdIdx(uint64_t value) { _rdIdx += value; }
234 void incWrIdx(uint64_t value) { _wrIdx += value; }
235 void incDispIdx(uint64_t value) { _dispIdx += value; }
236 uint64_t compltnPending() { return (_dispIdx - _rdIdx); }
237 void setRdIdx(uint64_t value);
238 void setWrIdx(uint64_t value);
239 void setDispIdx(uint64_t value);
240};
241
242struct QCntxt
243{
246 // used for HSA packets that enforce synchronization with barrier bit
249 qDesc(q_desc), aqlBuf(aql_buf), barrierBit(false)
250 {}
251 QCntxt() : qDesc(NULL), aqlBuf(NULL), barrierBit(false) {}
252};
253
255{
256 friend class HWScheduler;
257 protected:
258 typedef void (DmaDevice::*DmaFnPtr)(Addr, int, Event*, uint8_t*, Tick);
263
264 // Structure to store the read values of dependency signals
265 // from shared memory. Also used for tracking the status of
266 // those reads while they are in progress
268 {
269 public:
271 : pendingReads(0), allRead(false), discardRead(false)
272 {
274 }
275 void handleReadDMA();
278 // If this queue is unmapped when there are pending reads, then
279 // the pending reads have to be discarded.
281 // values stores the values of the dependency signals already read
283 void
285 {
286 std::fill(values.begin(), values.end(), 1);
287 }
288 };
289
291 {
292 private:
294 uint32_t rqIdx;
295 public:
296 QueueProcessEvent(HSAPacketProcessor *_hsaPP, uint32_t _rqIdx)
297 : Event(Default_Pri), hsaPP(_hsaPP), rqIdx(_rqIdx)
298 {}
299 virtual void process();
300 virtual const char *description() const;
301 };
302
303 // Registered queue list entry; each entry has one queueDescriptor and
304 // associated AQL buffer
306 {
307 public:
308 RQLEntry(HSAPacketProcessor *hsaPP, uint32_t rqIdx)
309 : aqlProcessEvent(hsaPP, rqIdx) {}
311 bool dispPending() { return qCntxt.aqlBuf->dispPending() > 0; }
312 uint64_t compltnPending() { return qCntxt.aqlBuf->compltnPending(); }
315 void setBarrierBit(bool set_val) { qCntxt.barrierBit = set_val; }
316 bool getBarrierBit() const { return qCntxt.barrierBit; }
318 {
320 }
321 };
322 // Keeps track of queueDescriptors of registered queues
324
325 Q_STATE processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr);
326 void displayQueueDescriptor(int pid, uint32_t rl_idx);
327
328 public:
330 getQueueDesc(uint32_t queId)
331 {
332 return regdQList.at(queId)->qCntxt.qDesc;
333 }
334 class RQLEntry*
335 getRegdListEntry(uint32_t queId)
336 {
337 return regdQList.at(queId);
338 }
339
340 uint64_t
341 inFlightPkts(uint32_t queId)
342 {
343 auto aqlBuf = regdQList.at(queId)->qCntxt.aqlBuf;
344 return aqlBuf->dispIdx() - aqlBuf->rdIdx();
345 }
346
352
353 typedef HSAPacketProcessorParams Params;
356 TranslationGenPtr translate(Addr vaddr, Addr size) override;
357 void setDeviceQueueDesc(uint64_t hostReadIndexPointer,
358 uint64_t basePointer,
359 uint64_t queue_id,
360 uint32_t size, int doorbellSize,
361 GfxVersion gfxVersion,
362 Addr offset = 0, uint64_t rd_idx = 0);
363 void unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize);
364 void setDevice(GPUCommandProcessor * dev);
366 void updateReadIndex(int, uint32_t);
367 void getCommandsFromHost(int pid, uint32_t rl_idx);
369
370 // PIO interface
371 virtual Tick read(Packet*) override;
372 virtual Tick write(Packet*) override;
373 virtual AddrRangeList getAddrRanges() const override;
374 void finishPkt(void *pkt, uint32_t rl_idx);
375 void finishPkt(void *pkt) { finishPkt(pkt, 0); }
376 void schedAQLProcessing(uint32_t rl_idx);
377 void schedAQLProcessing(uint32_t rl_idx, Tick delay);
378
380 hsa_signal_value_t signal);
382
387 {
388 // deal with the fact dma ops can complete out of issue order
389 uint32_t pkts_ttl;
390 uint32_t pkts_2_go;
391 uint32_t start_ix;
392 uint32_t rl_idx;
393
394 dma_series_ctx(uint32_t _pkts_ttl,
395 uint32_t _pkts_2_go,
396 uint32_t _start_ix,
397 uint32_t _rl_idx)
398 : pkts_ttl(_pkts_2_go), pkts_2_go(_pkts_2_go),
399 start_ix(_start_ix), rl_idx(_rl_idx)
400 {};
402 };
403
404 void updateReadDispIdDma();
405 void cmdQueueCmdDma(HSAPacketProcessor *hsaPP, int pid, bool isRead,
406 uint32_t ix_start, unsigned num_pkts,
407 dma_series_ctx *series_ctx, void *dest_4debug);
409};
410
411} // namespace gem5
412
413#endif // __DEV_HSA_HSA_PACKET_PROCESSOR__
#define DPRINTF(x,...)
Definition trace.hh:210
Defines global host-dependent types: Counter, Tick, and (indirectly) {int,uint}{8,...
Device model for an AMD GPU.
Internal ring buffer which is used to prefetch/store copies of the in-memory HSA ring buffer.
void setRdIdx(uint64_t value)
std::vector< bool > _aqlComplete
void * ptr(uint32_t ix)
int allocEntry(uint32_t nBufReq)
void incDispIdx(uint64_t value)
void setDispIdx(uint64_t value)
void saveHostDispAddr(Addr host_pkt_addr, int num_pkts, int ix)
the kernel may try to read from the dispatch packet, so we need to keep the host address that corresp...
void setWrIdx(uint64_t value)
bool isLastOutstandingPkt() const
Packets aren't guaranteed to be completed in-order, and we need to know when the last packet is finis...
std::vector< hsa_kernel_dispatch_packet_t > _aqlBuf
void incWrIdx(uint64_t value)
std::vector< Addr > _hostDispAddresses
void incRdIdx(uint64_t value)
QueueProcessEvent(HSAPacketProcessor *_hsaPP, uint32_t _rqIdx)
virtual const char * description() const
Return a C string describing the event.
RQLEntry(HSAPacketProcessor *hsaPP, uint32_t rqIdx)
std::vector< hsa_signal_value_t > values
void sendAgentDispatchCompletionSignal(void *pkt, hsa_signal_value_t signal)
std::vector< class RQLEntry * > regdQList
void updateReadIndex(int, uint32_t)
virtual Tick write(Packet *) override
void cmdQueueCmdDma(HSAPacketProcessor *hsaPP, int pid, bool isRead, uint32_t ix_start, unsigned num_pkts, dma_series_ctx *series_ctx, void *dest_4debug)
void(DmaDevice::* DmaFnPtr)(Addr, int, Event *, uint8_t *, Tick)
uint64_t inFlightPkts(uint32_t queId)
void sendCompletionSignal(hsa_signal_value_t signal)
GPUCommandProcessor * gpu_device
void updateReadDispIdDma()
this event is used to update the read_disp_id field (the read pointer) of the MQD,...
void setGPUDevice(AMDGPUDevice *gpu_device)
HSAPacketProcessorParams Params
void getCommandsFromHost(int pid, uint32_t rl_idx)
TranslationGenPtr translate(Addr vaddr, Addr size) override
Function used to translate a range of addresses from virtual to physical addresses.
class RQLEntry * getRegdListEntry(uint32_t queId)
void setDeviceQueueDesc(uint64_t hostReadIndexPointer, uint64_t basePointer, uint64_t queue_id, uint32_t size, int doorbellSize, GfxVersion gfxVersion, Addr offset=0, uint64_t rd_idx=0)
void displayQueueDescriptor(int pid, uint32_t rl_idx)
HSAQueueDescriptor * getQueueDesc(uint32_t queId)
Q_STATE processPkt(void *pkt, uint32_t rl_idx, Addr host_pkt_addr)
void finishPkt(void *pkt, uint32_t rl_idx)
virtual AddrRangeList getAddrRanges() const override
Every PIO device is obliged to provide an implementation that returns the address ranges the device r...
void unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize)
void schedAQLProcessing(uint32_t rl_idx)
void setDevice(GPUCommandProcessor *dev)
virtual Tick read(Packet *) override
HSAQueueDescriptor(uint64_t base_ptr, uint64_t db_ptr, uint64_t hri_ptr, uint32_t size, GfxVersion gfxVersion)
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
Definition packet.hh:295
STL vector class.
Definition stl.hh:37
static const Priority Default_Pri
Default is zero for historical reasons.
Definition eventq.hh:182
@ HSA_PACKET_HEADER_TYPE
Packet type.
Definition hsa.h:2816
@ HSA_PACKET_TYPE_INVALID
The packet has been processed in the past, but has not been reassigned to the packet processor.
Definition hsa.h:2756
@ HSA_PACKET_HEADER_WIDTH_TYPE
Definition hsa.h:2858
int32_t hsa_signal_value_t
Signal value.
Definition hsa.h:1322
#define NumSignalsPerBarrier
#define AQL_PACKET_SIZE
Bitfield< 7 > i
Definition misc_types.hh:67
Bitfield< 23, 0 > offset
Definition types.hh:144
Bitfield< 0 > p
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition types.hh:147
uint64_t Tick
Tick count type.
Definition types.hh:58
std::unique_ptr< TranslationGen > TranslationGenPtr
Calls getCurrentEntry once the queueEntry has been dmaRead.
dma_series_ctx(uint32_t _pkts_ttl, uint32_t _pkts_2_go, uint32_t _start_ix, uint32_t _rl_idx)
QCntxt(HSAQueueDescriptor *q_desc, AQLRingBuffer *aql_buf)
AQLRingBuffer * aqlBuf
HSAQueueDescriptor * qDesc

Generated on Mon Jul 10 2023 15:32:02 for gem5 by doxygen 1.9.7