gem5 [DEVELOP-FOR-25.1]
Loading...
Searching...
No Matches
hsa_queue_entry.hh
Go to the documentation of this file.
1/*
2 * Copyright (c) 2017-2018 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
40
41#ifndef __GPU_COMPUTE_HSA_QUEUE_ENTRY__
42#define __GPU_COMPUTE_HSA_QUEUE_ENTRY__
43
44#include <bitset>
45#include <cstdint>
46#include <cstring>
47#include <iostream>
48#include <vector>
49
50#include "base/intmath.hh"
51#include "base/types.hh"
52#include "dev/hsa/hsa_packet.hh"
53#include "dev/hsa/hsa_queue.hh"
54#include "enums/GfxVersion.hh"
56
57namespace gem5
58{
59
61{
62 public:
63 HSAQueueEntry(std::string kernel_name, uint32_t queue_id,
64 int dispatch_id, void *disp_pkt, AMDKernelCode *akc,
65 Addr host_pkt_addr, Addr code_addr, GfxVersion gfx_version)
66 : _gfxVersion(gfx_version), kernName(kernel_name),
67 _wgSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_x,
68 (int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_y,
69 (int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_z}},
70 _gridSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_x,
71 (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_y,
72 (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_z}},
73 _queueId(queue_id), _dispatchId(dispatch_id), dispPkt(disp_pkt),
74 _hostDispPktAddr(host_pkt_addr),
75 _completionSignal(((_hsa_dispatch_packet_t*)disp_pkt)
76 ->completion_signal),
77 codeAddress(code_addr),
78 kernargAddress(((_hsa_dispatch_packet_t*)disp_pkt)->kernarg_address),
79 _outstandingInvs(-1), _outstandingWbs(0),
80 _ldsSize((int)((_hsa_dispatch_packet_t*)disp_pkt)->
81 group_segment_size),
82 _privMemPerItem((int)((_hsa_dispatch_packet_t*)disp_pkt)->
83 private_segment_size),
84 _contextId(0), _wgId{{ 0, 0, 0 }},
85 _numWgTotal(1), numWgArrivedAtBarrier(0), _numWgCompleted(0),
86 _globalWgId(0), dispatchComplete(false)
87
88 {
89 // Use the resource descriptors to determine number of GPRs. This will
90 // round up in some cases, however the exact number field in the AMD
91 // kernel code struct is not backwards compatible and that field is
92 // not populated in newer compiles. The resource descriptor dword must
93 // be backwards compatible, so use that always.
94 // LLVM docs: https://www.llvm.org/docs/AMDGPUUsage.html
95 // #code-object-v3-kernel-descriptor
96 //
97 // Currently, gem5 supported gfx version use a multiplier of 8. The
98 // only exception is gfx900 (Vega10).
99 if (gfx_version == GfxVersion::gfx90a ||
100 gfx_version == GfxVersion::gfx942 ||
101 gfx_version == GfxVersion::gfx950) {
102 numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 8;
103 } else {
104 numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;
105 }
106
107 // SGPR allocation granulary is 16 in GFX9
108 // Source: https://llvm.org/docs/AMDGPUUsage.html
109 if (gfx_version == GfxVersion::gfx900 ||
110 gfx_version == GfxVersion::gfx902 ||
111 gfx_version == GfxVersion::gfx908 ||
112 gfx_version == GfxVersion::gfx90a ||
113 gfx_version == GfxVersion::gfx942 ||
114 gfx_version == GfxVersion::gfx950) {
115 numSgprs = ((akc->granulated_wavefront_sgpr_count + 1) * 16)/2;
116 } else {
117 panic("Saw unknown gfx version setting up GPR counts\n");
118 }
119
120 initialVgprState.reset();
121 initialSgprState.reset();
122
123 for (int i = 0; i < MAX_DIM; ++i) {
124 _numWg[i] = divCeil(_gridSize[i], _wgSize[i]);
125 _numWgTotal *= _numWg[i];
126 }
127
128 parseKernelCode(akc);
129
130 // Offset of a first AccVGPR in the unified register file.
131 // Granularity 4. Value 0-63. 0 - accum-offset = 4,
132 // 1 - accum-offset = 8, ..., 63 - accum-offset = 256.
133 _accumOffset = (akc->accum_offset + 1) * 4;
134 }
135
136 const GfxVersion&
138 {
139 return _gfxVersion;
140 }
141
142 const std::string&
144 {
145 return kernName;
146 }
147
148 int
149 wgSize(int dim) const
150 {
151 assert(dim < MAX_DIM);
152 return _wgSize[dim];
153 }
154
155 int
156 gridSize(int dim) const
157 {
158 assert(dim < MAX_DIM);
159 return _gridSize[dim];
160 }
161
162 int
164 {
165 return numVgprs;
166 }
167
168 int
170 {
171 return numSgprs;
172 }
173
174 uint32_t
175 queueId() const
176 {
177 return _queueId;
178 }
179
180 int
182 {
183 return _dispatchId;
184 }
185
186 void*
188 {
189 return dispPkt;
190 }
191
192 Addr
194 {
195 return _hostDispPktAddr;
196 }
197
198 Addr
200 {
201 return _completionSignal;
202 }
203
204 Addr
205 codeAddr() const
206 {
207 return codeAddress;
208 }
209
210 Addr
212 {
213 return kernargAddress;
214 }
215
216 int
217 ldsSize() const
218 {
219 return _ldsSize;
220 }
221
222 int privMemPerItem() const { return _privMemPerItem; }
223
224 int
225 contextId() const
226 {
227 return _contextId;
228 }
229
230 bool
232 {
233 return dispatchComplete;
234 }
235
236 int
237 wgId(int dim) const
238 {
239 assert(dim < MAX_DIM);
240 return _wgId[dim];
241 }
242
243 void
244 wgId(int dim, int val)
245 {
246 assert(dim < MAX_DIM);
247 _wgId[dim] = val;
248 }
249
250 int
252 {
253 return _globalWgId;
254 }
255
256 void
258 {
260 }
261
262 int
263 numWg(int dim) const
264 {
265 assert(dim < MAX_DIM);
266 return _numWg[dim];
267 }
268
269 void
271 {
273 }
274
275 int
277 {
278 return _numWgCompleted;
279 }
280
281 int
283 {
284 return _numWgTotal;
285 }
286
287 void
289 {
290 ++_wgId[0];
291 ++_globalWgId;
292
293 if (wgId(0) * wgSize(0) >= gridSize(0)) {
294 _wgId[0] = 0;
295 ++_wgId[1];
296
297 if (wgId(1) * wgSize(1) >= gridSize(1)) {
298 _wgId[1] = 0;
299 ++_wgId[2];
300
301 if (wgId(2) * wgSize(2) >= gridSize(2)) {
302 dispatchComplete = true;
303 }
304 }
305 }
306 }
307
308 int
310 {
312 }
313
314 bool vgprBitEnabled(int bit) const
315 {
316 return initialVgprState.test(bit);
317 }
318
319 bool sgprBitEnabled(int bit) const
320 {
321 return initialSgprState.test(bit);
322 }
323
329
336
337 // the maximum number of dimensions for a grid or workgroup
338 const static int MAX_DIM = 3;
339
340 /* getter */
341 int
343 return _outstandingInvs;
344 }
345
351 bool
353 {
354 return (_outstandingInvs != -1);
355 }
356
362 void
364 {
366 assert(_outstandingInvs >= 0);
367 }
368
372 void
374 {
376 }
377
381 bool
382 isInvDone() const
383 {
384 assert(_outstandingInvs >= 0);
385 return (_outstandingInvs == 0);
386 }
387
388 int
390 {
391 return _outstandingWbs;
392 }
393
399 void
401 {
403 assert(_outstandingWbs >= 0);
404 }
405
406 unsigned
408 {
409 return _accumOffset;
410 }
411
412 void
414 {
416
421 if (_preloadLength) {
423 }
424 }
425
426 unsigned
428 {
429 return _preloadLength;
430 }
431
432 uint32_t *
434 {
435 return &(_preloadArgs[0]);
436 }
437
438 private:
439 void
476
477 // store gfx version for version specific task handling
478 GfxVersion _gfxVersion;
479 // name of the kernel associated with the AQL entry
480 std::string kernName;
481 // workgroup Size (3 dimensions)
482 std::array<int, MAX_DIM> _wgSize;
483 // grid Size (3 dimensions)
484 std::array<int, MAX_DIM> _gridSize;
485 // total number of VGPRs per work-item
487 // total number of SGPRs per wavefront
489 // id of AQL queue in which this entry is placed
490 uint32_t _queueId;
492 // raw AQL packet pointer
493 void *dispPkt;
494 // host-side addr of the dispatch packet
496 // pointer to bool
498 // base address of the raw machine code
500 // base address of the kernel args
522 std::array<int, MAX_DIM> _wgId;
523 std::array<int, MAX_DIM> _numWg;
526 // The number of completed work groups
530
531 std::bitset<NumVectorInitFields> initialVgprState;
532 std::bitset<NumScalarInitFields> initialSgprState;
533
534 unsigned _accumOffset;
535
536 // For preloading args there are extra bytes of space after the dispatch
537 // packet containing values that should be preloaded into SGPRs. This
538 // field serves as a buffer to DMA into and therefore is sized at the
539 // max amount. It is of dword type to easily access during wave start.
540 unsigned _preloadLength = 0;
541 uint32_t _preloadArgs[KernargPreloadPktSize / sizeof(uint32_t)];
542};
543
544} // namespace gem5
545
546#endif // __GPU_COMPUTE_HSA_QUEUE_ENTRY__
Defines global host-dependent types: Counter, Tick, and (indirectly) {int,uint}{8,...
_amd_queue_t amdQueue
Keep a copy of the AMD HSA queue because we need info from some of its fields to initialize register ...
bool isInvStarted()
Whether invalidate has started or finished -1 is the initial value indicating inv has not started for...
uint32_t queueId() const
bool sgprBitEnabled(int bit) const
std::bitset< NumScalarInitFields > initialSgprState
int numWg(int dim) const
int wgId(int dim) const
const GfxVersion & gfxVersion() const
void preloadLength(unsigned val)
Addr hostDispPktAddr() const
static const int MAX_DIM
int wgSize(int dim) const
void wgId(int dim, int val)
void parseKernelCode(AMDKernelCode *akc)
Addr hostAMDQueueAddr
Host-side addr of the amd_queue_t on which this task was queued.
uint32_t _preloadArgs[KernargPreloadPktSize/sizeof(uint32_t)]
bool vgprBitEnabled(int bit) const
void markInvDone()
Forcefully change the state to be inv done.
const std::string & kernelName() const
std::array< int, MAX_DIM > _wgId
std::array< int, MAX_DIM > _gridSize
int _outstandingWbs
Number of outstanding wbs for the kernel values: 0: 1)initial value, flush has not started for the ke...
std::array< int, MAX_DIM > _wgSize
std::array< int, MAX_DIM > _numWg
Addr completionSignal() const
bool isInvDone() const
Is invalidate done?
int gridSize(int dim) const
unsigned preloadLength() const
int _outstandingInvs
Number of outstanding invs for the kernel.
std::bitset< NumVectorInitFields > initialVgprState
HSAQueueEntry(std::string kernel_name, uint32_t queue_id, int dispatch_id, void *disp_pkt, AMDKernelCode *akc, Addr host_pkt_addr, Addr code_addr, GfxVersion gfx_version)
unsigned accumOffset() const
void updateOutstandingWbs(int val)
Update the number of pending writeback requests.
void updateOutstandingInvs(int val)
update the number of pending invalidate requests
static constexpr T divCeil(const T &a, const U &b)
Definition intmath.hh:110
#define panic(...)
This implements a cprintf based panic() function.
Definition logging.hh:220
Bitfield< 7 > i
Definition misc_types.hh:67
Bitfield< 63 > val
Definition misc.hh:804
Copyright (c) 2024 Arm Limited All rights reserved.
Definition binary32.hh:36
struct gem5::GEM5_PACKED AMDKernelCode
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition types.hh:147
constexpr int KernargPreloadPktSize
The number of bytes after the dispatch packet which contain kernel arguments that should be preloaded...
@ WorkgroupIdX
@ DispatchId
@ DispatchPtr
@ QueuePtr
@ PrivSegWaveByteOffset
@ PrivateSegBuf
@ WorkgroupIdY
@ PrivateSegSize
@ WorkgroupInfo
@ WorkgroupIdZ
@ FlatScratchInit
@ KernargPreload
@ KernargSegPtr
@ WorkitemIdX
@ WorkitemIdZ
@ WorkitemIdY
uint32_t enable_sgpr_flat_scratch_init
uint32_t enable_sgpr_queue_ptr
uint32_t enable_private_segment
uint32_t enable_sgpr_workgroup_id_y
uint32_t enable_sgpr_dispatch_ptr
uint32_t enable_sgpr_workgroup_id_z
uint32_t enable_sgpr_dispatch_id
uint32_t enable_vgpr_workitem_id
uint32_t enable_sgpr_private_segment_size
uint32_t enable_sgpr_kernarg_segment_ptr
uint32_t enable_sgpr_private_segment_buffer
uint32_t enable_sgpr_workgroup_id_x
uint32_t enable_sgpr_workgroup_info

Generated on Mon Oct 27 2025 04:13:02 for gem5 by doxygen 1.14.0