gem5 v23.0.0.1
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
hsa_queue_entry.hh
Go to the documentation of this file.
1/*
2 * Copyright (c) 2017-2018 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
41#ifndef __GPU_COMPUTE_HSA_QUEUE_ENTRY__
42#define __GPU_COMPUTE_HSA_QUEUE_ENTRY__
43
44#include <bitset>
45#include <cstdint>
46#include <cstring>
47#include <iostream>
48#include <vector>
49
50#include "base/intmath.hh"
51#include "base/types.hh"
52#include "dev/hsa/hsa_packet.hh"
53#include "dev/hsa/hsa_queue.hh"
54#include "enums/GfxVersion.hh"
56
57namespace gem5
58{
59
61{
62 public:
63 HSAQueueEntry(std::string kernel_name, uint32_t queue_id,
64 int dispatch_id, void *disp_pkt, AMDKernelCode *akc,
65 Addr host_pkt_addr, Addr code_addr, GfxVersion gfx_version)
66 : kernName(kernel_name),
67 _wgSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_x,
68 (int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_y,
69 (int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_z}},
70 _gridSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_x,
71 (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_y,
72 (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_z}},
73 numVgprs(akc->workitem_vgpr_count),
74 numSgprs(akc->wavefront_sgpr_count),
75 _queueId(queue_id), _dispatchId(dispatch_id), dispPkt(disp_pkt),
76 _hostDispPktAddr(host_pkt_addr),
77 _completionSignal(((_hsa_dispatch_packet_t*)disp_pkt)
78 ->completion_signal),
79 codeAddress(code_addr),
80 kernargAddress(((_hsa_dispatch_packet_t*)disp_pkt)->kernarg_address),
81 _outstandingInvs(-1), _outstandingWbs(0),
82 _ldsSize((int)((_hsa_dispatch_packet_t*)disp_pkt)->
83 group_segment_size),
84 _privMemPerItem((int)((_hsa_dispatch_packet_t*)disp_pkt)->
85 private_segment_size),
86 _contextId(0), _wgId{{ 0, 0, 0 }},
87 _numWgTotal(1), numWgArrivedAtBarrier(0), _numWgCompleted(0),
88 _globalWgId(0), dispatchComplete(false)
89
90 {
91 // Precompiled BLIT kernels actually violate the spec a bit
92 // and don't set many of the required akc fields. For these kernels,
93 // we need to rip register usage from the resource registers.
94 //
95 // We can't get an exact number of registers from the resource
96 // registers because they round, but we can get an upper bound on it.
97 // We determine the number of registers by solving for "vgprs_used"
98 // in the LLVM docs: https://www.llvm.org/docs/AMDGPUUsage.html
99 // #code-object-v3-kernel-descriptor
100 // Currently, the only supported gfx version in gem5 that computes
101 // this differently is gfx90a.
102 if (!numVgprs) {
103 if (gfx_version == GfxVersion::gfx90a) {
104 numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 8;
105 } else {
106 numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;
107 }
108 }
109
110 if (!numSgprs || numSgprs ==
111 std::numeric_limits<decltype(akc->wavefront_sgpr_count)>::max()) {
112 // Supported major generation numbers: 0 (BLIT kernels), 8, and 9
113 uint16_t version = akc->amd_machine_version_major;
114 assert((version == 0) || (version == 8) || (version == 9));
115 // SGPR allocation granularies:
116 // - GFX8: 8
117 // - GFX9: 16
118 // Source: https://llvm.org/docs/AMDGPUUsage.html
119 if ((version == 0) || (version == 8)) {
120 // We assume that BLIT kernels use the same granularity as GFX8
121 numSgprs = (akc->granulated_wavefront_sgpr_count + 1) * 8;
122 } else if (version == 9) {
123 numSgprs = ((akc->granulated_wavefront_sgpr_count + 1) * 16)/2;
124 }
125 }
126
127 initialVgprState.reset();
128 initialSgprState.reset();
129
130 for (int i = 0; i < MAX_DIM; ++i) {
131 _numWg[i] = divCeil(_gridSize[i], _wgSize[i]);
132 _numWgTotal *= _numWg[i];
133 }
134
135 parseKernelCode(akc);
136 }
137
138 const std::string&
140 {
141 return kernName;
142 }
143
144 int
145 wgSize(int dim) const
146 {
147 assert(dim < MAX_DIM);
148 return _wgSize[dim];
149 }
150
151 int
152 gridSize(int dim) const
153 {
154 assert(dim < MAX_DIM);
155 return _gridSize[dim];
156 }
157
158 int
160 {
161 return numVgprs;
162 }
163
164 int
166 {
167 return numSgprs;
168 }
169
170 uint32_t
171 queueId() const
172 {
173 return _queueId;
174 }
175
176 int
178 {
179 return _dispatchId;
180 }
181
182 void*
184 {
185 return dispPkt;
186 }
187
188 Addr
190 {
191 return _hostDispPktAddr;
192 }
193
194 Addr
196 {
197 return _completionSignal;
198 }
199
200 Addr
201 codeAddr() const
202 {
203 return codeAddress;
204 }
205
206 Addr
208 {
209 return kernargAddress;
210 }
211
212 int
213 ldsSize() const
214 {
215 return _ldsSize;
216 }
217
218 int privMemPerItem() const { return _privMemPerItem; }
219
220 int
221 contextId() const
222 {
223 return _contextId;
224 }
225
226 bool
228 {
229 return dispatchComplete;
230 }
231
232 int
233 wgId(int dim) const
234 {
235 assert(dim < MAX_DIM);
236 return _wgId[dim];
237 }
238
239 void
240 wgId(int dim, int val)
241 {
242 assert(dim < MAX_DIM);
243 _wgId[dim] = val;
244 }
245
246 int
248 {
249 return _globalWgId;
250 }
251
252 void
254 {
255 _globalWgId = val;
256 }
257
258 int
259 numWg(int dim) const
260 {
261 assert(dim < MAX_DIM);
262 return _numWg[dim];
263 }
264
265 void
267 {
268 ++_numWgCompleted;
269 }
270
271 int
273 {
274 return _numWgCompleted;
275 }
276
277 int
279 {
280 return _numWgTotal;
281 }
282
283 void
285 {
286 ++_wgId[0];
287 ++_globalWgId;
288
289 if (wgId(0) * wgSize(0) >= gridSize(0)) {
290 _wgId[0] = 0;
291 ++_wgId[1];
292
293 if (wgId(1) * wgSize(1) >= gridSize(1)) {
294 _wgId[1] = 0;
295 ++_wgId[2];
296
297 if (wgId(2) * wgSize(2) >= gridSize(2)) {
298 dispatchComplete = true;
299 }
300 }
301 }
302 }
303
304 int
306 {
307 return numWgArrivedAtBarrier;
308 }
309
310 bool vgprBitEnabled(int bit) const
311 {
312 return initialVgprState.test(bit);
313 }
314
315 bool sgprBitEnabled(int bit) const
316 {
317 return initialSgprState.test(bit);
318 }
319
325
332
333 // the maximum number of dimensions for a grid or workgroup
334 const static int MAX_DIM = 3;
335
336 /* getter */
337 int
339 return _outstandingInvs;
340 }
341
347 bool
349 {
350 return (_outstandingInvs != -1);
351 }
352
358 void
360 {
361 _outstandingInvs += val;
362 assert(_outstandingInvs >= 0);
363 }
364
368 void
370 {
371 _outstandingInvs = 0;
372 }
373
377 bool
378 isInvDone() const
379 {
380 assert(_outstandingInvs >= 0);
381 return (_outstandingInvs == 0);
382 }
383
384 int
386 {
387 return _outstandingWbs;
388 }
389
395 void
397 {
398 _outstandingWbs += val;
399 assert(_outstandingWbs >= 0);
400 }
401
402 private:
403 void
405 {
407 initialSgprState.set(PrivateSegBuf,
409 initialSgprState.set(DispatchPtr,
411 initialSgprState.set(QueuePtr,
413 initialSgprState.set(KernargSegPtr,
415 initialSgprState.set(DispatchId,
417 initialSgprState.set(FlatScratchInit,
419 initialSgprState.set(PrivateSegSize,
421 initialSgprState.set(GridWorkgroupCountX,
423 initialSgprState.set(GridWorkgroupCountY,
425 initialSgprState.set(GridWorkgroupCountZ,
427 initialSgprState.set(WorkgroupIdX,
429 initialSgprState.set(WorkgroupIdY,
431 initialSgprState.set(WorkgroupIdZ,
433 initialSgprState.set(WorkgroupInfo,
435 initialSgprState.set(PrivSegWaveByteOffset,
437
442 initialVgprState.set(WorkitemIdX, true);
443 initialVgprState.set(WorkitemIdY, akc->enable_vgpr_workitem_id > 0);
444 initialVgprState.set(WorkitemIdZ, akc->enable_vgpr_workitem_id > 1);
445 }
446
447 // name of the kernel associated with the AQL entry
448 std::string kernName;
449 // workgroup Size (3 dimensions)
450 std::array<int, MAX_DIM> _wgSize;
451 // grid Size (3 dimensions)
452 std::array<int, MAX_DIM> _gridSize;
453 // total number of VGPRs per work-item
455 // total number of SGPRs per wavefront
457 // id of AQL queue in which this entry is placed
458 uint32_t _queueId;
460 // raw AQL packet pointer
461 void *dispPkt;
462 // host-side addr of the dispatch packet
464 // pointer to bool
466 // base address of the raw machine code
468 // base address of the kernel args
490 std::array<int, MAX_DIM> _wgId;
491 std::array<int, MAX_DIM> _numWg;
494 // The number of completed work groups
498
499 std::bitset<NumVectorInitFields> initialVgprState;
500 std::bitset<NumScalarInitFields> initialSgprState;
501};
502
503} // namespace gem5
504
505#endif // __GPU_COMPUTE_HSA_QUEUE_ENTRY__
Defines global host-dependent types: Counter, Tick, and (indirectly) {int,uint}{8,...
_amd_queue_t amdQueue
Keep a copy of the AMD HSA queue because we need info from some of its fields to initialize register ...
bool isInvStarted()
Whether invalidate has started or finished -1 is the initial value indicating inv has not started for...
uint32_t queueId() const
bool sgprBitEnabled(int bit) const
std::bitset< NumScalarInitFields > initialSgprState
int numWg(int dim) const
int wgId(int dim) const
Addr hostDispPktAddr() const
int wgSize(int dim) const
void wgId(int dim, int val)
void parseKernelCode(AMDKernelCode *akc)
Addr hostAMDQueueAddr
Host-side addr of the amd_queue_t on which this task was queued.
bool vgprBitEnabled(int bit) const
void markInvDone()
Forcefully change the state to be inv done.
const std::string & kernelName() const
std::array< int, MAX_DIM > _wgId
std::array< int, MAX_DIM > _gridSize
int _outstandingWbs
Number of outstanding wbs for the kernel values: 0: 1)initial value, flush has not started for the ke...
std::array< int, MAX_DIM > _wgSize
std::array< int, MAX_DIM > _numWg
Addr completionSignal() const
bool isInvDone() const
Is invalidate done?
int gridSize(int dim) const
int _outstandingInvs
Number of outstanding invs for the kernel.
std::bitset< NumVectorInitFields > initialVgprState
HSAQueueEntry(std::string kernel_name, uint32_t queue_id, int dispatch_id, void *disp_pkt, AMDKernelCode *akc, Addr host_pkt_addr, Addr code_addr, GfxVersion gfx_version)
void updateOutstandingWbs(int val)
Update the number of pending writeback requests.
void updateOutstandingInvs(int val)
update the number of pending invalidate requests
static constexpr T divCeil(const T &a, const U &b)
Definition intmath.hh:110
Bitfield< 7 > i
Definition misc_types.hh:67
Bitfield< 63 > val
Definition misc.hh:776
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition types.hh:147
@ GridWorkgroupCountZ
@ WorkgroupIdX
@ DispatchId
@ DispatchPtr
@ QueuePtr
@ PrivSegWaveByteOffset
@ PrivateSegBuf
@ WorkgroupIdY
@ PrivateSegSize
@ WorkgroupInfo
@ GridWorkgroupCountY
@ WorkgroupIdZ
@ GridWorkgroupCountX
@ FlatScratchInit
@ KernargSegPtr
@ WorkitemIdX
@ WorkitemIdZ
@ WorkitemIdY
uint32_t enable_sgpr_workgroup_info
uint32_t enable_sgpr_queue_ptr
uint32_t enable_sgpr_grid_workgroup_count_x
uint32_t enable_sgpr_dispatch_ptr
uint32_t enable_sgpr_dispatch_id
uint32_t enable_vgpr_workitem_id
uint32_t enable_sgpr_private_segment_wave_byte_offset
uint32_t enable_sgpr_workgroup_id_y
uint32_t enable_sgpr_grid_workgroup_count_y
uint32_t enable_sgpr_workgroup_id_x
uint32_t enable_sgpr_workgroup_id_z
uint32_t enable_sgpr_private_segment_size
uint32_t enable_sgpr_private_segment_buffer
uint32_t enable_sgpr_flat_scratch_init
uint32_t enable_sgpr_grid_workgroup_count_z
uint32_t enable_sgpr_kernarg_segment_ptr

Generated on Mon Jul 10 2023 15:32:03 for gem5 by doxygen 1.9.7