gem5  v22.1.0.0
hsa_queue_entry.hh
/*
 * Copyright (c) 2017-2018 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __GPU_COMPUTE_HSA_QUEUE_ENTRY__
#define __GPU_COMPUTE_HSA_QUEUE_ENTRY__

#include <array>
#include <bitset>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <limits>
#include <vector>

#include "base/intmath.hh"
#include "base/types.hh"
#include "dev/hsa/hsa_packet.hh"
#include "dev/hsa/hsa_queue.hh"
#include "gpu-compute/kernel_code.hh"

namespace gem5
{

class HSAQueueEntry
{
  public:
    HSAQueueEntry(std::string kernel_name, uint32_t queue_id,
                  int dispatch_id, void *disp_pkt, AMDKernelCode *akc,
                  Addr host_pkt_addr, Addr code_addr)
        : kernName(kernel_name),
          _wgSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_x,
                   (int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_y,
                   (int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_z}},
          _gridSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_x,
                     (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_y,
                     (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_z}},
          numVgprs(akc->workitem_vgpr_count),
          numSgprs(akc->wavefront_sgpr_count),
          _queueId(queue_id), _dispatchId(dispatch_id), dispPkt(disp_pkt),
          _hostDispPktAddr(host_pkt_addr),
          _completionSignal(((_hsa_dispatch_packet_t*)disp_pkt)
                            ->completion_signal),
          codeAddress(code_addr),
          kernargAddress(((_hsa_dispatch_packet_t*)disp_pkt)->kernarg_address),
          _outstandingInvs(-1), _outstandingWbs(0),
          _ldsSize((int)((_hsa_dispatch_packet_t*)disp_pkt)->
                   group_segment_size),
          _privMemPerItem((int)((_hsa_dispatch_packet_t*)disp_pkt)->
                          private_segment_size),
          _contextId(0), _wgId{{ 0, 0, 0 }},
          _numWgTotal(1), numWgArrivedAtBarrier(0), _numWgCompleted(0),
          _globalWgId(0), dispatchComplete(false)
    {
        // Precompiled BLIT kernels actually violate the spec a bit
        // and don't set many of the required akc fields. For these kernels,
        // we need to rip register usage from the resource registers.
        //
        // We can't get an exact number of registers from the resource
        // registers because they round, but we can get an upper bound on it.
        if (!numVgprs)
            numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;

        if (!numSgprs || numSgprs ==
            std::numeric_limits<decltype(akc->wavefront_sgpr_count)>::max()) {
            // Supported major generation numbers: 0 (BLIT kernels), 8, and 9
            uint16_t version = akc->amd_machine_version_major;
            assert((version == 0) || (version == 8) || (version == 9));
            // SGPR allocation granularities:
            // - GFX8: 8
            // - GFX9: 16
            // Source: https://llvm.org/docs/AMDGPUUsage.html
            if ((version == 0) || (version == 8)) {
                // We assume that BLIT kernels use the same granularity as GFX8
                numSgprs = (akc->granulated_wavefront_sgpr_count + 1) * 8;
            } else if (version == 9) {
                numSgprs = ((akc->granulated_wavefront_sgpr_count + 1) * 16)/2;
            }
        }
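        // For illustration, with hypothetical counts: a granulated VGPR
        // count of 5 gives an upper bound of (5 + 1) * 4 = 24 VGPRs, and a
        // granulated SGPR count of 11 gives (11 + 1) * 8 = 96 SGPRs on
        // GFX8, or ((11 + 1) * 16) / 2 = 96 SGPRs on GFX9.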

        initialVgprState.reset();
        initialSgprState.reset();

        for (int i = 0; i < MAX_DIM; ++i) {
            _numWg[i] = divCeil(_gridSize[i], _wgSize[i]);
            _numWgTotal *= _numWg[i];
        }
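        // For example, a 1024x1x1 grid of 256x1x1 workgroups (hypothetical
        // sizes) yields _numWg = {4, 1, 1} and _numWgTotal = 4; divCeil
        // rounds up, so a 1000x1x1 grid would also need 4 workgroups in x.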

        parseKernelCode(akc);
    }

    const std::string&
    kernelName() const
    {
        return kernName;
    }

    int
    wgSize(int dim) const
    {
        assert(dim < MAX_DIM);
        return _wgSize[dim];
    }

    int
    gridSize(int dim) const
    {
        assert(dim < MAX_DIM);
        return _gridSize[dim];
    }

    int
    numVectorRegs() const
    {
        return numVgprs;
    }

    int
    numScalarRegs() const
    {
        return numSgprs;
    }

    uint32_t
    queueId() const
    {
        return _queueId;
    }

    int
    dispatchId() const
    {
        return _dispatchId;
    }

    void*
    dispPktPtr()
    {
        return dispPkt;
    }

    Addr
    hostDispPktAddr() const
    {
        return _hostDispPktAddr;
    }

    Addr
    completionSignal() const
    {
        return _completionSignal;
    }

    Addr
    codeAddr() const
    {
        return codeAddress;
    }

    Addr
    kernargAddr() const
    {
        return kernargAddress;
    }

    int
    ldsSize() const
    {
        return _ldsSize;
    }

    int privMemPerItem() const { return _privMemPerItem; }

    int
    contextId() const
    {
        return _contextId;
    }

    bool
    dispComplete() const
    {
        return dispatchComplete;
    }

    int
    wgId(int dim) const
    {
        assert(dim < MAX_DIM);
        return _wgId[dim];
    }

    void
    wgId(int dim, int val)
    {
        assert(dim < MAX_DIM);
        _wgId[dim] = val;
    }

    int
    globalWgId() const
    {
        return _globalWgId;
    }

    void
    globalWgId(int val)
    {
        _globalWgId = val;
    }

    int
    numWg(int dim) const
    {
        assert(dim < MAX_DIM);
        return _numWg[dim];
    }

    void
    notifyWgCompleted()
    {
        ++_numWgCompleted;
    }

    int
    numWgCompleted() const
    {
        return _numWgCompleted;
    }

    int
    numWgTotal() const
    {
        return _numWgTotal;
    }

    void
    updateNextWgCoord()
    {
        ++_wgId[0];
        ++_globalWgId;

        if (wgId(0) * wgSize(0) >= gridSize(0)) {
            _wgId[0] = 0;
            ++_wgId[1];

            if (wgId(1) * wgSize(1) >= gridSize(1)) {
                _wgId[1] = 0;
                ++_wgId[2];

                if (wgId(2) * wgSize(2) >= gridSize(2)) {
                    dispatchComplete = true;
                }
            }
        }
    }
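    // The traversal above advances the workgroup coordinates with x fastest,
    // then y, then z. For example, with a 2x2x1 arrangement of workgroups
    // (hypothetical sizes), successive calls step through (1,0,0), (0,1,0),
    // and (1,1,0) after the initial (0,0,0), and the next call marks the
    // dispatch complete.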

    int
    numWgAtBarrier() const
    {
        return numWgArrivedAtBarrier;
    }

    bool vgprBitEnabled(int bit) const
    {
        return initialVgprState.test(bit);
    }

    bool sgprBitEnabled(int bit) const
    {
        return initialSgprState.test(bit);
    }

    // host-side addr of the amd_queue_t on which this task was queued
    Addr hostAMDQueueAddr;

    // Keep a copy of the AMD HSA queue because we need info from some of
    // its fields to initialize register state.
    _amd_queue_t amdQueue;

    // the maximum number of dimensions for a grid or workgroup
    const static int MAX_DIM = 3;

    /* getter */
    int
    outstandingInvs()
    {
        return _outstandingInvs;
    }

    /**
     * Whether the invalidate has started or finished. -1 is the initial
     * value, indicating that the invalidate has not started for the kernel.
     */
    bool
    isInvStarted()
    {
        return (_outstandingInvs != -1);
    }

    /**
     * Update the number of pending invalidate requests.
     */
    void
    updateOutstandingInvs(int val)
    {
        _outstandingInvs += val;
        assert(_outstandingInvs >= 0);
    }

    /**
     * Forcefully change the state to be inv done.
     */
    void
    markInvDone()
    {
        _outstandingInvs = 0;
    }

    /**
     * Is the invalidate done?
     */
    bool
    isInvDone() const
    {
        assert(_outstandingInvs >= 0);
        return (_outstandingInvs == 0);
    }

    int
    outstandingWbs() const
    {
        return _outstandingWbs;
    }

    /**
     * Update the number of pending writeback requests.
     */
    void
    updateOutstandingWbs(int val)
    {
        _outstandingWbs += val;
        assert(_outstandingWbs >= 0);
    }

  private:
    void
    parseKernelCode(AMDKernelCode *akc)
    {
        // set the enable bits for the initial SGPR state
        initialSgprState.set(PrivateSegBuf,
                akc->enable_sgpr_private_segment_buffer);
        initialSgprState.set(DispatchPtr,
                akc->enable_sgpr_dispatch_ptr);
        initialSgprState.set(QueuePtr,
                akc->enable_sgpr_queue_ptr);
        initialSgprState.set(KernargSegPtr,
                akc->enable_sgpr_kernarg_segment_ptr);
        initialSgprState.set(DispatchId,
                akc->enable_sgpr_dispatch_id);
        initialSgprState.set(FlatScratchInit,
                akc->enable_sgpr_flat_scratch_init);
        initialSgprState.set(PrivateSegSize,
                akc->enable_sgpr_private_segment_size);
        initialSgprState.set(GridWorkgroupCountX,
                akc->enable_sgpr_grid_workgroup_count_x);
        initialSgprState.set(GridWorkgroupCountY,
                akc->enable_sgpr_grid_workgroup_count_y);
        initialSgprState.set(GridWorkgroupCountZ,
                akc->enable_sgpr_grid_workgroup_count_z);
        initialSgprState.set(WorkgroupIdX,
                akc->enable_sgpr_workgroup_id_x);
        initialSgprState.set(WorkgroupIdY,
                akc->enable_sgpr_workgroup_id_y);
        initialSgprState.set(WorkgroupIdZ,
                akc->enable_sgpr_workgroup_id_z);
        initialSgprState.set(WorkgroupInfo,
                akc->enable_sgpr_workgroup_info);
        initialSgprState.set(PrivSegWaveByteOffset,
                akc->enable_sgpr_private_segment_wave_byte_offset);

        // set the enable bits for the initial VGPR state; the work-item
        // ID in the x dimension is always initialized
        initialVgprState.set(WorkitemIdX, true);
        initialVgprState.set(WorkitemIdY, akc->enable_vgpr_workitem_id > 0);
        initialVgprState.set(WorkitemIdZ, akc->enable_vgpr_workitem_id > 1);
    }

    // name of the kernel associated with the AQL entry
    std::string kernName;
    // workgroup size (3 dimensions)
    std::array<int, MAX_DIM> _wgSize;
    // grid size (3 dimensions)
    std::array<int, MAX_DIM> _gridSize;
    // total number of VGPRs per work-item
    int numVgprs;
    // total number of SGPRs per wavefront
    int numSgprs;
    // id of AQL queue in which this entry is placed
    uint32_t _queueId;
    int _dispatchId;
    // raw AQL packet pointer
    void *dispPkt;
    // host-side addr of the dispatch packet
    Addr _hostDispPktAddr;
    // pointer to bool
    Addr _completionSignal;
    // base address of the raw machine code
    Addr codeAddress;
    // base address of the kernel args
    Addr kernargAddress;
    /**
     * Number of outstanding invs for the kernel.
     * values:
     *  -1: initial value, invalidate has not started for the kernel
     *   0: invalidate started and all outstanding invalidates are done
     *   a positive value: invalidate started and not yet done
     */
    int _outstandingInvs;
    /**
     * Number of outstanding wbs for the kernel.
     * values:
     *   0: initial value, flush has not started for the kernel, or all
     *      outstanding writebacks are done
     *   a positive value: writeback started and not yet done
     */
    int _outstandingWbs;
    // LDS (group segment) size requested by the dispatch packet, in bytes
    int _ldsSize;
    // private (scratch) memory size per work-item, in bytes
    int _privMemPerItem;
    int _contextId;
    // coordinates of the next workgroup to dispatch (3 dimensions)
    std::array<int, MAX_DIM> _wgId;
    // number of workgroups in each dimension
    std::array<int, MAX_DIM> _numWg;
    // total number of workgroups in the grid
    int _numWgTotal;
    int numWgArrivedAtBarrier;
    // The number of completed work groups
    int _numWgCompleted;
    // global (flattened) id of the next workgroup to dispatch
    int _globalWgId;
    // true once all workgroups in the grid have been dispatched
    bool dispatchComplete;

    std::bitset<NumVectorInitFields> initialVgprState;
    std::bitset<NumScalarInitFields> initialSgprState;
};

} // namespace gem5

#endif // __GPU_COMPUTE_HSA_QUEUE_ENTRY__
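
For context, here is a minimal sketch of how a dispatcher-side caller might construct an HSAQueueEntry from an AQL dispatch packet. The kernel name, packet fields, register counts, addresses, and the buildExampleTask() helper are hypothetical values chosen for illustration, not code taken from gem5 itself:

#include <cassert>

#include "gpu-compute/hsa_queue_entry.hh"

// Build a task for a hypothetical 1024x1x1 grid of 256x1x1 workgroups.
void
buildExampleTask()
{
    using namespace gem5;

    // Dispatch packet; unused fields stay zero-initialized for brevity.
    _hsa_dispatch_packet_t disp_pkt = {};
    disp_pkt.workgroup_size_x = 256;
    disp_pkt.workgroup_size_y = 1;
    disp_pkt.workgroup_size_z = 1;
    disp_pkt.grid_size_x = 1024;
    disp_pkt.grid_size_y = 1;
    disp_pkt.grid_size_z = 1;

    // Kernel descriptor with explicit register counts, so the
    // resource-register fallback in the constructor is not exercised.
    AMDKernelCode akc = {};
    akc.workitem_vgpr_count = 24;
    akc.wavefront_sgpr_count = 16;

    HSAQueueEntry task("vector_add", /*queue_id=*/0, /*dispatch_id=*/0,
                       &disp_pkt, &akc, /*host_pkt_addr=*/0x1000,
                       /*code_addr=*/0x2000);

    // divCeil(1024, 256) == 4 workgroups in x, 1 each in y and z.
    assert(task.numWgTotal() == 4);
    assert(task.wgSize(0) == 256 && task.gridSize(0) == 1024);
}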