gem5  v20.0.0.3
wavefront.hh
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2011-2017 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * For use for simulation and test purposes only
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright notice,
11  * this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  * this list of conditions and the following disclaimer in the documentation
15  * and/or other materials provided with the distribution.
16  *
17  * 3. Neither the name of the copyright holder nor the names of its
18  * contributors may be used to endorse or promote products derived from this
19  * software without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #ifndef __WAVEFRONT_HH__
35 #define __WAVEFRONT_HH__
36 
37 #include <cassert>
38 #include <deque>
39 #include <memory>
40 #include <stack>
41 #include <vector>
42 
43 #include "arch/gpu_isa.hh"
44 #include "base/logging.hh"
45 #include "base/types.hh"
46 #include "config/the_gpu_isa.hh"
48 #include "gpu-compute/lds_state.hh"
49 #include "gpu-compute/misc.hh"
50 #include "gpu-compute/ndrange.hh"
51 #include "params/Wavefront.hh"
52 #include "sim/sim_object.hh"
53 
54 static const int MAX_NUM_INSTS_PER_WF = 12;
55 
64  uint32_t pc;
70  uint32_t rpc;
75 };
76 
77 /*
78  * Arguments for the hsail opcode call are user defined and variable length.
79  * The hardware/finalizer can support arguments in hardware or use memory to
80  * pass arguments. For now, let's assume that an unlimited number of arguments
81  * are supported in hardware (the compiler inlines functions whenever it can
82  * anyways, so unless someone is interested in the implications of linking/
83  * library functions, I think this is a reasonable assumption given the typical
84  * size of an OpenCL kernel).
85  *
86  * Note that call args are different than kernel arguments:
87  * * All work-items in a kernel refer to the same set of kernel arguments
88  * * Each work-item has its own set of call args. So a call argument at
89  * address 0x4 is different for work-item 0 and work-item 1.
90  *
91  * Ok, the table below shows an example of how we organize the call arguments in
92  * the CallArgMem class.
93  *
94  * int foo(int arg1, double arg2)
95  * ___________________________________________________
96  * | 0: return.0 | 4: return.1 | ... | 252: return.63 |
97  * |---------------------------------------------------|
98  * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 |
99  * |---------------------------------------------------|
100  * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 |
101  * ___________________________________________________
102  */
104 {
105  public:
106  // pointer to buffer for storing function arguments
107  uint8_t *mem;
108  int wfSize;
109  // size of function args
111 
// Byte offset, from the start of the argument buffer, of the slot holding
// the call argument at `addr` for work-item `lane`: addr * wfSize selects
// the argument's row (see the layout table above) and sizeof(CType) * lane
// selects the lane within that row.
// The sum is evaluated in size_t (because of sizeof) and then narrowed to
// the int return type; no bounds check against the buffer is made here.
112  template<typename CType>
113  int
114  getLaneOffset(int lane, int addr)
115  {
116  return addr * wfSize + sizeof(CType) * lane;
117  }
118 
// Records the wavefront size and the per-work-item argument size, then
// allocates funcArgsSizePerItem * wfSize bytes for the argument buffer.
// NOTE(review): the malloc result is stored in `mem` without a null check,
// and <cstdlib> is not among the includes visible in this listing - confirm
// it is pulled in transitively.
119  CallArgMem(int func_args_size_per_item, int wf_size)
120  : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
121  {
122  mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize);
123  }
124 
126  {
127  free(mem);
128  }
129 
// Returns a byte pointer into the argument buffer at the position
// getLaneOffset<CType>(lane, addr). CType only affects the per-lane stride
// used for the offset; the pointer itself is returned as uint8_t* and is
// not bounds-checked here.
130  template<typename CType>
131  uint8_t*
132  getLaneAddr(int lane, int addr)
133  {
134  return mem + getLaneOffset<CType>(lane, addr);
135  }
136 
// Stores `val` as a CType at the buffer position
// getLaneOffset<CType>(lane, addr). Despite the name, this writes the
// value held at that lane address rather than changing an address.
// NOTE(review): the store goes through a cast of the byte pointer to
// CType*, which presumes the computed offset is suitably aligned for
// CType - confirm against callers.
137  template<typename CType>
138  void
139  setLaneAddr(int lane, int addr, CType val)
140  {
141  *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val;
142  }
143 };
144 
145 class Wavefront : public SimObject
146 {
147  public:
148  enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE};
149  enum status_e {S_STOPPED,S_RETURNING,S_RUNNING};
150 
151  // Base pointer for array of instruction pointers
152  uint64_t basePtr;
153 
154  uint32_t oldBarrierCnt;
155  uint32_t barrierCnt;
156  uint32_t barrierId;
157  uint32_t barrierSlots;
159  // HW slot id where the WF is mapped to inside a SIMD unit
160  int wfSlotId;
161  int kernId;
162  // SIMD unit where the WF has been scheduled
163  int simdId;
164  // pointer to parent CU
166 
168 
170  bool dropFetch;
171 
172  // Condition Register State (for HSAIL simulations only)
174  // number of single precision VGPRs required by WF
175  uint32_t maxSpVgprs;
176  // number of double precision VGPRs required by WF
177  uint32_t maxDpVgprs;
178  // map virtual to physical vector register
179  uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
180  void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
181  bool isGmInstruction(GPUDynInstPtr ii);
182  bool isLmInstruction(GPUDynInstPtr ii);
183  bool isOldestInstGMem();
184  bool isOldestInstLMem();
185  bool isOldestInstPrivMem();
186  bool isOldestInstFlatMem();
187  bool isOldestInstALU();
188  bool isOldestInstBarrier();
189  // used for passing spill address to DDInstGPU
191  std::vector<uint32_t> workItemId[3];
193  /* kernel launch parameters */
194  uint32_t workGroupId[3];
195  uint32_t workGroupSz[3];
196  uint32_t gridSz[3];
197  uint32_t wgId;
198  uint32_t wgSz;
199  /* the actual WG size can differ from the maximum size */
200  uint32_t actualWgSz[3];
201  uint32_t actualWgSzTotal;
202  void computeActualWgSz(NDRange *ndr);
203  // wavefront id within a workgroup
204  uint32_t wfId;
205  uint32_t maxDynWaveId;
206  uint32_t dispatchId;
207  // outstanding global+local memory requests
208  uint32_t outstandingReqs;
209  // memory requests between scoreboard
210  // and execute stage not yet executed
211  uint32_t memReqsInPipe;
212  // outstanding global memory write requests
214  // outstanding local memory write requests
216  // outstanding global memory read requests
218  // outstanding local memory read requests
220  uint32_t rdLmReqsInPipe;
221  uint32_t rdGmReqsInPipe;
222  uint32_t wrLmReqsInPipe;
223  uint32_t wrGmReqsInPipe;
224 
226  uint64_t lastTrace;
227  // number of vector registers reserved by WF
229  // Index into the Vector Register File's namespace where the WF's registers
230  // will live while the WF is executed
231  uint32_t startVgprIndex;
232 
233  // Old value of destination gpr (for trace)
235  // Id of destination gpr (for trace)
236  uint32_t oldVgprId;
237  // Tick count of last old_vgpr copy
238  uint64_t oldVgprTcnt;
239 
240  // Old value of destination gpr (for trace)
242  // Id of destination gpr (for trace)
243  uint32_t oldDgprId;
244  // Tick count of last old_dgpr copy
245  uint64_t oldDgprTcnt;
246 
247  // Execution mask at wavefront start
249 
250  // number of barriers this WF has joined
253  // Flag to stall a wave on barrier
255 
256  // a pointer to the fraction of the LDS allocated
257  // to this workgroup (thus this wavefront)
259 
260  // A pointer to the spill area
262  // The size of the spill area
264  // The vector width of the spill area
265  uint32_t spillWidth;
266 
267  // A pointer to the private memory area
269  // The size of the private memory area
270  uint32_t privSizePerItem;
271 
272  // A pointer to the read-only memory area
274  // size of the read-only memory area
275  uint32_t roSize;
276 
277  // pointer to buffer for storing kernel arguments
278  uint8_t *kernelArgs;
279  // unique WF id over all WFs executed across all CUs
280  uint64_t wfDynId;
281 
282  // number of times instruction issue for this wavefront is blocked
283  // due to VRF port availability
285  // number of times an instruction of a WF is blocked from being issued
286  // due to WAR and WAW dependencies
288  // number of times an instruction of a WF is blocked from being issued
289  // due to RAW dependencies
291  // distribution of executed instructions based on their register
292  // operands; this is used to highlight the load on the VRF
295 
296  // Functions to operate on call argument memory
297  // argument memory for hsail call instruction
// Allocates a new CallArgMem sized by the given per-work-item argument
// size and wavefront size, and stores the pointer in callArgMem.
// NOTE(review): whatever callArgMem pointed to before is overwritten
// without being released here - confirm this is called at most once per
// allocation, or that the previous object is freed elsewhere.
299  void
300  initCallArgMem(int func_args_size_per_item, int wf_size)
301  {
302  callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
303  }
304 
// Returns the CType value stored in call-argument memory for work-item
// `lane` at argument address `addr`, by dereferencing the pointer obtained
// from callArgMem->getLaneAddr<CType>(lane, addr).
// callArgMem is dereferenced without a null check, so initCallArgMem()
// is presumably expected to have run first - confirm.
305  template<typename CType>
306  CType
307  readCallArgMem(int lane, int addr)
308  {
309  return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
310  }
311 
// Writes `val` into call-argument memory for work-item `lane` at argument
// address `addr` by forwarding to callArgMem->setLaneAddr<CType>.
// callArgMem is dereferenced without a null check.
312  template<typename CType>
313  void
314  writeCallArgMem(int lane, int addr, CType val)
315  {
316  callArgMem->setLaneAddr<CType>(lane, addr, val);
317  }
318 
319  typedef WavefrontParams Params;
320  Wavefront(const Params *p);
321  ~Wavefront();
322  virtual void init();
323 
324  void
326  {
327  computeUnit = cu;
328  }
329 
330  void start(uint64_t _wfDynId, uint64_t _base_ptr);
331  void exec();
332  void updateResources();
333  int ready(itype_e type);
334  bool instructionBufferHasBranch();
335  void regStats();
// Returns the bitwise AND of the mask returned by execMask() and initMask,
// i.e. only lanes set in both masks remain set in the result.
336  VectorMask getPred() { return execMask() & initMask; }
337 
338  bool waitingAtBarrier(int lane);
339 
340  void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
341  const VectorMask& exec_mask);
342 
343  void popFromReconvergenceStack();
344 
345  uint32_t pc() const;
346 
347  uint32_t rpc() const;
348 
349  VectorMask execMask() const;
350 
351  bool execMask(int lane) const;
352 
353  void pc(uint32_t new_pc);
354 
355  void discardFetch();
356 
361  uint32_t getStaticContextSize() const;
362 
367  void getContext(const void *out);
368 
373  void setContext(const void *in);
374 
375  TheGpuISA::GPUISA&
377  {
378  return _gpuISA;
379  }
380 
381  private:
382  TheGpuISA::GPUISA _gpuISA;
391 };
392 
393 #endif // __WAVEFRONT_HH__
Addr roBase
Definition: wavefront.hh:273
std::vector< uint32_t > oldVgpr
Definition: wavefront.hh:234
Addr spillBase
Definition: wavefront.hh:261
VectorMask getPred()
Definition: wavefront.hh:336
uint32_t oldDgprId
Definition: wavefront.hh:243
Stats::Scalar numTimesBlockedDueRAWDependencies
Definition: wavefront.hh:290
CallArgMem(int func_args_size_per_item, int wf_size)
Definition: wavefront.hh:119
uint32_t barrierCnt
Definition: wavefront.hh:155
Stats::Scalar numTimesBlockedDueVrfPortAvail
Definition: wavefront.hh:284
std::deque< std::unique_ptr< ReconvergenceStackEntry > > reconvergenceStack
Stack containing Control Flow Graph nodes (i.e., kernel instructions) to be visited by the wavefront...
Definition: wavefront.hh:390
uint8_t * mem
Definition: wavefront.hh:107
int maxBarCnt
Definition: wavefront.hh:252
std::bitset< std::numeric_limits< unsigned long long >::digits > VectorMask
Definition: misc.hh:43
ip6_addr_t addr
Definition: inet.hh:330
VectorMask initMask
Definition: wavefront.hh:248
uint32_t wgSz
Definition: wavefront.hh:198
uint32_t spillWidth
Definition: wavefront.hh:265
int simdId
Definition: wavefront.hh:163
bool dropFetch
Definition: wavefront.hh:170
uint32_t dispatchId
Definition: wavefront.hh:206
int kernId
Definition: wavefront.hh:161
class ConditionRegisterState * condRegState
Definition: wavefront.hh:173
this represents a slice of the overall LDS, intended to be associated with an individual workgroup ...
Definition: lds_state.hh:56
Bitfield< 4, 0 > mode
int wfSlotId
Definition: wavefront.hh:160
bool stalledAtBarrier
Definition: wavefront.hh:254
uint32_t maxSpVgprs
Definition: wavefront.hh:175
LdsChunk * ldsChunk
Definition: wavefront.hh:258
Stats::Scalar numTimesBlockedDueWAXDependencies
Definition: wavefront.hh:287
uint32_t oldVgprId
Definition: wavefront.hh:236
This is a simple scalar statistic, like a counter.
Definition: statistics.hh:2505
uint64_t lastTrace
Definition: wavefront.hh:226
Bitfield< 63 > val
Definition: misc.hh:769
uint64_t wfDynId
Definition: wavefront.hh:280
uint32_t barrierSlots
Definition: wavefront.hh:157
CallArgMem * callArgMem
Definition: wavefront.hh:298
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition: misc.hh:46
Stats::Distribution srcRegOpDist
Definition: wavefront.hh:293
uint8_t type
Definition: inet.hh:328
uint32_t pc
PC of current instruction.
Definition: wavefront.hh:64
std::deque< GPUDynInstPtr > instructionBuffer
Definition: wavefront.hh:167
void initCallArgMem(int func_args_size_per_item, int wf_size)
Definition: wavefront.hh:300
uint32_t wfId
Definition: wavefront.hh:204
uint32_t rdLmReqsInPipe
Definition: wavefront.hh:220
Addr privBase
Definition: wavefront.hh:268
std::vector< uint32_t > workItemFlatId
Definition: wavefront.hh:192
void writeCallArgMem(int lane, int addr, CType val)
Definition: wavefront.hh:314
void exec(GPUDynInstPtr ii, Wavefront *w)
A simple distribution stat.
Definition: statistics.hh:2589
CType readCallArgMem(int lane, int addr)
Definition: wavefront.hh:307
std::vector< int > barCnt
Definition: wavefront.hh:251
void setParent(ComputeUnit *cu)
Definition: wavefront.hh:325
ComputeUnit * computeUnit
Definition: wavefront.hh:165
uint32_t wgId
Definition: wavefront.hh:197
Stats::Distribution dstRegOpDist
Definition: wavefront.hh:294
uint32_t rdGmReqsInPipe
Definition: wavefront.hh:221
uint32_t outstandingReqsWrLm
Definition: wavefront.hh:215
uint32_t actualWgSzTotal
Definition: wavefront.hh:201
uint32_t outstandingReqsRdGm
Definition: wavefront.hh:217
int memTraceBusy
Definition: wavefront.hh:225
WavefrontParams Params
Definition: wavefront.hh:319
Defines global host-dependent types: Counter, Tick, and (indirectly) {int,uint}{8,16,32,64}_t.
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition: types.hh:140
uint32_t outstandingReqsRdLm
Definition: wavefront.hh:219
int getLaneOffset(int lane, int addr)
Definition: wavefront.hh:114
uint64_t basePtr
Definition: wavefront.hh:152
uint32_t outstandingReqs
Definition: wavefront.hh:208
uint32_t privSizePerItem
Definition: wavefront.hh:270
TheGpuISA::GPUISA & gpuISA()
Definition: wavefront.hh:376
uint32_t outstandingReqsWrGm
Definition: wavefront.hh:213
std::vector< Addr > lastAddr
Definition: wavefront.hh:190
TheGpuISA::GPUISA _gpuISA
Definition: wavefront.hh:382
uint32_t oldBarrierCnt
Definition: wavefront.hh:154
uint64_t oldDgprTcnt
Definition: wavefront.hh:245
bool pendingFetch
Definition: wavefront.hh:169
uint64_t oldVgprTcnt
Definition: wavefront.hh:238
uint32_t memReqsInPipe
Definition: wavefront.hh:211
int reservedVectorRegs
Definition: wavefront.hh:228
uint32_t startVgprIndex
Definition: wavefront.hh:231
A reconvergence stack entry conveys the necessary state to implement control flow divergence...
Definition: wavefront.hh:60
void setLaneAddr(int lane, int addr, CType val)
Definition: wavefront.hh:139
uint32_t roSize
Definition: wavefront.hh:275
uint32_t wrGmReqsInPipe
Definition: wavefront.hh:223
uint8_t * getLaneAddr(int lane, int addr)
Definition: wavefront.hh:132
uint32_t spillSizePerItem
Definition: wavefront.hh:263
static const int MAX_NUM_INSTS_PER_WF
Definition: wavefront.hh:54
std::vector< uint64_t > oldDgpr
Definition: wavefront.hh:241
uint32_t maxDynWaveId
Definition: wavefront.hh:205
uint32_t wrLmReqsInPipe
Definition: wavefront.hh:222
uint32_t maxDpVgprs
Definition: wavefront.hh:177
uint32_t barrierId
Definition: wavefront.hh:156
Bitfield< 0 > p
Abstract superclass for simulation objects.
Definition: sim_object.hh:93
int funcArgsSizePerItem
Definition: wavefront.hh:110
status_e status
Definition: wavefront.hh:158
const FlagsType init
This Stat is Initialized.
Definition: info.hh:45
uint32_t rpc
PC of the immediate post-dominator instruction, i.e., the value of pc for the first instruction that ...
Definition: wavefront.hh:70
VectorMask execMask
Execution mask.
Definition: wavefront.hh:74
uint8_t * kernelArgs
Definition: wavefront.hh:278

Generated on Fri Jul 3 2020 15:53:03 for gem5 by doxygen 1.8.13