gem5  v20.0.0.3
gpu_dyn_inst.hh
/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__

#include <cstdint>
#include <string>

#include "base/amo.hh"
#include "base/logging.hh"
#include "enums/MemType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"

class GPUStaticInst;

template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    T c;
    T s;

    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
      : c(_c), s(_s), computeUnit(compute_unit) { }

    void
    execute(T *b)
    {
        computeUnit->numCASOps++;

        if (*b == c) {
            *b = s;
        } else {
            computeUnit->numFailedCASOps++;
        }

        if (computeUnit->xact_cas_mode) {
            computeUnit->xactCasLoadMap.clear();
        }
    }
    AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); }
};

typedef enum
{
    VT_32,
    VT_64,
} vgpr_type;

class GPUDynInst : public GPUExecContext
{
  public:
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
               uint64_t instSeqNum);
    ~GPUDynInst();
    void execute(GPUDynInstPtr gpuDynInst);
    int numSrcRegOperands();
    int numDstRegOperands();
    int getNumOperands();
    bool isVectorRegister(int operandIdx);
    bool isScalarRegister(int operandIdx);
    bool isCondRegister(int operandIdx);
    int getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst);
    int getOperandSize(int operandIdx);
    bool isDstOperand(int operandIdx);
    bool isSrcOperand(int operandIdx);

    const std::string &disassemble() const;

    uint64_t seqNum() const;

    Enums::StorageClassType executedAs();

    // The address of the memory operation
    std::vector<Addr> addr;
    Addr pAddr;

    // The data to get written
    uint8_t *d_data;
    // Additional data (for atomics)
    uint8_t *a_data;
    // Additional data (for atomics)
    uint8_t *x_data;
    // The execution mask
    VectorMask exec_mask;

    // The memory type (M_U32, M_S32, ...)
    Enums::MemType m_type;

    // The equivalency class
    int equiv;
    // The return VGPR type (VT_32 or VT_64)
    vgpr_type v_type;
    // Number of VGPRs accessed (1, 2, or 4)
    int n_reg;
    // The return VGPR index
    int dst_reg;
    // There can be at most 4 dest regs
    int dst_reg_vec[4];
    // SIMD unit to which the WF of the memory instruction has been mapped
    int simdId;
    // unique id of the WF to which the memory instruction belongs
    int wfDynId;
    // The kernel id of the requesting wf
    int kern_id;
    // The CU id of the requesting wf
    int cu_id;
    // HW slot id to which the WF is mapped inside a SIMD unit
    int wfSlotId;
    // execution pipeline id where the memory instruction has been scheduled
    int pipeId;
    // The execution time of this operation
    Tick time;
    // The latency of this operation
    WaitClass latency;
    // A list of bank conflicts for the 4 cycles.
    uint32_t bc[4];

    // A pointer to ROM
    uint8_t *rom;
    // The size of the READONLY segment
    int sz_rom;

    // Initiate the specified memory operation by creating a
    // memory request and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);
    // Complete the specified memory operation by writing the
    // value back to the RF in the case of a load or atomic
    // return; in the case of a store, nothing needs to be done.
    void completeAcc(GPUDynInstPtr gpuDynInst);

    void updateStats();

    GPUStaticInst* staticInstruction() { return _staticInst; }

    bool isALU() const;
    bool isBranch() const;
    bool isNop() const;
    bool isReturn() const;
    bool isUnconditionalJump() const;
    bool isSpecialOp() const;
    bool isWaitcnt() const;

    bool isBarrier() const;
    bool isMemFence() const;
    bool isMemRef() const;
    bool isFlat() const;
    bool isLoad() const;
    bool isStore() const;

    bool isAtomic() const;
    bool isAtomicNoRet() const;
    bool isAtomicRet() const;

    bool isScalar() const;
    bool readsSCC() const;
    bool writesSCC() const;
    bool readsVCC() const;
    bool writesVCC() const;

    bool isAtomicAnd() const;
    bool isAtomicOr() const;
    bool isAtomicXor() const;
    bool isAtomicCAS() const;
    bool isAtomicExch() const;
    bool isAtomicAdd() const;
    bool isAtomicSub() const;
    bool isAtomicInc() const;
    bool isAtomicDec() const;
    bool isAtomicMax() const;
    bool isAtomicMin() const;

    bool isArgLoad() const;
    bool isGlobalMem() const;
    bool isLocalMem() const;

    bool isArgSeg() const;
    bool isGlobalSeg() const;
    bool isGroupSeg() const;
    bool isKernArgSeg() const;
    bool isPrivateSeg() const;
    bool isReadOnlySeg() const;
    bool isSpillSeg() const;

    bool isWorkitemScope() const;
    bool isWavefrontScope() const;
    bool isWorkgroupScope() const;
    bool isDeviceScope() const;
    bool isSystemScope() const;
    bool isNoScope() const;

    bool isRelaxedOrder() const;
    bool isAcquire() const;
    bool isRelease() const;
    bool isAcquireRelease() const;
    bool isNoOrder() const;

    bool isGloballyCoherent() const;
    bool isSystemCoherent() const;

    /*
     * Loads/stores/atomics may have acquire/release semantics associated
     * with them. Some protocols want to see the acquire/release as separate
     * requests from the load/store/atomic. We implement that separation
     * using continuations (i.e., a function pointer with an object associated
     * with it). When, for example, the front-end generates a store with
     * release semantics, we will first issue a normal store and set the
     * continuation in the GPUDynInst to a function that generates a
     * release request. That continuation will be called when the normal
     * store completes (in ComputeUnit::DataPort::recvTimingResponse). The
     * continuation will be called in the context of the same GPUDynInst
     * that generated the initial store. (A standalone sketch of this
     * pattern appears after the end of this listing.)
     */
    std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;

    // when true, call execContinuation when response arrives
    bool useContinuation;

    template<typename c0> AtomicOpFunctorPtr
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
    {
        if (isAtomicAnd()) {
            return m5::make_unique<AtomicOpAnd<c0>>(*reg0);
        } else if (isAtomicOr()) {
            return m5::make_unique<AtomicOpOr<c0>>(*reg0);
        } else if (isAtomicXor()) {
            return m5::make_unique<AtomicOpXor<c0>>(*reg0);
        } else if (isAtomicCAS()) {
            return m5::make_unique<AtomicOpCAS<c0>>(*reg0, *reg1, cu);
        } else if (isAtomicExch()) {
            return m5::make_unique<AtomicOpExch<c0>>(*reg0);
        } else if (isAtomicAdd()) {
            return m5::make_unique<AtomicOpAdd<c0>>(*reg0);
        } else if (isAtomicSub()) {
            return m5::make_unique<AtomicOpSub<c0>>(*reg0);
        } else if (isAtomicInc()) {
            return m5::make_unique<AtomicOpInc<c0>>();
        } else if (isAtomicDec()) {
            return m5::make_unique<AtomicOpDec<c0>>();
        } else if (isAtomicMax()) {
            return m5::make_unique<AtomicOpMax<c0>>(*reg0);
        } else if (isAtomicMin()) {
            return m5::make_unique<AtomicOpMin<c0>>(*reg0);
        } else {
            fatal("Unrecognized atomic operation");
        }
    }

    void
    setRequestFlags(RequestPtr req, bool setMemOrder=true)
    {
        // currently these are the easy scopes to deduce
        if (isPrivateSeg()) {
            req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
        } else if (isSpillSeg()) {
            req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
        } else if (isGlobalSeg()) {
            req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
        } else if (isReadOnlySeg()) {
            req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
        } else if (isGroupSeg()) {
            req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
        } else if (isFlat()) {
            panic("TODO: translate to correct scope");
        } else {
            fatal("%s has bad segment type\n", disassemble());
        }

        if (isWavefrontScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WAVEFRONT_SCOPE);
        } else if (isWorkgroupScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WORKGROUP_SCOPE);
        } else if (isDeviceScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::DEVICE_SCOPE);
        } else if (isSystemScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::SYSTEM_SCOPE);
        } else if (!isNoScope() && !isWorkitemScope()) {
            fatal("%s has bad scope type\n", disassemble());
        }

        if (setMemOrder) {
            // set acquire and release flags
            if (isAcquire()) {
                req->setFlags(Request::ACQUIRE);
            } else if (isRelease()) {
                req->setFlags(Request::RELEASE);
            } else if (isAcquireRelease()) {
                req->setFlags(Request::ACQUIRE | Request::RELEASE);
            } else if (!isNoOrder()) {
                fatal("%s has bad memory order\n", disassemble());
            }
        }

        // set atomic type
        // currently, the instruction generator only produces atomic return
        // but a magic instruction can produce atomic no return
        if (isAtomicRet()) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (isAtomicNoRet()) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }
    }

    // Map the returned packets and the addresses they satisfy to the lanes
    // they were requested from
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;

    // Track the status of memory requests per lane, a bit per lane
    VectorMask statusBitVector;
    // for ld_v# or st_v#
    std::vector<int> statusVector;
    std::vector<int> tlbHitLevel;

  private:
    GPUStaticInst *_staticInst;
    uint64_t _seqNum;
};

#endif // __GPU_DYN_INST_HH__
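
Note: AtomicOpCAS near the top of the listing implements compare-and-swap semantics. Its execute() writes the swap value s to the target word only if that word currently equals the compare value c, and it counts successes and failures in the owning ComputeUnit's statistics. The following minimal sketch is not gem5 code; it reproduces only the compare-and-swap behaviour with hypothetical names (SimpleCAS, apply) and omits the statistics and xact_cas_mode bookkeeping.

#include <cstdint>
#include <iostream>

// stripped-down stand-in for AtomicOpCAS: no ComputeUnit, no statistics
template<typename T>
struct SimpleCAS
{
    T c;  // compare value
    T s;  // swap value

    // returns true if the swap happened (the real functor counts failures
    // in numFailedCASOps instead of returning a flag)
    bool apply(T *b) const
    {
        if (*b == c) {
            *b = s;
            return true;
        }
        return false;
    }
};

int main()
{
    uint32_t word = 7;
    SimpleCAS<uint32_t> cas{7, 42};

    std::cout << cas.apply(&word) << " " << word << "\n";  // "1 42": swapped
    std::cout << cas.apply(&word) << " " << word << "\n";  // "0 42": unchanged
    return 0;
}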
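
Note: the block comment above execContinuation describes issuing a separate acquire/release request as a continuation once the original load/store/atomic completes. The sketch below is not gem5 code; it mirrors that shape with illustrative names (ToyInst, issueRelease, recvResponse) standing in for GPUDynInst, the release-generating continuation, and ComputeUnit::DataPort::recvTimingResponse.

#include <functional>
#include <iostream>
#include <memory>

struct ToyInst;
using ToyInstPtr = std::shared_ptr<ToyInst>;

struct ToyInst
{
    // stands in for execContinuation / useContinuation in GPUDynInst
    std::function<void(ToyInstPtr)> continuation;
    bool useContinuation = false;
};

// stands in for the continuation that generates the follow-up release request
void issueRelease(ToyInstPtr inst)
{
    std::cout << "issuing release for instruction " << inst.get() << "\n";
}

// stands in for the response handler: when the normal store's response
// arrives, run the queued continuation on the same dynamic instruction
void recvResponse(ToyInstPtr inst)
{
    if (inst->useContinuation && inst->continuation) {
        inst->continuation(inst);
    }
}

int main()
{
    auto storeWithRelease = std::make_shared<ToyInst>();
    // front end: issue the normal store (elided) and queue the release
    storeWithRelease->continuation = issueRelease;
    storeWithRelease->useContinuation = true;
    // memory system: the store's response comes back
    recvResponse(storeWithRelease);
    return 0;
}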
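
Note: makeAtomicOpFunctor() acts as a small factory. It inspects the instruction's atomic kind through the is...() predicates and returns the matching functor (AtomicOpAnd, AtomicOpAdd, AtomicOpCAS, and so on) as a type-erased AtomicOpFunctorPtr that the memory system later applies to the target location. The sketch below shows the same idea with two invented operations; ToyAtomicOp, ToyAtomicAdd, ToyAtomicExch, and makeToyAtomicOp are illustrative names, not gem5 classes.

#include <cassert>
#include <cstdint>
#include <memory>

// common base class, analogous in spirit to AtomicOpFunctor
struct ToyAtomicOp
{
    virtual ~ToyAtomicOp() = default;
    virtual void execute(uint32_t *b) = 0;  // apply the operation in place
};

struct ToyAtomicAdd : ToyAtomicOp
{
    uint32_t a;
    explicit ToyAtomicAdd(uint32_t _a) : a(_a) { }
    void execute(uint32_t *b) override { *b += a; }
};

struct ToyAtomicExch : ToyAtomicOp
{
    uint32_t a;
    explicit ToyAtomicExch(uint32_t _a) : a(_a) { }
    void execute(uint32_t *b) override { *b = a; }
};

enum class ToyOpKind { Add, Exch };

// pick the functor for the requested operation, type-erased behind the base
std::unique_ptr<ToyAtomicOp>
makeToyAtomicOp(ToyOpKind kind, uint32_t operand)
{
    switch (kind) {
      case ToyOpKind::Add:  return std::make_unique<ToyAtomicAdd>(operand);
      case ToyOpKind::Exch: return std::make_unique<ToyAtomicExch>(operand);
    }
    return nullptr;
}

int main()
{
    uint32_t word = 10;
    makeToyAtomicOp(ToyOpKind::Add, 5)->execute(&word);   // word == 15
    makeToyAtomicOp(ToyOpKind::Exch, 3)->execute(&word);  // word == 3
    assert(word == 3);
    return 0;
}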