gem5  v22.1.0.0
gpu_dyn_inst.hh
/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__

#include <cstdint>
#include <memory>
#include <string>

#include "base/amo.hh"
#include "base/logging.hh"
#include "base/trace.hh"
#include "debug/GPUMem.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"

namespace gem5
{

class GPUStaticInst;

template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    T c;
    T s;

    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
      : c(_c), s(_s), computeUnit(compute_unit) { }

    void
    execute(T *b)
    {
        computeUnit->stats.numCASOps++;

        if (*b == c) {
            *b = s;
        } else {
            computeUnit->stats.numFailedCASOps++;
        }
    }
    AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); }
};
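
// The CAS functor above is not constructed directly by users of this header;
// broadly speaking, GPUDynInst::makeAtomicOpFunctor() (declared further down
// in this file) builds it for compare-and-swap instructions, and the
// resulting AtomicOpFunctorPtr travels with the memory request so that the
// memory system can apply execute() to the target location. Illustrative
// sketch only; the real call sites live in the instruction implementations,
// not here:
//
//   AtomicOpFunctorPtr amo =
//       gpuDynInst->makeAtomicOpFunctor<uint32_t>(&cmp_val, &swap_val);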

class RegisterOperandInfo
{
  public:
    RegisterOperandInfo() = delete;
    RegisterOperandInfo(int op_idx, int num_dwords,
                        const std::vector<int> &virt_indices,
                        const std::vector<int> &phys_indices)
        : opIdx(op_idx), numDWORDs(num_dwords), virtIndices(virt_indices),
          physIndices(phys_indices)
    {
    }

    /**
     * The number of registers required to store this operand.
     */
    int numRegisters() const { return numDWORDs / TheGpuISA::RegSizeDWords; }
    int operandIdx() const { return opIdx; }
    /**
     * We typically only need the first virtual register for the
     * operand regardless of its size.
     */
    int virtIdx(int reg_num=0) const { return virtIndices.at(reg_num); }

  private:
    /**
     * Index of this operand within the set of its parent instruction's
     * operand list.
     */
    const int opIdx;
    /**
     * Size of this operand in DWORDs.
     */
    const int numDWORDs;
    const std::vector<int> virtIndices;
    const std::vector<int> physIndices;
};
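
// RegisterOperandInfo (above) describes a single operand of a dynamic
// instruction: its position in the parent instruction's operand list, its
// size in DWORDs, and the virtual and physical register indices that back it.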

class GPUDynInst : public GPUExecContext
{
  public:
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
               uint64_t instSeqNum);
    ~GPUDynInst();
    void execute(GPUDynInstPtr gpuDynInst);

    const std::vector<OperandInfo>& srcVecRegOperands() const;
    const std::vector<OperandInfo>& dstVecRegOperands() const;
    const std::vector<OperandInfo>& srcScalarRegOperands() const;
    const std::vector<OperandInfo>& dstScalarRegOperands() const;

    int numSrcRegOperands();
    int numDstRegOperands();

    int numSrcVecRegOperands() const;
    int numDstVecRegOperands() const;
    int maxSrcVecRegOperandSize();
    int numSrcVecDWords();
    int numDstVecDWords();

    int numSrcScalarRegOperands() const;
    int numDstScalarRegOperands() const;
    int maxSrcScalarRegOperandSize();
    int numSrcScalarDWords();
    int numDstScalarDWords();

    int maxOperandSize();

    int getNumOperands() const;

    bool hasSourceSgpr() const;
    bool hasDestinationSgpr() const;
    bool hasSourceVgpr() const;
    bool hasDestinationVgpr() const;

    // returns true if the string "opcodeStr" is found in the
    // opcode of the instruction
    bool isOpcode(const std::string& opcodeStr) const;
    bool isOpcode(const std::string& opcodeStr,
                  const std::string& extStr) const;

    const std::string &disassemble() const;

    InstSeqNum seqNum() const;

    Addr pc();
    void pc(Addr _pc);

    enums::StorageClassType executedAs();

    // virtual address for scalar memory operations
    Addr scalarAddr;
    // virtual addresses for vector memory operations
    std::vector<Addr> addr;

    // vector data to get written
    uint8_t *d_data;
    // scalar data to be transferred
    uint8_t *scalar_data;
    // Additional data (for atomics)
    uint8_t *a_data;
    // Additional data (for atomics)
    uint8_t *x_data;
    // The execution mask
    VectorMask exec_mask;

    // SIMD unit to which the WF of the memory instruction has been mapped
    int simdId;
    // unique id of the WF the memory instruction belongs to
    int wfDynId;
    // The kernel id of the requesting wf
    int kern_id;
    // The CU id of the requesting wf
    int cu_id;
    // The workgroup id of the requesting wf
    int wg_id;
    // HW slot id where the WF is mapped to inside a SIMD unit
    int wfSlotId;
    // execution pipeline id where the memory instruction has been scheduled
    int pipeId;
    // The execution time of this operation
    Tick time;
    // The latency of this operation
    WaitClass latency;

    // Initiate the specified memory operation by creating a
    // memory request and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);
    // Complete the specified memory operation by writing the
    // value back to the RF in the case of a load or atomic
    // return; in the case of a store, do nothing.
    void completeAcc(GPUDynInstPtr gpuDynInst);

    void updateStats();

    GPUStaticInst* staticInstruction() { return _staticInst; }

    TheGpuISA::ScalarRegU32 srcLiteral() const;

    bool isALU() const;
    bool isBranch() const;
    bool isCondBranch() const;
    bool isNop() const;
    bool isReturn() const;
    bool isEndOfKernel() const;
    bool isKernelLaunch() const;
    bool isSDWAInst() const;
    bool isDPPInst() const;
    bool isUnconditionalJump() const;
    bool isSpecialOp() const;
    bool isWaitcnt() const;
    bool isSleep() const;

    bool isBarrier() const;
    bool isMemSync() const;
    bool isMemRef() const;
    bool isFlat() const;
    bool isFlatGlobal() const;
    bool isLoad() const;
    bool isStore() const;

    bool isAtomic() const;
    bool isAtomicNoRet() const;
    bool isAtomicRet() const;

    bool isScalar() const;
    bool isVector() const;
    bool readsSCC() const;
    bool writesSCC() const;
    bool readsVCC() const;
    bool writesVCC() const;
    bool readsExec() const;
    bool writesExec() const;
    bool readsMode() const;
    bool writesMode() const;
    bool ignoreExec() const;
    bool readsFlatScratch() const;
    bool writesFlatScratch() const;
    bool readsExecMask() const;
    bool writesExecMask() const;

    bool isAtomicAnd() const;
    bool isAtomicOr() const;
    bool isAtomicXor() const;
    bool isAtomicCAS() const;
    bool isAtomicExch() const;
    bool isAtomicAdd() const;
    bool isAtomicSub() const;
    bool isAtomicInc() const;
    bool isAtomicDec() const;
    bool isAtomicMax() const;
    bool isAtomicMin() const;

    bool isArgLoad() const;
    bool isGlobalMem() const;
    bool isLocalMem() const;

    bool isArgSeg() const;
    bool isGlobalSeg() const;
    bool isGroupSeg() const;
    bool isKernArgSeg() const;
    bool isPrivateSeg() const;
    bool isReadOnlySeg() const;
    bool isSpillSeg() const;

    bool isGloballyCoherent() const;
    bool isSystemCoherent() const;

    bool isF16() const;
    bool isF32() const;
    bool isF64() const;

    bool isFMA() const;
    bool isMAC() const;
    bool isMAD() const;

    // for FLAT memory ops. check the segment address
    // against the APE registers to see if it falls
    // within one of the APE ranges for LDS/SCRATCH/GPUVM.
    // if it does not fall into one of the three APEs, it
    // will be a regular global access.
    void doApertureCheck(const VectorMask &mask);
    // Function to resolve a flat access during the execution stage.
    void resolveFlatSegment(const VectorMask &mask);

    template<typename c0> AtomicOpFunctorPtr
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
    {
        if (isAtomicAnd()) {
            return std::make_unique<AtomicOpAnd<c0>>(*reg0);
        } else if (isAtomicOr()) {
            return std::make_unique<AtomicOpOr<c0>>(*reg0);
        } else if (isAtomicXor()) {
            return std::make_unique<AtomicOpXor<c0>>(*reg0);
        } else if (isAtomicCAS()) {
            return std::make_unique<AtomicOpCAS<c0>>(*reg0, *reg1, cu);
        } else if (isAtomicExch()) {
            return std::make_unique<AtomicOpExch<c0>>(*reg0);
        } else if (isAtomicAdd()) {
            return std::make_unique<AtomicOpAdd<c0>>(*reg0);
        } else if (isAtomicSub()) {
            return std::make_unique<AtomicOpSub<c0>>(*reg0);
        } else if (isAtomicInc()) {
            return std::make_unique<AtomicOpInc<c0>>();
        } else if (isAtomicDec()) {
            return std::make_unique<AtomicOpDec<c0>>();
        } else if (isAtomicMax()) {
            return std::make_unique<AtomicOpMax<c0>>(*reg0);
        } else if (isAtomicMin()) {
            return std::make_unique<AtomicOpMin<c0>>(*reg0);
        } else {
            fatal("Unrecognized atomic operation");
        }
    }
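    // Note on the arguments: reg0 supplies the atomic operand (and the
    // compare value for CAS); reg1 is only consulted for CAS, where it
    // supplies the swap value. A hedged usage sketch (names are placeholders,
    // not taken from this file):
    //
    //   AtomicOpFunctorPtr amo =
    //       gpuDynInst->makeAtomicOpFunctor<uint32_t>(&a_val, &x_val);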

    void
    setRequestFlags(RequestPtr req) const
    {
        if (isGloballyCoherent()) {
            req->setCacheCoherenceFlags(Request::GLC_BIT);
        }

        if (isSystemCoherent()) {
            req->setCacheCoherenceFlags(Request::SLC_BIT);
        }

        if (isAtomicRet()) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (isAtomicNoRet()) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }

        if (isMemSync()) {
            // the path for kernel launch and kernel end is different
            // from non-kernel mem sync.
            assert(!isKernelLaunch());
            assert(!isEndOfKernel());

            // must be a wbinv inst if not kernel launch/end
            req->setCacheCoherenceFlags(Request::INV_L1);
        }
    }
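    // Helper used when constructing the memory Request(s) for this
    // instruction: it translates the instruction's coherence and atomic
    // properties into the corresponding Request flags. The call sites are in
    // the implementation files rather than this header.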

    // reset the number of pending memory requests for all lanes
    void
    resetEntireStatusVector()
    {
        assert(statusVector.size() == TheGpuISA::NumVecElemPerVecReg);
        for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
            resetStatusVector(lane);
        }
    }

    // reset the number of pending memory requests for the given lane
    void
    resetStatusVector(int lane)
    {
        setStatusVector(lane, 0);
    }

    // set the number of pending memory requests for the given lane
    void
    setStatusVector(int lane, int newVal)
    {
        // currently we can have up to 2 memory requests per lane (if the
        // lane's request goes across multiple cache lines)
        assert((newVal >= 0) && (newVal <= 2));
        statusVector[lane] = newVal;
    }

    // decrement the number of pending memory requests for the given lane
    // by 1
    void
    decrementStatusVector(int lane)
    {
        // this lane may have multiple requests, so only subtract one for
        // this request
        assert(statusVector[lane] >= 1);
        statusVector[lane]--;
    }

    // return the current number of pending memory requests for the given
    // lane
    int
    getLaneStatus(int lane) const
    {
        return statusVector[lane];
    }

    // returns true if all memory requests from all lanes have been received,
    // else returns false
    bool
    allLanesZero() const
    {
        // local variables
        bool allZero = true;

        // iterate over all lanes, checking the number of pending memory
        // requests they have
        for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
            // if any lane still has pending requests, return false
            if (statusVector[lane] > 0) {
                DPRINTF(GPUMem, "CU%d: WF[%d][%d]: lane: %d has %d pending "
                        "request(s) for %#x\n", cu_id, simdId, wfSlotId, lane,
                        statusVector[lane], addr[lane]);
                allZero = false;
            }
        }

        if (allZero) {
            DPRINTF(GPUMem, "CU%d: WF[%d][%d]: all lanes have no pending"
                    " requests for %#x\n", cu_id, simdId, wfSlotId, addr[0]);
        }
        return allZero;
    }
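    // Taken together, the status-vector helpers track outstanding memory
    // requests per lane: a lane's count is set when its request(s) are
    // issued, decremented as each response returns, and allLanesZero() then
    // reports when every lane's requests have been satisfied.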

    // returns a string representing the current state of the statusVector
    std::string
    printStatusVector() const
    {
        std::string statusVec_str = "[";

        // iterate over all lanes, adding the current number of pending
        // requests for this lane to the string
        for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
            statusVec_str += std::to_string(statusVector[lane]);
        }
        statusVec_str += "]";

        return statusVec_str;
    }
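    // The result is one digit per lane between brackets; for example, with
    // only lane 1 holding a single pending request the string would look like
    // "[0100...0]" (illustrative).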

    // Map returned packets and the addresses they satisfy with which lane they
    // were requested from
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;

    // Track the status of memory requests per lane, an int per lane to allow
    // unaligned accesses
    std::vector<int> statusVector;
    // for ld_v# or st_v#
    std::vector<int> tlbHitLevel;

    // for misaligned scalar ops we track the number
    // of outstanding reqs here
    int numScalarReqs;

    Tick getAccessTime() const { return accessTime; }

    void setAccessTime(Tick currentTime) { accessTime = currentTime; }

    void profileRoundTripTime(Tick currentTime, int hopId);
    std::vector<Tick> getRoundTripTime() const { return roundTripTime; }

    void profileLineAddressTime(Addr addr, Tick currentTime, int hopId);
    const std::map<Addr, std::vector<Tick>>& getLineAddressTime() const
    { return lineAddressTime; }

    // inst used to save/restore a wavefront context
    bool isSaveRestore;

    bool isSystemReq() { return systemReq; }
    void setSystemReq() { systemReq = true; }

  private:
    GPUStaticInst *_staticInst;
    const InstSeqNum _seqNum;

    bool systemReq = false;

    // the time the request was started
    Tick accessTime = -1;

    // hold the tick when the instruction arrives at certain hop points
    // on its way to main memory
    std::vector<Tick> roundTripTime;

    // hold each cache block address for the instruction and a vector
    // to hold the tick when the block arrives at certain hop points
    std::map<Addr, std::vector<Tick>> lineAddressTime;
};

} // namespace gem5

#endif // __GPU_DYN_INST_HH__