gem5 v23.0.0.1
Loading...
Searching...
No Matches
gpu_dyn_inst.hh
Go to the documentation of this file.
1/*
2 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#ifndef __GPU_DYN_INST_HH__
33#define __GPU_DYN_INST_HH__
34
35#include <cstdint>
36#include <memory>
37#include <string>
38
39#include "base/amo.hh"
40#include "base/logging.hh"
41#include "base/trace.hh"
42#include "debug/GPUMem.hh"
43#include "enums/StorageClassType.hh"
47
48namespace gem5
49{
50
51class GPUStaticInst;
52
53template<typename T>
55{
56 public:
57 T c;
58 T s;
59
61
62 AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
63 : c(_c), s(_s), computeUnit(compute_unit) { }
64
65 void
67 {
69
70 if (*b == c) {
71 *b = s;
72 } else {
74 }
75 }
77};
78
80{
81 public:
83 RegisterOperandInfo(int op_idx, int num_dwords,
84 const std::vector<int> &virt_indices,
85 const std::vector<int> &phys_indices)
86 : opIdx(op_idx), numDWORDs(num_dwords), virtIndices(virt_indices),
87 physIndices(phys_indices)
88 {
89 }
90
94 int numRegisters() const { return numDWORDs / TheGpuISA::RegSizeDWords; }
95 int operandIdx() const { return opIdx; }
100 int virtIdx(int reg_num=0) const { return virtIndices.at(reg_num); }
101
102 private:
107 const int opIdx;
111 const int numDWORDs;
114};
115
117{
118 public:
119 GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
120 uint64_t instSeqNum);
121 ~GPUDynInst();
122 void execute(GPUDynInstPtr gpuDynInst);
123
128
129 int numSrcRegOperands();
130 int numDstRegOperands();
131
132 int numSrcVecRegOperands() const;
133 int numDstVecRegOperands() const;
135 int numSrcVecDWords();
136 int numDstVecDWords();
137
138 int numSrcScalarRegOperands() const;
139 int numDstScalarRegOperands() const;
141 int numSrcScalarDWords();
142 int numDstScalarDWords();
143
144 int maxOperandSize();
145
146 int getNumOperands() const;
147
148 bool hasSourceSgpr() const;
149 bool hasDestinationSgpr() const;
150 bool hasSourceVgpr() const;
151 bool hasDestinationVgpr() const;
152
153 // returns true if the string "opcodeStr" is found in the
154 // opcode of the instruction
155 bool isOpcode(const std::string& opcodeStr) const;
156 bool isOpcode(const std::string& opcodeStr,
157 const std::string& extStr) const;
158
159 const std::string &disassemble() const;
160
161 InstSeqNum seqNum() const;
162
163 Addr pc();
164 void pc(Addr _pc);
165
166 enums::StorageClassType executedAs();
167
168 // virtual address for scalar memory operations
170 // virtual addresses for vector memory operations
173
174 // vector data to get written
175 uint8_t *d_data;
176 // scalar data to be transferred
177 uint8_t *scalar_data;
178 // Additional data (for atomics)
179 uint8_t *a_data;
180 // Additional data (for atomics)
181 uint8_t *x_data;
182 // The execution mask
184
185 // SIMD where the WF of the memory instruction has been mapped to
187 // unique id of the WF where the memory instruction belongs to
189 // The kernel id of the requesting wf
191 // The CU id of the requesting wf
192 int cu_id;
193 // The workgroup id of the requesting wf
194 int wg_id;
195 // HW slot id where the WF is mapped to inside a SIMD unit
197 // execution pipeline id where the memory instruction has been scheduled
199 // The execution time of this operation
201 // The latency of this operation
203
204 // Initiate the specified memory operation, by creating a
205 // memory request and sending it off to the memory system.
206 void initiateAcc(GPUDynInstPtr gpuDynInst);
207 // Complete the specified memory operation, by writing
208 // value back to the RF in the case of a load or atomic
209 // return or, in the case of a store, we do nothing
210 void completeAcc(GPUDynInstPtr gpuDynInst);
211
212 void updateStats();
213
215
216 TheGpuISA::ScalarRegU32 srcLiteral() const;
217
218 bool isALU() const;
219 bool isBranch() const;
220 bool isCondBranch() const;
221 bool isNop() const;
222 bool isReturn() const;
223 bool isEndOfKernel() const;
224 bool isKernelLaunch() const;
225 bool isSDWAInst() const;
226 bool isDPPInst() const;
227 bool isUnconditionalJump() const;
228 bool isSpecialOp() const;
229 bool isWaitcnt() const;
230 bool isSleep() const;
231
232 bool isBarrier() const;
233 bool isMemSync() const;
234 bool isMemRef() const;
235 bool isFlat() const;
236 bool isFlatGlobal() const;
237 bool isLoad() const;
238 bool isStore() const;
239
240 bool isAtomic() const;
241 bool isAtomicNoRet() const;
242 bool isAtomicRet() const;
243
244 bool isScalar() const;
245 bool isVector() const;
246 bool readsSCC() const;
247 bool writesSCC() const;
248 bool readsVCC() const;
249 bool writesVCC() const;
250 bool readsExec() const;
251 bool writesExec() const;
252 bool readsMode() const;
253 bool writesMode() const;
254 bool ignoreExec() const;
255 bool readsFlatScratch() const;
256 bool writesFlatScratch() const;
257 bool readsExecMask() const;
258 bool writesExecMask() const;
259
260 bool isAtomicAnd() const;
261 bool isAtomicOr() const;
262 bool isAtomicXor() const;
263 bool isAtomicCAS() const;
264 bool isAtomicExch() const;
265 bool isAtomicAdd() const;
266 bool isAtomicSub() const;
267 bool isAtomicInc() const;
268 bool isAtomicDec() const;
269 bool isAtomicMax() const;
270 bool isAtomicMin() const;
271
272 bool isArgLoad() const;
273 bool isGlobalMem() const;
274 bool isLocalMem() const;
275
276 bool isArgSeg() const;
277 bool isGlobalSeg() const;
278 bool isGroupSeg() const;
279 bool isKernArgSeg() const;
280 bool isPrivateSeg() const;
281 bool isReadOnlySeg() const;
282 bool isSpillSeg() const;
283
284 bool isGloballyCoherent() const;
285 bool isSystemCoherent() const;
286
287 bool isF16() const;
288 bool isF32() const;
289 bool isF64() const;
290
291 bool isFMA() const;
292 bool isMAC() const;
293 bool isMAD() const;
294
295 // for FLAT memory ops. check the segment address
296 // against the APE registers to see if it falls
297 // within one of the APE ranges for LDS/SCRATCH/GPUVM.
298 // if it does not fall into one of the three APEs, it
299 // will be a regular global access.
300 void doApertureCheck(const VectorMask &mask);
301 // Function to resolve flat accesses during the execution stage.
303
// Build the atomic-operation functor for this instruction. The isAtomic*()
// predicates are tested in the order written below and the first one that
// holds selects the functor type.
//   reg0 - dereferenced for every operation except Inc and Dec
//   reg1 - dereferenced only for CAS, which is also handed `cu`
// If none of the predicates holds, fatal() is invoked with
// "Unrecognized atomic operation".
304 template<typename c0> AtomicOpFunctorPtr
305 makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
306 {
307 if (isAtomicAnd()) {
308 return std::make_unique<AtomicOpAnd<c0>>(*reg0);
309 } else if (isAtomicOr()) {
310 return std::make_unique<AtomicOpOr<c0>>(*reg0);
311 } else if (isAtomicXor()) {
312 return std::make_unique<AtomicOpXor<c0>>(*reg0);
313 } else if (isAtomicCAS()) {
314 return std::make_unique<AtomicOpCAS<c0>>(*reg0, *reg1, cu);
315 } else if (isAtomicExch()) {
316 return std::make_unique<AtomicOpExch<c0>>(*reg0);
317 } else if (isAtomicAdd()) {
318 return std::make_unique<AtomicOpAdd<c0>>(*reg0);
319 } else if (isAtomicSub()) {
320 return std::make_unique<AtomicOpSub<c0>>(*reg0);
321 } else if (isAtomicInc()) {
322 return std::make_unique<AtomicOpInc<c0>>();
323 } else if (isAtomicDec()) {
324 return std::make_unique<AtomicOpDec<c0>>();
325 } else if (isAtomicMax()) {
326 return std::make_unique<AtomicOpMax<c0>>(*reg0);
327 } else if (isAtomicMin()) {
328 return std::make_unique<AtomicOpMin<c0>>(*reg0);
329 } else {
330 fatal("Unrecognized atomic operation");
331 }
332 }
333
334 void
336 {
337 if (isGloballyCoherent()) {
338 req->setCacheCoherenceFlags(Request::GLC_BIT);
339 }
340
341 if (isSystemCoherent()) {
342 req->setCacheCoherenceFlags(Request::SLC_BIT);
343 }
344
345 if (isAtomicRet()) {
346 req->setFlags(Request::ATOMIC_RETURN_OP);
347 } else if (isAtomicNoRet()) {
348 req->setFlags(Request::ATOMIC_NO_RETURN_OP);
349 }
350
351 if (isMemSync()) {
352 // the path for kernel launch and kernel end is different
353 // from non-kernel mem sync.
354 assert(!isKernelLaunch());
355 assert(!isEndOfKernel());
356
357 // must be wbinv inst if not kernel launch/end
358 req->setCacheCoherenceFlags(Request::INV_L1);
359 }
360 }
361
362 // reset the number of pending memory requests for all lanes
363 void
365 {
366 assert(statusVector.size() == TheGpuISA::NumVecElemPerVecReg);
367 for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
368 resetStatusVector(lane);
369 }
370 }
371
372 // reset the number of pending memory requests for the inputted lane
373 void
375 {
376 setStatusVector(lane, 0);
377 }
378
379 // Set the number of pending memory requests for the given lane to newVal.
380 void
381 setStatusVector(int lane, int newVal)
382 {
383 // Currently a lane can have at most 2 memory requests (when the lane's
384 // request spans multiple cache lines), so newVal must be 0, 1 or 2.
385 assert((newVal >= 0) && (newVal <= 2));
386 statusVector[lane] = newVal;
387 }
388
389 // subtracts the number of pending memory requests for the inputted lane
390 // by 1
391 void
393 {
394 // this lane may have multiple requests, so only subtract one for
395 // this request
396 assert(statusVector[lane] >= 1);
397 statusVector[lane]--;
398 }
399
400 // Return the current number of pending memory requests recorded in
401 // statusVector for the given lane. No range check on lane is done here.
402 int
403 getLaneStatus(int lane) const
404 {
405 return statusVector[lane];
406 }
407
408 // returns true if all memory requests from all lanes have been received,
409 // else returns false
410 bool
412 {
413 // local variables
414 bool allZero = true;
415
416 // iterate over all lanes, checking the number of pending memory
417 // requests they have
418 for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
419 // if any lane still has pending requests, return false
420 if (statusVector[lane] > 0) {
421 DPRINTF(GPUMem, "CU%d: WF[%d][%d]: lane: %d has %d pending "
422 "request(s) for %#x\n", cu_id, simdId, wfSlotId, lane,
423 statusVector[lane], addr[lane]);
424 allZero = false;
425 }
426 }
427
428 if (allZero) {
429 DPRINTF(GPUMem, "CU%d: WF[%d][%d]: all lanes have no pending"
430 " requests for %#x\n", cu_id, simdId, wfSlotId, addr[0]);
431 }
432 return allZero;
433 }
434
435 // returns a string representing the current state of the statusVector
436 std::string
438 {
439 std::string statusVec_str = "[";
440
441 // iterate over all lanes, adding the current number of pending
442 // requests for this lane to the string
443 for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
444 statusVec_str += std::to_string(statusVector[lane]);
445 }
446 statusVec_str += "]";
447
448 return statusVec_str;
449 }
450
451 // Map returned packets and the addresses they satisfy with which lane they
452 // were requested from
453 typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
455
456 // Track the status of memory requests per lane, an int per lane to allow
457 // unaligned accesses
459 // for ld_v# or st_v#
461
462 // for misaligned scalar ops we track the number
463 // of outstanding reqs here
465
466 Tick getAccessTime() const { return accessTime; }
467
468 void setAccessTime(Tick currentTime) { accessTime = currentTime; }
469
470 void profileRoundTripTime(Tick currentTime, int hopId);
472
473 void profileLineAddressTime(Addr addr, Tick currentTime, int hopId);
474 const std::map<Addr, std::vector<Tick>>& getLineAddressTime() const
475 { return lineAddressTime; }
476
477 // inst used to save/restore a wavefront context
479
480 bool isSystemReq() { return systemReq; }
481 void setSystemReq() { systemReq = true; }
482
483 private:
488 bool systemReq = false;
489
490 // the time the request was started
492
493 // hold the tick when the instruction arrives at certain hop points
494 // on its way to main memory
496
497 // hold each cache block address for the instruction and a vector
498 // to hold the tick when the block arrives at certain hop points
499 std::map<Addr, std::vector<Tick>> lineAddressTime;
500};
501
502} // namespace gem5
503
504#endif // __GPU_DYN_INST_HH__
#define DPRINTF(x,...)
Definition trace.hh:210
AtomicOpFunctor * clone()
ComputeUnit * computeUnit
AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
gem5::ComputeUnit::ComputeUnitStats stats
bool isKernelLaunch() const
std::unordered_map< Addr, std::vector< int > > StatusVector
bool isAtomicCAS() const
bool isSpecialOp() const
std::vector< Tick > roundTripTime
bool isLocalMem() const
bool hasDestinationSgpr() const
bool writesVCC() const
bool isAtomicDec() const
bool readsVCC() const
bool isNop() const
bool isF16() const
int numDstScalarRegOperands() const
std::map< Addr, std::vector< Tick > > lineAddressTime
void doApertureCheck(const VectorMask &mask)
bool isAtomicRet() const
void resolveFlatSegment(const VectorMask &mask)
std::vector< int > tlbHitLevel
bool isGlobalMem() const
bool isAtomicMin() const
bool isAtomicExch() const
std::vector< Tick > getRoundTripTime() const
bool isFlatGlobal() const
bool isBranch() const
bool isF32() const
bool isAtomicSub() const
GPUStaticInst * _staticInst
bool hasDestinationVgpr() const
std::vector< int > statusVector
void profileLineAddressTime(Addr addr, Tick currentTime, int hopId)
void decrementStatusVector(int lane)
bool isUnconditionalJump() const
GPUStaticInst * staticInstruction()
int numSrcScalarRegOperands() const
bool isOpcode(const std::string &opcodeStr) const
bool isAtomicXor() const
const std::map< Addr, std::vector< Tick > > & getLineAddressTime() const
bool isALU() const
accessor methods for the attributes of the underlying GPU static instruction
bool isReadOnlySeg() const
bool isSystemCoherent() const
bool isMemRef() const
bool isAtomicAnd() const
bool isStore() const
bool isDPPInst() const
bool isSleep() const
VectorMask exec_mask
bool isMemSync() const
bool writesSCC() const
bool hasSourceVgpr() const
int numDstVecRegOperands() const
bool ignoreExec() const
StatusVector memStatusVector
bool hasSourceSgpr() const
uint8_t * scalar_data
bool isReturn() const
bool readsSCC() const
bool isMAD() const
int getLaneStatus(int lane) const
bool readsFlatScratch() const
void initiateAcc(GPUDynInstPtr gpuDynInst)
int getNumOperands() const
bool writesExec() const
bool isSDWAInst() const
bool isWaitcnt() const
bool writesMode() const
enums::StorageClassType executedAs()
bool isFlat() const
const std::vector< OperandInfo > & dstVecRegOperands() const
void profileRoundTripTime(Tick currentTime, int hopId)
void resetStatusVector(int lane)
bool isCondBranch() const
bool writesExecMask() const
bool isPrivateSeg() const
bool isEndOfKernel() const
void resetEntireStatusVector()
const std::vector< OperandInfo > & srcVecRegOperands() const
bool isAtomicInc() const
bool isGloballyCoherent() const
bool readsExecMask() const
bool isGroupSeg() const
TheGpuISA::ScalarRegU32 srcLiteral() const
Tick getAccessTime() const
bool readsExec() const
int maxSrcScalarRegOperandSize()
bool isScalar() const
bool isVector() const
InstSeqNum seqNum() const
bool isFMA() const
const std::vector< OperandInfo > & srcScalarRegOperands() const
bool isAtomicAdd() const
const std::vector< OperandInfo > & dstScalarRegOperands() const
int numSrcVecRegOperands() const
const InstSeqNum _seqNum
bool isBarrier() const
bool isLoad() const
void setRequestFlags(RequestPtr req) const
std::vector< Addr > addr
std::string printStatusVector() const
bool writesFlatScratch() const
bool allLanesZero() const
bool readsMode() const
void execute(GPUDynInstPtr gpuDynInst)
bool isMAC() const
AtomicOpFunctorPtr makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
bool isKernArgSeg() const
bool isArgLoad() const
void setStatusVector(int lane, int newVal)
void setAccessTime(Tick currentTime)
bool isGlobalSeg() const
bool isArgSeg() const
bool isAtomic() const
bool isAtomicOr() const
int maxSrcVecRegOperandSize()
bool isAtomicNoRet() const
bool isSpillSeg() const
const std::string & disassemble() const
void completeAcc(GPUDynInstPtr gpuDynInst)
bool isF64() const
bool isAtomicMax() const
const std::vector< int > virtIndices
const int opIdx
Index of this operand within the set of its parent instruction's operand list.
const std::vector< int > physIndices
RegisterOperandInfo(int op_idx, int num_dwords, const std::vector< int > &virt_indices, const std::vector< int > &phys_indices)
const int numDWORDs
Size of this operand in DWORDs.
int virtIdx(int reg_num=0) const
We typically only need the first virtual register for the operand regardless of its size.
int numRegisters() const
The number of registers required to store this operand.
@ ATOMIC_RETURN_OP
The request is an atomic that returns data.
Definition request.hh:175
@ ATOMIC_NO_RETURN_OP
The request is an atomic that does not return data.
Definition request.hh:177
@ SLC_BIT
user-policy flags
Definition request.hh:332
STL vector class.
Definition stl.hh:37
std::unique_ptr< AtomicOpFunctor > AtomicOpFunctorPtr
Definition amo.hh:269
#define fatal(...)
This implements a cprintf based fatal() function.
Definition logging.hh:200
Bitfield< 3, 0 > mask
Definition pcstate.hh:63
Bitfield< 7 > b
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
std::shared_ptr< Request > RequestPtr
Definition request.hh:94
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition misc.hh:49
uint64_t Addr
Address type. This will probably be moved somewhere else in the near future.
Definition types.hh:147
uint64_t Tick
Tick count type.
Definition types.hh:58
std::bitset< std::numeric_limits< unsigned long long >::digits > VectorMask
Definition misc.hh:48
uint64_t InstSeqNum
Definition inst_seq.hh:40

Generated on Mon Jul 10 2023 15:32:03 for gem5 by doxygen 1.9.7