gpu_dyn_inst.hh

/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__

#include <cstdint>
#include <memory>
#include <string>

#include "base/amo.hh"
#include "base/logging.hh"
#include "base/trace.hh"
#include "debug/GPUMem.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"
#include "gpu-compute/wavefront.hh"

namespace gem5
{

class GPUStaticInst;

template<typename T>
class AtomicOpPkAddBF16 : public TypedAtomicOpFunctor<T>
{
  public:
    T data;
    AtomicOpPkAddBF16(T _data) : data(_data) { }

    void
    execute([[maybe_unused]] T *b)
    {
        if constexpr (sizeof(T) == 4) {
            AMDGPU::PkBfloat16 pk_b, pk_data;
            pk_data = data;
            pk_b = *b;

            pk_b += pk_data;

            *b = pk_b.get();
        } else {
            fatal("Attempted packed atomic bf16 on non 32-bit type");
        }
    }

    AtomicOpFunctor* clone() { return new AtomicOpPkAddBF16(data); }
};
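
/*
 * Illustrative sketch (not part of the original header): a packed bf16 add
 * treats one 32-bit memory word as two bfloat16 lanes and adds the
 * corresponding lanes of the operand. pack_bf16() is a hypothetical helper
 * used only to build the example values.
 *
 * @code
 *     uint32_t mem_word = pack_bf16(1.0f, 2.0f);   // hypothetical packer
 *     uint32_t operand  = pack_bf16(0.5f, 0.25f);
 *     AtomicOpPkAddBF16<uint32_t> op(operand);
 *     op.execute(&mem_word);   // lanes now hold {1.5, 2.25} as bf16x2
 * @endcode
 */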

template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    T c;
    T s;

    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
        : c(_c), s(_s), computeUnit(compute_unit) { }

    void
    execute(T *b)
    {
        computeUnit->stats.numCASOps++;

        if (*b == c) {
            *b = s;
        } else {
            computeUnit->stats.numFailedCASOps++;
        }
    }

    AtomicOpFunctor* clone() { return new AtomicOpCAS(c, s, computeUnit); }
};
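
/*
 * Illustrative sketch (not part of the original header): the CAS functor
 * writes the swap value only when memory matches the compare value, and it
 * counts every attempt in the CU's CAS statistics. The values and the
 * ComputeUnit pointer `cu` below are hypothetical.
 *
 * @code
 *     uint32_t mem_word = 7;
 *     AtomicOpCAS<uint32_t> cas(7, 42, cu);  // compare 7, swap in 42
 *     cas.execute(&mem_word);   // match: mem_word becomes 42
 *     cas.execute(&mem_word);   // 42 != 7: unchanged, numFailedCASOps++
 * @endcode
 */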

class RegisterOperandInfo
{
  public:
    RegisterOperandInfo() = delete;
    RegisterOperandInfo(int op_idx, int num_dwords,
                        const std::vector<int> &virt_indices,
                        const std::vector<int> &phys_indices)
        : opIdx(op_idx), numDWORDs(num_dwords), virtIndices(virt_indices),
          physIndices(phys_indices)
    {
    }

    /**
     * The number of registers required to store this operand.
     */
    int numRegisters() const { return numDWORDs / TheGpuISA::RegSizeDWords; }
    int operandIdx() const { return opIdx; }
    /**
     * We typically only need the first virtual register for the operand
     * regardless of its size.
     */
    int virtIdx(int reg_num=0) const { return virtIndices.at(reg_num); }

  private:
    /**
     * Index of this operand within the set of its parent instruction's
     * operand list.
     */
    const int opIdx;
    /**
     * Size of this operand in DWORDs.
     */
    const int numDWORDs;
    const std::vector<int> virtIndices;
    const std::vector<int> physIndices;
};
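
/*
 * Illustrative sketch (not part of the original header): numRegisters() is
 * simply numDWORDs / TheGpuISA::RegSizeDWords, and virtIdx() returns the
 * first virtual register by default. The operand below (operand 0, four
 * DWORDs, hypothetical register indices) spans four registers when
 * RegSizeDWords == 1.
 *
 * @code
 *     RegisterOperandInfo op(0, 4, {12, 13, 14, 15}, {34, 35, 36, 37});
 *     int regs  = op.numRegisters();  // 4 / TheGpuISA::RegSizeDWords
 *     int first = op.virtIdx();       // 12
 * @endcode
 */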

class GPUDynInst : public GPUExecContext
{
  public:
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
               uint64_t instSeqNum);
    ~GPUDynInst();
    void execute(GPUDynInstPtr gpuDynInst);

    const std::vector<OperandInfo>& srcVecRegOperands() const;
    const std::vector<OperandInfo>& dstVecRegOperands() const;
    const std::vector<OperandInfo>& srcScalarRegOperands() const;
    const std::vector<OperandInfo>& dstScalarRegOperands() const;

    int numSrcRegOperands();
    int numDstRegOperands();

    int numSrcVecRegOperands() const;
    int numDstVecRegOperands() const;
    int maxSrcVecRegOperandSize();
    int numSrcVecDWords();
    int numDstVecDWords();

    int numSrcScalarRegOperands() const;
    int numDstScalarRegOperands() const;
    int maxSrcScalarRegOperandSize();
    int numSrcScalarDWords();
    int numDstScalarDWords();

    int maxOperandSize();

    int getNumOperands() const;

    bool hasSourceSgpr() const;
    bool hasDestinationSgpr() const;
    bool hasSourceVgpr() const;
    bool hasDestinationVgpr() const;

    // returns true if the string "opcodeStr" is found in the
    // opcode of the instruction
    bool isOpcode(const std::string& opcodeStr) const;
    bool isOpcode(const std::string& opcodeStr,
                  const std::string& extStr) const;

    const std::string &disassemble() const;

    InstSeqNum seqNum() const;

    Addr pc();
    void pc(Addr _pc);

    enums::StorageClassType executedAs();

    // virtual address for scalar memory operations
    Addr scalarAddr;
    // virtual addresses for vector memory operations
    std::vector<Addr> addr;
    Addr pAddr;

    // vector data to get written
    uint8_t *d_data;
    // scalar data to be transferred
    uint8_t *scalar_data;
    // Additional data (for atomics)
    uint8_t *a_data;
    // Additional data (for atomics)
    uint8_t *x_data;
    // The execution mask
    VectorMask exec_mask;

    // SIMD unit to which the WF of the memory instruction is mapped
    int simdId;
    // unique id of the WF to which the memory instruction belongs
    int wfDynId;
    // The kernel id of the requesting wf
    int kern_id;
    // The CU id of the requesting wf
    int cu_id;
    // The workgroup id of the requesting wf
    int wg_id;
    // HW slot id to which the WF is mapped inside a SIMD unit
    int wfSlotId;
    // execution pipeline id where the memory instruction has been scheduled
    int execUnitId;
    // The execution time of this operation
    Tick time;
    // The latency of this operation
    WaitClass latency;

    // Initiate the specified memory operation, by creating a
    // memory request and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);
    // Complete the specified memory operation by writing the value
    // back to the RF in the case of a load or atomic return; in the
    // case of a store, do nothing.
    void completeAcc(GPUDynInstPtr gpuDynInst);

    void updateStats();

    GPUStaticInst* staticInstruction() { return _staticInst; }

    TheGpuISA::ScalarRegU32 srcLiteral() const;

    bool isALU() const;
    bool isBranch() const;
    bool isCondBranch() const;
    bool isNop() const;
    bool isReturn() const;
    bool isEndOfKernel() const;
    bool isKernelLaunch() const;
    bool isSDWAInst() const;
    bool isDPPInst() const;
    bool isUnconditionalJump() const;
    bool isSpecialOp() const;
    bool isWaitcnt() const;
    bool isSleep() const;

    bool isBarrier() const;
    bool isMemSync() const;
    bool isMemRef() const;
    bool isFlat() const;
    bool isFlatGlobal() const;
    bool isFlatScratch() const;
    bool isLoad() const;
    bool isStore() const;

    bool isAtomic() const;
    bool isAtomicNoRet() const;
    bool isAtomicRet() const;

    bool isScalar() const;
    bool isVector() const;
    bool readsSCC() const;
    bool writesSCC() const;
    bool readsVCC() const;
    bool writesVCC() const;
    bool readsExec() const;
    bool writesExec() const;
    bool readsMode() const;
    bool writesMode() const;
    bool ignoreExec() const;
    bool readsFlatScratch() const;
    bool writesFlatScratch() const;
    bool readsExecMask() const;
    bool writesExecMask() const;
    bool needsToken() const;

    bool isAtomicAnd() const;
    bool isAtomicOr() const;
    bool isAtomicXor() const;
    bool isAtomicCAS() const;
    bool isAtomicExch() const;
    bool isAtomicAdd() const;
    bool isAtomicSub() const;
    bool isAtomicInc() const;
    bool isAtomicDec() const;
    bool isAtomicMax() const;
    bool isAtomicMin() const;
    bool isAtomicPkAddBF16() const;

    bool isArgLoad() const;
    bool isGlobalMem() const;
    bool isLocalMem() const;

    bool isArgSeg() const;
    bool isGlobalSeg() const;
    bool isGroupSeg() const;
    bool isKernArgSeg() const;
    bool isPrivateSeg() const;
    bool isReadOnlySeg() const;
    bool isSpillSeg() const;

    bool isGloballyCoherent() const;
    bool isSystemCoherent() const;

    bool isI8() const;
    bool isF16() const;
    bool isF32() const;
    bool isF64() const;

    bool isFMA() const;
    bool isMAC() const;
    bool isMAD() const;
    bool isMFMA() const;

    // for FLAT memory ops. check the segment address
    // against the APE registers to see if it falls
    // within one of the APE ranges for LDS/SCRATCH/GPUVM.
    // if it does not fall into one of the three APEs, it
    // will be a regular global access.
    void doApertureCheck(const VectorMask &mask);
    // Function to resolve flat accesses during the execution stage.
    void resolveFlatSegment(const VectorMask &mask);

    template<typename c0> AtomicOpFunctorPtr
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
    {
        if (isAtomicAnd()) {
            return std::make_unique<AtomicOpAnd<c0>>(*reg0);
        } else if (isAtomicOr()) {
            return std::make_unique<AtomicOpOr<c0>>(*reg0);
        } else if (isAtomicXor()) {
            return std::make_unique<AtomicOpXor<c0>>(*reg0);
        } else if (isAtomicCAS()) {
            return std::make_unique<AtomicOpCAS<c0>>(*reg0, *reg1, cu);
        } else if (isAtomicExch()) {
            return std::make_unique<AtomicOpExch<c0>>(*reg0);
        } else if (isAtomicAdd()) {
            return std::make_unique<AtomicOpAdd<c0>>(*reg0);
        } else if (isAtomicSub()) {
            return std::make_unique<AtomicOpSub<c0>>(*reg0);
        } else if (isAtomicInc()) {
            return std::make_unique<AtomicOpInc<c0>>();
        } else if (isAtomicDec()) {
            return std::make_unique<AtomicOpDec<c0>>();
        } else if (isAtomicMax()) {
            return std::make_unique<AtomicOpMax<c0>>(*reg0);
        } else if (isAtomicMin()) {
            return std::make_unique<AtomicOpMin<c0>>(*reg0);
        } else if (isAtomicPkAddBF16()) {
            return std::make_unique<AtomicOpPkAddBF16<c0>>(*reg0);
        } else {
            fatal("Unrecognized atomic operation");
        }
    }
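
    /*
     * Illustrative sketch (not part of the original header): the factory
     * above is typically invoked per lane when assembling an atomic
     * request, with reg0/reg1 pointing into the instruction's a_data and
     * x_data buffers (x_data supplies the CAS compare value). The lane
     * index below is hypothetical.
     *
     * @code
     *     int lane = 0;
     *     AtomicOpFunctorPtr amo = makeAtomicOpFunctor<uint32_t>(
     *         &reinterpret_cast<uint32_t*>(a_data)[lane],
     *         &reinterpret_cast<uint32_t*>(x_data)[lane]);
     * @endcode
     */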

    void
    setRequestFlags(RequestPtr req) const
    {
        if (isGloballyCoherent()) {
            req->setCacheCoherenceFlags(Request::GLC_BIT);
        }

        if (isSystemCoherent()) {
            req->setCacheCoherenceFlags(Request::SLC_BIT);
        }

        if (isAtomicRet()) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (isAtomicNoRet()) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }

        if (isMemSync()) {
            // the path for kernel launch and kernel end is different
            // from non-kernel mem sync.
            assert(!isKernelLaunch());
            assert(!isEndOfKernel());

            // must be a wbinv inst if not kernel launch/end
            req->setCacheCoherenceFlags(Request::INV_L1);
        }
    }
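
    /*
     * Illustrative sketch (not part of the original header): for a
     * hypothetical globally coherent atomic-with-return, the mapping above
     * yields both a coherence flag and an atomic flag on the request.
     *
     * @code
     *     RequestPtr req = std::make_shared<Request>();
     *     setRequestFlags(req);
     *     // isGloballyCoherent() -> Request::GLC_BIT
     *     // isAtomicRet()        -> Request::ATOMIC_RETURN_OP
     * @endcode
     */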

    // reset the number of pending memory requests for all lanes
    void
    resetEntireStatusVector()
    {
        assert(statusVector.size() == TheGpuISA::NumVecElemPerVecReg);
        for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
            resetStatusVector(lane);
        }
    }

    // reset the number of pending memory requests for the given lane
    void
    resetStatusVector(int lane)
    {
        setStatusVector(lane, 0);
    }

    // set the number of pending memory requests for the given lane
    void
    setStatusVector(int lane, int newVal)
    {
        // Currently we can have up to 4 memory requests per lane. This can
        // occur on a memory request loading 4x dwords where the memory is
        // swizzled.
        assert((newVal >= 0) && (newVal <= 4));
        statusVector[lane] = newVal;
    }

    // decrement the number of pending memory requests for the given lane
    // by 1
    void
    decrementStatusVector(int lane)
    {
        // this lane may have multiple requests, so only subtract one for
        // this request
        assert(statusVector[lane] >= 1);
        statusVector[lane]--;
    }
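
    /*
     * Illustrative sketch (not part of the original header): the per-lane
     * counters typically follow this lifecycle for a vector access that
     * issues one request on lane 0.
     *
     * @code
     *     resetEntireStatusVector();   // every lane starts at 0
     *     setStatusVector(0, 1);       // lane 0 issues one request
     *     // ... the response for lane 0 comes back ...
     *     decrementStatusVector(0);    // lane 0 drains back to 0
     *     bool done = allLanesZero();  // true once all lanes are drained
     * @endcode
     */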

    // return the current number of pending memory requests for the given
    // lane
    int
    getLaneStatus(int lane) const
    {
        return statusVector[lane];
    }

    // returns true if all memory requests from all lanes have been received,
    // else returns false
    bool
    allLanesZero() const
    {
        // local variables
        bool allZero = true;

        // iterate over all lanes, checking the number of pending memory
        // requests they have
        for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
            // if any lane still has pending requests, the result is false
            if (statusVector[lane] > 0) {
                DPRINTF(GPUMem, "CU%d: WF[%d][%d]: lane: %d has %d pending "
                        "request(s) for %#x\n", cu_id, simdId, wfSlotId, lane,
                        statusVector[lane], addr[lane]);
                allZero = false;
            }
        }

        if (allZero) {
            DPRINTF(GPUMem, "CU%d: WF[%d][%d]: all lanes have no pending"
                    " requests for %#x\n", cu_id, simdId, wfSlotId, addr[0]);
        }
        return allZero;
    }

    // returns a string representing the current state of the statusVector
    std::string
    printStatusVector() const
    {
        std::string statusVec_str = "[";

        // iterate over all lanes, appending the current number of pending
        // requests for each lane to the string
        for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
            statusVec_str += std::to_string(statusVector[lane]);
        }
        statusVec_str += "]";

        return statusVec_str;
    }

    // Map returned packets and the addresses they satisfy with which lane
    // they were requested from
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;

    // Track the status of memory requests per lane, an int per lane to
    // allow unaligned accesses
    std::vector<int> statusVector;
    // for ld_v# or st_v#
    std::vector<int> tlbHitLevel;

    // for misaligned scalar ops we track the number
    // of outstanding reqs here
    int numScalarReqs;

    Tick getAccessTime() const { return accessTime; }

    void setAccessTime(Tick currentTime) { accessTime = currentTime; }

    void profileRoundTripTime(Tick currentTime, int hopId);
    std::vector<Tick> getRoundTripTime() const { return roundTripTime; }

    void profileLineAddressTime(Addr addr, Tick currentTime, int hopId);
    const std::map<Addr, std::vector<Tick>>& getLineAddressTime() const
    { return lineAddressTime; }
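
    /*
     * Illustrative sketch (not part of the original header): the profiling
     * hooks record one Tick per hop as a request travels toward memory;
     * the hop ids below are hypothetical.
     *
     * @code
     *     setAccessTime(curTick());
     *     profileRoundTripTime(curTick(), 0);   // hop 0: leaving the CU
     *     // ... later, when the request reaches the memory side ...
     *     profileRoundTripTime(curTick(), 1);   // hop 1: at memory
     *     std::vector<Tick> rtt = getRoundTripTime();
     * @endcode
     */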

    // inst used to save/restore a wavefront context
    bool isSaveRestore;

    bool isSystemReq() { return systemReq; }
    void setSystemReq() { systemReq = true; }

  private:
    GPUStaticInst *_staticInst;
    const InstSeqNum _seqNum;
    bool systemReq = false;

    // the time the request was started
    Tick accessTime = -1;

    // hold the tick when the instruction arrives at certain hop points
    // on its way to main memory
    std::vector<Tick> roundTripTime;

    // hold each cache block address for the instruction and a vector
    // to hold the tick when the block arrives at certain hop points
    std::map<Addr, std::vector<Tick>> lineAddressTime;
};

} // namespace gem5

#endif // __GPU_DYN_INST_HH__