gem5 v24.0.0.0
gpu_dyn_inst.hh
/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__

#include <cstdint>
#include <memory>
#include <string>

#include "base/amo.hh"
#include "base/logging.hh"
#include "base/trace.hh"
#include "debug/GPUMem.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"
#include "gpu-compute/wavefront.hh"

namespace gem5
{

class GPUStaticInst;

template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    T c;
    T s;

    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
      : c(_c), s(_s), computeUnit(compute_unit) { }

    void
    execute(T *b)
    {
        computeUnit->stats.numCASOps++;

        if (*b == c) {
            *b = s;
        } else {
            computeUnit->stats.numFailedCASOps++;
        }
    }
    AtomicOpFunctor* clone() { return new AtomicOpCAS(c, s, computeUnit); }
};

class RegisterOperandInfo
{
  public:
    RegisterOperandInfo() = delete;
    RegisterOperandInfo(int op_idx, int num_dwords,
                        const std::vector<int> &virt_indices,
                        const std::vector<int> &phys_indices)
        : opIdx(op_idx), numDWORDs(num_dwords), virtIndices(virt_indices),
          physIndices(phys_indices)
    {
    }

    /**
     * The number of registers required to store this operand.
     */
    int numRegisters() const { return numDWORDs / TheGpuISA::RegSizeDWords; }
    int operandIdx() const { return opIdx; }
    /**
     * We typically only need the first virtual register for the operand
     * regardless of its size.
     */
    int virtIdx(int reg_num=0) const { return virtIndices.at(reg_num); }

  private:
    /**
     * Index of this operand within the set of its parent instruction's
     * operand list.
     */
    const int opIdx;
    /**
     * Size of this operand in DWORDs.
     */
    const int numDWORDs;
    const std::vector<int> virtIndices;
    const std::vector<int> physIndices;
};

class GPUDynInst : public GPUExecContext
{
  public:
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
               uint64_t instSeqNum);
    ~GPUDynInst();
    void execute(GPUDynInstPtr gpuDynInst);

    const std::vector<OperandInfo>& srcVecRegOperands() const;
    const std::vector<OperandInfo>& dstVecRegOperands() const;
    const std::vector<OperandInfo>& srcScalarRegOperands() const;
    const std::vector<OperandInfo>& dstScalarRegOperands() const;

    int numSrcRegOperands();
    int numDstRegOperands();

    int numSrcVecRegOperands() const;
    int numDstVecRegOperands() const;
    int maxSrcVecRegOperandSize();
    int numSrcVecDWords();
    int numDstVecDWords();

    int numSrcScalarRegOperands() const;
    int numDstScalarRegOperands() const;
    int maxSrcScalarRegOperandSize();
    int numSrcScalarDWords();
    int numDstScalarDWords();

    int maxOperandSize();

    int getNumOperands() const;

    bool hasSourceSgpr() const;
    bool hasDestinationSgpr() const;
    bool hasSourceVgpr() const;
    bool hasDestinationVgpr() const;

    // returns true if the string "opcodeStr" is found in the
    // opcode of the instruction
    bool isOpcode(const std::string& opcodeStr) const;
    bool isOpcode(const std::string& opcodeStr,
                  const std::string& extStr) const;

    const std::string &disassemble() const;

    InstSeqNum seqNum() const;

    Addr pc();
    void pc(Addr _pc);

    enums::StorageClassType executedAs();

    // virtual address for scalar memory operations
    Addr scalarAddr;
    // virtual addresses for vector memory operations
    std::vector<Addr> addr;
    Addr pAddr;

    // vector data to get written
    uint8_t *d_data;
    // scalar data to be transferred
    uint8_t *scalar_data;
    // Additional data (for atomics)
    uint8_t *a_data;
    // Additional data (for atomics)
    uint8_t *x_data;
    // The execution mask
    VectorMask exec_mask;

    // SIMD where the WF of the memory instruction has been mapped to
    int simdId;
    // unique id of the WF to which the memory instruction belongs
    int wfDynId;
    // The kernel id of the requesting wf
    int kern_id;
    // The CU id of the requesting wf
    int cu_id;
    // The workgroup id of the requesting wf
    int wg_id;
    // HW slot id where the WF is mapped to inside a SIMD unit
    int wfSlotId;
    // execution pipeline id where the memory instruction has been scheduled
    int execUnitId;
    // The execution time of this operation
    Tick time;
    // The latency of this operation
    WaitClass latency;

    // Initiate the specified memory operation by creating a
    // memory request and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);
    // Complete the specified memory operation by writing the
    // value back to the RF in the case of a load or atomic
    // return; in the case of a store, nothing needs to be done.
    void completeAcc(GPUDynInstPtr gpuDynInst);

    void updateStats();

    GPUStaticInst* staticInstruction() { return _staticInst; }

    TheGpuISA::ScalarRegU32 srcLiteral() const;

217
218 bool isALU() const;
219 bool isBranch() const;
220 bool isCondBranch() const;
221 bool isNop() const;
222 bool isReturn() const;
223 bool isEndOfKernel() const;
224 bool isKernelLaunch() const;
225 bool isSDWAInst() const;
226 bool isDPPInst() const;
227 bool isUnconditionalJump() const;
228 bool isSpecialOp() const;
229 bool isWaitcnt() const;
230 bool isSleep() const;
231
232 bool isBarrier() const;
233 bool isMemSync() const;
234 bool isMemRef() const;
235 bool isFlat() const;
236 bool isFlatGlobal() const;
237 bool isFlatScratch() const;
238 bool isLoad() const;
239 bool isStore() const;
240
241 bool isAtomic() const;
242 bool isAtomicNoRet() const;
243 bool isAtomicRet() const;
244
245 bool isScalar() const;
246 bool isVector() const;
247 bool readsSCC() const;
248 bool writesSCC() const;
249 bool readsVCC() const;
250 bool writesVCC() const;
251 bool readsExec() const;
252 bool writesExec() const;
253 bool readsMode() const;
254 bool writesMode() const;
255 bool ignoreExec() const;
256 bool readsFlatScratch() const;
257 bool writesFlatScratch() const;
258 bool readsExecMask() const;
259 bool writesExecMask() const;
260 bool needsToken() const;
261
262 bool isAtomicAnd() const;
263 bool isAtomicOr() const;
264 bool isAtomicXor() const;
265 bool isAtomicCAS() const;
266 bool isAtomicExch() const;
267 bool isAtomicAdd() const;
268 bool isAtomicSub() const;
269 bool isAtomicInc() const;
270 bool isAtomicDec() const;
271 bool isAtomicMax() const;
272 bool isAtomicMin() const;
273
274 bool isArgLoad() const;
275 bool isGlobalMem() const;
276 bool isLocalMem() const;
277
278 bool isArgSeg() const;
279 bool isGlobalSeg() const;
280 bool isGroupSeg() const;
281 bool isKernArgSeg() const;
282 bool isPrivateSeg() const;
283 bool isReadOnlySeg() const;
284 bool isSpillSeg() const;
285
286 bool isGloballyCoherent() const;
287 bool isSystemCoherent() const;
288
289 bool isI8() const;
290 bool isF16() const;
291 bool isF32() const;
292 bool isF64() const;
293
294 bool isFMA() const;
295 bool isMAC() const;
296 bool isMAD() const;
297 bool isMFMA() const;
298
    // For FLAT memory ops, check the segment address
    // against the APE registers to see if it falls
    // within one of the APE ranges for LDS/SCRATCH/GPUVM.
    // If it does not fall into one of the three APEs, it
    // will be a regular global access.
    void doApertureCheck(const VectorMask &mask);
    // Function to resolve flat accesses during the execution stage.
    void resolveFlatSegment(const VectorMask &mask);

    template<typename c0> AtomicOpFunctorPtr
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
    {
        if (isAtomicAnd()) {
            return std::make_unique<AtomicOpAnd<c0>>(*reg0);
        } else if (isAtomicOr()) {
            return std::make_unique<AtomicOpOr<c0>>(*reg0);
        } else if (isAtomicXor()) {
            return std::make_unique<AtomicOpXor<c0>>(*reg0);
        } else if (isAtomicCAS()) {
            return std::make_unique<AtomicOpCAS<c0>>(*reg0, *reg1, cu);
        } else if (isAtomicExch()) {
            return std::make_unique<AtomicOpExch<c0>>(*reg0);
        } else if (isAtomicAdd()) {
            return std::make_unique<AtomicOpAdd<c0>>(*reg0);
        } else if (isAtomicSub()) {
            return std::make_unique<AtomicOpSub<c0>>(*reg0);
        } else if (isAtomicInc()) {
            return std::make_unique<AtomicOpInc<c0>>();
        } else if (isAtomicDec()) {
            return std::make_unique<AtomicOpDec<c0>>();
        } else if (isAtomicMax()) {
            return std::make_unique<AtomicOpMax<c0>>(*reg0);
        } else if (isAtomicMin()) {
            return std::make_unique<AtomicOpMin<c0>>(*reg0);
        } else {
            fatal("Unrecognized atomic operation");
        }
    }

    void
    setRequestFlags(RequestPtr req) const
    {
        if (isGloballyCoherent()) {
            req->setCacheCoherenceFlags(Request::GLC_BIT);
        }

        if (isSystemCoherent()) {
            req->setCacheCoherenceFlags(Request::SLC_BIT);
        }

        if (isAtomicRet()) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (isAtomicNoRet()) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }

        if (isMemSync()) {
            // the path for kernel launch and kernel end is different
            // from non-kernel mem sync.
            assert(!isKernelLaunch());
            assert(!isEndOfKernel());

            // must be a wbinv inst if not a kernel launch/end
            req->setCacheCoherenceFlags(Request::INV_L1);
        }
    }

    // reset the number of pending memory requests for all lanes
    void
    resetEntireStatusVector()
    {
        assert(statusVector.size() == TheGpuISA::NumVecElemPerVecReg);
        for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
            resetStatusVector(lane);
        }
    }

    // reset the number of pending memory requests for the given lane
    void
    resetStatusVector(int lane)
    {
        setStatusVector(lane, 0);
    }

    // set the number of pending memory requests for the given lane
    void
    setStatusVector(int lane, int newVal)
    {
        // currently we can have up to 2 memory requests per lane (if the
        // lane's request goes across multiple cache lines)
        assert((newVal >= 0) && (newVal <= 2));
        statusVector[lane] = newVal;
    }

    // decrement the number of pending memory requests for the given lane
    // by 1
    void
    decrementStatusVector(int lane)
    {
        // this lane may have multiple requests, so only subtract one for
        // this request
        assert(statusVector[lane] >= 1);
        statusVector[lane]--;
    }

    // return the current number of pending memory requests for the given
    // lane
    int
    getLaneStatus(int lane) const
    {
        return statusVector[lane];
    }

    // returns true if all memory requests from all lanes have been received,
    // else returns false
    bool
    allLanesZero() const
    {
        // local variables
        bool allZero = true;

        // iterate over all lanes, checking the number of pending memory
        // requests they have
        for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
            // if any lane still has pending requests, mark the result false
            if (statusVector[lane] > 0) {
                DPRINTF(GPUMem, "CU%d: WF[%d][%d]: lane: %d has %d pending "
                        "request(s) for %#x\n", cu_id, simdId, wfSlotId, lane,
                        statusVector[lane], addr[lane]);
                allZero = false;
            }
        }

        if (allZero) {
            DPRINTF(GPUMem, "CU%d: WF[%d][%d]: all lanes have no pending"
                    " requests for %#x\n", cu_id, simdId, wfSlotId, addr[0]);
        }
        return allZero;
    }

    // returns a string representing the current state of the statusVector
    std::string
    printStatusVector() const
    {
        std::string statusVec_str = "[";

        // iterate over all lanes, appending the current number of pending
        // requests for each lane to the string
        for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
            statusVec_str += std::to_string(statusVector[lane]);
        }
        statusVec_str += "]";

        return statusVec_str;
    }

    // Map returned packets and the addresses they satisfy to the lanes
    // that requested them
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;

    // Track the status of memory requests per lane, an int per lane to allow
    // unaligned accesses
    std::vector<int> statusVector;
    // for ld_v# or st_v#
    std::vector<int> tlbHitLevel;

    // for misaligned scalar ops we track the number
    // of outstanding reqs here
    int numScalarReqs;

    Tick getAccessTime() const { return accessTime; }

    void setAccessTime(Tick currentTime) { accessTime = currentTime; }

    void profileRoundTripTime(Tick currentTime, int hopId);
    std::vector<Tick> getRoundTripTime() const { return roundTripTime; }

    void profileLineAddressTime(Addr addr, Tick currentTime, int hopId);
    const std::map<Addr, std::vector<Tick>>& getLineAddressTime() const
    { return lineAddressTime; }

    // inst used to save/restore a wavefront context
    bool isSaveRestore;

    bool isSystemReq() { return systemReq; }
    void setSystemReq() { systemReq = true; }

  private:
    GPUStaticInst *_staticInst;
    const InstSeqNum _seqNum;
    bool systemReq = false;

    // the time the request was started
    Tick accessTime = 0;

    // hold the tick when the instruction arrives at certain hop points
    // on its way to main memory
    std::vector<Tick> roundTripTime;

    // hold each cache block address for the instruction and a vector
    // to hold the tick when the block arrives at certain hop points
    std::map<Addr, std::vector<Tick>> lineAddressTime;
};

} // namespace gem5

#endif // __GPU_DYN_INST_HH__
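
The listing above is the header as it ships with gem5; the short sketches below are editorial illustrations of a few of its mechanisms and are not part of the file. First, the compare-and-swap semantics of AtomicOpCAS::execute(): the swap value is written only when the current memory word equals the compare value. This standalone sketch (the MiniCAS name and sample values are hypothetical) omits the per-ComputeUnit CAS statistics that gem5 updates.

#include <cassert>
#include <cstdint>

// Illustrative stand-in for AtomicOpCAS<T>::execute(): write the swap value
// 's' only if the current memory value equals the compare value 'c'.
// (gem5 additionally bumps per-CU CAS statistics, omitted here.)
template <typename T>
struct MiniCAS
{
    T c; // compare value
    T s; // swap value

    void execute(T *b) const
    {
        if (*b == c) {
            *b = s;
        }
    }
};

int main()
{
    uint32_t mem = 7;
    MiniCAS<uint32_t>{7, 42}.execute(&mem); // matches, so mem becomes 42
    assert(mem == 42);
    MiniCAS<uint32_t>{7, 99}.execute(&mem); // no match, mem is unchanged
    assert(mem == 42);
    return 0;
}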
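
RegisterOperandInfo::numRegisters() divides the operand's size in DWORDs by the ISA's register width in DWORDs. A minimal sketch of that arithmetic, assuming a width of one DWORD; the real value comes from TheGpuISA::RegSizeDWords and depends on the configured ISA.

#include <cassert>

int main()
{
    // Assumed register width; gem5 takes this from TheGpuISA::RegSizeDWords.
    constexpr int RegSizeDWords = 1;

    // A 64-bit (two-DWORD) operand then needs two registers, mirroring
    // RegisterOperandInfo::numRegisters().
    constexpr int numDWORDs = 2;
    assert(numDWORDs / RegSizeDWords == 2);
    return 0;
}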
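
makeAtomicOpFunctor() selects a heap-allocated functor for the decoded atomic opcode and returns it as an AtomicOpFunctorPtr (a std::unique_ptr over the functor hierarchy in base/amo.hh) for the memory system to apply. A sketch of the same dispatch pattern, using hypothetical AddOp/MaxOp stand-ins rather than gem5's actual functor classes:

#include <cassert>
#include <cstdint>
#include <memory>

// Hypothetical stand-ins for the functor hierarchy in base/amo.hh.
struct Functor
{
    virtual void execute(uint32_t *b) = 0;
    virtual ~Functor() = default;
};

struct AddOp : Functor
{
    uint32_t a;
    explicit AddOp(uint32_t x) : a(x) {}
    void execute(uint32_t *b) override { *b += a; }
};

struct MaxOp : Functor
{
    uint32_t a;
    explicit MaxOp(uint32_t x) : a(x) {}
    void execute(uint32_t *b) override { if (a > *b) *b = a; }
};

enum class AtomicKind { Add, Max };

// Mirrors the shape of makeAtomicOpFunctor(): pick a concrete functor for
// the opcode and hand back an owning pointer for the memory system to apply.
std::unique_ptr<Functor> makeFunctor(AtomicKind kind, uint32_t operand)
{
    switch (kind) {
      case AtomicKind::Add: return std::make_unique<AddOp>(operand);
      case AtomicKind::Max: return std::make_unique<MaxOp>(operand);
    }
    return nullptr;
}

int main()
{
    uint32_t mem = 10;
    makeFunctor(AtomicKind::Add, 5)->execute(&mem);
    assert(mem == 15);
    makeFunctor(AtomicKind::Max, 3)->execute(&mem);
    assert(mem == 15); // 3 < 15, so the max leaves the value unchanged
    return 0;
}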
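
Finally, the statusVector helpers implement a simple per-lane bookkeeping protocol: record how many requests each lane issued (at most two, when an access straddles a cache line), decrement as responses arrive, and treat the instruction's memory phase as complete once every lane is back at zero. A standalone sketch of that protocol; the four-lane width and request counts are illustrative, whereas gem5 sizes the vector to TheGpuISA::NumVecElemPerVecReg.

#include <cassert>
#include <vector>

int main()
{
    constexpr int numLanes = 4; // illustrative lane count
    std::vector<int> statusVector(numLanes, 0);

    // Issue phase: lane 2 splits across two cache lines, the others need one.
    statusVector = {1, 1, 2, 1};

    // Same check allLanesZero() performs over the real status vector.
    auto allLanesZero = [&statusVector] {
        for (int pending : statusVector) {
            if (pending > 0)
                return false;
        }
        return true;
    };

    // Response phase: one decrement per returned packet, as in
    // decrementStatusVector().
    for (int lane = 0; lane < numLanes; ++lane)
        --statusVector[lane];
    assert(!allLanesZero()); // lane 2 still has a request in flight

    --statusVector[2];
    assert(allLanesZero()); // all responses received
    return 0;
}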