gem5 [DEVELOP-FOR-25.0]
Loading...
Searching...
No Matches
shader.hh
Go to the documentation of this file.
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#ifndef __SHADER_HH__
33#define __SHADER_HH__
34
35#include <functional>
36#include <string>
37
38#include "arch/gpu_isa.hh"
39#include "base/statistics.hh"
40#include "base/stats/group.hh"
41#include "base/types.hh"
42#include "cpu/simple/atomic.hh"
43#include "cpu/simple/timing.hh"
44#include "cpu/simple_thread.hh"
45#include "cpu/thread_context.hh"
46#include "cpu/thread_state.hh"
52#include "mem/page_table.hh"
53#include "mem/port.hh"
54#include "mem/request.hh"
55#include "params/Shader.hh"
56#include "sim/faults.hh"
57#include "sim/process.hh"
58#include "sim/sim_object.hh"
59
60namespace gem5
61{
62
63class BaseTLB;
65class GPUDispatcher;
66
67static const int LDS_SIZE = 65536;
68
69// aperture (APE) registers define the base/limit
70// pair for the ATC mapped memory space. currently
71// the only APEs we consider are for GPUVM/LDS/scratch.
72// the APEs are registered with unique values
73// on a per-device basis
79
80// Class Shader: This describes a single shader instance. Most
81// configurations will only have a single shader.
82
83class Shader : public ClockedObject
84{
85 private:
90
91 // Hardware regs accessed by getreg/setreg instructions, set by queues
92 std::unordered_map<int, uint32_t> hwRegs;
93
94 // Number of active CUs attached to this shader
96
97 // Last tick that all CUs attached to this shader were inactive
99
100 // If a kernel-based exit event was requested, wait for all CUs in the
101 // shader to complete before actually exiting so that stats are updated.
103
104 // Set to true by the dispatcher if the current kernel is a blit kernel
105 bool blitKernel = false;
106
107 // Number of pending non-instruction invalidates outstanding. The shader
108 // should wait for these to be done to ensure correctness.
111
112 public:
113 typedef ShaderParams Params;
115
117 void sampleLoad(const Tick accessTime);
118 void sampleStore(const Tick accessTime);
119 void sampleInstRoundTrip(std::vector<Tick> roundTripTime);
120 void sampleLineRoundTrip(const std::map<Addr,
121 std::vector<Tick>> &roundTripTime);
122
126
// Store val as the value of hardware register regIdx in hwRegs,
// creating the entry if it does not exist and overwriting it otherwise.
//
// regIdx: key identifying the hardware register in the hwRegs map.
// val:    32-bit value to record for that register.
127 void
128 setHwReg(int regIdx, uint32_t val)
129 {
130 hwRegs[regIdx] = val;
131 }
132
133 uint32_t
134 getHwReg(int regIdx)
135 {
136 return hwRegs[regIdx];
137 }
138
// Read-only accessor for the GPUVM aperture register (_gpuVmApe).
// Returns a const reference to the member, so no copy is made.
139 const ApertureRegister&
140 gpuVmApe() const
141 {
142 return _gpuVmApe;
143 }
144
// Read-only accessor for the LDS aperture register (_ldsApe).
// Returns a const reference to the member, so no copy is made.
145 const ApertureRegister&
146 ldsApe() const
147 {
148 return _ldsApe;
149 }
150
151 void
153 {
154 _ldsApe.base = base;
155 _ldsApe.limit = limit;
156 }
157
158 const ApertureRegister&
160 {
161 return _scratchApe;
162 }
163
164 void
166 {
167 _scratchApe.base = base;
168 _scratchApe.limit = limit;
169 }
170
171 bool
173 {
174 bool is_gpu_vm = addr >= _gpuVmApe.base && addr <= _gpuVmApe.limit;
175
176 return is_gpu_vm;
177 }
178
179 bool
181 {
182 bool is_lds = addr >= _ldsApe.base && addr <= _ldsApe.limit;
183
184 return is_lds;
185 }
186
187 bool
189 {
190 bool is_scratch
191 = addr >= _scratchApe.base && addr <= _scratchApe.limit;
192
193 return is_scratch;
194 }
195
196 Addr
198 {
199 return _scratchApe.base;
200 }
201
202 Addr
207
208 void
210 {
211 Addr sh_hidden_base_new = queueBase - offset;
212
213 // We are initializing sh_hidden_private_base_vmid from the
214 // amd queue descriptor from the first queue.
215 // The sh_hidden_private_base_vmid is supposed to be same for
216 // all the queues from the same process
217 if (shHiddenPrivateBaseVmid != sh_hidden_base_new) {
218 // Do not panic if shHiddenPrivateBaseVmid == 0,
219 // that is if it is uninitialized. Panic only
220 // if the value is initialized and we get
221 // a different base later.
223 "Currently we support only single process\n");
224 }
225 shHiddenPrivateBaseVmid = sh_hidden_base_new;
226 }
227
229 GfxVersion getGfxVersion() const;
230
232
233 // is this simulation going to be timing mode in the memory?
236
237 // If set, issue acq packet @ kernel launch
239 // If set, issue rel packet @ kernel end
241 // If set, fetch returns may be coissued with instructions
243 // If set, always dump all 64 gprs to trace
245 // Number of cu units in the shader
246 int n_cu;
247 // Number of wavefront slots per SIMD per CU
248 int n_wf;
249 // Number of CUs per SQC in the shader
251
252 // The size of global memory
254
255 // Tracks CU that rr dispatcher should attempt scheduling
257
258 // Size of scheduled add queue
259 uint32_t sa_n;
260
261 // Pointer to value to be incremented
263 // When to do the increment
265 // Amount to increment by
267
268 // List of Compute Units (CU's)
270
274
277
278 // Member and methods related to printing of GPU progress
281
282 Shader(const Params &p);
283 ~Shader();
284 virtual void init();
285
286 // Run shader scheduled adds
287 void execScheduledAdds();
288
289 // Schedule a 32-bit value to be incremented some time in the future
290 void ScheduleAdd(int *val, Tick when, int x);
292
293 void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
294 MemCmd cmd, bool suppress_func_errors);
295
296 void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);
297
298 void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
299 bool suppress_func_errors);
300
301 void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);
302
303 void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
304 bool suppress_func_errors);
305
306 void doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
307 bool suppress_func_errors, int cu_id);
308
// Record compute_unit in cuList at index cu_id.
//
// cu_id:        slot in cuList to fill.
// compute_unit: pointer stored as-is; this function does not copy the
//               object it points to.
//
// The index is not range-checked here.
// NOTE(review): presumably cuList is sized before CUs register -- confirm.
309 void
310 registerCU(int cu_id, ComputeUnit *compute_unit)
311 {
312 cuList[cu_id] = compute_unit;
313 }
314
316 void prepareFlush(GPUDynInstPtr gpuDynInst);
317
319 Addr mmap(int length);
320 void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseMMU::Mode mode);
321 void updateContext(int cid);
322 void notifyCuSleep();
323
// Increment the stats.vectorInstSrcOperand entry indexed by num_operands.
//
// num_operands: index of the entry to increment.
// NOTE(review): presumably the source-operand count of a vector
// instruction, going by the name -- confirm.
324 void
325 incVectorInstSrcOperand(int num_operands)
326 {
327 stats.vectorInstSrcOperand[num_operands]++;
328 }
329
// Increment the stats.vectorInstDstOperand entry indexed by num_operands.
//
// num_operands: index of the entry to increment.
// NOTE(review): presumably the destination-operand count of a vector
// instruction, going by the name -- confirm.
330 void
331 incVectorInstDstOperand(int num_operands)
332 {
333 stats.vectorInstDstOperand[num_operands]++;
334 }
335
// Flag that a kernel-based exit event has been requested.
//
// Sets kernelExitRequested to true and stores is_blit_kernel in
// blitKernel. This function only records the request; it does not
// itself perform the exit.
//
// is_blit_kernel: whether the current kernel is a blit kernel.
336 void
337 requestKernelExitEvent(bool is_blit_kernel)
338 {
339 kernelExitRequested = true;
340 blitKernel = is_blit_kernel;
341 }
342
346
347 void addDeferredDispatch(void *raw_pkt, uint32_t queue_id,
348 Addr host_pkt_addr);
349
350 protected:
352 {
353 ShaderStats(statistics::Group *parent, int wf_size);
354
355 // some stats for measuring latency
359
360 // average ticks from vmem inst initiateAcc to coalescer issue,
362
363 // average ticks from coalescer issue to coalescer hit callback,
365
366 // average ticks from coalescer hit callback to GM pipe enqueue,
368
369 // average ticks spent in GM pipe's ordered resp buffer.
371
372 // average number of cache blocks requested by vmem inst
374
375 // average ticks for cache blocks to main memory for the Nth
376 // cache block generated by a vmem inst.
378
383};
384
385} // namespace gem5
386
387#endif // __SHADER_HH__
Defines global host-dependent types: Counter, Tick, and (indirectly) {int,uint}{8,...
const char data[]
This class handles reads from the system/host memory space from the shader.
Definition system_hub.hh:51
ClockedObject(const ClockedObjectParams &p)
bool kernelExitRequested
Definition shader.hh:102
bool timingSim
Definition shader.hh:234
Addr mmap(int length)
Definition shader.cc:118
void prepareInvalidate(HSAQueueEntry *task)
Definition shader.cc:204
void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, MemCmd cmd, bool suppress_func_errors)
Definition shader.cc:400
bool blitKernel
Definition shader.hh:105
void notifyCuSleep()
Definition shader.cc:541
void setLdsApe(Addr base, Addr limit)
Definition shader.hh:152
void doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data, bool suppress_func_errors, int cu_id)
Definition shader.cc:308
void execScheduledAdds()
Definition shader.cc:174
int64_t total_valu_insts
Definition shader.hh:276
ApertureRegister _scratchApe
Definition shader.hh:88
void setScratchApe(Addr base, Addr limit)
Definition shader.hh:165
hsail_mode_e hsail_mode
Definition shader.hh:235
GfxVersion getGfxVersion() const
Definition shader.cc:593
EventFunctionWrapper tickEvent
Definition shader.hh:231
std::unordered_map< int, uint32_t > hwRegs
Definition shader.hh:92
std::vector< ComputeUnit * > cuList
Definition shader.hh:269
ApertureRegister _ldsApe
Definition shader.hh:87
ApertureRegister _gpuVmApe
Definition shader.hh:86
const ApertureRegister & scratchApe() const
Definition shader.hh:159
void addDeferredDispatch(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
Definition shader.cc:576
int nextSchedCu
Definition shader.hh:256
void incNumOutstandingInvL2s()
Definition shader.hh:344
void registerCU(int cu_id, ComputeUnit *compute_unit)
Definition shader.hh:310
void ScheduleAdd(int *val, Tick when, int x)
Definition shader.cc:381
GPUDispatcher & _dispatcher
Definition shader.hh:272
uint32_t sa_n
Definition shader.hh:259
int trace_vgpr_all
Definition shader.hh:244
ShaderParams Params
Definition shader.hh:113
void sampleLineRoundTrip(const std::map< Addr, std::vector< Tick > > &roundTripTime)
Definition shader.cc:512
std::vector< uint64_t > sa_when
Definition shader.hh:264
bool processTimingPacket(PacketPtr pkt)
int getNumOutstandingInvL2s() const
Definition shader.hh:345
virtual void init()
init() is called after all C++ SimObjects have been created and all ports are connected.
Definition shader.cc:151
bool isScratchApe(Addr addr) const
Definition shader.hh:188
int coissue_return
Definition shader.hh:242
std::vector< int32_t > sa_x
Definition shader.hh:266
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
Definition shader.cc:418
gem5::Shader::ShaderStats stats
const ApertureRegister & ldsApe() const
Definition shader.hh:146
std::vector< std::tuple< void *, uint32_t, Addr > > deferred_dispatches
Definition shader.hh:110
bool isLdsApe(Addr addr) const
Definition shader.hh:180
ThreadContext * gpuTc
Definition shader.hh:124
bool isGpuVmApe(Addr addr) const
Definition shader.hh:172
bool dispatchWorkgroups(HSAQueueEntry *task)
Definition shader.cc:259
GPUDispatcher & dispatcher()
Definition shader.cc:112
Addr getScratchBase()
Definition shader.hh:197
Shader(const Params &p)
Definition shader.cc:57
void decNumOutstandingInvL2s()
Definition shader.cc:561
void initShHiddenPrivateBase(Addr queueBase, uint32_t offset)
Definition shader.hh:209
int impl_kern_launch_acq
Definition shader.hh:238
void incVectorInstDstOperand(int num_operands)
Definition shader.hh:331
RequestorID vramRequestorId()
Forward the VRAM requestor ID needed for device memory from CP.
Definition shader.cc:587
AMDGPUSystemHub * systemHub
Definition shader.hh:273
void setHwReg(int regIdx, uint32_t val)
Definition shader.hh:128
SimpleThread * cpuThread
Definition shader.hh:123
void updateContext(int cid)
Definition shader.cc:166
int n_cu_per_sqc
Definition shader.hh:250
int64_t max_valu_insts
Definition shader.hh:275
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
Definition shader.cc:432
int globalMemSize
Definition shader.hh:253
void requestKernelExitEvent(bool is_blit_kernel)
Definition shader.hh:337
int impl_kern_end_rel
Definition shader.hh:240
uint32_t getHwReg(int regIdx)
Definition shader.hh:134
void prepareFlush(GPUDynInstPtr gpuDynInst)
dispatcher/shader arranges flush requests to the CUs
Definition shader.cc:246
void sampleInstRoundTrip(std::vector< Tick > roundTripTime)
Definition shader.cc:492
void sampleLoad(const Tick accessTime)
Definition shader.cc:485
Tick getProgressInterval() const
Definition shader.hh:280
const ApertureRegister & gpuVmApe() const
Definition shader.hh:140
void incVectorInstSrcOperand(int num_operands)
Definition shader.hh:325
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseMMU::Mode mode)
Definition shader.cc:451
int num_outstanding_invl2s
Definition shader.hh:109
const Tick progressInterval
Definition shader.hh:279
void sampleStore(const Tick accessTime)
Definition shader.cc:475
BaseCPU * cpuPointer
Definition shader.hh:125
GPUCommandProcessor & gpuCmdProc
Definition shader.hh:271
Addr shHiddenPrivateBaseVmid
Definition shader.hh:89
Tick _lastInactiveTick
Definition shader.hh:98
std::vector< int * > sa_val
Definition shader.hh:262
Addr getHiddenPrivateBase()
Definition shader.hh:203
int _activeCus
Definition shader.hh:95
The SimpleThread object provides a combination of the ThreadState object and the ThreadContext interf...
ThreadContext is the external interface to all thread state for anything outside of the CPU.
A simple distribution stat.
Statistics container.
Definition group.hh:93
This is a simple scalar statistic, like a counter.
A vector of scalar stats.
STL vector class.
Definition stl.hh:37
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Definition logging.hh:246
HSAQueueEntry is the simulator's internal representation of an AQL queue entry (task).
Port Object Declaration.
Bitfield< 4, 0 > mode
Definition misc_types.hh:74
Bitfield< 23, 0 > offset
Definition types.hh:144
Bitfield< 0 > p
Bitfield< 3 > x
Definition pagetable.hh:76
Bitfield< 63 > val
Definition misc.hh:804
BitfieldType< SegDescriptorLimit > limit
Definition misc.hh:959
Bitfield< 3 > addr
Definition types.hh:84
Copyright (c) 2024 Arm Limited All rights reserved.
Definition binary32.hh:36
std::shared_ptr< Request > RequestPtr
Definition request.hh:94
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition misc.hh:49
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition types.hh:147
uint64_t Tick
Tick count type.
Definition types.hh:58
uint16_t RequestorID
Definition request.hh:95
Packet * PacketPtr
static const int LDS_SIZE
Definition shader.hh:67
Declarations of a non-full system Page Table.
Declaration of a request, the overall memory request consisting of the parts of the request that are ...
Declaration of Statistics objects.
statistics::Vector vectorInstSrcOperand
Definition shader.hh:380
statistics::Distribution storeLatencyDist
Definition shader.hh:358
statistics::Distribution initToCoalesceLatency
Definition shader.hh:361
statistics::Scalar shaderActiveTicks
Definition shader.hh:379
statistics::Distribution loadLatencyDist
Definition shader.hh:357
statistics::Distribution allLatencyDist
Definition shader.hh:356
statistics::Distribution gmToCompleteLatency
Definition shader.hh:370
ShaderStats(statistics::Group *parent, int wf_size)
Definition shader.cc:598
statistics::Distribution coalsrLineAddresses
Definition shader.hh:373
statistics::Vector vectorInstDstOperand
Definition shader.hh:381
statistics::Distribution rubyNetworkLatency
Definition shader.hh:364
statistics::Distribution * cacheBlockRoundTrip
Definition shader.hh:377
statistics::Distribution gmEnqueueLatency
Definition shader.hh:367

Generated on Mon May 26 2025 09:19:11 for gem5 by doxygen 1.13.2