gem5  v22.1.0.0
shader.hh
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from this
17  * software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #ifndef __SHADER_HH__
33 #define __SHADER_HH__
34 
35 #include <functional>
36 #include <string>
37 
38 #include "arch/gpu_isa.hh"
39 #include "base/statistics.hh"
40 #include "base/stats/group.hh"
41 #include "base/types.hh"
42 #include "cpu/simple/atomic.hh"
43 #include "cpu/simple/timing.hh"
44 #include "cpu/simple_thread.hh"
45 #include "cpu/thread_context.hh"
46 #include "cpu/thread_state.hh"
47 #include "dev/amdgpu/system_hub.hh"
51 #include "gpu-compute/lds_state.hh"
52 #include "mem/page_table.hh"
53 #include "mem/port.hh"
54 #include "mem/request.hh"
55 #include "params/Shader.hh"
56 #include "sim/faults.hh"
57 #include "sim/process.hh"
58 #include "sim/sim_object.hh"
59 
60 namespace gem5
61 {
62 
63 class BaseTLB;
64 class GPUCommandProcessor;
65 class GPUDispatcher;
66 
67 static const int LDS_SIZE = 65536;
68 
69 // Aperture (APE) registers define the base/limit
70 // pair for the ATC mapped memory space. Currently
71 // the only APEs we consider are for GPUVM/LDS/scratch.
72 // The APEs are registered with unique values
73 // on a per-device basis.
75 {
78 };
79 
80 // Class Shader: This describes a single shader instance. Most
81 // configurations will only have a single shader.
82 
83 class Shader : public ClockedObject
84 {
85  private:
90 
91  // Hardware regs accessed by getreg/setreg instructions, set by queues
92  std::unordered_map<int, uint32_t> hwRegs;
93 
94  // Number of active CUs attached to this shader
96 
97  // Last tick that all CUs attached to this shader were inactive
99 
100  public:
101  typedef ShaderParams Params;
103 
105  void sampleLoad(const Tick accessTime);
106  void sampleStore(const Tick accessTime);
107  void sampleInstRoundTrip(std::vector<Tick> roundTripTime);
108  void sampleLineRoundTrip(const std::map<Addr,
109  std::vector<Tick>> &roundTripTime);
110 
114 
// Record val as the current value of hardware register regIdx.
// hwRegs is an unordered_map, so the entry is created if it does
// not exist yet and overwritten otherwise.
115  void
116  setHwReg(int regIdx, uint32_t val)
117  {
118  hwRegs[regIdx] = val;
119  }
120 
// Return the value stored for hardware register regIdx.
// The lookup goes through unordered_map::operator[], so reading a
// register that was never set inserts a value-initialized (0) entry
// into hwRegs and returns 0. That insertion is also why this
// accessor cannot be declared const.
121  uint32_t
122  getHwReg(int regIdx)
123  {
124  return hwRegs[regIdx];
125  }
126 
// Read-only accessor for the GPUVM aperture register (_gpuVmApe).
// Returns a reference to the member, not a copy.
127  const ApertureRegister&
128  gpuVmApe() const
129  {
130  return _gpuVmApe;
131  }
132 
// Read-only accessor for the LDS aperture register (_ldsApe).
// Returns a reference to the member, not a copy.
133  const ApertureRegister&
134  ldsApe() const
135  {
136  return _ldsApe;
137  }
138 
// Set the base and limit of the LDS aperture register (_ldsApe).
// isLdsApe() compares against both values inclusively.
// NOTE(review): listing line 140 (the declarator) is missing from this
// rendering; the member index on this page gives it as
// `void setLdsApe(Addr base, Addr limit)`.
139  void
141  {
142  _ldsApe.base = base;
143  _ldsApe.limit = limit;
144  }
145 
// Read-only accessor for the scratch aperture register (_scratchApe).
// Returns a reference to the member, not a copy.
146  const ApertureRegister&
147  scratchApe() const
148  {
149  return _scratchApe;
150  }
151 
// NOTE(review): listing lines 153 (declarator) and 155-156 (body) are
// missing from this rendering. The member index on this page gives the
// declarator as `void setScratchApe(Addr base, Addr limit)`; the body
// presumably mirrors setLdsApe() for _scratchApe -- confirm against the
// upstream header before relying on this.
152  void
154  {
157  }
158 
// Report whether addr falls inside the GPUVM aperture. Both ends of
// the range are inclusive: base <= addr <= limit.
// NOTE(review): listing line 160 (the declarator) is missing from this
// rendering; the member index on this page gives it as
// `bool isGpuVmApe(Addr addr) const`.
159  bool
161  {
162  bool is_gpu_vm = addr >= _gpuVmApe.base && addr <= _gpuVmApe.limit;
163 
164  return is_gpu_vm;
165  }
166 
// Report whether addr falls inside the LDS aperture. Both ends of
// the range are inclusive: base <= addr <= limit.
// NOTE(review): listing line 168 (the declarator) is missing from this
// rendering; the member index on this page gives it as
// `bool isLdsApe(Addr addr) const`.
167  bool
169  {
170  bool is_lds = addr >= _ldsApe.base && addr <= _ldsApe.limit;
171 
172  return is_lds;
173  }
174 
// Returns the is_scratch flag computed below.
// NOTE(review): listing lines 176 (declarator) and 179 (the initializer
// of is_scratch) are missing from this rendering. The member index on
// this page gives the declarator as `bool isScratchApe(Addr addr) const`;
// the initializer presumably applies the same inclusive base/limit test
// as isLdsApe() to _scratchApe -- confirm against the upstream header.
175  bool
177  {
178  bool is_scratch
180 
181  return is_scratch;
182  }
183 
// Return the base address of the scratch aperture (_scratchApe.base).
// NOTE(review): listing line 185 (the declarator) is missing from this
// rendering; the member index on this page gives it as
// `Addr getScratchBase()`.
184  Addr
186  {
187  return _scratchApe.base;
188  }
189 
// NOTE(review): listing lines 191 (declarator) and 193 (body) are
// missing from this rendering. The member index on this page gives the
// declarator as `Addr getHiddenPrivateBase()`; the body presumably
// returns shHiddenPrivateBaseVmid -- confirm against the upstream header.
190  Addr
192  {
194  }
195 
// Compute the hidden private base as queueBase - offset and store it
// in shHiddenPrivateBaseVmid. When the new value differs from the one
// already stored, a check with the message below runs first.
196  void
197  initShHiddenPrivateBase(Addr queueBase, uint32_t offset)
198  {
199  Addr sh_hidden_base_new = queueBase - offset;
200 
201  // We are initializing sh_hidden_private_base_vmid from the
202  // AMD queue descriptor of the first queue.
203  // The sh_hidden_private_base_vmid is supposed to be the same for
204  // all the queues from the same process.
205  if (shHiddenPrivateBaseVmid != sh_hidden_base_new) {
206  // Do not panic if shHiddenPrivateBaseVmid == 0,
207  // that is, if it is uninitialized. Panic only
208  // if the value is initialized and we get
209  // a different base later.
// NOTE(review): listing line 210, which opens the check whose message
// is on line 211, is missing from this rendering. Per the comment
// above it presumably exempts shHiddenPrivateBaseVmid == 0 -- confirm
// against the upstream header.
211  "Currently we support only single process\n");
212  }
213  shHiddenPrivateBaseVmid = sh_hidden_base_new;
214  }
215 
217 
219 
220  // is this simulation going to be timing mode in the memory?
221  bool timingSim;
223 
224  // If set, issue acq packet @ kernel launch
226  // If set, issue rel packet @ kernel end
228  // If set, fetch returns may be coissued with instructions
230  // If set, always dump all 64 gprs to trace
232  // Number of cu units in the shader
233  int n_cu;
234  // Number of wavefront slots per SIMD per CU
235  int n_wf;
236 
237  // The size of global memory
239 
240  // Tracks CU that rr dispatcher should attempt scheduling
242 
243  // Size of scheduled add queue
244  uint32_t sa_n;
245 
246  // Pointer to value to be incremented
248  // When to do the increment
250  // Amount to increment by
252 
253  // List of Compute Units (CU's)
255 
259 
260  int64_t max_valu_insts;
262 
263  Shader(const Params &p);
264  ~Shader();
265  virtual void init();
266 
267  // Run shader scheduled adds
268  void execScheduledAdds();
269 
270  // Schedule a 32-bit value to be incremented some time in the future
271  void ScheduleAdd(int *val, Tick when, int x);
273 
274  void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
275  MemCmd cmd, bool suppress_func_errors);
276 
277  void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);
278 
279  void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
280  bool suppress_func_errors);
281 
282  void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);
283 
284  void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
285  bool suppress_func_errors);
286 
287  void doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
288  bool suppress_func_errors, int cu_id);
289 
// Store compute_unit in slot cu_id of cuList (a std::vector of
// ComputeUnit pointers). The write uses unchecked operator[], so
// cu_id must already be a valid index into cuList.
// NOTE(review): presumably cuList is sized before any CU registers
// itself -- confirm in the constructor.
290  void
291  registerCU(int cu_id, ComputeUnit *compute_unit)
292  {
293  cuList[cu_id] = compute_unit;
294  }
295 
296  void prepareInvalidate(HSAQueueEntry *task);
297  void prepareFlush(GPUDynInstPtr gpuDynInst);
298 
299  bool dispatchWorkgroups(HSAQueueEntry *task);
300  Addr mmap(int length);
301  void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseMMU::Mode mode);
302  void updateContext(int cid);
303  void notifyCuSleep();
304 
// Increment the stats.vectorInstSrcOperand entry at index
// num_operands by one.
// NOTE(review): presumably a count of vector instructions by number
// of source operands; num_operands must be a valid index into the
// stat vector -- confirm its size in the ShaderStats constructor.
305  void
306  incVectorInstSrcOperand(int num_operands)
307  {
308  stats.vectorInstSrcOperand[num_operands]++;
309  }
310 
// Increment the stats.vectorInstDstOperand entry at index
// num_operands by one.
// NOTE(review): presumably a count of vector instructions by number
// of destination operands; num_operands must be a valid index into
// the stat vector -- confirm its size in the ShaderStats constructor.
311  void
312  incVectorInstDstOperand(int num_operands)
313  {
314  stats.vectorInstDstOperand[num_operands]++;
315  }
316 
317  protected:
319  {
320  ShaderStats(statistics::Group *parent, int wf_size);
321 
322  // some stats for measuring latency
326 
327  // average ticks from vmem inst initiateAcc to coalescer issue,
329 
330  // average ticks from coalescer issue to coalescer hit callback,
332 
333  // average ticks from coalescer hit callback to GM pipe enqueue,
335 
336  // average ticks spent in GM pipe's ordered resp buffer.
338 
339  // average number of cache blocks requested by vmem inst
341 
342  // average ticks for cache blocks to main memory for the Nth
343  // cache block generated by a vmem inst.
345 
349  } stats;
350 };
351 
352 } // namespace gem5
353 
354 #endif // __SHADER_HH__
Defines global host-dependent types: Counter, Tick, and (indirectly) {int,uint}{8,...
const char data[]
This class handles reads from the system/host memory space from the shader.
Definition: system_hub.hh:51
The ClockedObject class extends the SimObject with a clock and accessor functions to relate ticks to ...
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
Definition: packet.hh:294
bool timingSim
Definition: shader.hh:221
Addr mmap(int length)
Definition: shader.cc:105
void prepareInvalidate(HSAQueueEntry *task)
Definition: shader.cc:191
void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, MemCmd cmd, bool suppress_func_errors)
Definition: shader.cc:376
void notifyCuSleep()
Definition: shader.cc:517
void setLdsApe(Addr base, Addr limit)
Definition: shader.hh:140
void doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data, bool suppress_func_errors, int cu_id)
Definition: shader.cc:284
void execScheduledAdds()
Definition: shader.cc:161
int64_t total_valu_insts
Definition: shader.hh:261
ApertureRegister _scratchApe
Definition: shader.hh:88
@ VECTOR_SCALAR
Definition: shader.hh:102
void setScratchApe(Addr base, Addr limit)
Definition: shader.hh:153
hsail_mode_e hsail_mode
Definition: shader.hh:222
EventFunctionWrapper tickEvent
Definition: shader.hh:218
std::unordered_map< int, uint32_t > hwRegs
Definition: shader.hh:92
std::vector< ComputeUnit * > cuList
Definition: shader.hh:254
ApertureRegister _ldsApe
Definition: shader.hh:87
ApertureRegister _gpuVmApe
Definition: shader.hh:86
const ApertureRegister & scratchApe() const
Definition: shader.hh:147
int nextSchedCu
Definition: shader.hh:241
void registerCU(int cu_id, ComputeUnit *compute_unit)
Definition: shader.hh:291
void ScheduleAdd(int *val, Tick when, int x)
Definition: shader.cc:357
GPUDispatcher & _dispatcher
Definition: shader.hh:257
uint32_t sa_n
Definition: shader.hh:244
int trace_vgpr_all
Definition: shader.hh:231
ShaderParams Params
Definition: shader.hh:101
std::vector< uint64_t > sa_when
Definition: shader.hh:249
bool processTimingPacket(PacketPtr pkt)
virtual void init()
init() is called after all C++ SimObjects have been created and all ports are connected.
Definition: shader.cc:138
bool isScratchApe(Addr addr) const
Definition: shader.hh:176
int coissue_return
Definition: shader.hh:229
std::vector< int32_t > sa_x
Definition: shader.hh:251
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
Definition: shader.cc:394
gem5::Shader::ShaderStats stats
const ApertureRegister & ldsApe() const
Definition: shader.hh:134
bool isLdsApe(Addr addr) const
Definition: shader.hh:168
ThreadContext * gpuTc
Definition: shader.hh:112
void sampleLineRoundTrip(const std::map< Addr, std::vector< Tick >> &roundTripTime)
Definition: shader.cc:488
bool isGpuVmApe(Addr addr) const
Definition: shader.hh:160
bool dispatchWorkgroups(HSAQueueEntry *task)
Definition: shader.cc:235
GPUDispatcher & dispatcher()
Definition: shader.cc:99
Addr getScratchBase()
Definition: shader.hh:185
Shader(const Params &p)
Definition: shader.cc:56
void initShHiddenPrivateBase(Addr queueBase, uint32_t offset)
Definition: shader.hh:197
int impl_kern_launch_acq
Definition: shader.hh:225
void incVectorInstDstOperand(int num_operands)
Definition: shader.hh:312
RequestorID vramRequestorId()
Forward the VRAM requestor ID needed for device memory from CP.
Definition: shader.cc:530
AMDGPUSystemHub * systemHub
Definition: shader.hh:258
void setHwReg(int regIdx, uint32_t val)
Definition: shader.hh:116
SimpleThread * cpuThread
Definition: shader.hh:111
void updateContext(int cid)
Definition: shader.cc:153
int64_t max_valu_insts
Definition: shader.hh:260
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
Definition: shader.cc:408
int globalMemSize
Definition: shader.hh:238
int impl_kern_end_rel
Definition: shader.hh:227
uint32_t getHwReg(int regIdx)
Definition: shader.hh:122
void prepareFlush(GPUDynInstPtr gpuDynInst)
dispatcher/shader arranges flush requests to the CUs
Definition: shader.cc:222
void sampleInstRoundTrip(std::vector< Tick > roundTripTime)
Definition: shader.cc:468
void sampleLoad(const Tick accessTime)
Definition: shader.cc:461
const ApertureRegister & gpuVmApe() const
Definition: shader.hh:128
void incVectorInstSrcOperand(int num_operands)
Definition: shader.hh:306
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseMMU::Mode mode)
Definition: shader.cc:427
void sampleStore(const Tick accessTime)
Definition: shader.cc:451
BaseCPU * cpuPointer
Definition: shader.hh:113
GPUCommandProcessor & gpuCmdProc
Definition: shader.hh:256
Addr shHiddenPrivateBaseVmid
Definition: shader.hh:89
Tick _lastInactiveTick
Definition: shader.hh:98
std::vector< int * > sa_val
Definition: shader.hh:247
Addr getHiddenPrivateBase()
Definition: shader.hh:191
int _activeCus
Definition: shader.hh:95
The SimpleThread object provides a combination of the ThreadState object and the ThreadContext interf...
ThreadContext is the external interface to all thread state for anything outside of the CPU.
A simple distribution stat.
Definition: statistics.hh:2085
Statistics container.
Definition: group.hh:94
This is a simple scalar statistic, like a counter.
Definition: statistics.hh:1931
A vector of scalar stats.
Definition: statistics.hh:2007
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Definition: logging.hh:204
HSAQueueEntry is the simulator's internal representation of an AQL queue entry (task).
Port Object Declaration.
Bitfield< 4, 0 > mode
Definition: misc_types.hh:74
Bitfield< 23, 0 > offset
Definition: types.hh:144
Bitfield< 4 > x
Definition: pagetable.hh:61
Bitfield< 54 > p
Definition: pagetable.hh:70
Bitfield< 51, 12 > base
Definition: pagetable.hh:141
Bitfield< 63 > val
Definition: misc.hh:776
BitfieldType< SegDescriptorLimit > limit
Definition: misc.hh:931
Bitfield< 3 > addr
Definition: types.hh:84
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
std::shared_ptr< Request > RequestPtr
Definition: request.hh:92
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition: misc.hh:49
uint64_t Addr
Address type. This will probably be moved somewhere else in the near future.
Definition: types.hh:147
uint64_t Tick
Tick count type.
Definition: types.hh:58
uint16_t RequestorID
Definition: request.hh:95
static const int LDS_SIZE
Definition: shader.hh:67
Declarations of a non-full system Page Table.
Declaration of a request, the overall memory request consisting of the parts of the request that are ...
Declaration of Statistics objects.
statistics::Vector vectorInstSrcOperand
Definition: shader.hh:347
statistics::Distribution storeLatencyDist
Definition: shader.hh:325
statistics::Distribution initToCoalesceLatency
Definition: shader.hh:328
statistics::Scalar shaderActiveTicks
Definition: shader.hh:346
statistics::Distribution loadLatencyDist
Definition: shader.hh:324
statistics::Distribution allLatencyDist
Definition: shader.hh:323
statistics::Distribution gmToCompleteLatency
Definition: shader.hh:337
ShaderStats(statistics::Group *parent, int wf_size)
Definition: shader.cc:535
statistics::Distribution coalsrLineAddresses
Definition: shader.hh:340
statistics::Vector vectorInstDstOperand
Definition: shader.hh:348
statistics::Distribution rubyNetworkLatency
Definition: shader.hh:331
statistics::Distribution * cacheBlockRoundTrip
Definition: shader.hh:344
statistics::Distribution gmEnqueueLatency
Definition: shader.hh:334

Generated on Wed Dec 21 2022 10:22:35 for gem5 by doxygen 1.9.1