gem5 v20.0.0.0
shader.cc

/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/shader.hh"

#include <limits>

#include "arch/x86/linux/linux.hh"
#include "base/chunk_generator.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUMem.hh"
#include "debug/HSAIL.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/qstruct.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "sim/sim_exit.hh"

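// The Shader is the top-level GPU model: it owns the compute units
// (cuList), accepts work from the GpuDispatcher, and provides the
// functional memory and TLB access paths used below.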
Shader::Shader(const Params *p)
    : ClockedObject(p), clock(p->clk_domain->clockPeriod()),
      cpuThread(nullptr), gpuTc(nullptr), cpuPointer(p->cpu_pointer),
      tickEvent([this]{ processTick(); }, "Shader tick",
                false, Event::CPU_Tick_Pri),
      timingSim(p->timing), hsail_mode(SIMT),
      impl_kern_boundary_sync(p->impl_kern_boundary_sync),
      separate_acquire_release(p->separate_acquire_release), coissue_return(1),
      trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
      globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0),
      box_tick_cnt(0), start_tick_cnt(0)
{

    cuList.resize(n_cu);

    for (int i = 0; i < n_cu; ++i) {
        cuList[i] = p->CUs[i];
        assert(i == cuList[i]->cu_id);
        cuList[i]->shader = this;
    }
}

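// Carve a new region out of the process's mmap space. This allocates
// simulated virtual memory through the CPU thread's Process object; it
// does not touch host memory.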
Addr
Shader::mmap(int length)
{

    Addr start;

    // round up length to the next page
    length = roundUp(length, TheISA::PageBytes);

    Process *proc = gpuTc->getProcessPtr();
    auto mem_state = proc->memState;

    if (proc->mmapGrowsDown()) {
        DPRINTF(HSAIL, "GROWS DOWN");
        start = mem_state->getMmapEnd() - length;
        mem_state->setMmapEnd(start);
    } else {
        DPRINTF(HSAIL, "GROWS UP");
        start = mem_state->getMmapEnd();
        mem_state->setMmapEnd(start + length);

        // assertion to make sure we don't overwrite the stack (it grows down)
        assert(mem_state->getStackBase() - mem_state->getMaxStackSize() >
               mem_state->getMmapEnd());
    }

    DPRINTF(HSAIL, "Shader::mmap start= %#x, %#x\n", start, length);

    proc->allocateMem(start, length);

    return start;
}

void
Shader::init()
{
    // grab the threadContext of the thread running on the CPU
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(0);
    assert(gpuTc);
}

Shader::~Shader()
{
    for (int j = 0; j < n_cu; ++j)
        delete cuList[j];
}

void
Shader::updateContext(int cid) {
    // use the context of the thread that dispatched the work
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(cid);
    assert(gpuTc);
}

void
Shader::hostWakeUp(BaseCPU *cpu) {
    if (cpuPointer == cpu) {
        if (gpuTc->status() == ThreadContext::Suspended)
            cpu->activateContext(gpuTc->threadId());
    } else {
        // Make sure both the dispatcher and the shader are trying to
        // wake up the same host. Hack here to enable kernel launch
        // from multiple CPUs
        panic("Dispatcher wants to wakeup a different host");
    }
}

Shader*
ShaderParams::create()
{
    return new Shader(this);
}

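// exec() is the shader's tick handler: it first applies any pending
// "scheduled adds" registered via ScheduleAdd() whose time has arrived,
// then clocks each compute unit once.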
void
Shader::exec()
{
    tick_cnt = curTick();
    box_tick_cnt = curTick() - start_tick_cnt;

    // apply any scheduled adds
    for (int i = 0; i < sa_n; ++i) {
        if (sa_when[i] <= tick_cnt) {
            *sa_val[i] += sa_x[i];
            sa_val.erase(sa_val.begin() + i);
            sa_x.erase(sa_x.begin() + i);
            sa_when.erase(sa_when.begin() + i);
            --sa_n;
            --i;
        }
    }

    // clock all of the CUs
    for (int i = 0; i < n_cu; ++i)
        cuList[i]->exec();
}

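// Dispatch as many of the NDRange's remaining workgroups as possible,
// visiting the CUs round-robin starting at nextSchedCu. The 3D
// workgroup id (wgId[0..2]) advances like a nested odometer; once the
// outermost dimension wraps, the whole grid has been dispatched.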
bool
Shader::dispatch_workgroups(NDRange *ndr)
{
    bool scheduledSomething = false;
    int cuCount = 0;
    int curCu = nextSchedCu;

    while (cuCount < n_cu) {
        // Every time we try a CU, update nextSchedCu
        nextSchedCu = (nextSchedCu + 1) % n_cu;

        // dispatch workgroup iff the following two conditions are met:
        // (a) wg_disp_rem is true - there are unassigned workgroups in the grid
        // (b) there are enough free slots in CU cuList[curCu] for this wg
        if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) {
            scheduledSomething = true;
            DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu);

            // ticks() member function translates cycles to simulation ticks.
            if (!tickEvent.scheduled()) {
                schedule(tickEvent, curTick() + this->ticks(1));
            }

            cuList[curCu]->StartWorkgroup(ndr);
            ndr->wgId[0]++;
            ndr->globalWgId++;
            if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) {
                ndr->wgId[0] = 0;
                ndr->wgId[1]++;

                if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) {
                    ndr->wgId[1] = 0;
                    ndr->wgId[2]++;

                    if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) {
                        ndr->wg_disp_rem = false;
                        break;
                    }
                }
            }
        }

        ++cuCount;
        curCu = nextSchedCu;
    }

    return scheduledSomething;
}

void
Shader::handshake(GpuDispatcher *_dispatcher)
{
    dispatcher = _dispatcher;
}

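// Perform one functional (zero-time) memory access. An access that
// straddles a cache-line boundary is split on that boundary and sent
// as two packets; every packet is translated through the TLB first.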
void
Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
                           bool suppress_func_errors, int cu_id)
{
    int block_size = cuList.at(cu_id)->cacheLineSize();
    unsigned size = req->getSize();

    Addr tmp_addr;
    BaseTLB::Mode trans_mode;

    if (cmd == MemCmd::ReadReq) {
        trans_mode = BaseTLB::Read;
    } else if (cmd == MemCmd::WriteReq) {
        trans_mode = BaseTLB::Write;
    } else {
        fatal("unexpected MemCmd\n");
    }

    tmp_addr = req->getVaddr();
    Addr split_addr = roundDown(tmp_addr + size - 1, block_size);

    assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);

    // Misaligned access
    if (split_addr > tmp_addr) {
        RequestPtr req1, req2;
        req->splitOnVaddr(split_addr, req1, req2);

        // pair each half of the split request with the matching half
        // of the data buffer
        PacketPtr pkt1 = new Packet(req1, cmd);
        PacketPtr pkt2 = new Packet(req2, cmd);

        functionalTLBAccess(pkt1, cu_id, trans_mode);
        functionalTLBAccess(pkt2, cu_id, trans_mode);

        PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
        PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);

        new_pkt1->dataStatic(data);
        new_pkt2->dataStatic((uint8_t*)data + req1->getSize());

        if (suppress_func_errors) {
            new_pkt1->setSuppressFuncError();
            new_pkt2->setSuppressFuncError();
        }

        // fixme: this should be cuList[cu_id] if cu_id != n_cu
        // The latter requires a memPort in the dispatcher
        cuList[0]->memPort[0]->sendFunctional(new_pkt1);
        cuList[0]->memPort[0]->sendFunctional(new_pkt2);

        delete new_pkt1;
        delete new_pkt2;
        delete pkt1;
        delete pkt2;
    } else {
        PacketPtr pkt = new Packet(req, cmd);
        functionalTLBAccess(pkt, cu_id, trans_mode);
        PacketPtr new_pkt = new Packet(pkt->req, cmd);
        new_pkt->dataStatic(data);

        if (suppress_func_errors) {
            new_pkt->setSuppressFuncError();
        }

        // fixme: this should be cuList[cu_id] if cu_id != n_cu
        // The latter requires a memPort in the dispatcher
        cuList[0]->memPort[0]->sendFunctional(new_pkt);

        delete new_pkt;
        delete pkt;
    }
}

bool
Shader::busy()
{
    for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
        if (!cuList[i_cu]->isDone()) {
            return true;
        }
    }

    return false;
}

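// Register a delayed increment: *val += x once 'when' ticks have
// elapsed past the shader's current tick (tick_cnt). The pending adds
// are applied at the top of exec().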
void
Shader::ScheduleAdd(uint32_t *val, Tick when, int x)
{
    sa_val.push_back(val);
    sa_when.push_back(tick_cnt + when);
    sa_x.push_back(x);
    ++sa_n;
}

void
Shader::processTick()
{
    if (busy()) {
        exec();
        schedule(tickEvent, curTick() + ticks(1));
    }
}

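// Functional access to an arbitrarily sized buffer: the transfer is
// broken into cache-line-sized chunks by ChunkGenerator, and each
// chunk becomes one Request passed to doFunctionalAccess().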
void
Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                  MemCmd cmd, bool suppress_func_errors)
{
    uint8_t *data_buf = (uint8_t*)ptr;

    for (ChunkGenerator gen(address, size, cuList.at(cu_id)->cacheLineSize());
         !gen.done(); gen.next()) {

        RequestPtr req = std::make_shared<Request>(
            gen.addr(), gen.size(), 0,
            cuList[0]->masterId(), 0, 0, nullptr);

        doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
        data_buf += gen.size();
    }
}

void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
}

void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                 bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
              suppress_func_errors);
}

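// Example (hypothetical caller, not part of this file): functionally
// copy a small host buffer into simulated memory through CU 0, then
// read it back; 'shader' is assumed to be a Shader* and 'vaddr' a
// valid simulated virtual address:
//
//     uint32_t buf[4] = {0, 1, 2, 3};
//     shader->WriteMem(vaddr, buf, sizeof(buf), 0);
//     shader->ReadMem(vaddr, buf, sizeof(buf), 0);
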
/*
 * Send a packet through the appropriate TLB functional port.
 * If cu_id == n_cu, then this is the dispatcher's TLB.
 * Otherwise it's the TLB of the cu_id compute unit.
 */
void
Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
{
    // update senderState. Need to know the gpuTc and the TLB mode
    pkt->senderState =
        new TheISA::GpuTLB::TranslationState(mode, gpuTc, false);

    if (cu_id == n_cu) {
        dispatcher->tlbPort->sendFunctional(pkt);
    } else {
        // even when the perLaneTLB flag is turned on
        // it's ok to send all accesses through lane 0
        // since the lane # is not known here.
        // This isn't important since these are functional accesses.
        cuList[cu_id]->tlbPort[0]->sendFunctional(pkt);
    }

    /* safe_cast the senderState */
    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    delete sender_state->tlbEntry;
    delete pkt->senderState;
}