gem5  v20.0.0.2
wavefront.cc
/*
 * Copyright (c) 2011-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/wavefront.hh"

#include "debug/GPUExec.hh"
#include "debug/WavefrontStack.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"

Wavefront*
WavefrontParams::create()
{
    return new Wavefront(this);
}

Wavefront::Wavefront(const Params *p)
    : SimObject(p), callArgMem(nullptr), _gpuISA()
{
    lastTrace = 0;
    simdId = p->simdId;
    wfSlotId = p->wf_slot_id;
    status = S_STOPPED;
    reservedVectorRegs = 0;
    startVgprIndex = 0;
    outstandingReqs = 0;
    memReqsInPipe = 0;
    outstandingReqsWrGm = 0;
    outstandingReqsWrLm = 0;
    outstandingReqsRdGm = 0;
    outstandingReqsRdLm = 0;
    rdLmReqsInPipe = 0;
    rdGmReqsInPipe = 0;
    wrLmReqsInPipe = 0;
    wrGmReqsInPipe = 0;

    barrierCnt = 0;
    oldBarrierCnt = 0;
    stalledAtBarrier = false;

    memTraceBusy = 0;
    oldVgprTcnt = 0xffffffffffffffffll;
    oldDgprTcnt = 0xffffffffffffffffll;
    oldVgpr.resize(p->wfSize);

    pendingFetch = false;
    dropFetch = false;
    condRegState = new ConditionRegisterState();
    maxSpVgprs = 0;
    maxDpVgprs = 0;
    lastAddr.resize(p->wfSize);
    workItemFlatId.resize(p->wfSize);
    oldDgpr.resize(p->wfSize);
    barCnt.resize(p->wfSize);
    for (int i = 0; i < 3; ++i) {
        workItemId[i].resize(p->wfSize);
    }
}

void
Wavefront::regStats()
{
    SimObject::regStats();

    srcRegOpDist
        .init(0, 4, 2)
        .name(name() + ".src_reg_operand_dist")
        .desc("number of executed instructions with N source register operands")
        ;

    dstRegOpDist
        .init(0, 3, 2)
        .name(name() + ".dst_reg_operand_dist")
        .desc("number of executed instructions with N destination register "
              "operands")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueWAXDependencies
        .name(name() + ".timesBlockedDueWAXDependencies")
        .desc("number of times the wf's instructions are blocked due to WAW "
              "or WAR dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueRAWDependencies
        .name(name() + ".timesBlockedDueRAWDependencies")
        .desc("number of times the wf's instructions are blocked due to RAW "
              "dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueVrfPortAvail
        .name(name() + ".timesBlockedDueVrfPortAvail")
        .desc("number of times instructions are blocked due to VRF port "
              "availability")
        ;
}

void
Wavefront::init()
{
    reservedVectorRegs = 0;
    startVgprIndex = 0;
}

void
Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
{
    condRegState->init(num_cregs);
    maxSpVgprs = num_sregs;
    maxDpVgprs = num_dregs;
}

Wavefront::~Wavefront()
{
    if (callArgMem)
        delete callArgMem;
    delete condRegState;
}

void
Wavefront::start(uint64_t _wf_dyn_id, uint64_t _base_ptr)
{
    wfDynId = _wf_dyn_id;
    basePtr = _base_ptr;
    status = S_RUNNING;
}

bool
Wavefront::isGmInstruction(GPUDynInstPtr ii)
{
    if (ii->isGlobalMem() || ii->isFlat())
        return true;

    return false;
}

bool
Wavefront::isLmInstruction(GPUDynInstPtr ii)
{
    if (ii->isLocalMem()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstALU()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && (ii->isNop() ||
        ii->isReturn() || ii->isBranch() ||
        ii->isALU() || (ii->isKernArgSeg() && ii->isLoad()))) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstBarrier()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isBarrier()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstGMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isGlobalMem()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstLMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isLocalMem()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstPrivMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isPrivateSeg()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstFlatMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isFlat()) {
        return true;
    }

    return false;
}

// Return true if the Wavefront's instruction
// buffer contains a branch instruction.
bool
Wavefront::instructionBufferHasBranch()
{
    for (auto it : instructionBuffer) {
        GPUDynInstPtr ii = it;

        if (ii->isReturn() || ii->isBranch()) {
            return true;
        }
    }

    return false;
}

// Remap HSAIL register to physical VGPR.
// HSAIL register = virtual register assigned to an operand by HLC compiler
uint32_t
Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
{
    assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));
    // add the offset from where the VGPRs of the wavefront have been assigned
    uint32_t physicalVgprIndex = startVgprIndex + vgprIndex;
    // HSAIL double precision (DP) register: calculate the physical VGPR index
    // assuming that DP registers are placed after SP ones in the VRF. The DP
    // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
    // the DP VGPR index before mapping it to the physical VRF address space
    if (mode == 1 && size > 4) {
        physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
    }

    assert((startVgprIndex <= physicalVgprIndex) &&
           (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex);

    // calculate absolute physical VGPR index
    return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
}
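
// NOTE (editorial illustration, not part of the original source, using made-up
// values): with startVgprIndex = 32, maxSpVgprs = 8 and reservedVectorRegs = 24,
// remap() maps SP register 2 to physical VGPR 32 + 2 = 34, while in HSAIL mode
// (mode == 1) DP register 2 (size == 8) maps to 32 + 8 + 2 * 2 = 44. DP
// registers therefore sit after the SP registers inside the wavefront's
// reserved VGPR range, before the final modulo over the VRF size.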

// Return true if this wavefront is ready
// to execute an instruction of the specified type.
int
Wavefront::ready(itype_e type)
{
    // Check to make sure wave is running
    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return 0;
    }

    // Is the wave waiting at a barrier
    if (stalledAtBarrier) {
        if (!computeUnit->AllAtBarrier(barrierId, barrierCnt,
                        computeUnit->getRefCounter(dispatchId, wgId))) {
            // Are all threads at barrier?
            return 0;
        }
        oldBarrierCnt = barrierCnt;
        stalledAtBarrier = false;
    }

    // Read instruction
    GPUDynInstPtr ii = instructionBuffer.front();

    bool ready_inst M5_VAR_USED = false;
    bool glbMemBusRdy = false;
    bool glbMemIssueRdy = false;
    if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
        for (int j=0; j < computeUnit->numGlbMemUnits; ++j) {
            if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
                glbMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                glbMemIssueRdy = true;
        }
    }
    bool locMemBusRdy = false;
    bool locMemIssueRdy = false;
    if (type == I_SHARED || type == I_FLAT) {
        for (int j=0; j < computeUnit->numLocMemUnits; ++j) {
            if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
                locMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                locMemIssueRdy = true;
        }
    }

    // The following code is very error prone and the entire process for
    // checking readiness will be fixed eventually. In the meantime, let's
    // make sure that we do not silently let an instruction type slip
    // through this logic and always return not ready.
    if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
        ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
        ii->isMemFence() || ii->isFlat())) {
        panic("next instruction: %s is of unknown type\n", ii->disassemble());
    }

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
            computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());

    if (type == I_ALU && ii->isBarrier()) {
        // Here for ALU instruction (barrier)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        if ((outstandingReqs + memReqsInPipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->isNop()) {
        // Here for ALU instruction (nop)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->isReturn()) {
        // Here for ALU instruction (return)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        if ((outstandingReqs + memReqsInPipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && (ii->isBranch() ||
               ii->isALU() ||
               (ii->isKernArgSeg() && ii->isLoad()) ||
               ii->isArgSeg())) {
        // Here for ALU instruction (all others)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is alu slot free?
            return 0;
        }
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }

        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_GLOBAL && ii->isGlobalMem()) {
        // Here Global memory instruction
        if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
            // Are there in pipe or outstanding global memory write requests?
            if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
            // Are there in pipe or outstanding global memory read requests?
            if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (!glbMemIssueRdy) {
            // Is WV issue slot free?
            return 0;
        }

        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        // Does the coalescer have space for our instruction?
        if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_SHARED && ii->isLocalMem()) {
        // Here for Shared memory instruction
        if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
            if ((outstandingReqsWrLm + wrLmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
            if ((outstandingReqsRdLm + rdLmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }
        if (!locMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_FLAT && ii->isFlat()) {
        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }

        if (!glbMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!locMemIssueRdy) {
            return 0;
        }

        // Does the coalescer have space for our instruction?
        if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        // are all the operands ready? (RAW, WAW and WAR dependencies met?)
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else {
        return 0;
    }

    assert(ready_inst);

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
            simdId, wfSlotId, ii->disassemble());
    return 1;
}

void
Wavefront::updateResources()
{
    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();
    assert(ii);
    computeUnit->vrf[simdId]->updateResources(this, ii);
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
        ii->isReturn()) {
        computeUnit->aluPipe[simdId].preset(computeUnit->shader->
            ticks(computeUnit->spBypassLength()));
        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->isBarrier()) {
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isFlat()) {
        assert(Enums::SC_NONE != ii->executedAs());
        memReqsInPipe++;
        rdGmReqsInPipe++;
        if ( Enums::SC_SHARED == ii->executedAs() ) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isStore() && ii->isFlat()) {
        assert(Enums::SC_NONE != ii->executedAs());
        memReqsInPipe++;
        wrGmReqsInPipe++;
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isLoad() && ii->isGlobalMem()) {
        memReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isGlobalMem()) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isLocalMem()) {
        memReqsInPipe++;
        rdLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isLocalMem()) {
        memReqsInPipe++;
        wrLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
        memReqsInPipe++;
        wrLmReqsInPipe++;
        rdLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}

void
Wavefront::exec()
{
    // ---- Exit if wavefront is inactive ----------------------------- //

    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return;
    }

    // Get current instruction

    GPUDynInstPtr ii = instructionBuffer.front();

    const uint32_t old_pc = pc();
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
            "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            ii->disassemble(), old_pc);

    // update the instruction stats in the CU

    ii->execute(ii);
    computeUnit->updateInstStats(ii);
    // access the VRF
    computeUnit->vrf[simdId]->exec(ii, this);
    srcRegOpDist.sample(ii->numSrcRegOperands());
    dstRegOpDist.sample(ii->numDstRegOperands());
    computeUnit->numInstrExecuted++;
    computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
                                     computeUnit->lastExecCycle[simdId]);
    computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
    if (pc() == old_pc) {
        uint32_t new_pc = _gpuISA.advancePC(old_pc, ii);
        // PC not modified by instruction, proceed to next or pop frame
        pc(new_pc);
        if (new_pc == rpc()) {
            popFromReconvergenceStack();
            discardFetch();
        } else {
            instructionBuffer.pop_front();
        }
    } else {
        discardFetch();
    }

    if (computeUnit->shader->hsail_mode == Shader::SIMT) {
        const int num_active_lanes = execMask().count();
        computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
        computeUnit->numVecOpsExecuted += num_active_lanes;
        if (isGmInstruction(ii)) {
            computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
        } else if (isLmInstruction(ii)) {
            computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
        }
    }

    // ---- Update Vector ALU pipeline and other resources ------------------ //
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        (ii->isKernArgSeg() && ii->isLoad()) ||
        ii->isArgSeg() ||
        ii->isReturn()) {
        computeUnit->aluPipe[simdId].set(computeUnit->shader->
            ticks(computeUnit->spBypassLength()));

        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].set(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->isBarrier()) {
        computeUnit->wfWait[simdId].set(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isFlat()) {
        assert(Enums::SC_NONE != ii->executedAs());

        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isStore() && ii->isFlat()) {
        assert(Enums::SC_NONE != ii->executedAs());
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isLoad() && ii->isGlobalMem()) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isGlobalMem()) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isLocalMem()) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isLocalMem()) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}

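// NOTE (editorial, an assumption based on naming rather than on code shown in
// this file): updateResources() calls preset() on the same WaitClass resources
// (aluPipe, wfWait, vrfToGlobalMemPipeBus, vrfToLocalMemPipeBus) that exec()
// later updates with set(); presumably preset() reserves the resource when the
// instruction is scheduled while set() charges the latency when it actually
// executes. The WaitClass implementation itself is not part of wavefront.cc.
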
bool
Wavefront::waitingAtBarrier(int lane)
{
    return barCnt[lane] < maxBarCnt;
}

void
Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
                                    const VectorMask& mask)
{
    assert(mask.count());
    reconvergenceStack.emplace_back(new ReconvergenceStackEntry{pc, rpc, mask});
}

void
Wavefront::popFromReconvergenceStack()
{
    assert(!reconvergenceStack.empty());

    DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str(), pc());

    reconvergenceStack.pop_back();

    DPRINTF(WavefrontStack, "%3i %s\n", pc(),
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str());
}

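// NOTE (editorial illustration based on the code above and on exec()): each
// ReconvergenceStackEntry stores the pc of the path being executed, the rpc of
// its immediate post-dominator, and the execution mask for that path. Entries
// are pushed when control flow diverges (by the branch instruction
// implementations, which are not part of this file), and exec() pops an entry
// once the advanced pc reaches the stored rpc, restoring the parent path's pc
// and execution mask.
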
void
Wavefront::discardFetch()
{
    instructionBuffer.clear();
    dropFetch |= pendingFetch;
}

uint32_t
Wavefront::pc() const
{
    return reconvergenceStack.back()->pc;
}

uint32_t
Wavefront::rpc() const
{
    return reconvergenceStack.back()->rpc;
}

VectorMask
Wavefront::execMask() const
{
    return reconvergenceStack.back()->execMask;
}

bool
Wavefront::execMask(int lane) const
{
    return reconvergenceStack.back()->execMask[lane];
}

void
Wavefront::pc(uint32_t new_pc)
{
    reconvergenceStack.back()->pc = new_pc;
}

uint32_t
Wavefront::getStaticContextSize() const
{
    return barCnt.size() * sizeof(int) + sizeof(wfId) + sizeof(maxBarCnt) +
           sizeof(oldBarrierCnt) + sizeof(barrierCnt) + sizeof(wgId) +
           sizeof(computeUnit->cu_id) + sizeof(barrierId) + sizeof(initMask) +
           sizeof(privBase) + sizeof(spillBase) + sizeof(ldsChunk) +
           computeUnit->wfSize() * sizeof(ReconvergenceStackEntry);
}

void
Wavefront::getContext(const void *out)
{
    uint8_t *iter = (uint8_t *)out;
    for (int i = 0; i < barCnt.size(); i++) {
        *(int *)iter = barCnt[i]; iter += sizeof(barCnt[i]);
    }
    *(int *)iter = wfId; iter += sizeof(wfId);
    *(int *)iter = maxBarCnt; iter += sizeof(maxBarCnt);
    *(int *)iter = oldBarrierCnt; iter += sizeof(oldBarrierCnt);
    *(int *)iter = barrierCnt; iter += sizeof(barrierCnt);
    *(int *)iter = computeUnit->cu_id; iter += sizeof(computeUnit->cu_id);
    *(uint32_t *)iter = wgId; iter += sizeof(wgId);
    *(uint32_t *)iter = barrierId; iter += sizeof(barrierId);
    *(uint64_t *)iter = initMask.to_ullong(); iter += sizeof(initMask.to_ullong());
    *(Addr *)iter = privBase; iter += sizeof(privBase);
    *(Addr *)iter = spillBase; iter += sizeof(spillBase);

    int stackSize = reconvergenceStack.size();
    ReconvergenceStackEntry empty = {std::numeric_limits<uint32_t>::max(),
                                     std::numeric_limits<uint32_t>::max(),
                                     std::numeric_limits<uint64_t>::max()};
    for (int i = 0; i < workItemId[0].size(); i++) {
        if (i < stackSize) {
            *(ReconvergenceStackEntry *)iter = *reconvergenceStack.back();
            iter += sizeof(ReconvergenceStackEntry);
            reconvergenceStack.pop_back();
        } else {
            *(ReconvergenceStackEntry *)iter = empty;
            iter += sizeof(ReconvergenceStackEntry);
        }
    }

    int wf_size = computeUnit->wfSize();
    for (int i = 0; i < maxSpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = computeUnit->vrf[simdId]->
                read<uint32_t>(vgprIdx, lane);
            *(uint32_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    for (int i = 0; i < maxDpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = computeUnit->vrf[simdId]->
                read<uint64_t>(vgprIdx, lane);
            *(uint64_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    for (int i = 0; i < condRegState->numRegs(); i++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = condRegState->read<uint64_t>(i, lane);
            *(uint64_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    /* saving LDS content */
    if (ldsChunk)
        for (int i = 0; i < ldsChunk->size(); i++) {
            char val = ldsChunk->read<char>(i);
            *(char *) iter = val; iter += sizeof(val);
        }
}

void
Wavefront::setContext(const void *in)
{
    uint8_t *iter = (uint8_t *)in;
    for (int i = 0; i < barCnt.size(); i++) {
        barCnt[i] = *(int *)iter; iter += sizeof(barCnt[i]);
    }
    wfId = *(int *)iter; iter += sizeof(wfId);
    maxBarCnt = *(int *)iter; iter += sizeof(maxBarCnt);
    oldBarrierCnt = *(int *)iter; iter += sizeof(oldBarrierCnt);
    barrierCnt = *(int *)iter; iter += sizeof(barrierCnt);
    computeUnit->cu_id = *(int *)iter; iter += sizeof(computeUnit->cu_id);
    wgId = *(uint32_t *)iter; iter += sizeof(wgId);
    barrierId = *(uint32_t *)iter; iter += sizeof(barrierId);
    initMask = VectorMask(*(uint64_t *)iter); iter += sizeof(initMask);
    privBase = *(Addr *)iter; iter += sizeof(privBase);
    spillBase = *(Addr *)iter; iter += sizeof(spillBase);

    for (int i = 0; i < workItemId[0].size(); i++) {
        ReconvergenceStackEntry newEntry = *(ReconvergenceStackEntry *)iter;
        iter += sizeof(ReconvergenceStackEntry);
        if (newEntry.pc != std::numeric_limits<uint32_t>::max()) {
            pushToReconvergenceStack(newEntry.pc, newEntry.rpc,
                                     newEntry.execMask);
        }
    }
    int wf_size = computeUnit->wfSize();

    for (int i = 0; i < maxSpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = *(uint32_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint32_t>(vgprIdx, regVal, lane);
        }
    }

    for (int i = 0; i < maxDpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint64_t>(vgprIdx, regVal, lane);
        }
    }

    for (int i = 0; i < condRegState->numRegs(); i++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            condRegState->write<uint64_t>(i, lane, regVal);
        }
    }
    /* restoring LDS content */
    if (ldsChunk)
        for (int i = 0; i < ldsChunk->size(); i++) {
            char val = *(char *) iter; iter += sizeof(val);
            ldsChunk->write<char>(i, val);
        }
}

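// NOTE (editorial summary of the layout produced by getContext() and consumed
// by setContext() above): barCnt[0..wfSize), wfId, maxBarCnt, oldBarrierCnt,
// barrierCnt, cu_id, wgId, barrierId, initMask, privBase, spillBase, then
// wfSize() ReconvergenceStackEntry slots (unused slots filled with an "empty"
// sentinel entry), followed by the SP VGPRs, the DP VGPRs, the condition
// registers for every lane, and finally the raw LDS chunk contents.
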
void
Wavefront::computeActualWgSz(NDRange *ndr)
{
    actualWgSzTotal = 1;
    for (int d = 0; d < 3; ++d) {
        actualWgSz[d] = std::min(workGroupSz[d],
                                 gridSz[d] - ndr->wgId[d] * workGroupSz[d]);
        actualWgSzTotal *= actualWgSz[d];
    }
}