gem5 v22.1.0.0 — gpu-compute/compute_unit.hh
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __COMPUTE_UNIT_HH__
#define __COMPUTE_UNIT_HH__

#include <deque>
#include <map>
#include <unordered_set>
#include <vector>

#include "base/callback.hh"
#include "base/compiler.hh"
#include "base/statistics.hh"
#include "base/stats/group.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "enums/PrefetchType.hh"
#include "gpu-compute/comm.hh"
#include "gpu-compute/exec_stage.hh"
#include "gpu-compute/fetch_stage.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/local_memory_pipeline.hh"
#include "gpu-compute/misc.hh"
#include "gpu-compute/register_manager.hh"
#include "gpu-compute/scalar_memory_pipeline.hh"
#include "gpu-compute/schedule_stage.hh"
#include "gpu-compute/scoreboard_check_stage.hh"
#include "mem/port.hh"
#include "mem/token_port.hh"
#include "sim/clocked_object.hh"

namespace gem5
{

class HSAQueueEntry;
class LdsChunk;
class ScalarRegisterFile;
class Shader;
class VectorRegisterFile;

struct ComputeUnitParams;

enum EXEC_POLICY
{
    OLDEST = 0,
    RR
};

enum TLB_CACHE
{
    TLB_MISS_CACHE_MISS = 0,
    TLB_MISS_CACHE_HIT,
    TLB_HIT_CACHE_MISS,
    TLB_HIT_CACHE_HIT
};

/**
 * WF barrier slots. This represents the barrier resource for
 * WF-level barriers (i.e., barriers to sync WFs within a WG).
 */
class WFBarrier
{
  public:
    WFBarrier() : _numAtBarrier(0), _maxBarrierCnt(0)
    {
    }

    static const int InvalidID = -1;

    int
    numAtBarrier() const
    {
        return _numAtBarrier;
    }

    /**
     * Number of WFs that have not yet reached the barrier.
     */
    int
    numYetToReachBarrier() const
    {
        return _maxBarrierCnt - _numAtBarrier;
    }

    int
    maxBarrierCnt() const
    {
        return _maxBarrierCnt;
    }

    /**
     * Set the maximum barrier count (i.e., the number of WFs that are
     * participating in the barrier).
     */
    void
    setMaxBarrierCnt(int max_barrier_cnt)
    {
        _maxBarrierCnt = max_barrier_cnt;
    }

    /**
     * Mark that a WF has reached the barrier.
     */
    void
    incNumAtBarrier()
    {
        assert(_numAtBarrier < _maxBarrierCnt);
        ++_numAtBarrier;
    }

    /**
     * Have all WFs participating in this barrier reached the barrier?
     * If so, then the barrier is satisfied.
     */
    bool
    allAtBarrier() const
    {
        return _numAtBarrier == _maxBarrierCnt;
    }

    /**
     * Decrement the number of WFs that are participating in this
     * barrier.
     */
    void
    decMaxBarrierCnt()
    {
        assert(_maxBarrierCnt > 0);
        --_maxBarrierCnt;
    }

    /**
     * Release this barrier resource so it can be used by other WGs.
     */
    void
    release()
    {
        _numAtBarrier = 0;
        _maxBarrierCnt = 0;
    }

    /**
     * Reset the barrier.
     */
    void
    reset()
    {
        _numAtBarrier = 0;
    }

  private:
    /**
     * The number of WFs in the WG that have reached the barrier.
     */
    int _numAtBarrier;

    /**
     * The maximum number of WFs that can reach this barrier.
     */
    int _maxBarrierCnt;
};
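
/*
 * Illustrative sketch (not part of the original header): a WG with three
 * participating WFs would drive a WFBarrier slot roughly as follows,
 * assuming the CU invokes these hooks as each WF arrives:
 *
 *     WFBarrier bar;
 *     bar.setMaxBarrierCnt(3);      // three WFs participate
 *     bar.incNumAtBarrier();        // WF 0 arrives
 *     bar.incNumAtBarrier();        // WF 1 arrives
 *     assert(!bar.allAtBarrier());  // still waiting on WF 2
 *     bar.incNumAtBarrier();        // WF 2 arrives
 *     assert(bar.allAtBarrier());   // satisfied; WFs may proceed
 *     bar.reset();                  // re-arm for the WG's next barrier
 */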

class ComputeUnit : public ClockedObject
{
  public:

    // Execution resources
    //
    // The ordering of units is:
    // Vector ALUs
    // Scalar ALUs
    // GM Pipe
    // LM Pipe
    // Scalar Mem Pipe
    //
    // Note: the ordering of units is important and the code assumes the
    // above ordering. However, there may be more than one resource of
    // each type (e.g., 4 VALUs or 2 SALUs).

    // Resource control for global memory to VRF data/address bus
    WaitClass glbMemToVrfBus;
    // Resource control for Vector Register File->Global Memory pipe buses
    WaitClass vrfToGlobalMemPipeBus;
    // Resource control for Vector Global Memory execution unit
    WaitClass vectorGlobalMemUnit;

    // Resource control for local memory to VRF data/address bus
    WaitClass locMemToVrfBus;
    // Resource control for Vector Register File->Local Memory pipe buses
    WaitClass vrfToLocalMemPipeBus;
    // Resource control for Vector Shared/Local Memory execution unit
    WaitClass vectorSharedMemUnit;

    // Resource control for scalar memory to SRF data/address bus
    WaitClass scalarMemToSrfBus;
    // Resource control for Scalar Register File->Scalar Memory pipe buses
    WaitClass srfToScalarMemPipeBus;
    // Resource control for Scalar Memory execution unit
    WaitClass scalarMemUnit;

    // vector ALU execution resources
    int numVectorALUs;
    std::vector<WaitClass> vectorALUs;

    // scalar ALU execution resources
    int numScalarALUs;
    std::vector<WaitClass> scalarALUs;

    // Return total number of execution units on this CU
    int numExeUnits() const;
    // index into readyList of the first memory unit
    int firstMemUnit() const;
    // index into readyList of the last memory unit
    int lastMemUnit() const;
    // index into scalarALUs vector of SALU used by the wavefront
    int mapWaveToScalarAlu(Wavefront *w) const;
    // index into readyList of SALU used by wavefront
    int mapWaveToScalarAluGlobalIdx(Wavefront *w) const;
    // index into readyList of Global Memory unit used by wavefront
    int mapWaveToGlobalMem(Wavefront *w) const;
    // index into readyList of Local Memory unit used by wavefront
    int mapWaveToLocalMem(Wavefront *w) const;
    // index into readyList of Scalar Memory unit used by wavefront
    int mapWaveToScalarMem(Wavefront *w) const;
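
    // Illustrative sketch (not part of the original header): given the unit
    // ordering documented above, these mappings are presumably simple
    // offset/modulo computations, along the lines of:
    //
    //     firstMemUnit()        ~ numVectorALUs + numScalarALUs   // GM pipe
    //     lastMemUnit()         ~ firstMemUnit() + 2              // scalar mem
    //     numExeUnits()         ~ numVectorALUs + numScalarALUs + 3
    //     mapWaveToScalarAlu(w) ~ w->simdId % numScalarALUs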

    int vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
    int coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
    int numCyclesPerStoreTransfer; // number of cycles per vector store
    int numCyclesPerLoadTransfer; // number of cycles per vector load

    // track presence of dynamic instructions in the Schedule pipeline
    // stage. This is used to check the readiness of the oldest,
    // non-dispatched instruction of every WF in the Scoreboard stage.
    std::unordered_set<uint64_t> pipeMap;

    RegisterManager *registerManager;

    FetchStage fetchStage;
    ScoreboardCheckStage scoreboardCheckStage;
    ScheduleStage scheduleStage;
    ExecStage execStage;
    GlobalMemPipeline globalMemoryPipe;
    LocalMemPipeline localMemoryPipe;
    ScalarMemPipeline scalarMemoryPipe;

    EventFunctionWrapper tickEvent;

    typedef ComputeUnitParams Params;
    std::vector<std::vector<Wavefront*>> wfList;
    int cu_id;

    // array of vector register files, one per SIMD
    std::vector<VectorRegisterFile*> vrf;
    // array of scalar register files, one per SIMD
    std::vector<ScalarRegisterFile*> srf;

    // Width per VALU/SIMD unit: number of work items that can be executed
    // on the vector ALU simultaneously in a SIMD unit
    int simdWidth;
    // number of pipe stages for bypassing data to next dependent single
    // precision vector instruction inside the vector ALU pipeline
    int spBypassPipeLength;
    // number of pipe stages for bypassing data to next dependent double
    // precision vector instruction inside the vector ALU pipeline
    int dpBypassPipeLength;
    // number of pipe stages for scalar ALU
    int scalarPipeStages;
    // number of pipe stages for operand collection & distribution network
    int operandNetworkLength;
    // number of cycles per instruction issue period
    Cycles issuePeriod;

    // VRF to GM Bus latency
    Cycles vrf_gm_bus_latency;
    // SRF to Scalar Mem Bus latency
    Cycles srf_scm_bus_latency;
    // VRF to LM Bus latency
    Cycles vrf_lm_bus_latency;

    // tracks the last cycle a vector instruction was executed on a SIMD
    std::vector<uint64_t> lastExecCycle;

    // tracks the number of dyn insts executed per SIMD
    std::vector<uint64_t> instExecPerSimd;

    // true if we allow a separate TLB per lane
    bool perLaneTLB;
    // if 0, TLB prefetching is off.
    int prefetchDepth;
    // if fixed-stride prefetching, this is the stride.
    int prefetchStride;

    std::vector<Addr> lastVaddrCU;
    std::vector<std::vector<Addr>> lastVaddrSimd;
    std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
    enums::PrefetchType prefetchType;
    EXEC_POLICY exec_policy;

    bool debugSegFault;
    // Idle CU timeout in ticks
    Tick idleCUTimeout;
    int idleWfs;
    bool functionalTLB;
    bool localMemBarrier;

    /*
     * for counting page accesses
     */
    bool countPages;

    Shader *shader;

    Tick req_tick_latency;
    Tick resp_tick_latency;

    /**
     * Number of WFs to schedule to each SIMD.
     */
    std::vector<int> numWfsToSched;

    // number of currently reserved vector registers per SIMD unit
    std::vector<int> vectorRegsReserved;
    // number of currently reserved scalar registers per SIMD unit
    std::vector<int> scalarRegsReserved;
    // number of vector registers per SIMD unit
    int numVecRegsPerSimd;
    // number of available scalar registers per SIMD unit
    int numScalarRegsPerSimd;

    // this hash map will keep track of page divergence
    // per memory instruction per wavefront. The hash map
    // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
    std::map<Addr, int> pagesTouched;
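
    // Illustrative sketch (not part of the original header): for a vector
    // memory instruction whose 64 lanes straddle two pages,
    // updatePageDivergenceDist() would presumably leave pagesTouched
    // holding something like { 0x1000 -> 32, 0x2000 -> 32 }, i.e. a
    // per-page lane count that feeds the pageDivergenceDist stat.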

    void insertInPipeMap(Wavefront *w);
    void deleteFromPipeMap(Wavefront *w);

    ComputeUnit(const Params &p);
    ~ComputeUnit();

    // Timing Functions
    int oprNetPipeLength() const { return operandNetworkLength; }
    int simdUnitWidth() const { return simdWidth; }
    int spBypassLength() const { return spBypassPipeLength; }
    int dpBypassLength() const { return dpBypassPipeLength; }
    int scalarPipeLength() const { return scalarPipeStages; }
    int storeBusLength() const { return numCyclesPerStoreTransfer; }
    int loadBusLength() const { return numCyclesPerLoadTransfer; }
    int wfSize() const { return wavefrontSize; }

    void exec();
    void initiateFetch(Wavefront *wavefront);
    void fetch(PacketPtr pkt, Wavefront *wavefront);
    void fillKernelState(Wavefront *w, HSAQueueEntry *task);

    void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
                        HSAQueueEntry *task, int bar_id,
                        bool fetchContext=false);

    /** trigger invalidate operation in the CU */
    void doInvalidate(RequestPtr req, int kernId);
    /** trigger flush operation in the CU */
    void doFlush(GPUDynInstPtr gpuDynInst);

    void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg);
    bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg);

    int cacheLineSize() const { return _cacheLineSize; }
    int getCacheLineBits() const { return cacheLineBits; }

    void resetRegisterPool();

  private:
    WFBarrier&
    barrierSlot(int bar_id)
    {
        assert(bar_id > WFBarrier::InvalidID);
        return wfBarrierSlots.at(bar_id);
    }

    int
    getFreeBarrierId()
    {
        assert(freeBarrierIds.size());
        auto free_bar_id = freeBarrierIds.begin();
        int bar_id = *free_bar_id;
        freeBarrierIds.erase(free_bar_id);
        return bar_id;
    }

  public:
    int numYetToReachBarrier(int bar_id);
    bool allAtBarrier(int bar_id);
    void incNumAtBarrier(int bar_id);
    int numAtBarrier(int bar_id);
    int maxBarrierCnt(int bar_id);
    void resetBarrier(int bar_id);
    void decMaxBarrierCnt(int bar_id);
    void releaseBarrier(int bar_id);
    void releaseWFsFromBarrier(int bar_id);
    int numBarrierSlots() const { return _numBarrierSlots; }
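
    // Illustrative note (not part of the original header): these per-CU
    // barrier methods are presumably thin wrappers over the corresponding
    // WFBarrier slot, e.g. incNumAtBarrier(bar_id) amounts to
    //
    //     barrierSlot(bar_id).incNumAtBarrier();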

    template<typename c0, typename c1>
    void doSmReturn(GPUDynInstPtr gpuDynInst);

    virtual void init() override;
    void sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt);
    void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt);
    void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
                              bool kernelMemSync,
                              RequestPtr req=nullptr);
    void handleMemPacket(PacketPtr pkt, int memport_index);
    bool processTimingPacket(PacketPtr pkt);
    void processFetchReturn(PacketPtr pkt);
    void updatePageDivergenceDist(Addr addr);

    RequestorID requestorId() { return _requestorId; }
    /** Forward the VRAM requestor ID needed for device memory from shader */
    RequestorID vramRequestorId();

    bool isDone() const;
    bool isVectorAluIdle(uint32_t simdId) const;

    void handleSQCReturn(PacketPtr pkt);

  protected:
    RequestorID _requestorId;

    LdsState &lds;

  public:
    LdsState &
    getLds() const
    {
        return lds;
    }

    int32_t
    getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;

    /**
     * send a general request to the LDS; make sure to look at the return
     * value here as your request might be NACK'd
     */
    [[nodiscard]] bool sendToLds(GPUDynInstPtr gpuDynInst);

    typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
    pageDataStruct pageAccesses;

    void exitCallback();

    class GMTokenPort : public TokenRequestPort
    {
      public:
        GMTokenPort(const std::string& name, SimObject *owner,
                    PortID id = InvalidPortID)
            : TokenRequestPort(name, owner, id)
        { }
        ~GMTokenPort() { }

      protected:
        bool recvTimingResp(PacketPtr) { return false; }
        void recvReqRetry() { }
    };

    // Manager for the number of tokens available to this compute unit to
    // send global memory request packets to the coalescer. This is only
    // used between the global memory pipe and the TCP coalescer.
    TokenManager *memPortTokens;
    GMTokenPort gmTokenPort;
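
    // Illustrative sketch (not part of the original header):
    // TokenRequestPort (mem/token_port.hh) exposes haveTokens() and
    // acquireTokens(), so the GM pipe would typically gate an outgoing
    // request on something like:
    //
    //     if (gmTokenPort.haveTokens(token_count)) {
    //         gmTokenPort.acquireTokens(token_count);
    //         // ... issue the request to the coalescer ...
    //     }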

    class DataPort : public RequestPort
    {
      public:
        DataPort(const std::string &_name, ComputeUnit *_cu, PortID id)
            : RequestPort(_name, _cu, id), computeUnit(_cu) { }

        bool snoopRangeSent;

        struct SenderState : public Packet::SenderState
        {
            GPUDynInstPtr _gpuDynInst;
            PortID port_index;
            Packet::SenderState *saved;

            SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
                        Packet::SenderState *sender_state=nullptr)
                : _gpuDynInst(gpuDynInst),
                  port_index(_port_index),
                  saved(sender_state) { }
        };

        class SystemHubEvent : public Event
        {
            DataPort *dataPort;
            PacketPtr reqPkt;

          public:
            SystemHubEvent(PacketPtr pkt, DataPort *_dataPort)
                : dataPort(_dataPort), reqPkt(pkt)
            {
                setFlags(Event::AutoDelete);
            }

            void
            process()
            {
                // DMAs do not operate on packets and therefore do not
                // convert to a response. Do that here instead.
                reqPkt->makeResponse();
                dataPort->handleResponse(reqPkt);
            }
        };

        void processMemReqEvent(PacketPtr pkt);
        EventFunctionWrapper *createMemReqEvent(PacketPtr pkt);

        void processMemRespEvent(PacketPtr pkt);
        EventFunctionWrapper *createMemRespEvent(PacketPtr pkt);

        std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;

        bool handleResponse(PacketPtr pkt);

      protected:
        ComputeUnit *computeUnit;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }
    };

    // Scalar data cache access port
    class ScalarDataPort : public RequestPort
    {
      public:
        ScalarDataPort(const std::string &_name, ComputeUnit *_cu)
            : RequestPort(_name, _cu), computeUnit(_cu)
        {
        }

        bool recvTimingResp(PacketPtr pkt) override;
        void recvReqRetry() override;

        struct SenderState : public Packet::SenderState
        {
            SenderState(GPUDynInstPtr gpuDynInst,
                        Packet::SenderState *sender_state=nullptr)
                : _gpuDynInst(gpuDynInst), saved(sender_state)
            {
            }

            GPUDynInstPtr _gpuDynInst;
            Packet::SenderState *saved;
        };

        class MemReqEvent : public Event
        {
          private:
            ScalarDataPort &scalarDataPort;
            PacketPtr pkt;

          public:
            MemReqEvent(ScalarDataPort &_scalar_data_port, PacketPtr _pkt)
                : Event(), scalarDataPort(_scalar_data_port), pkt(_pkt)
            {
                setFlags(Event::AutoDelete);
            }

            void process();
            const char *description() const;
        };

        class SystemHubEvent : public Event
        {
            ScalarDataPort *dataPort;
            PacketPtr reqPkt;

          public:
            SystemHubEvent(PacketPtr pkt, ScalarDataPort *_dataPort)
                : dataPort(_dataPort), reqPkt(pkt)
            {
                setFlags(Event::AutoDelete);
            }

            void
            process()
            {
                // DMAs do not operate on packets and therefore do not
                // convert to a response. Do that here instead.
                reqPkt->makeResponse();
                dataPort->handleResponse(reqPkt);
            }
        };

        bool handleResponse(PacketPtr pkt);

        std::deque<PacketPtr> retries;

      private:
        ComputeUnit *computeUnit;
    };

    // Instruction cache access port
    class SQCPort : public RequestPort
    {
      public:
        SQCPort(const std::string &_name, ComputeUnit *_cu)
            : RequestPort(_name, _cu), computeUnit(_cu) { }

        bool snoopRangeSent;

        struct SenderState : public Packet::SenderState
        {
            Wavefront *wavefront;
            Packet::SenderState *saved;
            // kernel id to be used in handling I-Cache invalidate response
            int kernId;

            SenderState(Wavefront *_wavefront, Packet::SenderState
                        *sender_state=nullptr, int _kernId=-1)
                : wavefront(_wavefront), saved(sender_state),
                  kernId(_kernId){ }
        };

        std::deque<std::pair<PacketPtr, Wavefront*>> retries;

      protected:
        ComputeUnit *computeUnit;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }
    };

    class DTLBPort : public RequestPort
    {
      public:
        DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID id)
            : RequestPort(_name, _cu, id), computeUnit(_cu),
              stalled(false)
        { }

        bool isStalled() { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the translation requests that were
         * not successfully sent.
         */
        std::deque<PacketPtr> retries;

        /** SenderState is information carried along with the packet
         * throughout the TLB hierarchy
         */
        struct SenderState: public Packet::SenderState
        {
            // the memInst that this is associated with
            GPUDynInstPtr _gpuDynInst;

            // the lane in the memInst this is associated with, so we send
            // the memory request down the right port
            PortID portIndex;

            // constructor used for packets involved in timing accesses
            SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
                : _gpuDynInst(gpuDynInst), portIndex(port_index) { }
        };

      protected:
        ComputeUnit *computeUnit;
        bool stalled;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };

    class ScalarDTLBPort : public RequestPort
    {
      public:
        ScalarDTLBPort(const std::string &_name, ComputeUnit *_cu)
            : RequestPort(_name, _cu), computeUnit(_cu), stalled(false)
        {
        }

        struct SenderState : public Packet::SenderState
        {
            SenderState(GPUDynInstPtr gpuDynInst) : _gpuDynInst(gpuDynInst) { }
            GPUDynInstPtr _gpuDynInst;
        };

        bool recvTimingResp(PacketPtr pkt) override;
        void recvReqRetry() override { assert(false); }

        bool isStalled() const { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        std::deque<PacketPtr> retries;

      private:
        ComputeUnit *computeUnit;
        bool stalled;
    };

    class ITLBPort : public RequestPort
    {
      public:
        ITLBPort(const std::string &_name, ComputeUnit *_cu)
            : RequestPort(_name, _cu), computeUnit(_cu), stalled(false) { }

        bool isStalled() { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the translation requests that were
         * not successfully sent.
         */
        std::deque<PacketPtr> retries;

        /** SenderState is information carried along with the packet
         * throughout the TLB hierarchy
         */
        struct SenderState: public Packet::SenderState
        {
            // The wavefront associated with this request
            Wavefront *wavefront;

            SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
        };

      protected:
        ComputeUnit *computeUnit;
        bool stalled;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };

    /**
     * the port intended to communicate between the CU and its LDS
     */
    class LDSPort : public RequestPort
    {
      public:
        LDSPort(const std::string &_name, ComputeUnit *_cu)
            : RequestPort(_name, _cu), computeUnit(_cu)
        {
        }

        bool isStalled() const { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the requests that were
         * not successfully sent.
         */
        std::queue<PacketPtr> retries;

        /**
         * SenderState is information carried along with the packet, esp.
         * the GPUDynInstPtr
         */
        class SenderState : public Packet::SenderState
        {
          protected:
            // The actual read/write/atomic request that goes with this command
            GPUDynInstPtr _gpuDynInst = nullptr;

          public:
            SenderState(GPUDynInstPtr gpuDynInst):
                _gpuDynInst(gpuDynInst)
            {
            }

            GPUDynInstPtr
            getMemInst() const
            {
                return _gpuDynInst;
            }
        };

        /**
         * attempt to send this packet: either the port is already stalled,
         * the request is nack'd and must stall, or the request goes through
         */
        virtual bool
        sendTimingReq(PacketPtr pkt);

      protected:

        bool stalled = false; // whether or not it is stalled

        ComputeUnit *computeUnit;

        /** get the result of packets sent to the LDS when they return */
        virtual bool
        recvTimingResp(PacketPtr pkt);

        virtual Tick
        recvAtomic(PacketPtr pkt) { return 0; }

        virtual void
        recvFunctional(PacketPtr pkt)
        {
        }

        virtual void
        recvRangeChange()
        {
        }

        /**
         * the bus is telling the port that there is now space, so retrying
         * stalled requests should work now
         */
        virtual void
        recvReqRetry();
    };
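
    // Illustrative sketch (not part of the original header): the
    // sendTimingReq() override presumably implements the stall/retry
    // protocol described above along these lines:
    //
    //     if (stalled || !RequestPort::sendTimingReq(pkt)) {
    //         stallPort();
    //         retries.push(pkt);  // drained later from recvReqRetry()
    //         return false;
    //     }
    //     return true;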

    /**
     * The port to access the Local Data Store.
     * Can be connected to an LDS object.
     */
    LDSPort ldsPort;

    TokenManager *
    getTokenManager()
    {
        return memPortTokens;
    }

    /** The memory port for SIMD data accesses. */
    std::vector<DataPort> memPort;
    // port to the TLB hierarchy (i.e., the L1 TLB)
    std::vector<DTLBPort> tlbPort;
    // port to the scalar data cache
    ScalarDataPort scalarDataPort;
    // port to the scalar data TLB
    ScalarDTLBPort scalarDTLBPort;
    // port to the SQC (i.e. the I-cache)
    SQCPort sqcPort;
    // port to the SQC TLB (there's a separate TLB for each I-cache)
    ITLBPort sqcTLBPort;

    Port &
    getPort(const std::string &if_name, PortID idx) override
    {
        if (if_name == "memory_port" && idx < memPort.size()) {
            return memPort[idx];
        } else if (if_name == "translation_port" && idx < tlbPort.size()) {
            return tlbPort[idx];
        } else if (if_name == "scalar_port") {
            return scalarDataPort;
        } else if (if_name == "scalar_tlb_port") {
            return scalarDTLBPort;
        } else if (if_name == "sqc_port") {
            return sqcPort;
        } else if (if_name == "sqc_tlb_port") {
            return sqcTLBPort;
        } else if (if_name == "ldsPort") {
            return ldsPort;
        } else if (if_name == "gmTokenPort") {
            return gmTokenPort;
        } else {
            return ClockedObject::getPort(if_name, idx);
        }
    }
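
    // Illustrative note (not part of the original header): the names
    // checked above ("memory_port", "translation_port", "scalar_port",
    // "sqc_port", "ldsPort", "gmTokenPort", ...) are the Python-side port
    // names on the ComputeUnit SimObject; a config binds, e.g.,
    // cu.memory_port[i] to the cache hierarchy, and getPort() resolves
    // that binding to the corresponding C++ port instance.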

    InstSeqNum getAndIncSeqNum() { return globalSeqNum++; }

  private:
    const int _cacheLineSize;
    const int _numBarrierSlots;
    int cacheLineBits;
    InstSeqNum globalSeqNum;
    int wavefrontSize;

    /**
     * TODO: Update these comments once the pipe stage interface has
     * been fully refactored.
     *
     * Buffers used to communicate between the pipeline stages:
     * scoreboardCheckToSchedule is the communication interface between
     * the ScoreboardCheck and Schedule stages, and scheduleToExecute is
     * the communication interface between the Schedule and Execute
     * stages (see comm.hh).
     */
    ScoreboardCheckToSchedule scoreboardCheckToSchedule;
    ScheduleToExecute scheduleToExecute;

    /**
     * The barrier slots for this CU.
     */
    std::vector<WFBarrier> wfBarrierSlots;
    /**
     * A set used to easily retrieve a free barrier ID.
     */
    std::unordered_set<int> freeBarrierIds;

    // hold the time of the arrival of the first cache block related to
    // a particular GPUDynInst. This is used to calculate the difference
    // between the first and last cache block arrival times.
    std::unordered_map<GPUDynInstPtr, Tick> headTailMap;

  public:
    void updateInstStats(GPUDynInstPtr gpuDynInst);
    int activeWaves;

    struct ComputeUnitStats : public statistics::Group
    {
        ComputeUnitStats(statistics::Group *parent, int n_wf);

        statistics::Scalar vALUInsts;
        statistics::Formula vALUInstsPerWF;
        statistics::Scalar sALUInsts;
        statistics::Formula sALUInstsPerWF;
        statistics::Scalar instCyclesVALU;
        statistics::Scalar instCyclesSALU;
        statistics::Scalar threadCyclesVALU;
        statistics::Formula vALUUtilization;
        statistics::Scalar ldsNoFlatInsts;
        statistics::Formula ldsNoFlatInstsPerWF;
        statistics::Scalar flatVMemInsts;
        statistics::Formula flatVMemInstsPerWF;
        statistics::Scalar flatLDSInsts;
        statistics::Formula flatLDSInstsPerWF;
        statistics::Scalar vectorMemWrites;
        statistics::Formula vectorMemWritesPerWF;
        statistics::Scalar vectorMemReads;
        statistics::Formula vectorMemReadsPerWF;
        statistics::Scalar scalarMemWrites;
        statistics::Formula scalarMemWritesPerWF;
        statistics::Scalar scalarMemReads;
        statistics::Formula scalarMemReadsPerWF;

        statistics::Formula vectorMemReadsPerKiloInst;
        statistics::Formula vectorMemWritesPerKiloInst;
        statistics::Formula vectorMemInstsPerKiloInst;
        statistics::Formula scalarMemReadsPerKiloInst;
        statistics::Formula scalarMemWritesPerKiloInst;
        statistics::Formula scalarMemInstsPerKiloInst;

        // Cycles required to send register source (addr and data) from
        // register files to memory pipeline, per SIMD.
        statistics::Vector instCyclesVMemPerSimd;
        statistics::Vector instCyclesScMemPerSimd;
        statistics::Vector instCyclesLdsPerSimd;

        statistics::Scalar globalReads;
        statistics::Scalar globalWrites;
        statistics::Formula globalMemInsts;
        statistics::Scalar argReads;
        statistics::Scalar argWrites;
        statistics::Formula argMemInsts;
        statistics::Scalar spillReads;
        statistics::Scalar spillWrites;
        statistics::Formula spillMemInsts;
        statistics::Scalar groupReads;
        statistics::Scalar groupWrites;
        statistics::Formula groupMemInsts;
        statistics::Scalar privReads;
        statistics::Scalar privWrites;
        statistics::Formula privMemInsts;
        statistics::Scalar readonlyReads;
        statistics::Scalar readonlyWrites;
        statistics::Formula readonlyMemInsts;
        statistics::Scalar kernargReads;
        statistics::Scalar kernargWrites;
        statistics::Formula kernargMemInsts;

        statistics::Distribution waveLevelParallelism;

        // the following stats compute the avg. TLB access latency per
        // uncoalesced request (only for data)
        statistics::Scalar tlbRequests;
        statistics::Scalar tlbCycles;
        statistics::Formula tlbLatency;
        // hitsPerTLBLevel[x] are the hits in Level x TLB.
        // x = 0 is the page table.
        statistics::Vector hitsPerTLBLevel;

        statistics::Scalar ldsBankAccesses;
        statistics::Distribution ldsBankConflictDist;

        // over all memory instructions executed over all wavefronts
        // how many touched 0-4 pages, 4-8, ..., 60-64 pages
        statistics::Distribution pageDivergenceDist;
        // count of non-flat global memory vector instructions executed
        statistics::Scalar dynamicGMemInstrCnt;
        // count of flat global memory vector instructions executed
        statistics::Scalar dynamicFlatMemInstrCnt;
        statistics::Scalar dynamicLMemInstrCnt;

        statistics::Scalar wgBlockedDueBarrierAllocation;
        statistics::Scalar wgBlockedDueLdsAllocation;
        // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
        // active when the instruction is committed, this number is still
        // incremented by 1
        statistics::Scalar numInstrExecuted;
        // Number of cycles among successive instruction executions across all
        // wavefronts of the same CU
        statistics::Distribution execRateDist;
        // number of individual vector operations executed
        statistics::Scalar numVecOpsExecuted;
        // number of individual f16 vector operations executed
        statistics::Scalar numVecOpsExecutedF16;
        // number of individual f32 vector operations executed
        statistics::Scalar numVecOpsExecutedF32;
        // number of individual f64 vector operations executed
        statistics::Scalar numVecOpsExecutedF64;
        // number of individual FMA 16,32,64 vector operations executed
        statistics::Scalar numVecOpsExecutedFMA16;
        statistics::Scalar numVecOpsExecutedFMA32;
        statistics::Scalar numVecOpsExecutedFMA64;
        // number of individual MAC 16,32,64 vector operations executed
        statistics::Scalar numVecOpsExecutedMAC16;
        statistics::Scalar numVecOpsExecutedMAC32;
        statistics::Scalar numVecOpsExecutedMAC64;
        // number of individual MAD 16,32,64 vector operations executed
        statistics::Scalar numVecOpsExecutedMAD16;
        statistics::Scalar numVecOpsExecutedMAD32;
        statistics::Scalar numVecOpsExecutedMAD64;
        // total number of two op FP vector operations executed
        statistics::Scalar numVecOpsExecutedTwoOpFP;
        // Total cycles that something is running on the GPU
        statistics::Scalar totalCycles;
        statistics::Formula vpc; // vector ops per cycle
        statistics::Formula vpc_f16; // vector ops per cycle
        statistics::Formula vpc_f32; // vector ops per cycle
        statistics::Formula vpc_f64; // vector ops per cycle
        statistics::Formula ipc; // vector instructions per cycle
        statistics::Distribution controlFlowDivergenceDist;
        statistics::Distribution activeLanesPerGMemInstrDist;
        statistics::Distribution activeLanesPerLMemInstrDist;
        // number of vector ALU instructions received
        statistics::Formula numALUInstsExecuted;
        // number of times a WG cannot start due to lack of free VGPRs in SIMDs
        statistics::Scalar numTimesWgBlockedDueVgprAlloc;
        // number of times a WG cannot start due to lack of free SGPRs in SIMDs
        statistics::Scalar numTimesWgBlockedDueSgprAlloc;
        statistics::Scalar numCASOps;
        statistics::Scalar numFailedCASOps;
        statistics::Scalar completedWfs;
        statistics::Scalar completedWGs;

        // distribution in latency difference between first and last cache
        // block arrival ticks
        statistics::Distribution headTailLatency;

        // Track the amount of interleaving between wavefronts on each SIMD.
        // This stat is sampled using instExecPerSimd to compute the number
        // of instructions that have been executed on a SIMD between a WF
        // executing two successive instructions.
        statistics::VectorDistribution instInterleave;
    } stats;
};
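
// Illustrative note (not part of the original header): the per-cycle
// formulas in ComputeUnitStats are presumably wired up in the stats
// constructor roughly as:
//
//     vpc = numVecOpsExecuted / totalCycles;
//     ipc = numInstrExecuted / totalCycles;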

} // namespace gem5

#endif // __COMPUTE_UNIT_HH__