gem5 [DEVELOP-FOR-25.0]
Loading...
Searching...
No Matches
compute_unit.hh
Go to the documentation of this file.
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#ifndef __COMPUTE_UNIT_HH__
33#define __COMPUTE_UNIT_HH__
34
35#include <deque>
36#include <map>
37#include <unordered_set>
38#include <vector>
39
40#include "base/callback.hh"
41#include "base/compiler.hh"
42#include "base/statistics.hh"
43#include "base/stats/group.hh"
44#include "base/types.hh"
45#include "config/the_gpu_isa.hh"
46#include "enums/GfxVersion.hh"
47#include "enums/PrefetchType.hh"
48#include "gpu-compute/comm.hh"
58#include "mem/port.hh"
59#include "mem/token_port.hh"
60#include "sim/clocked_object.hh"
61
62namespace gem5
63{
64
65class HSAQueueEntry;
66class LdsChunk;
68class Shader;
71
72struct ComputeUnitParams;
73
75{
76 OLDEST = 0,
78};
79
87
93{
94 public:
98
99 static const int InvalidID = -1;
100
101 int
103 {
104 return _numAtBarrier;
105 }
106
110 int
112 {
114 }
115
116 int
118 {
119 return _maxBarrierCnt;
120 }
121
// Set the maximum barrier count, i.e., the number of WFs that are
// participating in this barrier (the original Doxygen comment block for
// this setter was lost in extraction; intent confirmed by the member's
// own documentation: _maxBarrierCnt is "the maximum number of WFs that
// can reach this barrier").
126 void
127 setMaxBarrierCnt(int max_barrier_cnt)
128 {
129 _maxBarrierCnt = max_barrier_cnt;
130 }
131
135 void
137 {
140 }
141
147 bool
149 {
151 }
152
157 void
159 {
160 assert(_maxBarrierCnt > 0);
162 }
163
168 void
170 {
171 _numAtBarrier = 0;
172 _maxBarrierCnt = 0;
173 }
174
179 void
181 {
182 _numAtBarrier = 0;
183 }
184
185 private:
192
201};
202
204{
205 public:
206
207
208 // Execution resources
209 //
210 // The ordering of units is:
211 // Vector ALUs
212 // Scalar ALUs
213 // GM Pipe
214 // LM Pipe
215 // Scalar Mem Pipe
216 //
217 // Note: the ordering of units is important and the code assumes the
218 // above ordering. However, there may be more than one resource of
219 // each type (e.g., 4 VALUs or 2 SALUs)
220
222 // Resource control for global memory to VRF data/address bus
224 // Resource control for Vector Register File->Global Memory pipe buses
226 // Resource control for Vector Global Memory execution unit
228
230 // Resource control for local memory to VRF data/address bus
232 // Resource control for Vector Register File->Local Memory pipe buses
234 // Resource control for Vector Shared/Local Memory execution unit
236
238 // Resource control for scalar memory to SRF data/address bus
240 // Resource control for Scalar Register File->Scalar Memory pipe buses
242 // Resource control for Scalar Memory execution unit
244
245 // vector ALU execution resources
248
249 // scalar ALU execution resources
252
253 // Return total number of execution units on this CU
254 int numExeUnits() const;
255 // index into readyList of the first memory unit
256 int firstMemUnit() const;
257 // index into readyList of the last memory unit
258 int lastMemUnit() const;
259 // index into scalarALUs vector of SALU used by the wavefront
260 int mapWaveToScalarAlu(Wavefront *w) const;
261 // index into readyList of SALU used by wavefront
263 // index into readyList of Global Memory unit used by wavefront
264 int mapWaveToGlobalMem(Wavefront *w) const;
265 // index into readyList of Local Memory unit used by wavefront
266 int mapWaveToLocalMem(Wavefront *w) const;
267 // index into readyList of Scalar Memory unit used by wavefront
268 int mapWaveToScalarMem(Wavefront *w) const;
269
270 int vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
271 int coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
272 int numCyclesPerStoreTransfer; // number of cycles per vector store
273 int numCyclesPerLoadTransfer; // number of cycles per vector load
274
275 // track presence of dynamic instructions in the Schedule pipeline
276 // stage. This is used to check the readiness of the oldest,
277 // non-dispatched instruction of every WF in the Scoreboard stage.
278 std::unordered_set<uint64_t> pipeMap;
279
281
289
291
292 typedef ComputeUnitParams Params;
294 int cu_id;
295
296 // array of vector register files, one per SIMD
298 // array of scalar register files, one per SIMD
300
302
303 // Width per VALU/SIMD unit: number of work items that can be executed
304 // on the vector ALU simultaneously in a SIMD unit
306 // number of pipe stages for bypassing data to next dependent single
307 // precision vector instruction inside the vector ALU pipeline
309 // number of pipe stages for bypassing data to next dependent double
310 // precision vector instruction inside the vector ALU pipeline
312 // number of pipe stages for register file cache
314 // number of pipe stages for scalar ALU
316 // number of pipe stages for operand collection & distribution network
318 // number of cycles per instruction issue period
320
321 // VRF to GM Bus latency
323 // SRF to Scalar Mem Bus latency
325 // VRF to LM Bus latency
327
328 // tracks the last cycle a vector instruction was executed on a SIMD
330
331 // tracks the number of dyn inst executed per SIMD
333
334 // true if we allow a separate TLB per lane
336 // if 0, TLB prefetching is off.
338 // if fixed-stride prefetching, this is the stride.
340
344 enums::PrefetchType prefetchType;
346
348 // Idle CU timeout in ticks
353
354 /*
355 * for Counting page accesses
356 */
358
360
365
368
369 // Keeps track of mfma instructions occupying matrix core engine per SM
371
379
380 // number of currently reserved vector registers per SIMD unit
382 // number of currently reserved scalar registers per SIMD unit
384 // number of vector registers per SIMD unit
386 // number of available scalar registers per SIMD unit
388
389 // this hash map will keep track of page divergence
390 // per memory instruction per wavefront. The hash map
391 // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
392 std::map<Addr, int> pagesTouched;
393
394 // get cycles for mfma instructions based on their mnemonic
395 std::map<GfxVersion, std::map<std::string, int>> mfma_cycles;
396
399
400 ComputeUnit(const Params &p);
401 ~ComputeUnit();
402
403 // Timing Functions
// Read-only accessors for the CU's pipeline configuration parameters.
// Each simply forwards the corresponding member set at construction:
// SIMD lane width, single-/double-precision bypass pipe depths, register
// file cache pipe depth, scalar ALU pipe depth, and wavefront size.
405 int simdUnitWidth() const { return simdWidth; }
406 int spBypassLength() const { return spBypassPipeLength; }
407 int dpBypassLength() const { return dpBypassPipeLength; }
408 int rfcLength() const { return rfcPipeLength; }
409 int scalarPipeLength() const { return scalarPipeStages; }
412 int wfSize() const { return wavefrontSize; }
413
414 void exec();
415 void initiateFetch(Wavefront *wavefront);
416 void fetch(PacketPtr pkt, Wavefront *wavefront);
418
419 void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
420 HSAQueueEntry *task, int bar_id,
421 bool fetchContext=false);
422
423 void doInvalidate(RequestPtr req, int kernId);
424 void doFlush(GPUDynInstPtr gpuDynInst);
425 void doSQCInvalidate(RequestPtr req, int kernId);
426
427 void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg);
428 bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg);
429
430 int cacheLineSize() const { return _cacheLineSize; }
431 int getCacheLineBits() const { return cacheLineBits; }
432
433 void resetRegisterPool();
434
435 private:
// Return a reference to the WF barrier slot with the given ID.
// The assert rejects WFBarrier::InvalidID (-1); .at() additionally
// bounds-checks bar_id against the wfBarrierSlots vector, so an
// out-of-range ID throws rather than corrupting memory.
436 WFBarrier&
437 barrierSlot(int bar_id)
438 {
439 assert(bar_id > WFBarrier::InvalidID);
440 return wfBarrierSlots.at(bar_id);
441 }
442
443 int
445 {
446 assert(freeBarrierIds.size());
447 auto free_bar_id = freeBarrierIds.begin();
448 int bar_id = *free_bar_id;
449 freeBarrierIds.erase(free_bar_id);
450 return bar_id;
451 }
452
453 public:
454 int numYetToReachBarrier(int bar_id);
455 bool allAtBarrier(int bar_id);
456 void incNumAtBarrier(int bar_id);
457 int numAtBarrier(int bar_id);
458 int maxBarrierCnt(int bar_id);
459 void resetBarrier(int bar_id);
460 void decMaxBarrierCnt(int bar_id);
461 void releaseBarrier(int bar_id);
462 void releaseWFsFromBarrier(int bar_id);
463 int numBarrierSlots() const { return _numBarrierSlots; }
464
465 template<typename c0, typename c1>
466 void doSmReturn(GPUDynInstPtr gpuDynInst);
467
468 virtual void init() override;
469 void sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt);
470 void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt);
471 void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
472 bool kernelMemSync,
473 RequestPtr req=nullptr);
474 void handleMemPacket(PacketPtr pkt, int memport_index);
478
481
482 bool isDone() const;
483 bool isVectorAluIdle(uint32_t simdId) const;
484
485 void handleSQCReturn(PacketPtr pkt);
486
487 void sendInvL2(Addr paddr);
488
489 void printProgress();
490
491 protected:
493
495
496 public:
497 LdsState &
498 getLds() const
499 {
500 return lds;
501 }
502
503 int32_t
504 getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
505
506 [[nodiscard]] bool sendToLds(GPUDynInstPtr gpuDynInst);
507
508 typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
510
511 void exitCallback();
512
514 {
515 public:
516 GMTokenPort(const std::string& name, SimObject *owner,
519 { }
521
522 protected:
523 bool recvTimingResp(PacketPtr) { return false; }
524 void recvReqRetry() { }
525 };
526
527 // Manager for the number of tokens available to this compute unit to
528 // send global memory request packets to the coalescer this is only used
529 // between global memory pipe and TCP coalescer.
532
534 class DataPort : public RequestPort
535 {
536 public:
537 DataPort(const std::string &_name, ComputeUnit *_cu, PortID id)
538 : RequestPort(_name, id), computeUnit(_cu) { }
539
541
543 {
548
549 SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
550 Packet::SenderState *sender_state=nullptr)
551 : _gpuDynInst(gpuDynInst),
552 port_index(_port_index),
553 saved(sender_state) { }
554
556 Packet::SenderState *sender_state=nullptr)
557 : computeUnit(cu),
558 port_index(_port_index),
559 saved(sender_state) { }
560 };
561
562 class SystemHubEvent : public Event
563 {
566
567 public:
569 : dataPort(_dataPort), reqPkt(pkt)
570 {
572 }
573
574 void
576 {
577 // DMAs do not operate on packets and therefore do not
578 // convert to a response. Do that here instead.
579 reqPkt->makeResponse();
580 dataPort->handleResponse(reqPkt);
581 }
582 };
583
586
589
591
592 bool handleResponse(PacketPtr pkt);
593
594 protected:
596
597 virtual bool recvTimingResp(PacketPtr pkt);
598 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
599 virtual void recvFunctional(PacketPtr pkt) { }
600 virtual void recvRangeChange() { }
601 virtual void recvReqRetry();
602
603 virtual void
605 {
606 resp.clear();
607 snoop = true;
608 }
609
610 };
611
612 // Scalar data cache access port
614 {
615 public:
616 ScalarDataPort(const std::string &_name, ComputeUnit *_cu)
618 {
619 }
620
621 bool recvTimingResp(PacketPtr pkt) override;
622 void recvReqRetry() override;
623
625 {
627 Packet::SenderState *sender_state=nullptr)
628 : _gpuDynInst(gpuDynInst), saved(sender_state)
629 {
630 }
631
634 };
635
636 class MemReqEvent : public Event
637 {
638 private:
641
642 public:
643 MemReqEvent(ScalarDataPort &_scalar_data_port, PacketPtr _pkt)
644 : Event(), scalarDataPort(_scalar_data_port), pkt(_pkt)
645 {
647 }
648
649 void process();
650 const char *description() const;
651 };
652
653 class SystemHubEvent : public Event
654 {
657
658 public:
660 : dataPort(_dataPort), reqPkt(pkt)
661 {
663 }
664
665 void
667 {
668 // DMAs do not operate on packets and therefore do not
669 // convert to a response. Do that here instead.
670 reqPkt->makeResponse();
671 dataPort->handleResponse(reqPkt);
672 }
673 };
674
675 bool handleResponse(PacketPtr pkt);
676
678
679 private:
681 };
682
683 // Instruction cache access port
684 class SQCPort : public RequestPort
685 {
686 public:
687 SQCPort(const std::string &_name, ComputeUnit *_cu)
688 : RequestPort(_name), computeUnit(_cu) { }
689
691
693 {
694 enum : int
695 {
699 };
700
703 // kernel id to be used in handling I-Cache invalidate response
708 *sender_state=nullptr, int _kernId=-1)
709 : wavefront(_wavefront), saved(sender_state),
710 kernId(_kernId), isKernDispatch(false){ }
711
712 SenderState(Wavefront *_wavefront, bool _isKernDispatch,
713 Packet::SenderState *sender_state=nullptr, int _kernId=-1)
714 : wavefront(_wavefront), saved(sender_state),
715 kernId(_kernId), isKernDispatch(_isKernDispatch){ }
716
717 };
718
719 class MemReqEvent : public Event
720 {
721 private:
724
725 public:
726 MemReqEvent(SQCPort &_sqc_port, PacketPtr _pkt)
727 : Event(), sqcPort(_sqc_port), pkt(_pkt)
728 {
730 }
731
732 void process();
733 const char *description() const;
734 };
735
737
738 protected:
740
741 virtual bool recvTimingResp(PacketPtr pkt);
742 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
743 virtual void recvFunctional(PacketPtr pkt) { }
744 virtual void recvRangeChange() { }
745 virtual void recvReqRetry();
746
747 virtual void
749 {
750 resp.clear();
751 snoop = true;
752 }
753 };
754
756 class DTLBPort : public RequestPort
757 {
758 public:
759 DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID id)
761 stalled(false)
762 { }
763
// Stall-flag management for the data TLB port: the port is stalled when
// a translation request could not be sent and is queued in `retries`;
// unstallPort() is presumably called once the peer signals retry — the
// retry logic itself lives outside this view, so confirm against
// compute_unit.cc.
764 bool isStalled() { return stalled; }
765 void stallPort() { stalled = true; }
766 void unstallPort() { stalled = false; }
767
773
778 {
779 // the memInst that this is associated with
781
782 // the lane in the memInst this is associated with, so we send
783 // the memory request down the right port
785
786 // constructor used for packets involved in timing accesses
787 SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
788 : _gpuDynInst(gpuDynInst), portIndex(port_index) { }
789
790 };
791
792 protected:
795
796 virtual bool recvTimingResp(PacketPtr pkt);
797 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
798 virtual void recvFunctional(PacketPtr pkt) { }
799 virtual void recvRangeChange() { }
800 virtual void recvReqRetry();
801 };
802
804 {
805 public:
806 ScalarDTLBPort(const std::string &_name, ComputeUnit *_cu)
807 : RequestPort(_name), computeUnit(_cu), stalled(false)
808 {
809 }
810
812 {
813 SenderState(GPUDynInstPtr gpuDynInst) : _gpuDynInst(gpuDynInst) { }
815 };
816
817 bool recvTimingResp(PacketPtr pkt) override;
818 void recvReqRetry() override { assert(false); }
819
820 bool isStalled() const { return stalled; }
821 void stallPort() { stalled = true; }
822 void unstallPort() { stalled = false; }
823
825
826 private:
829 };
830
831 class ITLBPort : public RequestPort
832 {
833 public:
834 ITLBPort(const std::string &_name, ComputeUnit *_cu)
835 : RequestPort(_name), computeUnit(_cu), stalled(false) { }
836
837
838 bool isStalled() { return stalled; }
839 void stallPort() { stalled = true; }
840 void unstallPort() { stalled = false; }
841
847
852 {
853 // The wavefront associated with this request
855
856 SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
857 };
858
859 protected:
862
863 virtual bool recvTimingResp(PacketPtr pkt);
864 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
865 virtual void recvFunctional(PacketPtr pkt) { }
866 virtual void recvRangeChange() { }
867 virtual void recvReqRetry();
868 };
869
873 class LDSPort : public RequestPort
874 {
875 public:
876 LDSPort(const std::string &_name, ComputeUnit *_cu)
878 {
879 }
880
881 bool isStalled() const { return stalled; }
882 void stallPort() { stalled = true; }
883 void unstallPort() { stalled = false; }
884
889 std::queue<PacketPtr> retries;
890
896 {
897 protected:
898 // The actual read/write/atomic request that goes with this command
900
901 public:
903 _gpuDynInst(gpuDynInst)
904 {
905 }
906
909 {
910 return _gpuDynInst;
911 }
912 };
913
914 virtual bool
916
917 protected:
918
919 bool stalled = false;
920
922
923 virtual bool
925
926 virtual Tick
927 recvAtomic(PacketPtr pkt) { return 0; }
928
929 virtual void
931 {
932 }
933
934 virtual void
936 {
937 }
938
939 virtual void
940 recvReqRetry();
941 };
942
947
950 {
951 return memPortTokens;
952 }
953
958 // port to the TLB hierarchy (i.e., the L1 TLB)
960 // port to the scalar data cache
962 // port to the scalar data TLB
964 // port to the SQC (i.e. the I-cache)
966 // port to the SQC TLB (there's a separate TLB for each I-cache)
968
// Look up one of this CU's ports by the Python-visible interface name.
// Vector ports ("memory_port", "translation_port") are additionally
// index-checked; note that an out-of-range idx for those names falls
// through the chain and ends up in the ClockedObject::getPort fallback,
// which is also the path for any unrecognized name.
969 Port &
970 getPort(const std::string &if_name, PortID idx) override
971 {
972 if (if_name == "memory_port" && idx < memPort.size()) {
973 return memPort[idx];
974 } else if (if_name == "translation_port" && idx < tlbPort.size()) {
975 return tlbPort[idx];
976 } else if (if_name == "scalar_port") {
977 return scalarDataPort;
978 } else if (if_name == "scalar_tlb_port") {
979 return scalarDTLBPort;
980 } else if (if_name == "sqc_port") {
981 return sqcPort;
982 } else if (if_name == "sqc_tlb_port") {
983 return sqcTLBPort;
984 } else if (if_name == "ldsPort") {
985 return ldsPort;
986 } else if (if_name == "gmTokenPort") {
987 return gmTokenPort;
988 } else {
989 return ClockedObject::getPort(if_name, idx);
990 }
991 }
992
994
995 private:
996 const int _cacheLineSize;
1001 uint64_t execCycles;
1002
1037
1045 std::unordered_set<int> freeBarrierIds;
1046
1047 // hold the time of the arrival of the first cache block related to
1048 // a particular GPUDynInst. This is used to calculate the difference
1049 // between the first and last cache block arrival times.
1050 std::unordered_map<GPUDynInstPtr, Tick> headTailMap;
1051
1052 public:
1053 void updateInstStats(GPUDynInstPtr gpuDynInst);
1055
1057 {
1058 ComputeUnitStats(statistics::Group *parent, int n_wf);
1059
1082
1089
1090 // Cycles required to send register source (addr and data) from
1091 // register files to memory pipeline, per SIMD.
1095
1117
1119
1120 // the following stats compute the avg. TLB access latency per
1121 // uncoalesced request (only for data)
1125 // hitsPerTLBLevel[x] are the hits in Level x TLB.
1126 // x = 0 is the page table.
1128
1131
1132 // over all memory instructions executed over all wavefronts
1133 // how many touched 0-4 pages, 4-8, ..., 60-64 pages
1135 // count of non-flat global memory vector instructions executed
1137 // count of flat global memory vector instructions executed
1140
1143 // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
1144 // active when the instruction is committed, this number is still
1145 // incremented by 1
1147 // Number of cycles among successive instruction executions across all
1148 // wavefronts of the same CU
1150 // number of individual vector operations executed
1152 // number of individual f16 vector operations executed
1154 // number of individual f32 vector operations executed
1156 // number of individual f64 vector operations executed
1158 // number of individual FMA 16,32,64 vector operations executed
1162 // number of individual MAC 16,32,64 vector operations executed
1166 // number of individual MAD 16,32,64 vector operations executed
1170 // number of individual MFMA 16,32,64 vector operations executed
1176 // total number of two op FP vector operations executed
1178 // Total cycles that something is running on the GPU
1180 statistics::Formula vpc; // vector ops per cycle
1181 statistics::Formula vpc_f16; // vector ops per cycle
1182 statistics::Formula vpc_f32; // vector ops per cycle
1183 statistics::Formula vpc_f64; // vector ops per cycle
1184 statistics::Formula ipc; // vector instructions per cycle
1188 // number of vector ALU instructions received
1190 // number of times a WG cannot start due to lack of free VGPRs in SIMDs
1192 // number of times a WG cannot start due to lack of free SGPRs in SIMDs
1198
1199 // distribution in latency difference between first and last cache block
1200 // arrival ticks
1202
1203 // Track the amount of interleaving between wavefronts on each SIMD.
1204 // This stat is sampled using instExecPerSimd to compute the number
1205 // of instructions that have been executed on a SIMD between a WF
1206 // executing two successive instructions.
1209};
1210
1211} // namespace gem5
1212
1213#endif // __COMPUTE_UNIT_HH__
Defines global host-dependent types: Counter, Tick, and (indirectly) {int,uint}{8,...
ClockedObject(const ClockedObjectParams &p)
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
std::deque< PacketPtr > retries
here we queue all the translation requests that were not successfully sent.
DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID id)
virtual void recvRangeChange()
Called to receive an address range change from the peer response port.
virtual void recvFunctional(PacketPtr pkt)
virtual Tick recvAtomic(PacketPtr pkt)
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
SystemHubEvent(PacketPtr pkt, DataPort *_dataPort)
virtual void recvRangeChange()
Called to receive an address range change from the peer response port.
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
void processMemReqEvent(PacketPtr pkt)
EventFunctionWrapper * createMemReqEvent(PacketPtr pkt)
virtual void getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
EventFunctionWrapper * createMemRespEvent(PacketPtr pkt)
virtual void recvFunctional(PacketPtr pkt)
virtual Tick recvAtomic(PacketPtr pkt)
std::deque< std::pair< PacketPtr, GPUDynInstPtr > > retries
void processMemRespEvent(PacketPtr pkt)
bool handleResponse(PacketPtr pkt)
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
DataPort(const std::string &_name, ComputeUnit *_cu, PortID id)
GMTokenPort(const std::string &name, SimObject *owner, PortID id=InvalidPortID)
bool recvTimingResp(PacketPtr)
Receive a timing response from the peer.
void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
std::deque< PacketPtr > retries
here we queue all the translation requests that were not successfully sent.
virtual void recvFunctional(PacketPtr pkt)
virtual Tick recvAtomic(PacketPtr pkt)
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
virtual void recvRangeChange()
Called to receive an address range change from the peer response port.
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
ITLBPort(const std::string &_name, ComputeUnit *_cu)
SenderState(GPUDynInstPtr gpuDynInst)
the port intended to communicate between the CU and its LDS
bool stalled
whether or not it is stalled
virtual bool recvTimingResp(PacketPtr pkt)
get the result of packets sent to the LDS when they return
virtual bool sendTimingReq(PacketPtr pkt)
attempt to send this packet, either the port is already stalled, the request is nack'd and must stall...
virtual void recvReqRetry()
the bus is telling the port that there is now space so retrying stalled requests should work now this...
LDSPort(const std::string &_name, ComputeUnit *_cu)
virtual void recvRangeChange()
Called to receive an address range change from the peer response port.
virtual Tick recvAtomic(PacketPtr pkt)
virtual void recvFunctional(PacketPtr pkt)
std::queue< PacketPtr > retries
here we queue all the requests that were not successfully sent.
const char * description() const
Return a C string describing the event.
MemReqEvent(SQCPort &_sqc_port, PacketPtr _pkt)
std::deque< std::pair< PacketPtr, Wavefront * > > retries
virtual void recvRangeChange()
Called to receive an address range change from the peer response port.
SQCPort(const std::string &_name, ComputeUnit *_cu)
virtual bool recvTimingResp(PacketPtr pkt)
Receive a timing response from the peer.
virtual void recvFunctional(PacketPtr pkt)
virtual void recvReqRetry()
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
virtual void getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
virtual Tick recvAtomic(PacketPtr pkt)
void recvReqRetry() override
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
std::deque< PacketPtr > retries
bool recvTimingResp(PacketPtr pkt) override
Receive a timing response from the peer.
ScalarDTLBPort(const std::string &_name, ComputeUnit *_cu)
const char * description() const
Return a C string describing the event.
MemReqEvent(ScalarDataPort &_scalar_data_port, PacketPtr _pkt)
SystemHubEvent(PacketPtr pkt, ScalarDataPort *_dataPort)
bool recvTimingResp(PacketPtr pkt) override
Receive a timing response from the peer.
ScalarDataPort(const std::string &_name, ComputeUnit *_cu)
void recvReqRetry() override
Called by the peer if sendTimingReq was called on this peer (causing recvTimingReq to be called on th...
std::deque< PacketPtr > retries
int oprNetPipeLength() const
int simdUnitWidth() const
void releaseBarrier(int bar_id)
int mapWaveToScalarAlu(Wavefront *w) const
ComputeUnit(const Params &p)
bool processTimingPacket(PacketPtr pkt)
WFBarrier & barrierSlot(int bar_id)
void updatePageDivergenceDist(Addr addr)
std::vector< WaitClass > scalarALUs
RequestorID vramRequestorId()
Forward the VRAM requestor ID needed for device memory from shader.
WaitClass scalarMemUnit
std::vector< uint64_t > instExecPerSimd
virtual void init() override
init() is called after all C++ SimObjects have been created and all ports are connected.
const int _cacheLineSize
EXEC_POLICY exec_policy
std::unordered_set< uint64_t > pipeMap
void updateInstStats(GPUDynInstPtr gpuDynInst)
WaitClass vectorGlobalMemUnit
void doInvalidate(RequestPtr req, int kernId)
trigger invalidate operation in the CU
bool isDone() const
std::vector< int > numWfsToSched
Number of WFs to schedule to each SIMD.
LocalMemPipeline localMemoryPipe
int mapWaveToGlobalMem(Wavefront *w) const
RequestorID _requestorId
void handleMemPacket(PacketPtr pkt, int memport_index)
int mapWaveToLocalMem(Wavefront *w) const
std::vector< RegisterFileCache * > rfc
WaitClass scalarMemToSrfBus
ScalarDTLBPort scalarDTLBPort
void doSmReturn(GPUDynInstPtr gpuDynInst)
void releaseWFsFromBarrier(int bar_id)
int numYetToReachBarrier(int bar_id)
WaitClass vrfToLocalMemPipeBus
int32_t getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
std::unordered_map< Addr, std::pair< int, int > > pageDataStruct
void doSQCInvalidate(RequestPtr req, int kernId)
trigger SQCinvalidate operation in the CU
int getCacheLineBits() const
std::vector< WFBarrier > wfBarrierSlots
The barrier slots for this CU.
void resetBarrier(int bar_id)
WaitClass locMemToVrfBus
std::vector< std::vector< Addr > > lastVaddrSimd
ComputeUnitParams Params
TokenManager * getTokenManager()
std::unordered_set< int > freeBarrierIds
A set used to easily retrieve a free barrier ID.
pageDataStruct pageAccesses
int cacheLineSize() const
WaitClass srfToScalarMemPipeBus
int lastMemUnit() const
ScalarMemPipeline scalarMemoryPipe
bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg)
void sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
int numExeUnits() const
void sendInvL2(Addr paddr)
LdsState & getLds() const
WaitClass glbMemToVrfBus
LDSPort ldsPort
The port to access the Local Data Store Can be connected to a LDS object.
std::vector< uint64_t > lastExecCycle
GlobalMemPipeline globalMemoryPipe
std::map< Addr, int > pagesTouched
bool sendToLds(GPUDynInstPtr gpuDynInst)
send a general request to the LDS make sure to look at the return value here as your request might be...
int maxBarrierCnt(int bar_id)
void insertInPipeMap(Wavefront *w)
int numAtBarrier(int bar_id)
ScoreboardCheckToSchedule scoreboardCheckToSchedule
TODO: Update these comments once the pipe stage interface has been fully refactored.
void incNumAtBarrier(int bar_id)
void injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelMemSync, RequestPtr req=nullptr)
std::vector< int > vectorRegsReserved
std::vector< Tick > matrix_core_ready
Port & getPort(const std::string &if_name, PortID idx) override
Get a port with a given name and index.
std::vector< ScalarRegisterFile * > srf
std::vector< std::vector< std::vector< Addr > > > lastVaddrWF
std::map< GfxVersion, std::map< std::string, int > > mfma_cycles
int firstMemUnit() const
ScoreboardCheckStage scoreboardCheckStage
GMTokenPort gmTokenPort
std::vector< WaitClass > vectorALUs
int mapWaveToScalarMem(Wavefront *w) const
RegisterManager * registerManager
void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, HSAQueueEntry *task, int bar_id, bool fetchContext=false)
EventFunctionWrapper tickEvent
TokenManager * memPortTokens
ScalarDataPort scalarDataPort
void fillKernelState(Wavefront *w, HSAQueueEntry *task)
void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg)
WaitClass vectorSharedMemUnit
int dpBypassLength() const
int loadBusLength() const
enums::PrefetchType prefetchType
void processFetchReturn(PacketPtr pkt)
int numBarrierSlots() const
int scalarPipeLength() const
std::vector< int > scalarRegsReserved
std::vector< DTLBPort > tlbPort
std::vector< std::vector< Wavefront * > > wfList
int mapWaveToScalarAluGlobalIdx(Wavefront *w) const
ScheduleToExecute scheduleToExecute
std::vector< VectorRegisterFile * > vrf
void decMaxBarrierCnt(int bar_id)
InstSeqNum globalSeqNum
int rfcLength() const
std::unordered_map< GPUDynInstPtr, Tick > headTailMap
std::vector< Addr > lastVaddrCU
const int _numBarrierSlots
FetchStage fetchStage
WaitClass vrfToGlobalMemPipeBus
ScheduleStage scheduleStage
int storeBusLength() const
void initiateFetch(Wavefront *wavefront)
bool allAtBarrier(int bar_id)
bool isVectorAluIdle(uint32_t simdId) const
int spBypassLength() const
InstSeqNum getAndIncSeqNum()
void doFlush(GPUDynInstPtr gpuDynInst)
trigger flush operation in the cu
RequestorID requestorId()
std::vector< DataPort > memPort
The memory port for SIMD data accesses.
void deleteFromPipeMap(Wavefront *w)
void handleSQCReturn(PacketPtr pkt)
void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt)
void fetch(PacketPtr pkt, Wavefront *wavefront)
gem5::ComputeUnit::ComputeUnitStats stats
Cycles is a wrapper class for representing cycle counts, i.e.
Definition types.hh:79
static const FlagsType AutoDelete
Definition eventq.hh:110
void setFlags(Flags _flags)
Definition eventq.hh:331
Event(Priority p=Default_Pri, Flags f=0)
Definition eventq.hh:407
this represents a slice of the overall LDS, intended to be associated with an individual workgroup
Definition lds_state.hh:58
const std::string _name
Definition named.hh:54
Ports are used to interface objects to each other.
Definition port.hh:62
const PortID id
A numeric identifier to distinguish ports in a vector, and set to InvalidPortID in case this port is ...
Definition port.hh:79
const std::string name() const
Return port name (for DPRINTF).
Definition port.hh:111
SimObject & owner
Definition port.hh:143
RequestPort(const std::string &name, SimObject *_owner, PortID id=InvalidPortID)
Request port.
Definition port.cc:125
Communication interface between Schedule and Execute stages.
Definition comm.hh:99
Communication interface between ScoreboardCheck and Schedule stages.
Definition comm.hh:63
Abstract superclass for simulation objects.
TokenRequestPort(const std::string &name, SimObject *owner, PortID id=InvalidPortID)
Definition token_port.hh:51
WF barrier slots.
static const int InvalidID
int numAtBarrier() const
void decMaxBarrierCnt()
Decrement the number of WFs that are participating in this barrier.
int numYetToReachBarrier() const
Number of WFs that have not yet reached the barrier.
void setMaxBarrierCnt(int max_barrier_cnt)
Set the maximum barrier count (i.e., the number of WFs that are participating in the barrier).
void release()
Release this barrier resource so it can be used by other WGs.
void reset()
Reset the barrier.
void incNumAtBarrier()
Mark that a WF has reached the barrier.
int _maxBarrierCnt
The maximum number of WFs that can reach this barrier.
int _numAtBarrier
The number of WFs in the WG that have reached the barrier.
bool allAtBarrier() const
Have all WFs participating in this barrier reached the barrier?
int maxBarrierCnt() const
A simple distribution stat.
A formula for statistics that is calculated when printed.
Statistics container.
Definition group.hh:93
This is a simple scalar statistic, like a counter.
A vector of distributions.
A vector of scalar stats.
STL deque class.
Definition stl.hh:44
STL vector class.
Definition stl.hh:37
ClockedObject declaration and implementation.
std::list< AddrRange > AddrRangeList
Convenience typedef for a collection of address ranges.
Definition addr_range.hh:64
virtual Port & getPort(const std::string &if_name, PortID idx=InvalidPortID)
Get a port with a given name and index.
HSAQueuEntry is the simulator's internal representation of an AQL queue entry (task).
Port Object Declaration.
Bitfield< 30, 0 > index
Bitfield< 0 > p
Bitfield< 0 > w
Bitfield< 3 > addr
Definition types.hh:84
Copyright (c) 2024 Arm Limited All rights reserved.
Definition binary32.hh:36
std::shared_ptr< Request > RequestPtr
Definition request.hh:94
const PortID InvalidPortID
Definition types.hh:246
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition misc.hh:49
@ TLB_MISS_CACHE_MISS
@ TLB_MISS_CACHE_HIT
@ TLB_HIT_CACHE_HIT
@ TLB_HIT_CACHE_MISS
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition types.hh:147
int16_t PortID
Port index/ID type, and a symbolic name for an invalid port id.
Definition types.hh:245
uint64_t Tick
Tick count type.
Definition types.hh:58
uint16_t RequestorID
Definition request.hh:95
Packet * PacketPtr
uint64_t InstSeqNum
Definition inst_seq.hh:40
Declaration of Statistics objects.
statistics::Distribution activeLanesPerLMemInstrDist
statistics::Formula vectorMemWritesPerKiloInst
ComputeUnitStats(statistics::Group *parent, int n_wf)
statistics::VectorDistribution instInterleave
statistics::Scalar wgBlockedDueBarrierAllocation
statistics::Scalar wgBlockedDueLdsAllocation
statistics::Scalar numVecOpsExecutedMFMAF16
statistics::Scalar numVecOpsExecutedTwoOpFP
statistics::Distribution waveLevelParallelism
statistics::Formula scalarMemInstsPerKiloInst
statistics::Distribution controlFlowDivergenceDist
statistics::Scalar numTimesWgBlockedDueSgprAlloc
statistics::Distribution ldsBankConflictDist
statistics::Formula scalarMemWritesPerKiloInst
statistics::Formula vectorMemReadsPerKiloInst
statistics::Scalar numVecOpsExecutedMFMAF64
statistics::Formula scalarMemReadsPerKiloInst
statistics::Scalar numTimesWgBlockedDueVgprAlloc
statistics::Scalar numVecOpsExecutedMFMAF32
statistics::Distribution pageDivergenceDist
statistics::Distribution activeLanesPerGMemInstrDist
statistics::Distribution headTailLatency
statistics::Distribution execRateDist
statistics::Formula vectorMemInstsPerKiloInst
SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index, Packet::SenderState *sender_state=nullptr)
SenderState(ComputeUnit *cu, PortID _port_index, Packet::SenderState *sender_state=nullptr)
SenderState(Wavefront *_wavefront, bool _isKernDispatch, Packet::SenderState *sender_state=nullptr, int _kernId=-1)
SenderState(Wavefront *_wavefront, Packet::SenderState *sender_state=nullptr, int _kernId=-1)
SenderState(GPUDynInstPtr gpuDynInst, Packet::SenderState *sender_state=nullptr)
A virtual base opaque structure used to hold state associated with the packet (e.g....
Definition packet.hh:469

Generated on Mon May 26 2025 09:19:10 for gem5 by doxygen 1.13.2