gem5 v24.1.0.1
compute_unit.hh
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#ifndef __COMPUTE_UNIT_HH__
33#define __COMPUTE_UNIT_HH__
34
35#include <deque>
36#include <map>
37#include <unordered_set>
38#include <vector>
39
40#include "base/callback.hh"
41#include "base/compiler.hh"
42#include "base/statistics.hh"
43#include "base/stats/group.hh"
44#include "base/types.hh"
45#include "config/the_gpu_isa.hh"
46#include "enums/PrefetchType.hh"
47#include "gpu-compute/comm.hh"
57#include "mem/port.hh"
58#include "mem/token_port.hh"
59#include "sim/clocked_object.hh"
60
61namespace gem5
62{
63
64class HSAQueueEntry;
65class LdsChunk;
66class ScalarRegisterFile;
67class Shader;
68class VectorRegisterFile;
69class RegisterFileCache;
70
71struct ComputeUnitParams;
72
73enum EXEC_POLICY
74{
75 OLDEST = 0,
76 RR
77};
78
79/**
80 * WF barrier slots. This represents the barrier resource for
81 * WF-level barriers (i.e., barriers to sync WFs within a WG).
82 */
86
91class WFBarrier
92{
93 public:
94 WFBarrier() : _numAtBarrier(0), _maxBarrierCnt(0)
95 {
96 }
97
98 static const int InvalidID = -1;
99
100 int
101 numAtBarrier() const
102 {
103 return _numAtBarrier;
104 }
105
106 /**
107 * Number of WFs that have not yet reached the barrier.
108 */
109 int
110 numYetToReachBarrier() const
111 {
112 return _maxBarrierCnt - _numAtBarrier;
113 }
114
115 int
116 maxBarrierCnt() const
117 {
118 return _maxBarrierCnt;
119 }
120
121 /**
122 * Set the maximum barrier count (i.e., the number of WFs that are
123 * participating in the barrier).
124 */
125 void
126 setMaxBarrierCnt(int max_barrier_cnt)
127 {
128 _maxBarrierCnt = max_barrier_cnt;
129 }
130
131 /**
132 * Mark that a WF has reached the barrier.
133 */
134 void
135 incNumAtBarrier()
136 {
137 assert(_numAtBarrier < _maxBarrierCnt);
138 ++_numAtBarrier;
139 }
140
141 /**
142 * Have all WFs participating in this barrier reached the barrier?
143 * If so, then the barrier is satisfied and WFs may proceed past
144 * the barrier.
145 */
146 bool
147 allAtBarrier() const
148 {
149 return _numAtBarrier == _maxBarrierCnt;
150 }
151
152 /**
153 * Decrement the number of WFs that are participating in this
154 * barrier. This can occur if a WF exits early.
155 */
156 void
157 decMaxBarrierCnt()
158 {
159 assert(_maxBarrierCnt > 0);
160 --_maxBarrierCnt;
161 }
162
163 /**
164 * Release this barrier resource so it can be used by other WGs.
165 * This is generally called when a WG has finished.
166 */
167 void
168 release()
169 {
170 _numAtBarrier = 0;
171 _maxBarrierCnt = 0;
172 }
173
174 /**
175 * Reset the barrier. This is used when a dynamic instance of a
176 * barrier has been satisfied.
177 */
178 void
179 reset()
180 {
181 _numAtBarrier = 0;
182 }
183
184 private:
185 /**
186 * The number of WFs in the WG that have reached the barrier.
187 */
190 int _numAtBarrier;
191
192 /**
193 * The maximum number of WFs that can reach this barrier.
194 */
199 int _maxBarrierCnt;
200};
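// Illustrative sketch (not part of the original header): the intended use of
// a WFBarrier slot, assuming one slot is assigned to a WG at dispatch.
//
//   WFBarrier &bar = ...;                 // slot owned by the WG
//   bar.setMaxBarrierCnt(wfs_in_wg);      // at WG dispatch
//   bar.incNumAtBarrier();                // each WF that executes a barrier
//   if (bar.allAtBarrier())
//       bar.reset();                      // barrier satisfied; reuse the slot
//   bar.release();                        // when the WG completes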
201
202class ComputeUnit : public ClockedObject
203{
204 public:
205
206
207 // Execution resources
208 //
209 // The ordering of units is:
210 // Vector ALUs
211 // Scalar ALUs
212 // GM Pipe
213 // LM Pipe
214 // Scalar Mem Pipe
215 //
216 // Note: the ordering of units is important and the code assumes the
217 // above ordering. However, there may be more than one resource of
218 // each type (e.g., 4 VALUs or 2 SALUs)
219
221 // Resource control for global memory to VRF data/address bus
222 WaitClass glbMemToVrfBus;
223 // Resource control for Vector Register File->Global Memory pipe buses
224 WaitClass vrfToGlobalMemPipeBus;
225 // Resource control for Vector Global Memory execution unit
226 WaitClass vectorGlobalMemUnit;
227
229 // Resource control for local memory to VRF data/address bus
230 WaitClass locMemToVrfBus;
231 // Resource control for Vector Register File->Local Memory pipe buses
232 WaitClass vrfToLocalMemPipeBus;
233 // Resource control for Vector Shared/Local Memory execution unit
234 WaitClass vectorSharedMemUnit;
235
237 // Resource control for scalar memory to SRF data/address bus
238 WaitClass scalarMemToSrfBus;
239 // Resource control for Scalar Register File->Scalar Memory pipe buses
240 WaitClass srfToScalarMemPipeBus;
241 // Resource control for Scalar Memory execution unit
242 WaitClass scalarMemUnit;
243
244 // vector ALU execution resources
245 int numVectorALUs;
246 std::vector<WaitClass> vectorALUs;
247
248 // scalar ALU execution resources
249 int numScalarALUs;
250 std::vector<WaitClass> scalarALUs;
251
252 // Return total number of execution units on this CU
253 int numExeUnits() const;
254 // index into readyList of the first memory unit
255 int firstMemUnit() const;
256 // index into readyList of the last memory unit
257 int lastMemUnit() const;
258 // index into scalarALUs vector of SALU used by the wavefront
259 int mapWaveToScalarAlu(Wavefront *w) const;
260 // index into readyList of SALU used by wavefront
261 int mapWaveToScalarAluGlobalIdx(Wavefront *w) const;
262 // index into readyList of Global Memory unit used by wavefront
263 int mapWaveToGlobalMem(Wavefront *w) const;
264 // index into readyList of Local Memory unit used by wavefront
265 int mapWaveToLocalMem(Wavefront *w) const;
266 // index into readyList of Scalar Memory unit used by wavefront
267 int mapWaveToScalarMem(Wavefront *w) const;
268
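// Illustrative note (assumption, not in the original header): given the unit
// ordering documented above, the flat execution-unit index space is
//   [0, numVectorALUs)                      vector ALUs
//   [numVectorALUs, firstMemUnit())         scalar ALUs
//   [firstMemUnit(), lastMemUnit()]         GM, LM and scalar-memory pipes
// and the mapWaveTo*() helpers return indices into this single space.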
269 int vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
270 int coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
271 int numCyclesPerStoreTransfer; // number of cycles per vector store
272 int numCyclesPerLoadTransfer; // number of cycles per vector load
273
274 // track presence of dynamic instructions in the Schedule pipeline
275 // stage. This is used to check the readiness of the oldest,
276 // non-dispatched instruction of every WF in the Scoreboard stage.
277 std::unordered_set<uint64_t> pipeMap;
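// (Assumed behavior, noted for orientation only:) the set holds dynamic
// instruction sequence numbers; insertInPipeMap()/deleteFromPipeMap() below
// add and remove a wavefront's instruction as it enters and leaves the
// schedule/execute pipeline, so the Scoreboard stage can test for its presence.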
278
279 RegisterManager *registerManager;
280
281 FetchStage fetchStage;
282 ScoreboardCheckStage scoreboardCheckStage;
283 ScheduleStage scheduleStage;
284 ExecStage execStage;
285 GlobalMemPipeline globalMemoryPipe;
286 LocalMemPipeline localMemoryPipe;
287 ScalarMemPipeline scalarMemoryPipe;
288
289 EventFunctionWrapper tickEvent;
290
291 typedef ComputeUnitParams Params;
292 std::vector<std::vector<Wavefront*>> wfList;
293 int cu_id;
294
295 // array of vector register files, one per SIMD
296 std::vector<VectorRegisterFile*> vrf;
297 // array of scalar register files, one per SIMD
298 std::vector<ScalarRegisterFile*> srf;
299
300 std::vector<RegisterFileCache*> rfc;
301
302 // Width per VALU/SIMD unit: number of work items that can be executed
303 // on the vector ALU simultaneously in a SIMD unit
304 int simdWidth;
305 // number of pipe stages for bypassing data to next dependent single
306 // precision vector instruction inside the vector ALU pipeline
307 int spBypassPipeLength;
308 // number of pipe stages for bypassing data to next dependent double
309 // precision vector instruction inside the vector ALU pipeline
310 int dpBypassPipeLength;
311 // number of pipe stages for register file cache
312 int rfcPipeLength;
313 // number of pipe stages for scalar ALU
314 int scalarPipeStages;
315 // number of pipe stages for operand collection & distribution network
317 // number of cycles per instruction issue period
319
320 // VRF to GM Bus latency
322 // SRF to Scalar Mem Bus latency
324 // VRF to LM Bus latency
326
327 // tracks the last cycle a vector instruction was executed on a SIMD
328 std::vector<uint64_t> lastExecCycle;
329
330 // tracks the number of dyn inst executed per SIMD
331 std::vector<uint64_t> instExecPerSimd;
332
333 // true if we allow a separate TLB per lane
335 // if 0, TLB prefetching is off.
337 // if fixed-stride prefetching, this is the stride.
339
340 std::vector<Addr> lastVaddrCU;
341 std::vector<std::vector<Addr>> lastVaddrSimd;
342 std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
343 enums::PrefetchType prefetchType;
344 EXEC_POLICY exec_policy;
345
347 // Idle CU timeout in ticks
352
353 /*
354 * for Counting page accesses
355 */
357
359
364
366
374
375 // number of currently reserved vector registers per SIMD unit
376 std::vector<int> vectorRegsReserved;
377 // number of currently reserved scalar registers per SIMD unit
378 std::vector<int> scalarRegsReserved;
379 // number of vector registers per SIMD unit
380 int numVecRegsPerSimd;
381 // number of available scalar registers per SIMD unit
382 int numScalarRegsPerSimd;
383
384 // this hash map will keep track of page divergence
385 // per memory instruction per wavefront. The hash map
386 // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
387 std::map<Addr, int> pagesTouched;
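// Worked example (illustrative): a single 64-lane vector memory instruction
// whose lanes fall on three distinct pages leaves three entries in
// pagesTouched whose counts sum to 64; the map is then consumed (and cleared)
// when the page-divergence statistics are updated in GPUDynInst::updateStats().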
388
391
392 ComputeUnit(const Params &p);
393 ~ComputeUnit();
394
395 // Timing Functions
397 int simdUnitWidth() const { return simdWidth; }
398 int spBypassLength() const { return spBypassPipeLength; }
399 int dpBypassLength() const { return dpBypassPipeLength; }
400 int rfcLength() const { return rfcPipeLength; }
401 int scalarPipeLength() const { return scalarPipeStages; }
404 int wfSize() const { return wavefrontSize; }
405
406 void exec();
407 void initiateFetch(Wavefront *wavefront);
408 void fetch(PacketPtr pkt, Wavefront *wavefront);
409 void fillKernelState(Wavefront *w, HSAQueueEntry *task);
410
411 void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
412 HSAQueueEntry *task, int bar_id,
413 bool fetchContext=false);
414
415 void doInvalidate(RequestPtr req, int kernId);
416 void doFlush(GPUDynInstPtr gpuDynInst);
417 void doSQCInvalidate(RequestPtr req, int kernId);
418
419 void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg);
420 bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg);
421
422 int cacheLineSize() const { return _cacheLineSize; }
423 int getCacheLineBits() const { return cacheLineBits; }
424
425 void resetRegisterPool();
426
427 private:
428 WFBarrier&
429 barrierSlot(int bar_id)
430 {
431 assert(bar_id > WFBarrier::InvalidID);
432 return wfBarrierSlots.at(bar_id);
433 }
434
435 int
436 getFreeBarrierId()
437 {
438 assert(freeBarrierIds.size());
439 auto free_bar_id = freeBarrierIds.begin();
440 int bar_id = *free_bar_id;
441 freeBarrierIds.erase(free_bar_id);
442 return bar_id;
443 }
444
445 public:
446 int numYetToReachBarrier(int bar_id);
447 bool allAtBarrier(int bar_id);
448 void incNumAtBarrier(int bar_id);
449 int numAtBarrier(int bar_id);
450 int maxBarrierCnt(int bar_id);
451 void resetBarrier(int bar_id);
452 void decMaxBarrierCnt(int bar_id);
453 void releaseBarrier(int bar_id);
454 void releaseWFsFromBarrier(int bar_id);
455 int numBarrierSlots() const { return _numBarrierSlots; }
456
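// Overview note (added for clarity; phrasing is ours): bar_id values index
// wfBarrierSlots. getFreeBarrierId() above hands an unused id to a WG at
// dispatch, the per-id methods here proxy to that WFBarrier slot, and
// releaseBarrier(bar_id) returns the id to freeBarrierIds when the WG is done.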
457 template<typename c0, typename c1>
458 void doSmReturn(GPUDynInstPtr gpuDynInst);
459
460 virtual void init() override;
461 void sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt);
462 void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt);
463 void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
464 bool kernelMemSync,
465 RequestPtr req=nullptr);
466 void handleMemPacket(PacketPtr pkt, int memport_index);
470
473
474 bool isDone() const;
475 bool isVectorAluIdle(uint32_t simdId) const;
476
477 void handleSQCReturn(PacketPtr pkt);
478
479 void sendInvL2(Addr paddr);
480
481 protected:
483
485
486 public:
487 LdsState &
488 getLds() const
489 {
490 return lds;
491 }
492
493 int32_t
494 getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
495
496 [[nodiscard]] bool sendToLds(GPUDynInstPtr gpuDynInst);
497
498 typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
500
501 void exitCallback();
502
503 class GMTokenPort : public TokenRequestPort
504 {
505 public:
506 GMTokenPort(const std::string& name, SimObject *owner,
507 PortID id = InvalidPortID)
508 : TokenRequestPort(name, owner, id)
509 { }
511
512 protected:
513 bool recvTimingResp(PacketPtr) { return false; }
514 void recvReqRetry() { }
515 };
516
517 // Manager for the number of tokens available to this compute unit to
518 // send global memory request packets to the coalescer this is only used
519 // between global memory pipe and TCP coalescer.
520 TokenManager *memPortTokens;
521 GMTokenPort gmTokenPort;
522
524 class DataPort : public RequestPort
525 {
526 public:
527 DataPort(const std::string &_name, ComputeUnit *_cu, PortID id)
528 : RequestPort(_name, id), computeUnit(_cu) { }
529
531
532 struct SenderState : public Packet::SenderState
533 {
534 GPUDynInstPtr _gpuDynInst;
535 PortID port_index;
536 Packet::SenderState *saved;
537 ComputeUnit *computeUnit;
538
539 SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
540 Packet::SenderState *sender_state=nullptr)
541 : _gpuDynInst(gpuDynInst),
542 port_index(_port_index),
543 saved(sender_state) { }
544
546 Packet::SenderState *sender_state=nullptr)
547 : computeUnit(cu),
548 port_index(_port_index),
549 saved(sender_state) { }
550 };
551
552 class SystemHubEvent : public Event
553 {
556
557 public:
559 : dataPort(_dataPort), reqPkt(pkt)
560 {
562 }
563
564 void
566 {
567 // DMAs do not operate on packets and therefore do not
568 // convert to a response. Do that here instead.
571 }
572 };
573
574 void processMemReqEvent(PacketPtr pkt);
575 EventFunctionWrapper *createMemReqEvent(PacketPtr pkt);
576
577 void processMemRespEvent(PacketPtr pkt);
578 EventFunctionWrapper *createMemRespEvent(PacketPtr pkt);
579
580 std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;
581
582 bool handleResponse(PacketPtr pkt);
583
584 protected:
585 ComputeUnit *computeUnit;
586
587 virtual bool recvTimingResp(PacketPtr pkt);
588 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
589 virtual void recvFunctional(PacketPtr pkt) { }
590 virtual void recvRangeChange() { }
591 virtual void recvReqRetry();
592
593 virtual void
595 {
596 resp.clear();
597 snoop = true;
598 }
599
600 };
601
602 // Scalar data cache access port
604 {
605 public:
606 ScalarDataPort(const std::string &_name, ComputeUnit *_cu)
608 {
609 }
610
611 bool recvTimingResp(PacketPtr pkt) override;
612 void recvReqRetry() override;
613
615 {
617 Packet::SenderState *sender_state=nullptr)
618 : _gpuDynInst(gpuDynInst), saved(sender_state)
619 {
620 }
621
624 };
625
626 class MemReqEvent : public Event
627 {
628 private:
631
632 public:
633 MemReqEvent(ScalarDataPort &_scalar_data_port, PacketPtr _pkt)
634 : Event(), scalarDataPort(_scalar_data_port), pkt(_pkt)
635 {
637 }
638
639 void process();
640 const char *description() const;
641 };
642
643 class SystemHubEvent : public Event
644 {
647
648 public:
650 : dataPort(_dataPort), reqPkt(pkt)
651 {
653 }
654
655 void
657 {
658 // DMAs do not operate on packets and therefore do not
659 // convert to a response. Do that here instead.
662 }
663 };
664
665 bool handleResponse(PacketPtr pkt);
666
668
669 private:
671 };
672
673 // Instruction cache access port
674 class SQCPort : public RequestPort
675 {
676 public:
677 SQCPort(const std::string &_name, ComputeUnit *_cu)
678 : RequestPort(_name), computeUnit(_cu) { }
679
681
683 {
686 // kernel id to be used in handling I-Cache invalidate response
690 *sender_state=nullptr, int _kernId=-1)
691 : wavefront(_wavefront), saved(sender_state),
692 kernId(_kernId), isKernDispatch(false){ }
693
694 SenderState(Wavefront *_wavefront, bool _isKernDispatch,
695 Packet::SenderState *sender_state=nullptr, int _kernId=-1)
696 : wavefront(_wavefront), saved(sender_state),
697 kernId(_kernId), isKernDispatch(_isKernDispatch){ }
698
699 };
700
701 class MemReqEvent : public Event
702 {
703 private:
706
707 public:
708 MemReqEvent(SQCPort &_sqc_port, PacketPtr _pkt)
709 : Event(), sqcPort(_sqc_port), pkt(_pkt)
710 {
712 }
713
714 void process();
715 const char *description() const;
716 };
717
719
720 protected:
722
723 virtual bool recvTimingResp(PacketPtr pkt);
724 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
725 virtual void recvFunctional(PacketPtr pkt) { }
726 virtual void recvRangeChange() { }
727 virtual void recvReqRetry();
728
729 virtual void
731 {
732 resp.clear();
733 snoop = true;
734 }
735 };
736
738 class DTLBPort : public RequestPort
739 {
740 public:
741 DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID id)
743 stalled(false)
744 { }
745
746 bool isStalled() { return stalled; }
747 void stallPort() { stalled = true; }
748 void unstallPort() { stalled = false; }
749
755
760 {
761 // the memInst that this is associated with
763
764 // the lane in the memInst this is associated with, so we send
765 // the memory request down the right port
767
768 // constructor used for packets involved in timing accesses
769 SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
770 : _gpuDynInst(gpuDynInst), portIndex(port_index) { }
771
772 };
773
774 protected:
777
778 virtual bool recvTimingResp(PacketPtr pkt);
779 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
780 virtual void recvFunctional(PacketPtr pkt) { }
781 virtual void recvRangeChange() { }
782 virtual void recvReqRetry();
783 };
784
786 {
787 public:
788 ScalarDTLBPort(const std::string &_name, ComputeUnit *_cu)
789 : RequestPort(_name), computeUnit(_cu), stalled(false)
790 {
791 }
792
794 {
795 SenderState(GPUDynInstPtr gpuDynInst) : _gpuDynInst(gpuDynInst) { }
797 };
798
799 bool recvTimingResp(PacketPtr pkt) override;
800 void recvReqRetry() override { assert(false); }
801
802 bool isStalled() const { return stalled; }
803 void stallPort() { stalled = true; }
804 void unstallPort() { stalled = false; }
805
807
808 private:
811 };
812
813 class ITLBPort : public RequestPort
814 {
815 public:
816 ITLBPort(const std::string &_name, ComputeUnit *_cu)
817 : RequestPort(_name), computeUnit(_cu), stalled(false) { }
818
819
820 bool isStalled() { return stalled; }
821 void stallPort() { stalled = true; }
822 void unstallPort() { stalled = false; }
823
829
834 {
835 // The wavefront associated with this request
837
838 SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
839 };
840
841 protected:
844
845 virtual bool recvTimingResp(PacketPtr pkt);
846 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
847 virtual void recvFunctional(PacketPtr pkt) { }
848 virtual void recvRangeChange() { }
849 virtual void recvReqRetry();
850 };
851
855 class LDSPort : public RequestPort
856 {
857 public:
858 LDSPort(const std::string &_name, ComputeUnit *_cu)
860 {
861 }
862
863 bool isStalled() const { return stalled; }
864 void stallPort() { stalled = true; }
865 void unstallPort() { stalled = false; }
866
871 std::queue<PacketPtr> retries;
872
878 {
879 protected:
880 // The actual read/write/atomic request that goes with this command
882
883 public:
885 _gpuDynInst(gpuDynInst)
886 {
887 }
888
891 {
892 return _gpuDynInst;
893 }
894 };
895
896 virtual bool
898
899 protected:
900
901 bool stalled = false;
902
904
905 virtual bool
907
908 virtual Tick
909 recvAtomic(PacketPtr pkt) { return 0; }
910
911 virtual void
913 {
914 }
915
916 virtual void
918 {
919 }
920
921 virtual void
922 recvReqRetry();
923 };
924
925 /** The port to access the Local Data Store.
926 * Can be connected to a LDS object.
927 */
928 LDSPort ldsPort;
929
930 TokenManager *
931 getTokenManager()
932 {
933 return memPortTokens;
934 }
935
936 /**
937 * The memory port for SIMD data accesses.
938 */
939 std::vector<DataPort> memPort;
940 // port to the TLB hierarchy (i.e., the L1 TLB)
941 std::vector<DTLBPort> tlbPort;
942 // port to the scalar data cache
943 ScalarDataPort scalarDataPort;
944 // port to the scalar data TLB
945 ScalarDTLBPort scalarDTLBPort;
946 // port to the SQC (i.e. the I-cache)
947 SQCPort sqcPort;
948 // port to the SQC TLB (there's a separate TLB for each I-cache)
949 ITLBPort sqcTLBPort;
950
951 Port &
952 getPort(const std::string &if_name, PortID idx) override
953 {
954 if (if_name == "memory_port" && idx < memPort.size()) {
955 return memPort[idx];
956 } else if (if_name == "translation_port" && idx < tlbPort.size()) {
957 return tlbPort[idx];
958 } else if (if_name == "scalar_port") {
959 return scalarDataPort;
960 } else if (if_name == "scalar_tlb_port") {
961 return scalarDTLBPort;
962 } else if (if_name == "sqc_port") {
963 return sqcPort;
964 } else if (if_name == "sqc_tlb_port") {
965 return sqcTLBPort;
966 } else if (if_name == "ldsPort") {
967 return ldsPort;
968 } else if (if_name == "gmTokenPort") {
969 return gmTokenPort;
970 } else {
971 return ClockedObject::getPort(if_name, idx);
972 }
973 }
974
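// Note (ours): the if_name strings above are the port names used when wiring
// the CU from the Python configuration. "memory_port" and "translation_port"
// are vector ports (hence the idx bounds checks against memPort.size() and
// tlbPort.size()); the remaining ports are single instances.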
976
977 private:
978 const int _cacheLineSize;
979 const int _numBarrierSlots;
980 int cacheLineBits;
981 InstSeqNum globalSeqNum;
982 int wavefrontSize;
983
1018
1019 /**
1020 * The barrier slots for this CU.
1021 */
1022 std::vector<WFBarrier> wfBarrierSlots;
1023 /**
1024 * A set used to easily retrieve a free barrier ID.
1025 */
1026 std::unordered_set<int> freeBarrierIds;
1027
1028 // hold the time of the arrival of the first cache block related to
1029 // a particular GPUDynInst. This is used to calculate the difference
1030 // between the first and last cache block arrival times.
1031 std::unordered_map<GPUDynInstPtr, Tick> headTailMap;
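// (Assumed flow, for orientation:) the first returning cache block of a
// GPUDynInst records its arrival Tick here; when the final block arrives the
// difference is sampled into the headTailLatency distribution and the entry
// is dropped.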
1032
1033 public:
1034 void updateInstStats(GPUDynInstPtr gpuDynInst);
1036
1037 struct ComputeUnitStats : public statistics::Group
1038 {
1039 ComputeUnitStats(statistics::Group *parent, int n_wf);
1040
1063
1070
1071 // Cycles required to send register source (addr and data) from
1072 // register files to memory pipeline, per SIMD.
1076
1098
1100
1101 // the following stats compute the avg. TLB accesslatency per
1102 // uncoalesced request (only for data)
1106 // hitsPerTLBLevel[x] are the hits in Level x TLB.
1107 // x = 0 is the page table.
1109
1112
1113 // over all memory instructions executed over all wavefronts
1114 // how many touched 0-4 pages, 4-8, ..., 60-64 pages
1116 // count of non-flat global memory vector instructions executed
1118 // count of flat global memory vector instructions executed
1121
1124 // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
1125 // active when the instruction is committed, this number is still
1126 // incremented by 1
1128 // Number of cycles among successive instruction executions across all
1129 // wavefronts of the same CU
1131 // number of individual vector operations executed
1133 // number of individual f16 vector operations executed
1135 // number of individual f32 vector operations executed
1137 // number of individual f64 vector operations executed
1139 // number of individual FMA 16,32,64 vector operations executed
1143 // number of individual MAC 16,32,64 vector operations executed
1147 // number of individual MAD 16,32,64 vector operations executed
1151 // number of individual MFMA 16,32,64 vector operations executed
1157 // total number of two op FP vector operations executed
1159 // Total cycles that something is running on the GPU
1161 statistics::Formula vpc; // vector ops per cycle
1162 statistics::Formula vpc_f16; // vector ops per cycle
1163 statistics::Formula vpc_f32; // vector ops per cycle
1164 statistics::Formula vpc_f64; // vector ops per cycle
1165 statistics::Formula ipc; // vector instructions per cycle
1169 // number of vector ALU instructions received
1171 // number of times a WG cannot start due to lack of free VGPRs in SIMDs
1173 // number of times a WG cannot start due to lack of free SGPRs in SIMDs
1179
1180 // distrubtion in latency difference between first and last cache block
1181 // arrival ticks
1183
1184 // Track the amount of interleaving between wavefronts on each SIMD.
1185 // This stat is sampled using instExecPerSimd to compute the number
1186 // of instructions that have been executed on a SIMD between a WF
1187 // executing two successive instructions.
1188 statistics::VectorDistribution instInterleave;
1189 } stats;
1190};
1191
1192} // namespace gem5
1193
1194#endif // __COMPUTE_UNIT_HH__
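
Illustrative usage sketch (not part of gem5): the snippet below is a minimal,
self-contained restatement of the counting protocol that WFBarrier and the
per-CU barrier bookkeeping above implement. The MiniBarrier type and the
numbers are ours, introduced only to show the arrive/allAtBarrier/reset cycle.

    #include <cassert>
    #include <iostream>

    // Minimal stand-in for WFBarrier's two counters (illustrative only).
    class MiniBarrier
    {
      public:
        void setMaxCnt(int n) { _max = n; }             // WFs in the WG
        void arrive() { assert(_at < _max); ++_at; }    // a WF hits the barrier
        bool allArrived() const { return _at == _max; }
        void reset() { _at = 0; }                       // reuse for next barrier
        void release() { _at = 0; _max = 0; }           // WG done; free the slot

      private:
        int _at = 0;
        int _max = 0;
    };

    int main()
    {
        MiniBarrier bar;
        bar.setMaxCnt(4);                   // a WG with four wavefronts

        for (int wf = 0; wf < 4; ++wf) {
            bar.arrive();                   // each WF reaches the barrier
            std::cout << "WF " << wf << " at barrier, all arrived: "
                      << std::boolalpha << bar.allArrived() << "\n";
        }

        if (bar.allArrived())
            bar.reset();                    // barrier satisfied; WFs proceed

        bar.release();                      // WG completes; slot returns to pool
        return 0;
    }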