compute_unit.hh
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __COMPUTE_UNIT_HH__
#define __COMPUTE_UNIT_HH__

#include <deque>
#include <map>
#include <unordered_set>
#include <vector>

#include "base/callback.hh"
#include "base/compiler.hh"
#include "base/statistics.hh"
#include "base/stats/group.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "enums/PrefetchType.hh"
#include "gpu-compute/comm.hh"
#include "mem/port.hh"
#include "mem/token_port.hh"
#include "sim/clocked_object.hh"

namespace gem5
{

class HSAQueueEntry;
class LdsChunk;
class ScalarRegisterFile;
class Shader;
class VectorRegisterFile;
class RegisterFileCache;

struct ComputeUnitParams;

enum EXEC_POLICY
{
    OLDEST = 0,
    RR
};

enum TLB_CACHE
{
    TLB_MISS_CACHE_MISS = 0,
    TLB_MISS_CACHE_HIT,
    TLB_HIT_CACHE_MISS,
    TLB_HIT_CACHE_HIT
};

/**
 * WF barrier slots.
 */
class WFBarrier
{
  public:
    WFBarrier() : _numAtBarrier(0), _maxBarrierCnt(0)
    {
    }

    static const int InvalidID = -1;

    int
    numAtBarrier() const
    {
        return _numAtBarrier;
    }

    /** Number of WFs that have not yet reached the barrier. */
    int
    numYetToReachBarrier() const
    {
        return _maxBarrierCnt - _numAtBarrier;
    }

    int
    maxBarrierCnt() const
    {
        return _maxBarrierCnt;
    }

    /**
     * Set the maximum barrier count (i.e., the number of WFs that are
     * participating in the barrier).
     */
    void
    setMaxBarrierCnt(int max_barrier_cnt)
    {
        _maxBarrierCnt = max_barrier_cnt;
    }

    /** Mark that a WF has reached the barrier. */
    void
    incNumAtBarrier()
    {
        assert(_numAtBarrier < _maxBarrierCnt);
        ++_numAtBarrier;
    }

    /**
     * Have all WFs participating in this barrier reached the barrier?
     * If so, then the barrier is satisfied and the WFs may proceed.
     */
    bool
    allAtBarrier() const
    {
        return _numAtBarrier == _maxBarrierCnt;
    }

    /**
     * Decrement the number of WFs that are participating in this barrier.
     */
    void
    decMaxBarrierCnt()
    {
        assert(_maxBarrierCnt > 0);
        --_maxBarrierCnt;
    }

    /** Release this barrier resource so it can be used by other WGs. */
    void
    release()
    {
        _numAtBarrier = 0;
        _maxBarrierCnt = 0;
    }

    /** Reset the barrier. */
    void
    reset()
    {
        _numAtBarrier = 0;
    }

  private:
    /** The number of WFs in the WG that have reached the barrier. */
    int _numAtBarrier;

    /** The maximum number of WFs that can reach this barrier. */
    int _maxBarrierCnt;
};
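
// Illustrative sketch (not part of the original header): the two counters
// above make a barrier "satisfied" once every participating WF has checked
// in. Assuming a slot configured for a 4-WF workgroup:
//
//   WFBarrier bar;
//   bar.setMaxBarrierCnt(4);                 // 4 WFs participate
//   bar.incNumAtBarrier();                   // one WF arrives
//   assert(bar.numYetToReachBarrier() == 3);
//   // ...three more arrivals make allAtBarrier() true; reset() re-arms the
//   // slot for the next barrier instruction, while release() frees the slot
//   // entirely once the WG completes.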

class ComputeUnit : public ClockedObject
{
  public:

    // Execution resources
    //
    // The ordering of units is:
    // Vector ALUs
    // Scalar ALUs
    // GM Pipe
    // LM Pipe
    // Scalar Mem Pipe
    //
    // Note: the ordering of units is important and the code assumes the
    // above ordering. However, there may be more than one resource of
    // each type (e.g., 4 VALUs or 2 SALUs)

    // Resource control for global memory to VRF data/address bus
    WaitClass glbMemToVrfBus;
    // Resource control for Vector Register File->Global Memory pipe buses
    WaitClass vrfToGlobalMemPipeBus;
    // Resource control for Vector Global Memory execution unit
    WaitClass vectorGlobalMemUnit;

    // Resource control for local memory to VRF data/address bus
    WaitClass locMemToVrfBus;
    // Resource control for Vector Register File->Local Memory pipe buses
    WaitClass vrfToLocalMemPipeBus;
    // Resource control for Vector Shared/Local Memory execution unit
    WaitClass vectorSharedMemUnit;

    // Resource control for scalar memory to SRF data/address bus
    WaitClass scalarMemToSrfBus;
    // Resource control for Scalar Register File->Scalar Memory pipe buses
    WaitClass srfToScalarMemPipeBus;
    // Resource control for Scalar Memory execution unit
    WaitClass scalarMemUnit;

    // vector ALU execution resources
    int numVectorALUs;
    std::vector<WaitClass> vectorALUs;

    // scalar ALU execution resources
    int numScalarALUs;
    std::vector<WaitClass> scalarALUs;

    // Return total number of execution units on this CU
    int numExeUnits() const;
    // index into readyList of the first memory unit
    int firstMemUnit() const;
    // index into readyList of the last memory unit
    int lastMemUnit() const;
    // index into scalarALUs vector of SALU used by the wavefront
    int mapWaveToScalarAlu(Wavefront *w) const;
    // index into readyList of SALU used by wavefront
    int mapWaveToScalarAluGlobalIdx(Wavefront *w) const;
    // index into readyList of Global Memory unit used by wavefront
    int mapWaveToGlobalMem(Wavefront *w) const;
    // index into readyList of Local Memory unit used by wavefront
    int mapWaveToLocalMem(Wavefront *w) const;
    // index into readyList of Scalar Memory unit used by wavefront
    int mapWaveToScalarMem(Wavefront *w) const;

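    // Illustrative sketch (not part of the original header): given the unit
    // ordering documented above, the execution-unit indices decompose
    // roughly as
    //
    //   [0, numVectorALUs)                            -> vector ALUs
    //   [numVectorALUs, numVectorALUs + numScalarALUs)-> scalar ALUs
    //   firstMemUnit()                                -> global memory pipe
    //   firstMemUnit() + 1                            -> local/shared memory pipe
    //   lastMemUnit()                                 -> scalar memory pipe
    //
    // so numExeUnits() is numVectorALUs + numScalarALUs + 3. The mapWaveTo*
    // helpers above return indices in this flat space.
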
    int vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
    int coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
    int numCyclesPerStoreTransfer;  // number of cycles per vector store
    int numCyclesPerLoadTransfer;  // number of cycles per vector load

    // track presence of dynamic instructions in the Schedule pipeline
    // stage. This is used to check the readiness of the oldest,
    // non-dispatched instruction of every WF in the Scoreboard stage.
    std::unordered_set<uint64_t> pipeMap;

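    // Illustrative sketch (not part of the original header): the Schedule
    // stage records a dynamic instruction's sequence number in pipeMap via
    // insertInPipeMap(w) when the instruction is dispatched, and the
    // Scoreboard stage can then test whether it is still in flight with
    // something like
    //
    //   bool inFlight = pipeMap.count(seqNum) != 0;
    //
    // deleteFromPipeMap(w) removes the entry once the instruction leaves the
    // pipeline (both helpers are declared further down in this class).
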
    RegisterManager* registerManager;

    FetchStage fetchStage;
    ScoreboardCheckStage scoreboardCheckStage;
    ScheduleStage scheduleStage;
    ExecStage execStage;
    GlobalMemPipeline globalMemoryPipe;
    LocalMemPipeline localMemoryPipe;
    ScalarMemPipeline scalarMemoryPipe;

    EventFunctionWrapper tickEvent;

    typedef ComputeUnitParams Params;
    std::vector<std::vector<Wavefront*>> wfList;
    int cu_id;

    // array of vector register files, one per SIMD
    std::vector<VectorRegisterFile*> vrf;
    // array of scalar register files, one per SIMD
    std::vector<ScalarRegisterFile*> srf;

    std::vector<RegisterFileCache*> rfc;

    // Width per VALU/SIMD unit: number of work items that can be executed
    // on the vector ALU simultaneously in a SIMD unit
    int simdWidth;
    // number of pipe stages for bypassing data to next dependent single
    // precision vector instruction inside the vector ALU pipeline
    int spBypassPipeLength;
    // number of pipe stages for bypassing data to next dependent double
    // precision vector instruction inside the vector ALU pipeline
    int dpBypassPipeLength;
    // number of pipe stages for register file cache
    int rfcPipeLength;
    // number of pipe stages for scalar ALU
    int scalarPipeStages;
    // number of pipe stages for operand collection & distribution network
    int operandNetworkLength;
    // number of cycles per instruction issue period
    Cycles issuePeriod;

    // VRF to GM Bus latency
    Cycles vrf_gm_bus_latency;
    // SRF to Scalar Mem Bus latency
    Cycles srf_scm_bus_latency;
    // VRF to LM Bus latency
    Cycles vrf_lm_bus_latency;

    // tracks the last cycle a vector instruction was executed on a SIMD
    std::vector<uint64_t> lastExecCycle;

    // tracks the number of dyn inst executed per SIMD
    std::vector<uint64_t> instExecPerSimd;

    // true if we allow a separate TLB per lane
    bool perLaneTLB;
    // if 0, TLB prefetching is off.
    int prefetchDepth;
    // if fixed-stride prefetching, this is the stride.
    int prefetchStride;

    std::vector<Addr> lastVaddrCU;
    std::vector<std::vector<Addr>> lastVaddrSimd;
    std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
    enums::PrefetchType prefetchType;
    EXEC_POLICY exec_policy;

    // Idle CU timeout in ticks
    Tick idleCUTimeout;

    /*
     * for Counting page accesses
     */
    bool countPages;

    Shader *shader;

    /** Number of WFs to schedule to each SIMD. */
    std::vector<int> numWfsToSched;

    // number of currently reserved vector registers per SIMD unit
    std::vector<int> vectorRegsReserved;
    // number of currently reserved scalar registers per SIMD unit
    std::vector<int> scalarRegsReserved;
    // number of vector registers per SIMD unit
    int numVecRegsPerSimd;
    // number of available scalar registers per SIMD unit
    int numScalarRegsPerSimd;

    // this hash map will keep track of page divergence
    // per memory instruction per wavefront. The hash map
    // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
    std::map<Addr, int> pagesTouched;

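    // Illustrative sketch (not part of the original header): each lane's
    // target address is folded into pagesTouched keyed by its page address,
    // roughly
    //
    //   pagesTouched[roundDown(vaddr, X86ISA::PageBytes)]++;
    //
    // and the number of distinct keys per instruction feeds the page
    // divergence statistics (see updatePageDivergenceDist() below). The
    // exact page-size constant used here is an assumption for illustration.
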
    void insertInPipeMap(Wavefront *w);
    void deleteFromPipeMap(Wavefront *w);

    ComputeUnit(const Params &p);
    ~ComputeUnit();

    // Timing Functions
    int oprNetPipeLength() const { return operandNetworkLength; }
    int simdUnitWidth() const { return simdWidth; }
    int spBypassLength() const { return spBypassPipeLength; }
    int dpBypassLength() const { return dpBypassPipeLength; }
    int rfcLength() const { return rfcPipeLength; }
    int scalarPipeLength() const { return scalarPipeStages; }
    int storeBusLength() const { return numCyclesPerStoreTransfer; }
    int loadBusLength() const { return numCyclesPerLoadTransfer; }
    int wfSize() const { return wavefrontSize; }

    void exec();
    void initiateFetch(Wavefront *wavefront);
    void fetch(PacketPtr pkt, Wavefront *wavefront);
    void fillKernelState(Wavefront *w, HSAQueueEntry *task);

    void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
                        HSAQueueEntry *task, int bar_id,
                        bool fetchContext=false);

    void doInvalidate(RequestPtr req, int kernId);
    void doFlush(GPUDynInstPtr gpuDynInst);
    void doSQCInvalidate(RequestPtr req, int kernId);

    void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg);
    bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg);

    int cacheLineSize() const { return _cacheLineSize; }
    int getCacheLineBits() const { return cacheLineBits; }

    void resetRegisterPool();

  private:
    WFBarrier&
    barrierSlot(int bar_id)
    {
        assert(bar_id > WFBarrier::InvalidID);
        return wfBarrierSlots.at(bar_id);
    }

    int
    getFreeBarrierId()
    {
        assert(freeBarrierIds.size());
        auto free_bar_id = freeBarrierIds.begin();
        int bar_id = *free_bar_id;
        freeBarrierIds.erase(free_bar_id);
        return bar_id;
    }

  public:
    int numYetToReachBarrier(int bar_id);
    bool allAtBarrier(int bar_id);
    void incNumAtBarrier(int bar_id);
    int numAtBarrier(int bar_id);
    int maxBarrierCnt(int bar_id);
    void resetBarrier(int bar_id);
    void decMaxBarrierCnt(int bar_id);
    void releaseBarrier(int bar_id);
    void releaseWFsFromBarrier(int bar_id);
    int numBarrierSlots() const { return _numBarrierSlots; }

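    // Illustrative sketch (not part of the original header): a workgroup
    // that needs a barrier is handed a free slot at dispatch and gives it
    // back when it finishes; the flow is roughly
    //
    //   int bar_id = getFreeBarrierId();        // private helper above
    //   barrierSlot(bar_id).setMaxBarrierCnt(num_wfs_in_wg);
    //   ...
    //   incNumAtBarrier(bar_id);                // per arriving WF
    //   if (allAtBarrier(bar_id)) {
    //       releaseWFsFromBarrier(bar_id);
    //       resetBarrier(bar_id);               // re-arm for the next barrier
    //   }
    //   ...
    //   releaseBarrier(bar_id);                 // WG done; the slot returns
    //                                           // to freeBarrierIds
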
    template<typename c0, typename c1>
    void doSmReturn(GPUDynInstPtr gpuDynInst);

    virtual void init() override;
    void sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt);
    void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt);
    void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
                              bool kernelMemSync,
                              RequestPtr req=nullptr);
    void handleMemPacket(PacketPtr pkt, int memport_index);
    bool processTimingPacket(PacketPtr pkt);
    void processFetchReturn(PacketPtr pkt);
    void updatePageDivergenceDist(Addr addr);

    RequestorID requestorId() { return _requestorId; }
    RequestorID vramRequestorId();

    bool isDone() const;
    bool isVectorAluIdle(uint32_t simdId) const;

    void handleSQCReturn(PacketPtr pkt);

    void sendInvL2(Addr paddr);

  protected:
    RequestorID _requestorId;

    LdsState &lds;

  public:
    LdsState &
    getLds() const
    {
        return lds;
    }

    int32_t
    getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;

    [[nodiscard]] bool sendToLds(GPUDynInstPtr gpuDynInst);

    typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
    pageDataStruct pageAccesses;

    void exitCallback();

    class GMTokenPort : public TokenRequestPort
    {
      public:
        GMTokenPort(const std::string& name, SimObject *owner,
                    PortID id = InvalidPortID)
            : TokenRequestPort(name, owner, id)
        { }
        ~GMTokenPort() { }

      protected:
        bool recvTimingResp(PacketPtr) { return false; }
        void recvReqRetry() { }
    };

    // Manager for the number of tokens available to this compute unit to
    // send global memory request packets to the coalescer this is only used
    // between global memory pipe and TCP coalescer.
    TokenManager *memPortTokens;
    GMTokenPort gmTokenPort;

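    // Illustrative sketch (not part of the original header), assuming the
    // TokenManager interface from mem/token_port.hh: before issuing a global
    // memory request the pipeline checks for and acquires a token, and the
    // coalescer returns tokens as it drains requests, e.g.
    //
    //   if (memPortTokens->haveTokens(1)) {
    //       memPortTokens->acquireTokens(1);
    //       // ... send the request packet ...
    //   }
    //
    // getTokenManager() (declared later in this class) exposes the manager
    // to the global memory pipeline.
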
    class DataPort : public RequestPort
    {
      public:
        DataPort(const std::string &_name, ComputeUnit *_cu, PortID id)
            : RequestPort(_name, id), computeUnit(_cu) { }

        struct SenderState : public Packet::SenderState
        {
            GPUDynInstPtr _gpuDynInst;
            ComputeUnit *computeUnit = nullptr;
            PortID port_index;
            Packet::SenderState *saved;

            SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
                        Packet::SenderState *sender_state=nullptr)
                : _gpuDynInst(gpuDynInst),
                  port_index(_port_index),
                  saved(sender_state) { }

            SenderState(ComputeUnit *cu, PortID _port_index,
                        Packet::SenderState *sender_state=nullptr)
                : computeUnit(cu),
                  port_index(_port_index),
                  saved(sender_state) { }
        };

        class SystemHubEvent : public Event
        {
            DataPort *dataPort;
            PacketPtr reqPkt;

          public:
            SystemHubEvent(PacketPtr pkt, DataPort *_dataPort)
                : dataPort(_dataPort), reqPkt(pkt)
            {
                setFlags(Event::AutoDelete);
            }

            void
            process()
            {
                // DMAs do not operate on packets and therefore do not
                // convert to a response. Do that here instead.
                reqPkt->makeResponse();
                dataPort->handleResponse(reqPkt);
            }
        };

        void processMemReqEvent(PacketPtr pkt);
        EventFunctionWrapper *createMemReqEvent(PacketPtr pkt);

        void processMemRespEvent(PacketPtr pkt);
        EventFunctionWrapper *createMemRespEvent(PacketPtr pkt);

        std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;

        bool handleResponse(PacketPtr pkt);

      protected:
        ComputeUnit *computeUnit;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }
    };

    // Scalar data cache access port
    class ScalarDataPort : public RequestPort
    {
      public:
        ScalarDataPort(const std::string &_name, ComputeUnit *_cu)
            : RequestPort(_name), computeUnit(_cu)
        {
        }

        bool recvTimingResp(PacketPtr pkt) override;
        void recvReqRetry() override;

        struct SenderState : public Packet::SenderState
        {
            SenderState(GPUDynInstPtr gpuDynInst,
                        Packet::SenderState *sender_state=nullptr)
                : _gpuDynInst(gpuDynInst), saved(sender_state)
            {
            }

            GPUDynInstPtr _gpuDynInst;
            Packet::SenderState *saved;
        };

        class MemReqEvent : public Event
        {
          private:
            ScalarDataPort &scalarDataPort;
            PacketPtr pkt;

          public:
            MemReqEvent(ScalarDataPort &_scalar_data_port, PacketPtr _pkt)
                : Event(), scalarDataPort(_scalar_data_port), pkt(_pkt)
            {
                setFlags(Event::AutoDelete);
            }

            void process();
            const char *description() const;
        };

        class SystemHubEvent : public Event
        {
            ScalarDataPort *dataPort;
            PacketPtr reqPkt;

          public:
            SystemHubEvent(PacketPtr pkt, ScalarDataPort *_dataPort)
                : dataPort(_dataPort), reqPkt(pkt)
            {
                setFlags(Event::AutoDelete);
            }

            void
            process()
            {
                // DMAs do not operate on packets and therefore do not
                // convert to a response. Do that here instead.
                reqPkt->makeResponse();
                dataPort->handleResponse(reqPkt);
            }
        };

        bool handleResponse(PacketPtr pkt);

        std::deque<PacketPtr> retries;

      private:
        ComputeUnit *computeUnit;
    };

    // Instruction cache access port
    class SQCPort : public RequestPort
    {
      public:
        SQCPort(const std::string &_name, ComputeUnit *_cu)
            : RequestPort(_name), computeUnit(_cu) { }

        struct SenderState : public Packet::SenderState
        {
            Wavefront *wavefront;
            Packet::SenderState *saved;
            // kernel id to be used in handling I-Cache invalidate response
            int kernId;

            SenderState(Wavefront *_wavefront, Packet::SenderState
                        *sender_state=nullptr, int _kernId=-1)
                : wavefront(_wavefront), saved(sender_state),
                  kernId(_kernId){ }
        };

        class MemReqEvent : public Event
        {
          private:
            SQCPort &sqcPort;
            PacketPtr pkt;

          public:
            MemReqEvent(SQCPort &_sqc_port, PacketPtr _pkt)
                : Event(), sqcPort(_sqc_port), pkt(_pkt)
            {
                setFlags(Event::AutoDelete);
            }

            void process();
            const char *description() const;
        };

        std::deque<std::pair<PacketPtr, Wavefront*>> retries;

      protected:
        ComputeUnit *computeUnit;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }
    };

728
730 class DTLBPort : public RequestPort
731 {
732 public:
733 DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID id)
735 stalled(false)
736 { }
737
738 bool isStalled() { return stalled; }
739 void stallPort() { stalled = true; }
740 void unstallPort() { stalled = false; }
741
747
752 {
753 // the memInst that this is associated with
755
756 // the lane in the memInst this is associated with, so we send
757 // the memory request down the right port
759
760 // constructor used for packets involved in timing accesses
761 SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
762 : _gpuDynInst(gpuDynInst), portIndex(port_index) { }
763
764 };
765
766 protected:
769
770 virtual bool recvTimingResp(PacketPtr pkt);
771 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
772 virtual void recvFunctional(PacketPtr pkt) { }
773 virtual void recvRangeChange() { }
774 virtual void recvReqRetry();
775 };
776
    class ScalarDTLBPort : public RequestPort
    {
      public:
        ScalarDTLBPort(const std::string &_name, ComputeUnit *_cu)
            : RequestPort(_name), computeUnit(_cu), stalled(false)
        {
        }

        struct SenderState : public Packet::SenderState
        {
            SenderState(GPUDynInstPtr gpuDynInst) : _gpuDynInst(gpuDynInst) { }
            GPUDynInstPtr _gpuDynInst;
        };

        bool recvTimingResp(PacketPtr pkt) override;
        void recvReqRetry() override { assert(false); }

        bool isStalled() const { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        std::deque<PacketPtr> retries;

      private:
        ComputeUnit *computeUnit;
        bool stalled;
    };

    class ITLBPort : public RequestPort
    {
      public:
        ITLBPort(const std::string &_name, ComputeUnit *_cu)
            : RequestPort(_name), computeUnit(_cu), stalled(false) { }

        bool isStalled() { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the translation requests that were
         * not successfully sent.
         */
        std::deque<PacketPtr> retries;

        /**
         * SenderState is information carried along with the packet
         * throughout the TLB hierarchy.
         */
        struct SenderState: public Packet::SenderState
        {
            // The wavefront associated with this request
            Wavefront *wavefront;

            SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
        };

      protected:
        ComputeUnit *computeUnit;
        bool stalled;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };

    /**
     * the port intended to communicate between the CU and its LDS
     */
    class LDSPort : public RequestPort
    {
      public:
        LDSPort(const std::string &_name, ComputeUnit *_cu)
        : RequestPort(_name), computeUnit(_cu)
        {
        }

        bool isStalled() const { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the requests that were
         * not successfully sent.
         */
        std::queue<PacketPtr> retries;

        class SenderState : public Packet::SenderState
        {
          protected:
            // The actual read/write/atomic request that goes with this command
            GPUDynInstPtr _gpuDynInst;

          public:
            SenderState(GPUDynInstPtr gpuDynInst):
              _gpuDynInst(gpuDynInst)
            {
            }

            GPUDynInstPtr
            getMemInst() const
            {
                return _gpuDynInst;
            }
        };

        virtual bool
        sendTimingReq(PacketPtr pkt);

      protected:

        bool stalled = false;

        ComputeUnit *computeUnit;

        virtual bool
        recvTimingResp(PacketPtr pkt);

        virtual Tick
        recvAtomic(PacketPtr pkt) { return 0; }

        virtual void
        recvFunctional(PacketPtr pkt)
        {
        }

        virtual void
        recvRangeChange()
        {
        }

        virtual void
        recvReqRetry();
    };

    /**
     * The port to access the Local Data Store.
     * Can be connected to a LDS object.
     */
    LDSPort ldsPort;

    TokenManager *
    getTokenManager()
    {
        return memPortTokens;
    }

    /** The memory port for SIMD data accesses. */
    std::vector<DataPort> memPort;
    // port to the TLB hierarchy (i.e., the L1 TLB)
    std::vector<DTLBPort> tlbPort;
    // port to the scalar data cache
    ScalarDataPort scalarDataPort;
    // port to the scalar data TLB
    ScalarDTLBPort scalarDTLBPort;
    // port to the SQC (i.e. the I-cache)
    SQCPort sqcPort;
    // port to the SQC TLB (there's a separate TLB for each I-cache)
    ITLBPort sqcTLBPort;

    Port &
    getPort(const std::string &if_name, PortID idx) override
    {
        if (if_name == "memory_port" && idx < memPort.size()) {
            return memPort[idx];
        } else if (if_name == "translation_port" && idx < tlbPort.size()) {
            return tlbPort[idx];
        } else if (if_name == "scalar_port") {
            return scalarDataPort;
        } else if (if_name == "scalar_tlb_port") {
            return scalarDTLBPort;
        } else if (if_name == "sqc_port") {
            return sqcPort;
        } else if (if_name == "sqc_tlb_port") {
            return sqcTLBPort;
        } else if (if_name == "ldsPort") {
            return ldsPort;
        } else if (if_name == "gmTokenPort") {
            return gmTokenPort;
        } else {
            return ClockedObject::getPort(if_name, idx);
        }
    }

    InstSeqNum getAndIncSeqNum() { return globalSeqNum++; }

  private:
    const int _cacheLineSize;
    const int _numBarrierSlots;
    int cacheLineBits;
    InstSeqNum globalSeqNum;
    int wavefrontSize;

    ScoreboardCheckToSchedule scoreboardCheckToSchedule;
    ScheduleToExecute scheduleToExecute;

    /** The barrier slots for this CU. */
    std::vector<WFBarrier> wfBarrierSlots;
    /** A set used to easily retrieve a free barrier ID. */
    std::unordered_set<int> freeBarrierIds;

    // hold the time of the arrival of the first cache block related to
    // a particular GPUDynInst. This is used to calculate the difference
    // between the first and last cache block arrival times.
    std::unordered_map<GPUDynInstPtr, Tick> headTailMap;

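    // Illustrative sketch (not part of the original header): when the first
    // cache block for a coalesced request arrives, its tick is recorded in
    // headTailMap; when the last block arrives, the difference
    //
    //   Tick delta = curTick() - headTailMap.at(gpuDynInst);
    //
    // is sampled into the headTailLatency distribution below and the entry
    // is erased. (curTick() is the global gem5 tick accessor; the exact
    // update site lives in compute_unit.cc.)
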
  public:
    void updateInstStats(GPUDynInstPtr gpuDynInst);
    int activeWaves;

    struct ComputeUnitStats : public statistics::Group
    {
        ComputeUnitStats(statistics::Group *parent, int n_wf);

        statistics::Formula vectorMemReadsPerKiloInst;
        statistics::Formula vectorMemWritesPerKiloInst;
        statistics::Formula vectorMemInstsPerKiloInst;
        statistics::Formula scalarMemReadsPerKiloInst;
        statistics::Formula scalarMemWritesPerKiloInst;
        statistics::Formula scalarMemInstsPerKiloInst;

        // Cycles required to send register source (addr and data) from
        // register files to memory pipeline, per SIMD.
        statistics::Vector instCyclesVMemPerSimd;
        statistics::Vector instCyclesScMemPerSimd;
        statistics::Vector instCyclesLdsPerSimd;

        statistics::Distribution waveLevelParallelism;

        // the following stats compute the avg. TLB access latency per
        // uncoalesced request (only for data)
        statistics::Scalar tlbRequests;
        statistics::Scalar tlbCycles;
        statistics::Formula tlbLatency;
        // hitsPerTLBLevel[x] are the hits in Level x TLB.
        // x = 0 is the page table.
        statistics::Vector hitsPerTLBLevel;

        statistics::Scalar ldsBankAccesses;
        statistics::Distribution ldsBankConflictDist;

        // over all memory instructions executed over all wavefronts
        // how many touched 0-4 pages, 4-8, ..., 60-64 pages
        statistics::Distribution pageDivergenceDist;
        // count of non-flat global memory vector instructions executed
        statistics::Scalar dynamicGMemInstrCnt;
        // count of flat global memory vector instructions executed
        statistics::Scalar dynamicFlatMemInstrCnt;
        statistics::Scalar dynamicLMemInstrCnt;

        statistics::Scalar wgBlockedDueBarrierAllocation;
        statistics::Scalar wgBlockedDueLdsAllocation;
        // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
        // active when the instruction is committed, this number is still
        // incremented by 1
        statistics::Scalar numInstrExecuted;
        // Number of cycles among successive instruction executions across all
        // wavefronts of the same CU
        statistics::Distribution execRateDist;
        // number of individual vector operations executed
        statistics::Scalar numVecOpsExecuted;
        // number of individual f16 vector operations executed
        statistics::Scalar numVecOpsExecutedF16;
        // number of individual f32 vector operations executed
        statistics::Scalar numVecOpsExecutedF32;
        // number of individual f64 vector operations executed
        statistics::Scalar numVecOpsExecutedF64;
        // number of individual FMA 16,32,64 vector operations executed
        statistics::Scalar numVecOpsExecutedFMA16;
        statistics::Scalar numVecOpsExecutedFMA32;
        statistics::Scalar numVecOpsExecutedFMA64;
        // number of individual MAC 16,32,64 vector operations executed
        statistics::Scalar numVecOpsExecutedMAC16;
        statistics::Scalar numVecOpsExecutedMAC32;
        statistics::Scalar numVecOpsExecutedMAC64;
        // number of individual MAD 16,32,64 vector operations executed
        statistics::Scalar numVecOpsExecutedMAD16;
        statistics::Scalar numVecOpsExecutedMAD32;
        statistics::Scalar numVecOpsExecutedMAD64;
        // number of individual MFMA 16,32,64 vector operations executed
        statistics::Scalar numVecOpsExecutedMFMA;
        statistics::Scalar numVecOpsExecutedMFMAF16;
        statistics::Scalar numVecOpsExecutedMFMAF32;
        statistics::Scalar numVecOpsExecutedMFMAF64;
        // total number of two op FP vector operations executed
        statistics::Scalar numVecOpsExecutedTwoOpFP;
        // Total cycles that something is running on the GPU
        statistics::Scalar totalCycles;
        statistics::Formula vpc; // vector ops per cycle
        statistics::Formula vpc_f16; // vector ops per cycle
        statistics::Formula vpc_f32; // vector ops per cycle
        statistics::Formula vpc_f64; // vector ops per cycle
        statistics::Formula ipc; // vector instructions per cycle

        statistics::Distribution controlFlowDivergenceDist;
        statistics::Distribution activeLanesPerGMemInstrDist;
        statistics::Distribution activeLanesPerLMemInstrDist;
        // number of vector ALU instructions received
        statistics::Formula numALUInstsExecuted;
        // number of times a WG cannot start due to lack of free VGPRs in SIMDs
        statistics::Scalar numTimesWgBlockedDueVgprAlloc;
        // number of times a WG cannot start due to lack of free SGPRs in SIMDs
        statistics::Scalar numTimesWgBlockedDueSgprAlloc;

        // distribution in latency difference between first and last cache
        // block arrival ticks
        statistics::Distribution headTailLatency;

        // Track the amount of interleaving between wavefronts on each SIMD.
        // This stat is sampled using instExecPerSimd to compute the number
        // of instructions that have been executed on a SIMD between a WF
        // executing two successive instructions.
        statistics::VectorDistribution instInterleave;
    } stats;
};

} // namespace gem5

#endif // __COMPUTE_UNIT_HH__