gem5 v23.0.0.0
compute_unit.hh
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __COMPUTE_UNIT_HH__
#define __COMPUTE_UNIT_HH__

#include <deque>
#include <map>
#include <unordered_set>
#include <vector>

#include "base/callback.hh"
#include "base/compiler.hh"
#include "base/statistics.hh"
#include "base/stats/group.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "enums/PrefetchType.hh"
#include "gpu-compute/comm.hh"
#include "mem/port.hh"
#include "mem/token_port.hh"
#include "sim/clocked_object.hh"

namespace gem5
{

class HSAQueueEntry;
class LdsChunk;
class ScalarRegisterFile;
class Shader;
class VectorRegisterFile;

struct ComputeUnitParams;

enum EXEC_POLICY
{
    OLDEST = 0,
    RR
};

enum TLB_CACHE
{
    TLB_MISS_CACHE_MISS = 0,
    TLB_MISS_CACHE_HIT,
    TLB_HIT_CACHE_MISS,
    TLB_HIT_CACHE_HIT
};

/**
 * WF barrier slots. This represents the barrier resource for
 * WF-level barriers (i.e., barriers to sync WFs within a WG).
 */
class WFBarrier
{
  public:
    WFBarrier() : _numAtBarrier(0), _maxBarrierCnt(0)
    {
    }

    static const int InvalidID = -1;

    int
    numAtBarrier() const
    {
        return _numAtBarrier;
    }

    /**
     * Number of WFs that have not yet reached the barrier.
     */
    int
    numYetToReachBarrier() const
    {
        return _maxBarrierCnt - _numAtBarrier;
    }

    int
    maxBarrierCnt() const
    {
        return _maxBarrierCnt;
    }

    /**
     * Set the maximum barrier count (i.e., the number of WFs that are
     * participating in the barrier).
     */
    void
    setMaxBarrierCnt(int max_barrier_cnt)
    {
        _maxBarrierCnt = max_barrier_cnt;
    }

    /**
     * Mark that a WF has reached the barrier.
     */
    void
    incNumAtBarrier()
    {
        assert(_numAtBarrier < _maxBarrierCnt);
        ++_numAtBarrier;
    }

    /**
     * Have all WFs participating in this barrier reached the barrier?
     * If so, the barrier is satisfied and WFs may proceed past it.
     */
    bool
    allAtBarrier() const
    {
        return _numAtBarrier == _maxBarrierCnt;
    }

    /**
     * Decrement the number of WFs that are participating in this
     * barrier. This is called when a WF exits early.
     */
    void
    decMaxBarrierCnt()
    {
        assert(_maxBarrierCnt > 0);
        --_maxBarrierCnt;
    }

    /**
     * Release this barrier resource so it can be used by other WGs.
     */
    void
    release()
    {
        _numAtBarrier = 0;
        _maxBarrierCnt = 0;
    }

    /**
     * Reset the barrier. Typically called when a dynamic instance of a
     * barrier has been satisfied.
     */
    void
    reset()
    {
        _numAtBarrier = 0;
    }

  private:
    /**
     * The number of WFs in the WG that have reached the barrier.
     */
    int _numAtBarrier;

    /**
     * The maximum number of WFs that can reach this barrier. This is
     * essentially the number of WFs in the WG; a barrier is satisfied
     * when the number of WFs that have reached it equals this value.
     */
    int _maxBarrierCnt;
};
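
// A short illustration (not part of the original header) of how a WFBarrier
// slot's counters are meant to evolve, using only the methods above. Assume
// a WG with four WFs, one of which exits early:
//
//     WFBarrier bar;
//     bar.setMaxBarrierCnt(4);        // 4 WFs participate in the barrier
//     bar.incNumAtBarrier();          // WF 0 arrives
//     bar.incNumAtBarrier();          // WF 1 arrives
//     bar.decMaxBarrierCnt();         // WF 2 exits early; only 3 expected now
//     bar.incNumAtBarrier();          // WF 3 arrives
//     assert(bar.allAtBarrier());     // 3 arrived == 3 expected
//     bar.reset();                    // re-arm for the next dynamic barrier
//     bar.release();                  // WG done; slot reusable by another WG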

class ComputeUnit : public ClockedObject
{
  public:

    // Execution resources
    //
    // The ordering of units is:
    // Vector ALUs
    // Scalar ALUs
    // GM Pipe
    // LM Pipe
    // Scalar Mem Pipe
    //
    // Note: the ordering of units is important and the code assumes the
    // above ordering. However, there may be more than one resource of
    // each type (e.g., 4 VALUs or 2 SALUs)

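    // Illustrative only (not a declaration from this file): the flat-index
    // arithmetic the ordering above implies. The count names used here are
    // assumptions for the example; the real accessors are numExeUnits(),
    // firstMemUnit() and lastMemUnit() below.
    //
    //     numExeUnits  = numVectorALUs + numScalarALUs
    //                  + numGlobalMemUnits + numSharedMemUnits
    //                  + numScalarMemUnits;
    //     firstMemUnit = numVectorALUs + numScalarALUs;  // first GM unit
    //     lastMemUnit  = numExeUnits - 1;                // the scalar mem unit
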
    // Resource control for global memory to VRF data/address bus
    WaitClass glbMemToVrfBus;
    // Resource control for Vector Register File->Global Memory pipe buses
    WaitClass vrfToGlobalMemPipeBus;
    // Resource control for Vector Global Memory execution unit
    WaitClass vectorGlobalMemUnit;

    // Resource control for local memory to VRF data/address bus
    WaitClass locMemToVrfBus;
    // Resource control for Vector Register File->Local Memory pipe buses
    WaitClass vrfToLocalMemPipeBus;
    // Resource control for Vector Shared/Local Memory execution unit
    WaitClass vectorSharedMemUnit;

    // Resource control for scalar memory to SRF data/address bus
    WaitClass scalarMemToSrfBus;
    // Resource control for Scalar Register File->Scalar Memory pipe buses
    WaitClass srfToScalarMemPipeBus;
    // Resource control for Scalar Memory execution unit
    WaitClass scalarMemUnit;

    // vector ALU execution resources
    int numVectorALUs;
    std::vector<WaitClass> vectorALUs;

    // scalar ALU execution resources
    int numScalarALUs;
    std::vector<WaitClass> scalarALUs;

    // Return total number of execution units on this CU
    int numExeUnits() const;
    // index into readyList of the first memory unit
    int firstMemUnit() const;
    // index into readyList of the last memory unit
    int lastMemUnit() const;
    // index into scalarALUs vector of SALU used by the wavefront
    int mapWaveToScalarAlu(Wavefront *w) const;
    // index into readyList of SALU used by wavefront
    int mapWaveToScalarAluGlobalIdx(Wavefront *w) const;
    // index into readyList of Global Memory unit used by wavefront
    int mapWaveToGlobalMem(Wavefront *w) const;
    // index into readyList of Local Memory unit used by wavefront
    int mapWaveToLocalMem(Wavefront *w) const;
    // index into readyList of Scalar Memory unit used by wavefront
    int mapWaveToScalarMem(Wavefront *w) const;

    int vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
    int coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
    int numCyclesPerStoreTransfer; // number of cycles per vector store
    int numCyclesPerLoadTransfer; // number of cycles per vector load

    // track presence of dynamic instructions in the Schedule pipeline
    // stage. This is used to check the readiness of the oldest,
    // non-dispatched instruction of every WF in the Scoreboard stage.
    std::unordered_set<uint64_t> pipeMap;
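
    // Sketch of how such a set is typically used (variable names here are
    // illustrative; the real insert/erase sites live in the pipeline stages):
    //
    //     pipeMap.insert(seqNum);                      // inst enters Schedule
    //     bool inFlight = pipeMap.count(seqNum) > 0;   // Scoreboard check
    //     pipeMap.erase(seqNum);                       // inst leaves the pipe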

    RegisterManager *registerManager;

    FetchStage fetchStage;
    ScoreboardCheckStage scoreboardCheckStage;
    ScheduleStage scheduleStage;
    ExecStage execStage;
    GlobalMemPipeline globalMemoryPipe;
    LocalMemPipeline localMemoryPipe;
    ScalarMemPipeline scalarMemoryPipe;

    EventFunctionWrapper tickEvent;

    typedef ComputeUnitParams Params;
    std::vector<std::vector<Wavefront*>> wfList;
    int cu_id;

    // array of vector register files, one per SIMD
    std::vector<VectorRegisterFile*> vrf;
    // array of scalar register files, one per SIMD
    std::vector<ScalarRegisterFile*> srf;

    // Width per VALU/SIMD unit: number of work items that can be executed
    // on the vector ALU simultaneously in a SIMD unit
    int simdWidth;
    // number of pipe stages for bypassing data to next dependent single
    // precision vector instruction inside the vector ALU pipeline
    int spBypassPipeLength;
    // number of pipe stages for bypassing data to next dependent double
    // precision vector instruction inside the vector ALU pipeline
    int dpBypassPipeLength;
    // number of pipe stages for scalar ALU
    int scalarPipeStages;
    // number of pipe stages for operand collection & distribution network
    int operandNetworkLength;
    // number of cycles per instruction issue period
    Cycles issuePeriod;

    // VRF to GM Bus latency
    Cycles vrf_gm_bus_latency;
    // SRF to Scalar Mem Bus latency
    Cycles srf_scm_bus_latency;
    // VRF to LM Bus latency
    Cycles vrf_lm_bus_latency;

    // tracks the last cycle a vector instruction was executed on a SIMD
    std::vector<uint64_t> lastExecCycle;

    // tracks the number of dyn inst executed per SIMD
    std::vector<uint64_t> instExecPerSimd;

    // true if we allow a separate TLB per lane
    bool perLaneTLB;
    // if 0, TLB prefetching is off.
    int prefetchDepth;
    // if fixed-stride prefetching, this is the stride.
    int prefetchStride;

    std::vector<Addr> lastVaddrCU;
    std::vector<std::vector<Addr>> lastVaddrSimd;
    std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
    enums::PrefetchType prefetchType;
    EXEC_POLICY exec_policy;

    // Idle CU timeout in ticks
    Tick idleCUTimeout;
    int idleWfs;

    /*
     * for Counting page accesses
     */
    bool countPages;

    Shader *shader;

    // number of currently reserved vector registers per SIMD unit
    std::vector<int> vectorRegsReserved;
    // number of currently reserved scalar registers per SIMD unit
    std::vector<int> scalarRegsReserved;
    // number of vector registers per SIMD unit
    int numVecRegsPerSimd;
    // number of available scalar registers per SIMD unit
    int numScalarRegsPerSimd;

    // this hash map will keep track of page divergence
    // per memory instruction per wavefront. The hash map
    // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
    std::map<Addr, int> pagesTouched;

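    // Sketch of how page divergence is typically accumulated per memory
    // instruction (cf. updatePageDivergenceDist()); the 4 KB page size here
    // is an assumption for the example:
    //
    //     Addr page_addr = addr & ~Addr(4096 - 1);   // round down to page
    //     pagesTouched[page_addr]++;                 // count touches per page
    //     // pagesTouched.size() later feeds the page divergence statistics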

    ComputeUnit(const Params &p);
    ~ComputeUnit();

    // Timing Functions
    int oprNetPipeLength() const { return operandNetworkLength; }
    int simdUnitWidth() const { return simdWidth; }
    int spBypassLength() const { return spBypassPipeLength; }
    int dpBypassLength() const { return dpBypassPipeLength; }
    int scalarPipeLength() const { return scalarPipeStages; }
    int storeBusLength() const { return numCyclesPerStoreTransfer; }
    int loadBusLength() const { return numCyclesPerLoadTransfer; }
    int wfSize() const { return wavefrontSize; }

    void exec();
    void initiateFetch(Wavefront *wavefront);
    void fetch(PacketPtr pkt, Wavefront *wavefront);
    void fillKernelState(Wavefront *w, HSAQueueEntry *task);

    void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
                        HSAQueueEntry *task, int bar_id,
                        bool fetchContext=false);

    void doInvalidate(RequestPtr req, int kernId);
    void doFlush(GPUDynInstPtr gpuDynInst);

    void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg);
    bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg);

    int cacheLineSize() const { return _cacheLineSize; }
    int getCacheLineBits() const { return cacheLineBits; }

    void resetRegisterPool();

  private:
    WFBarrier&
    barrierSlot(int bar_id)
    {
        assert(bar_id > WFBarrier::InvalidID);
        return wfBarrierSlots.at(bar_id);
    }

    int
    getFreeBarrierId()
    {
        assert(freeBarrierIds.size());
        auto free_bar_id = freeBarrierIds.begin();
        int bar_id = *free_bar_id;
        freeBarrierIds.erase(free_bar_id);
        return bar_id;
    }

  public:
    int numYetToReachBarrier(int bar_id);
    bool allAtBarrier(int bar_id);
    void incNumAtBarrier(int bar_id);
    int numAtBarrier(int bar_id);
    int maxBarrierCnt(int bar_id);
    void resetBarrier(int bar_id);
    void decMaxBarrierCnt(int bar_id);
    void releaseBarrier(int bar_id);
    void releaseWFsFromBarrier(int bar_id);
    int numBarrierSlots() const { return _numBarrierSlots; }
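
    // Simplified sketch of how these hooks fit together when a WF executes a
    // barrier instruction (the precise sequencing lives in the pipeline
    // stages, not in this header):
    //
    //     incNumAtBarrier(bar_id);
    //     if (allAtBarrier(bar_id)) {
    //         releaseWFsFromBarrier(bar_id);   // wake the WFs waiting here
    //         resetBarrier(bar_id);            // re-arm for the next barrier
    //     }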

    template<typename c0, typename c1>
    void doSmReturn(GPUDynInstPtr gpuDynInst);

    virtual void init() override;
    void sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt);
    void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt);
    void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
                              bool kernelMemSync,
                              RequestPtr req=nullptr);
    void handleMemPacket(PacketPtr pkt, int memport_index);

    bool isDone() const;
    bool isVectorAluIdle(uint32_t simdId) const;

    void handleSQCReturn(PacketPtr pkt);

  protected:
    RequestorID _requestorId;

    LdsState &lds;

  public:
    LdsState &
    getLds() const
    {
        return lds;
    }

    int32_t
    getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;

    [[nodiscard]] bool sendToLds(GPUDynInstPtr gpuDynInst);

    typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
    pageDataStruct pageAccesses;

    void exitCallback();

    class GMTokenPort : public TokenRequestPort
    {
      public:
        GMTokenPort(const std::string& name, SimObject *owner,
                    PortID id = InvalidPortID)
            : TokenRequestPort(name, owner, id)
        { }

      protected:
        bool recvTimingResp(PacketPtr) { return false; }
        void recvReqRetry() { }
    };

    // Manager for the number of tokens available to this compute unit to
    // send global memory request packets to the coalescer; this is only used
    // between the global memory pipe and the TCP coalescer.
    TokenManager *memPortTokens;
    GMTokenPort gmTokenPort;

    class DataPort : public RequestPort
    {
      public:
        DataPort(const std::string &_name, ComputeUnit *_cu, PortID id)
            : RequestPort(_name, id), computeUnit(_cu) { }

        struct SenderState : public Packet::SenderState
        {
            GPUDynInstPtr _gpuDynInst;
            PortID port_index;
            Packet::SenderState *saved;

            SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
                        Packet::SenderState *sender_state=nullptr)
                : _gpuDynInst(gpuDynInst),
                  port_index(_port_index),
                  saved(sender_state) { }
        };

        class SystemHubEvent : public Event
        {
            DataPort *dataPort;
            PacketPtr reqPkt;

          public:
            SystemHubEvent(PacketPtr pkt, DataPort *_dataPort)
                : dataPort(_dataPort), reqPkt(pkt)
            {
                setFlags(Event::AutoDelete);
            }

            void
            process()
            {
                // DMAs do not operate on packets and therefore do not
                // convert to a response. Do that here instead.
                reqPkt->makeResponse();
                dataPort->handleResponse(reqPkt);
            }
        };

        void processMemReqEvent(PacketPtr pkt);
        EventFunctionWrapper *createMemReqEvent(PacketPtr pkt);

        void processMemRespEvent(PacketPtr pkt);
        EventFunctionWrapper *createMemRespEvent(PacketPtr pkt);

        std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;

        bool handleResponse(PacketPtr pkt);

      protected:
        ComputeUnit *computeUnit;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }
    };

    // Scalar data cache access port
    class ScalarDataPort : public RequestPort
    {
      public:
        ScalarDataPort(const std::string &_name, ComputeUnit *_cu)
            : RequestPort(_name), computeUnit(_cu)
        {
        }

        bool recvTimingResp(PacketPtr pkt) override;
        void recvReqRetry() override;

        struct SenderState : public Packet::SenderState
        {
            SenderState(GPUDynInstPtr gpuDynInst,
                        Packet::SenderState *sender_state=nullptr)
                : _gpuDynInst(gpuDynInst), saved(sender_state)
            {
            }

            GPUDynInstPtr _gpuDynInst;
            Packet::SenderState *saved;
        };

        class MemReqEvent : public Event
        {
          private:
            ScalarDataPort &scalarDataPort;
            PacketPtr pkt;

          public:
            MemReqEvent(ScalarDataPort &_scalar_data_port, PacketPtr _pkt)
                : Event(), scalarDataPort(_scalar_data_port), pkt(_pkt)
            {
                setFlags(Event::AutoDelete);
            }

            void process();
            const char *description() const;
        };

        class SystemHubEvent : public Event
        {
            ScalarDataPort *dataPort;
            PacketPtr reqPkt;

          public:
            SystemHubEvent(PacketPtr pkt, ScalarDataPort *_dataPort)
                : dataPort(_dataPort), reqPkt(pkt)
            {
                setFlags(Event::AutoDelete);
            }

            void
            process()
            {
                // DMAs do not operate on packets and therefore do not
                // convert to a response. Do that here instead.
                reqPkt->makeResponse();
                dataPort->handleResponse(reqPkt);
            }
        };

        bool handleResponse(PacketPtr pkt);

        std::deque<PacketPtr> retries;

      private:
        ComputeUnit *computeUnit;
    };

    // Instruction cache access port
    class SQCPort : public RequestPort
    {
      public:
        SQCPort(const std::string &_name, ComputeUnit *_cu)
            : RequestPort(_name), computeUnit(_cu) { }

        struct SenderState : public Packet::SenderState
        {
            Wavefront *wavefront;
            Packet::SenderState *saved;
            // kernel id to be used in handling I-Cache invalidate response
            int kernId;

            SenderState(Wavefront *_wavefront, Packet::SenderState
                        *sender_state=nullptr, int _kernId=-1)
                : wavefront(_wavefront), saved(sender_state),
                  kernId(_kernId){ }
        };

        std::deque<std::pair<PacketPtr, Wavefront*>> retries;

      protected:
        ComputeUnit *computeUnit;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }
    };

    class DTLBPort : public RequestPort
    {
      public:
        DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID id)
            : RequestPort(_name, id), computeUnit(_cu),
              stalled(false)
        { }

        bool isStalled() { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the translation requests that were not
         * successfully sent.
         */
        std::deque<PacketPtr> retries;

        /**
         * SenderState is information carried along with the packet
         * throughout the TLB hierarchy.
         */
        struct SenderState : public Packet::SenderState
        {
            // the memInst that this is associated with
            GPUDynInstPtr _gpuDynInst;

            // the lane in the memInst this is associated with, so we send
            // the memory request down the right port
            PortID portIndex;

            // constructor used for packets involved in timing accesses
            SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
                : _gpuDynInst(gpuDynInst), portIndex(port_index) { }
        };

      protected:
        ComputeUnit *computeUnit;
        bool stalled;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };

    class ScalarDTLBPort : public RequestPort
    {
      public:
        ScalarDTLBPort(const std::string &_name, ComputeUnit *_cu)
            : RequestPort(_name), computeUnit(_cu), stalled(false)
        {
        }

        struct SenderState : public Packet::SenderState
        {
            SenderState(GPUDynInstPtr gpuDynInst) : _gpuDynInst(gpuDynInst) { }
            GPUDynInstPtr _gpuDynInst;
        };

        bool recvTimingResp(PacketPtr pkt) override;
        void recvReqRetry() override { assert(false); }

        bool isStalled() const { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        std::deque<PacketPtr> retries;

      private:
        ComputeUnit *computeUnit;
        bool stalled;
    };

    class ITLBPort : public RequestPort
    {
      public:
        ITLBPort(const std::string &_name, ComputeUnit *_cu)
            : RequestPort(_name), computeUnit(_cu), stalled(false) { }

        bool isStalled() { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the translation requests that were not
         * successfully sent.
         */
        std::deque<PacketPtr> retries;

        /**
         * SenderState is information carried along with the packet
         * throughout the TLB hierarchy.
         */
        struct SenderState : public Packet::SenderState
        {
            // The wavefront associated with this request
            Wavefront *wavefront;

            SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
        };

      protected:
        ComputeUnit *computeUnit;
        bool stalled;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };

    /**
     * the port intended to communicate between the CU and its LDS
     */
    class LDSPort : public RequestPort
    {
      public:
        LDSPort(const std::string &_name, ComputeUnit *_cu)
            : RequestPort(_name), computeUnit(_cu)
        {
        }

        bool isStalled() const { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the requests that were not successfully sent.
         */
        std::queue<PacketPtr> retries;

        /**
         * SenderState is information carried along with the packet, esp.
         * the GPUDynInstPtr
         */
        class SenderState : public Packet::SenderState
        {
          protected:
            // The actual read/write/atomic request that goes with this command
            GPUDynInstPtr _gpuDynInst = nullptr;

          public:
            SenderState(GPUDynInstPtr gpuDynInst):
              _gpuDynInst(gpuDynInst)
            {
            }

            GPUDynInstPtr
            getMemInst() const
            {
                return _gpuDynInst;
            }
        };

        virtual bool
        sendTimingReq(PacketPtr pkt);

      protected:

        bool stalled = false;

        ComputeUnit *computeUnit;

        virtual bool
        recvTimingResp(PacketPtr pkt);

        virtual Tick
        recvAtomic(PacketPtr pkt) { return 0; }

        virtual void
        recvFunctional(PacketPtr pkt)
        {
        }

        virtual void
        recvRangeChange()
        {
        }

        virtual void
        recvReqRetry();
    };

    /**
     * The port to access the Local Data Store.
     * Can be connected to a LDS object.
     */
    LDSPort ldsPort;

    TokenManager *
    getTokenManager()
    {
        return memPortTokens;
    }

    /** The memory port for SIMD data accesses. */
    std::vector<DataPort> memPort;
    // port to the TLB hierarchy (i.e., the L1 TLB)
    std::vector<DTLBPort> tlbPort;
    // port to the scalar data cache
    ScalarDataPort scalarDataPort;
    // port to the scalar data TLB
    ScalarDTLBPort scalarDTLBPort;
    // port to the SQC (i.e. the I-cache)
    SQCPort sqcPort;
    // port to the SQC TLB (there's a separate TLB for each I-cache)
    ITLBPort sqcTLBPort;

    Port &
    getPort(const std::string &if_name, PortID idx) override
    {
        if (if_name == "memory_port" && idx < memPort.size()) {
            return memPort[idx];
        } else if (if_name == "translation_port" && idx < tlbPort.size()) {
            return tlbPort[idx];
        } else if (if_name == "scalar_port") {
            return scalarDataPort;
        } else if (if_name == "scalar_tlb_port") {
            return scalarDTLBPort;
        } else if (if_name == "sqc_port") {
            return sqcPort;
        } else if (if_name == "sqc_tlb_port") {
            return sqcTLBPort;
        } else if (if_name == "ldsPort") {
            return ldsPort;
        } else if (if_name == "gmTokenPort") {
            return gmTokenPort;
        } else {
            return ClockedObject::getPort(if_name, idx);
        }
    }

  private:
    const int _cacheLineSize;
    const int _numBarrierSlots;
    int cacheLineBits;
    InstSeqNum globalSeqNum;
    int wavefrontSize;

    // TODO: Update these comments once the pipe stage interface has
    // been fully refactored.
    ScoreboardCheckToSchedule scoreboardCheckToSchedule;
    ScheduleToExecute scheduleToExecute;

    /**
     * Number of WFs to schedule to each SIMD.
     */
    std::vector<int> numWfsToSched;

    /**
     * The barrier slots for this CU.
     */
    std::vector<WFBarrier> wfBarrierSlots;
    /**
     * A set used to easily retrieve a free barrier ID.
     */
    std::unordered_set<int> freeBarrierIds;

    // hold the time of the arrival of the first cache block related to
    // a particular GPUDynInst. This is used to calculate the difference
    // between the first and last cache block arrival times.
    std::unordered_map<GPUDynInstPtr, Tick> headTailMap;

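    // Sketch of the intended bookkeeping (the actual update site is in the
    // .cc file): record curTick() when the first cache block of a GPUDynInst
    // arrives, and sample the difference into stats.headTailLatency when the
    // last block arrives, e.g.
    //
    //     headTailMap[gpuDynInst] = curTick();                 // first block
    //     ...
    //     stats.headTailLatency.sample(curTick() - headTailMap[gpuDynInst]);
    //     headTailMap.erase(gpuDynInst);                       // last block
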
  public:
    void updateInstStats(GPUDynInstPtr gpuDynInst);
    void updatePageDivergenceDist(Addr addr);

    struct ComputeUnitStats : public statistics::Group
    {
        ComputeUnitStats(statistics::Group *parent, int n_wf);

        // Cycles required to send register source (addr and data) from
        // register files to memory pipeline, per SIMD.

        // the following stats compute the avg. TLB access latency per
        // uncoalesced request (only for data)

        // hitsPerTLBLevel[x] are the hits in Level x TLB.
        // x = 0 is the page table.
        statistics::Vector hitsPerTLBLevel;

        // over all memory instructions executed over all wavefronts
        // how many touched 0-4 pages, 4-8, ..., 60-64 pages
        statistics::Distribution pageDivergenceDist;
        // count of non-flat global memory vector instructions executed
        // count of flat global memory vector instructions executed

        // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
        // active when the instruction is committed, this number is still
        // incremented by 1
        statistics::Scalar numInstrExecuted;
        // Number of cycles among successive instruction executions across all
        // wavefronts of the same CU
        statistics::Distribution execRateDist;
        // number of individual vector operations executed
        statistics::Scalar numVecOpsExecuted;
        // number of individual f16 vector operations executed
        statistics::Scalar numVecOpsExecutedF16;
        // number of individual f32 vector operations executed
        statistics::Scalar numVecOpsExecutedF32;
        // number of individual f64 vector operations executed
        statistics::Scalar numVecOpsExecutedF64;
        // number of individual FMA 16,32,64 vector operations executed
        // number of individual MAC 16,32,64 vector operations executed
        // number of individual MAD 16,32,64 vector operations executed
        // total number of two op FP vector operations executed
        statistics::Scalar numVecOpsExecutedTwoOpFP;
        // Total cycles that something is running on the GPU
        statistics::Scalar totalCycles;
        statistics::Formula vpc; // vector ops per cycle
        statistics::Formula vpc_f16; // vector ops per cycle
        statistics::Formula vpc_f32; // vector ops per cycle
        statistics::Formula vpc_f64; // vector ops per cycle
        statistics::Formula ipc; // vector instructions per cycle
        // number of vector ALU instructions received
        // number of times a WG cannot start due to lack of free VGPRs in SIMDs
        statistics::Scalar numTimesWgBlockedDueVgprAlloc;
        // number of times a WG cannot start due to lack of free SGPRs in SIMDs
        statistics::Scalar numTimesWgBlockedDueSgprAlloc;

        // distribution in latency difference between first and last cache
        // block arrival ticks
        statistics::Distribution headTailLatency;

        // Track the amount of interleaving between wavefronts on each SIMD.
        // This stat is sampled using instExecPerSimd to compute the number
        // of instructions that have been executed on a SIMD between a WF
        // executing two successive instructions.
        statistics::VectorDistribution instInterleave;
    } stats;
};

} // namespace gem5

#endif // __COMPUTE_UNIT_HH__
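
The barrier-slot free list used by getFreeBarrierId() and releaseBarrier()
above is a small, self-contained idiom. The following standalone sketch (not
gem5 code; names are illustrative) mirrors it and can be compiled on its own:

#include <cassert>
#include <iostream>
#include <unordered_set>

int main()
{
    const int numBarrierSlots = 4;
    std::unordered_set<int> freeBarrierIds;
    for (int i = 0; i < numBarrierSlots; ++i)
        freeBarrierIds.insert(i);

    // Allocate: take any free ID (mirrors getFreeBarrierId() above).
    assert(!freeBarrierIds.empty());
    auto free_bar_id = freeBarrierIds.begin();
    int bar_id = *free_bar_id;
    freeBarrierIds.erase(free_bar_id);
    std::cout << "allocated barrier slot " << bar_id << "\n";

    // Release: return the ID to the pool (as releaseBarrier() would).
    freeBarrierIds.insert(bar_id);
    std::cout << freeBarrierIds.size() << " slots free\n";
    return 0;
}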