gem5 v24.1.0.1
Loading...
Searching...
No Matches
GPUCoalescer.hh
Go to the documentation of this file.
1/*
2 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
33#define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
34
35#include <iostream>
36#include <unordered_map>
37
38#include "base/statistics.hh"
40#include "gpu-compute/misc.hh"
41#include "mem/request.hh"
44#include "mem/ruby/protocol/PrefetchBit.hh"
45#include "mem/ruby/protocol/RubyAccessMode.hh"
46#include "mem/ruby/protocol/RubyRequestType.hh"
47#include "mem/ruby/protocol/SequencerRequestType.hh"
49#include "mem/token_port.hh"
50
51namespace gem5
52{
53
54struct RubyGPUCoalescerParams;
55
56namespace ruby
57{
58
59class DataBlock;
60class CacheMsg;
61struct MachineID;
62class CacheMemory;
63
64// List of packets that belong to a specific instruction.
66
68{
69 public:
72
73 void insertPacket(PacketPtr pkt);
74 void insertReqType(PacketPtr pkt, RubyRequestType type);
75 bool packetAvailable();
76 void printRequestTable(std::stringstream& ss);
77
78 // Modify packets remaining map. Init sets value iff the seqNum has not
79 // yet been seen before. get/set act as a regular getter/setter.
80 void initPacketsRemaining(InstSeqNum seqNum, int count);
82 void setPacketsRemaining(InstSeqNum seqNum, int count);
83
84 // Returns a pointer to the list of packets corresponding to an
85 // instruction in the instruction map or nullptr if there are no
86 // instructions at the offset.
88 void updateResources();
89 bool areRequestsDone(const InstSeqNum instSeqNum);
90
91 // Check if a packet hasn't been removed from instMap in too long.
92 // Panics if a deadlock is detected and returns nothing otherwise.
93 void checkDeadlock(Tick threshold);
94
95 private:
97
98 // Maps an instruction's unique sequence number to a queue of packets
99 // which need responses. This data structure assumes the sequence number
100 // is monotonically increasing (which is true for CU class) in order to
101 // issue packets in age order.
102 std::map<InstSeqNum, PerInstPackets> instMap;
103
104 std::map<InstSeqNum, int> instPktsRemaining;
105
106 std::map<InstSeqNum, RubyRequestType> reqTypeMap;
107};
108
110{
111 public:
112 CoalescedRequest(uint64_t _seqNum)
113 : seqNum(_seqNum), issueTime(Cycles(0)),
114 rubyType(RubyRequestType_NULL)
115 {}
117
118 void insertPacket(PacketPtr pkt) { pkts.push_back(pkt); }
119 void setSeqNum(uint64_t _seqNum) { seqNum = _seqNum; }
120 void setIssueTime(Cycles _issueTime) { issueTime = _issueTime; }
121 void setRubyType(RubyRequestType type) { rubyType = type; }
122
123 uint64_t getSeqNum() const { return seqNum; }
124 PacketPtr getFirstPkt() const { return pkts[0]; }
125 Cycles getIssueTime() const { return issueTime; }
126 RubyRequestType getRubyType() const { return rubyType; }
128
129 private:
130 uint64_t seqNum;
132 RubyRequestType rubyType;
134};
135
136// PendingWriteInst tracks the number of outstanding Ruby requests
137// per write instruction. Once all requests associated with one instruction
138// are completely done in Ruby, we call back the requestor to mark
139// that this instruction is complete.
141{
142 public:
144 : numPendingStores(0),
145 originalPort(nullptr),
146 gpuDynInstPtr(nullptr)
147 {}
148
151
152 void
154 bool usingRubyTester)
155 {
156 assert(port);
157 originalPort = port;
158
159 if (!usingRubyTester) {
160 gpuDynInstPtr = inst;
161 }
162
164 }
165
166 // return true if no more ack is expected
167 bool
169 {
170 assert(numPendingStores > 0);
172 return (numPendingStores == 0) ? true : false;
173 }
174
175 // ack the original requestor that this write instruction is complete
176 void
177 ackWriteCompletion(bool usingRubyTester)
178 {
179 assert(numPendingStores == 0);
180
181 // make a response packet
182 PacketPtr pkt = new Packet(std::make_shared<Request>(),
184
185 if (!usingRubyTester) {
186 assert(gpuDynInstPtr);
189 (gpuDynInstPtr, 0, nullptr);
190 pkt->senderState = ss;
191 }
192
193 // send the ack response to the requestor
195 }
196
197 int
201
202 private:
203 // the number of stores waiting for writeCompleteCallback
205 // The original port that sent one of the packets associated with this
206 // write instruction. We may have more than one packet per instruction,
207 // which implies multiple ports per instruction. However, we need
208 // only 1 of the ports to call back the CU. Therefore, here we keep
209 // track of the port that sent the first packet of this instruction.
211 // similar to the originalPort, this gpuDynInstPtr is set only for
212 // the first packet of this instruction.
214};
215
216class GPUCoalescer : public RubyPort
217{
218 public:
220 {
221 public:
222 GMTokenPort(const std::string& name,
225 { }
227
228 protected:
231 bool recvTimingReq(PacketPtr) { return false; }
233 {
234 AddrRangeList ranges;
235 return ranges;
236 }
237 };
238
239 typedef RubyGPUCoalescerParams Params;
240 GPUCoalescer(const Params &);
242
243 Port &getPort(const std::string &if_name,
244 PortID idx = InvalidPortID) override;
245
246 // Public Methods
247 void wakeup(); // Used only for deadlock detection
248 void printRequestTable(std::stringstream& ss);
249
250 void printProgress(std::ostream& out) const;
251 void resetStats() override;
253
254 // each store request needs two callbacks:
255 // (1) writeCallback is called when the store is received and processed
256 // by TCP. This writeCallback does not guarantee the store is actually
257 // completed at its destination cache or memory. writeCallback helps
258 // release hardware resources (e.g., its entry in coalescedTable)
259 // allocated for the store so that subsequent requests will not be
260 // blocked unnecessarily due to hardware resource constraints.
261 // (2) writeCompleteCallback is called when the store is fully completed
262 // at its destination cache or memory. writeCompleteCallback
263 // guarantees that the store is fully completed. This callback
264 // will decrement hardware counters in CU
265 void writeCallback(Addr address, DataBlock& data);
266
267 void writeCallback(Addr address,
268 MachineType mach,
269 DataBlock& data);
270
271 void writeCallback(Addr address,
272 MachineType mach,
274 Cycles initialRequestTime,
275 Cycles forwardRequestTime,
276 Cycles firstResponseTime,
277 bool isRegion);
278
279 void writeCallback(Addr address,
280 MachineType mach,
282 Cycles initialRequestTime,
283 Cycles forwardRequestTime,
284 Cycles firstResponseTime);
285
286 void writeCompleteCallback(Addr address,
287 uint64_t instSeqNum,
288 MachineType mach);
289
290 void readCallback(Addr address, DataBlock& data);
291
292 void readCallback(Addr address,
293 MachineType mach,
295 bool externalHit);
296
297 void readCallback(Addr address,
298 MachineType mach,
300 Cycles initialRequestTime,
301 Cycles forwardRequestTime,
302 Cycles firstResponseTime,
303 bool externalHit);
304
305 void readCallback(Addr address,
306 MachineType mach,
308 Cycles initialRequestTime,
309 Cycles forwardRequestTime,
310 Cycles firstResponseTime,
311 bool isRegion,
312 bool externalHit);
313
314 /* atomics need their own callback because the data
315 might be const coming from SLICC */
316 virtual void atomicCallback(Addr address,
317 MachineType mach,
318 const DataBlock& data);
319
320 RequestStatus makeRequest(PacketPtr pkt) override;
321 int outstandingCount() const override { return m_outstanding_count; }
322
323 bool
325 {
327 }
328
329 void
334
335 bool empty() const;
336
337 void print(std::ostream& out) const;
338
339 void evictionCallback(Addr address);
340 void completeIssue();
341
342 void insertKernel(int wavefront_id, PacketPtr pkt);
343
345
347
349
353
358
361
363 getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const
364 { return *m_missTypeMachLatencyHist[r][t]; }
365
368
370 getInitialToForwardDelayHist(const MachineType t) const
371 { return *m_InitialToForwardDelayHist[t]; }
372
376
380
381 protected:
382 bool tryCacheAccess(Addr addr, RubyRequestType type,
383 Addr pc, RubyAccessMode access_mode,
384 int size, DataBlock*& data_ptr);
385
386 // since the two following issue functions are protocol-specific,
387 // they must be implemented in a derived coalescer
388 virtual void issueRequest(CoalescedRequest* crequest) = 0;
389 virtual void issueMemSyncRequest(PacketPtr pkt) {}
390
391 void kernelCallback(int wavefront_id);
392
393 void hitCallback(CoalescedRequest* crequest,
394 MachineType mach,
396 bool success,
397 Cycles initialRequestTime,
398 Cycles forwardRequestTime,
399 Cycles firstResponseTime,
400 bool isRegion,
401 bool externalHit);
402 void recordMissLatency(CoalescedRequest* crequest,
403 MachineType mach,
404 Cycles initialRequestTime,
405 Cycles forwardRequestTime,
406 Cycles firstResponseTime,
407 bool success, bool isRegion);
409
410 virtual RubyRequestType getRequestType(PacketPtr pkt);
411
413
414 // Attempt to remove a packet from the uncoalescedTable and coalesce
415 // with a previous request from the same instruction. If there is no
416 // previous instruction and the max number of outstanding requests has
417 // not been reached, a new coalesced request is created and added to the
418 // "target" list of the coalescedTable.
419 bool coalescePacket(PacketPtr pkt);
420
422
423 protected:
426
429
430 // coalescingWindow is the maximum number of instructions that are
431 // allowed to be coalesced in a single cycle.
433
434 // The uncoalescedTable contains several "columns" which hold memory
435 // request packets for an instruction. The maximum size is the number of
436 // columns * the wavefront size.
438
439 // An MSHR-like struct for holding coalesced requests. The requests in
440 // this table may or may not be outstanding in the memory hierarchy. The
441 // maximum size is equal to the maximum outstanding requests for a CU
442 // (typically the number of blocks in TCP). If there are duplicates of
443 // an address, they are serviced in age order.
444 std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
445 // Map of instruction sequence number to coalesced requests that get
446 // created in coalescePacket, used in completeIssue to send the fully
447 // coalesced request
448 std::unordered_map<uint64_t, std::deque<CoalescedRequest*>> coalescedReqs;
449
450 // a map between an instruction sequence number and PendingWriteInst
451 // this is used to do a final call back for each write when it is
452 // completely done in the memory system
453 std::unordered_map<uint64_t, PendingWriteInst> pendingWriteInsts;
454
455 // Global outstanding request count, across all request tables
458 std::unordered_map<int, PacketPtr> kernelEndList;
460
465
467
470
471// TODO - Need to update the following stats once the VIPER protocol
472// is re-integrated.
473// // m5 style stats for TCP hit/miss counts
474// statistics::Scalar GPU_TCPLdHits;
475// statistics::Scalar GPU_TCPLdTransfers;
476// statistics::Scalar GPU_TCCLdHits;
477// statistics::Scalar GPU_LdMiss;
478//
479// statistics::Scalar GPU_TCPStHits;
480// statistics::Scalar GPU_TCPStTransfers;
481// statistics::Scalar GPU_TCCStHits;
482// statistics::Scalar GPU_StMiss;
483//
484// statistics::Scalar CP_TCPLdHits;
485// statistics::Scalar CP_TCPLdTransfers;
486// statistics::Scalar CP_TCCLdHits;
487// statistics::Scalar CP_LdMiss;
488//
489// statistics::Scalar CP_TCPStHits;
490// statistics::Scalar CP_TCPStTransfers;
491// statistics::Scalar CP_TCCStHits;
492// statistics::Scalar CP_StMiss;
493
496
500
505
511
517
518// TODO - Need to update the following stats once the VIPER protocol
519// is re-integrated.
520// statistics::Distribution numHopDelays;
521// statistics::Distribution tcpToTccDelay;
522// statistics::Distribution tccToSdDelay;
523// statistics::Distribution sdToSdDelay;
524// statistics::Distribution sdToTccDelay;
525// statistics::Distribution tccToTcpDelay;
526//
527// statistics::Average avgTcpToTcc;
528// statistics::Average avgTccToSd;
529// statistics::Average avgSdToSd;
530// statistics::Average avgSdToTcc;
531// statistics::Average avgTccToTcp;
532
533 private:
534 // Token port is used to send/receive tokens to/from GPU's global memory
535 // pipeline across the port boundary. There is one per <wave size> data
536 // ports in the CU.
538
539 // Private copy constructor and assignment operator
542};
543
544inline std::ostream&
545operator<<(std::ostream& out, const GPUCoalescer& obj)
546{
547 obj.print(out);
548 out << std::flush;
549 return out;
550}
551
552} // namespace ruby
553} // namespace gem5
554
555#endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
const char data[]
Cycles is a wrapper class for representing cycle counts, i.e.
Definition types.hh:79
@ WriteCompleteResp
Definition packet.hh:92
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
Definition packet.hh:295
SenderState * senderState
This packet's sender state.
Definition packet.hh:545
Ports are used to interface objects to each other.
Definition port.hh:62
const PortID id
A numeric identifier to distinguish ports in a vector, and set to InvalidPortID in case this port is ...
Definition port.hh:79
const std::string name() const
Return port name (for DPRINTF).
Definition port.hh:111
bool sendTimingResp(PacketPtr pkt)
Attempt to send a timing response to the request port by calling its corresponding receive function.
Definition port.hh:454
CoalescedRequest(uint64_t _seqNum)
std::vector< PacketPtr > pkts
void setSeqNum(uint64_t _seqNum)
void setIssueTime(Cycles _issueTime)
void insertPacket(PacketPtr pkt)
void setRubyType(RubyRequestType type)
PacketPtr getFirstPkt() const
RubyRequestType getRubyType() const
std::vector< PacketPtr > & getPackets()
GMTokenPort(const std::string &name, PortID id=InvalidPortID)
Tick recvAtomic(PacketPtr)
Receive an atomic request packet from the peer.
void recvFunctional(PacketPtr)
Receive a functional request packet from the peer.
AddrRangeList getAddrRanges() const
Get a list of the non-overlapping address ranges the owner is responsible for.
bool recvTimingReq(PacketPtr)
Receive a timing request from the peer.
virtual RubyRequestType getRequestType(PacketPtr pkt)
void writeCompleteCallback(Addr address, uint64_t instSeqNum, MachineType mach)
void writeCallback(Addr address, DataBlock &data)
statistics::Histogram & getFirstResponseToCompletionDelayHist(const MachineType t) const
std::vector< statistics::Histogram * > m_IssueToInitialDelayHist
Histograms for recording the breakdown of miss latency.
GPUCoalescer & operator=(const GPUCoalescer &obj)
void evictionCallback(Addr address)
void kernelCallback(int wavefront_id)
statistics::Histogram & getInitialToForwardDelayHist(const MachineType t) const
virtual void atomicCallback(Addr address, MachineType mach, const DataBlock &data)
RubySystem * getRubySystem()
virtual void issueMemSyncRequest(PacketPtr pkt)
void printRequestTable(std::stringstream &ss)
GMTokenPort & getGMTokenPort()
std::vector< statistics::Histogram * > m_missMachLatencyHist
Histograms for profiling the latencies for requests that required external messages.
statistics::Histogram & getIssueToInitialDelayHist(uint32_t t) const
statistics::Histogram m_latencyHist
Histogram for holding latency profile of all requests.
void resetStats() override
Callback to reset stats.
statistics::Histogram & getOutstandReqHist()
Port & getPort(const std::string &if_name, PortID idx=InvalidPortID) override
Get a port with a given name and index.
statistics::Histogram & getForwardRequestToFirstResponseHist(const MachineType t) const
RubyGPUCoalescerParams Params
void printProgress(std::ostream &out) const
std::unordered_map< uint64_t, std::deque< CoalescedRequest * > > coalescedReqs
UncoalescedTable uncoalescedTable
void hitCallback(CoalescedRequest *crequest, MachineType mach, DataBlock &data, bool success, Cycles initialRequestTime, Cycles forwardRequestTime, Cycles firstResponseTime, bool isRegion, bool externalHit)
void insertKernel(int wavefront_id, PacketPtr pkt)
statistics::Histogram & getTypeLatencyHist(uint32_t t)
std::unordered_map< int, PacketPtr > kernelEndList
virtual void issueRequest(CoalescedRequest *crequest)=0
statistics::Histogram & getMissLatencyHist()
bool tryCacheAccess(Addr addr, RubyRequestType type, Addr pc, RubyAccessMode access_mode, int size, DataBlock *&data_ptr)
bool isDeadlockEventScheduled() const override
statistics::Histogram m_missLatencyHist
Histogram for holding latency profile of all requests that miss in the controller connected to this s...
statistics::Histogram & getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const
bool coalescePacket(PacketPtr pkt)
std::vector< statistics::Histogram * > m_InitialToForwardDelayHist
std::vector< statistics::Histogram * > m_FirstResponseToCompletionDelayHist
std::vector< statistics::Histogram * > m_ForwardToFirstResponseDelayHist
RequestStatus makeRequest(PacketPtr pkt) override
void readCallback(Addr address, DataBlock &data)
void completeHitCallback(std::vector< PacketPtr > &mylist)
void recordMissLatency(CoalescedRequest *crequest, MachineType mach, Cycles initialRequestTime, Cycles forwardRequestTime, Cycles firstResponseTime, bool success, bool isRegion)
std::unordered_map< uint64_t, PendingWriteInst > pendingWriteInsts
std::vector< statistics::Histogram * > m_typeLatencyHist
void print(std::ostream &out) const
statistics::Histogram & getMissMachLatencyHist(uint32_t t) const
std::map< Addr, std::deque< CoalescedRequest * > > coalescedTable
std::vector< int > newKernelEnds
std::vector< statistics::Histogram * > m_missTypeLatencyHist
std::vector< std::vector< statistics::Histogram * > > m_missTypeMachLatencyHist
int outstandingCount() const override
statistics::Histogram & getMissTypeLatencyHist(uint32_t t)
CacheMemory * m_instCache_ptr
statistics::Histogram & getLatencyHist()
CacheMemory * m_dataCache_ptr
statistics::Histogram m_outstandReqHist
Histogram for number of outstanding requests per cycle.
void descheduleDeadlockEvent() override
EventFunctionWrapper issueEvent
GPUDynInstPtr getDynInst(PacketPtr pkt) const
GPUCoalescer(const GPUCoalescer &obj)
EventFunctionWrapper deadlockCheckEvent
void addPendingReq(RubyPort::MemResponsePort *port, GPUDynInstPtr inst, bool usingRubyTester)
void ackWriteCompletion(bool usingRubyTester)
RubyPort::MemResponsePort * originalPort
RubySystem * m_ruby_system
Definition RubyPort.hh:207
void setPacketsRemaining(InstSeqNum seqNum, int count)
std::map< InstSeqNum, RubyRequestType > reqTypeMap
void insertPacket(PacketPtr pkt)
void printRequestTable(std::stringstream &ss)
bool areRequestsDone(const InstSeqNum instSeqNum)
void insertReqType(PacketPtr pkt, RubyRequestType type)
std::map< InstSeqNum, PerInstPackets > instMap
void initPacketsRemaining(InstSeqNum seqNum, int count)
int getPacketsRemaining(InstSeqNum seqNum)
void checkDeadlock(Tick threshold)
PerInstPackets * getInstPackets(int offset)
std::map< InstSeqNum, int > instPktsRemaining
A simple histogram stat.
STL list class.
Definition stl.hh:51
STL vector class.
Definition stl.hh:37
void deschedule(Event &event)
Definition eventq.hh:1021
bool scheduled() const
Determine if the current event is scheduled.
Definition eventq.hh:458
Bitfield< 5 > t
Definition misc_types.hh:71
Bitfield< 23, 0 > offset
Definition types.hh:144
Bitfield< 21 > ss
Definition misc_types.hh:60
Bitfield< 4 > pc
Bitfield< 3 > addr
Definition types.hh:84
std::list< PacketPtr > PerInstPackets
Copyright (c) 2024 Arm Limited All rights reserved.
Definition binary32.hh:36
const PortID InvalidPortID
Definition types.hh:246
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition misc.hh:49
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition types.hh:147
int16_t PortID
Port index/ID type, and a symbolic name for an invalid port id.
Definition types.hh:245
uint64_t Tick
Tick count type.
Definition types.hh:58
std::ostream & operator<<(std::ostream &os, const BaseSemihosting::InPlaceArg &ipa)
uint64_t InstSeqNum
Definition inst_seq.hh:40
Declaration of a request, the overall memory request consisting of the parts of the request that are ...
Declaration of Statistics objects.

Generated on Mon Jan 13 2025 04:28:41 for gem5 by doxygen 1.9.8