gem5 v24.0.0.0 — GPUCoalescer.hh (Doxygen-generated listing of the source file)
1/*
2 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
33#define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
34
35#include <iostream>
36#include <unordered_map>
37
38#include "base/statistics.hh"
40#include "gpu-compute/misc.hh"
41#include "mem/request.hh"
44#include "mem/ruby/protocol/PrefetchBit.hh"
45#include "mem/ruby/protocol/RubyAccessMode.hh"
46#include "mem/ruby/protocol/RubyRequestType.hh"
47#include "mem/ruby/protocol/SequencerRequestType.hh"
49#include "mem/token_port.hh"
50
51namespace gem5
52{
53
54struct RubyGPUCoalescerParams;
55
56namespace ruby
57{
58
59class DataBlock;
60class CacheMsg;
61struct MachineID;
62class CacheMemory;
63
64// List of packets that belongs to a specific instruction.
66
68{
69 public:
72
73 void insertPacket(PacketPtr pkt);
74 void insertReqType(PacketPtr pkt, RubyRequestType type);
75 bool packetAvailable();
76 void printRequestTable(std::stringstream& ss);
77
78 // Modify packets remaining map. Init sets value iff the seqNum has not
79 // yet been seen before. get/set act as a regular getter/setter.
80 void initPacketsRemaining(InstSeqNum seqNum, int count);
82 void setPacketsRemaining(InstSeqNum seqNum, int count);
83
84 // Returns a pointer to the list of packets corresponding to an
85 // instruction in the instruction map or nullptr if there are no
86 // instructions at the offset.
88 void updateResources();
89 bool areRequestsDone(const InstSeqNum instSeqNum);
90
91 // Check if a packet hasn't been removed from instMap in too long.
92 // Panics if a deadlock is detected and returns nothing otherwise.
93 void checkDeadlock(Tick threshold);
94
95 private:
97
98 // Maps an instructions unique sequence number to a queue of packets
99 // which need responses. This data structure assumes the sequence number
100 // is monotonically increasing (which is true for CU class) in order to
101 // issue packets in age order.
102 std::map<InstSeqNum, PerInstPackets> instMap;
103
104 std::map<InstSeqNum, int> instPktsRemaining;
105
106 std::map<InstSeqNum, RubyRequestType> reqTypeMap;
107};
108
110{
111 public:
112 CoalescedRequest(uint64_t _seqNum)
113 : seqNum(_seqNum), issueTime(Cycles(0)),
114 rubyType(RubyRequestType_NULL)
115 {}
117
118 void insertPacket(PacketPtr pkt) { pkts.push_back(pkt); }
119 void setSeqNum(uint64_t _seqNum) { seqNum = _seqNum; }
120 void setIssueTime(Cycles _issueTime) { issueTime = _issueTime; }
121 void setRubyType(RubyRequestType type) { rubyType = type; }
122
123 uint64_t getSeqNum() const { return seqNum; }
124 PacketPtr getFirstPkt() const { return pkts[0]; }
125 Cycles getIssueTime() const { return issueTime; }
126 RubyRequestType getRubyType() const { return rubyType; }
128
129 private:
130 uint64_t seqNum;
132 RubyRequestType rubyType;
134};
135
136// PendingWriteInst tracks the number of outstanding Ruby requests
137// per write instruction. Once all requests associated with one instruction
138// are completely done in Ruby, we call back the requestor to mark
139// that this instruction is complete.
141{
142 public:
144 : numPendingStores(0),
145 originalPort(nullptr),
146 gpuDynInstPtr(nullptr)
147 {}
148
151
152 void
154 bool usingRubyTester)
155 {
156 assert(port);
157 originalPort = port;
158
159 if (!usingRubyTester) {
160 gpuDynInstPtr = inst;
161 }
162
164 }
165
166 // return true if no more ack is expected
167 bool
169 {
170 assert(numPendingStores > 0);
172 return (numPendingStores == 0) ? true : false;
173 }
174
175 // ack the original requestor that this write instruction is complete
176 void
177 ackWriteCompletion(bool usingRubyTester)
178 {
179 assert(numPendingStores == 0);
180
181 // make a response packet
182 PacketPtr pkt = new Packet(std::make_shared<Request>(),
184
185 if (!usingRubyTester) {
186 assert(gpuDynInstPtr);
189 (gpuDynInstPtr, 0, nullptr);
190 pkt->senderState = ss;
191 }
192
193 // send the ack response to the requestor
195 }
196
197 int
201
202 private:
203 // the number of stores waiting for writeCompleteCallback
205 // The original port that sent one of packets associated with this
206 // write instruction. We may have more than one packet per instruction,
207 // which implies multiple ports per instruction. However, we need
208 // only 1 of the ports to call back the CU. Therefore, here we keep
209 // track the port that sent the first packet of this instruction.
211 // similar to the originalPort, this gpuDynInstPtr is set only for
212 // the first packet of this instruction.
214};
215
216class GPUCoalescer : public RubyPort
217{
218 public:
220 {
221 public:
222 GMTokenPort(const std::string& name,
225 { }
227
228 protected:
231 bool recvTimingReq(PacketPtr) { return false; }
233 {
234 AddrRangeList ranges;
235 return ranges;
236 }
237 };
238
239 typedef RubyGPUCoalescerParams Params;
240 GPUCoalescer(const Params &);
242
243 Port &getPort(const std::string &if_name,
244 PortID idx = InvalidPortID) override;
245
246 // Public Methods
247 void wakeup(); // Used only for deadlock detection
248 void printRequestTable(std::stringstream& ss);
249
250 void printProgress(std::ostream& out) const;
251 void resetStats() override;
253
254 // each store request needs two callbacks:
255 // (1) writeCallback is called when the store is received and processed
256 // by TCP. This writeCallback does not guarantee the store is actually
257 // completed at its destination cache or memory. writeCallback helps
258 // release hardware resources (e.g., its entry in coalescedTable)
259 // allocated for the store so that subsequent requests will not be
260 // blocked unnecessarily due to hardware resource constraints.
261 // (2) writeCompleteCallback is called when the store is fully completed
262 // at its destination cache or memory. writeCompleteCallback
263 // guarantees that the store is fully completed. This callback
264 // will decrement hardware counters in CU
265 void writeCallback(Addr address, DataBlock& data);
266
267 void writeCallback(Addr address,
268 MachineType mach,
269 DataBlock& data);
270
271 void writeCallback(Addr address,
272 MachineType mach,
274 Cycles initialRequestTime,
275 Cycles forwardRequestTime,
276 Cycles firstResponseTime,
277 bool isRegion);
278
279 void writeCallback(Addr address,
280 MachineType mach,
282 Cycles initialRequestTime,
283 Cycles forwardRequestTime,
284 Cycles firstResponseTime);
285
286 void writeCompleteCallback(Addr address,
287 uint64_t instSeqNum,
288 MachineType mach);
289
290 void readCallback(Addr address, DataBlock& data);
291
292 void readCallback(Addr address,
293 MachineType mach,
294 DataBlock& data);
295
296 void readCallback(Addr address,
297 MachineType mach,
299 Cycles initialRequestTime,
300 Cycles forwardRequestTime,
301 Cycles firstResponseTime);
302
303 void readCallback(Addr address,
304 MachineType mach,
306 Cycles initialRequestTime,
307 Cycles forwardRequestTime,
308 Cycles firstResponseTime,
309 bool isRegion);
310
311 /* atomics need their own callback because the data
312 might be const coming from SLICC */
313 virtual void atomicCallback(Addr address,
314 MachineType mach,
315 const DataBlock& data);
316
317 RequestStatus makeRequest(PacketPtr pkt) override;
318 int outstandingCount() const override { return m_outstanding_count; }
319
320 bool
322 {
324 }
325
326 void
331
332 bool empty() const;
333
334 void print(std::ostream& out) const;
335
336 void evictionCallback(Addr address);
337 void completeIssue();
338
339 void insertKernel(int wavefront_id, PacketPtr pkt);
340
342
344
348
353
356
358 getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const
359 { return *m_missTypeMachLatencyHist[r][t]; }
360
363
365 getInitialToForwardDelayHist(const MachineType t) const
366 { return *m_InitialToForwardDelayHist[t]; }
367
371
375
376 protected:
377 bool tryCacheAccess(Addr addr, RubyRequestType type,
378 Addr pc, RubyAccessMode access_mode,
379 int size, DataBlock*& data_ptr);
380
381 // since the two following issue functions are protocol-specific,
382 // they must be implemented in a derived coalescer
383 virtual void issueRequest(CoalescedRequest* crequest) = 0;
384 virtual void issueMemSyncRequest(PacketPtr pkt) {}
385
386 void kernelCallback(int wavefront_id);
387
388 void hitCallback(CoalescedRequest* crequest,
389 MachineType mach,
391 bool success,
392 Cycles initialRequestTime,
393 Cycles forwardRequestTime,
394 Cycles firstResponseTime,
395 bool isRegion);
396 void recordMissLatency(CoalescedRequest* crequest,
397 MachineType mach,
398 Cycles initialRequestTime,
399 Cycles forwardRequestTime,
400 Cycles firstResponseTime,
401 bool success, bool isRegion);
403
404 virtual RubyRequestType getRequestType(PacketPtr pkt);
405
407
408 // Attempt to remove a packet from the uncoalescedTable and coalesce
409 // with a previous request from the same instruction. If there is no
410 // previous instruction and the max number of outstanding requests has
411 // not be reached, a new coalesced request is created and added to the
412 // "target" list of the coalescedTable.
413 bool coalescePacket(PacketPtr pkt);
414
416
417 protected:
420
423
424 // coalescingWindow is the maximum number of instructions that are
425 // allowed to be coalesced in a single cycle.
427
428 // The uncoalescedTable contains several "columns" which hold memory
429 // request packets for an instruction. The maximum size is the number of
430 // columns * the wavefront size.
432
433 // An MSHR-like struct for holding coalesced requests. The requests in
434 // this table may or may not be outstanding in the memory hierarchy. The
435 // maximum size is equal to the maximum outstanding requests for a CU
436 // (typically the number of blocks in TCP). If there are duplicates of
437 // an address, the are serviced in age order.
438 std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
439 // Map of instruction sequence number to coalesced requests that get
440 // created in coalescePacket, used in completeIssue to send the fully
441 // coalesced request
442 std::unordered_map<uint64_t, std::deque<CoalescedRequest*>> coalescedReqs;
443
444 // a map btw an instruction sequence number and PendingWriteInst
445 // this is used to do a final call back for each write when it is
446 // completely done in the memory system
447 std::unordered_map<uint64_t, PendingWriteInst> pendingWriteInsts;
448
449 // Global outstanding request count, across all request tables
452 std::unordered_map<int, PacketPtr> kernelEndList;
454
459
461
464
465// TODO - Need to update the following stats once the VIPER protocol
466// is re-integrated.
467// // m5 style stats for TCP hit/miss counts
468// statistics::Scalar GPU_TCPLdHits;
469// statistics::Scalar GPU_TCPLdTransfers;
470// statistics::Scalar GPU_TCCLdHits;
471// statistics::Scalar GPU_LdMiss;
472//
473// statistics::Scalar GPU_TCPStHits;
474// statistics::Scalar GPU_TCPStTransfers;
475// statistics::Scalar GPU_TCCStHits;
476// statistics::Scalar GPU_StMiss;
477//
478// statistics::Scalar CP_TCPLdHits;
479// statistics::Scalar CP_TCPLdTransfers;
480// statistics::Scalar CP_TCCLdHits;
481// statistics::Scalar CP_LdMiss;
482//
483// statistics::Scalar CP_TCPStHits;
484// statistics::Scalar CP_TCPStTransfers;
485// statistics::Scalar CP_TCCStHits;
486// statistics::Scalar CP_StMiss;
487
490
494
499
505
511
512// TODO - Need to update the following stats once the VIPER protocol
513// is re-integrated.
514// statistics::Distribution numHopDelays;
515// statistics::Distribution tcpToTccDelay;
516// statistics::Distribution tccToSdDelay;
517// statistics::Distribution sdToSdDelay;
518// statistics::Distribution sdToTccDelay;
519// statistics::Distribution tccToTcpDelay;
520//
521// statistics::Average avgTcpToTcc;
522// statistics::Average avgTccToSd;
523// statistics::Average avgSdToSd;
524// statistics::Average avgSdToTcc;
525// statistics::Average avgTccToTcp;
526
527 private:
528 // Token port is used to send/receive tokens to/from GPU's global memory
529 // pipeline across the port boundary. There is one per <wave size> data
530 // ports in the CU.
532
533 // Private copy constructor and assignment operator
536};
537
538inline std::ostream&
539operator<<(std::ostream& out, const GPUCoalescer& obj)
540{
541 obj.print(out);
542 out << std::flush;
543 return out;
544}
545
546} // namespace ruby
547} // namespace gem5
548
549#endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
const char data[]
Cycles is a wrapper class for representing cycle counts, i.e.
Definition types.hh:79
@ WriteCompleteResp
Definition packet.hh:92
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
Definition packet.hh:295
SenderState * senderState
This packet's sender state.
Definition packet.hh:545
Ports are used to interface objects to each other.
Definition port.hh:62
const PortID id
A numeric identifier to distinguish ports in a vector, and set to InvalidPortID in case this port is ...
Definition port.hh:79
const std::string name() const
Return port name (for DPRINTF).
Definition port.hh:111
bool sendTimingResp(PacketPtr pkt)
Attempt to send a timing response to the request port by calling its corresponding receive function.
Definition port.hh:454
CoalescedRequest(uint64_t _seqNum)
std::vector< PacketPtr > pkts
void setSeqNum(uint64_t _seqNum)
void setIssueTime(Cycles _issueTime)
void insertPacket(PacketPtr pkt)
void setRubyType(RubyRequestType type)
PacketPtr getFirstPkt() const
RubyRequestType getRubyType() const
std::vector< PacketPtr > & getPackets()
GMTokenPort(const std::string &name, PortID id=InvalidPortID)
Tick recvAtomic(PacketPtr)
Receive an atomic request packet from the peer.
void recvFunctional(PacketPtr)
Receive a functional request packet from the peer.
AddrRangeList getAddrRanges() const
Get a list of the non-overlapping address ranges the owner is responsible for.
bool recvTimingReq(PacketPtr)
Receive a timing request from the peer.
virtual RubyRequestType getRequestType(PacketPtr pkt)
void writeCompleteCallback(Addr address, uint64_t instSeqNum, MachineType mach)
void writeCallback(Addr address, DataBlock &data)
statistics::Histogram & getFirstResponseToCompletionDelayHist(const MachineType t) const
std::vector< statistics::Histogram * > m_IssueToInitialDelayHist
Histograms for recording the breakdown of miss latency.
GPUCoalescer & operator=(const GPUCoalescer &obj)
void evictionCallback(Addr address)
void kernelCallback(int wavefront_id)
statistics::Histogram & getInitialToForwardDelayHist(const MachineType t) const
virtual void atomicCallback(Addr address, MachineType mach, const DataBlock &data)
virtual void issueMemSyncRequest(PacketPtr pkt)
void printRequestTable(std::stringstream &ss)
GMTokenPort & getGMTokenPort()
std::vector< statistics::Histogram * > m_missMachLatencyHist
Histograms for profiling the latencies for requests that required external messages.
statistics::Histogram & getIssueToInitialDelayHist(uint32_t t) const
statistics::Histogram m_latencyHist
Histogram for holding latency profile of all requests.
void resetStats() override
Callback to reset stats.
statistics::Histogram & getOutstandReqHist()
Port & getPort(const std::string &if_name, PortID idx=InvalidPortID) override
Get a port with a given name and index.
statistics::Histogram & getForwardRequestToFirstResponseHist(const MachineType t) const
RubyGPUCoalescerParams Params
void printProgress(std::ostream &out) const
void hitCallback(CoalescedRequest *crequest, MachineType mach, DataBlock &data, bool success, Cycles initialRequestTime, Cycles forwardRequestTime, Cycles firstResponseTime, bool isRegion)
std::unordered_map< uint64_t, std::deque< CoalescedRequest * > > coalescedReqs
UncoalescedTable uncoalescedTable
void insertKernel(int wavefront_id, PacketPtr pkt)
statistics::Histogram & getTypeLatencyHist(uint32_t t)
std::unordered_map< int, PacketPtr > kernelEndList
virtual void issueRequest(CoalescedRequest *crequest)=0
statistics::Histogram & getMissLatencyHist()
bool tryCacheAccess(Addr addr, RubyRequestType type, Addr pc, RubyAccessMode access_mode, int size, DataBlock *&data_ptr)
bool isDeadlockEventScheduled() const override
statistics::Histogram m_missLatencyHist
Histogram for holding latency profile of all requests that miss in the controller connected to this s...
statistics::Histogram & getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const
bool coalescePacket(PacketPtr pkt)
std::vector< statistics::Histogram * > m_InitialToForwardDelayHist
std::vector< statistics::Histogram * > m_FirstResponseToCompletionDelayHist
std::vector< statistics::Histogram * > m_ForwardToFirstResponseDelayHist
RequestStatus makeRequest(PacketPtr pkt) override
void readCallback(Addr address, DataBlock &data)
void completeHitCallback(std::vector< PacketPtr > &mylist)
void recordMissLatency(CoalescedRequest *crequest, MachineType mach, Cycles initialRequestTime, Cycles forwardRequestTime, Cycles firstResponseTime, bool success, bool isRegion)
std::unordered_map< uint64_t, PendingWriteInst > pendingWriteInsts
std::vector< statistics::Histogram * > m_typeLatencyHist
GPUCoalescer(const Params &)
void print(std::ostream &out) const
statistics::Histogram & getMissMachLatencyHist(uint32_t t) const
std::map< Addr, std::deque< CoalescedRequest * > > coalescedTable
std::vector< int > newKernelEnds
std::vector< statistics::Histogram * > m_missTypeLatencyHist
std::vector< std::vector< statistics::Histogram * > > m_missTypeMachLatencyHist
int outstandingCount() const override
statistics::Histogram & getMissTypeLatencyHist(uint32_t t)
CacheMemory * m_instCache_ptr
statistics::Histogram & getLatencyHist()
CacheMemory * m_dataCache_ptr
statistics::Histogram m_outstandReqHist
Histogram for number of outstanding requests per cycle.
void descheduleDeadlockEvent() override
EventFunctionWrapper issueEvent
GPUDynInstPtr getDynInst(PacketPtr pkt) const
GPUCoalescer(const GPUCoalescer &obj)
EventFunctionWrapper deadlockCheckEvent
void addPendingReq(RubyPort::MemResponsePort *port, GPUDynInstPtr inst, bool usingRubyTester)
void ackWriteCompletion(bool usingRubyTester)
RubyPort::MemResponsePort * originalPort
void setPacketsRemaining(InstSeqNum seqNum, int count)
std::map< InstSeqNum, RubyRequestType > reqTypeMap
void insertPacket(PacketPtr pkt)
void printRequestTable(std::stringstream &ss)
bool areRequestsDone(const InstSeqNum instSeqNum)
void insertReqType(PacketPtr pkt, RubyRequestType type)
std::map< InstSeqNum, PerInstPackets > instMap
UncoalescedTable(GPUCoalescer *gc)
void initPacketsRemaining(InstSeqNum seqNum, int count)
int getPacketsRemaining(InstSeqNum seqNum)
void checkDeadlock(Tick threshold)
PerInstPackets * getInstPackets(int offset)
std::map< InstSeqNum, int > instPktsRemaining
A simple histogram stat.
STL list class.
Definition stl.hh:51
STL vector class.
Definition stl.hh:37
void deschedule(Event &event)
Definition eventq.hh:1021
bool scheduled() const
Determine if the current event is scheduled.
Definition eventq.hh:458
Bitfield< 5 > t
Definition misc_types.hh:71
Bitfield< 23, 0 > offset
Definition types.hh:144
Bitfield< 21 > ss
Definition misc_types.hh:60
Bitfield< 4 > pc
Bitfield< 3 > addr
Definition types.hh:84
std::list< PacketPtr > PerInstPackets
std::ostream & operator<<(std::ostream &os, const BoolVec &myvector)
Definition BoolVec.cc:49
Copyright (c) 2024 - Pranith Kumar Copyright (c) 2020 Inria All rights reserved.
Definition binary32.hh:36
const PortID InvalidPortID
Definition types.hh:246
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition misc.hh:49
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition types.hh:147
int16_t PortID
Port index/ID type, and a symbolic name for an invalid port id.
Definition types.hh:245
uint64_t Tick
Tick count type.
Definition types.hh:58
uint64_t InstSeqNum
Definition inst_seq.hh:40
Declaration of a request, the overall memory request consisting of the parts of the request that are ...
Declaration of Statistics objects.

Generated on Tue Jun 18 2024 16:24:05 for gem5 by doxygen 1.11.0