gem5 v23.0.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
GPUCoalescer.hh
Go to the documentation of this file.
1/*
2 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
33#define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
34
35#include <iostream>
36#include <unordered_map>
37
38#include "base/statistics.hh"
40#include "gpu-compute/misc.hh"
41#include "mem/request.hh"
44#include "mem/ruby/protocol/PrefetchBit.hh"
45#include "mem/ruby/protocol/RubyAccessMode.hh"
46#include "mem/ruby/protocol/RubyRequestType.hh"
47#include "mem/ruby/protocol/SequencerRequestType.hh"
49#include "mem/token_port.hh"
50
51namespace gem5
52{
53
54struct RubyGPUCoalescerParams;
55
56namespace ruby
57{
58
59class DataBlock;
60class CacheMsg;
61struct MachineID;
62class CacheMemory;
63
64// List of packets that belong to a specific instruction.
66
68{
69 public:
72
73 void insertPacket(PacketPtr pkt);
74 bool packetAvailable();
75 void printRequestTable(std::stringstream& ss);
76
77 // Modify packets remaining map. Init sets value iff the seqNum has not
78 // yet been seen before. get/set act as a regular getter/setter.
79 void initPacketsRemaining(InstSeqNum seqNum, int count);
81 void setPacketsRemaining(InstSeqNum seqNum, int count);
82
83 // Returns a pointer to the list of packets corresponding to an
84 // instruction in the instruction map or nullptr if there are no
85 // instructions at the offset.
87 void updateResources();
88 bool areRequestsDone(const InstSeqNum instSeqNum);
89
90 // Check if a packet hasn't been removed from instMap in too long.
91 // Panics if a deadlock is detected and returns nothing otherwise.
92 void checkDeadlock(Tick threshold);
93
94 private:
96
97 // Maps an instruction's unique sequence number to a queue of packets
98 // which need responses. This data structure assumes the sequence number
99 // is monotonically increasing (which is true for CU class) in order to
100 // issue packets in age order.
101 std::map<InstSeqNum, PerInstPackets> instMap;
102
103 std::map<InstSeqNum, int> instPktsRemaining;
104};
105
// NOTE(review): the class-name line (original line 106, presumably
// "class CoalescedRequest") was stripped by the doc extraction — the
// embedded numbering jumps 105 -> 107. Per the index below, this type
// groups the packets that were coalesced into one Ruby request.
107{
108 public:
// Construct with the issuing instruction's sequence number; issue time
// and Ruby request type start as sentinels (Cycles(0),
// RubyRequestType_NULL) and are filled in later via the setters.
109 CoalescedRequest(uint64_t _seqNum)
110 : seqNum(_seqNum), issueTime(Cycles(0)),
111 rubyType(RubyRequestType_NULL)
112 {}
114
// Append one more coalesced packet to this request.
115 void insertPacket(PacketPtr pkt) { pkts.push_back(pkt); }
116 void setSeqNum(uint64_t _seqNum) { seqNum = _seqNum; }
117 void setIssueTime(Cycles _issueTime) { issueTime = _issueTime; }
118 void setRubyType(RubyRequestType type) { rubyType = type; }
119
120 uint64_t getSeqNum() const { return seqNum; }
// NOTE(review): indexes pkts[0] unconditionally — callers must ensure at
// least one packet was inserted before calling getFirstPkt().
121 PacketPtr getFirstPkt() const { return pkts[0]; }
122 Cycles getIssueTime() const { return issueTime; }
123 RubyRequestType getRubyType() const { return rubyType; }
125
126 private:
127 uint64_t seqNum;
// NOTE(review): the declarations of `pkts` (std::vector<PacketPtr>, per
// the doxygen index) and `issueTime` (Cycles) were dropped by the
// extraction — numbering gaps at originals 128 and 130 — but the member
// functions above still reference them.
129 RubyRequestType rubyType;
131};
132
133// PendingWriteInst tracks the number of outstanding Ruby requests
134// per write instruction. Once all requests associated with one instruction
135// are completely done in Ruby, we call back the requestor to mark
136// that this instruction is complete.
138{
139 public:
141 : numPendingStores(0),
142 originalPort(nullptr),
143 gpuDynInstPtr(nullptr)
144 {}
145
147 {}
148
149 void
151 bool usingRubyTester)
152 {
153 assert(port);
154 originalPort = port;
155
156 if (!usingRubyTester) {
157 gpuDynInstPtr = inst;
158 }
159
161 }
162
163 // return true if no more ack is expected
164 bool
166 {
167 assert(numPendingStores > 0);
169 return (numPendingStores == 0) ? true : false;
170 }
171
172 // ack the original requestor that this write instruction is complete
173 void
174 ackWriteCompletion(bool usingRubyTester)
175 {
176 assert(numPendingStores == 0);
177
178 // make a response packet
179 PacketPtr pkt = new Packet(std::make_shared<Request>(),
181
182 if (!usingRubyTester) {
183 assert(gpuDynInstPtr);
186 (gpuDynInstPtr, 0, nullptr);
187 pkt->senderState = ss;
188 }
189
190 // send the ack response to the requestor
192 }
193
194 int
196 return numPendingStores;
197 }
198
199 private:
200 // the number of stores waiting for writeCompleteCallback
202 // The original port that sent one of packets associated with this
203 // write instruction. We may have more than one packet per instruction,
204 // which implies multiple ports per instruction. However, we need
205 // only 1 of the ports to call back the CU. Therefore, here we keep
206 // track the port that sent the first packet of this instruction.
208 // similar to the originalPort, this gpuDynInstPtr is set only for
209 // the first packet of this instruction.
211};
212
213class GPUCoalescer : public RubyPort
214{
215 public:
217 {
218 public:
219 GMTokenPort(const std::string& name,
222 { }
224
225 protected:
228 bool recvTimingReq(PacketPtr) { return false; }
230 {
231 AddrRangeList ranges;
232 return ranges;
233 }
234 };
235
236 typedef RubyGPUCoalescerParams Params;
237 GPUCoalescer(const Params &);
239
240 Port &getPort(const std::string &if_name,
241 PortID idx = InvalidPortID) override;
242
243 // Public Methods
244 void wakeup(); // Used only for deadlock detection
245 void printRequestTable(std::stringstream& ss);
246
247 void printProgress(std::ostream& out) const;
248 void resetStats() override;
250
251 // each store request needs two callbacks:
252 // (1) writeCallback is called when the store is received and processed
253 // by TCP. This writeCallback does not guarantee the store is actually
254 // completed at its destination cache or memory. writeCallback helps
255 // release hardware resources (e.g., its entry in coalescedTable)
256 // allocated for the store so that subsequent requests will not be
257 // blocked unnecessarily due to hardware resource constraints.
258 // (2) writeCompleteCallback is called when the store is fully completed
259 // at its destination cache or memory. writeCompleteCallback
260 // guarantees that the store is fully completed. This callback
261 // will decrement hardware counters in CU
262 void writeCallback(Addr address, DataBlock& data);
263
264 void writeCallback(Addr address,
265 MachineType mach,
266 DataBlock& data);
267
268 void writeCallback(Addr address,
269 MachineType mach,
271 Cycles initialRequestTime,
272 Cycles forwardRequestTime,
273 Cycles firstResponseTime,
274 bool isRegion);
275
276 void writeCallback(Addr address,
277 MachineType mach,
279 Cycles initialRequestTime,
280 Cycles forwardRequestTime,
281 Cycles firstResponseTime);
282
283 void writeCompleteCallback(Addr address,
284 uint64_t instSeqNum,
285 MachineType mach);
286
287 void readCallback(Addr address, DataBlock& data);
288
289 void readCallback(Addr address,
290 MachineType mach,
291 DataBlock& data);
292
293 void readCallback(Addr address,
294 MachineType mach,
296 Cycles initialRequestTime,
297 Cycles forwardRequestTime,
298 Cycles firstResponseTime);
299
300 void readCallback(Addr address,
301 MachineType mach,
303 Cycles initialRequestTime,
304 Cycles forwardRequestTime,
305 Cycles firstResponseTime,
306 bool isRegion);
307
308 /* atomics need their own callback because the data
309 might be const coming from SLICC */
310 virtual void atomicCallback(Addr address,
311 MachineType mach,
312 const DataBlock& data);
313
314 RequestStatus makeRequest(PacketPtr pkt) override;
315 int outstandingCount() const override { return m_outstanding_count; }
316
317 bool
319 {
321 }
322
323 void
325 {
327 }
328
329 bool empty() const;
330
331 void print(std::ostream& out) const;
332
333 void evictionCallback(Addr address);
334 void completeIssue();
335
336 void insertKernel(int wavefront_id, PacketPtr pkt);
337
339
341
344 { return *m_typeLatencyHist[t]; }
345
347 { return m_missLatencyHist; }
349 { return *m_missTypeLatencyHist[t]; }
350
352 { return *m_missMachLatencyHist[t]; }
353
355 getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const
356 { return *m_missTypeMachLatencyHist[r][t]; }
357
359 { return *m_IssueToInitialDelayHist[t]; }
360
362 getInitialToForwardDelayHist(const MachineType t) const
363 { return *m_InitialToForwardDelayHist[t]; }
364
368
372
373 protected:
374 bool tryCacheAccess(Addr addr, RubyRequestType type,
375 Addr pc, RubyAccessMode access_mode,
376 int size, DataBlock*& data_ptr);
377
378 // since the two following issue functions are protocol-specific,
379 // they must be implemented in a derived coalescer
380 virtual void issueRequest(CoalescedRequest* crequest) = 0;
381 virtual void issueMemSyncRequest(PacketPtr pkt) {}
382
383 void kernelCallback(int wavefront_id);
384
385 void hitCallback(CoalescedRequest* crequest,
386 MachineType mach,
388 bool success,
389 Cycles initialRequestTime,
390 Cycles forwardRequestTime,
391 Cycles firstResponseTime,
392 bool isRegion);
393 void recordMissLatency(CoalescedRequest* crequest,
394 MachineType mach,
395 Cycles initialRequestTime,
396 Cycles forwardRequestTime,
397 Cycles firstResponseTime,
398 bool success, bool isRegion);
400
401 virtual RubyRequestType getRequestType(PacketPtr pkt);
402
404
405 // Attempt to remove a packet from the uncoalescedTable and coalesce
406 // with a previous request from the same instruction. If there is no
407 // previous instruction and the max number of outstanding requests has
408 // not be reached, a new coalesced request is created and added to the
409 // "target" list of the coalescedTable.
410 bool coalescePacket(PacketPtr pkt);
411
413
414 protected:
417
420
421 // coalescingWindow is the maximum number of instructions that are
422 // allowed to be coalesced in a single cycle.
424
425 // The uncoalescedTable contains several "columns" which hold memory
426 // request packets for an instruction. The maximum size is the number of
427 // columns * the wavefront size.
429
430 // An MSHR-like struct for holding coalesced requests. The requests in
431 // this table may or may not be outstanding in the memory hierarchy. The
432 // maximum size is equal to the maximum outstanding requests for a CU
433 // (typically the number of blocks in TCP). If there are duplicates of
434 // an address, they are serviced in age order.
435 std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
436 // Map of instruction sequence number to coalesced requests that get
437 // created in coalescePacket, used in completeIssue to send the fully
438 // coalesced request
439 std::unordered_map<uint64_t, std::deque<CoalescedRequest*>> coalescedReqs;
440
441 // a map btw an instruction sequence number and PendingWriteInst
442 // this is used to do a final call back for each write when it is
443 // completely done in the memory system
444 std::unordered_map<uint64_t, PendingWriteInst> pendingWriteInsts;
445
446 // Global outstanding request count, across all request tables
449 std::unordered_map<int, PacketPtr> kernelEndList;
451
456
458
461
462// TODO - Need to update the following stats once the VIPER protocol
463// is re-integrated.
464// // m5 style stats for TCP hit/miss counts
465// statistics::Scalar GPU_TCPLdHits;
466// statistics::Scalar GPU_TCPLdTransfers;
467// statistics::Scalar GPU_TCCLdHits;
468// statistics::Scalar GPU_LdMiss;
469//
470// statistics::Scalar GPU_TCPStHits;
471// statistics::Scalar GPU_TCPStTransfers;
472// statistics::Scalar GPU_TCCStHits;
473// statistics::Scalar GPU_StMiss;
474//
475// statistics::Scalar CP_TCPLdHits;
476// statistics::Scalar CP_TCPLdTransfers;
477// statistics::Scalar CP_TCCLdHits;
478// statistics::Scalar CP_LdMiss;
479//
480// statistics::Scalar CP_TCPStHits;
481// statistics::Scalar CP_TCPStTransfers;
482// statistics::Scalar CP_TCCStHits;
483// statistics::Scalar CP_StMiss;
484
487
491
496
502
508
509// TODO - Need to update the following stats once the VIPER protocol
510// is re-integrated.
511// statistics::Distribution numHopDelays;
512// statistics::Distribution tcpToTccDelay;
513// statistics::Distribution tccToSdDelay;
514// statistics::Distribution sdToSdDelay;
515// statistics::Distribution sdToTccDelay;
516// statistics::Distribution tccToTcpDelay;
517//
518// statistics::Average avgTcpToTcc;
519// statistics::Average avgTccToSd;
520// statistics::Average avgSdToSd;
521// statistics::Average avgSdToTcc;
522// statistics::Average avgTccToTcp;
523
524 private:
525 // Token port is used to send/receive tokens to/from GPU's global memory
526 // pipeline across the port boundary. There is one per <wave size> data
527 // ports in the CU.
529
530 // Private copy constructor and assignment operator
533};
534
// Stream-insertion helper for GPUCoalescer: delegates to the object's
// print() method, then flushes the stream so diagnostic dumps are fully
// emitted even if the simulator aborts immediately afterwards.
535inline std::ostream&
536operator<<(std::ostream& out, const GPUCoalescer& obj)
537{
538 obj.print(out);
539 out << std::flush;
// Return the stream to allow the usual chained "<<" usage.
540 return out;
541}
542
543} // namespace ruby
544} // namespace gem5
545
546#endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
const char data[]
Cycles is a wrapper class for representing cycle counts, i.e.
Definition types.hh:79
@ WriteCompleteResp
Definition packet.hh:92
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
Definition packet.hh:295
SenderState * senderState
This packet's sender state.
Definition packet.hh:545
Ports are used to interface objects to each other.
Definition port.hh:62
const PortID id
A numeric identifier to distinguish ports in a vector, and set to InvalidPortID in case this port is ...
Definition port.hh:79
const std::string name() const
Return port name (for DPRINTF).
Definition port.hh:111
bool sendTimingResp(PacketPtr pkt)
Attempt to send a timing response to the request port by calling its corresponding receive function.
Definition port.hh:393
CoalescedRequest(uint64_t _seqNum)
std::vector< PacketPtr > pkts
void setSeqNum(uint64_t _seqNum)
void setIssueTime(Cycles _issueTime)
void insertPacket(PacketPtr pkt)
void setRubyType(RubyRequestType type)
PacketPtr getFirstPkt() const
RubyRequestType getRubyType() const
std::vector< PacketPtr > & getPackets()
GMTokenPort(const std::string &name, PortID id=InvalidPortID)
Tick recvAtomic(PacketPtr)
Receive an atomic request packet from the peer.
void recvFunctional(PacketPtr)
Receive a functional request packet from the peer.
AddrRangeList getAddrRanges() const
Get a list of the non-overlapping address ranges the owner is responsible for.
bool recvTimingReq(PacketPtr)
Receive a timing request from the peer.
virtual RubyRequestType getRequestType(PacketPtr pkt)
void writeCompleteCallback(Addr address, uint64_t instSeqNum, MachineType mach)
void writeCallback(Addr address, DataBlock &data)
statistics::Histogram & getFirstResponseToCompletionDelayHist(const MachineType t) const
std::vector< statistics::Histogram * > m_IssueToInitialDelayHist
Histograms for recording the breakdown of miss latency.
GPUCoalescer & operator=(const GPUCoalescer &obj)
void evictionCallback(Addr address)
void kernelCallback(int wavefront_id)
statistics::Histogram & getInitialToForwardDelayHist(const MachineType t) const
virtual void atomicCallback(Addr address, MachineType mach, const DataBlock &data)
virtual void issueMemSyncRequest(PacketPtr pkt)
void printRequestTable(std::stringstream &ss)
GMTokenPort & getGMTokenPort()
std::vector< statistics::Histogram * > m_missMachLatencyHist
Histograms for profiling the latencies for requests that required external messages.
statistics::Histogram & getIssueToInitialDelayHist(uint32_t t) const
statistics::Histogram m_latencyHist
Histogram for holding latency profile of all requests.
void resetStats() override
Callback to reset stats.
statistics::Histogram & getOutstandReqHist()
Port & getPort(const std::string &if_name, PortID idx=InvalidPortID) override
Get a port with a given name and index.
statistics::Histogram & getForwardRequestToFirstResponseHist(const MachineType t) const
RubyGPUCoalescerParams Params
void printProgress(std::ostream &out) const
void hitCallback(CoalescedRequest *crequest, MachineType mach, DataBlock &data, bool success, Cycles initialRequestTime, Cycles forwardRequestTime, Cycles firstResponseTime, bool isRegion)
std::unordered_map< uint64_t, std::deque< CoalescedRequest * > > coalescedReqs
UncoalescedTable uncoalescedTable
void insertKernel(int wavefront_id, PacketPtr pkt)
statistics::Histogram & getTypeLatencyHist(uint32_t t)
std::unordered_map< int, PacketPtr > kernelEndList
virtual void issueRequest(CoalescedRequest *crequest)=0
statistics::Histogram & getMissLatencyHist()
bool tryCacheAccess(Addr addr, RubyRequestType type, Addr pc, RubyAccessMode access_mode, int size, DataBlock *&data_ptr)
bool isDeadlockEventScheduled() const override
statistics::Histogram m_missLatencyHist
Histogram for holding latency profile of all requests that miss in the controller connected to this s...
statistics::Histogram & getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const
bool coalescePacket(PacketPtr pkt)
std::vector< statistics::Histogram * > m_InitialToForwardDelayHist
std::vector< statistics::Histogram * > m_FirstResponseToCompletionDelayHist
std::vector< statistics::Histogram * > m_ForwardToFirstResponseDelayHist
RequestStatus makeRequest(PacketPtr pkt) override
void readCallback(Addr address, DataBlock &data)
void completeHitCallback(std::vector< PacketPtr > &mylist)
void recordMissLatency(CoalescedRequest *crequest, MachineType mach, Cycles initialRequestTime, Cycles forwardRequestTime, Cycles firstResponseTime, bool success, bool isRegion)
std::unordered_map< uint64_t, PendingWriteInst > pendingWriteInsts
std::vector< statistics::Histogram * > m_typeLatencyHist
void print(std::ostream &out) const
statistics::Histogram & getMissMachLatencyHist(uint32_t t) const
std::map< Addr, std::deque< CoalescedRequest * > > coalescedTable
std::vector< int > newKernelEnds
std::vector< statistics::Histogram * > m_missTypeLatencyHist
std::vector< std::vector< statistics::Histogram * > > m_missTypeMachLatencyHist
int outstandingCount() const override
statistics::Histogram & getMissTypeLatencyHist(uint32_t t)
CacheMemory * m_instCache_ptr
statistics::Histogram & getLatencyHist()
CacheMemory * m_dataCache_ptr
statistics::Histogram m_outstandReqHist
Histogram for number of outstanding requests per cycle.
void descheduleDeadlockEvent() override
EventFunctionWrapper issueEvent
GPUDynInstPtr getDynInst(PacketPtr pkt) const
GPUCoalescer(const GPUCoalescer &obj)
EventFunctionWrapper deadlockCheckEvent
void addPendingReq(RubyPort::MemResponsePort *port, GPUDynInstPtr inst, bool usingRubyTester)
void ackWriteCompletion(bool usingRubyTester)
RubyPort::MemResponsePort * originalPort
void setPacketsRemaining(InstSeqNum seqNum, int count)
void insertPacket(PacketPtr pkt)
void printRequestTable(std::stringstream &ss)
bool areRequestsDone(const InstSeqNum instSeqNum)
std::map< InstSeqNum, PerInstPackets > instMap
void initPacketsRemaining(InstSeqNum seqNum, int count)
int getPacketsRemaining(InstSeqNum seqNum)
void checkDeadlock(Tick threshold)
PerInstPackets * getInstPackets(int offset)
std::map< InstSeqNum, int > instPktsRemaining
A simple histogram stat.
STL list class.
Definition stl.hh:51
STL vector class.
Definition stl.hh:37
void deschedule(Event &event)
Definition eventq.hh:1021
bool scheduled() const
Determine if the current event is scheduled.
Definition eventq.hh:458
Bitfield< 5 > t
Definition misc_types.hh:71
Bitfield< 23, 0 > offset
Definition types.hh:144
Bitfield< 21 > ss
Definition misc_types.hh:60
Bitfield< 4 > pc
Bitfield< 3 > addr
Definition types.hh:84
std::list< PacketPtr > PerInstPackets
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
const PortID InvalidPortID
Definition types.hh:246
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition misc.hh:49
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition types.hh:147
std::ostream & operator<<(std::ostream &os, const ArmSemihosting::InPlaceArg &ipa)
int16_t PortID
Port index/ID type, and a symbolic name for an invalid port id.
Definition types.hh:245
uint64_t Tick
Tick count type.
Definition types.hh:58
uint64_t InstSeqNum
Definition inst_seq.hh:40
Declaration of a request, the overall memory request consisting of the parts of the request that are ...
Declaration of Statistics objects.

Generated on Mon Jul 10 2023 14:24:33 for gem5 by doxygen 1.9.7