GPUCoalescer.cc
/*
 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "mem/ruby/system/GPUCoalescer.hh"

#include "base/compiler.hh"
#include "base/logging.hh"
#include "base/str.hh"
#include "config/the_gpu_isa.hh"
#include "debug/GPUCoalescer.hh"
#include "debug/MemoryAccess.hh"
#include "debug/ProtocolTrace.hh"
#include "debug/RubyPort.hh"
#include "debug/RubyStats.hh"
#include "gpu-compute/shader.hh"
#include "mem/packet.hh"
#include "mem/ruby/common/DataBlock.hh"
#include "mem/ruby/network/MessageBuffer.hh"
#include "mem/ruby/profiler/Profiler.hh"
#include "mem/ruby/slicc_interface/AbstractController.hh"
#include "mem/ruby/slicc_interface/RubyRequest.hh"
#include "mem/ruby/structures/CacheMemory.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "params/RubyGPUCoalescer.hh"

namespace gem5
{

namespace ruby
{

UncoalescedTable::UncoalescedTable(GPUCoalescer *gc)
    : coalescer(gc)
{
}

void
UncoalescedTable::insertPacket(PacketPtr pkt)
{
    uint64_t seqNum = pkt->req->getReqInstSeqNum();

    instMap[seqNum].push_back(pkt);
    DPRINTF(GPUCoalescer, "Adding 0x%X seqNum %d to map. (map %d vec %d)\n",
            pkt->getAddr(), seqNum, instMap.size(), instMap[seqNum].size());
}

bool
UncoalescedTable::packetAvailable()
{
    return !instMap.empty();
}

void
UncoalescedTable::initPacketsRemaining(InstSeqNum seqNum, int count)
{
    if (!instPktsRemaining.count(seqNum)) {
        instPktsRemaining[seqNum] = count;
    }
}

int
UncoalescedTable::getPacketsRemaining(InstSeqNum seqNum)
{
    return instPktsRemaining[seqNum];
}

void
UncoalescedTable::setPacketsRemaining(InstSeqNum seqNum, int count)
{
    instPktsRemaining[seqNum] = count;
}

PerInstPackets*
UncoalescedTable::getInstPackets(int offset)
{
    if (offset >= instMap.size()) {
        return nullptr;
    }

    auto instMapIter = instMap.begin();
    std::advance(instMapIter, offset);

    return &(instMapIter->second);
}
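
// Illustrative sketch (not part of the gem5 source): getInstPackets() treats
// the seqNum-ordered instMap as an indexable sequence, so offset 0 names the
// oldest in-flight instruction. std::map iterators are bidirectional, so
// std::advance walks node by node in O(offset). A minimal standalone example:
//
//     std::map<uint64_t, std::list<int>> m = {{7, {1}}, {9, {2, 3}}};
//     auto it = m.begin();
//     std::advance(it, 1);              // now at seqNum 9, second oldest
//     assert(it->second.size() == 2);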

void
UncoalescedTable::updateResources()
{
    for (auto iter = instMap.begin(); iter != instMap.end(); ) {
        InstSeqNum seq_num = iter->first;
        DPRINTF(GPUCoalescer, "%s checking remaining pkts for %d\n",
                coalescer->name().c_str(), seq_num);
        assert(instPktsRemaining.count(seq_num));

        if (instPktsRemaining[seq_num] == 0) {
            assert(iter->second.empty());

            // Remove from both maps
            instMap.erase(iter++);
            instPktsRemaining.erase(seq_num);

            // Release the token
            DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", seq_num);
            coalescer->getGMTokenPort().sendTokens(1);
        } else {
            ++iter;
        }
    }
}
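
// Illustrative note (not part of the gem5 source): the token returned above
// implements backpressure between the CU and this coalescer. The issuing CU
// holds a token for each outstanding instruction, and updateResources() hands
// it back (sendTokens(1)) once all of that instruction's packets have been
// coalesced, bounding the number of distinct vector memory instructions in
// flight at once.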

bool
UncoalescedTable::areRequestsDone(const uint64_t instSeqNum) {
    // Iterate over the instructions held in the table to see whether there
    // are more requests to issue; if yes, not yet done; otherwise, done.
    for (auto& inst : instMap) {
        DPRINTF(GPUCoalescer, "instSeqNum= %d, pending packets=%d\n",
                inst.first, inst.second.size());
        if (inst.first == instSeqNum) { return false; }
    }

    return true;
}
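
// Illustrative sketch (not part of the gem5 source): areRequestsDone() is a
// keyed membership test over the ordered map, written as a full scan so every
// pending entry can be logged along the way. Assuming no logging were needed,
// an equivalent form would be:
//
//     bool done = (instMap.find(instSeqNum) == instMap.end());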

void
UncoalescedTable::printRequestTable(std::stringstream& ss)
{
    ss << "Listing pending packets from " << instMap.size() << " instructions";

    for (auto& inst : instMap) {
        ss << "\tAddr: " << printAddress(inst.first) << " with "
           << inst.second.size() << " pending packets" << std::endl;
    }
}

void
UncoalescedTable::checkDeadlock(Tick threshold)
{
    Tick current_time = curTick();

    for (auto &it : instMap) {
        for (auto &pkt : it.second) {
            if (current_time - pkt->req->time() > threshold) {
                std::stringstream ss;
                printRequestTable(ss);

                panic("Possible Deadlock detected. Aborting!\n"
                      "version: %d request.paddr: 0x%x uncoalescedTable: %d "
                      "current time: %u issue_time: %d difference: %d\n"
                      "Request Tables:\n\n%s", coalescer->getId(),
                      pkt->getAddr(), instMap.size(), current_time,
                      pkt->req->time(), current_time - pkt->req->time(),
                      ss.str());
            }
        }
    }
}

GPUCoalescer::GPUCoalescer(const Params &p)
    : RubyPort(p),
      issueEvent([this]{ completeIssue(); }, "Issue coalesced request",
                 false, Event::Progress_Event_Pri),
      uncoalescedTable(this),
      deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check"),
      gmTokenPort(name() + ".gmTokenPort")
{
    m_store_waiting_on_load_cycles = 0;
    m_store_waiting_on_store_cycles = 0;
    m_load_waiting_on_store_cycles = 0;
    m_load_waiting_on_load_cycles = 0;

    m_outstanding_count = 0;

    coalescingWindow = p.max_coalesces_per_cycle;

    m_max_outstanding_requests = 0;
    m_instCache_ptr = nullptr;
    m_dataCache_ptr = nullptr;

    m_instCache_ptr = p.icache;
    m_dataCache_ptr = p.dcache;
    m_max_outstanding_requests = p.max_outstanding_requests;
    m_deadlock_threshold = p.deadlock_threshold;

    assert(m_max_outstanding_requests > 0);
    assert(m_deadlock_threshold > 0);
    assert(m_instCache_ptr);
    assert(m_dataCache_ptr);

    m_runningGarnetStandalone = p.garnet_standalone;

    // These statistical variables are not for display.
    // The profiler will collate these across different
    // coalescers and display those collated statistics.
    m_outstandReqHist.init(10);
    m_latencyHist.init(10);
    m_missLatencyHist.init(10);

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist.push_back(new statistics::Histogram());
        m_typeLatencyHist[i]->init(10);

        m_missTypeLatencyHist.push_back(new statistics::Histogram());
        m_missTypeLatencyHist[i]->init(10);
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist.push_back(new statistics::Histogram());
        m_missMachLatencyHist[i]->init(10);

        m_IssueToInitialDelayHist.push_back(new statistics::Histogram());
        m_IssueToInitialDelayHist[i]->init(10);

        m_InitialToForwardDelayHist.push_back(new statistics::Histogram());
        m_InitialToForwardDelayHist[i]->init(10);

        m_ForwardToFirstResponseDelayHist.push_back(
            new statistics::Histogram());
        m_ForwardToFirstResponseDelayHist[i]->init(10);

        m_FirstResponseToCompletionDelayHist.push_back(
            new statistics::Histogram());
        m_FirstResponseToCompletionDelayHist[i]->init(10);
    }

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_missTypeMachLatencyHist.push_back(
            std::vector<statistics::Histogram *>());

        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i].push_back(
                new statistics::Histogram());
            m_missTypeMachLatencyHist[i][j]->init(10);
        }
    }
}
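
// Illustrative note (not part of the gem5 source): the latency histograms
// built above form a two-level table indexed first by RubyRequestType and
// then by MachineType, so a miss of type t serviced by machine m lands in
// m_missTypeMachLatencyHist[t][m]. Each leaf is an independent
// statistics::Histogram initialized with 10 buckets.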

GPUCoalescer::~GPUCoalescer()
{
}

Port &
GPUCoalescer::getPort(const std::string &if_name, PortID idx)
{
    if (if_name == "gmTokenPort") {
        return gmTokenPort;
    }

    // delegate to RubyPort otherwise
    return RubyPort::getPort(if_name, idx);
}

void
GPUCoalescer::wakeup()
{
    Cycles current_time = curCycle();
    for (auto& requestList : coalescedTable) {
        for (auto& req : requestList.second) {
            if (current_time - req->getIssueTime() > m_deadlock_threshold) {
                std::stringstream ss;
                printRequestTable(ss);
                warn("GPUCoalescer %d Possible deadlock detected!\n%s\n",
                     m_version, ss.str());
                panic("Aborting due to deadlock!\n");
            }
        }
    }

    Tick tick_threshold = cyclesToTicks(m_deadlock_threshold);
    uncoalescedTable.checkDeadlock(tick_threshold);

    if (m_outstanding_count > 0) {
        schedule(deadlockCheckEvent,
                 m_deadlock_threshold * clockPeriod() +
                 curTick());
    }
}

void
GPUCoalescer::printRequestTable(std::stringstream& ss)
{
    ss << "Printing out " << coalescedTable.size()
       << " outstanding requests in the coalesced table\n";

    for (auto& requestList : coalescedTable) {
        for (auto& request : requestList.second) {
            ss << "\tAddr: " << printAddress(requestList.first) << "\n"
               << "\tInstruction sequence number: "
               << request->getSeqNum() << "\n"
               << "\t\tType: "
               << RubyRequestType_to_string(request->getRubyType()) << "\n"
               << "\t\tNumber of associated packets: "
               << request->getPackets().size() << "\n"
               << "\t\tIssue time: "
               << request->getIssueTime() * clockPeriod() << "\n"
               << "\t\tDifference from current tick: "
               << (curCycle() - request->getIssueTime()) * clockPeriod();
        }
    }

    // print out packets waiting to be issued in uncoalesced table
    uncoalescedTable.printRequestTable(ss);
}

void
GPUCoalescer::resetStats()
{
    m_latencyHist.reset();
    m_missLatencyHist.reset();
    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist[i]->reset();
        m_missTypeLatencyHist[i]->reset();
        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i][j]->reset();
        }
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist[i]->reset();

        m_IssueToInitialDelayHist[i]->reset();
        m_InitialToForwardDelayHist[i]->reset();
        m_ForwardToFirstResponseDelayHist[i]->reset();
        m_FirstResponseToCompletionDelayHist[i]->reset();
    }
}

void
GPUCoalescer::printProgress(std::ostream& out) const
{
}

// sets the kernelEndList
void
GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
{
    // It is unclear if a duplicate wavefront id can ever occur here, but
    // be careful so that it cannot hang the simulator in the future.
    DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
    assert(kernelEndList.count(wavefront_id) == 0);

    kernelEndList[wavefront_id] = pkt;
    DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
            kernelEndList.size());
}

void
GPUCoalescer::writeCallback(Addr address, DataBlock& data)
{
    writeCallback(address, MachineType_NULL, data);
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data)
{
    writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime)
{
    writeCallback(address, mach, data,
                  initialRequestTime, forwardRequestTime, firstResponseTime,
                  false);
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime,
                            bool isRegion)
{
    assert(address == makeLineAddress(address));
    assert(coalescedTable.count(address));

    auto crequest = coalescedTable.at(address).front();

    hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
                forwardRequestTime, firstResponseTime, isRegion);

    // remove this crequest from the coalescedTable
    delete crequest;
    coalescedTable.at(address).pop_front();

    if (coalescedTable.at(address).empty()) {
        coalescedTable.erase(address);
    } else {
        auto nextRequest = coalescedTable.at(address).front();
        issueRequest(nextRequest);
    }
}

void
GPUCoalescer::writeCompleteCallback(Addr address,
                                    uint64_t instSeqNum,
                                    MachineType mach)
{
    DPRINTF(GPUCoalescer, "writeCompleteCallback for address 0x%x"
            " instSeqNum = %d\n", address, instSeqNum);

    assert(pendingWriteInsts.count(instSeqNum) == 1);
    PendingWriteInst& inst = pendingWriteInsts[instSeqNum];

    // check the uncoalescedTable to see whether all requests for the inst
    // have been issued or not
    bool reqsAllIssued = uncoalescedTable.areRequestsDone(instSeqNum);
    DPRINTF(GPUCoalescer, "instSeqNum = %d, pendingStores=%d, "
            "reqsAllIssued=%d\n", instSeqNum,
            inst.getNumPendingStores()-1, reqsAllIssued);

    if (inst.receiveWriteCompleteAck() && reqsAllIssued) {
        // if the pending write instruction has received all write completion
        // callbacks for its issued Ruby requests, we can now respond to the
        // requesting CU in one response packet.
        inst.ackWriteCompletion(m_usingRubyTester);

        DPRINTF(GPUCoalescer, "write inst %d completed at coalescer\n",
                instSeqNum);
        pendingWriteInsts.erase(instSeqNum);
    }
}
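
// Illustrative note (not part of the gem5 source): a write instruction
// retires only when two conditions hold at once: every issued Ruby request
// has acked write completion, and no packets for that seqNum remain in the
// uncoalescedTable. Either alone is insufficient; for example, an ack can
// arrive while a lane's packet still waits to be coalesced.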

void
GPUCoalescer::readCallback(Addr address, DataBlock& data)
{
    readCallback(address, MachineType_NULL, data);
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data)
{
    readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime)
{
    readCallback(address, mach, data,
                 initialRequestTime, forwardRequestTime, firstResponseTime,
                 false);
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime,
                           bool isRegion)
{
    assert(address == makeLineAddress(address));
    assert(coalescedTable.count(address));

    auto crequest = coalescedTable.at(address).front();
    fatal_if(crequest->getRubyType() != RubyRequestType_LD,
             "readCallback received non-read type response\n");

    // Iterate over the coalesced requests to respond to as many loads as
    // possible until another request type is seen. Models MSHR for TCP.
    while (crequest->getRubyType() == RubyRequestType_LD) {
        hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
                    forwardRequestTime, firstResponseTime, isRegion);

        delete crequest;
        coalescedTable.at(address).pop_front();
        if (coalescedTable.at(address).empty()) {
            break;
        }

        crequest = coalescedTable.at(address).front();
    }

    if (coalescedTable.at(address).empty()) {
        coalescedTable.erase(address);
    } else {
        auto nextRequest = coalescedTable.at(address).front();
        issueRequest(nextRequest);
    }
}
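
// Illustrative note (not part of the gem5 source): the while loop above
// drains consecutive loads queued for one line, mimicking an MSHR with
// multiple targets. Given a hypothetical queue for a line of
//
//     [LD, LD, ST, LD]
//
// one readCallback answers the two front LDs, stops at the ST and re-issues
// it, and the trailing LD waits for the store's own callback.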

void
GPUCoalescer::hitCallback(CoalescedRequest* crequest,
                          MachineType mach,
                          DataBlock& data,
                          bool success,
                          Cycles initialRequestTime,
                          Cycles forwardRequestTime,
                          Cycles firstResponseTime,
                          bool isRegion)
{
    PacketPtr pkt = crequest->getFirstPkt();
    Addr request_address = pkt->getAddr();
    [[maybe_unused]] Addr request_line_address =
        makeLineAddress(request_address);

    RubyRequestType type = crequest->getRubyType();

    DPRINTF(GPUCoalescer, "Got hitCallback for 0x%X\n", request_line_address);

    recordMissLatency(crequest, mach,
                      initialRequestTime,
                      forwardRequestTime,
                      firstResponseTime,
                      success, isRegion);

    // update the data; this must be done for each request in the coalescer
    std::vector<PacketPtr> pktList = crequest->getPackets();
    DPRINTF(GPUCoalescer, "Responding to %d packets for addr 0x%X\n",
            pktList.size(), request_line_address);
    for (auto& pkt : pktList) {
        request_address = pkt->getAddr();
        if (pkt->getPtr<uint8_t>()) {
            if ((type == RubyRequestType_LD) ||
                (type == RubyRequestType_ATOMIC) ||
                (type == RubyRequestType_ATOMIC_RETURN) ||
                (type == RubyRequestType_IFETCH) ||
                (type == RubyRequestType_RMW_Read) ||
                (type == RubyRequestType_Locked_RMW_Read) ||
                (type == RubyRequestType_Load_Linked)) {
                pkt->setData(
                    data.getData(getOffset(request_address), pkt->getSize()));
            } else {
                data.setData(pkt->getPtr<uint8_t>(),
                             getOffset(request_address), pkt->getSize());
            }
        } else {
            DPRINTF(MemoryAccess,
                    "WARNING. Data not transferred from Ruby to M5 for type "
                    "%s\n",
                    RubyRequestType_to_string(type));
        }
    }

    m_outstanding_count--;
    assert(m_outstanding_count >= 0);

    completeHitCallback(pktList);
}

bool
GPUCoalescer::empty() const
{
    return coalescedTable.empty();
}

RubyRequestType
GPUCoalescer::getRequestType(PacketPtr pkt)
{
    RubyRequestType req_type = RubyRequestType_NULL;

    // These types are not supported or not used in GPU caches.
    assert(!pkt->req->isLLSC());
    assert(!pkt->req->isLockedRMW());
    assert(!pkt->req->isInstFetch());
    assert(!pkt->isFlush());

    if (pkt->req->isAtomicReturn()) {
        req_type = RubyRequestType_ATOMIC_RETURN;
    } else if (pkt->req->isAtomicNoReturn()) {
        req_type = RubyRequestType_ATOMIC_NO_RETURN;
    } else if (pkt->isRead()) {
        req_type = RubyRequestType_LD;
    } else if (pkt->isWrite()) {
        req_type = RubyRequestType_ST;
    } else {
        panic("Unsupported ruby packet type\n");
    }

    return req_type;
}
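
// Illustrative note (not part of the gem5 source): the test order above
// matters because atomic packets can also report isRead()/isWrite(). Checking
// the atomic flags first keeps, say, a hypothetical atomic-add packet with
// both isAtomicReturn() and isRead() true classified as
// RubyRequestType_ATOMIC_RETURN rather than RubyRequestType_LD.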

// Places an uncoalesced packet in uncoalescedTable. If the packet is a
// special type (MemFence, scoping, etc), it is issued immediately.
RequestStatus
GPUCoalescer::makeRequest(PacketPtr pkt)
{
    // all packets must have valid instruction sequence numbers
    assert(pkt->req->hasInstSeqNum());

    if (pkt->cmd == MemCmd::MemSyncReq) {
        // issue mem_sync requests immediately to the cache system without
        // going through uncoalescedTable like normal LD/ST/Atomic requests
        issueMemSyncRequest(pkt);
    } else {
        // otherwise, this must be either a read or a write command
        assert(pkt->isRead() || pkt->isWrite());

        InstSeqNum seq_num = pkt->req->getReqInstSeqNum();

        // in the case of the protocol tester, there is one packet per
        // sequence number. Otherwise, the number of packets depends on the
        // number of lanes active for that vmem request (i.e., the popcnt
        // of the exec_mask).
        int num_packets = 1;
        if (!m_usingRubyTester) {
            num_packets = 0;
            for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) {
                num_packets += getDynInst(pkt)->getLaneStatus(i);
            }
        }

        // the pkt is temporarily stored in the uncoalesced table until
        // it's picked for coalescing later in this cycle or in a future
        // cycle. Packets remaining is set to the number of expected
        // requests from the instruction based on its exec_mask.
        uncoalescedTable.insertPacket(pkt);
        uncoalescedTable.initPacketsRemaining(seq_num, num_packets);
        DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
                pkt->getAddr());

        // we schedule an issue event here to process the uncoalesced table
        // and try to issue Ruby requests to the cache system
        if (!issueEvent.scheduled()) {
            DPRINTF(GPUCoalescer, "Scheduled issueEvent for seqNum %d\n",
                    seq_num);
            schedule(issueEvent, curTick());
        }
    }

    // we always return RequestStatus_Issued in this coalescer because the
    // coalescer's resources were checked earlier and the coalescer is
    // queueing up aliased requests in its coalesced table
    return RequestStatus_Issued;
}
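
// Illustrative sketch (not part of the gem5 source): the per-instruction
// packet count computed in makeRequest() is the popcount of the wavefront's
// exec_mask. Assuming a hypothetical 64-bit mask, the same count could be
// written as:
//
//     uint64_t exec_mask = 0xf5;                   // lanes 0,2,4,5,6,7 on
//     int num_packets = std::popcount(exec_mask);  // 6 (C++20 <bit>)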

template <class KEY, class VALUE>
std::ostream &
operator<<(std::ostream &out, const std::unordered_map<KEY, VALUE> &map)
{
    out << "[";
    for (auto i = map.begin(); i != map.end(); ++i)
        out << " " << i->first << "=" << i->second;
    out << " ]";

    return out;
}

void
GPUCoalescer::print(std::ostream& out) const
{
    out << "[GPUCoalescer: " << m_version
        << ", outstanding requests: " << m_outstanding_count
        << "]";
}

GPUDynInstPtr
GPUCoalescer::getDynInst(PacketPtr pkt) const
{
    RubyPort::SenderState* ss =
        safe_cast<RubyPort::SenderState*>(pkt->senderState);

    ComputeUnit::DataPort::SenderState* cu_state =
        safe_cast<ComputeUnit::DataPort::SenderState*>
            (ss->predecessor);

    return cu_state->_gpuDynInst;
}

bool
GPUCoalescer::coalescePacket(PacketPtr pkt)
{
    uint64_t seqNum = pkt->req->getReqInstSeqNum();
    Addr line_addr = makeLineAddress(pkt->getAddr());

    // If the packet has the same line address as a request already in the
    // coalescedTable and has the same sequence number, it can be coalesced.
    if (coalescedTable.count(line_addr)) {
        // Search for a previous coalesced request with the same seqNum.
        auto& creqQueue = coalescedTable.at(line_addr);
        auto citer = std::find_if(creqQueue.begin(), creqQueue.end(),
            [&](CoalescedRequest* c) { return c->getSeqNum() == seqNum; }
        );
        if (citer != creqQueue.end()) {
            (*citer)->insertPacket(pkt);
            return true;
        }
    }

    if (m_outstanding_count < m_max_outstanding_requests) {
        // This is an "aliased" or new request. Create a RubyRequest and
        // append it to the list of "targets" in the coalescing table.
        DPRINTF(GPUCoalescer, "Creating new or aliased request for 0x%X\n",
                line_addr);

        CoalescedRequest *creq = new CoalescedRequest(seqNum);
        creq->insertPacket(pkt);
        creq->setRubyType(getRequestType(pkt));
        creq->setIssueTime(curCycle());

        if (!coalescedTable.count(line_addr)) {
            // If there is no outstanding request for this line address,
            // create a new coalesced request and issue it immediately.
            auto reqList = std::deque<CoalescedRequest*> { creq };
            coalescedTable.insert(std::make_pair(line_addr, reqList));
            if (!coalescedReqs.count(seqNum)) {
                coalescedReqs.insert(std::make_pair(seqNum, reqList));
            } else {
                coalescedReqs.at(seqNum).push_back(creq);
            }
        } else {
            // The request is for a line address that is already outstanding
            // but for a different instruction. Add it as a new request to be
            // issued when the current outstanding request is completed.
            coalescedTable.at(line_addr).push_back(creq);
            DPRINTF(GPUCoalescer, "found address 0x%X with new seqNum %d\n",
                    line_addr, seqNum);
        }

        // In both cases, requests are added to the coalescing table and will
        // be counted as outstanding requests.
        m_outstanding_count++;

        // We track all issued or to-be-issued Ruby requests associated with
        // write instructions. An instruction may have multiple Ruby
        // requests.
        if (pkt->cmd == MemCmd::WriteReq) {
            DPRINTF(GPUCoalescer, "adding write inst %d at line 0x%x to"
                    " the pending write instruction list\n", seqNum,
                    line_addr);

            RubyPort::SenderState* ss =
                safe_cast<RubyPort::SenderState*>(pkt->senderState);

            // we need to save this port because it will be used to call
            // back the requesting CU when we receive write
            // complete callbacks for all issued Ruby requests of this
            // instruction.
            RubyPort::MemResponsePort* mem_response_port = ss->port;

            GPUDynInstPtr gpuDynInst = nullptr;

            if (!m_usingRubyTester) {
                // If this coalescer is connected to a real CU, we need
                // to save the corresponding gpu dynamic instruction.
                // CU will use that instruction to decrement wait counters
                // in the issuing wavefront.
                // For Ruby tester, gpuDynInst == nullptr
                gpuDynInst = getDynInst(pkt);
            }

            PendingWriteInst& inst = pendingWriteInsts[seqNum];
            inst.addPendingReq(mem_response_port, gpuDynInst,
                               m_usingRubyTester);
        }

        return true;
    }

    // The maximum number of outstanding requests has been issued.
    return false;
}
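
// Illustrative sketch (not part of the gem5 source): coalescing keys on the
// (line address, instruction sequence number) pair. Assuming 64-byte lines,
// two lanes of one instruction touching 0x1004 and 0x1038 share line 0x1000
// and join the same CoalescedRequest, while an access to 0x1040 from that
// same instruction starts a second request:
//
//     Addr line = addr & ~Addr(63);  // 0x1004 -> 0x1000, 0x1040 -> 0x1040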

void
GPUCoalescer::completeIssue()
{
    // Iterate over the maximum number of instructions we can coalesce
    // per cycle (coalescingWindow).
    for (int instIdx = 0; instIdx < coalescingWindow; ++instIdx) {
        PerInstPackets *pkt_list =
            uncoalescedTable.getInstPackets(instIdx);

        // getInstPackets will return nullptr if no instruction
        // exists at the current offset.
        if (!pkt_list) {
            break;
        } else if (pkt_list->empty()) {
            // Found something, but it has not been cleaned up by update
            // resources yet. See if there is anything else to coalesce.
            // Assume we can't check anymore if the coalescing window is 1.
            continue;
        } else {
            // All packets in the list have the same seqNum, use first.
            InstSeqNum seq_num = pkt_list->front()->req->getReqInstSeqNum();

            // The difference in list size before and after tells us the
            // number of packets which were coalesced.
            size_t pkt_list_size = pkt_list->size();

            // Since we have a pointer to the list of packets in the inst,
            // erase them from the list if coalescing is successful and
            // leave them in the list otherwise. This aggressively attempts
            // to coalesce as many packets as possible from the current inst.
            pkt_list->remove_if(
                [&](PacketPtr pkt) { return coalescePacket(pkt); }
            );

            if (coalescedReqs.count(seq_num)) {
                auto& creqs = coalescedReqs.at(seq_num);
                for (auto creq : creqs) {
                    DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n",
                            RubyRequestType_to_string(creq->getRubyType()),
                            seq_num);
                    issueRequest(creq);
                }
                coalescedReqs.erase(seq_num);
            }

            assert(pkt_list_size >= pkt_list->size());
            size_t pkt_list_diff = pkt_list_size - pkt_list->size();

            int num_remaining = uncoalescedTable.getPacketsRemaining(seq_num);
            num_remaining -= pkt_list_diff;
            assert(num_remaining >= 0);

            uncoalescedTable.setPacketsRemaining(seq_num, num_remaining);
            DPRINTF(GPUCoalescer,
                    "Coalesced %d pkts for seqNum %d, %d remaining\n",
                    pkt_list_diff, seq_num, num_remaining);
        }
    }

    // Clean up any instructions in the uncoalesced table that have had
    // all of their packets coalesced and return a token for that column.
    uncoalescedTable.updateResources();

    // Have any kernel-end releases been issued this cycle?
    int len = newKernelEnds.size();
    for (int i = 0; i < len; i++) {
        kernelCallback(newKernelEnds[i]);
    }
    newKernelEnds.clear();
}
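
// Illustrative sketch (not part of the gem5 source): completeIssue() measures
// coalescing success by the list-size delta around remove_if, whose predicate
// both coalesces and filters. The same idiom in miniature:
//
//     std::list<int> l = {1, 2, 3, 4};
//     size_t before = l.size();
//     l.remove_if([](int x) { return x % 2 == 0; });  // "coalesce" evens
//     size_t coalesced = before - l.size();           // 2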

void
GPUCoalescer::evictionCallback(Addr address)
{
    ruby_eviction_callback(address);
}

void
GPUCoalescer::kernelCallback(int wavefront_id)
{
    assert(kernelEndList.count(wavefront_id));

    ruby_hit_callback(kernelEndList[wavefront_id]);

    kernelEndList.erase(wavefront_id);
}

void
GPUCoalescer::atomicCallback(Addr address,
                             MachineType mach,
                             const DataBlock& data)
{
    assert(address == makeLineAddress(address));
    assert(coalescedTable.count(address));

    auto crequest = coalescedTable.at(address).front();

    fatal_if((crequest->getRubyType() != RubyRequestType_ATOMIC &&
              crequest->getRubyType() != RubyRequestType_ATOMIC_RETURN &&
              crequest->getRubyType() != RubyRequestType_ATOMIC_NO_RETURN),
             "atomicCallback saw non-atomic type response\n");

    hitCallback(crequest, mach, (DataBlock&)data, true,
                crequest->getIssueTime(), Cycles(0), Cycles(0), false);

    delete crequest;
    coalescedTable.at(address).pop_front();

    if (coalescedTable.at(address).empty()) {
        coalescedTable.erase(address);
    } else {
        auto nextRequest = coalescedTable.at(address).front();
        issueRequest(nextRequest);
    }
}

void
GPUCoalescer::completeHitCallback(std::vector<PacketPtr>& mylist)
{
    for (auto& pkt : mylist) {
        RubyPort::SenderState *ss =
            safe_cast<RubyPort::SenderState *>(pkt->senderState);
        MemResponsePort *port = ss->port;
        assert(port != NULL);

        pkt->senderState = ss->predecessor;

        if (pkt->cmd != MemCmd::WriteReq) {
            // for WriteReq, we keep the original senderState until
            // writeCompleteCallback
            delete ss;
        }

        port->hitCallback(pkt);
        trySendRetries();
    }

    // We schedule an event in the same tick as hitCallback (similar to
    // makeRequest) rather than calling completeIssue directly to reduce
    // function calls to complete issue. This can only happen if the max
    // outstanding requests is less than the number of slots in the
    // uncoalesced table and makeRequest is not called again.
    if (uncoalescedTable.packetAvailable() && !issueEvent.scheduled()) {
        scheduleEvent(Cycles(1));
    }

    testDrainComplete();
}

void
GPUCoalescer::recordMissLatency(CoalescedRequest* crequest,
                                MachineType mach,
                                Cycles initialRequestTime,
                                Cycles forwardRequestTime,
                                Cycles firstResponseTime,
                                bool success, bool isRegion)
{
}

} // namespace ruby
} // namespace gem5