gem5 [DEVELOP-FOR-25.0]
GPUCoalescer.cc
1/*
2 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include "mem/ruby/system/GPUCoalescer.hh"
33
34#include "base/compiler.hh"
35#include "base/logging.hh"
36#include "base/str.hh"
38#include "debug/GPUCoalescer.hh"
39#include "debug/MemoryAccess.hh"
40#include "debug/ProtocolTrace.hh"
41#include "debug/RubyHitMiss.hh"
42#include "debug/RubyPort.hh"
43#include "debug/RubyStats.hh"
44#include "gpu-compute/shader.hh"
45#include "mem/packet.hh"
54#include "params/RubyGPUCoalescer.hh"
55
56namespace gem5
57{
58
59namespace ruby
60{
61
62UncoalescedTable::UncoalescedTable(GPUCoalescer *gc)
63 : coalescer(gc)
64{
65}
66
67void
68UncoalescedTable::insertPacket(PacketPtr pkt)
69{
70 uint64_t seqNum = pkt->req->getReqInstSeqNum();
71
72 instMap[seqNum].push_back(pkt);
73 DPRINTF(GPUCoalescer, "Adding 0x%X seqNum %d to map. (map %d vec %d)\n",
74 pkt->getAddr(), seqNum, instMap.size(), instMap[seqNum].size());
75}
76
77void
78UncoalescedTable::insertReqType(PacketPtr pkt, RubyRequestType type)
79{
80 uint64_t seqNum = pkt->req->getReqInstSeqNum();
81
82 reqTypeMap[seqNum] = type;
83}
84
85bool
86UncoalescedTable::packetAvailable()
87{
88 return !instMap.empty();
89}
90
91void
92UncoalescedTable::initPacketsRemaining(InstSeqNum seqNum, int count)
93{
94 if (!instPktsRemaining.count(seqNum)) {
95 instPktsRemaining[seqNum] = count;
96 }
97}
98
99int
100UncoalescedTable::getPacketsRemaining(InstSeqNum seqNum)
101{
102 return instPktsRemaining[seqNum];
103}
104
105void
106UncoalescedTable::setPacketsRemaining(InstSeqNum seqNum, int count)
107{
108 instPktsRemaining[seqNum] = count;
109}
110
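// Returns the pending packet list for the instruction at position 'offset'
// in the table (instructions are ordered by sequence number), or nullptr if
// fewer instructions are pending. completeIssue() uses this to walk its
// per-cycle coalescing window.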
111PerInstPackets*
112UncoalescedTable::getInstPackets(int offset)
113{
114 if (offset >= instMap.size()) {
115 return nullptr;
116 }
117
118 auto instMapIter = instMap.begin();
119 std::advance(instMapIter, offset);
120
121 return &(instMapIter->second);
122}
123
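// Cleans up instructions whose packets have all been coalesced: removes them
// from both bookkeeping maps and, outside of warmup/cooldown, returns a
// GMTokenPort token for every completed non-FLUSH instruction.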
124void
125UncoalescedTable::updateResources()
126{
127 for (auto iter = instMap.begin(); iter != instMap.end(); ) {
128 InstSeqNum seq_num = iter->first;
129 DPRINTF(GPUCoalescer, "%s checking remaining pkts for %d\n",
130 coalescer->name().c_str(), seq_num);
131 assert(instPktsRemaining.count(seq_num));
132
133 if (instPktsRemaining[seq_num] == 0) {
134 assert(iter->second.empty());
135
136 // Remove from both maps
137 instMap.erase(iter++);
138 instPktsRemaining.erase(seq_num);
139
140 // Release the token if the Ruby system is not in cooldown
141 // or warmup phases. When in these phases, the RubyPorts
142 // are accessed directly using the makeRequest() command
143 // instead of accessing through the port. This makes
144 // sending tokens through the port unnecessary
145 if (!coalescer->getRubySystem()->getWarmupEnabled() &&
146 !coalescer->getRubySystem()->getCooldownEnabled()) {
147 if (reqTypeMap[seq_num] != RubyRequestType_FLUSH) {
149 "Returning token seqNum %d\n", seq_num);
150 coalescer->getGMTokenPort().sendTokens(1);
151 }
152 }
153
154 reqTypeMap.erase(seq_num);
155 } else {
156 ++iter;
157 }
158 }
159}
160
161bool
162UncoalescedTable::areRequestsDone(const uint64_t instSeqNum) {
163 // iterate the instructions held in UncoalescedTable to see whether there
164 // are more requests to issue; if yes, not yet done; otherwise, done
165 for (auto& inst : instMap) {
166 DPRINTF(GPUCoalescer, "instSeqNum= %d, pending packets=%d\n"
167 ,inst.first, inst.second.size());
168 if (inst.first == instSeqNum) { return false; }
169 }
170
171 return true;
172}
173
174void
175UncoalescedTable::printRequestTable(std::stringstream& ss)
176{
177 ss << "Listing pending packets from " << instMap.size() << " instructions";
178
179 for (auto& inst : instMap) {
180 ss << "\tAddr: " << coalescer->printAddress(inst.first) << " with "
181 << inst.second.size() << " pending packets" << std::endl;
182 }
183}
184
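// Scans every pending packet and panics if any has been waiting longer than
// 'threshold' ticks, dumping the request tables to aid debugging.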
185void
186UncoalescedTable::checkDeadlock(Tick threshold)
187{
188 Tick current_time = curTick();
189
190 for (auto &it : instMap) {
191 for (auto &pkt : it.second) {
192 if (current_time - pkt->req->time() > threshold) {
193 std::stringstream ss;
194 coalescer->printRequestTable(ss);
195
196 panic("Possible Deadlock detected. Aborting!\n"
197 "version: %d request.paddr: 0x%x uncoalescedTable: %d "
198 "current time: %u issue_time: %d difference: %d\n"
199 "Request Tables:\n\n%s", coalescer->getId(),
200 pkt->getAddr(), instMap.size(), current_time,
201 pkt->req->time(), current_time - pkt->req->time(),
202 ss.str());
203 }
204 }
205 }
206}
207
208GPUCoalescer::GPUCoalescer(const Params &p)
209 : RubyPort(p),
210 issueEvent([this]{ completeIssue(); }, "Issue coalesced request",
211 false, Event::Progress_Event_Pri),
212 uncoalescedTable(this),
213 deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check"),
214 stats(this),
215 gmTokenPort(name() + ".gmTokenPort")
216{
217 m_store_waiting_on_load_cycles = 0;
218 m_store_waiting_on_store_cycles = 0;
219 m_load_waiting_on_store_cycles = 0;
220 m_load_waiting_on_load_cycles = 0;
221
222 m_outstanding_count = 0;
223
224 coalescingWindow = p.max_coalesces_per_cycle;
225
226 m_max_outstanding_requests = 0;
227 m_instCache_ptr = nullptr;
228 m_dataCache_ptr = nullptr;
229
230 m_instCache_ptr = p.icache;
231 m_dataCache_ptr = p.dcache;
232 m_max_outstanding_requests = p.max_outstanding_requests;
233 m_deadlock_threshold = p.deadlock_threshold;
234
235 assert(m_max_outstanding_requests > 0);
236 assert(m_deadlock_threshold > 0);
237 assert(m_instCache_ptr);
238 assert(m_dataCache_ptr);
239
240 m_runningGarnetStandalone = p.garnet_standalone;
241
242
243 // These statistical variables are not for display.
244 // The profiler will collate these across different
245 // coalescers and display those collated statistics.
246 m_outstandReqHist.init(10);
247 m_latencyHist.init(10);
248 m_missLatencyHist.init(10);
249
250 for (int i = 0; i < RubyRequestType_NUM; i++) {
251 m_typeLatencyHist.push_back(new statistics::Histogram());
252 m_typeLatencyHist[i]->init(10);
253
254 m_missTypeLatencyHist.push_back(new statistics::Histogram());
255 m_missTypeLatencyHist[i]->init(10);
256 }
257
258 for (int i = 0; i < MachineType_NUM; i++) {
259 m_missMachLatencyHist.push_back(new statistics::Histogram());
260 m_missMachLatencyHist[i]->init(10);
261
262 m_IssueToInitialDelayHist.push_back(new statistics::Histogram());
263 m_IssueToInitialDelayHist[i]->init(10);
264
265 m_InitialToForwardDelayHist.push_back(new statistics::Histogram());
266 m_InitialToForwardDelayHist[i]->init(10);
267
268 m_ForwardToFirstResponseDelayHist.push_back(
269 new statistics::Histogram());
270 m_ForwardToFirstResponseDelayHist[i]->init(10);
271
272 m_FirstResponseToCompletionDelayHist.push_back(
273 new statistics::Histogram());
274 m_FirstResponseToCompletionDelayHist[i]->init(10);
275 }
276
277 for (int i = 0; i < RubyRequestType_NUM; i++) {
278 m_missTypeMachLatencyHist.push_back(
279 std::vector<statistics::Histogram *>());
280
281 for (int j = 0; j < MachineType_NUM; j++) {
282 m_missTypeMachLatencyHist[i].push_back(
283 new statistics::Histogram());
284 m_missTypeMachLatencyHist[i][j]->init(10);
285 }
286 }
287
288}
289
290GPUCoalescer::~GPUCoalescer()
291{
292}
293
294Port &
295GPUCoalescer::getPort(const std::string &if_name, PortID idx)
296{
297 if (if_name == "gmTokenPort") {
298 return gmTokenPort;
299 }
300
301 // delegate to RubyPort otherwise
302 return RubyPort::getPort(if_name, idx);
303}
304
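// Deadlock-check event handler: scans the coalesced and uncoalesced tables
// for requests older than the deadlock threshold and reschedules itself
// while requests remain outstanding.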
305void
306GPUCoalescer::wakeup()
307{
308 Cycles current_time = curCycle();
309 for (auto& requestList : coalescedTable) {
310 for (auto& req : requestList.second) {
311 if (current_time - req->getIssueTime() > m_deadlock_threshold) {
312 std::stringstream ss;
313 printRequestTable(ss);
314 warn("GPUCoalescer %d Possible deadlock detected!\n%s\n",
315 m_version, ss.str());
316 panic("Aborting due to deadlock!\n");
317 }
318 }
319 }
320
321 Tick tick_threshold = cyclesToTicks(m_deadlock_threshold);
322 uncoalescedTable.checkDeadlock(tick_threshold);
323
324 if (m_outstanding_count > 0) {
325 schedule(deadlockCheckEvent,
326 m_deadlock_threshold * clockPeriod() +
327 curTick());
328 }
329}
330
331void
332GPUCoalescer::printRequestTable(std::stringstream& ss)
333{
334 ss << "Printing out " << coalescedTable.size()
335 << " outstanding requests in the coalesced table\n";
336
337 for (auto& requestList : coalescedTable) {
338 for (auto& request : requestList.second) {
339 ss << "\tAddr: " << printAddress(requestList.first) << "\n"
340 << "\tInstruction sequence number: "
341 << request->getSeqNum() << "\n"
342 << "\t\tType: "
343 << RubyRequestType_to_string(request->getRubyType()) << "\n"
344 << "\t\tNumber of associated packets: "
345 << request->getPackets().size() << "\n"
346 << "\t\tIssue time: "
347 << request->getIssueTime() * clockPeriod() << "\n"
348 << "\t\tDifference from current tick: "
349 << (curCycle() - request->getIssueTime()) * clockPeriod()
350 << "\n";
351 }
352 }
353
354 // print out packets waiting to be issued in uncoalesced table
355 uncoalescedTable.printRequestTable(ss);
356}
357
358void
359GPUCoalescer::resetStats()
360{
361 m_latencyHist.reset();
362 m_missLatencyHist.reset();
363 for (int i = 0; i < RubyRequestType_NUM; i++) {
364 m_typeLatencyHist[i]->reset();
365 m_missTypeLatencyHist[i]->reset();
366 for (int j = 0; j < MachineType_NUM; j++) {
367 m_missTypeMachLatencyHist[i][j]->reset();
368 }
369 }
370
371 for (int i = 0; i < MachineType_NUM; i++) {
372 m_missMachLatencyHist[i]->reset();
373
374 m_IssueToInitialDelayHist[i]->reset();
375 m_InitialToForwardDelayHist[i]->reset();
376 m_ForwardToFirstResponseDelayHist[i]->reset();
377 m_FirstResponseToCompletionDelayHist[i]->reset();
378 }
379}
380
381void
382GPUCoalescer::printProgress(std::ostream& out) const
383{
384}
385
386// sets the kernelEndList
387void
388GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
389{
390 // Don't know if this will happen or is possible
391 // but I just want to be careful and not have it become
392 // a simulator hang in the future
393 DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
394 assert(kernelEndList.count(wavefront_id) == 0);
395
396 kernelEndList[wavefront_id] = pkt;
397 DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
398 kernelEndList.size());
399}
400
401void
402GPUCoalescer::writeCallback(Addr address, DataBlock& data)
403{
404 writeCallback(address, MachineType_NULL, data);
405}
406
407void
408GPUCoalescer::writeCallback(Addr address,
409 MachineType mach,
410 DataBlock& data)
411{
412 writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
413}
414
415void
416GPUCoalescer::writeCallback(Addr address,
417 MachineType mach,
418 DataBlock& data,
419 Cycles initialRequestTime,
420 Cycles forwardRequestTime,
421 Cycles firstResponseTime)
422{
423 writeCallback(address, mach, data,
424 initialRequestTime, forwardRequestTime, firstResponseTime,
425 false);
426}
427
428void
429GPUCoalescer::writeCallback(Addr address,
430 MachineType mach,
431 DataBlock& data,
432 Cycles initialRequestTime,
433 Cycles forwardRequestTime,
434 Cycles firstResponseTime,
435 bool isRegion)
436{
437 assert(address == makeLineAddress(address));
438 assert(coalescedTable.count(address));
439
440 auto crequest = coalescedTable.at(address).front();
441
442 hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
443 forwardRequestTime, firstResponseTime, isRegion, false, false);
444
445 // remove this crequest in coalescedTable
446 delete crequest;
447 coalescedTable.at(address).pop_front();
448
449 if (coalescedTable.at(address).empty()) {
450 coalescedTable.erase(address);
451 } else {
452 auto nextRequest = coalescedTable.at(address).front();
453 issueRequest(nextRequest);
454 }
455}
456
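// Invoked when the memory system reports that a write has fully completed.
// Once an instruction has received all of its write-complete acks and all of
// its requests have been issued, the pending write entry is retired and the
// requesting CU is acknowledged.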
457void
458GPUCoalescer::writeCompleteCallback(Addr address,
459 uint64_t instSeqNum,
460 MachineType mach)
461{
462 DPRINTF(GPUCoalescer, "writeCompleteCallback for address 0x%x"
463 " instSeqNum = %d\n", address, instSeqNum);
464
465 assert(pendingWriteInsts.count(instSeqNum) == 1);
466 PendingWriteInst& inst = pendingWriteInsts[instSeqNum];
467
468 // check the uncoalescedTable to see whether all requests for the inst
469 // have been issued or not
470 bool reqsAllIssued = uncoalescedTable.areRequestsDone(instSeqNum);
471 DPRINTF(GPUCoalescer, "instSeqNum = %d, pendingStores=%d, "
472 "reqsAllIssued=%d\n", reqsAllIssued,
473 inst.getNumPendingStores()-1, reqsAllIssued);
474
475 if (inst.receiveWriteCompleteAck() && reqsAllIssued) {
476 // if the pending write instruction has received all write completion
477 // callbacks for its issued Ruby requests, we can now respond to
478 // the requesting CU in one response packet.
479 inst.ackWriteCompletion(m_usingRubyTester);
480
481 DPRINTF(GPUCoalescer, "write inst %d completed at coalescer\n",
482 instSeqNum);
483 pendingWriteInsts.erase(instSeqNum);
484 }
485}
486
487void
488GPUCoalescer::readCallback(Addr address, DataBlock& data)
489{
490 readCallback(address, MachineType_NULL, data, false);
491}
492
493void
494GPUCoalescer::readCallback(Addr address,
495 MachineType mach,
496 DataBlock& data,
497 bool externalHit = false)
498{
499 readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0), externalHit);
500}
501
502void
503GPUCoalescer::readCallback(Addr address,
504 MachineType mach,
505 DataBlock& data,
506 Cycles initialRequestTime,
507 Cycles forwardRequestTime,
508 Cycles firstResponseTime,
509 bool externalHit = false)
510{
511
512 readCallback(address, mach, data,
513 initialRequestTime, forwardRequestTime, firstResponseTime,
514 false, externalHit);
515}
516
517void
518GPUCoalescer::readCallback(Addr address,
519 MachineType mach,
520 DataBlock& data,
521 Cycles initialRequestTime,
522 Cycles forwardRequestTime,
523 Cycles firstResponseTime,
524 bool isRegion,
525 bool externalHit = false)
526{
527 assert(address == makeLineAddress(address));
528 assert(coalescedTable.count(address));
529
530 auto crequest = coalescedTable.at(address).front();
531 fatal_if(crequest->getRubyType() != RubyRequestType_LD,
532 "readCallback received non-read type response\n");
533
534 bool mshr_hit_under_miss = false;
535 // Iterate over the coalesced requests to respond to as many loads as
536 // possible until another request type is seen. Models MSHR for
537 // Coalescer. Do not respond to pending loads that have SLC/GLC flags
538 // set; issue them instead
539 while (crequest->getRubyType() == RubyRequestType_LD) {
540 hitCallback(crequest, mach, data, true,
541 crequest->getIssueTime(), forwardRequestTime, firstResponseTime,
542 isRegion, externalHit, mshr_hit_under_miss);
543
544 delete crequest;
545 coalescedTable.at(address).pop_front();
546 if (coalescedTable.at(address).empty()) {
547 break;
548 }
549
550 crequest = coalescedTable.at(address).front();
551
552 PacketPtr pkt = crequest->getFirstPkt();
553 bool is_request_local = !pkt->isGLCSet() && !pkt->isSLCSet();
554 if (!is_request_local) {
555 break;
556 }
557
558 mshr_hit_under_miss = true;
559 }
560
561 if (coalescedTable.at(address).empty()) {
562 coalescedTable.erase(address);
563 } else {
564 auto nextRequest = coalescedTable.at(address).front();
565 issueRequest(nextRequest);
566 }
567}
568
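// Common completion path for loads, stores, and atomics: records statistics,
// copies response data into each coalesced packet (atomic results are taken
// from the data block's atomic log), and hands the packets back to their
// response ports via completeHitCallback().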
569void
570GPUCoalescer::hitCallback(CoalescedRequest* crequest,
571 MachineType mach,
572 DataBlock& data,
573 bool success,
574 Cycles initialRequestTime,
575 Cycles forwardRequestTime,
576 Cycles firstResponseTime,
577 bool isRegion,
578 bool externalHit = false,
579 bool mshrHitUnderMiss = false)
580{
581 PacketPtr pkt = crequest->getFirstPkt();
582 Addr request_address = pkt->getAddr();
583 [[maybe_unused]] Addr request_line_address =
584 makeLineAddress(request_address);
585
586 RubyRequestType type = crequest->getRubyType();
587
588 DPRINTF(GPUCoalescer, "Got hitCallback for 0x%X\n", request_line_address);
589
590 DPRINTF(RubyHitMiss, "GPU TCP Cache %s at %#x\n",
591 externalHit ? "hit" : "miss",
592 printAddress(request_address));
593
594 recordStats(crequest, mach,
595 initialRequestTime,
596 forwardRequestTime,
597 firstResponseTime,
598 isRegion,
599 mshrHitUnderMiss);
600 // update the data
601 //
602 // MUST ADD DOING THIS FOR EACH REQUEST IN COALESCER
603 std::vector<PacketPtr> pktList = crequest->getPackets();
604
605 uint8_t* log = nullptr;
606 DPRINTF(GPUCoalescer, "Responding to %d packets for addr 0x%X\n",
607 pktList.size(), request_line_address);
608 uint32_t offset;
609 int pkt_size;
610 for (auto& pkt : pktList) {
611 offset = getOffset(pkt->getAddr());
612 pkt_size = pkt->getSize();
613 request_address = pkt->getAddr();
614
615 // When the Ruby system is in cooldown phase, the requests come from
616 // the cache recorder. These requests do not get coalesced and
617 // do not return valid data.
618 if (m_ruby_system->getCooldownEnabled())
619 continue;
620
621 if (pkt->getPtr<uint8_t>()) {
622 switch(type) {
623 // Store and AtomicNoReturns follow the same path, as the
624 // data response is not needed.
625 case RubyRequestType_ATOMIC_NO_RETURN:
626 assert(pkt->isAtomicOp());
627 break;
628 case RubyRequestType_ST:
629 break;
630 case RubyRequestType_LD:
631 pkt->setData(data.getData(offset, pkt_size));
632 break;
633 case RubyRequestType_ATOMIC_RETURN:
634 assert(pkt->isAtomicOp());
635 // Atomic operations are performed by the WriteMask
636 // in packet order, set by the crequest. Thus, when
637 // unpacking the changes from the log, we read from
638 // the front of the log to correctly map response
639 // data into the packets.
640
641 // Log entry contains the old value before the current
642 // atomic operation occurred.
643 log = data.popAtomicLogEntryFront();
644 pkt->setData(&log[offset]);
645 delete [] log;
646 log = nullptr;
647 break;
648 default:
649 panic("Unsupported ruby packet type:%s\n",
650 RubyRequestType_to_string(type));
651 break;
652 }
653 } else {
654 DPRINTF(MemoryAccess,
655 "WARNING. Data not transfered from Ruby to M5 for type " \
656 "%s\n",
657 RubyRequestType_to_string(type));
658 }
659 }
660 assert(data.numAtomicLogEntries() == 0);
661
662 m_outstanding_count--;
663 assert(m_outstanding_count >= 0);
664
665 completeHitCallback(pktList);
666}
667
668bool
669GPUCoalescer::empty() const
670{
671 return coalescedTable.empty();
672}
673
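// Translates a gem5 packet into the Ruby request type used by the GPU
// protocol. LL/SC, locked RMW, and instruction fetches are not expected
// through this path.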
674RubyRequestType
675GPUCoalescer::getRequestType(PacketPtr pkt)
676{
677 RubyRequestType req_type = RubyRequestType_NULL;
678
679 // These types are not supported or not used in GPU caches.
680 assert(!pkt->req->isLLSC());
681 assert(!pkt->req->isLockedRMW());
682 assert(!pkt->req->isInstFetch());
683
684 if (pkt->req->isAtomicReturn()) {
685 req_type = RubyRequestType_ATOMIC_RETURN;
686 } else if (pkt->req->isAtomicNoReturn()) {
687 req_type = RubyRequestType_ATOMIC_NO_RETURN;
688 } else if (pkt->isRead()) {
689 req_type = RubyRequestType_LD;
690 } else if (pkt->isWrite()) {
691 req_type = RubyRequestType_ST;
692 } else if (pkt->isFlush()) {
693 req_type = RubyRequestType_FLUSH;
694 } else {
695 panic("Unsupported ruby packet type\n");
696 }
697
698 return req_type;
699}
700
701// Places an uncoalesced packet in uncoalescedTable. If the packet is a
702// special type (MemFence, scoping, etc), it is issued immediately.
703RequestStatus
704GPUCoalescer::makeRequest(PacketPtr pkt)
705{
706 if (pkt->cmd == MemCmd::MemSyncReq) {
707 // issue mem_sync requests immediately to the cache system without
708 // going through uncoalescedTable like normal LD/ST/Atomic requests
709 issueMemSyncRequest(pkt);
710 } else {
711 // all packets must have valid instruction sequence numbers
712 assert(pkt->req->hasInstSeqNum());
713
714 // otherwise, this must be either read or write command
715 assert(pkt->isRead() || pkt->isWrite() || pkt->isFlush());
716
717 InstSeqNum seq_num = pkt->req->getReqInstSeqNum();
718
719 // in the case of protocol tester, there is one packet per sequence
720 // number. The number of packets during simulation depends on the
721 // number of lanes active for that vmem request (i.e., the popcnt
722 // of the exec_mask).
723 int num_packets = 1;
724
725 // When Ruby is in warmup or cooldown phase, the requests come from
726 // the cache recorder. There is no dynamic instruction associated
727 // with these requests either
728 if (!m_ruby_system->getWarmupEnabled()
729 && !m_ruby_system->getCooldownEnabled()) {
730 if (!m_usingRubyTester) {
731 num_packets = 0;
732 for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) {
733 num_packets += getDynInst(pkt)->getLaneStatus(i);
734 }
735 }
736 }
737
738 // the pkt is temporarily stored in the uncoalesced table until
739 // it's picked for the coalescing process later in this cycle or in a
740 // future cycle. Packets remaining is set to the number of expected
741 // requests from the instruction based on its exec_mask.
742 uncoalescedTable.insertPacket(pkt);
743 uncoalescedTable.insertReqType(pkt, getRequestType(pkt));
744 uncoalescedTable.initPacketsRemaining(seq_num, num_packets);
745 DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
746 pkt->getAddr());
747
748 // we schedule an issue event here to process the uncoalesced table
749 // and try to issue Ruby request to cache system
750 if (!issueEvent.scheduled()) {
751 DPRINTF(GPUCoalescer, "Scheduled issueEvent for seqNum %d\n",
752 seq_num);
753 schedule(issueEvent, curTick());
754 }
755 }
756
757 // we always return RequestStatus_Issued in this coalescer
758 // because the coalescer's resources were checked earlier and the
759 // coalescer is queueing up aliased requests in its coalesced table
760 return RequestStatus_Issued;
761}
762
763template <class KEY, class VALUE>
764std::ostream &
765operator<<(std::ostream &out, const std::unordered_map<KEY, VALUE> &map)
766{
767 out << "[";
768 for (auto i = map.begin(); i != map.end(); ++i)
769 out << " " << i->first << "=" << i->second;
770 out << " ]";
771
772 return out;
773}
774
775void
776GPUCoalescer::print(std::ostream& out) const
777{
778 out << "[GPUCoalescer: " << m_version
779 << ", outstanding requests: " << m_outstanding_count
780 << "]";
781}
782
795
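// Tries to fold a packet into an existing coalesced request with the same
// line address and instruction sequence number; otherwise creates a new
// CoalescedRequest, subject to the outstanding-request limit. Returns false
// if the packet cannot be accepted yet.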
796bool
797GPUCoalescer::coalescePacket(PacketPtr pkt)
798{
799 uint64_t seqNum = pkt->req->getReqInstSeqNum();
800 Addr line_addr = makeLineAddress(pkt->getAddr());
801
802 // If the packet has the same line address as a request already in the
803 // coalescedTable and has the same sequence number, it can be coalesced.
804 if (coalescedTable.count(line_addr)) {
805 // Search for a previous coalesced request with the same seqNum.
806 auto& creqQueue = coalescedTable.at(line_addr);
807 auto citer = std::find_if(creqQueue.begin(), creqQueue.end(),
808 [&](CoalescedRequest* c) { return c->getSeqNum() == seqNum; }
809 );
810 if (citer != creqQueue.end()) {
811 (*citer)->insertPacket(pkt);
812 return true;
813 }
814 }
815
816 if (m_outstanding_count < m_max_outstanding_requests) {
817 // This is an "aliased" or new request. Create a RubyRequest and
818 // append it to the list of "targets" in the coalescing table.
819 DPRINTF(GPUCoalescer, "Creating new or aliased request for 0x%X\n",
820 line_addr);
821
822 CoalescedRequest *creq = new CoalescedRequest(seqNum);
823 creq->insertPacket(pkt);
824 creq->setRubyType(getRequestType(pkt));
825 creq->setIssueTime(curCycle());
826
827 if (!coalescedTable.count(line_addr)) {
828 // If there is no outstanding request for this line address,
829 // create a new coalesced request and issue it immediately.
830 auto reqList = std::deque<CoalescedRequest*> { creq };
831 coalescedTable.insert(std::make_pair(line_addr, reqList));
832 if (!coalescedReqs.count(seqNum)) {
833 coalescedReqs.insert(std::make_pair(seqNum, reqList));
834 } else {
835 coalescedReqs.at(seqNum).push_back(creq);
836 }
837 } else {
838 // The request is for a line address that is already outstanding
839 // but for a different instruction. Add it as a new request to be
840 // issued when the current outstanding request is completed.
841 coalescedTable.at(line_addr).push_back(creq);
842 DPRINTF(GPUCoalescer, "found address 0x%X with new seqNum %d\n",
843 line_addr, seqNum);
844 }
845
846 // In both cases, requests are added to the coalescing table and will
847 // be counted as outstanding requests.
848 m_outstanding_count++;
849
850 // We track all issued or to-be-issued Ruby requests associated with
851 // write instructions. An instruction may have multiple Ruby
852 // requests.
853 if (pkt->cmd == MemCmd::WriteReq) {
854 DPRINTF(GPUCoalescer, "adding write inst %d at line 0x%x to"
855 " the pending write instruction list\n", seqNum,
856 line_addr);
857
858 RubyPort::SenderState* ss =
859 safe_cast<RubyPort::SenderState*>(pkt->senderState);
860
861 // we need to save this port because it will be used to call
862 // back the requesting CU when we receive write
863 // complete callbacks for all issued Ruby requests of this
864 // instruction.
865 RubyPort::MemResponsePort* mem_response_port = ss->port;
866
867 GPUDynInstPtr gpuDynInst = nullptr;
868
869 if (!m_usingRubyTester) {
870 // If this coalescer is connected to a real CU, we need
871 // to save the corresponding gpu dynamic instruction.
872 // CU will use that instruction to decrement wait counters
873 // in the issuing wavefront.
874 // For Ruby tester, gpuDynInst == nullptr
875 gpuDynInst = getDynInst(pkt);
876 }
877
878 PendingWriteInst& inst = pendingWriteInsts[seqNum];
879 inst.addPendingReq(mem_response_port, gpuDynInst,
880 m_usingRubyTester);
881 }
882
883 return true;
884 }
885
886 // The maximum number of outstanding requests has been issued.
887 return false;
888}
889
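// Issue-event handler scheduled by makeRequest() and completeHitCallback().
// Examines up to coalescingWindow instructions per cycle, issues any newly
// formed coalesced requests, and releases resources for instructions whose
// packets have all been coalesced.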
890void
891GPUCoalescer::completeIssue()
892{
893 // Iterate over the maximum number of instructions we can coalesce
894 // per cycle (coalescingWindow).
895 for (int instIdx = 0; instIdx < coalescingWindow; ++instIdx) {
896 PerInstPackets *pkt_list =
897 uncoalescedTable.getInstPackets(instIdx);
898
899 // getInstPackets will return nullptr if no instruction
900 // exists at the current offset.
901 if (!pkt_list) {
902 break;
903 } else if (pkt_list->empty()) {
904 // Found something, but it has not been cleaned up by update
905 // resources yet. See if there is anything else to coalesce.
906 // Assume we can't check anymore if the coalescing window is 1.
907 continue;
908 } else {
909 // All packets in the list have the same seqNum, use first.
910 InstSeqNum seq_num = pkt_list->front()->req->getReqInstSeqNum();
911
912 // The difference in list size before and after tells us the
913 // number of packets which were coalesced.
914 size_t pkt_list_size = pkt_list->size();
915
916 // Since we have a pointer to the list of packets in the inst,
917 // erase them from the list if coalescing is successful and
918 // leave them in the list otherwise. This aggressively attempts
919 // to coalesce as many packets as possible from the current inst.
920 pkt_list->remove_if(
921 [&](PacketPtr pkt) { return coalescePacket(pkt); }
922 );
923
924 if (coalescedReqs.count(seq_num)) {
925 auto& creqs = coalescedReqs.at(seq_num);
926 for (auto creq : creqs) {
927 DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n",
928 RubyRequestType_to_string(creq->getRubyType()),
929 seq_num);
930 issueRequest(creq);
931 }
932 coalescedReqs.erase(seq_num);
933 }
934
935 assert(pkt_list_size >= pkt_list->size());
936 size_t pkt_list_diff = pkt_list_size - pkt_list->size();
937
938 int num_remaining = uncoalescedTable.getPacketsRemaining(seq_num);
939 num_remaining -= pkt_list_diff;
940 assert(num_remaining >= 0);
941
942 uncoalescedTable.setPacketsRemaining(seq_num, num_remaining);
944 "Coalesced %d pkts for seqNum %d, %d remaining\n",
945 pkt_list_diff, seq_num, num_remaining);
946 }
947 }
948
949 // Clean up any instructions in the uncoalesced table that have had
950 // all of their packets coalesced and return a token for that column.
951 uncoalescedTable.updateResources();
952
953 // have Kernel End releases been issued this cycle
954 int len = newKernelEnds.size();
955 for (int i = 0; i < len; i++) {
956 kernelCallback(newKernelEnds[i]);
957 }
958 newKernelEnds.clear();
959}
960
961void
962GPUCoalescer::evictionCallback(Addr address)
963{
964 ruby_eviction_callback(address);
965}
966
967void
968GPUCoalescer::kernelCallback(int wavefront_id)
969{
970 assert(kernelEndList.count(wavefront_id));
971
972 ruby_hit_callback(kernelEndList[wavefront_id]);
973
974 kernelEndList.erase(wavefront_id);
975}
976
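// Response path for atomic operations. Unlike readCallback(), only the
// request at the head of the coalesced list is completed per callback.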
977void
978GPUCoalescer::atomicCallback(Addr address,
979 MachineType mach,
980 const DataBlock& data)
981{
982 assert(address == makeLineAddress(address));
983 assert(coalescedTable.count(address));
984
985 auto crequest = coalescedTable.at(address).front();
986
987 fatal_if((crequest->getRubyType() != RubyRequestType_ATOMIC &&
988 crequest->getRubyType() != RubyRequestType_ATOMIC_RETURN &&
989 crequest->getRubyType() != RubyRequestType_ATOMIC_NO_RETURN),
990 "atomicCallback saw non-atomic type response\n");
991
992 hitCallback(crequest, mach, (DataBlock&)data, true,
993 crequest->getIssueTime(), Cycles(0), Cycles(0), false, false);
994
995 delete crequest;
996 coalescedTable.at(address).pop_front();
997
998 if (coalescedTable.at(address).empty()) {
999 coalescedTable.erase(address);
1000 } else {
1001 auto nextRequest = coalescedTable.at(address).front();
1002 issueRequest(nextRequest);
1003 }
1004}
1005
1006void
1007GPUCoalescer::completeHitCallback(std::vector<PacketPtr>& mylist)
1008{
1009 for (auto& pkt : mylist) {
1010 // When Ruby is in warmup or cooldown phase, the requests come
1011 // from the cache recorder. They do not track which port to use
1012 // and do not need to send the response back
1013 if (!m_ruby_system->getWarmupEnabled()
1014 && !m_ruby_system->getCooldownEnabled()) {
1015 RubyPort::SenderState *ss =
1016 safe_cast<RubyPort::SenderState *>(pkt->senderState);
1017 MemResponsePort *port = ss->port;
1018 assert(port != NULL);
1019
1020 pkt->senderState = ss->predecessor;
1021
1022 if (pkt->cmd != MemCmd::WriteReq) {
1023 // for WriteReq, we keep the original senderState until
1024 // writeCompleteCallback
1025 delete ss;
1026 }
1027
1028 port->hitCallback(pkt);
1029 trySendRetries();
1030 }
1031 }
1032
1033 // We schedule an event in the same tick as hitCallback (similar to
1034 // makeRequest) rather than calling completeIssue directly to reduce
1035 // function calls to complete issue. This can only happen if the max
1036 // outstanding requests is less than the number of slots in the
1037 // uncoalesced table and makeRequest is not called again.
1038 if (uncoalescedTable.packetAvailable() && !issueEvent.scheduled()) {
1039 schedule(issueEvent, curTick());
1040 }
1041
1042 RubySystem *rs = m_ruby_system;
1043 if (m_ruby_system->getWarmupEnabled()) {
1044 rs->m_cache_recorder->enqueueNextFetchRequest();
1045 } else if (m_ruby_system->getCooldownEnabled()) {
1046 rs->m_cache_recorder->enqueueNextFlushRequest();
1047 } else {
1048 testDrainComplete();
1049 }
1050}
1051
1052void
1053GPUCoalescer::recordStats(CoalescedRequest* crequest,
1054 MachineType mach,
1055 Cycles initialRequestTime,
1056 Cycles forwardRequestTime,
1057 Cycles firstResponseTime,
1058 bool isRegion, bool mshrHitUnderMiss)
1059{
1060 RubyRequestType type = crequest->getRubyType();
1061
1062 if (mshrHitUnderMiss) {
1063 // Add the number of mshr hits under misses to the
1064 // TCP demand hits stat.
1065 // We don't need to profile misses since they will be
1066 // profiled at the TCP. Only the MSHR hits under misses
1067 // need to be profiled here
1068 PacketPtr pkt = crequest->getFirstPkt();
1069 if (!pkt->isGLCSet() &&
1070 !pkt->isSLCSet()) {
1071 m_dataCache_ptr->profileDemandHit();
1072 }
1073
1074 // Since the request hit in the mshr, update mshr stats
1075 if (type == RubyRequestType_LD) {
1076 stats.m_mshr_ld_hits_under_miss++;
1077 }
1078 } else {
1079 if (type == RubyRequestType_LD) {
1080 stats.m_mshr_ld_misses++;
1081 } else {
1082 stats.m_mshr_st_misses++;
1083 }
1084 }
1085}
1086
1087GPUCoalescer::GPUCoalescerStats::GPUCoalescerStats(statistics::Group *parent)
1088 : statistics::Group(parent),
1089 ADD_STAT(m_mshr_ld_hits_under_miss,
1090 "Number of load requests that hit in the coalescer MSHR"),
1091 ADD_STAT(m_mshr_ld_misses,
1092 "Number of load requests that miss in the coalescer MSHR"),
1093 ADD_STAT(m_mshr_st_misses,
1094 "Number of store requests that miss in the coalescer MSHR"),
1096 "Number of mshr accesses",
1099{
1100}
1101
1102} // namespace ruby
1103} // namespace gem5