GPUCoalescer.cc (gem5 v24.1.0.1)
/*
 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "mem/ruby/system/GPUCoalescer.hh"

#include "base/compiler.hh"
#include "base/logging.hh"
#include "base/str.hh"
#include "config/the_gpu_isa.hh"
#include "debug/GPUCoalescer.hh"
#include "debug/MemoryAccess.hh"
#include "debug/ProtocolTrace.hh"
#include "debug/RubyHitMiss.hh"
#include "debug/RubyPort.hh"
#include "debug/RubyStats.hh"
#include "gpu-compute/shader.hh"
#include "mem/packet.hh"
#include "mem/ruby/common/DataBlock.hh"
#include "mem/ruby/common/SubBlock.hh"
#include "mem/ruby/network/MessageBuffer.hh"
#include "mem/ruby/profiler/Profiler.hh"
#include "mem/ruby/slicc_interface/AbstractController.hh"
#include "mem/ruby/slicc_interface/RubyRequest.hh"
#include "mem/ruby/structures/CacheMemory.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "params/RubyGPUCoalescer.hh"

namespace gem5
{

namespace ruby
{

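// The UncoalescedTable buffers incoming packets per dynamic instruction
// (keyed by instruction sequence number) until the coalescer picks them
// up. instPktsRemaining tracks how many packets are still expected for
// each instruction so that resources (e.g., GM tokens) can be released
// once every packet of the instruction has been coalesced.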
UncoalescedTable::UncoalescedTable(GPUCoalescer *gc)
    : coalescer(gc)
{
}

void
UncoalescedTable::insertPacket(PacketPtr pkt)
{
    uint64_t seqNum = pkt->req->getReqInstSeqNum();

    instMap[seqNum].push_back(pkt);
    DPRINTF(GPUCoalescer, "Adding 0x%X seqNum %d to map. (map %d vec %d)\n",
            pkt->getAddr(), seqNum, instMap.size(), instMap[seqNum].size());
}

void
UncoalescedTable::insertReqType(PacketPtr pkt, RubyRequestType type)
{
    uint64_t seqNum = pkt->req->getReqInstSeqNum();

    reqTypeMap[seqNum] = type;
}

bool
UncoalescedTable::packetAvailable()
{
    return !instMap.empty();
}

void
UncoalescedTable::initPacketsRemaining(InstSeqNum seqNum, int count)
{
    if (!instPktsRemaining.count(seqNum)) {
        instPktsRemaining[seqNum] = count;
    }
}

int
UncoalescedTable::getPacketsRemaining(InstSeqNum seqNum)
{
    return instPktsRemaining[seqNum];
}

void
UncoalescedTable::setPacketsRemaining(InstSeqNum seqNum, int count)
{
    instPktsRemaining[seqNum] = count;
}

PerInstPackets*
UncoalescedTable::getInstPackets(int offset)
{
    if (offset >= instMap.size()) {
        return nullptr;
    }

    auto instMapIter = instMap.begin();
    std::advance(instMapIter, offset);

    return &(instMapIter->second);
}
123
124void
126{
127 for (auto iter = instMap.begin(); iter != instMap.end(); ) {
128 InstSeqNum seq_num = iter->first;
129 DPRINTF(GPUCoalescer, "%s checking remaining pkts for %d\n",
130 coalescer->name().c_str(), seq_num);
131 assert(instPktsRemaining.count(seq_num));
132
133 if (instPktsRemaining[seq_num] == 0) {
134 assert(iter->second.empty());
135
136 // Remove from both maps
137 instMap.erase(iter++);
138 instPktsRemaining.erase(seq_num);
139
140 // Release the token if the Ruby system is not in cooldown
141 // or warmup phases. When in these phases, the RubyPorts
142 // are accessed directly using the makeRequest() command
143 // instead of accessing through the port. This makes
144 // sending tokens through the port unnecessary
147 if (reqTypeMap[seq_num] != RubyRequestType_FLUSH) {
149 "Returning token seqNum %d\n", seq_num);
151 }
152 }
153
154 reqTypeMap.erase(seq_num);
155 } else {
156 ++iter;
157 }
158 }
159}
160
bool
UncoalescedTable::areRequestsDone(const uint64_t instSeqNum) {
    // iterate the instructions held in UncoalescedTable to see whether there
    // are more requests to issue; if yes, not yet done; otherwise, done
    for (auto& inst : instMap) {
        DPRINTF(GPUCoalescer, "instSeqNum= %d, pending packets=%d\n",
                inst.first, inst.second.size());
        if (inst.first == instSeqNum) { return false; }
    }

    return true;
}

void
UncoalescedTable::printRequestTable(std::stringstream& ss)
{
    ss << "Listing pending packets from " << instMap.size() << " instructions";

    for (auto& inst : instMap) {
        ss << "\tAddr: " << coalescer->printAddress(inst.first) << " with "
           << inst.second.size() << " pending packets" << std::endl;
    }
}

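// Walk every pending packet and panic if any has been waiting longer than
// the supplied threshold; this indicates a likely protocol deadlock.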
void
UncoalescedTable::checkDeadlock(Tick threshold)
{
    Tick current_time = curTick();

    for (auto &it : instMap) {
        for (auto &pkt : it.second) {
            if (current_time - pkt->req->time() > threshold) {
                std::stringstream ss;
                printRequestTable(ss);

                panic("Possible Deadlock detected. Aborting!\n"
                      "version: %d request.paddr: 0x%x uncoalescedTable: %d "
                      "current time: %u issue_time: %d difference: %d\n"
                      "Request Tables:\n\n%s", coalescer->getId(),
                      pkt->getAddr(), instMap.size(), current_time,
                      pkt->req->time(), current_time - pkt->req->time(),
                      ss.str());
            }
        }
    }
}

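// The coalescer is configured through RubyGPUCoalescerParams: the number of
// instructions considered for coalescing each cycle (max_coalesces_per_cycle),
// the cap on outstanding requests, and the deadlock-detection threshold.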
GPUCoalescer::GPUCoalescer(const Params &p)
    : RubyPort(p),
      issueEvent([this]{ completeIssue(); }, "Issue coalesced request",
                 false, Event::Progress_Event_Pri),
      uncoalescedTable(this),
      deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check"),
      gmTokenPort(name() + ".gmTokenPort")
{
    m_store_waiting_on_load_cycles = 0;
    m_store_waiting_on_store_cycles = 0;
    m_load_waiting_on_store_cycles = 0;
    m_load_waiting_on_load_cycles = 0;

    m_outstanding_count = 0;

    coalescingWindow = p.max_coalesces_per_cycle;

    m_max_outstanding_requests = 0;
    m_instCache_ptr = nullptr;
    m_dataCache_ptr = nullptr;

    m_instCache_ptr = p.icache;
    m_dataCache_ptr = p.dcache;
    m_max_outstanding_requests = p.max_outstanding_requests;
    m_deadlock_threshold = p.deadlock_threshold;

    assert(m_max_outstanding_requests > 0);
    assert(m_deadlock_threshold > 0);
    assert(m_instCache_ptr);
    assert(m_dataCache_ptr);

    m_runningGarnetStandalone = p.garnet_standalone;

    // These statistical variables are not for display.
    // The profiler will collate these across different
    // coalescers and display those collated statistics.
    m_outstandReqHist.init(10);
    m_latencyHist.init(10);
    m_missLatencyHist.init(10);

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist.push_back(new statistics::Histogram());
        m_typeLatencyHist[i]->init(10);

        m_missTypeLatencyHist.push_back(new statistics::Histogram());
        m_missTypeLatencyHist[i]->init(10);
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist.push_back(new statistics::Histogram());
        m_missMachLatencyHist[i]->init(10);

        m_IssueToInitialDelayHist.push_back(new statistics::Histogram());
        m_IssueToInitialDelayHist[i]->init(10);

        m_InitialToForwardDelayHist.push_back(new statistics::Histogram());
        m_InitialToForwardDelayHist[i]->init(10);

        m_ForwardToFirstResponseDelayHist.push_back(
            new statistics::Histogram());
        m_ForwardToFirstResponseDelayHist[i]->init(10);

        m_FirstResponseToCompletionDelayHist.push_back(
            new statistics::Histogram());
        m_FirstResponseToCompletionDelayHist[i]->init(10);
    }

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_missTypeMachLatencyHist.push_back(
            std::vector<statistics::Histogram *>());

        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i].push_back(
                new statistics::Histogram());
            m_missTypeMachLatencyHist[i][j]->init(10);
        }
    }
}

GPUCoalescer::~GPUCoalescer()
{
}

Port &
GPUCoalescer::getPort(const std::string &if_name, PortID idx)
{
    if (if_name == "gmTokenPort") {
        return gmTokenPort;
    }

    // delegate to RubyPort otherwise
    return RubyPort::getPort(if_name, idx);
}

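// Periodic deadlock check: scan both the coalesced and uncoalesced tables
// for requests older than m_deadlock_threshold, and reschedule the check
// while requests remain outstanding.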
void
GPUCoalescer::wakeup()
{
    Cycles current_time = curCycle();
    for (auto& requestList : coalescedTable) {
        for (auto& req : requestList.second) {
            if (current_time - req->getIssueTime() > m_deadlock_threshold) {
                std::stringstream ss;
                printRequestTable(ss);
                warn("GPUCoalescer %d Possible deadlock detected!\n%s\n",
                     m_version, ss.str());
                panic("Aborting due to deadlock!\n");
            }
        }
    }

    Tick tick_threshold = cyclesToTicks(m_deadlock_threshold);
    uncoalescedTable.checkDeadlock(tick_threshold);

    if (m_outstanding_count > 0) {
        schedule(deadlockCheckEvent,
                 m_deadlock_threshold * clockPeriod() +
                 curTick());
    }
}

void
GPUCoalescer::printRequestTable(std::stringstream& ss)
{
    ss << "Printing out " << coalescedTable.size()
       << " outstanding requests in the coalesced table\n";

    for (auto& requestList : coalescedTable) {
        for (auto& request : requestList.second) {
            ss << "\tAddr: " << printAddress(requestList.first) << "\n"
               << "\tInstruction sequence number: "
               << request->getSeqNum() << "\n"
               << "\t\tType: "
               << RubyRequestType_to_string(request->getRubyType()) << "\n"
               << "\t\tNumber of associated packets: "
               << request->getPackets().size() << "\n"
               << "\t\tIssue time: "
               << request->getIssueTime() * clockPeriod() << "\n"
               << "\t\tDifference from current tick: "
               << (curCycle() - request->getIssueTime()) * clockPeriod()
               << "\n";
        }
    }

    // print out packets waiting to be issued in uncoalesced table
    uncoalescedTable.printRequestTable(ss);
}

void
GPUCoalescer::resetStats()
{
    m_latencyHist.reset();
    m_missLatencyHist.reset();
    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist[i]->reset();
        m_missTypeLatencyHist[i]->reset();
        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i][j]->reset();
        }
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist[i]->reset();

        m_IssueToInitialDelayHist[i]->reset();
        m_InitialToForwardDelayHist[i]->reset();
        m_ForwardToFirstResponseDelayHist[i]->reset();
        m_FirstResponseToCompletionDelayHist[i]->reset();
    }
}

void
GPUCoalescer::printProgress(std::ostream& out) const
{
}

// sets the kernelEndList
void
GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
{
    // It's unclear whether this can happen, but be careful so that it
    // does not become a simulator hang in the future
    DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
    assert(kernelEndList.count(wavefront_id) == 0);

    kernelEndList[wavefront_id] = pkt;
    DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
            kernelEndList.size());
}

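// The writeCallback overloads below successively fill in default timing
// arguments and funnel into the final overload, which completes the oldest
// coalesced request for the line and issues the next one, if any.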
void
GPUCoalescer::writeCallback(Addr address, DataBlock& data)
{
    writeCallback(address, MachineType_NULL, data);
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data)
{
    writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime)
{
    writeCallback(address, mach, data,
                  initialRequestTime, forwardRequestTime, firstResponseTime,
                  false);
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime,
                            bool isRegion)
{
    assert(address == makeLineAddress(address));
    assert(coalescedTable.count(address));

    auto crequest = coalescedTable.at(address).front();

    hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
                forwardRequestTime, firstResponseTime, isRegion, false);

    // remove this crequest in coalescedTable
    delete crequest;
    coalescedTable.at(address).pop_front();

    if (coalescedTable.at(address).empty()) {
        coalescedTable.erase(address);
    } else {
        auto nextRequest = coalescedTable.at(address).front();
        issueRequest(nextRequest);
    }
}

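// Called once per issued Ruby request of a write instruction. The CU is
// only notified (via ackWriteCompletion) after every request of the
// instruction has been issued and has received its completion ack.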
void
GPUCoalescer::writeCompleteCallback(Addr address,
                                    uint64_t instSeqNum,
                                    MachineType mach)
{
    DPRINTF(GPUCoalescer, "writeCompleteCallback for address 0x%x"
            " instSeqNum = %d\n", address, instSeqNum);

    assert(pendingWriteInsts.count(instSeqNum) == 1);
    PendingWriteInst& inst = pendingWriteInsts[instSeqNum];

    // check the uncoalescedTable to see whether all requests for the inst
    // have been issued or not
    bool reqsAllIssued = uncoalescedTable.areRequestsDone(instSeqNum);
    DPRINTF(GPUCoalescer, "instSeqNum = %d, pendingStores=%d, "
            "reqsAllIssued=%d\n", instSeqNum,
            inst.getNumPendingStores()-1, reqsAllIssued);

    if (inst.receiveWriteCompleteAck() && reqsAllIssued) {
        // if the pending write instruction has received all write completion
        // callbacks for its issued Ruby requests, we can now respond to the
        // requesting CU in one response packet.
        inst.ackWriteCompletion(m_usingRubyTester);

        DPRINTF(GPUCoalescer, "write inst %d completed at coalescer\n",
                instSeqNum);
        pendingWriteInsts.erase(instSeqNum);
    }
}

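// As with the write path, the readCallback overloads funnel into the final
// overload, which pops the oldest coalesced request for the line; only
// LD-type requests may complete through this path.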
void
GPUCoalescer::readCallback(Addr address, DataBlock& data)
{
    readCallback(address, MachineType_NULL, data, false);
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           bool externalHit = false)
{
    readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0),
                 externalHit);
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime,
                           bool externalHit = false)
{
    readCallback(address, mach, data,
                 initialRequestTime, forwardRequestTime, firstResponseTime,
                 false, externalHit);
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime,
                           bool isRegion,
                           bool externalHit = false)
{
    assert(address == makeLineAddress(address));
    assert(coalescedTable.count(address));

    auto crequest = coalescedTable.at(address).front();
    fatal_if(crequest->getRubyType() != RubyRequestType_LD,
             "readCallback received non-read type response\n");

    hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
                forwardRequestTime, firstResponseTime, isRegion, externalHit);

    delete crequest;
    coalescedTable.at(address).pop_front();
    if (coalescedTable.at(address).empty()) {
        coalescedTable.erase(address);
    } else {
        auto nextRequest = coalescedTable.at(address).front();
        issueRequest(nextRequest);
    }
}

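// hitCallback distributes the line data (or, for atomics, entries popped
// from the atomic log) to every packet coalesced into the request, then
// hands the packets back to their response ports.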
void
GPUCoalescer::hitCallback(CoalescedRequest* crequest,
                          MachineType mach,
                          DataBlock& data,
                          bool success,
                          Cycles initialRequestTime,
                          Cycles forwardRequestTime,
                          Cycles firstResponseTime,
                          bool isRegion,
                          bool externalHit = false)
{
    PacketPtr pkt = crequest->getFirstPkt();
    Addr request_address = pkt->getAddr();
    [[maybe_unused]] Addr request_line_address =
        makeLineAddress(request_address);

    RubyRequestType type = crequest->getRubyType();

    DPRINTF(GPUCoalescer, "Got hitCallback for 0x%X\n", request_line_address);

    DPRINTF(RubyHitMiss, "GPU TCP Cache %s at %#x\n",
            externalHit ? "hit" : "miss",
            printAddress(request_address));

    recordMissLatency(crequest, mach,
                      initialRequestTime,
                      forwardRequestTime,
                      firstResponseTime,
                      success, isRegion);
    // update the data
    //
    // NOTE: this must be done for every request in the coalescer
    std::vector<PacketPtr> pktList = crequest->getPackets();

    uint8_t* log = nullptr;
    DPRINTF(GPUCoalescer, "Responding to %d packets for addr 0x%X\n",
            pktList.size(), request_line_address);
    uint32_t offset;
    int pkt_size;
    for (auto& pkt : pktList) {
        offset = getOffset(pkt->getAddr());
        pkt_size = pkt->getSize();
        request_address = pkt->getAddr();

        // When the Ruby system is in cooldown phase, the requests come from
        // the cache recorder. These requests do not get coalesced and
        // do not return valid data.
        if (m_ruby_system->getCooldownEnabled())
            continue;

        if (pkt->getPtr<uint8_t>()) {
            switch(type) {
            // Store and AtomicNoReturns follow the same path, as the
            // data response is not needed.
            case RubyRequestType_ATOMIC_NO_RETURN:
                assert(pkt->isAtomicOp());
                break;
            case RubyRequestType_ST:
                break;
            case RubyRequestType_LD:
                pkt->setData(data.getData(offset, pkt_size));
                break;
            case RubyRequestType_ATOMIC_RETURN:
                assert(pkt->isAtomicOp());
                // Atomic operations are performed by the WriteMask
                // in packet order, set by the crequest. Thus, when
                // unpacking the changes from the log, we read from
                // the front of the log to correctly map response
                // data into the packets.

                // Log entry contains the old value before the current
                // atomic operation occurred.
                log = data.popAtomicLogEntryFront();
                pkt->setData(&log[offset]);
                delete [] log;
                log = nullptr;
                break;
            default:
                panic("Unsupported ruby packet type:%s\n",
                      RubyRequestType_to_string(type));
                break;
            }
        } else {
            DPRINTF(MemoryAccess,
                    "WARNING. Data not transferred from Ruby to M5 for type "
                    "%s\n",
                    RubyRequestType_to_string(type));
        }
    }
    assert(data.numAtomicLogEntries() == 0);

    m_outstanding_count--;
    assert(m_outstanding_count >= 0);

    completeHitCallback(pktList);
}

bool
GPUCoalescer::empty() const
{
    return coalescedTable.empty();
}

RubyRequestType
GPUCoalescer::getRequestType(PacketPtr pkt)
{
    RubyRequestType req_type = RubyRequestType_NULL;

    // These types are not supported or not used in GPU caches.
    assert(!pkt->req->isLLSC());
    assert(!pkt->req->isLockedRMW());
    assert(!pkt->req->isInstFetch());

    if (pkt->req->isAtomicReturn()) {
        req_type = RubyRequestType_ATOMIC_RETURN;
    } else if (pkt->req->isAtomicNoReturn()) {
        req_type = RubyRequestType_ATOMIC_NO_RETURN;
    } else if (pkt->isRead()) {
        req_type = RubyRequestType_LD;
    } else if (pkt->isWrite()) {
        req_type = RubyRequestType_ST;
    } else if (pkt->isFlush()) {
        req_type = RubyRequestType_FLUSH;
    } else {
        panic("Unsupported ruby packet type\n");
    }

    return req_type;
}

// Places an uncoalesced packet in uncoalescedTable. If the packet is a
// special type (MemFence, scoping, etc), it is issued immediately.
RequestStatus
GPUCoalescer::makeRequest(PacketPtr pkt)
{
    if (pkt->cmd == MemCmd::MemSyncReq) {
        // issue mem_sync requests immediately to the cache system without
        // going through uncoalescedTable like normal LD/ST/Atomic requests
        issueMemSyncRequest(pkt);
    } else {
        // all packets must have valid instruction sequence numbers
        assert(pkt->req->hasInstSeqNum());

        // otherwise, this must be either a read or write command
        assert(pkt->isRead() || pkt->isWrite() || pkt->isFlush());

        InstSeqNum seq_num = pkt->req->getReqInstSeqNum();

        // in the case of the protocol tester, there is one packet per
        // sequence number. The number of packets during simulation depends
        // on the number of lanes active for that vmem request (i.e., the
        // popcount of the exec_mask).
        int num_packets = 1;

        // When Ruby is in warmup or cooldown phase, the requests come from
        // the cache recorder. There is no dynamic instruction associated
        // with these requests either
        if (!m_ruby_system->getWarmupEnabled() &&
            !m_ruby_system->getCooldownEnabled()) {
            if (!m_usingRubyTester) {
                num_packets = 0;
                for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) {
                    num_packets += getDynInst(pkt)->getLaneStatus(i);
                }
            }
        }

        // the pkt is temporarily stored in the uncoalesced table until
        // it's picked for coalescing process later in this cycle or in a
        // future cycle. Packets remaining is set to the number of expected
        // requests from the instruction based on its exec_mask.
        uncoalescedTable.insertPacket(pkt);
        uncoalescedTable.insertReqType(pkt, getRequestType(pkt));
        uncoalescedTable.initPacketsRemaining(seq_num, num_packets);
        DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
                pkt->getAddr());

        // we schedule an issue event here to process the uncoalesced table
        // and try to issue Ruby requests to the cache system
        if (!issueEvent.scheduled()) {
            DPRINTF(GPUCoalescer, "Scheduled issueEvent for seqNum %d\n",
                    seq_num);
            schedule(issueEvent, curTick());
        }
    }

    // we always return RequestStatus_Issued in this coalescer
    // b/c the coalescer's resources were checked earlier and the coalescer
    // is queueing up aliased requests in its coalesced table
    return RequestStatus_Issued;
}
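// A minimal sketch of the calling convention (hypothetical caller code; the
// real callers are the CU memory pipelines and the protocol tester):
//
//     RequestPtr req = std::make_shared<Request>(addr, size, 0, requestorId);
//     req->setReqInstSeqNum(seqNum);   // required: makeRequest() asserts
//                                      // hasInstSeqNum() for non-sync pkts
//     PacketPtr pkt = Packet::createRead(req);
//     coalescer->makeRequest(pkt);     // always returns RequestStatus_Issued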

template <class KEY, class VALUE>
std::ostream &
operator<<(std::ostream &out, const std::unordered_map<KEY, VALUE> &map)
{
    out << "[";
    for (auto i = map.begin(); i != map.end(); ++i)
        out << " " << i->first << "=" << i->second;
    out << " ]";

    return out;
}

void
GPUCoalescer::print(std::ostream& out) const
{
    out << "[GPUCoalescer: " << m_version
        << ", outstanding requests: " << m_outstanding_count
        << "]";
}

GPUDynInstPtr
GPUCoalescer::getDynInst(PacketPtr pkt) const
{
    RubyPort::SenderState* ss =
        safe_cast<RubyPort::SenderState*>(pkt->senderState);

    ComputeUnit::DataPort::SenderState* cu_state =
        safe_cast<ComputeUnit::DataPort::SenderState*>
            (ss->predecessor);

    return cu_state->_gpuDynInst;
}

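// Attempt to coalesce a packet into an existing request. Two packets can
// coalesce only if they target the same cache line and carry the same
// instruction sequence number; otherwise a new CoalescedRequest is created,
// subject to the outstanding-request limit.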
bool
GPUCoalescer::coalescePacket(PacketPtr pkt)
{
    uint64_t seqNum = pkt->req->getReqInstSeqNum();
    Addr line_addr = makeLineAddress(pkt->getAddr());

    // If the packet has the same line address as a request already in the
    // coalescedTable and has the same sequence number, it can be coalesced.
    if (coalescedTable.count(line_addr)) {
        // Search for a previous coalesced request with the same seqNum.
        auto& creqQueue = coalescedTable.at(line_addr);
        auto citer = std::find_if(creqQueue.begin(), creqQueue.end(),
            [&](CoalescedRequest* c) { return c->getSeqNum() == seqNum; }
        );
        if (citer != creqQueue.end()) {
            (*citer)->insertPacket(pkt);
            return true;
        }
    }

    if (m_outstanding_count < m_max_outstanding_requests) {
        // This is an "aliased" or new request. Create a RubyRequest and
        // append it to the list of "targets" in the coalescing table.
        DPRINTF(GPUCoalescer, "Creating new or aliased request for 0x%X\n",
                line_addr);

        CoalescedRequest *creq = new CoalescedRequest(seqNum);
        creq->insertPacket(pkt);
        creq->setRubyType(getRequestType(pkt));
        creq->setIssueTime(curCycle());

        if (!coalescedTable.count(line_addr)) {
            // If there is no outstanding request for this line address,
            // create a new coalesced request and issue it immediately.
            auto reqList = std::deque<CoalescedRequest*> { creq };
            coalescedTable.insert(std::make_pair(line_addr, reqList));
            if (!coalescedReqs.count(seqNum)) {
                coalescedReqs.insert(std::make_pair(seqNum, reqList));
            } else {
                coalescedReqs.at(seqNum).push_back(creq);
            }
        } else {
            // The request is for a line address that is already outstanding
            // but for a different instruction. Add it as a new request to be
            // issued when the current outstanding request is completed.
            coalescedTable.at(line_addr).push_back(creq);
            DPRINTF(GPUCoalescer, "found address 0x%X with new seqNum %d\n",
                    line_addr, seqNum);
        }

        // In both cases, requests are added to the coalescing table and will
        // be counted as outstanding requests.
        m_outstanding_count++;

        // We track all issued or to-be-issued Ruby requests associated with
        // write instructions. An instruction may have multiple Ruby
        // requests.
        if (pkt->cmd == MemCmd::WriteReq) {
            DPRINTF(GPUCoalescer, "adding write inst %d at line 0x%x to"
                    " the pending write instruction list\n", seqNum,
                    line_addr);

            RubyPort::SenderState* ss =
                safe_cast<RubyPort::SenderState*>(pkt->senderState);

            // we need to save this port because it will be used to call
            // back the requesting CU when we receive write
            // complete callbacks for all issued Ruby requests of this
            // instruction.
            RubyPort::MemResponsePort* mem_response_port = ss->port;

            GPUDynInstPtr gpuDynInst = nullptr;

            if (!m_usingRubyTester) {
                // If this coalescer is connected to a real CU, we need
                // to save the corresponding gpu dynamic instruction.
                // CU will use that instruction to decrement wait counters
                // in the issuing wavefront.
                // For Ruby tester, gpuDynInst == nullptr
                gpuDynInst = getDynInst(pkt);
            }

            PendingWriteInst& inst = pendingWriteInsts[seqNum];
            inst.addPendingReq(mem_response_port, gpuDynInst,
                               m_usingRubyTester);
        }

        return true;
    }

    // The maximum number of outstanding requests has been reached.
    return false;
}

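// Scan up to coalescingWindow instructions from the uncoalesced table each
// cycle, greedily coalescing as many of their packets as possible and
// issuing any newly created requests to the cache system.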
void
GPUCoalescer::completeIssue()
{
    // Iterate over the maximum number of instructions we can coalesce
    // per cycle (coalescingWindow).
    for (int instIdx = 0; instIdx < coalescingWindow; ++instIdx) {
        PerInstPackets *pkt_list =
            uncoalescedTable.getInstPackets(instIdx);

        // getInstPackets will return nullptr if no instruction
        // exists at the current offset.
        if (!pkt_list) {
            break;
        } else if (pkt_list->empty()) {
            // Found something, but it has not been cleaned up by update
            // resources yet. See if there is anything else to coalesce.
            // Assume we can't check anymore if the coalescing window is 1.
            continue;
        } else {
            // All packets in the list have the same seqNum, use first.
            InstSeqNum seq_num = pkt_list->front()->req->getReqInstSeqNum();

            // The difference in list size before and after tells us the
            // number of packets which were coalesced.
            size_t pkt_list_size = pkt_list->size();

            // Since we have a pointer to the list of packets in the inst,
            // erase them from the list if coalescing is successful and
            // leave them in the list otherwise. This aggressively attempts
            // to coalesce as many packets as possible from the current inst.
            pkt_list->remove_if(
                [&](PacketPtr pkt) { return coalescePacket(pkt); }
            );

            if (coalescedReqs.count(seq_num)) {
                auto& creqs = coalescedReqs.at(seq_num);
                for (auto creq : creqs) {
                    DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n",
                            RubyRequestType_to_string(creq->getRubyType()),
                            seq_num);
                    issueRequest(creq);
                }
                coalescedReqs.erase(seq_num);
            }

            assert(pkt_list_size >= pkt_list->size());
            size_t pkt_list_diff = pkt_list_size - pkt_list->size();

            int num_remaining = uncoalescedTable.getPacketsRemaining(seq_num);
            num_remaining -= pkt_list_diff;
            assert(num_remaining >= 0);

            uncoalescedTable.setPacketsRemaining(seq_num, num_remaining);
            DPRINTF(GPUCoalescer,
                    "Coalesced %d pkts for seqNum %d, %d remaining\n",
                    pkt_list_diff, seq_num, num_remaining);
        }
    }

    // Clean up any instructions in the uncoalesced table that have had
    // all of their packets coalesced and return a token for that column.
    uncoalescedTable.updateResources();

    // have Kernel End releases been issued this cycle
    int len = newKernelEnds.size();
    for (int i = 0; i < len; i++) {
        kernelCallback(newKernelEnds[i]);
    }
    newKernelEnds.clear();
}

void
GPUCoalescer::evictionCallback(Addr address)
{
    ruby_eviction_callback(address);
}

void
GPUCoalescer::kernelCallback(int wavefront_id)
{
    assert(kernelEndList.count(wavefront_id));

    ruby_hit_callback(kernelEndList[wavefront_id]);

    kernelEndList.erase(wavefront_id);
}

void
GPUCoalescer::atomicCallback(Addr address,
                             MachineType mach,
                             const DataBlock& data)
{
    assert(address == makeLineAddress(address));
    assert(coalescedTable.count(address));

    auto crequest = coalescedTable.at(address).front();

    fatal_if((crequest->getRubyType() != RubyRequestType_ATOMIC &&
              crequest->getRubyType() != RubyRequestType_ATOMIC_RETURN &&
              crequest->getRubyType() != RubyRequestType_ATOMIC_NO_RETURN),
             "atomicCallback saw non-atomic type response\n");

    hitCallback(crequest, mach, (DataBlock&)data, true,
                crequest->getIssueTime(), Cycles(0), Cycles(0), false, false);

    delete crequest;
    coalescedTable.at(address).pop_front();

    if (coalescedTable.at(address).empty()) {
        coalescedTable.erase(address);
    } else {
        auto nextRequest = coalescedTable.at(address).front();
        issueRequest(nextRequest);
    }
}

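// Return completed packets to their response ports. During warmup/cooldown
// the packets come from the cache recorder instead, so the next fetch or
// flush request is enqueued rather than responding through a port.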
void
GPUCoalescer::completeHitCallback(std::vector<PacketPtr>& mylist)
{
    for (auto& pkt : mylist) {
        // When Ruby is in warmup or cooldown phase, the requests come
        // from the cache recorder. They do not track which port to use
        // and do not need to send the response back
        if (!m_ruby_system->getWarmupEnabled() &&
            !m_ruby_system->getCooldownEnabled()) {
            RubyPort::SenderState *ss =
                safe_cast<RubyPort::SenderState *>(pkt->senderState);
            MemResponsePort *port = ss->port;
            assert(port != NULL);

            pkt->senderState = ss->predecessor;

            if (pkt->cmd != MemCmd::WriteReq) {
                // for WriteReq, we keep the original senderState until
                // writeCompleteCallback
                delete ss;
            }

            port->hitCallback(pkt);
            trySendRetries();
        }
    }

    // We schedule an event in the same tick as hitCallback (similar to
    // makeRequest) rather than calling completeIssue directly to reduce
    // function calls to complete issue. This can only happen if the max
    // outstanding requests is less than the number of slots in the
    // uncoalesced table and makeRequest is not called again.
    if (uncoalescedTable.packetAvailable() && !issueEvent.scheduled()) {
        schedule(issueEvent, curTick());
    }

    RubySystem *rs = m_ruby_system;
    if (m_ruby_system->getWarmupEnabled()) {
        rs->m_cache_recorder->enqueueNextFetchRequest();
    } else if (m_ruby_system->getCooldownEnabled()) {
        rs->m_cache_recorder->enqueueNextFlushRequest();
    } else {
        testDrainComplete();
    }
}

void
GPUCoalescer::recordMissLatency(CoalescedRequest* crequest,
                                MachineType mach,
                                Cycles initialRequestTime,
                                Cycles forwardRequestTime,
                                Cycles firstResponseTime,
                                bool success, bool isRegion)
{
}

} // namespace ruby
} // namespace gem5