gem5  v22.1.0.0
GPUCoalescer.cc
/*
 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "mem/ruby/system/GPUCoalescer.hh"

#include "base/compiler.hh"
#include "base/logging.hh"
#include "base/str.hh"
#include "config/the_gpu_isa.hh"
#include "debug/GPUCoalescer.hh"
#include "debug/MemoryAccess.hh"
#include "debug/ProtocolTrace.hh"
#include "debug/RubyPort.hh"
#include "debug/RubyStats.hh"
#include "gpu-compute/shader.hh"
#include "mem/packet.hh"
#include "mem/ruby/common/DataBlock.hh"
#include "mem/ruby/common/SubBlock.hh"
#include "mem/ruby/network/MessageBuffer.hh"
#include "mem/ruby/profiler/Profiler.hh"
#include "mem/ruby/slicc_interface/AbstractController.hh"
#include "mem/ruby/slicc_interface/RubyRequest.hh"
#include "mem/ruby/structures/CacheMemory.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "params/RubyGPUCoalescer.hh"

namespace gem5
{

namespace ruby
{

UncoalescedTable::UncoalescedTable(GPUCoalescer *gc)
    : coalescer(gc)
{
}

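// Add the packet to the list of packets pending for its instruction's
// sequence number.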
void
UncoalescedTable::insertPacket(PacketPtr pkt)
{
    uint64_t seqNum = pkt->req->getReqInstSeqNum();

    instMap[seqNum].push_back(pkt);
    DPRINTF(GPUCoalescer, "Adding 0x%X seqNum %d to map. (map %d vec %d)\n",
            pkt->getAddr(), seqNum, instMap.size(), instMap[seqNum].size());
}

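// True while any instruction still has packets waiting to be coalesced.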
bool
UncoalescedTable::packetAvailable()
{
    return !instMap.empty();
}

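// Record how many memory requests this instruction is expected to issue;
// only the first call for a given sequence number takes effect.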
void
UncoalescedTable::initPacketsRemaining(InstSeqNum seqNum, int count)
{
    if (!instPktsRemaining.count(seqNum)) {
        instPktsRemaining[seqNum] = count;
    }
}

int
UncoalescedTable::getPacketsRemaining(InstSeqNum seqNum)
{
    return instPktsRemaining[seqNum];
}

void
UncoalescedTable::setPacketsRemaining(InstSeqNum seqNum, int count)
{
    instPktsRemaining[seqNum] = count;
}

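// Return the packet list for the instruction at the given position in the
// table (ordered by sequence number), or nullptr if the offset is out of
// range.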
PerInstPackets*
UncoalescedTable::getInstPackets(int offset)
{
    if (offset >= instMap.size()) {
        return nullptr;
    }

    auto instMapIter = instMap.begin();
    std::advance(instMapIter, offset);

    return &(instMapIter->second);
}

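// Retire instructions whose packets have all been coalesced, erasing them
// from both maps and returning one GM token per retired instruction.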
void
UncoalescedTable::updateResources()
{
    for (auto iter = instMap.begin(); iter != instMap.end(); ) {
        InstSeqNum seq_num = iter->first;
        DPRINTF(GPUCoalescer, "%s checking remaining pkts for %d\n",
                coalescer->name().c_str(), seq_num);
        assert(instPktsRemaining.count(seq_num));

        if (instPktsRemaining[seq_num] == 0) {
            assert(iter->second.empty());

            // Remove from both maps
            instMap.erase(iter++);
            instPktsRemaining.erase(seq_num);

            // Release the token
            DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", seq_num);
            coalescer->getGMTokenPort().sendTokens(1);
        } else {
            ++iter;
        }
    }
}

bool
UncoalescedTable::areRequestsDone(const uint64_t instSeqNum) {
    // iterate the instructions held in UncoalescedTable to see whether there
    // are more requests to issue; if yes, not yet done; otherwise, done
    for (auto& inst : instMap) {
        DPRINTF(GPUCoalescer, "instSeqNum= %d, pending packets=%d\n",
                inst.first, inst.second.size());
        if (inst.first == instSeqNum) { return false; }
    }

    return true;
}

void
UncoalescedTable::printRequestTable(std::stringstream& ss)
{
    ss << "Listing pending packets from " << instMap.size() << " instructions";

    for (auto& inst : instMap) {
        ss << "\tAddr: " << printAddress(inst.first) << " with "
           << inst.second.size() << " pending packets" << std::endl;
    }
}

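// Panic if any pending packet has been outstanding longer than the given
// threshold (in ticks), dumping the request table for debugging.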
void
UncoalescedTable::checkDeadlock(Tick threshold)
{
    Tick current_time = curTick();

    for (auto &it : instMap) {
        for (auto &pkt : it.second) {
            if (current_time - pkt->req->time() > threshold) {
                std::stringstream ss;
                printRequestTable(ss);

                panic("Possible Deadlock detected. Aborting!\n"
                      "version: %d request.paddr: 0x%x uncoalescedTable: %d "
                      "current time: %u issue_time: %d difference: %d\n"
                      "Request Tables:\n\n%s", coalescer->getId(),
                      pkt->getAddr(), instMap.size(), current_time,
                      pkt->req->time(), current_time - pkt->req->time(),
                      ss.str());
            }
        }
    }
}

GPUCoalescer::GPUCoalescer(const Params &p)
    : RubyPort(p),
      issueEvent([this]{ completeIssue(); }, "Issue coalesced request",
                 false, Event::Progress_Event_Pri),
      uncoalescedTable(this),
      deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check"),
      gmTokenPort(name() + ".gmTokenPort", this)
{
    m_store_waiting_on_load_cycles = 0;
    m_store_waiting_on_store_cycles = 0;
    m_load_waiting_on_store_cycles = 0;
    m_load_waiting_on_load_cycles = 0;

    m_outstanding_count = 0;

    coalescingWindow = p.max_coalesces_per_cycle;

    m_max_outstanding_requests = 0;
    m_instCache_ptr = nullptr;
    m_dataCache_ptr = nullptr;

    m_instCache_ptr = p.icache;
    m_dataCache_ptr = p.dcache;
    m_max_outstanding_requests = p.max_outstanding_requests;
    m_deadlock_threshold = p.deadlock_threshold;

    assert(m_max_outstanding_requests > 0);
    assert(m_deadlock_threshold > 0);
    assert(m_instCache_ptr);
    assert(m_dataCache_ptr);

    m_runningGarnetStandalone = p.garnet_standalone;

    // These statistical variables are not for display.
    // The profiler will collate these across different
    // coalescers and display those collated statistics.
    m_outstandReqHist.init(10);
    m_latencyHist.init(10);
    m_missLatencyHist.init(10);

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist.push_back(new statistics::Histogram());
        m_typeLatencyHist[i]->init(10);

        m_missTypeLatencyHist.push_back(new statistics::Histogram());
        m_missTypeLatencyHist[i]->init(10);
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist.push_back(new statistics::Histogram());
        m_missMachLatencyHist[i]->init(10);

        m_IssueToInitialDelayHist.push_back(new statistics::Histogram());
        m_IssueToInitialDelayHist[i]->init(10);

        m_InitialToForwardDelayHist.push_back(new statistics::Histogram());
        m_InitialToForwardDelayHist[i]->init(10);

        m_ForwardToFirstResponseDelayHist.push_back(
            new statistics::Histogram());
        m_ForwardToFirstResponseDelayHist[i]->init(10);

        m_FirstResponseToCompletionDelayHist.push_back(
            new statistics::Histogram());
        m_FirstResponseToCompletionDelayHist[i]->init(10);
    }

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_missTypeMachLatencyHist.push_back(
            std::vector<statistics::Histogram *>());

        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i].push_back(
                new statistics::Histogram());
            m_missTypeMachLatencyHist[i][j]->init(10);
        }
    }
}

GPUCoalescer::~GPUCoalescer()
{
}

Port &
GPUCoalescer::getPort(const std::string &if_name, PortID idx)
{
    if (if_name == "gmTokenPort") {
        return gmTokenPort;
    }

    // delegate to RubyPort otherwise
    return RubyPort::getPort(if_name, idx);
}

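// Deadlock-check handler: warn and abort if any coalesced or uncoalesced
// request has been outstanding longer than m_deadlock_threshold, and re-arm
// the check while requests remain outstanding.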
void
GPUCoalescer::wakeup()
{
    Cycles current_time = curCycle();
    for (auto& requestList : coalescedTable) {
        for (auto& req : requestList.second) {
            if (current_time - req->getIssueTime() > m_deadlock_threshold) {
                std::stringstream ss;
                printRequestTable(ss);
                warn("GPUCoalescer %d Possible deadlock detected!\n%s\n",
                     m_version, ss.str());
                panic("Aborting due to deadlock!\n");
            }
        }
    }

    Tick tick_threshold = cyclesToTicks(m_deadlock_threshold);
    uncoalescedTable.checkDeadlock(tick_threshold);

    if (m_outstanding_count > 0) {
        schedule(deadlockCheckEvent,
                 m_deadlock_threshold * clockPeriod() +
                 curTick());
    }
}

void
GPUCoalescer::printRequestTable(std::stringstream& ss)
{
    ss << "Printing out " << coalescedTable.size()
       << " outstanding requests in the coalesced table\n";

    for (auto& requestList : coalescedTable) {
        for (auto& request : requestList.second) {
            ss << "\tAddr: " << printAddress(requestList.first) << "\n"
               << "\tInstruction sequence number: "
               << request->getSeqNum() << "\n"
               << "\t\tType: "
               << RubyRequestType_to_string(request->getRubyType()) << "\n"
               << "\t\tNumber of associated packets: "
               << request->getPackets().size() << "\n"
               << "\t\tIssue time: "
               << request->getIssueTime() * clockPeriod() << "\n"
               << "\t\tDifference from current tick: "
               << (curCycle() - request->getIssueTime()) * clockPeriod();
        }
    }

    // print out packets waiting to be issued in uncoalesced table
    uncoalescedTable.printRequestTable(ss);
}

void
GPUCoalescer::resetStats()
{
    m_latencyHist.reset();
    m_missLatencyHist.reset();
    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist[i]->reset();
        m_missTypeLatencyHist[i]->reset();
        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i][j]->reset();
        }
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist[i]->reset();

        m_IssueToInitialDelayHist[i]->reset();
        m_InitialToForwardDelayHist[i]->reset();
        m_ForwardToFirstResponseDelayHist[i]->reset();
        m_FirstResponseToCompletionDelayHist[i]->reset();
    }
}

void
GPUCoalescer::printProgress(std::ostream& out) const
{
}

// sets the kernelEndList
void
GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
{
    // It is not clear this can happen, but be careful here to avoid a
    // possible simulator hang in the future.
    DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
    assert(kernelEndList.count(wavefront_id) == 0);

    kernelEndList[wavefront_id] = pkt;
    DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
            kernelEndList.size());
}

void
GPUCoalescer::writeCallback(Addr address, DataBlock& data)
{
    writeCallback(address, MachineType_NULL, data);
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data)
{
    writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime)
{
    writeCallback(address, mach, data,
                  initialRequestTime, forwardRequestTime, firstResponseTime,
                  false);
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime,
                            bool isRegion)
{
    assert(address == makeLineAddress(address));
    assert(coalescedTable.count(address));

    auto crequest = coalescedTable.at(address).front();

    hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
                forwardRequestTime, firstResponseTime, isRegion);

    // remove this crequest in coalescedTable
    delete crequest;
    coalescedTable.at(address).pop_front();

    if (coalescedTable.at(address).empty()) {
        coalescedTable.erase(address);
    } else {
        auto nextRequest = coalescedTable.at(address).front();
        issueRequest(nextRequest);
    }
}

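// Called once per completed store. The CU is answered with a single
// response packet only after every issued Ruby request of the instruction
// has acknowledged write completion.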
void
GPUCoalescer::writeCompleteCallback(Addr address,
                                    uint64_t instSeqNum,
                                    MachineType mach)
{
    DPRINTF(GPUCoalescer, "writeCompleteCallback for address 0x%x"
            " instSeqNum = %d\n", address, instSeqNum);

    assert(pendingWriteInsts.count(instSeqNum) == 1);
    PendingWriteInst& inst = pendingWriteInsts[instSeqNum];

    // check the uncoalescedTable to see whether all requests for the inst
    // have been issued or not
    bool reqsAllIssued = uncoalescedTable.areRequestsDone(instSeqNum);
    DPRINTF(GPUCoalescer, "instSeqNum = %d, pendingStores=%d, "
            "reqsAllIssued=%d\n", instSeqNum,
            inst.getNumPendingStores()-1, reqsAllIssued);

    if (inst.receiveWriteCompleteAck() && reqsAllIssued) {
        // if the pending write instruction has received all write
        // completion callbacks for its issued Ruby requests, we can now
        // respond to the requesting CU in one response packet.
        inst.ackWriteCompletion(m_usingRubyTester);

        DPRINTF(GPUCoalescer, "write inst %d completed at coalescer\n",
                instSeqNum);
        pendingWriteInsts.erase(instSeqNum);
    }
}

void
GPUCoalescer::readCallback(Addr address, DataBlock& data)
{
    readCallback(address, MachineType_NULL, data);
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data)
{
    readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime)
{
    readCallback(address, mach, data,
                 initialRequestTime, forwardRequestTime, firstResponseTime,
                 false);
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime,
                           bool isRegion)
{
    assert(address == makeLineAddress(address));
    assert(coalescedTable.count(address));

    auto crequest = coalescedTable.at(address).front();
    fatal_if(crequest->getRubyType() != RubyRequestType_LD,
             "readCallback received non-read type response\n");

    // Iterate over the coalesced requests to respond to as many loads as
    // possible until another request type is seen. Models MSHR for TCP.
    while (crequest->getRubyType() == RubyRequestType_LD) {
        hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
                    forwardRequestTime, firstResponseTime, isRegion);

        delete crequest;
        coalescedTable.at(address).pop_front();
        if (coalescedTable.at(address).empty()) {
            break;
        }

        crequest = coalescedTable.at(address).front();
    }

    if (coalescedTable.at(address).empty()) {
        coalescedTable.erase(address);
    } else {
        auto nextRequest = coalescedTable.at(address).front();
        issueRequest(nextRequest);
    }
}

void
GPUCoalescer::hitCallback(CoalescedRequest* crequest,
                          MachineType mach,
                          DataBlock& data,
                          bool success,
                          Cycles initialRequestTime,
                          Cycles forwardRequestTime,
                          Cycles firstResponseTime,
                          bool isRegion)
{
    PacketPtr pkt = crequest->getFirstPkt();
    Addr request_address = pkt->getAddr();
    [[maybe_unused]] Addr request_line_address =
        makeLineAddress(request_address);

    RubyRequestType type = crequest->getRubyType();

    DPRINTF(GPUCoalescer, "Got hitCallback for 0x%X\n", request_line_address);

    recordMissLatency(crequest, mach,
                      initialRequestTime,
                      forwardRequestTime,
                      firstResponseTime,
                      success, isRegion);

    // update the data: this must be done for each packet coalesced into
    // this request
    std::vector<PacketPtr> pktList = crequest->getPackets();
    DPRINTF(GPUCoalescer, "Responding to %d packets for addr 0x%X\n",
            pktList.size(), request_line_address);
    for (auto& pkt : pktList) {
        request_address = pkt->getAddr();
        if (pkt->getPtr<uint8_t>()) {
            if ((type == RubyRequestType_LD) ||
                (type == RubyRequestType_ATOMIC) ||
                (type == RubyRequestType_ATOMIC_RETURN) ||
                (type == RubyRequestType_IFETCH) ||
                (type == RubyRequestType_RMW_Read) ||
                (type == RubyRequestType_Locked_RMW_Read) ||
                (type == RubyRequestType_Load_Linked)) {
                pkt->setData(
                    data.getData(getOffset(request_address), pkt->getSize()));
            } else {
                data.setData(pkt->getPtr<uint8_t>(),
                             getOffset(request_address), pkt->getSize());
            }
        } else {
            DPRINTF(MemoryAccess,
                    "WARNING. Data not transferred from Ruby to M5 for type "
                    "%s\n",
                    RubyRequestType_to_string(type));
        }
    }

    m_outstanding_count--;
    assert(m_outstanding_count >= 0);

    completeHitCallback(pktList);
}

bool
GPUCoalescer::empty() const
{
    return coalescedTable.empty();
}

RubyRequestType
GPUCoalescer::getRequestType(PacketPtr pkt)
{
    RubyRequestType req_type = RubyRequestType_NULL;

    // These types are not supported or not used in GPU caches.
    assert(!pkt->req->isLLSC());
    assert(!pkt->req->isLockedRMW());
    assert(!pkt->req->isInstFetch());
    assert(!pkt->isFlush());

    if (pkt->req->isAtomicReturn()) {
        req_type = RubyRequestType_ATOMIC_RETURN;
    } else if (pkt->req->isAtomicNoReturn()) {
        req_type = RubyRequestType_ATOMIC_NO_RETURN;
    } else if (pkt->isRead()) {
        req_type = RubyRequestType_LD;
    } else if (pkt->isWrite()) {
        req_type = RubyRequestType_ST;
    } else {
        panic("Unsupported ruby packet type\n");
    }

    return req_type;
}

// Places an uncoalesced packet in uncoalescedTable. If the packet is a
// special type (MemFence, scoping, etc), it is issued immediately.
RequestStatus
GPUCoalescer::makeRequest(PacketPtr pkt)
{
    // all packets must have valid instruction sequence numbers
    assert(pkt->req->hasInstSeqNum());

    if (pkt->cmd == MemCmd::MemSyncReq) {
        // issue mem_sync requests immediately to the cache system without
        // going through uncoalescedTable like normal LD/ST/Atomic requests
        issueMemSyncRequest(pkt);
    } else {
        // otherwise, this must be either a read or a write command
        assert(pkt->isRead() || pkt->isWrite());

        InstSeqNum seq_num = pkt->req->getReqInstSeqNum();

        // in the case of the protocol tester, there is one packet per
        // sequence number. The number of packets during simulation depends
        // on the number of lanes active for that vmem request (i.e., the
        // popcount of the exec_mask).
        int num_packets = 1;
        if (!m_usingRubyTester) {
            num_packets = 0;
            for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) {
                num_packets += getDynInst(pkt)->getLaneStatus(i);
            }
        }

        // the pkt is temporarily stored in the uncoalesced table until
        // it's picked for the coalescing process later in this cycle or in
        // a future cycle. Packets remaining is set to the number of
        // expected requests from the instruction based on its exec_mask.
        uncoalescedTable.insertPacket(pkt);
        uncoalescedTable.initPacketsRemaining(seq_num, num_packets);
        DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
                pkt->getAddr());

        // we schedule an issue event here to process the uncoalesced table
        // and try to issue Ruby requests to the cache system
        if (!issueEvent.scheduled()) {
            DPRINTF(GPUCoalescer, "Scheduled issueEvent for seqNum %d\n",
                    seq_num);
            schedule(issueEvent, curTick());
        }
    }

    // we always return RequestStatus_Issued in this coalescer because the
    // coalescer's resources were checked earlier and the coalescer is
    // queueing up aliased requests in its coalesced table
    return RequestStatus_Issued;
}

template <class KEY, class VALUE>
std::ostream &
operator<<(std::ostream &out, const std::unordered_map<KEY, VALUE> &map)
{
    out << "[";
    for (auto i = map.begin(); i != map.end(); ++i)
        out << " " << i->first << "=" << i->second;
    out << " ]";

    return out;
}

void
GPUCoalescer::print(std::ostream& out) const
{
    out << "[GPUCoalescer: " << m_version
        << ", outstanding requests: " << m_outstanding_count
        << "]";
}

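// Recover the GPU dynamic instruction that generated this packet by
// walking the chain of sender states attached by the CU's data port.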
GPUDynInstPtr
GPUCoalescer::getDynInst(PacketPtr pkt) const
{
    RubyPort::SenderState* ss =
        safe_cast<RubyPort::SenderState*>(pkt->senderState);

    ComputeUnit::DataPort::SenderState* cu_state =
        safe_cast<ComputeUnit::DataPort::SenderState*>
            (ss->predecessor);

    return cu_state->_gpuDynInst;
}

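// Try to merge pkt into an existing coalesced request with the same line
// address and sequence number; otherwise create a new coalesced request.
// Returns false if the maximum number of outstanding requests would be
// exceeded, leaving the packet in the uncoalesced table.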
bool
GPUCoalescer::coalescePacket(PacketPtr pkt)
{
    uint64_t seqNum = pkt->req->getReqInstSeqNum();
    Addr line_addr = makeLineAddress(pkt->getAddr());

    // If the packet has the same line address as a request already in the
    // coalescedTable and has the same sequence number, it can be coalesced.
    if (coalescedTable.count(line_addr)) {
        // Search for a previous coalesced request with the same seqNum.
        auto& creqQueue = coalescedTable.at(line_addr);
        auto citer = std::find_if(creqQueue.begin(), creqQueue.end(),
            [&](CoalescedRequest* c) { return c->getSeqNum() == seqNum; }
        );
        if (citer != creqQueue.end()) {
            (*citer)->insertPacket(pkt);
            return true;
        }
    }

    if (m_outstanding_count < m_max_outstanding_requests) {
        // This is an "aliased" or new request. Create a RubyRequest and
        // append it to the list of "targets" in the coalescing table.
        DPRINTF(GPUCoalescer, "Creating new or aliased request for 0x%X\n",
                line_addr);

        CoalescedRequest *creq = new CoalescedRequest(seqNum);
        creq->insertPacket(pkt);
        creq->setRubyType(getRequestType(pkt));
        creq->setIssueTime(curCycle());

        if (!coalescedTable.count(line_addr)) {
            // If there is no outstanding request for this line address,
            // create a new coalesced request and issue it immediately.
            auto reqList = std::deque<CoalescedRequest*> { creq };
            coalescedTable.insert(std::make_pair(line_addr, reqList));
            if (!coalescedReqs.count(seqNum)) {
                coalescedReqs.insert(std::make_pair(seqNum, reqList));
            } else {
                coalescedReqs.at(seqNum).push_back(creq);
            }
        } else {
            // The request is for a line address that is already outstanding
            // but for a different instruction. Add it as a new request to be
            // issued when the current outstanding request is completed.
            coalescedTable.at(line_addr).push_back(creq);
            DPRINTF(GPUCoalescer, "found address 0x%X with new seqNum %d\n",
                    line_addr, seqNum);
        }

        // In both cases, requests are added to the coalescing table and will
        // be counted as outstanding requests.
        m_outstanding_count++;

        // We track all issued or to-be-issued Ruby requests associated with
        // write instructions. An instruction may have multiple Ruby
        // requests.
        if (pkt->cmd == MemCmd::WriteReq) {
            DPRINTF(GPUCoalescer, "adding write inst %d at line 0x%x to"
                    " the pending write instruction list\n", seqNum,
                    line_addr);

            RubyPort::SenderState* ss =
                safe_cast<RubyPort::SenderState*>(pkt->senderState);

            // we need to save this port because it will be used to call
            // back the requesting CU when we receive write
            // complete callbacks for all issued Ruby requests of this
            // instruction.
            RubyPort::MemResponsePort* mem_response_port = ss->port;

            GPUDynInstPtr gpuDynInst = nullptr;

            if (!m_usingRubyTester) {
                // If this coalescer is connected to a real CU, we need
                // to save the corresponding gpu dynamic instruction.
                // CU will use that instruction to decrement wait counters
                // in the issuing wavefront.
                // For Ruby tester, gpuDynInst == nullptr
                gpuDynInst = getDynInst(pkt);
            }

            PendingWriteInst& inst = pendingWriteInsts[seqNum];
            inst.addPendingReq(mem_response_port, gpuDynInst,
                               m_usingRubyTester);
        }

        return true;
    }

    // The maximum number of outstanding requests has been reached.
    return false;
}

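// Issue-event handler: coalesce and issue packets from up to
// coalescingWindow instructions held in the uncoalesced table this cycle.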
void
GPUCoalescer::completeIssue()
{
    // Iterate over the maximum number of instructions we can coalesce
    // per cycle (coalescingWindow).
    for (int instIdx = 0; instIdx < coalescingWindow; ++instIdx) {
        PerInstPackets *pkt_list =
            uncoalescedTable.getInstPackets(instIdx);

        // getInstPackets will return nullptr if no instruction
        // exists at the current offset.
        if (!pkt_list) {
            break;
        } else if (pkt_list->empty()) {
            // Found something, but it has not been cleaned up by update
            // resources yet. See if there is anything else to coalesce.
            // Assume we can't check anymore if the coalescing window is 1.
            continue;
        } else {
            // All packets in the list have the same seqNum, use first.
            InstSeqNum seq_num = pkt_list->front()->req->getReqInstSeqNum();

            // The difference in list size before and after tells us the
            // number of packets which were coalesced.
            size_t pkt_list_size = pkt_list->size();

            // Since we have a pointer to the list of packets in the inst,
            // erase them from the list if coalescing is successful and
            // leave them in the list otherwise. This aggressively attempts
            // to coalesce as many packets as possible from the current inst.
            pkt_list->remove_if(
                [&](PacketPtr pkt) { return coalescePacket(pkt); }
            );

            if (coalescedReqs.count(seq_num)) {
                auto& creqs = coalescedReqs.at(seq_num);
                for (auto creq : creqs) {
                    DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n",
                            RubyRequestType_to_string(creq->getRubyType()),
                            seq_num);
                    issueRequest(creq);
                }
                coalescedReqs.erase(seq_num);
            }

            assert(pkt_list_size >= pkt_list->size());
            size_t pkt_list_diff = pkt_list_size - pkt_list->size();

            int num_remaining = uncoalescedTable.getPacketsRemaining(seq_num);
            num_remaining -= pkt_list_diff;
            assert(num_remaining >= 0);

            uncoalescedTable.setPacketsRemaining(seq_num, num_remaining);
            DPRINTF(GPUCoalescer,
                    "Coalesced %d pkts for seqNum %d, %d remaining\n",
                    pkt_list_diff, seq_num, num_remaining);
        }
    }

    // Clean up any instructions in the uncoalesced table that have had
    // all of their packets coalesced and return a token for that column.
    uncoalescedTable.updateResources();

    // have Kernel End releases been issued this cycle
    int len = newKernelEnds.size();
    for (int i = 0; i < len; i++) {
        kernelCallback(newKernelEnds[i]);
    }
    newKernelEnds.clear();
}

void
GPUCoalescer::evictionCallback(Addr address)
{
    ruby_eviction_callback(address);
}

void
GPUCoalescer::kernelCallback(int wavefront_id)
{
    assert(kernelEndList.count(wavefront_id));

    ruby_hit_callback(kernelEndList[wavefront_id]);

    kernelEndList.erase(wavefront_id);
}

void
GPUCoalescer::atomicCallback(Addr address,
                             MachineType mach,
                             const DataBlock& data)
{
    assert(address == makeLineAddress(address));
    assert(coalescedTable.count(address));

    auto crequest = coalescedTable.at(address).front();

    fatal_if((crequest->getRubyType() != RubyRequestType_ATOMIC &&
              crequest->getRubyType() != RubyRequestType_ATOMIC_RETURN &&
              crequest->getRubyType() != RubyRequestType_ATOMIC_NO_RETURN),
             "atomicCallback saw non-atomic type response\n");

    hitCallback(crequest, mach, (DataBlock&)data, true,
                crequest->getIssueTime(), Cycles(0), Cycles(0), false);

    delete crequest;
    coalescedTable.at(address).pop_front();

    if (coalescedTable.at(address).empty()) {
        coalescedTable.erase(address);
    } else {
        auto nextRequest = coalescedTable.at(address).front();
        issueRequest(nextRequest);
    }
}

void
GPUCoalescer::completeHitCallback(std::vector<PacketPtr>& mylist)
{
    for (auto& pkt : mylist) {
        RubyPort::SenderState *ss =
            safe_cast<RubyPort::SenderState *>(pkt->senderState);
        MemResponsePort *port = ss->port;
        assert(port != NULL);

        pkt->senderState = ss->predecessor;

        if (pkt->cmd != MemCmd::WriteReq) {
            // for WriteReq, we keep the original senderState until
            // writeCompleteCallback
            delete ss;
        }

        port->hitCallback(pkt);
        trySendRetries();
    }

    // We schedule an event in the same tick as hitCallback (similar to
    // makeRequest) rather than calling completeIssue directly to reduce
    // function calls to complete issue. This can only happen if the max
    // outstanding requests is less than the number of slots in the
    // uncoalesced table and makeRequest is not called again.
    if (uncoalescedTable.packetAvailable() && !issueEvent.scheduled()) {
        schedule(issueEvent, curTick());
    }

    testDrainComplete();
}

void
GPUCoalescer::recordMissLatency(CoalescedRequest* crequest,
                                MachineType mach,
                                Cycles initialRequestTime,
                                Cycles forwardRequestTime,
                                Cycles firstResponseTime,
                                bool success, bool isRegion)
{
}

} // namespace ruby
} // namespace gem5