gem5  v21.0.1.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
GPUCoalescer.cc
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * For use for simulation and test purposes only
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright notice,
11  * this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  * this list of conditions and the following disclaimer in the documentation
15  * and/or other materials provided with the distribution.
16  *
17  * 3. Neither the name of the copyright holder nor the names of its
18  * contributors may be used to endorse or promote products derived from this
19  * software without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
35 
36 #include "base/logging.hh"
37 #include "base/str.hh"
38 #include "config/the_isa.hh"
40 #include "debug/GPUCoalescer.hh"
41 #include "debug/MemoryAccess.hh"
42 #include "debug/ProtocolTrace.hh"
43 #include "debug/RubyPort.hh"
44 #include "debug/RubyStats.hh"
45 #include "gpu-compute/shader.hh"
46 #include "mem/packet.hh"
55 #include "params/RubyGPUCoalescer.hh"
56 
58  : coalescer(gc)
59 {
60 }
61 
62 void
64 {
65  uint64_t seqNum = pkt->req->getReqInstSeqNum();
66 
67  instMap[seqNum].push_back(pkt);
68  DPRINTF(GPUCoalescer, "Adding 0x%X seqNum %d to map. (map %d vec %d)\n",
69  pkt->getAddr(), seqNum, instMap.size(), instMap[seqNum].size());
70 }
71 
72 bool
74 {
75  return !instMap.empty();
76 }
77 
78 void
80 {
81  if (!instPktsRemaining.count(seqNum)) {
82  instPktsRemaining[seqNum] = count;
83  }
84 }
85 
86 int
88 {
89  return instPktsRemaining[seqNum];
90 }
91 
92 void
94 {
95  instPktsRemaining[seqNum] = count;
96 }
97 
100 {
101  if (offset >= instMap.size()) {
102  return nullptr;
103  }
104 
105  auto instMapIter = instMap.begin();
106  std::advance(instMapIter, offset);
107 
108  return &(instMapIter->second);
109 }
110 
111 void
113 {
114  for (auto iter = instMap.begin(); iter != instMap.end(); ) {
115  InstSeqNum seq_num = iter->first;
116  DPRINTF(GPUCoalescer, "%s checking remaining pkts for %d\n",
117  coalescer->name().c_str(), seq_num);
118  assert(instPktsRemaining.count(seq_num));
119 
120  if (instPktsRemaining[seq_num] == 0) {
121  assert(iter->second.empty());
122 
123  // Remove from both maps
124  instMap.erase(iter++);
125  instPktsRemaining.erase(seq_num);
126 
127  // Release the token
128  DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", seq_num);
130  } else {
131  ++iter;
132  }
133  }
134 }
135 
136 bool
137 UncoalescedTable::areRequestsDone(const uint64_t instSeqNum) {
138  // iterate the instructions held in UncoalescedTable to see whether there
139  // are more requests to issue; if yes, not yet done; otherwise, done
140  for (auto& inst : instMap) {
141  DPRINTF(GPUCoalescer, "instSeqNum= %d, pending packets=%d\n"
142  ,inst.first, inst.second.size());
143  if (inst.first == instSeqNum) { return false; }
144  }
145 
146  return true;
147 }
148 
149 void
151 {
152  ss << "Listing pending packets from " << instMap.size() << " instructions";
153 
154  for (auto& inst : instMap) {
155  ss << "\tAddr: " << printAddress(inst.first) << " with "
156  << inst.second.size() << " pending packets" << std::endl;
157  }
158 }
159 
160 void
162 {
163  Tick current_time = curTick();
164 
165  for (auto &it : instMap) {
166  for (auto &pkt : it.second) {
167  if (current_time - pkt->req->time() > threshold) {
168  std::stringstream ss;
170 
171  panic("Possible Deadlock detected. Aborting!\n"
172  "version: %d request.paddr: 0x%x uncoalescedTable: %d "
173  "current time: %u issue_time: %d difference: %d\n"
174  "Request Tables:\n\n%s", coalescer->getId(),
175  pkt->getAddr(), instMap.size(), current_time,
176  pkt->req->time(), current_time - pkt->req->time(),
177  ss.str());
178  }
179  }
180  }
181 }
182 
184  : RubyPort(p),
185  issueEvent([this]{ completeIssue(); }, "Issue coalesced request",
187  uncoalescedTable(this),
188  deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check"),
189  gmTokenPort(name() + ".gmTokenPort", this)
190 {
191  m_store_waiting_on_load_cycles = 0;
192  m_store_waiting_on_store_cycles = 0;
193  m_load_waiting_on_store_cycles = 0;
194  m_load_waiting_on_load_cycles = 0;
195 
196  m_outstanding_count = 0;
197 
198  coalescingWindow = p.max_coalesces_per_cycle;
199 
200  m_max_outstanding_requests = 0;
201  m_instCache_ptr = nullptr;
202  m_dataCache_ptr = nullptr;
203 
204  m_instCache_ptr = p.icache;
205  m_dataCache_ptr = p.dcache;
206  m_max_outstanding_requests = p.max_outstanding_requests;
207  m_deadlock_threshold = p.deadlock_threshold;
208 
209  assert(m_max_outstanding_requests > 0);
210  assert(m_deadlock_threshold > 0);
211  assert(m_instCache_ptr);
212  assert(m_dataCache_ptr);
213 
214  m_runningGarnetStandalone = p.garnet_standalone;
215 
216 
217  // These statistical variables are not for display.
218  // The profiler will collate these across different
219  // coalescers and display those collated statistics.
220  m_outstandReqHist.init(10);
221  m_latencyHist.init(10);
222  m_missLatencyHist.init(10);
223 
224  for (int i = 0; i < RubyRequestType_NUM; i++) {
225  m_typeLatencyHist.push_back(new Stats::Histogram());
226  m_typeLatencyHist[i]->init(10);
227 
228  m_missTypeLatencyHist.push_back(new Stats::Histogram());
229  m_missTypeLatencyHist[i]->init(10);
230  }
231 
232  for (int i = 0; i < MachineType_NUM; i++) {
233  m_missMachLatencyHist.push_back(new Stats::Histogram());
234  m_missMachLatencyHist[i]->init(10);
235 
236  m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
237  m_IssueToInitialDelayHist[i]->init(10);
238 
239  m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
240  m_InitialToForwardDelayHist[i]->init(10);
241 
242  m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
243  m_ForwardToFirstResponseDelayHist[i]->init(10);
244 
245  m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
246  m_FirstResponseToCompletionDelayHist[i]->init(10);
247  }
248 
249  for (int i = 0; i < RubyRequestType_NUM; i++) {
250  m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());
251 
252  for (int j = 0; j < MachineType_NUM; j++) {
253  m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
254  m_missTypeMachLatencyHist[i][j]->init(10);
255  }
256  }
257 
258 }
259 
261 {
262 }
263 
264 Port &
265 GPUCoalescer::getPort(const std::string &if_name, PortID idx)
266 {
267  if (if_name == "gmTokenPort") {
268  return gmTokenPort;
269  }
270 
271  // delgate to RubyPort otherwise
272  return RubyPort::getPort(if_name, idx);
273 }
274 
275 void
277 {
278  Cycles current_time = curCycle();
279  for (auto& requestList : coalescedTable) {
280  for (auto& req : requestList.second) {
281  if (current_time - req->getIssueTime() > m_deadlock_threshold) {
282  std::stringstream ss;
284  warn("GPUCoalescer %d Possible deadlock detected!\n%s\n",
285  m_version, ss.str());
286  panic("Aborting due to deadlock!\n");
287  }
288  }
289  }
290 
291  Tick tick_threshold = cyclesToTicks(m_deadlock_threshold);
292  uncoalescedTable.checkDeadlock(tick_threshold);
293 
294  if (m_outstanding_count > 0) {
297  curTick());
298  }
299 }
300 
301 void
303 {
304  ss << "Printing out " << coalescedTable.size()
305  << " outstanding requests in the coalesced table\n";
306 
307  for (auto& requestList : coalescedTable) {
308  for (auto& request : requestList.second) {
309  ss << "\tAddr: " << printAddress(requestList.first) << "\n"
310  << "\tInstruction sequence number: "
311  << request->getSeqNum() << "\n"
312  << "\t\tType: "
313  << RubyRequestType_to_string(request->getRubyType()) << "\n"
314  << "\t\tNumber of associated packets: "
315  << request->getPackets().size() << "\n"
316  << "\t\tIssue time: "
317  << request->getIssueTime() * clockPeriod() << "\n"
318  << "\t\tDifference from current tick: "
319  << (curCycle() - request->getIssueTime()) * clockPeriod();
320  }
321  }
322 
323  // print out packets waiting to be issued in uncoalesced table
325 }
326 
327 void
329 {
332  for (int i = 0; i < RubyRequestType_NUM; i++) {
333  m_typeLatencyHist[i]->reset();
334  m_missTypeLatencyHist[i]->reset();
335  for (int j = 0; j < MachineType_NUM; j++) {
336  m_missTypeMachLatencyHist[i][j]->reset();
337  }
338  }
339 
340  for (int i = 0; i < MachineType_NUM; i++) {
341  m_missMachLatencyHist[i]->reset();
342 
343  m_IssueToInitialDelayHist[i]->reset();
344  m_InitialToForwardDelayHist[i]->reset();
347  }
348 }
349 
// Intentionally a no-op: the GPU coalescer emits no periodic progress
// output. NOTE(review): presumably overrides a base-class hook that
// other ports implement — confirm against GPUCoalescer.hh.
void
GPUCoalescer::printProgress(std::ostream& out) const
{
}
354 
// Stash the kernel-end packet for a wavefront in kernelEndList; it is
// answered later by kernelCallback() when the kernel completes.
void
GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
{
    // Guard against double insertion for the same wavefront. It is not
    // clear this can happen, but silently overwriting a pending
    // kernel-end packet here could hang the simulation later.
    DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
    assert(kernelEndList.count(wavefront_id) == 0);

    kernelEndList[wavefront_id] = pkt;
    DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
            kernelEndList.size());
}
369 
370 void
372 {
373  writeCallback(address, MachineType_NULL, data);
374 }
375 
376 void
378  MachineType mach,
379  DataBlock& data)
380 {
381  writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
382 }
383 
384 void
386  MachineType mach,
387  DataBlock& data,
388  Cycles initialRequestTime,
389  Cycles forwardRequestTime,
390  Cycles firstResponseTime)
391 {
392  writeCallback(address, mach, data,
393  initialRequestTime, forwardRequestTime, firstResponseTime,
394  false);
395 }
396 
397 void
399  MachineType mach,
400  DataBlock& data,
401  Cycles initialRequestTime,
402  Cycles forwardRequestTime,
403  Cycles firstResponseTime,
404  bool isRegion)
405 {
406  assert(address == makeLineAddress(address));
407  assert(coalescedTable.count(address));
408 
409  auto crequest = coalescedTable.at(address).front();
410 
411  hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
412  forwardRequestTime, firstResponseTime, isRegion);
413 
414  // remove this crequest in coalescedTable
415  delete crequest;
416  coalescedTable.at(address).pop_front();
417 
418  if (coalescedTable.at(address).empty()) {
419  coalescedTable.erase(address);
420  } else {
421  auto nextRequest = coalescedTable.at(address).front();
422  issueRequest(nextRequest);
423  }
424 }
425 
426 void
428  uint64_t instSeqNum,
429  MachineType mach)
430 {
431  DPRINTF(GPUCoalescer, "writeCompleteCallback for address 0x%x"
432  " instSeqNum = %d\n", address, instSeqNum);
433 
434  assert(pendingWriteInsts.count(instSeqNum) == 1);
435  PendingWriteInst& inst = pendingWriteInsts[instSeqNum];
436 
437  // check the uncoalescedTable to see whether all requests for the inst
438  // have been issued or not
439  bool reqsAllIssued = uncoalescedTable.areRequestsDone(instSeqNum);
440  DPRINTF(GPUCoalescer, "instSeqNum = %d, pendingStores=%d, "
441  "reqsAllIssued=%d\n", reqsAllIssued,
442  inst.getNumPendingStores()-1, reqsAllIssued);
443 
444  if (inst.receiveWriteCompleteAck() && reqsAllIssued ) {
445  // if the pending write instruction has received all write completion
446  // callbacks for its issued Ruby requests, we can now start respond
447  // the requesting CU in one response packet.
449 
450  DPRINTF(GPUCoalescer, "write inst %d completed at coalescer\n",
451  instSeqNum);
452  pendingWriteInsts.erase(instSeqNum);
453  }
454 }
455 
456 void
458 {
459  readCallback(address, MachineType_NULL, data);
460 }
461 
462 void
464  MachineType mach,
465  DataBlock& data)
466 {
467  readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
468 }
469 
470 void
472  MachineType mach,
473  DataBlock& data,
474  Cycles initialRequestTime,
475  Cycles forwardRequestTime,
476  Cycles firstResponseTime)
477 {
478 
479  readCallback(address, mach, data,
480  initialRequestTime, forwardRequestTime, firstResponseTime,
481  false);
482 }
483 
484 void
486  MachineType mach,
487  DataBlock& data,
488  Cycles initialRequestTime,
489  Cycles forwardRequestTime,
490  Cycles firstResponseTime,
491  bool isRegion)
492 {
493  assert(address == makeLineAddress(address));
494  assert(coalescedTable.count(address));
495 
496  auto crequest = coalescedTable.at(address).front();
497  fatal_if(crequest->getRubyType() != RubyRequestType_LD,
498  "readCallback received non-read type response\n");
499 
500  // Iterate over the coalesced requests to respond to as many loads as
501  // possible until another request type is seen. Models MSHR for TCP.
502  while (crequest->getRubyType() == RubyRequestType_LD) {
503  hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
504  forwardRequestTime, firstResponseTime, isRegion);
505 
506  delete crequest;
507  coalescedTable.at(address).pop_front();
508  if (coalescedTable.at(address).empty()) {
509  break;
510  }
511 
512  crequest = coalescedTable.at(address).front();
513  }
514 
515  if (coalescedTable.at(address).empty()) {
516  coalescedTable.erase(address);
517  } else {
518  auto nextRequest = coalescedTable.at(address).front();
519  issueRequest(nextRequest);
520  }
521 }
522 
523 void
525  MachineType mach,
526  DataBlock& data,
527  bool success,
528  Cycles initialRequestTime,
529  Cycles forwardRequestTime,
530  Cycles firstResponseTime,
531  bool isRegion)
532 {
533  PacketPtr pkt = crequest->getFirstPkt();
534  Addr request_address = pkt->getAddr();
535  M5_VAR_USED Addr request_line_address = makeLineAddress(request_address);
536 
537  RubyRequestType type = crequest->getRubyType();
538 
539  DPRINTF(GPUCoalescer, "Got hitCallback for 0x%X\n", request_line_address);
540 
541  recordMissLatency(crequest, mach,
542  initialRequestTime,
543  forwardRequestTime,
544  firstResponseTime,
545  success, isRegion);
546  // update the data
547  //
 548  // MUST DO THIS FOR EACH REQUEST IN COALESCER
549  std::vector<PacketPtr> pktList = crequest->getPackets();
550  DPRINTF(GPUCoalescer, "Responding to %d packets for addr 0x%X\n",
551  pktList.size(), request_line_address);
552  for (auto& pkt : pktList) {
553  request_address = pkt->getAddr();
554  if (pkt->getPtr<uint8_t>()) {
555  if ((type == RubyRequestType_LD) ||
556  (type == RubyRequestType_ATOMIC) ||
557  (type == RubyRequestType_ATOMIC_RETURN) ||
558  (type == RubyRequestType_IFETCH) ||
559  (type == RubyRequestType_RMW_Read) ||
560  (type == RubyRequestType_Locked_RMW_Read) ||
561  (type == RubyRequestType_Load_Linked)) {
562  pkt->setData(
563  data.getData(getOffset(request_address), pkt->getSize()));
564  } else {
565  data.setData(pkt->getPtr<uint8_t>(),
566  getOffset(request_address), pkt->getSize());
567  }
568  } else {
569  DPRINTF(MemoryAccess,
570  "WARNING. Data not transfered from Ruby to M5 for type " \
571  "%s\n",
572  RubyRequestType_to_string(type));
573  }
574  }
575 
577  assert(m_outstanding_count >= 0);
578 
579  completeHitCallback(pktList);
580 }
581 
582 bool
584 {
585  return coalescedTable.empty();
586 }
587 
588 RubyRequestType
590 {
591  RubyRequestType req_type = RubyRequestType_NULL;
592 
593  // These types are not support or not used in GPU caches.
594  assert(!pkt->req->isLLSC());
595  assert(!pkt->req->isLockedRMW());
596  assert(!pkt->req->isInstFetch());
597  assert(!pkt->isFlush());
598 
599  if (pkt->req->isAtomicReturn()) {
600  req_type = RubyRequestType_ATOMIC_RETURN;
601  } else if (pkt->req->isAtomicNoReturn()) {
602  req_type = RubyRequestType_ATOMIC_NO_RETURN;
603  } else if (pkt->isRead()) {
604  req_type = RubyRequestType_LD;
605  } else if (pkt->isWrite()) {
606  req_type = RubyRequestType_ST;
607  } else {
608  panic("Unsupported ruby packet type\n");
609  }
610 
611  return req_type;
612 }
613 
614 // Places an uncoalesced packet in uncoalescedTable. If the packet is a
615 // special type (MemFence, scoping, etc), it is issued immediately.
616 RequestStatus
618 {
619  // all packets must have valid instruction sequence numbers
620  assert(pkt->req->hasInstSeqNum());
621 
622  if (pkt->cmd == MemCmd::MemSyncReq) {
623  // issue mem_sync requests immediately to the cache system without
624  // going through uncoalescedTable like normal LD/ST/Atomic requests
625  issueMemSyncRequest(pkt);
626  } else {
627  // otherwise, this must be either read or write command
628  assert(pkt->isRead() || pkt->isWrite());
629 
630  InstSeqNum seq_num = pkt->req->getReqInstSeqNum();
631 
632  // in the case of protocol tester, there is one packet per sequence
633  // number. The number of packets during simulation depends on the
634  // number of lanes actives for that vmem request (i.e., the popcnt
635  // of the exec_mask.
636  int num_packets = 1;
637  if (!m_usingRubyTester) {
638  num_packets = getDynInst(pkt)->exec_mask.count();
639  }
640 
641  // the pkt is temporarily stored in the uncoalesced table until
642  // it's picked for coalescing process later in this cycle or in a
 643  // future cycle. Packets remaining is set to the number of expected
644  // requests from the instruction based on its exec_mask.
646  uncoalescedTable.initPacketsRemaining(seq_num, num_packets);
647  DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
648  pkt->getAddr());
649 
650  // we schedule an issue event here to process the uncoalesced table
651  // and try to issue Ruby request to cache system
652  if (!issueEvent.scheduled()) {
653  DPRINTF(GPUCoalescer, "Scheduled issueEvent for seqNum %d\n",
654  seq_num);
656  }
657  }
658 
659  // we always return RequestStatus_Issued in this coalescer
 660  // b/c the coalescer's resource was checked earlier and the coalescer is
 661  // queueing up aliased requests in its coalesced table
662  return RequestStatus_Issued;
663 }
664 
/// Stream an unordered_map as "[ k1=v1 k2=v2 ]". Element order follows
/// the map's (unspecified) iteration order. Returns the stream so the
/// insertion can be chained.
template <class KEY, class VALUE>
std::ostream &
operator<<(std::ostream &out, const std::unordered_map<KEY, VALUE> &map)
{
    out << "[";
    for (const auto &entry : map) {
        out << " " << entry.first << "=" << entry.second;
    }
    out << " ]";

    return out;
}
676 
677 void
678 GPUCoalescer::print(std::ostream& out) const
679 {
680  out << "[GPUCoalescer: " << m_version
681  << ", outstanding requests: " << m_outstanding_count
682  << "]";
683 }
684 
687 {
689  safe_cast<RubyPort::SenderState*>(pkt->senderState);
690 
692  safe_cast<ComputeUnit::DataPort::SenderState*>
693  (ss->predecessor);
694 
695  return cu_state->_gpuDynInst;
696 }
697 
698 bool
700 {
701  uint64_t seqNum = pkt->req->getReqInstSeqNum();
702  Addr line_addr = makeLineAddress(pkt->getAddr());
703 
704  // If the packet has the same line address as a request already in the
705  // coalescedTable and has the same sequence number, it can be coalesced.
706  if (coalescedTable.count(line_addr)) {
707  // Search for a previous coalesced request with the same seqNum.
708  auto& creqQueue = coalescedTable.at(line_addr);
709  auto citer = std::find_if(creqQueue.begin(), creqQueue.end(),
710  [&](CoalescedRequest* c) { return c->getSeqNum() == seqNum; }
711  );
712  if (citer != creqQueue.end()) {
713  (*citer)->insertPacket(pkt);
714  return true;
715  }
716  }
717 
719  // This is an "aliased" or new request. Create a RubyRequest and
720  // append it to the list of "targets" in the coalescing table.
721  DPRINTF(GPUCoalescer, "Creating new or aliased request for 0x%X\n",
722  line_addr);
723 
724  CoalescedRequest *creq = new CoalescedRequest(seqNum);
725  creq->insertPacket(pkt);
726  creq->setRubyType(getRequestType(pkt));
727  creq->setIssueTime(curCycle());
728 
729  if (!coalescedTable.count(line_addr)) {
730  // If there is no outstanding request for this line address,
731  // create a new coalecsed request and issue it immediately.
732  auto reqList = std::deque<CoalescedRequest*> { creq };
733  coalescedTable.insert(std::make_pair(line_addr, reqList));
734  if (!coalescedReqs.count(seqNum)) {
735  coalescedReqs.insert(std::make_pair(seqNum, reqList));
736  } else {
737  coalescedReqs.at(seqNum).push_back(creq);
738  }
739  } else {
740  // The request is for a line address that is already outstanding
741  // but for a different instruction. Add it as a new request to be
742  // issued when the current outstanding request is completed.
743  coalescedTable.at(line_addr).push_back(creq);
744  DPRINTF(GPUCoalescer, "found address 0x%X with new seqNum %d\n",
745  line_addr, seqNum);
746  }
747 
748  // In both cases, requests are added to the coalescing table and will
749  // be counted as outstanding requests.
751 
752  // We track all issued or to-be-issued Ruby requests associated with
753  // write instructions. An instruction may have multiple Ruby
754  // requests.
755  if (pkt->cmd == MemCmd::WriteReq) {
756  DPRINTF(GPUCoalescer, "adding write inst %d at line 0x%x to"
757  " the pending write instruction list\n", seqNum,
758  line_addr);
759 
761  safe_cast<RubyPort::SenderState*>(pkt->senderState);
762 
763  // we need to save this port because it will be used to call
764  // back the requesting CU when we receive write
765  // complete callbacks for all issued Ruby requests of this
766  // instruction.
767  RubyPort::MemResponsePort* mem_response_port = ss->port;
768 
769  GPUDynInstPtr gpuDynInst = nullptr;
770 
771  if (!m_usingRubyTester) {
772  // If this coalescer is connected to a real CU, we need
773  // to save the corresponding gpu dynamic instruction.
774  // CU will use that instruction to decrement wait counters
775  // in the issuing wavefront.
776  // For Ruby tester, gpuDynInst == nullptr
777  gpuDynInst = getDynInst(pkt);
778  }
779 
780  PendingWriteInst& inst = pendingWriteInsts[seqNum];
781  inst.addPendingReq(mem_response_port, gpuDynInst,
783  }
784 
785  return true;
786  }
787 
788  // The maximum number of outstanding requests have been issued.
789  return false;
790 }
791 
792 void
794 {
795  // Iterate over the maximum number of instructions we can coalesce
796  // per cycle (coalescingWindow).
797  for (int instIdx = 0; instIdx < coalescingWindow; ++instIdx) {
798  PerInstPackets *pkt_list =
800 
801  // getInstPackets will return nullptr if no instruction
802  // exists at the current offset.
803  if (!pkt_list) {
804  break;
805  } else if (pkt_list->empty()) {
806  // Found something, but it has not been cleaned up by update
807  // resources yet. See if there is anything else to coalesce.
808  // Assume we can't check anymore if the coalescing window is 1.
809  continue;
810  } else {
811  // All packets in the list have the same seqNum, use first.
812  InstSeqNum seq_num = pkt_list->front()->req->getReqInstSeqNum();
813 
814  // The difference in list size before and after tells us the
815  // number of packets which were coalesced.
816  size_t pkt_list_size = pkt_list->size();
817 
818  // Since we have a pointer to the list of packets in the inst,
819  // erase them from the list if coalescing is successful and
820  // leave them in the list otherwise. This aggressively attempts
821  // to coalesce as many packets as possible from the current inst.
822  pkt_list->remove_if(
823  [&](PacketPtr pkt) { return coalescePacket(pkt); }
824  );
825 
826  if (coalescedReqs.count(seq_num)) {
827  auto& creqs = coalescedReqs.at(seq_num);
828  for (auto creq : creqs) {
829  DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n",
830  RubyRequestType_to_string(creq->getRubyType()),
831  seq_num);
832  issueRequest(creq);
833  }
834  coalescedReqs.erase(seq_num);
835  }
836 
837  assert(pkt_list_size >= pkt_list->size());
838  size_t pkt_list_diff = pkt_list_size - pkt_list->size();
839 
840  int num_remaining = uncoalescedTable.getPacketsRemaining(seq_num);
841  num_remaining -= pkt_list_diff;
842  assert(num_remaining >= 0);
843 
844  uncoalescedTable.setPacketsRemaining(seq_num, num_remaining);
846  "Coalesced %d pkts for seqNum %d, %d remaining\n",
847  pkt_list_diff, seq_num, num_remaining);
848  }
849  }
850 
851  // Clean up any instructions in the uncoalesced table that have had
852  // all of their packets coalesced and return a token for that column.
854 
855  // have Kernel End releases been issued this cycle
856  int len = newKernelEnds.size();
857  for (int i = 0; i < len; i++) {
859  }
860  newKernelEnds.clear();
861 }
862 
863 void
865 {
866  ruby_eviction_callback(address);
867 }
868 
869 void
871 {
872  assert(kernelEndList.count(wavefront_id));
873 
874  ruby_hit_callback(kernelEndList[wavefront_id]);
875 
876  kernelEndList.erase(wavefront_id);
877 }
878 
879 void
881  MachineType mach,
882  const DataBlock& data)
883 {
884  assert(address == makeLineAddress(address));
885  assert(coalescedTable.count(address));
886 
887  auto crequest = coalescedTable.at(address).front();
888 
889  fatal_if((crequest->getRubyType() != RubyRequestType_ATOMIC &&
890  crequest->getRubyType() != RubyRequestType_ATOMIC_RETURN &&
891  crequest->getRubyType() != RubyRequestType_ATOMIC_NO_RETURN),
892  "atomicCallback saw non-atomic type response\n");
893 
894  hitCallback(crequest, mach, (DataBlock&)data, true,
895  crequest->getIssueTime(), Cycles(0), Cycles(0), false);
896 
897  delete crequest;
898  coalescedTable.at(address).pop_front();
899 
900  if (coalescedTable.at(address).empty()) {
901  coalescedTable.erase(address);
902  } else {
903  auto nextRequest = coalescedTable.at(address).front();
904  issueRequest(nextRequest);
905  }
906 }
907 
908 void
910 {
911  for (auto& pkt : mylist) {
913  safe_cast<RubyPort::SenderState *>(pkt->senderState);
914  MemResponsePort *port = ss->port;
915  assert(port != NULL);
916 
917  pkt->senderState = ss->predecessor;
918 
919  if (pkt->cmd != MemCmd::WriteReq) {
920  // for WriteReq, we keep the original senderState until
921  // writeCompleteCallback
922  delete ss;
923  }
924 
925  port->hitCallback(pkt);
926  trySendRetries();
927  }
928 
929  // We schedule an event in the same tick as hitCallback (similar to
930  // makeRequest) rather than calling completeIssue directly to reduce
931  // function calls to complete issue. This can only happen if the max
932  // outstanding requests is less than the number of slots in the
933  // uncoalesced table and makeRequest is not called again.
936  }
937 
939 }
940 
941 void
943  MachineType mach,
944  Cycles initialRequestTime,
945  Cycles forwardRequestTime,
946  Cycles firstResponseTime,
947  bool success, bool isRegion)
948 {
949 }
950 
GPUCoalescer::m_deadlock_threshold
Cycles m_deadlock_threshold
Definition: GPUCoalescer.hh:412
Event::scheduled
bool scheduled() const
Determine if the current event is scheduled.
Definition: eventq.hh:462
GPUCoalescer::issueMemSyncRequest
virtual void issueMemSyncRequest(PacketPtr pkt)
Definition: GPUCoalescer.hh:377
warn
#define warn(...)
Definition: logging.hh:239
RubyPort::m_usingRubyTester
bool m_usingRubyTester
Definition: RubyPort.hh:192
RubyPort::ruby_hit_callback
void ruby_hit_callback(PacketPtr pkt)
Definition: RubyPort.cc:434
GPUCoalescer::m_latencyHist
Stats::Histogram m_latencyHist
Histogram for holding latency profile of all requests.
Definition: GPUCoalescer.hh:485
Profiler.hh
data
const char data[]
Definition: circlebuf.test.cc:47
shader.hh
UncoalescedTable::instPktsRemaining
std::map< InstSeqNum, int > instPktsRemaining
Definition: GPUCoalescer.hh:99
CoalescedRequest::getRubyType
RubyRequestType getRubyType() const
Definition: GPUCoalescer.hh:119
Packet::getAddr
Addr getAddr() const
Definition: packet.hh:755
makeLineAddress
Addr makeLineAddress(Addr addr)
Definition: Address.cc:54
CoalescedRequest::insertPacket
void insertPacket(PacketPtr pkt)
Definition: GPUCoalescer.hh:111
GPUCoalescer::m_missTypeMachLatencyHist
std::vector< std::vector< Stats::Histogram * > > m_missTypeMachLatencyHist
Definition: GPUCoalescer.hh:496
ArmISA::i
Bitfield< 7 > i
Definition: miscregs_types.hh:63
GPUCoalescer::writeCompleteCallback
void writeCompleteCallback(Addr address, uint64_t instSeqNum, MachineType mach)
Definition: GPUCoalescer.cc:427
PendingWriteInst::receiveWriteCompleteAck
bool receiveWriteCompleteAck()
Definition: GPUCoalescer.hh:161
GPUCoalescer::evictionCallback
void evictionCallback(Addr address)
Definition: GPUCoalescer.cc:864
GPUCoalescer::getGMTokenPort
GMTokenPort & getGMTokenPort()
Definition: GPUCoalescer.hh:334
UncoalescedTable::coalescer
GPUCoalescer * coalescer
Definition: GPUCoalescer.hh:91
GPUCoalescer
Definition: GPUCoalescer.hh:209
RubyPort::testDrainComplete
void testDrainComplete()
Definition: RubyPort.cc:483
Packet::isRead
bool isRead() const
Definition: packet.hh:557
GPUCoalescer::printRequestTable
void printRequestTable(std::stringstream &ss)
Definition: GPUCoalescer.cc:302
GPUCoalescer::Params
RubyGPUCoalescerParams Params
Definition: GPUCoalescer.hh:232
GPUCoalescer.hh
Tick
uint64_t Tick
Tick count type.
Definition: types.hh:59
GPUCoalescer::printProgress
void printProgress(std::ostream &out) const
Definition: GPUCoalescer.cc:351
AbstractController.hh
PortID
int16_t PortID
Port index/ID type, and a symbolic name for an invalid port id.
Definition: types.hh:243
UncoalescedTable::insertPacket
void insertPacket(PacketPtr pkt)
Definition: GPUCoalescer.cc:63
UncoalescedTable::getPacketsRemaining
int getPacketsRemaining(InstSeqNum seqNum)
Definition: GPUCoalescer.cc:87
GPUCoalescer::newKernelEnds
std::vector< int > newKernelEnds
Definition: GPUCoalescer.hh:446
Packet::req
RequestPtr req
A pointer to the original request.
Definition: packet.hh:341
RubyRequest.hh
GPUCoalescer::gmTokenPort
GMTokenPort gmTokenPort
Definition: GPUCoalescer.hh:523
std::vector< Stats::Histogram * >
GPUCoalescer::kernelCallback
void kernelCallback(int wavefront_id)
Definition: GPUCoalescer.cc:870
PendingWriteInst::getNumPendingStores
int getNumPendingStores()
Definition: GPUCoalescer.hh:191
GPUCoalescer::m_outstanding_count
int m_outstanding_count
Definition: GPUCoalescer.hh:443
Packet::getSize
unsigned getSize() const
Definition: packet.hh:765
X86ISA::count
count
Definition: misc.hh:703
GPUCoalescer::getPort
Port & getPort(const std::string &if_name, PortID idx=InvalidPortID) override
Get a port with a given name and index.
Definition: GPUCoalescer.cc:265
PendingWriteInst::addPendingReq
void addPendingReq(RubyPort::MemResponsePort *port, GPUDynInstPtr inst, bool usingRubyTester)
Definition: GPUCoalescer.hh:146
RubyPort::getPort
Port & getPort(const std::string &if_name, PortID idx=InvalidPortID) override
Get a port with a given name and index.
Definition: RubyPort.cc:94
PendingWriteInst
Definition: GPUCoalescer.hh:133
CoalescedRequest::getFirstPkt
PacketPtr getFirstPkt() const
Definition: GPUCoalescer.hh:117
RubyPort::getId
uint32_t getId()
Definition: RubyPort.hh:165
operator<<
std::ostream & operator<<(std::ostream &out, const std::unordered_map< KEY, VALUE > &map)
Definition: GPUCoalescer.cc:667
Packet::setData
void setData(const uint8_t *p)
Copy data into the packet from the provided pointer.
Definition: packet.hh:1226
DataBlock
Definition: DataBlock.hh:54
packet.hh
Stats::Histogram
A simple histogram stat.
Definition: statistics.hh:2126
GPUCoalescer::deadlockCheckEvent
EventFunctionWrapper deadlockCheckEvent
Definition: GPUCoalescer.hh:455
str.hh
MemCmd::WriteReq
@ WriteReq
Definition: packet.hh:86
GPUCoalescer::m_InitialToForwardDelayHist
std::vector< Stats::Histogram * > m_InitialToForwardDelayHist
Definition: GPUCoalescer.hh:500
GPUCoalescer::completeHitCallback
void completeHitCallback(std::vector< PacketPtr > &mylist)
Definition: GPUCoalescer.cc:909
ArmISA::j
Bitfield< 24 > j
Definition: miscregs_types.hh:54
RubyPort::MemResponsePort
Definition: RubyPort.hh:75
EventManager::schedule
void schedule(Event &event, Tick when)
Definition: eventq.hh:1016
UncoalescedTable::printRequestTable
void printRequestTable(std::stringstream &ss)
Definition: GPUCoalescer.cc:150
Clocked::cyclesToTicks
Tick cyclesToTicks(Cycles c) const
Definition: clocked_object.hh:224
GPUCoalescer::coalescedTable
std::map< Addr, std::deque< CoalescedRequest * > > coalescedTable
Definition: GPUCoalescer.hh:431
ArmISA::ss
Bitfield< 21 > ss
Definition: miscregs_types.hh:56
CacheMemory.hh
GPUCoalescer::m_IssueToInitialDelayHist
std::vector< Stats::Histogram * > m_IssueToInitialDelayHist
Histograms for recording the breakdown of miss latency.
Definition: GPUCoalescer.hh:499
UncoalescedTable::packetAvailable
bool packetAvailable()
Definition: GPUCoalescer.cc:73
GPUCoalescer::~GPUCoalescer
~GPUCoalescer()
Definition: GPUCoalescer.cc:260
GPUCoalescer::atomicCallback
virtual void atomicCallback(Addr address, MachineType mach, const DataBlock &data)
Definition: GPUCoalescer.cc:880
DPRINTF
#define DPRINTF(x,...)
Definition: trace.hh:237
GPUCoalescer::print
void print(std::ostream &out) const
Definition: GPUCoalescer.cc:678
DataBlock.hh
GPUCoalescer::wakeup
void wakeup()
Definition: GPUCoalescer.cc:276
RubyPort
Definition: RubyPort.hh:58
UncoalescedTable::setPacketsRemaining
void setPacketsRemaining(InstSeqNum seqNum, int count)
Definition: GPUCoalescer.cc:93
Port
Ports are used to interface objects to each other.
Definition: port.hh:56
RubyTester.hh
Clocked::curCycle
Cycles curCycle() const
Determine the current cycle, corresponding to a tick aligned to a clock edge.
Definition: clocked_object.hh:192
GPUCoalescer::m_typeLatencyHist
std::vector< Stats::Histogram * > m_typeLatencyHist
Definition: GPUCoalescer.hh:486
GPUCoalescer::getRequestType
virtual RubyRequestType getRequestType(PacketPtr pkt)
Definition: GPUCoalescer.cc:589
UncoalescedTable::UncoalescedTable
UncoalescedTable(GPUCoalescer *gc)
Definition: GPUCoalescer.cc:57
GPUCoalescer::pendingWriteInsts
std::unordered_map< uint64_t, PendingWriteInst > pendingWriteInsts
Definition: GPUCoalescer.hh:440
GPUCoalescer::insertKernel
void insertKernel(int wavefront_id, PacketPtr pkt)
Definition: GPUCoalescer.cc:357
GPUCoalescer::getDynInst
GPUDynInstPtr getDynInst(PacketPtr pkt) const
Definition: GPUCoalescer.cc:686
GPUCoalescer::issueEvent
EventFunctionWrapper issueEvent
Definition: GPUCoalescer.hh:408
GPUCoalescer::m_ForwardToFirstResponseDelayHist
std::vector< Stats::Histogram * > m_ForwardToFirstResponseDelayHist
Definition: GPUCoalescer.hh:501
ComputeUnit::DataPort::SenderState
Definition: compute_unit.hh:515
UncoalescedTable::instMap
std::map< InstSeqNum, PerInstPackets > instMap
Definition: GPUCoalescer.hh:97
GPUCoalescer::resetStats
void resetStats() override
Callback to reset stats.
Definition: GPUCoalescer.cc:328
RubySystem.hh
InstSeqNum
uint64_t InstSeqNum
Definition: inst_seq.hh:37
Addr
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition: types.hh:148
PendingWriteInst::ackWriteCompletion
void ackWriteCompletion(bool usingRubyTester)
Definition: GPUCoalescer.hh:170
name
const std::string & name()
Definition: trace.cc:48
GPUCoalescer::writeCallback
void writeCallback(Addr address, DataBlock &data)
Definition: GPUCoalescer.cc:371
Clocked::clockPeriod
Tick clockPeriod() const
Definition: clocked_object.hh:214
GPUCoalescer::makeRequest
RequestStatus makeRequest(PacketPtr pkt) override
Definition: GPUCoalescer.cc:617
GPUCoalescer::readCallback
void readCallback(Addr address, DataBlock &data)
Definition: GPUCoalescer.cc:457
GPUCoalescer::coalescePacket
bool coalescePacket(PacketPtr pkt)
Definition: GPUCoalescer.cc:699
SimObject::name
virtual const std::string name() const
Definition: sim_object.hh:182
ComputeUnit::DataPort::SenderState::_gpuDynInst
GPUDynInstPtr _gpuDynInst
Definition: compute_unit.hh:517
Packet::cmd
MemCmd cmd
The command field of the packet.
Definition: packet.hh:336
MessageBuffer.hh
GPUCoalescer::m_missTypeLatencyHist
std::vector< Stats::Histogram * > m_missTypeLatencyHist
Definition: GPUCoalescer.hh:491
GPUCoalescer::recordMissLatency
void recordMissLatency(CoalescedRequest *crequest, MachineType mach, Cycles initialRequestTime, Cycles forwardRequestTime, Cycles firstResponseTime, bool success, bool isRegion)
Definition: GPUCoalescer.cc:942
CoalescedRequest
Definition: GPUCoalescer.hh:102
MemCmd::MemSyncReq
@ MemSyncReq
Definition: packet.hh:116
GPUCoalescer::m_FirstResponseToCompletionDelayHist
std::vector< Stats::Histogram * > m_FirstResponseToCompletionDelayHist
Definition: GPUCoalescer.hh:502
Packet
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
Definition: packet.hh:258
std::deque
STL deque class.
Definition: stl.hh:44
GPUCoalescer::kernelEndList
std::unordered_map< int, PacketPtr > kernelEndList
Definition: GPUCoalescer.hh:445
getOffset
Addr getOffset(Addr addr)
Definition: Address.cc:48
GPUCoalescer::m_missMachLatencyHist
std::vector< Stats::Histogram * > m_missMachLatencyHist
Histograms for profiling the latencies for requests that required external messages.
Definition: GPUCoalescer.hh:495
ArmISA::len
Bitfield< 18, 16 > len
Definition: miscregs_types.hh:439
GPUCoalescer::m_max_outstanding_requests
int m_max_outstanding_requests
Definition: GPUCoalescer.hh:411
Stats::DistBase::reset
void reset()
Reset stat value to default.
Definition: statistics.hh:1347
RubyPort::MemResponsePort::hitCallback
void hitCallback(PacketPtr pkt)
Definition: RubyPort.cc:517
UncoalescedTable::initPacketsRemaining
void initPacketsRemaining(InstSeqNum seqNum, int count)
Definition: GPUCoalescer.cc:79
GPUDynInstPtr
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition: misc.hh:48
logging.hh
Cycles
Cycles is a wrapper class for representing cycle counts, i.e.
Definition: types.hh:79
Packet::isWrite
bool isWrite() const
Definition: packet.hh:558
GPUCoalescer::completeIssue
void completeIssue()
Definition: GPUCoalescer.cc:793
Packet::getPtr
T * getPtr()
get a pointer to the data ptr.
Definition: packet.hh:1158
UncoalescedTable::checkDeadlock
void checkDeadlock(Tick threshold)
Definition: GPUCoalescer.cc:161
X86ISA::type
type
Definition: misc.hh:727
ArmISA::c
Bitfield< 29 > c
Definition: miscregs_types.hh:50
GPUCoalescer::empty
bool empty() const
Definition: GPUCoalescer.cc:583
CoalescedRequest::getPackets
std::vector< PacketPtr > & getPackets()
Definition: GPUCoalescer.hh:120
curTick
Tick curTick()
The universal simulation clock.
Definition: cur_tick.hh:43
GPUCoalescer::issueRequest
virtual void issueRequest(CoalescedRequest *crequest)=0
GPUCoalescer::coalescingWindow
int coalescingWindow
Definition: GPUCoalescer.hh:419
Packet::isFlush
bool isFlush() const
Definition: packet.hh:586
Packet::senderState
SenderState * senderState
This packet's sender state.
Definition: packet.hh:509
MipsISA::p
Bitfield< 0 > p
Definition: pra_constants.hh:323
std::list
STL list class.
Definition: stl.hh:51
GPUCoalescer::m_missLatencyHist
Stats::Histogram m_missLatencyHist
Histogram for holding latency profile of all requests that miss in the controller connected to this s...
Definition: GPUCoalescer.hh:490
EventBase::Progress_Event_Pri
static const Priority Progress_Event_Pri
Progress events come at the end.
Definition: eventq.hh:223
RubyPort::trySendRetries
void trySendRetries()
Definition: RubyPort.cc:457
fatal_if
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
Definition: logging.hh:219
CoalescedRequest::setIssueTime
void setIssueTime(Cycles _issueTime)
Definition: GPUCoalescer.hh:113
GPUCoalescer::GPUCoalescer
GPUCoalescer(const Params &)
Definition: GPUCoalescer.cc:183
RubyPort::m_version
uint32_t m_version
Definition: RubyPort.hh:189
printAddress
std::string printAddress(Addr addr)
Definition: Address.cc:74
RubyPort::SenderState
Definition: RubyPort.hh:139
UncoalescedTable::getInstPackets
PerInstPackets * getInstPackets(int offset)
Definition: GPUCoalescer.cc:99
RubyPort::ruby_eviction_callback
void ruby_eviction_callback(Addr address)
Definition: RubyPort.cc:617
SubBlock.hh
UncoalescedTable::areRequestsDone
bool areRequestsDone(const InstSeqNum instSeqNum)
Definition: GPUCoalescer.cc:137
GPUCoalescer::coalescedReqs
std::unordered_map< uint64_t, std::deque< CoalescedRequest * > > coalescedReqs
Definition: GPUCoalescer.hh:435
TokenResponsePort::sendTokens
void sendTokens(int num_tokens)
Return num_tokens tokens back to the request port.
Definition: token_port.cc:77
CoalescedRequest::setRubyType
void setRubyType(RubyRequestType type)
Definition: GPUCoalescer.hh:114
panic
#define panic(...)
This implements a cprintf based panic() function.
Definition: logging.hh:171
UncoalescedTable::updateResources
void updateResources()
Definition: GPUCoalescer.cc:112
ArmISA::offset
Bitfield< 23, 0 > offset
Definition: types.hh:153
GPUCoalescer::uncoalescedTable
UncoalescedTable uncoalescedTable
Definition: GPUCoalescer.hh:424
GPUCoalescer::hitCallback
void hitCallback(CoalescedRequest *crequest, MachineType mach, DataBlock &data, bool success, Cycles initialRequestTime, Cycles forwardRequestTime, Cycles firstResponseTime, bool isRegion)
Definition: GPUCoalescer.cc:524

Generated on Tue Jun 22 2021 15:28:30 for gem5 by doxygen 1.8.17