gem5 v20.0.0.2
GPUCoalescer.cc
/*
 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "base/logging.hh"
#include "base/str.hh"
#include "config/the_isa.hh"

#if THE_ISA == X86_ISA
#include "arch/x86/insts/microldstop.hh"

#endif // X86_ISA
#include "mem/ruby/system/GPUCoalescer.hh"

#include "cpu/testers/rubytest/RubyTester.hh"
#include "debug/GPUCoalescer.hh"
#include "debug/MemoryAccess.hh"
#include "debug/ProtocolTrace.hh"
#include "debug/RubyPort.hh"
#include "debug/RubyStats.hh"
#include "gpu-compute/shader.hh"
#include "mem/packet.hh"
#include "mem/ruby/common/DataBlock.hh"
#include "mem/ruby/common/SubBlock.hh"
#include "mem/ruby/network/MessageBuffer.hh"
#include "mem/ruby/profiler/Profiler.hh"
#include "mem/ruby/slicc_interface/AbstractController.hh"
#include "mem/ruby/slicc_interface/RubyRequest.hh"
#include "mem/ruby/structures/CacheMemory.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "params/RubyGPUCoalescer.hh"

using namespace std;

GPUCoalescer *
RubyGPUCoalescerParams::create()
{
    return new GPUCoalescer(this);
}

HSAScope
reqScopeToHSAScope(const RequestPtr &req)
{
    HSAScope accessScope = HSAScope_UNSPECIFIED;
    if (req->isScoped()) {
        if (req->isWavefrontScope()) {
            accessScope = HSAScope_WAVEFRONT;
        } else if (req->isWorkgroupScope()) {
            accessScope = HSAScope_WORKGROUP;
        } else if (req->isDeviceScope()) {
            accessScope = HSAScope_DEVICE;
        } else if (req->isSystemScope()) {
            accessScope = HSAScope_SYSTEM;
        } else {
            fatal("Bad scope type");
        }
    }
    return accessScope;
}

HSASegment
reqSegmentToHSASegment(const RequestPtr &req)
{
    HSASegment accessSegment = HSASegment_GLOBAL;

    if (req->isGlobalSegment()) {
        accessSegment = HSASegment_GLOBAL;
    } else if (req->isGroupSegment()) {
        accessSegment = HSASegment_GROUP;
    } else if (req->isPrivateSegment()) {
        accessSegment = HSASegment_PRIVATE;
    } else if (req->isKernargSegment()) {
        accessSegment = HSASegment_KERNARG;
    } else if (req->isReadonlySegment()) {
        accessSegment = HSASegment_READONLY;
    } else if (req->isSpillSegment()) {
        accessSegment = HSASegment_SPILL;
    } else if (req->isArgSegment()) {
        accessSegment = HSASegment_ARG;
    } else {
        fatal("Bad segment type");
    }

    return accessSegment;
}

UncoalescedTable::UncoalescedTable(GPUCoalescer *gc)
    : coalescer(gc)
{
}

void
UncoalescedTable::insertPacket(PacketPtr pkt)
{
    uint64_t seqNum = pkt->req->getReqInstSeqNum();

    instMap[seqNum].push_back(pkt);
    DPRINTF(GPUCoalescer, "Adding 0x%X seqNum %d to map. (map %d vec %d)\n",
            pkt->getAddr(), seqNum, instMap.size(), instMap[seqNum].size());
}

bool
UncoalescedTable::packetAvailable()
{
    return !instMap.empty();
}

PerInstPackets*
UncoalescedTable::getInstPackets(int offset)
{
    if (offset >= instMap.size()) {
        return nullptr;
    }

    auto instMapIter = instMap.begin();
    std::advance(instMapIter, offset);

    return &(instMapIter->second);
}
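
// Note: instMap is a std::map keyed by the instruction's sequence number,
// so iteration is in ascending seqNum order and offset 0 always names the
// oldest in-flight instruction. A sketch of how completeIssue() walks it,
// with hypothetical sequence numbers:
//
//     instMap = { 17 -> [pkt, pkt], 21 -> [pkt], 30 -> [pkt, pkt, pkt] }
//     getInstPackets(0); // -> packets of inst 17 (oldest)
//     getInstPackets(2); // -> packets of inst 30
//     getInstPackets(3); // -> nullptr, only three instructions present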

void
UncoalescedTable::updateResources()
{
    for (auto iter = instMap.begin(); iter != instMap.end(); ) {
        if (iter->second.empty()) {
            instMap.erase(iter++);
            coalescer->getGMTokenPort().sendTokens(1);
        } else {
            ++iter;
        }
    }
}
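
// Note: erasing an instruction whose packet list has been fully coalesced
// returns one token through the coalescer's GMTokenPort (see
// getGMTokenPort() and sendTokens() above). The token count thus
// effectively bounds how many memory instructions can be resident in this
// table at once; the exact shader-side use of those tokens is outside this
// file.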

void
UncoalescedTable::printRequestTable(std::stringstream& ss)
{
    ss << "UncoalescedTable contains " << instMap.size()
       << " instruction entries." << std::endl;
    for (auto& inst : instMap) {
        ss << "seqNum " << inst.first
           << " with " << inst.second.size() << " packets"
           << std::endl;
    }
}

void
UncoalescedTable::checkDeadlock(Tick threshold)
{
    Tick current_time = curTick();

    for (auto &it : instMap) {
        for (auto &pkt : it.second) {
            if (current_time - pkt->req->time() > threshold) {
                std::stringstream ss;
                printRequestTable(ss);

                panic("Possible Deadlock detected. Aborting!\n"
                      "version: %d request.paddr: 0x%x uncoalescedTable: %d "
                      "current time: %u issue_time: %d difference: %d\n"
                      "Request Tables:\n\n%s", coalescer->getId(),
                      pkt->getAddr(), instMap.size(), current_time,
                      pkt->req->time(), current_time - pkt->req->time(),
                      ss.str());
            }
        }
    }
}

GPUCoalescer::GPUCoalescer(const Params *p)
    : RubyPort(p),
      issueEvent([this]{ completeIssue(); }, "Issue coalesced request",
                 false, Event::Progress_Event_Pri),
      uncoalescedTable(this),
      deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check"),
      gmTokenPort(name() + ".gmTokenPort", this)
{
    m_store_waiting_on_load_cycles = 0;
    m_store_waiting_on_store_cycles = 0;
    m_load_waiting_on_store_cycles = 0;
    m_load_waiting_on_load_cycles = 0;

    m_outstanding_count = 0;

    coalescingWindow = p->max_coalesces_per_cycle;

    m_max_outstanding_requests = 0;
    m_instCache_ptr = nullptr;
    m_dataCache_ptr = nullptr;

    m_instCache_ptr = p->icache;
    m_dataCache_ptr = p->dcache;
    m_max_outstanding_requests = p->max_outstanding_requests;
    m_deadlock_threshold = p->deadlock_threshold;

    assert(m_max_outstanding_requests > 0);
    assert(m_deadlock_threshold > 0);
    assert(m_instCache_ptr);
    assert(m_dataCache_ptr);

    m_runningGarnetStandalone = p->garnet_standalone;
    assumingRfOCoherence = p->assume_rfo;
}
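
// The parameters read above come from the RubyGPUCoalescer SimObject. A
// hypothetical Python configuration sketch (the values and the
// tcp_cntrl.L1cache reference are illustrative only, not gem5 defaults):
//
//     coalescer = RubyGPUCoalescer(icache = tcp_cntrl.L1cache,
//                                  dcache = tcp_cntrl.L1cache,
//                                  max_outstanding_requests = 256,
//                                  deadlock_threshold = 500000,
//                                  max_coalesces_per_cycle = 1,
//                                  assume_rfo = True,
//                                  garnet_standalone = False)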

GPUCoalescer::~GPUCoalescer()
{
}

Port &
GPUCoalescer::getPort(const std::string &if_name, PortID idx)
{
    if (if_name == "gmTokenPort") {
        return gmTokenPort;
    }

    // delegate to RubyPort otherwise
    return RubyPort::getPort(if_name, idx);
}

void
GPUCoalescer::wakeup()
{
    Cycles current_time = curCycle();
    for (auto& requestList : coalescedTable) {
        for (auto& req : requestList.second) {
            if (current_time - req->getIssueTime() > m_deadlock_threshold) {
                std::stringstream ss;
                printRequestTable(ss);
                ss << "Outstanding requests: " << m_outstanding_count
                   << std::endl;

                panic("Possible Deadlock detected. Aborting!\n"
                      "version: %d request.paddr: 0x%x coalescedTable: %d "
                      "current time: %u issue_time: %d difference: %d\n"
                      "Request Tables:\n %s", m_version,
                      req->getFirstPkt()->getAddr(),
                      coalescedTable.size(), cyclesToTicks(current_time),
                      cyclesToTicks(req->getIssueTime()),
                      cyclesToTicks(current_time - req->getIssueTime()),
                      ss.str());
            }
        }
    }

    Tick tick_threshold = cyclesToTicks(m_deadlock_threshold);
    uncoalescedTable.checkDeadlock(tick_threshold);

    if (m_outstanding_count > 0) {
        // Requests are still outstanding; schedule the next check.
        schedule(deadlockCheckEvent,
                 m_deadlock_threshold * clockPeriod() +
                 curTick());
    }
}
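
// Note: despite its name, wakeup() acts as a watchdog. It sweeps both
// request tables for entries older than m_deadlock_threshold cycles and,
// while any request remains outstanding, re-schedules itself
// m_deadlock_threshold cycles into the future.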

void
GPUCoalescer::printRequestTable(std::stringstream& ss)
{
    uncoalescedTable.printRequestTable(ss);

    ss << "CoalescedTable contains " << coalescedTable.size()
       << " address entries." << std::endl;
    for (auto& requestList : coalescedTable) {
        ss << "Addr 0x" << std::hex << requestList.first << std::dec
           << ": type-";
        for (auto& request : requestList.second) {
            ss << RubyRequestType_to_string(request->getRubyType())
               << " pkts-" << request->getPackets().size()
               << " issued-" << request->getIssueTime() << " seqNum-"
               << request->getSeqNum() << "; ";
        }
        ss << std::endl;
    }
}

void
GPUCoalescer::resetStats()
{
    m_latencyHist.reset();
    m_missLatencyHist.reset();
    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist[i]->reset();
        m_missTypeLatencyHist[i]->reset();
        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i][j]->reset();
        }
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist[i]->reset();

        m_IssueToInitialDelayHist[i]->reset();
        m_InitialToForwardDelayHist[i]->reset();
        m_ForwardToFirstResponseDelayHist[i]->reset();
        m_FirstResponseToCompletionDelayHist[i]->reset();
    }
}

void
GPUCoalescer::printProgress(ostream& out) const
{
}

// sets the kernelEndList
void
GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
{
    // It is unclear whether a duplicate entry can actually occur here,
    // but be careful so that one cannot turn into a simulator hang later.
    DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
    assert(kernelEndList.count(wavefront_id) == 0);

    kernelEndList[wavefront_id] = pkt;
    DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
            kernelEndList.size());
}

void
GPUCoalescer::writeCallback(Addr address, DataBlock& data)
{
    writeCallback(address, MachineType_NULL, data);
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data)
{
    writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime)
{
    writeCallback(address, mach, data,
                  initialRequestTime, forwardRequestTime, firstResponseTime,
                  false);
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime,
                            bool isRegion)
{
    assert(address == makeLineAddress(address));
    assert(coalescedTable.count(address));

    auto crequest = coalescedTable.at(address).front();

    hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
                forwardRequestTime, firstResponseTime, isRegion);

    delete crequest;
    coalescedTable.at(address).pop_front();

    if (coalescedTable.at(address).empty()) {
        coalescedTable.erase(address);
    } else {
        auto nextRequest = coalescedTable.at(address).front();
        issueRequest(nextRequest);
    }
}

void
GPUCoalescer::readCallback(Addr address, DataBlock& data)
{
    readCallback(address, MachineType_NULL, data);
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data)
{
    readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime)
{

    readCallback(address, mach, data,
                 initialRequestTime, forwardRequestTime, firstResponseTime,
                 false);
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime,
                           bool isRegion)
{
    assert(address == makeLineAddress(address));
    assert(coalescedTable.count(address));

    auto crequest = coalescedTable.at(address).front();
    fatal_if(crequest->getRubyType() != RubyRequestType_LD,
             "readCallback received non-read type response\n");

    // Iterate over the coalesced requests to respond to as many loads as
    // possible until another request type is seen. Models MSHR for TCP.
    while (crequest->getRubyType() == RubyRequestType_LD) {
        hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
                    forwardRequestTime, firstResponseTime, isRegion);

        delete crequest;
        coalescedTable.at(address).pop_front();
        if (coalescedTable.at(address).empty()) {
            break;
        }

        crequest = coalescedTable.at(address).front();
    }

    if (coalescedTable.at(address).empty()) {
        coalescedTable.erase(address);
    } else {
        auto nextRequest = coalescedTable.at(address).front();
        issueRequest(nextRequest);
    }
}
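
// Note: the loop above drains consecutive loads at the head of the queue
// for this line, which is what "models MSHR for TCP" refers to. As a
// hypothetical example, if the line's queue holds [LD, LD, ST], one data
// response completes both loads and then re-issues the store; a queue of
// [ST, LD] instead goes through writeCallback, which pops only the store
// before issuing the load.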

void
GPUCoalescer::hitCallback(CoalescedRequest* crequest,
                          MachineType mach,
                          DataBlock& data,
                          bool success,
                          Cycles initialRequestTime,
                          Cycles forwardRequestTime,
                          Cycles firstResponseTime,
                          bool isRegion)
{
    PacketPtr pkt = crequest->getFirstPkt();
    Addr request_address = pkt->getAddr();
    Addr request_line_address = makeLineAddress(request_address);

    RubyRequestType type = crequest->getRubyType();

    DPRINTF(GPUCoalescer, "Got hitCallback for 0x%X\n", request_line_address);

    recordMissLatency(crequest, mach,
                      initialRequestTime,
                      forwardRequestTime,
                      firstResponseTime,
                      success, isRegion);
    // update the data
    //
    // this must be done for each request in the coalescer
    std::vector<PacketPtr> pktList = crequest->getPackets();
    DPRINTF(GPUCoalescer, "Responding to %d packets for addr 0x%X\n",
            pktList.size(), request_line_address);
    for (auto& pkt : pktList) {
        request_address = pkt->getAddr();
        if (pkt->getPtr<uint8_t>()) {
            if ((type == RubyRequestType_LD) ||
                (type == RubyRequestType_ATOMIC) ||
                (type == RubyRequestType_ATOMIC_RETURN) ||
                (type == RubyRequestType_IFETCH) ||
                (type == RubyRequestType_RMW_Read) ||
                (type == RubyRequestType_Locked_RMW_Read) ||
                (type == RubyRequestType_Load_Linked)) {
                pkt->setData(
                    data.getData(getOffset(request_address), pkt->getSize()));
            } else {
                data.setData(pkt->getPtr<uint8_t>(),
                             getOffset(request_address), pkt->getSize());
            }
        } else {
            DPRINTF(MemoryAccess,
                    "WARNING. Data not transferred from Ruby to M5 for type "
                    "%s\n",
                    RubyRequestType_to_string(type));
        }

        // If using the RubyTester, update the RubyTester sender state's
        // subBlock with the received data. The tester will later access
        // this state.
        // Note: RubyPort will access its sender state before the
        // RubyTester.
        if (m_usingRubyTester) {
            RubyPort::SenderState *requestSenderState =
                safe_cast<RubyPort::SenderState*>(pkt->senderState);
            RubyTester::SenderState* testerSenderState =
                safe_cast<RubyTester::SenderState*>
                    (requestSenderState->predecessor);
            testerSenderState->subBlock.mergeFrom(data);
        }
    }

    m_outstanding_count--;
    assert(m_outstanding_count >= 0);

    completeHitCallback(pktList);
}
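
// Note: the data movement direction above depends on the request type.
// Loads, ifetches, and returning atomics copy bytes from the Ruby
// DataBlock into each coalesced packet at that packet's offset within the
// line; stores copy the packet's bytes into the DataBlock. Every packet
// that was coalesced into this request is answered from the single
// line-sized response.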

bool
GPUCoalescer::empty() const
{
    return coalescedTable.empty();
}

RubyRequestType
GPUCoalescer::getRequestType(PacketPtr pkt)
{
    RubyRequestType req_type = RubyRequestType_NULL;

    // These types are not supported or not used in GPU caches.
    assert(!pkt->req->isLLSC());
    assert(!pkt->req->isLockedRMW());
    assert(!pkt->req->isInstFetch());
    assert(!pkt->isFlush());

    if (pkt->req->isAtomicReturn()) {
        req_type = RubyRequestType_ATOMIC_RETURN;
    } else if (pkt->req->isAtomicNoReturn()) {
        req_type = RubyRequestType_ATOMIC_NO_RETURN;
    } else if (pkt->isRead()) {
        req_type = RubyRequestType_LD;
    } else if (pkt->isWrite()) {
        req_type = RubyRequestType_ST;
    } else {
        // Acquire and release packets will have been issued by
        // makeRequest, so we do not need to check for them here.
        panic("Unsupported ruby packet type\n");
    }

    return req_type;
}

// Places an uncoalesced packet in uncoalescedTable. If the packet is a
// special type (MemFence, scoping, etc), it is issued immediately.
RequestStatus
GPUCoalescer::makeRequest(PacketPtr pkt)
{
    // Check for GPU Barrier Kernel End or Kernel Begin
    // Leave these to be handled by the child class
    // Kernel End/Barrier = isFlush + isRelease
    // Kernel Begin = isFlush + isAcquire
    if (pkt->req->isKernel()) {
        if (pkt->req->isAcquire()) {
            // This is a kernel begin; leave handling to the
            // virtual xCoalescer::makeRequest
            return RequestStatus_Issued;
        } else if (pkt->req->isRelease()) {
            // This is a kernel end; leave handling to the
            // virtual xCoalescer::makeRequest.
            // If we are here, then no virtual override of this
            // function was called, so we also schedule the callback.
            int wf_id = 0;
            if (pkt->req->hasContextId()) {
                wf_id = pkt->req->contextId();
            }
            insertKernel(wf_id, pkt);
            newKernelEnds.push_back(wf_id);
            if (!issueEvent.scheduled()) {
                schedule(issueEvent, curTick());
            }
            return RequestStatus_Issued;
        }
    }

    if (!pkt->isLLSC() && !pkt->req->isLockedRMW() && !pkt->isAtomicOp() &&
        !pkt->isRead() && !pkt->isWrite() && !pkt->isFlush() &&
        (pkt->req->isRelease() || pkt->req->isAcquire())) {
        if (assumingRfOCoherence) {
            // If we reached here, this request must be a memFence
            // and the protocol implements RfO; the coalescer can
            // assume sequential consistency and schedule the callback
            // immediately.
            // Currently the code implements fence callbacks
            // by reusing the mechanism for kernel completions.
            // This should be fixed.
            int wf_id = 0;
            if (pkt->req->hasContextId()) {
                wf_id = pkt->req->contextId();
            }
            insertKernel(wf_id, pkt);
            newKernelEnds.push_back(wf_id);
            if (!issueEvent.scheduled()) {
                schedule(issueEvent, curTick());
            }
            return RequestStatus_Issued;
        } else {
            // If not RfO, return issued here and let the child coalescer
            // take care of it.
            return RequestStatus_Issued;
        }
    }

    uncoalescedTable.insertPacket(pkt);
    DPRINTF(GPUCoalescer, "UC insertPacket 0x%X\n", pkt->getAddr());

    if (!issueEvent.scheduled())
        schedule(issueEvent, curTick());
    // TODO: issue hardware prefetches here
    return RequestStatus_Issued;
}
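
// Note on the special-packet encodings handled above: a kernel begin is a
// packet with isFlush() + isAcquire() set, a kernel end is isFlush() +
// isRelease(), and a memory fence arrives with no read/write/atomic flags
// but with acquire or release semantics. Only plain reads, writes, and
// atomics reach the uncoalescedTable; everything else is answered or
// delegated immediately.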

void
GPUCoalescer::issueRequest(CoalescedRequest* crequest)
{
    PacketPtr pkt = crequest->getFirstPkt();

    int proc_id = -1;
    if (pkt != NULL && pkt->req->hasContextId()) {
        proc_id = pkt->req->contextId();
    }

    // If valid, copy the pc to the ruby request
    Addr pc = 0;
    if (pkt->req->hasPC()) {
        pc = pkt->req->getPC();
    }

    // At the moment, setting scopes only counts for GPU spill space
    // accesses (i.e., pkt->req->isStack()). This scope is REPLACE because
    // it does not need to be flushed at the end of a kernel; private and
    // local may need to be visible at the end of the kernel.
    HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
    HSAScope accessScope = reqScopeToHSAScope(pkt->req);

    Addr line_addr = makeLineAddress(pkt->getAddr());

    // Creating WriteMask that records written bytes
    // and atomic operations. This enables partial writes
    // and partial reads of those writes
    DataBlock dataBlock;
    dataBlock.clear();
    uint32_t blockSize = RubySystem::getBlockSizeBytes();
    std::vector<bool> accessMask(blockSize, false);
    std::vector< std::pair<int, AtomicOpFunctor*> > atomicOps;
    uint32_t tableSize = crequest->getPackets().size();
    for (int i = 0; i < tableSize; i++) {
        PacketPtr tmpPkt = crequest->getPackets()[i];
        uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
        uint32_t tmpSize = tmpPkt->getSize();
        if (tmpPkt->isAtomicOp()) {
            std::pair<int, AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
                                                          tmpPkt->getAtomicOp());
            atomicOps.push_back(tmpAtomicOp);
        } else if (tmpPkt->isWrite()) {
            dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
                              tmpOffset, tmpSize);
        }
        for (int j = 0; j < tmpSize; j++) {
            accessMask[tmpOffset + j] = true;
        }
    }
    std::shared_ptr<RubyRequest> msg;
    if (pkt->isAtomicOp()) {
        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                                            pkt->getPtr<uint8_t>(),
                                            pkt->getSize(), pc,
                                            crequest->getRubyType(),
                                            RubyAccessMode_Supervisor, pkt,
                                            PrefetchBit_No, proc_id, 100,
                                            blockSize, accessMask,
                                            dataBlock, atomicOps,
                                            accessScope, accessSegment);
    } else {
        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                                            pkt->getPtr<uint8_t>(),
                                            pkt->getSize(), pc,
                                            crequest->getRubyType(),
                                            RubyAccessMode_Supervisor, pkt,
                                            PrefetchBit_No, proc_id, 100,
                                            blockSize, accessMask,
                                            dataBlock,
                                            accessScope, accessSegment);
    }
    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
             curTick(), m_version, "Coal", "Begin", "", "",
             printAddress(msg->getPhysicalAddress()),
             RubyRequestType_to_string(crequest->getRubyType()));

    fatal_if(crequest->getRubyType() == RubyRequestType_IFETCH,
             "there should not be any I-Fetch requests in the GPU Coalescer");

    Tick latency = cyclesToTicks(
        m_controller->mandatoryQueueLatency(crequest->getRubyType()));
    assert(latency > 0);

    if (!deadlockCheckEvent.scheduled()) {
        schedule(deadlockCheckEvent,
                 m_deadlock_threshold * clockPeriod() +
                 curTick());
    }

    assert(m_mandatory_q_ptr);
    m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
}
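
// Note: the access mask built above is what enables partial-line
// operations. A worked example with hypothetical addresses and 64-byte
// lines: coalescing a 4-byte write to 0x1000 and a 4-byte write to 0x1008
// into line 0x1000 sets accessMask[0..3] and accessMask[8..11] to true and
// copies both payloads into dataBlock at offsets 0 and 8, so the protocol
// can apply only the bytes that were actually written.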

template <class KEY, class VALUE>
std::ostream &
operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
{
    out << "[";
    for (auto i = map.begin(); i != map.end(); ++i)
        out << " " << i->first << "=" << i->second;
    out << " ]";

    return out;
}

void
GPUCoalescer::print(ostream& out) const
{
    out << "[GPUCoalescer: " << m_version
        << ", outstanding requests: " << m_outstanding_count
        << "]";
}


void
GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
    DPRINTF(RubyStats, "Recorded statistic: %s\n",
            SequencerRequestType_to_string(requestType));
}

bool
GPUCoalescer::coalescePacket(PacketPtr pkt)
{
    uint64_t seqNum = pkt->req->getReqInstSeqNum();
    Addr line_addr = makeLineAddress(pkt->getAddr());

    // If the packet has the same line address as a request already in the
    // coalescedTable and has the same sequence number, it can be coalesced.
    if (coalescedTable.count(line_addr)) {
        // Search for a previous coalesced request with the same seqNum.
        auto& creqQueue = coalescedTable.at(line_addr);
        auto citer = std::find_if(creqQueue.begin(), creqQueue.end(),
            [&](CoalescedRequest* c) { return c->getSeqNum() == seqNum; }
        );
        if (citer != creqQueue.end()) {
            (*citer)->insertPacket(pkt);
            return true;
        }
    }

    if (m_outstanding_count < m_max_outstanding_requests) {
        // This is an "aliased" or new request. Create a RubyRequest and
        // append it to the list of "targets" in the coalescing table.
        DPRINTF(GPUCoalescer, "Creating new or aliased request for 0x%X\n",
                line_addr);

        CoalescedRequest *creq = new CoalescedRequest(seqNum);
        creq->insertPacket(pkt);
        creq->setRubyType(getRequestType(pkt));
        creq->setIssueTime(curCycle());

        if (!coalescedTable.count(line_addr)) {
            // If there is no outstanding request for this line address,
            // create a new coalesced request and issue it immediately.
            auto reqList = std::deque<CoalescedRequest*> { creq };
            coalescedTable.insert(std::make_pair(line_addr, reqList));

            DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n",
                    RubyRequestType_to_string(creq->getRubyType()), seqNum);
            issueRequest(creq);
        } else {
            // The request is for a line address that is already outstanding
            // but for a different instruction. Add it as a new request to be
            // issued when the current outstanding request is completed.
            coalescedTable.at(line_addr).push_back(creq);
            DPRINTF(GPUCoalescer, "found address 0x%X with new seqNum %d\n",
                    line_addr, seqNum);
        }

        // In both cases, requests are added to the coalescing table and will
        // be counted as outstanding requests.
        m_outstanding_count++;

        return true;
    }

    // The maximum number of outstanding requests has been reached;
    // the packet cannot be accepted this cycle.
    return false;
}
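
// Note: the rules above give three outcomes for an incoming packet. As a
// hypothetical example, for a packet from inst seqNum 7 touching line
// 0x1000: (1) if a CoalescedRequest for line 0x1000 with seqNum 7 already
// exists, the packet joins it; (2) otherwise, if the outstanding-request
// limit is not reached, a new CoalescedRequest is created and either
// issued now (line not outstanding) or queued behind the line's current
// request ("aliased"); (3) if the limit is reached, the packet stays in
// the uncoalescedTable and is retried on a later cycle.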

void
GPUCoalescer::completeIssue()
{
    // Iterate over the maximum number of instructions we can coalesce
    // per cycle (coalescingWindow).
    for (int instIdx = 0; instIdx < coalescingWindow; ++instIdx) {
        PerInstPackets *pktList =
            uncoalescedTable.getInstPackets(instIdx);

        // getInstPackets will return nullptr if no instruction
        // exists at the current offset.
        if (!pktList) {
            break;
        } else {
            // Since we have a pointer to the list of packets in the inst,
            // erase them from the list if coalescing is successful and
            // leave them in the list otherwise. This aggressively attempts
            // to coalesce as many packets as possible from the current inst.
            pktList->remove_if(
                [&](PacketPtr pkt) { return coalescePacket(pkt); }
            );
        }
    }

    // Clean up any instructions in the uncoalesced table that have had
    // all of their packets coalesced and return a token for that column.
    uncoalescedTable.updateResources();

    // have Kernel End releases been issued this cycle
    int len = newKernelEnds.size();
    for (int i = 0; i < len; i++) {
        kernelCallback(newKernelEnds[i]);
    }
    newKernelEnds.clear();
}

void
GPUCoalescer::evictionCallback(Addr address)
{
    ruby_eviction_callback(address);
}

void
GPUCoalescer::kernelCallback(int wavefront_id)
{
    assert(kernelEndList.count(wavefront_id));

    ruby_hit_callback(kernelEndList[wavefront_id]);

    kernelEndList.erase(wavefront_id);
}

void
GPUCoalescer::atomicCallback(Addr address,
                             MachineType mach,
                             const DataBlock& data)
{
    assert(address == makeLineAddress(address));
    assert(coalescedTable.count(address));

    auto crequest = coalescedTable.at(address).front();

    fatal_if((crequest->getRubyType() != RubyRequestType_ATOMIC &&
              crequest->getRubyType() != RubyRequestType_ATOMIC_RETURN &&
              crequest->getRubyType() != RubyRequestType_ATOMIC_NO_RETURN),
             "atomicCallback saw non-atomic type response\n");

    hitCallback(crequest, mach, (DataBlock&)data, true,
                crequest->getIssueTime(), Cycles(0), Cycles(0), false);

    delete crequest;
    coalescedTable.at(address).pop_front();

    if (coalescedTable.at(address).empty()) {
        coalescedTable.erase(address);
    } else {
        auto nextRequest = coalescedTable.at(address).front();
        issueRequest(nextRequest);
    }
}

void
GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
{
    if (myMachID == senderMachID) {
        CP_TCPLdHits++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
        CP_TCPLdTransfers++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
        CP_TCCLdHits++;
    } else {
        CP_LdMiss++;
    }
}

void
GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
{
    if (myMachID == senderMachID) {
        CP_TCPStHits++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
        CP_TCPStTransfers++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
        CP_TCCStHits++;
    } else {
        CP_StMiss++;
    }
}

void
GPUCoalescer::completeHitCallback(std::vector<PacketPtr>& mylist)
{
    for (auto& pkt : mylist) {
        RubyPort::SenderState *ss =
            safe_cast<RubyPort::SenderState *>(pkt->senderState);
        MemSlavePort *port = ss->port;
        assert(port != NULL);

        pkt->senderState = ss->predecessor;
        delete ss;
        port->hitCallback(pkt);
        trySendRetries();
    }

    // We schedule an event in the same tick as hitCallback (similar to
    // makeRequest) rather than calling completeIssue directly to reduce
    // function calls to complete issue. This can only happen if the max
    // outstanding requests is less than the number of slots in the
    // uncoalesced table and makeRequest is not called again.
    if (uncoalescedTable.packetAvailable() && !issueEvent.scheduled()) {
        schedule(issueEvent, curTick());
    }

    testDrainComplete();
}

void
GPUCoalescer::recordMissLatency(CoalescedRequest* crequest,
                                MachineType mach,
                                Cycles initialRequestTime,
                                Cycles forwardRequestTime,
                                Cycles firstResponseTime,
                                bool success, bool isRegion)
{
    RubyRequestType type = crequest->getRubyType();
    Cycles issued_time = crequest->getIssueTime();
    Cycles completion_time = curCycle();
    assert(completion_time >= issued_time);
    Cycles total_lat = completion_time - issued_time;

    // cache stats (valid for RfO protocol only)
    if (mach == MachineType_TCP) {
        if (type == RubyRequestType_LD) {
            GPU_TCPLdHits++;
        } else {
            GPU_TCPStHits++;
        }
    } else if (mach == MachineType_L1Cache_wCC) {
        if (type == RubyRequestType_LD) {
            GPU_TCPLdTransfers++;
        } else {
            GPU_TCPStTransfers++;
        }
    } else if (mach == MachineType_TCC) {
        if (type == RubyRequestType_LD) {
            GPU_TCCLdHits++;
        } else {
            GPU_TCCStHits++;
        }
    } else {
        if (type == RubyRequestType_LD) {
            GPU_LdMiss++;
        } else {
            GPU_StMiss++;
        }
    }

    // Profile all access latency, even zero latency accesses
    m_latencyHist.sample(total_lat);
    m_typeLatencyHist[type]->sample(total_lat);

    // Profile the miss latency for all non-zero demand misses
    if (total_lat != Cycles(0)) {
        m_missLatencyHist.sample(total_lat);
        m_missTypeLatencyHist[type]->sample(total_lat);

        if (mach != MachineType_NUM) {
            m_missMachLatencyHist[mach]->sample(total_lat);
            m_missTypeMachLatencyHist[type][mach]->sample(total_lat);

            if ((issued_time <= initialRequestTime) &&
                (initialRequestTime <= forwardRequestTime) &&
                (forwardRequestTime <= firstResponseTime) &&
                (firstResponseTime <= completion_time)) {

                m_IssueToInitialDelayHist[mach]->sample(
                    initialRequestTime - issued_time);
                m_InitialToForwardDelayHist[mach]->sample(
                    forwardRequestTime - initialRequestTime);
                m_ForwardToFirstResponseDelayHist[mach]->sample(
                    firstResponseTime - forwardRequestTime);
                m_FirstResponseToCompletionDelayHist[mach]->sample(
                    completion_time - firstResponseTime);
            }
        }

    }

    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
             curTick(), m_version, "Coal",
             success ? "Done" : "SC_Failed", "", "",
             printAddress(crequest->getFirstPkt()->getAddr()), total_lat);
}

void
GPUCoalescer::regStats()
{
    RubyPort::regStats();

    // These statistical variables are not for display.
    // The profiler will collate these across different
    // coalescers and display those collated statistics.
    m_outstandReqHist.init(10);
    m_latencyHist.init(10);
    m_missLatencyHist.init(10);

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist.push_back(new Stats::Histogram());
        m_typeLatencyHist[i]->init(10);

        m_missTypeLatencyHist.push_back(new Stats::Histogram());
        m_missTypeLatencyHist[i]->init(10);
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist.push_back(new Stats::Histogram());
        m_missMachLatencyHist[i]->init(10);

        m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
        m_IssueToInitialDelayHist[i]->init(10);

        m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
        m_InitialToForwardDelayHist[i]->init(10);

        m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
        m_ForwardToFirstResponseDelayHist[i]->init(10);

        m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
        m_FirstResponseToCompletionDelayHist[i]->init(10);
    }

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());

        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
            m_missTypeMachLatencyHist[i][j]->init(10);
        }
    }

    // GPU cache stats
    GPU_TCPLdHits
        .name(name() + ".gpu_tcp_ld_hits")
        .desc("loads that hit in the TCP")
        ;
    GPU_TCPLdTransfers
        .name(name() + ".gpu_tcp_ld_transfers")
        .desc("TCP to TCP load transfers")
        ;
    GPU_TCCLdHits
        .name(name() + ".gpu_tcc_ld_hits")
        .desc("loads that hit in the TCC")
        ;
    GPU_LdMiss
        .name(name() + ".gpu_ld_misses")
        .desc("loads that miss in the GPU")
        ;

    GPU_TCPStHits
        .name(name() + ".gpu_tcp_st_hits")
        .desc("stores that hit in the TCP")
        ;
    GPU_TCPStTransfers
        .name(name() + ".gpu_tcp_st_transfers")
        .desc("TCP to TCP store transfers")
        ;
    GPU_TCCStHits
        .name(name() + ".gpu_tcc_st_hits")
        .desc("stores that hit in the TCC")
        ;
    GPU_StMiss
        .name(name() + ".gpu_st_misses")
        .desc("stores that miss in the GPU")
        ;

    // CP cache stats
    CP_TCPLdHits
        .name(name() + ".cp_tcp_ld_hits")
        .desc("loads that hit in the TCP")
        ;
    CP_TCPLdTransfers
        .name(name() + ".cp_tcp_ld_transfers")
        .desc("TCP to TCP load transfers")
        ;
    CP_TCCLdHits
        .name(name() + ".cp_tcc_ld_hits")
        .desc("loads that hit in the TCC")
        ;
    CP_LdMiss
        .name(name() + ".cp_ld_misses")
        .desc("loads that miss in the GPU")
        ;

    CP_TCPStHits
        .name(name() + ".cp_tcp_st_hits")
        .desc("stores that hit in the TCP")
        ;
    CP_TCPStTransfers
        .name(name() + ".cp_tcp_st_transfers")
        .desc("TCP to TCP store transfers")
        ;
    CP_TCCStHits
        .name(name() + ".cp_tcc_st_hits")
        .desc("stores that hit in the TCC")
        ;
    CP_StMiss
        .name(name() + ".cp_st_misses")
        .desc("stores that miss in the GPU")
        ;
}