gem5  v22.1.0.0
tlb_coalescer.cc
1 /*
2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from this
17  * software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include "gpu-compute/tlb_coalescer.hh"
33
34 #include <cstring>
35 
37 #include "arch/x86/page_size.hh"
38 #include "base/logging.hh"
39 #include "debug/GPUTLB.hh"
40 #include "sim/process.hh"
41 
42 namespace gem5
43 {
44 
45 TLBCoalescer::TLBCoalescer(const Params &p)
46  : ClockedObject(p),
47  TLBProbesPerCycle(p.probesPerCycle),
48  coalescingWindow(p.coalescingWindow),
49  disableCoalescing(p.disableCoalescing),
50  probeTLBEvent([this]{ processProbeTLBEvent(); },
51  "Probe the TLB below",
52  false, Event::CPU_Tick_Pri),
53  cleanupEvent([this]{ processCleanupEvent(); },
54  "Cleanup issuedTranslationsTable hashmap",
55  false, Event::Maximum_Pri),
56  stats(this)
57 {
58  // create the response ports based on the number of connected ports
59  for (size_t i = 0; i < p.port_cpu_side_ports_connection_count; ++i) {
60  cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", name(), i),
61  this, i));
62  }
63 
64  // create the request ports based on the number of connected ports
65  for (size_t i = 0; i < p.port_mem_side_ports_connection_count; ++i) {
66  memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", name(), i),
67  this, i));
68  }
69 }
70 
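As an aside, a minimal standalone sketch (not gem5 code) of the per-port naming pattern used by the two loops above, where one port object is created per connected cpu_side_ports / mem_side_ports entry and named "<name>-port<i>". The owner name and connection count below are assumptions.

#include <cstdio>
#include <string>
#include <vector>

int main()
{
    const std::string owner = "tlb_coalescer";   // hypothetical SimObject name
    const int connected_ports = 4;               // hypothetical connection count

    std::vector<std::string> names;
    for (int i = 0; i < connected_ports; ++i) {
        char buf[64];
        // same "%s-port%d" pattern as the csprintf calls in the constructor
        std::snprintf(buf, sizeof(buf), "%s-port%d", owner.c_str(), i);
        names.push_back(buf);
    }

    for (const std::string &n : names)
        std::printf("%s\n", n.c_str());
    return 0;
}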
71 Port &
72 TLBCoalescer::getPort(const std::string &if_name, PortID idx)
73 {
74  if (if_name == "cpu_side_ports") {
75  if (idx >= static_cast<PortID>(cpuSidePort.size())) {
76  panic("TLBCoalescer::getPort: unknown index %d\n", idx);
77  }
78 
79  return *cpuSidePort[idx];
80  } else if (if_name == "mem_side_ports") {
81  if (idx >= static_cast<PortID>(memSidePort.size())) {
82  panic("TLBCoalescer::getPort: unknown index %d\n", idx);
83  }
84 
85  return *memSidePort[idx];
86  } else {
87  panic("TLBCoalescer::getPort: unknown port %s\n", if_name);
88  }
89 }
90 
91 /*
92  * This method returns true if the <incoming_pkt>
93  * can be coalesced with <coalesced_pkt> and false otherwise.
94  * A given set of rules is checked.
95  * The rules can potentially be modified based on the TLB level.
96  */
97 bool
98 TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt)
99 {
100  if (disableCoalescing)
101  return false;
102 
103  GpuTranslationState *incoming_state =
104  safe_cast<GpuTranslationState*>(incoming_pkt->senderState);
105 
106  GpuTranslationState *coalesced_state =
107  safe_cast<GpuTranslationState*>(coalesced_pkt->senderState);
108 
109  // Rule 1: Coalesce requests only if they
110  // fall within the same virtual page
111  Addr incoming_virt_page_addr = roundDown(incoming_pkt->req->getVaddr(),
112  X86ISA::PageBytes);
113
114  Addr coalesced_virt_page_addr = roundDown(coalesced_pkt->req->getVaddr(),
115  X86ISA::PageBytes);
116
117  if (incoming_virt_page_addr != coalesced_virt_page_addr)
118  return false;
119 
120  // Rule 2: Coalesce requests only if they
121  // share a TLB Mode, i.e. they are both read
122  // or write requests.
123  BaseMMU::Mode incoming_mode = incoming_state->tlbMode;
124  BaseMMU::Mode coalesced_mode = coalesced_state->tlbMode;
125 
126  if (incoming_mode != coalesced_mode)
127  return false;
128 
129  // when we can coalesce a packet update the reqCnt
130  // that is the number of packets represented by
131  // this coalesced packet
132  if (!incoming_state->isPrefetch)
133  coalesced_state->reqCnt.back() += incoming_state->reqCnt.back();
134 
135  return true;
136 }
137 
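For illustration, a minimal standalone sketch (not part of the gem5 source) of Rule 1, the same-virtual-page check performed by canCoalesce above: two requests are candidates only if their virtual addresses round down to the same page base. The 4 KiB page size and the sample addresses are assumptions.

#include <cstdint>
#include <cstdio>

// Round an address down to its page boundary (power-of-two page sizes only),
// mirroring roundDown(vaddr, X86ISA::PageBytes) in the listing above.
static uint64_t pageBase(uint64_t vaddr, uint64_t pageBytes)
{
    return vaddr & ~(pageBytes - 1);
}

int main()
{
    const uint64_t pageBytes = 4096;         // assumed 4 KiB page size
    const uint64_t a = 0x7f0000123ULL;       // hypothetical vaddr 1
    const uint64_t b = 0x7f0000fffULL;       // hypothetical vaddr 2, same page as a
    const uint64_t c = 0x7f0001000ULL;       // hypothetical vaddr 3, next page

    std::printf("a,b same page? %d\n", pageBase(a, pageBytes) == pageBase(b, pageBytes));
    std::printf("a,c same page? %d\n", pageBase(a, pageBytes) == pageBase(c, pageBytes));
    return 0;
}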
138 /*
139  * We need to update the physical addresses of all the translation requests
140  * that were coalesced into the one that just returned.
141  */
142 void
143 TLBCoalescer::updatePhysAddresses(PacketPtr pkt)
144 {
145  Addr virt_page_addr = roundDown(pkt->req->getVaddr(), X86ISA::PageBytes);
146 
147  DPRINTF(GPUTLB, "Update phys. addr. for %d coalesced reqs for page %#x\n",
148  issuedTranslationsTable[virt_page_addr].size(), virt_page_addr);
149 
150  GpuTranslationState *sender_state =
151  safe_cast<GpuTranslationState*>(pkt->senderState);
152 
153  X86ISA::TlbEntry *tlb_entry =
154  safe_cast<X86ISA::TlbEntry *>(sender_state->tlbEntry);
155  assert(tlb_entry);
156  Addr first_entry_vaddr = tlb_entry->vaddr;
157  Addr first_entry_paddr = tlb_entry->paddr;
158  int page_size = tlb_entry->size();
159  bool uncacheable = tlb_entry->uncacheable;
160  int first_hit_level = sender_state->hitLevel;
161 
162  // Get the physical page address of the translated request
163  // Using the page_size specified in the TLBEntry allows us
164  // to support different page sizes.
165  Addr phys_page_paddr = pkt->req->getPaddr();
166  phys_page_paddr &= ~(page_size - 1);
167 
168  for (int i = 0; i < issuedTranslationsTable[virt_page_addr].size(); ++i) {
169  PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i];
170  GpuTranslationState *sender_state =
171  safe_cast<GpuTranslationState*>(
172  local_pkt->senderState);
173 
174  // we are sending the packet back, so pop the reqCnt associated
175  // with this level in the TLB hierarchy
176  if (!sender_state->isPrefetch)
177  sender_state->reqCnt.pop_back();
178 
179  /*
180  * Only the first packet from this coalesced request has been
181  * translated. Grab the translated phys. page addr and update the
182  * physical addresses of the remaining packets with the appropriate
183  * page offsets.
184  */
185  if (i) {
186  Addr paddr = phys_page_paddr;
187  paddr |= (local_pkt->req->getVaddr() & (page_size - 1));
188  local_pkt->req->setPaddr(paddr);
189 
190  if (uncacheable)
191  local_pkt->req->setFlags(Request::UNCACHEABLE);
192 
193  // update senderState->tlbEntry, so we can insert
194  // the correct TLBEntry in the TLBs above.
195  auto p = sender_state->tc->getProcessPtr();
196  sender_state->tlbEntry =
197  new X86ISA::TlbEntry(p->pid(), first_entry_vaddr,
198  first_entry_paddr, false, false);
199 
200  // update the hitLevel for all uncoalesced reqs
201  // so that each packet knows where it hit
202  // (used for statistics in the CUs)
203  sender_state->hitLevel = first_hit_level;
204  }
205 
206  ResponsePort *return_port = sender_state->ports.back();
207  sender_state->ports.pop_back();
208 
209  // Translation is done - Convert to a response pkt if necessary and
210  // send the translation back
211  if (local_pkt->isRequest()) {
212  local_pkt->makeTimingResponse();
213  }
214 
215  return_port->sendTimingResp(local_pkt);
216  }
217 
218  // schedule clean up for end of this cycle
219  // This is a maximum priority event and must be on
220  // the same cycle as GPUTLB cleanup event to prevent
221  // race conditions with an IssueProbeEvent caused by
222  // MemSidePort::recvReqRetry
223  cleanupQueue.push(virt_page_addr);
224 
225  if (!cleanupEvent.scheduled())
226  schedule(cleanupEvent, curTick());
227 }
228 
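For illustration, a standalone sketch (not gem5 code) of the address reconstruction done in the loop above for every coalesced packet after the first: the physical page frame obtained from the one translated packet is combined with each remaining packet's own page offset. All values below are assumed.

#include <cstdint>
#include <cstdio>

int main()
{
    const uint64_t page_size        = 4096;                   // assumed page size, power of two
    const uint64_t translated_paddr = 0x19a3465d0ULL;          // hypothetical paddr of the first packet

    // Physical page frame of the translated request (phys_page_paddr above).
    const uint64_t phys_page_paddr = translated_paddr & ~(page_size - 1);

    // A second packet coalesced with the first: same virtual page, different offset.
    const uint64_t other_vaddr = 0x7ffd10003a40ULL;

    // Frame bits come from the translation, offset bits from the packet's vaddr.
    const uint64_t other_paddr = phys_page_paddr | (other_vaddr & (page_size - 1));

    std::printf("frame  %#llx\n", (unsigned long long)phys_page_paddr);
    std::printf("offset %#llx\n", (unsigned long long)(other_vaddr & (page_size - 1)));
    std::printf("paddr  %#llx\n", (unsigned long long)other_paddr);
    return 0;
}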
229 // Receive translation requests, create a coalesced request,
230 // and send them to the TLB (TLBProbesPerCycle)
231 bool
232 TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
233 {
234  // first packet of a coalesced request
235  PacketPtr first_packet = nullptr;
236  // true if we are able to do coalescing
237  bool didCoalesce = false;
238  // number of coalesced reqs for a given window
239  int coalescedReq_cnt = 0;
240 
241  GpuTranslationState *sender_state =
242  safe_cast<GpuTranslationState*>(pkt->senderState);
243 
244  // push back the port to remember the path back
245  sender_state->ports.push_back(this);
246 
247  bool update_stats = !sender_state->isPrefetch;
248 
249  if (update_stats) {
250  // if reqCnt is empty then this packet does not represent
251  // multiple uncoalesced reqs(pkts) but just a single pkt.
252  // If it does though then the reqCnt for each level in the
253  // hierarchy accumulates the total number of reqs this packet
254  // represents
255  int req_cnt = 1;
256 
257  if (!sender_state->reqCnt.empty())
258  req_cnt = sender_state->reqCnt.back();
259 
260  sender_state->reqCnt.push_back(req_cnt);
261 
262  // update statistics
263  coalescer->stats.uncoalescedAccesses++;
264  req_cnt = sender_state->reqCnt.back();
265  DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt);
266  coalescer->stats.queuingCycles -= (curTick() * req_cnt);
267  coalescer->stats.localqueuingCycles -= curTick();
268 }
269 
270  // FIXME if you want to coalesce not based on the issueTime
271  // of the packets (i.e., from the compute unit's perspective)
272  // but based on when they reached this coalescer then
273  // remove the following if statement and use curTick() or
274  // coalescingWindow for the tick_index.
275  if (!sender_state->issueTime)
276  sender_state->issueTime = curTick();
277 
278  // The tick index is used as a key to the coalescerFIFO hashmap.
279  // It is shared by all candidates that fall within the
280  // given coalescingWindow.
281  int64_t tick_index = sender_state->issueTime / coalescer->coalescingWindow;
282 
283  if (coalescer->coalescerFIFO.count(tick_index)) {
284  coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size();
285  }
286 
287  // see if we can coalesce the incoming pkt with another
288  // coalesced request with the same tick_index
289  for (int i = 0; i < coalescedReq_cnt; ++i) {
290  first_packet = coalescer->coalescerFIFO[tick_index][i][0];
291 
292  if (coalescer->canCoalesce(pkt, first_packet)) {
293  coalescer->coalescerFIFO[tick_index][i].push_back(pkt);
294 
295  DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n",
296  i, tick_index,
297  coalescer->coalescerFIFO[tick_index][i].size());
298 
299  didCoalesce = true;
300  break;
301  }
302  }
303 
304  // if this is the first request for this tick_index
305  // or we did not manage to coalesce, update stats
306  // and make necessary allocations.
307  if (!coalescedReq_cnt || !didCoalesce) {
308  if (update_stats)
309  coalescer->stats.coalescedAccesses++;
310
311  std::vector<PacketPtr> new_array;
312  new_array.push_back(pkt);
313  coalescer->coalescerFIFO[tick_index].push_back(new_array);
314 
315  DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after "
316  "push\n", tick_index,
317  coalescer->coalescerFIFO[tick_index].size());
318  }
319 
320  //schedule probeTLBEvent next cycle to send the
321  //coalesced requests to the TLB
322  if (!coalescer->probeTLBEvent.scheduled()) {
323  coalescer->schedule(coalescer->probeTLBEvent,
324  curTick() + coalescer->clockPeriod());
325  }
326 
327  return true;
328 }
329 
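For illustration, a standalone sketch (not gem5 code) of the tick_index bucketing used above: requests whose issueTime falls within the same coalescingWindow map to the same coalescerFIFO key and are therefore candidates for coalescing with one another. The window size and issue times below are assumptions.

#include <cstdint>
#include <cstdio>

int main()
{
    // Hypothetical coalescing window in ticks; the real value comes from the
    // coalescingWindow parameter.
    const int64_t coalescingWindow = 1000;

    // Hypothetical issue times of four translation requests.
    const int64_t issueTimes[] = {1500, 1999, 2000, 2999};

    // Requests sharing a tick_index land in the same FIFO bucket.
    for (int64_t t : issueTimes) {
        std::printf("issueTime %lld -> tick_index %lld\n",
                    (long long)t, (long long)(t / coalescingWindow));
    }
    return 0;
}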
330 void
331 TLBCoalescer::CpuSidePort::recvReqRetry()
332 {
333  panic("recvReqRetry called");
334 }
335 
336 void
337 TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt)
338 {
339 
340  GpuTranslationState *sender_state =
341  safe_cast<GpuTranslationState*>(pkt->senderState);
342 
343  bool update_stats = !sender_state->isPrefetch;
344 
345  if (update_stats)
346  coalescer->stats.uncoalescedAccesses++;
347 
348  // If there is a pending timing request for this virtual address
349  // print a warning message. This is a temporary caveat of
350  // the current simulator where atomic and timing requests can
351  // coexist. FIXME remove this check/warning in the future.
352  Addr virt_page_addr = roundDown(pkt->req->getVaddr(), X86ISA::PageBytes);
353  int map_count = coalescer->issuedTranslationsTable.count(virt_page_addr);
354 
355  if (map_count) {
356  DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing "
357  "req. pending\n", virt_page_addr);
358  }
359 
360  coalescer->memSidePort[0]->sendFunctional(pkt);
361 }
362 
363 AddrRangeList
364 TLBCoalescer::CpuSidePort::getAddrRanges() const
365 {
366  // currently not checked by the requestor
367  AddrRangeList ranges;
368 
369  return ranges;
370 }
371 
372 bool
373 TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt)
374 {
375  // a translation completed and returned
376  coalescer->updatePhysAddresses(pkt);
377 
378  return true;
379 }
380 
381 void
382 TLBCoalescer::MemSidePort::recvReqRetry()
383 {
384  // we've received a retry. Schedule a probeTLBEvent
385  if (!coalescer->probeTLBEvent.scheduled())
386  coalescer->schedule(coalescer->probeTLBEvent,
387  curTick() + coalescer->clockPeriod());
388 }
389 
390 void
391 TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt)
392 {
393  fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n");
394 }
395 
396 /*
397  * Here we scan the coalescer FIFO and issue the max
398  * number of permitted probes to the TLB below. We
399  * permit bypassing of coalesced requests for the same
400  * tick_index.
401  *
402  * We do not access the next tick_index unless we've
403  * drained the previous one. The coalesced requests
404  * that are successfully sent are moved to the
405  * issuedTranslationsTable table (the table which keeps
406  * track of the outstanding reqs)
407  */
408 void
409 TLBCoalescer::processProbeTLBEvent()
410 {
411  // number of TLB probes sent so far
412  int sent_probes = 0;
413  // rejected denotes a blocking event
414  bool rejected = false;
415 
416  // It is set to true either when the recvTiming of the TLB below
417  // returns false or when there is another outstanding request for the
418  // same virt. page.
419 
420  DPRINTF(GPUTLB, "triggered TLBCoalescer %s\n", __func__);
421 
422  for (auto iter = coalescerFIFO.begin();
423  iter != coalescerFIFO.end() && !rejected; ) {
424  int coalescedReq_cnt = iter->second.size();
425  int i = 0;
426  int vector_index = 0;
427 
428  DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n",
429  coalescedReq_cnt, iter->first);
430 
431  while (i < coalescedReq_cnt) {
432  ++i;
433  PacketPtr first_packet = iter->second[vector_index][0];
434 
435  // compute virtual page address for this request
436  Addr virt_page_addr = roundDown(first_packet->req->getVaddr(),
437  X86ISA::PageBytes);
438
439  // is there another outstanding request for the same page addr?
440  int pending_reqs =
441  issuedTranslationsTable.count(virt_page_addr);
442 
443  if (pending_reqs) {
444  DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for "
445  "page %#x\n", virt_page_addr);
446 
447  ++vector_index;
448  rejected = true;
449 
450  continue;
451  }
452 
453  // send the coalesced request for virt_page_addr
454  if (!memSidePort[0]->sendTimingReq(first_packet)) {
455  DPRINTF(GPUTLB, "Failed to send TLB request for page %#x\n",
456  virt_page_addr);
457 
458  // No need for a retries queue since we are already buffering
459  // the coalesced request in coalescerFIFO.
460  rejected = true;
461  ++vector_index;
462  } else {
463  GpuTranslationState *tmp_sender_state =
464  safe_cast<GpuTranslationState*>
465  (first_packet->senderState);
466 
467  bool update_stats = !tmp_sender_state->isPrefetch;
468 
469  if (update_stats) {
470  // req_cnt is total number of packets represented
471  // by the one we just sent counting all the way from
472  // the top of TLB hierarchy (i.e., from the CU)
473  int req_cnt = tmp_sender_state->reqCnt.back();
474  stats.queuingCycles += (curTick() * req_cnt);
475 
476  DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n",
477  name(), req_cnt);
478 
479  // pkt_cnt is number of packets we coalesced into the one
480  // we just sent but only at this coalescer level
481  int pkt_cnt = iter->second[vector_index].size();
482  stats.localqueuingCycles += (curTick() * pkt_cnt);
483  }
484 
485  DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x\n",
486  virt_page_addr);
487 
488  //copy coalescedReq to issuedTranslationsTable
489  issuedTranslationsTable[virt_page_addr]
490  = iter->second[vector_index];
491 
492  //erase the entry of this coalesced req
493  iter->second.erase(iter->second.begin() + vector_index);
494 
495  if (iter->second.empty())
496  assert(i == coalescedReq_cnt);
497 
498  sent_probes++;
499  if (sent_probes == TLBProbesPerCycle)
500  return;
501  }
502  }
503 
504  //if there are no more coalesced reqs for this tick_index
505  //erase the hash_map with the first iterator
506  if (iter->second.empty()) {
507  coalescerFIFO.erase(iter++);
508  } else {
509  ++iter;
510  }
511  }
512 }
513 
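For illustration, a simplified standalone model (not gem5 code) of the issue policy above: at most TLBProbesPerCycle probes are sent per cycle, and a page that already has an outstanding translation in issuedTranslationsTable cannot be probed again. The sketch deliberately omits the per-bucket draining order and the rejected/retry handling; all values are assumed.

#include <cstdint>
#include <cstdio>
#include <unordered_set>
#include <vector>

int main()
{
    // Assumed per-cycle probe budget; the real value is the probesPerCycle parameter.
    const int probesPerCycle = 2;

    // Hypothetical virtual page addresses of waiting coalesced requests, in FIFO order.
    std::vector<uint64_t> waiting = {0x1000, 0x2000, 0x1000, 0x3000, 0x4000};

    // Pages that already have a probe outstanding (stand-in for the keys of
    // issuedTranslationsTable).
    std::unordered_set<uint64_t> outstanding = {0x2000};

    int sent = 0;
    for (uint64_t page : waiting) {
        if (sent == probesPerCycle)
            break;                       // per-cycle probe budget exhausted
        if (outstanding.count(page)) {
            std::printf("skip  %#llx (translation already outstanding)\n",
                        (unsigned long long)page);
            continue;                    // cannot issue a second probe for this page
        }
        outstanding.insert(page);        // becomes an issuedTranslationsTable entry
        std::printf("probe %#llx\n", (unsigned long long)page);
        ++sent;
    }
    return 0;
}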
514 void
515 TLBCoalescer::processCleanupEvent()
516 {
517  while (!cleanupQueue.empty()) {
518  Addr cleanup_addr = cleanupQueue.front();
519  cleanupQueue.pop();
520  issuedTranslationsTable.erase(cleanup_addr);
521 
522  DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n",
523  cleanup_addr);
524  }
525 }
526 
527 TLBCoalescer::TLBCoalescerStats::TLBCoalescerStats(statistics::Group *parent)
528  : statistics::Group(parent),
529  ADD_STAT(uncoalescedAccesses, "Number of uncoalesced TLB accesses"),
530  ADD_STAT(coalescedAccesses, "Number of coalesced TLB accesses"),
531  ADD_STAT(queuingCycles, "Number of cycles spent in queue"),
532  ADD_STAT(localqueuingCycles,
533  "Number of cycles spent in queue for all incoming reqs"),
534  ADD_STAT(localLatency, "Avg. latency over all incoming pkts")
535 {
536  localLatency = localqueuingCycles / uncoalescedAccesses;
537 }
538 
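For illustration, a standalone sketch (not gem5 code) of how the queueing statistics net out: each incoming packet subtracts the current tick in recvTimingReq, the issue path adds the current tick once per packet the coalesced probe represents in processProbeTLBEvent, so the accumulated sum equals the total time the packets spent queued, and localLatency is that sum divided by uncoalescedAccesses. The tick values below are assumptions.

#include <cstdint>
#include <cstdio>

int main()
{
    // Assumed ticks for three uncoalesced packets that end up in one probe.
    const int64_t arrive[] = {100, 120, 140};
    const int64_t issue_tick = 200;          // tick at which the probe is sent
    const int pkt_cnt = 3;                   // packets represented by the probe

    int64_t localqueuingCycles = 0;
    int64_t uncoalescedAccesses = 0;

    // Arrival side (mirrors recvTimingReq): subtract the current tick once
    // per incoming packet.
    for (int64_t t : arrive) {
        localqueuingCycles -= t;
        ++uncoalescedAccesses;
    }

    // Issue side (mirrors processProbeTLBEvent): add the current tick once
    // per packet the coalesced probe represents.
    localqueuingCycles += issue_tick * pkt_cnt;

    // Net result: total ticks spent queued (240 here) and the average (80).
    std::printf("localqueuingCycles = %lld\n", (long long)localqueuingCycles);
    std::printf("localLatency       = %lld\n",
                (long long)(localqueuingCycles / uncoalescedAccesses));
    return 0;
}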
539 } // namespace gem5