gem5 v24.0.0.0
tlb_coalescer.cc
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/tlb_coalescer.hh"

#include <cstring>

#include "arch/amdgpu/common/gpu_translation_state.hh"
#include "arch/x86/page_size.hh"
#include "base/logging.hh"
#include "debug/GPUTLB.hh"
#include "sim/process.hh"

namespace gem5
{

TLBCoalescer::TLBCoalescer(const Params &p)
    : ClockedObject(p),
      TLBProbesPerCycle(p.probesPerCycle),
      coalescingWindow(p.coalescingWindow),
      disableCoalescing(p.disableCoalescing),
      probeTLBEvent([this]{ processProbeTLBEvent(); },
                    "Probe the TLB below",
                    false, Event::CPU_Tick_Pri),
      cleanupEvent([this]{ processCleanupEvent(); },
                   "Cleanup issuedTranslationsTable hashmap",
                   false, Event::Maximum_Pri),
      stats(this)
{
    // create the response ports based on the number of connected ports
    for (size_t i = 0; i < p.port_cpu_side_ports_connection_count; ++i) {
        cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", name(), i),
                                              this, i));
    }

    // create the request ports based on the number of connected ports
    for (size_t i = 0; i < p.port_mem_side_ports_connection_count; ++i) {
        memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", name(), i),
                                              this, i));
    }
}

Port &
TLBCoalescer::getPort(const std::string &if_name, PortID idx)
{
    if (if_name == "cpu_side_ports") {
        if (idx >= static_cast<PortID>(cpuSidePort.size())) {
            panic("TLBCoalescer::getPort: unknown index %d\n", idx);
        }

        return *cpuSidePort[idx];
    } else if (if_name == "mem_side_ports") {
        if (idx >= static_cast<PortID>(memSidePort.size())) {
            panic("TLBCoalescer::getPort: unknown index %d\n", idx);
        }

        return *memSidePort[idx];
    } else {
        panic("TLBCoalescer::getPort: unknown port %s\n", if_name);
    }
}

/*
 * This method returns true if the <incoming_pkt>
 * can be coalesced with <coalesced_pkt> and false otherwise.
 * A fixed set of rules is checked; the rules can potentially
 * be modified based on the TLB level.
 */
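/*
 * For illustration, assume the default 4 KB x86 page
 * (X86ISA::PageBytes == 0x1000) and hypothetical addresses: two read
 * requests with vaddrs 0x7f001080 and 0x7f001ff8 both round down to
 * virtual page 0x7f001000 and share BaseMMU::Read, so they can
 * coalesce; a write to 0x7f001100 or a read to 0x7f002000 could not.
 */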
bool
TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt)
{
    if (disableCoalescing)
        return false;

    GpuTranslationState *incoming_state =
        safe_cast<GpuTranslationState*>(incoming_pkt->senderState);

    GpuTranslationState *coalesced_state =
        safe_cast<GpuTranslationState*>(coalesced_pkt->senderState);

    // Rule 1: Coalesce requests only if they
    // fall within the same virtual page
    Addr incoming_virt_page_addr = roundDown(incoming_pkt->req->getVaddr(),
                                             X86ISA::PageBytes);

    Addr coalesced_virt_page_addr = roundDown(coalesced_pkt->req->getVaddr(),
                                              X86ISA::PageBytes);

    if (incoming_virt_page_addr != coalesced_virt_page_addr)
        return false;

    // Rule 2: Coalesce requests only if they
    // share a TLB Mode, i.e., they are both read
    // or write requests.
    BaseMMU::Mode incoming_mode = incoming_state->tlbMode;
    BaseMMU::Mode coalesced_mode = coalesced_state->tlbMode;

    if (incoming_mode != coalesced_mode)
        return false;

    // When we can coalesce a packet, update the reqCnt,
    // that is, the number of packets represented by
    // this coalesced packet.
    if (!incoming_state->isPrefetch)
        coalesced_state->reqCnt.back() += incoming_state->reqCnt.back();

    return true;
}

/*
 * We need to update the physical addresses of all the translation requests
 * that were coalesced into the one that just returned.
 */
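/*
 * Sketch of the address math below, for a hypothetical 4 KB translation
 * of virtual page 0x7f001000 to physical page 0x2000: a coalesced packet
 * with vaddr 0x7f0013a8 is given
 *   paddr = 0x2000 | (0x7f0013a8 & 0xfff) = 0x23a8.
 * Larger pages work the same way, since the offset mask is derived from
 * the TlbEntry's size().
 */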
void
TLBCoalescer::updatePhysAddresses(PacketPtr pkt)
{
    Addr virt_page_addr = roundDown(pkt->req->getVaddr(), X86ISA::PageBytes);

    DPRINTF(GPUTLB, "Update phys. addr. for %d coalesced reqs for page %#x\n",
            issuedTranslationsTable[virt_page_addr].size(), virt_page_addr);

    GpuTranslationState *sender_state =
        safe_cast<GpuTranslationState*>(pkt->senderState);

    X86ISA::TlbEntry *tlb_entry =
        safe_cast<X86ISA::TlbEntry *>(sender_state->tlbEntry);
    assert(tlb_entry);
    Addr first_entry_vaddr = tlb_entry->vaddr;
    Addr first_entry_paddr = tlb_entry->paddr;
    int page_size = tlb_entry->size();
    bool uncacheable = tlb_entry->uncacheable;
    int first_hit_level = sender_state->hitLevel;

    // Get the physical page address of the translated request.
    // Using the page_size specified in the TLBEntry allows us
    // to support different page sizes.
    Addr phys_page_paddr = pkt->req->getPaddr();
    phys_page_paddr &= ~(page_size - 1);

    for (int i = 0; i < issuedTranslationsTable[virt_page_addr].size(); ++i) {
        PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i];
        GpuTranslationState *sender_state =
            safe_cast<GpuTranslationState*>(
                    local_pkt->senderState);

        // we are sending the packet back, so pop the reqCnt associated
        // with this level in the TLB hierarchy
        if (!sender_state->isPrefetch)
            sender_state->reqCnt.pop_back();

        /*
         * Only the first packet from this coalesced request has been
         * translated. Grab the translated phys. page addr and update the
         * physical addresses of the remaining packets with the appropriate
         * page offsets.
         */
        if (i) {
            Addr paddr = phys_page_paddr;
            paddr |= (local_pkt->req->getVaddr() & (page_size - 1));
            local_pkt->req->setPaddr(paddr);

            if (uncacheable)
                local_pkt->req->setFlags(Request::UNCACHEABLE);

            // update senderState->tlbEntry, so we can insert
            // the correct TlbEntry in the TLBs above.
            auto p = sender_state->tc->getProcessPtr();
            sender_state->tlbEntry =
                new X86ISA::TlbEntry(p->pid(), first_entry_vaddr,
                                     first_entry_paddr, false, false);

            // update the hitLevel for all uncoalesced reqs
            // so that each packet knows where it hit
            // (used for statistics in the CUs)
            sender_state->hitLevel = first_hit_level;
        }

        ResponsePort *return_port = sender_state->ports.back();
        sender_state->ports.pop_back();

        // Translation is done - Convert to a response pkt if necessary and
        // send the translation back
        if (local_pkt->isRequest()) {
            local_pkt->makeTimingResponse();
        }

        return_port->sendTimingResp(local_pkt);
    }

    // schedule clean up for end of this cycle
    // This is a maximum priority event and must be on
    // the same cycle as the GPUTLB cleanup event to prevent
    // race conditions with an IssueProbeEvent caused by
    // MemSidePort::recvReqRetry
    cleanupQueue.push(virt_page_addr);

    if (!cleanupEvent.scheduled())
        schedule(cleanupEvent, curTick());
}

// Receive translation requests, create a coalesced request,
// and send them to the TLB (TLBProbesPerCycle)
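// Rough flow, as implemented below: remember the return port in the
// packet's GpuTranslationState, bucket the packet into coalescerFIFO by
// tick_index (issueTime / coalescingWindow), try to append it to an
// existing coalesced request via canCoalesce(), and make sure
// probeTLBEvent is scheduled for the next cycle to issue the probes.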
bool
TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
{
    // first packet of a coalesced request
    PacketPtr first_packet = nullptr;
    // true if we are able to do coalescing
    bool didCoalesce = false;
    // number of coalesced reqs for a given window
    int coalescedReq_cnt = 0;

    GpuTranslationState *sender_state =
        safe_cast<GpuTranslationState*>(pkt->senderState);

    // push back the port to remember the path back
    sender_state->ports.push_back(this);

    bool update_stats = !sender_state->isPrefetch;

    if (update_stats) {
        // if reqCnt is empty then this packet does not represent
        // multiple uncoalesced reqs (pkts) but just a single pkt.
        // If it does, though, then the reqCnt for each level in the
        // hierarchy accumulates the total number of reqs this packet
        // represents
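        // For instance, a hypothetical packet that already stands for
        // four uncoalesced requests at the CU arrives with
        // reqCnt.back() == 4, so 4 is pushed again for this level.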
        int req_cnt = 1;

        if (!sender_state->reqCnt.empty())
            req_cnt = sender_state->reqCnt.back();

        sender_state->reqCnt.push_back(req_cnt);

        // update statistics
        coalescer->stats.uncoalescedAccesses++;
        req_cnt = sender_state->reqCnt.back();
        DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt);
        coalescer->stats.queuingCycles -= (curTick() * req_cnt);
        coalescer->stats.localqueuingCycles -= curTick();
    }

    // FIXME if you want to coalesce not based on the issueTime
    // of the packets (i.e., from the compute unit's perspective)
    // but based on when they reached this coalescer then
    // remove the following if statement and use curTick() or
    // coalescingWindow for the tick_index.
    if (!sender_state->issueTime)
        sender_state->issueTime = curTick();

    // The tick index is used as a key to the coalescerFIFO hashmap.
    // It is shared by all candidates that fall within the
    // given coalescingWindow.
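    // For example, with a hypothetical coalescingWindow of 1000 ticks,
    // packets with issueTime 41200 and 41999 both map to tick_index 41
    // and are candidates for the same coalesced request, while a packet
    // issued at tick 42000 starts a new bucket.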
    int64_t tick_index = sender_state->issueTime / coalescer->coalescingWindow;

    if (coalescer->coalescerFIFO.count(tick_index)) {
        coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size();
    }

    // see if we can coalesce the incoming pkt with another
    // coalesced request with the same tick_index
    for (int i = 0; i < coalescedReq_cnt; ++i) {
        first_packet = coalescer->coalescerFIFO[tick_index][i][0];

        if (coalescer->canCoalesce(pkt, first_packet)) {
            coalescer->coalescerFIFO[tick_index][i].push_back(pkt);

            DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n",
                    i, tick_index,
                    coalescer->coalescerFIFO[tick_index][i].size());

            didCoalesce = true;
            break;
        }
    }

    // if this is the first request for this tick_index
    // or we did not manage to coalesce, update stats
    // and make the necessary allocations.
    if (!coalescedReq_cnt || !didCoalesce) {
        if (update_stats)
            coalescer->stats.coalescedAccesses++;

        std::vector<PacketPtr> new_array;
        new_array.push_back(pkt);
        coalescer->coalescerFIFO[tick_index].push_back(new_array);

        DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after "
                "push\n", tick_index,
                coalescer->coalescerFIFO[tick_index].size());
    }

    // schedule probeTLBEvent next cycle to send the
    // coalesced requests to the TLB
    if (!coalescer->probeTLBEvent.scheduled()) {
        coalescer->schedule(coalescer->probeTLBEvent,
                curTick() + coalescer->clockPeriod());
    }

    return true;
}

void
TLBCoalescer::CpuSidePort::recvReqRetry()
{
    panic("recvReqRetry called");
}

void
TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt)
{

    GpuTranslationState *sender_state =
        safe_cast<GpuTranslationState*>(pkt->senderState);

    bool update_stats = !sender_state->isPrefetch;

    if (update_stats)
        coalescer->stats.uncoalescedAccesses++;

    // If there is a pending timing request for this virtual address
    // print a warning message. This is a temporary caveat of
    // the current simulator where atomic and timing requests can
    // coexist. FIXME remove this check/warning in the future.
    Addr virt_page_addr = roundDown(pkt->req->getVaddr(), X86ISA::PageBytes);
    int map_count = coalescer->issuedTranslationsTable.count(virt_page_addr);

    if (map_count) {
        DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing "
                "req. pending\n", virt_page_addr);
    }

    coalescer->memSidePort[0]->sendFunctional(pkt);
}

AddrRangeList
TLBCoalescer::CpuSidePort::getAddrRanges() const
{
    // currently not checked by the requestor
    AddrRangeList ranges;

    return ranges;
}

bool
TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt)
{
    // a translation completed and returned
    coalescer->updatePhysAddresses(pkt);

    return true;
}

void
TLBCoalescer::MemSidePort::recvReqRetry()
{
    // we've received a retry. Schedule a probeTLBEvent
    if (!coalescer->probeTLBEvent.scheduled())
        coalescer->schedule(coalescer->probeTLBEvent,
                curTick() + coalescer->clockPeriod());
}

void
TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt)
{
    fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n");
}

/*
 * Here we scan the coalescer FIFO and issue the max
 * number of permitted probes to the TLB below. We
 * permit bypassing of coalesced requests for the same
 * tick_index.
 *
 * We do not access the next tick_index unless we've
 * drained the previous one. The coalesced requests
 * that are successfully sent are moved to
 * issuedTranslationsTable (the table which keeps
 * track of the outstanding reqs).
 */
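/*
 * Sketch of the bookkeeping involved: coalescerFIFO maps a tick_index
 * to a vector of coalesced requests, each itself a vector<PacketPtr>
 * whose first element is the packet actually probed. Once a coalesced
 * request is sent, it moves to issuedTranslationsTable, keyed by its
 * virtual page address, until updatePhysAddresses() responds to every
 * packet in it and the cleanup event erases the entry.
 */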
void
TLBCoalescer::processProbeTLBEvent()
{
    // number of TLB probes sent so far
    int sent_probes = 0;
    // rejected denotes a blocking event
    bool rejected = false;

    // It is set to true either when the recvTiming of the TLB below
    // returns false or when there is another outstanding request for the
    // same virt. page.

    DPRINTF(GPUTLB, "triggered TLBCoalescer %s\n", __func__);

    for (auto iter = coalescerFIFO.begin();
         iter != coalescerFIFO.end() && !rejected; ) {
        int coalescedReq_cnt = iter->second.size();
        int i = 0;
        int vector_index = 0;

        DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n",
                coalescedReq_cnt, iter->first);

        while (i < coalescedReq_cnt) {
            ++i;
            PacketPtr first_packet = iter->second[vector_index][0];

            // compute virtual page address for this request
            Addr virt_page_addr = roundDown(first_packet->req->getVaddr(),
                                            X86ISA::PageBytes);

            // is there another outstanding request for the same page addr?
            int pending_reqs =
                issuedTranslationsTable.count(virt_page_addr);

            if (pending_reqs) {
                DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for "
                        "page %#x\n", virt_page_addr);

                ++vector_index;
                rejected = true;

                continue;
            }

            // send the coalesced request for virt_page_addr
            if (!memSidePort[0]->sendTimingReq(first_packet)) {
                DPRINTF(GPUTLB, "Failed to send TLB request for page %#x\n",
                        virt_page_addr);

                // No need for a retries queue since we are already buffering
                // the coalesced request in coalescerFIFO.
                rejected = true;
                ++vector_index;
            } else {
                GpuTranslationState *tmp_sender_state =
                    safe_cast<GpuTranslationState*>
                    (first_packet->senderState);

                bool update_stats = !tmp_sender_state->isPrefetch;

                if (update_stats) {
                    // req_cnt is the total number of packets represented
                    // by the one we just sent, counting all the way from
                    // the top of the TLB hierarchy (i.e., from the CU)
                    int req_cnt = tmp_sender_state->reqCnt.back();
                    stats.queuingCycles += (curTick() * req_cnt);

                    DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n",
                            name(), req_cnt);

                    // pkt_cnt is the number of packets we coalesced into
                    // the one we just sent, but only at this coalescer level
                    int pkt_cnt = iter->second[vector_index].size();
                    stats.localqueuingCycles += (curTick() * pkt_cnt);
                }

                DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x\n",
                        virt_page_addr);

                // copy coalescedReq to issuedTranslationsTable
                issuedTranslationsTable[virt_page_addr]
                    = iter->second[vector_index];

                // erase the entry of this coalesced req
                iter->second.erase(iter->second.begin() + vector_index);

                if (iter->second.empty())
                    assert(i == coalescedReq_cnt);

                sent_probes++;
                if (sent_probes == TLBProbesPerCycle)
                    return;
            }
        }

        // if there are no more coalesced reqs for this tick_index
        // erase the hashmap entry using the current iterator
        if (iter->second.empty()) {
            coalescerFIFO.erase(iter++);
        } else {
            ++iter;
        }
    }
}

void
TLBCoalescer::processCleanupEvent()
{
    while (!cleanupQueue.empty()) {
        Addr cleanup_addr = cleanupQueue.front();
        cleanupQueue.pop();
        issuedTranslationsTable.erase(cleanup_addr);

        DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n",
                cleanup_addr);
    }
}

TLBCoalescer::TLBCoalescerStats::TLBCoalescerStats(statistics::Group *parent)
    : statistics::Group(parent),
      ADD_STAT(uncoalescedAccesses, "Number of uncoalesced TLB accesses"),
      ADD_STAT(coalescedAccesses, "Number of coalesced TLB accesses"),
      ADD_STAT(queuingCycles, "Number of cycles spent in queue"),
      ADD_STAT(localqueuingCycles,
               "Number of cycles spent in queue for all incoming reqs"),
      ADD_STAT(localLatency, "Avg. latency over all incoming pkts")
{
    localLatency = localqueuingCycles / uncoalescedAccesses;
}

} // namespace gem5