gem5 [DEVELOP-FOR-25.0]
Loading...
Searching...
No Matches
bop.cc
Go to the documentation of this file.
1
41
43
44#include "debug/HWPrefetch.hh"
45#include "params/BOPPrefetcher.hh"
46
47namespace gem5
48{
49
50namespace prefetch
51{
52BOP::BOP(const BOPPrefetcherParams &p)
53 : Queued(p),
54 scoreMax(p.score_max), roundMax(p.round_max),
55 badScore(p.bad_score), rrEntries(p.rr_size),
56 tagMask((1 << p.tag_bits) - 1),
57 delayQueueEnabled(p.delay_queue_enable),
58 delayQueueSize(p.delay_queue_size),
59 delayTicks(cyclesToTicks(p.delay_queue_cycles)),
61 issuePrefetchRequests(false), bestOffset(1), phaseBestOffset(0),
62 bestScore(0), round(0), degree(p.degree)
63{
64 if (!isPowerOf2(rrEntries)) {
65 fatal("%s: number of RR entries is not power of 2\n", name());
66 }
67 if (!isPowerOf2(blkSize)) {
68 fatal("%s: cache line size is not power of 2\n", name());
69 }
70 if (p.negative_offsets_enable && (p.offset_list_size % 2 != 0)) {
71 fatal("%s: negative offsets enabled with odd offset list size\n",
72 name());
73 }
74 if (p.degree <= 0) {
75 fatal("%s: prefetch degree must be strictly greater than zero\n",
76 name());
77 }
78
79 rrLeft.resize(rrEntries);
80 rrRight.resize(rrEntries);
81
82 /*
83 * Following the paper implementation, a list with the specified number
84 * of offsets which are of the form 2^i * 3^j * 5^k with i,j,k >= 0
85 */
86 const int factors[] = { 2, 3, 5 };
87 unsigned int i = 0;
88 int64_t offset_i = 1;
89
90 while (i < p.offset_list_size)
91 {
92 int64_t offset = offset_i;
93
94 for (int n : factors) {
95 while ((offset % n) == 0) {
96 offset /= n;
97 }
98 }
99
100 if (offset == 1) {
101 offsetsList.push_back(OffsetListEntry(offset_i, 0));
102 i++;
103 /*
104 * If we want to use negative offsets, add also the negative value
105 * of the offset just calculated
106 */
107 if (p.negative_offsets_enable) {
108 offsetsList.push_back(OffsetListEntry(-offset_i, 0));
109 i++;
110 }
111 }
112
113 offset_i++;
114 }
115
116 offsetsListIterator = offsetsList.begin();
117}
118
119void
121{
122 while (!delayQueue.empty() &&
123 delayQueue.front().processTick <= curTick())
124 {
125 Addr addr_x = delayQueue.front().baseAddr;
126 Addr addr_tag = tag(addr_x);
127 insertIntoRR(addr_x, addr_tag, RRWay::Left);
128 delayQueue.pop_front();
129 }
130
131 // Schedule an event for the next element if there is one
132 if (!delayQueue.empty()) {
133 schedule(delayQueueEvent, delayQueue.front().processTick);
134 }
135}
136
137unsigned int
138BOP::index(Addr addr, unsigned int way) const
139{
140 /*
141 * The second parameter, way, is set to 0 for indexing the left side of the
142 * RR Table and, it is set to 1 for indexing the right side of the RR
143 * Table. This is because we always pass the enum RRWay as the way argument
144 * while calling index. This enum is defined in the bop.hh file.
145 *
146 * The indexing function in the author's ChampSim code, which can be found
147 * here: https://comparch-conf.gatech.edu/dpc2/final_program.html, computes
148 * the hash as follows:
149 *
150 * 1. For indexing the left side of the RR Table (way = 0), the cache line
151 * address is XORed with itself after right shifting it by the log base
152 * 2 of the number of entries in the RR Table.
153 * 2. For indexing the right side of the RR Table (way = 1), the cache
154 * line address is XORed with itself after right shifting it by the log
155 * base 2 of the number of entries in the RR Table, multiplied by two.
156 *
157 * Therefore, we if just left shift the log base 2 of the number of RR
158 * entries (log_rr_entries) with the parameter way, then if we are indexing
159 * the left side, we'll leave log_rr_entries as it is, but if we are
160 * indexing the right side, we'll multiply it with 2. Now once we have the
161 * result of this operation, we can right shift the cache line address
162 * (line_addr) by this value to get the first operand of the final XOR
163 * operation. The second operand of the XOR operation is line_addr itself
164 */
165 Addr log_rr_entries = floorLog2(rrEntries);
166 Addr line_addr = addr >> lBlkSize;
167 Addr hash = line_addr ^ (line_addr >> (log_rr_entries << way));
168 hash &= ((1ULL << log_rr_entries) - 1);
169 return hash % rrEntries;
170}
171
172void
173BOP::insertIntoRR(Addr addr, Addr tag, unsigned int way)
174{
175 switch (way) {
176 case RRWay::Left:
178 break;
179 case RRWay::Right:
181 break;
182 }
183}
184
185void
187{
188 if (delayQueue.size() == delayQueueSize) {
189 return;
190 }
191
192 /*
193 * Add the address to the delay queue and schedule an event to process
194 * it after the specified delay cycles
195 */
196 Tick process_tick = curTick() + delayTicks;
197
198 delayQueue.push_back(DelayQueueEntry(x, process_tick));
199
200 if (!delayQueueEvent.scheduled()) {
201 schedule(delayQueueEvent, process_tick);
202 }
203}
204
205void
207{
208 for (auto& it : offsetsList) {
209 it.second = 0;
210 }
211}
212
213inline Addr
215{
216 return (addr >> lBlkSize) & tagMask;
217}
218
219bool
220BOP::testRR(Addr addr_tag) const
221{
222 for (auto& it : rrLeft) {
223 if (it == addr_tag) {
224 return true;
225 }
226 }
227
228 for (auto& it : rrRight) {
229 if (it == addr_tag) {
230 return true;
231 }
232 }
233
234 return false;
235}
236
237void
239{
240 Addr offset_tag = (*offsetsListIterator).first;
241
242 /*
243 * Compute the lookup tag for the RR table. As tags are generated using
244 * lower 12 bits we subtract offset from the full address rather than the
245 * tag to avoid integer underflow.
246 */
247 Addr lookup_tag = tag((addr) - (offset_tag << lBlkSize));
248
249 // There was a hit in the RR table, increment the score for this offset
250 if (testRR(lookup_tag)) {
251 DPRINTF(HWPrefetch, "Address %#lx found in the RR table\n",
252 lookup_tag);
253 (*offsetsListIterator).second++;
254 if ((*offsetsListIterator).second > bestScore) {
255 bestScore = (*offsetsListIterator).second;
256 phaseBestOffset = (*offsetsListIterator).first;
257 DPRINTF(HWPrefetch, "New best score is %lu\n", bestScore);
258 }
259 }
260
261 // Move the offset iterator forward to prepare for the next time
263
264 /*
265 * All the offsets in the list were visited meaning that a learning
266 * phase finished. Check if
267 */
268 if (offsetsListIterator == offsetsList.end()) {
270 round++;
271 }
272
273 // Check if its time to re-calculate the best offset
274 if ((bestScore >= scoreMax) || (round >= roundMax)) {
275 round = 0;
276
277 /*
278 * If the current best score (bestScore) has exceed the threshold to
279 * enable prefetching (badScore), reset the learning structures and
280 * enable prefetch generation
281 */
282 if (bestScore > badScore) {
284 round = 0;
286 } else {
287 issuePrefetchRequests = false;
288 }
289 resetScores();
290 bestScore = 0;
291 phaseBestOffset = 0;
292 }
293}
294
295void
297 std::vector<AddrPriority> &addresses,
298 const CacheAccessor &cache)
299{
300 Addr addr = pfi.getAddr();
301 Addr tag_x = tag(addr);
302
303 if (delayQueueEnabled) {
305 } else {
307 }
308
309 /*
310 * Go through the nth offset and update the score, the best score and the
311 * current best offset if a better one is found
312 */
314
316 for (int i = 1; i <= degree; i++) {
317 Addr prefetch_addr = addr + ((i * bestOffset) << lBlkSize);
318 addresses.push_back(AddrPriority(prefetch_addr, 0));
319 DPRINTF(HWPrefetch, "Generated prefetch %#lx\n", prefetch_addr);
320 }
321 }
322}
323
324void
326{
327 const PacketPtr& pkt = arg.pkt;
328
329 // Only insert into the RR right way if it's the pkt is a hardware prefetch
330 if (!pkt->cmd.isHWPrefetch()) return;
331
332 Addr addr = pkt->getAddr();
333 Addr tag_y = tag(addr);
334
337 }
338}
339
340} // namespace prefetch
341} // namespace gem5
#define DPRINTF(x,...)
Definition trace.hh:209
Information provided to probes on a cache event.
PacketPtr pkt
Packet that triggered the cache access.
Tick cyclesToTicks(Cycles c) const
bool isHWPrefetch() const
Definition packet.hh:254
Addr getAddr() const
Definition packet.hh:807
MemCmd cmd
The command field of the packet.
Definition packet.hh:372
std::deque< DelayQueueEntry > delayQueue
Definition bop.hh:109
void calculatePrefetch(const PrefetchInfo &pfi, std::vector< AddrPriority > &addresses, const CacheAccessor &cache) override
Definition bop.cc:296
unsigned int round
Current round.
Definition bop.hh:126
std::vector< Addr > rrLeft
Definition bop.hh:87
unsigned int bestScore
Max score found so far.
Definition bop.hh:124
unsigned int index(Addr addr, unsigned int way) const
Generate a hash for the specified address to index the RR table.
Definition bop.cc:138
const unsigned int delayQueueSize
Definition bop.hh:84
void insertIntoDelayQueue(Addr addr)
Insert the specified address into the delay queue.
Definition bop.cc:186
BOP(const BOPPrefetcherParams &p)
Definition bop.cc:52
const unsigned int rrEntries
Recent requests table parameteres.
Definition bop.hh:80
const unsigned int tagMask
Definition bop.hh:81
std::vector< OffsetListEntry >::iterator offsetsListIterator
Current test offset index.
Definition bop.hh:122
const unsigned int badScore
Definition bop.hh:78
const bool delayQueueEnabled
Delay queue parameters.
Definition bop.hh:83
const unsigned int scoreMax
Learning phase parameters.
Definition bop.hh:76
std::vector< Addr > rrRight
Definition bop.hh:88
bool issuePrefetchRequests
Hardware prefetcher enabled.
Definition bop.hh:116
Addr tag(Addr addr) const
Generate the tag for the specified address based on the tag bits and the block size.
Definition bop.cc:214
void delayQueueEventWrapper()
Event to handle the delay queue processing.
Definition bop.cc:120
const unsigned int roundMax
Definition bop.hh:77
void bestOffsetLearning(Addr addr)
Learning phase of the BOP.
Definition bop.cc:238
Addr phaseBestOffset
Current best offset found in the learning phase.
Definition bop.hh:120
EventFunctionWrapper delayQueueEvent
Definition bop.hh:113
void notifyFill(const CacheAccessProbeArg &arg) override
Update the RR right table after a prefetch fill.
Definition bop.cc:325
unsigned int degree
The prefetch degree, i.e.
Definition bop.hh:174
std::vector< OffsetListEntry > offsetsList
Definition bop.hh:92
void insertIntoRR(Addr addr, Addr addr_tag, unsigned int way)
Insert the specified address into the RR table.
Definition bop.cc:173
void resetScores()
Reset all the scores from the offset list.
Definition bop.cc:206
Addr bestOffset
Current best offset to issue prefetches.
Definition bop.hh:118
const unsigned int delayTicks
Definition bop.hh:85
bool testRR(Addr addr_tag) const
Test if @X-O is hitting in the RR table to update the offset score.
Definition bop.cc:220
Class containing the information needed by the prefetch to train and generate new prefetch requests.
Definition base.hh:113
Addr getAddr() const
Obtains the address value of this Prefetcher address.
Definition base.hh:140
unsigned lBlkSize
log_2(block size of the parent cache).
Definition base.hh:291
Queued(const QueuedPrefetcherParams &p)
Definition queued.cc:100
std::pair< Addr, int32_t > AddrPriority
Definition queued.hh:192
STL vector class.
Definition stl.hh:37
static constexpr std::enable_if_t< std::is_integral_v< T >, int > floorLog2(T x)
Definition intmath.hh:59
static constexpr bool isPowerOf2(const T &n)
Definition intmath.hh:98
void schedule(Event &event, Tick when)
Definition eventq.hh:1012
#define fatal(...)
This implements a cprintf based fatal() function.
Definition logging.hh:232
Bitfield< 31 > n
Bitfield< 7 > i
Definition misc_types.hh:67
Bitfield< 23, 0 > offset
Definition types.hh:144
Bitfield< 30, 0 > index
Bitfield< 0 > p
Bitfield< 3 > x
Definition pagetable.hh:76
Bitfield< 3 > addr
Definition types.hh:84
Copyright (c) 2024 Arm Limited All rights reserved.
Definition binary32.hh:36
Tick curTick()
The universal simulation clock.
Definition cur_tick.hh:46
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition types.hh:147
uint64_t Tick
Tick count type.
Definition types.hh:58
Packet * PacketPtr
Provides generic cache lookup functions.
In a first implementation of the BO prefetcher, both banks of the RR were written simultaneously when...
Definition bop.hh:101
const std::string & name()
Definition trace.cc:48

Generated on Mon May 26 2025 09:19:11 for gem5 by doxygen 1.13.2