gem5 [DEVELOP-FOR-25.0]
Loading...
Searching...
No Matches
lds_state.cc
Go to the documentation of this file.
1/*
2 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
33
34#include <array>
35#include <cstdio>
36#include <cstdlib>
37
40#include "gpu-compute/shader.hh"
41
42namespace gem5
43{
44
50 tickEvent(this),
51 cuPort(name() + ".port", this),
52 maximumSize(params.size),
56{
57 fatal_if(params.banks <= 0,
58 "Number of LDS banks should be positive number");
59 fatal_if((params.banks & (params.banks - 1)) != 0,
60 "Number of LDS banks should be a power of 2");
61 fatal_if(params.size <= 0,
62 "cannot allocate an LDS with a size less than 1");
63 fatal_if(params.size % 2,
64 "the LDS should be an even number");
65}
66
70void
72{
73 // check that this gets assigned to the same thing each time
74 fatal_if(!x_parent, "x_parent should not be nullptr");
75 fatal_if(x_parent == parent,
76 "should not be setting the parent twice");
77
78 parent = x_parent;
79 _name = x_parent->name() + ".LdsState";
80}
81
85unsigned
86LdsState::countBankConflicts(PacketPtr packet, unsigned *bankAccesses)
87{
88 Packet::SenderState *baseSenderState = packet->senderState;
89 while (baseSenderState->predecessor) {
90 baseSenderState = baseSenderState->predecessor;
91 }
92 const ComputeUnit::LDSPort::SenderState *senderState =
93 dynamic_cast<ComputeUnit::LDSPort::SenderState *>(baseSenderState);
94
95 fatal_if(!senderState,
96 "did not get the right sort of sender state");
97
98 GPUDynInstPtr gpuDynInst = senderState->getMemInst();
99
100 return countBankConflicts(gpuDynInst, bankAccesses);
101}
102
103// Count the total number of bank conflicts for the local memory packet
104unsigned
106 unsigned *numBankAccesses)
107{
108 int bank_conflicts = 0;
109 std::vector<int> bank;
110 // the number of LDS banks being touched by the memory instruction
111 int numBanks = std::min(parent->wfSize(), banks);
112 // if the wavefront size is larger than the number of LDS banks, we
113 // need to iterate over all work items to calculate the total
114 // number of bank conflicts
115 int groups = (parent->wfSize() > numBanks) ?
116 (parent->wfSize() / numBanks) : 1;
117 for (int i = 0; i < groups; i++) {
118 // Address Array holding all the work item addresses of an instruction
119 std::vector<Addr> addr_array;
120 addr_array.resize(numBanks, 0);
121 bank.clear();
122 bank.resize(banks, 0);
123 int max_bank = 0;
124
125 // populate the address array for all active work items
126 for (int j = 0; j < numBanks; j++) {
127 if (gpuDynInst->exec_mask[(i*numBanks)+j]) {
128 addr_array[j] = gpuDynInst->addr[(i*numBanks)+j];
129 } else {
130 addr_array[j] = std::numeric_limits<Addr>::max();
131 }
132 }
133
134 if (gpuDynInst->isLoad() || gpuDynInst->isStore()) {
135 // mask identical addresses
136 for (int j = 0; j < numBanks; ++j) {
137 for (int j0 = 0; j0 < j; j0++) {
138 if (addr_array[j] != std::numeric_limits<Addr>::max()
139 && addr_array[j] == addr_array[j0]) {
140 addr_array[j] = std::numeric_limits<Addr>::max();
141 }
142 }
143 }
144 }
145 // calculate bank conflicts
146 for (int j = 0; j < numBanks; ++j) {
147 if (addr_array[j] != std::numeric_limits<Addr>::max()) {
148 int bankId = addr_array[j] % banks;
149 bank[bankId]++;
150 max_bank = std::max(max_bank, bank[bankId]);
151 // Count the number of LDS banks accessed.
152 // Since we have masked identical addresses all remaining
153 // accesses will need to be serialized if they access
154 // the same bank (bank conflict).
155 (*numBankAccesses)++;
156 }
157 }
158 bank_conflicts += max_bank;
159 }
160 panic_if(bank_conflicts > parent->wfSize(),
161 "Max bank conflicts should match num of work items per instr");
162 return bank_conflicts;
163}
164
// Handle a timing request by forwarding the packet to the owning LDS.
// Returns whatever ownerLds->processPacket(packet) returns.
168bool
170{
171 return ownerLds->processPacket(packet);
172}
173
176{
178 dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
179 packet->senderState);
180 return ss->getMemInst();
181}
182
186bool
188{
189 unsigned bankAccesses = 0;
190 // the number of conflicts this packet will have when accessing the LDS
191 unsigned bankConflicts = countBankConflicts(packet, &bankAccesses);
192 // count the total number of physical LDS bank accessed
193 parent->stats.ldsBankAccesses += bankAccesses;
194 // count the LDS bank conflicts. A number set to 1 indicates one
195 // access per bank maximum so there are no bank conflicts
196 parent->stats.ldsBankConflictDist.sample(bankConflicts-1);
197
198 GPUDynInstPtr dynInst = getDynInstr(packet);
199 // account for the LDS bank conflict overhead
200 int busLength = (dynInst->isLoad()) ? parent->loadBusLength() :
201 (dynInst->isStore()) ? parent->storeBusLength() :
202 parent->loadBusLength();
203 // delay for accessing the LDS
204 Tick processingTime =
205 parent->cyclesToTicks(Cycles(bankConflicts * bankConflictPenalty)) +
206 parent->cyclesToTicks(Cycles(busLength));
207 // choose (delay + last packet in queue) or (now + delay) as the time to
208 // return this
209 Tick doneAt = earliestReturnTime() + processingTime;
210 // then store it for processing
211 return returnQueuePush(std::make_pair(doneAt, packet));
212}
213
217bool
219{
220 // TODO add time limits (e.g. one packet per cycle) and queue size limits
221 // and implement flow control
222 returnQueue.push(thePair);
223
224 // if there is no set wakeup time, look through the queue
225 if (!tickEvent.scheduled()) {
226 process();
227 }
228
229 return true;
230}
231
// Functional-mode access is not implemented: this unconditionally
// terminates via fatal("not implemented").
235void
237{
238 fatal("not implemented");
239}
240
// Handle a retry for a response: asserts that ownerLds->isRetryResp() holds,
// resets that flag with setRetryResp(false), then calls ownerLds->process().
244void
246{
247 // TODO verify that this is the right way to do this
248 assert(ownerLds->isRetryResp());
249 ownerLds->setRetryResp(false);
250 ownerLds->process();
251}
252
// Receiving a retry is not implemented: this unconditionally terminates
// via fatal("not implemented").
256void
258{
259 fatal("not implemented");
260}
261
265bool
267{
268 Tick now = clockEdge();
269
270 // send back completed packets
271 while (!returnQueue.empty() && returnQueue.front().first <= now) {
272 PacketPtr packet = returnQueue.front().second;
273
275 dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
276 packet->senderState);
277
278 GPUDynInstPtr gpuDynInst = ss->getMemInst();
279
280 gpuDynInst->initiateAcc(gpuDynInst);
281
282 packet->makeTimingResponse();
283
284 returnQueue.pop();
285
286 bool success = cuPort.sendTimingResp(packet);
287
288 if (!success) {
289 retryResp = true;
290 panic("have not handled timing responses being NACK'd when sent"
291 "back");
292 }
293 }
294
295 // determine the next wakeup time
296 if (!returnQueue.empty()) {
297
298 Tick next = returnQueue.front().first;
299
300 if (tickEvent.scheduled()) {
301
302 if (next < tickEvent.when()) {
303
304 tickEvent.deschedule();
305 tickEvent.schedule(next);
306 }
307 } else {
308 tickEvent.schedule(next);
309 }
310 }
311
312 return true;
313}
314
// Event callback: delegates to ldsState->process(); the returned value is
// discarded.
318void
320{
321 ldsState->process();
322}
323
324} // namespace gem5
ClockedObject(const ClockedObjectParams &p)
Tick clockEdge(Cycles cycles=Cycles(0)) const
Determine the tick when a cycle begins, by default the current one, but the argument also enables the...
SenderState is information carried along with the packet, esp.
Cycles is a wrapper class for representing cycle counts, i.e.
Definition types.hh:79
virtual void recvRetry()
receive a retry
Definition lds_state.cc:257
virtual bool recvTimingReq(PacketPtr pkt)
receive the packet from the CU
Definition lds_state.cc:169
virtual void recvFunctional(PacketPtr pkt)
receive a packet in functional mode
Definition lds_state.cc:236
virtual void recvRespRetry()
receive a retry for a response
Definition lds_state.cc:245
virtual void process()
wake up at this time and perform specified actions
Definition lds_state.cc:319
bool process()
look for packets to return at this time
Definition lds_state.cc:266
AddrRange range
Definition lds_state.hh:595
std::string _name
Definition lds_state.hh:586
bool returnQueuePush(std::pair< Tick, PacketPtr > thePair)
add this to the queue of packets to be returned
Definition lds_state.cc:218
LdsState(const Params &params)
the default constructor that works with SWIG
Definition lds_state.cc:48
bool processPacket(PacketPtr packet)
process an incoming packet, add it to the return queue
Definition lds_state.cc:187
int bankConflictPenalty
Definition lds_state.hh:598
TickEvent tickEvent
Definition lds_state.hh:304
unsigned countBankConflicts(PacketPtr packet, unsigned *bankAccesses)
derive the gpu mem packet from the packet and then count the bank conflicts
Definition lds_state.cc:86
std::queue< std::pair< Tick, PacketPtr > > returnQueue
Definition lds_state.hh:309
ComputeUnit * parent
Definition lds_state.hh:584
LdsStateParams Params
Definition lds_state.hh:331
void setParent(ComputeUnit *x_parent)
set the parent and name based on the parent
Definition lds_state.cc:71
CuSidePort cuPort
Definition lds_state.hh:582
GPUDynInstPtr getDynInstr(PacketPtr packet)
Definition lds_state.cc:175
Tick earliestReturnTime() const
Definition lds_state.hh:479
virtual std::string name() const
Definition named.hh:60
void makeTimingResponse()
Definition packet.hh:1080
SenderState * senderState
This packet's sender state.
Definition packet.hh:545
STL pair class.
Definition stl.hh:58
STL vector class.
Definition stl.hh:37
#define panic(...)
This implements a cprintf based panic() function.
Definition logging.hh:220
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
Definition logging.hh:268
#define fatal(...)
This implements a cprintf based fatal() function.
Definition logging.hh:232
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Definition logging.hh:246
const Params & params() const
Bitfield< 7 > i
Definition misc_types.hh:67
Bitfield< 21 > ss
Definition misc_types.hh:60
Copyright (c) 2024 Arm Limited All rights reserved.
Definition binary32.hh:36
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition misc.hh:49
uint64_t Tick
Tick count type.
Definition types.hh:58
Packet * PacketPtr
A virtual base opaque structure used to hold state associated with the packet (e.g....
Definition packet.hh:469
SenderState * predecessor
Definition packet.hh:470

Generated on Mon May 26 2025 09:19:10 for gem5 by doxygen 1.13.2