gem5 v24.0.0.0
global_memory_pipeline.cc
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#define __STDC_FORMAT_MACROS
#include <cinttypes>
#include "debug/GPUCoalescer.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUReg.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"

namespace gem5
{

GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams &p,
                                     ComputeUnit &cu)
    : computeUnit(cu), _name(cu.name() + ".GlobalMemPipeline"),
      gmQueueSize(p.global_mem_queue_size),
      maxWaveRequests(p.max_wave_requests), inflightStores(0),
      inflightLoads(0), stats(&cu)
{
}
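
// One-time initialization: record the shader's addressable global memory
// size once the CU's shader is available.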
void
GlobalMemPipeline::init()
{
    globalMemSize = computeUnit.shader->globalMemSize;
}

bool
GlobalMemPipeline::coalescerReady(GPUDynInstPtr mp) const
{
    // System requests do not need GPU coalescer tokens. Make sure nothing
    // has bypassed the operand gather check stage.
    assert(!mp->isSystemReq());

    // We require one token from the coalescer's uncoalesced table to
    // proceed.
    int token_count = 1;

    // Make sure the vector port has tokens. There is a single pool
    // of tokens so only one port in the vector port needs to be checked.
    // Lane 0 is chosen arbitrarily.
    DPRINTF(GPUCoalescer, "Checking for %d tokens\n", token_count);
    if (!mp->computeUnit()->getTokenManager()->haveTokens(token_count)) {
        DPRINTF(GPUCoalescer, "Stalling inst because coalescer is busy!\n");
        return false;
    }

    return true;
}
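
// Consume the coalescer token that coalescerReady() checked for; the
// assert guarantees a token is still available when it is acquired.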
void
GlobalMemPipeline::acqCoalescerToken(GPUDynInstPtr mp)
{
    // We require one token from the coalescer's uncoalesced table to
    // proceed.
    int token_count = 1;

    DPRINTF(GPUCoalescer, "Acquiring %d token(s)\n", token_count);
    assert(mp->computeUnit()->getTokenManager()->haveTokens(token_count));
    mp->computeUnit()->getTokenManager()->acquireTokens(token_count);
}
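
// Throttle a wavefront once it has maxWaveRequests global memory requests
// (reads plus writes) outstanding.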
bool
GlobalMemPipeline::outstandingReqsCheck(GPUDynInstPtr mp) const
{
    // Ensure we haven't exceeded the maximum number of vmem requests
    // for this wavefront.
    if ((mp->wavefront()->outstandingReqsRdGm
         + mp->wavefront()->outstandingReqsWrGm) >= maxWaveRequests) {
        return false;
    }

    return true;
}
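
// exec() completes at most one ready response from the ordered response
// buffer each time it runs, then issues at most one queued request to the
// memory system.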
void
GlobalMemPipeline::exec()
{
    // apply any returned global memory operations
    GPUDynInstPtr m = getNextReadyResp();

    bool accessVrf = true;
    Wavefront *w = nullptr;

    // check the VRF to see if the operands of a load (or load component
    // of an atomic) are accessible
    if (m && (m->isLoad() || m->isAtomicRet())) {
        w = m->wavefront();

        accessVrf = w->computeUnit->vrf[w->simdId]->
            canScheduleWriteOperandsFromLoad(w, m);
    }

    if (m && m->latency.rdy() && computeUnit.glbMemToVrfBus.rdy() &&
        accessVrf && (computeUnit.shader->coissue_return ||
        computeUnit.vectorGlobalMemUnit.rdy())) {

        w = m->wavefront();

        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n",
                m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
        m->completeAcc(m);
        if (m->isFlat()) {
            w->decLGKMInstsIssued();
        }
        w->decVMemInstsIssued();

        if (m->isLoad() || m->isAtomicRet()) {
            w->computeUnit->vrf[w->simdId]->
                scheduleWriteOperandsFromLoad(w, m);
        }

        completeRequest(m);

        Tick accessTime = curTick() - m->getAccessTime();

        // Decrement outstanding requests count
        computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
        if (m->isStore() || m->isAtomic() || m->isMemSync()) {
            computeUnit.shader->sampleStore(accessTime);
            computeUnit.shader->ScheduleAdd(&w->outstandingReqsWrGm,
                                            m->time, -1);
        }

        if (m->isLoad() || m->isAtomic() || m->isMemSync()) {
            computeUnit.shader->sampleLoad(accessTime);
            computeUnit.shader->ScheduleAdd(&w->outstandingReqsRdGm,
                                            m->time, -1);
        }

        w->validateRequestCounters();

        // Generate stats for round-trip time for vector memory insts
        // going all the way to memory and stats for individual cache
        // blocks generated by the instruction.
        m->profileRoundTripTime(curTick(), InstMemoryHop::Complete);
        computeUnit.shader->sampleInstRoundTrip(m->getRoundTripTime());
        computeUnit.shader->sampleLineRoundTrip(m->getLineAddressTime());

        // Mark write bus busy for appropriate amount of time
        computeUnit.glbMemToVrfBus.set(m->time);
        if (!computeUnit.shader->coissue_return)
            w->computeUnit->vectorGlobalMemUnit.set(m->time);
    }

    // If the pipeline has executed a global memory instruction, issue its
    // global memory packets to the DTLB
    if (!gmIssuedRequests.empty()) {
        GPUDynInstPtr mp = gmIssuedRequests.front();
        if (mp->isLoad() || mp->isAtomic()) {
            if (inflightLoads >= gmQueueSize) {
                return;
            } else {
                ++inflightLoads;
            }
        } else if (mp->isStore()) {
            if (inflightStores >= gmQueueSize) {
                return;
            } else {
                ++inflightStores;
            }
        }

        DPRINTF(GPUCoalescer, "initiateAcc for %s seqNum %d\n",
                mp->disassemble(), mp->seqNum());
        mp->initiateAcc(mp);

        if (mp->isStore() && mp->isGlobalSeg()) {
            mp->wavefront()->decExpInstsIssued();
        }

        if (((mp->isMemSync() && !mp->isEndOfKernel()) || !mp->isMemSync())) {
            /**
             * Responses are kept sorted in program order, so an entry is
             * reserved in the ordered response buffer before the request
             * is issued to the memory system. Memory fence requests are
             * not stored here because, once issued from the GM pipeline,
             * they send no response back to it.
             */
            gmOrderedRespBuffer.insert(std::make_pair(mp->seqNum(),
                std::make_pair(mp, false)));
        }

        if (!mp->isMemSync() && !mp->isEndOfKernel() && mp->allLanesZero()) {
            /**
             * Memory instructions that generate no memory requests (e.g.,
             * out-of-bounds buffer accesses where all lanes are out of
             * bounds) never trigger a callback to complete the request,
             * so they are marked complete as soon as they are issued.
             * This still inserts an entry in the ordered return FIFO so
             * that waitcnt is resolved correctly.
             */
            handleResponse(mp);
            computeUnit.getTokenManager()->recvTokens(1);
        }

        gmIssuedRequests.pop();

        DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
                computeUnit.cu_id, mp->simdId, mp->wfSlotId);
    }
}
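
// Find the next ready response to service. To preserve program order, only
// the oldest entry in the ordered response buffer is considered.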
GPUDynInstPtr
GlobalMemPipeline::getNextReadyResp()
{
    if (!gmOrderedRespBuffer.empty()) {
        auto mem_req = gmOrderedRespBuffer.begin();

        if (mem_req->second.second) {
            return mem_req->second.first;
        }
    }

    return nullptr;
}
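
// Once a memory request is finished, remove it from the ordered response
// buffer and update the in-flight load/store counts.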
void
GlobalMemPipeline::completeRequest(GPUDynInstPtr gpuDynInst)
{
    if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
        assert(inflightLoads > 0);
        --inflightLoads;
    } else if (gpuDynInst->isStore()) {
        assert(inflightStores > 0);
        --inflightStores;
    }

    // we should only pop the oldest request, and it
    // should be marked as done if we are here
    assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
    assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
    assert(gmOrderedRespBuffer.begin()->second.second);
    // remove this instruction from the buffer by its
    // unique seq ID
    gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
}
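
// Issues a request to the pipeline, i.e., enqueues it in the request
// buffer and moves the wavefront's request accounting from "in pipe" to
// "outstanding".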
void
GlobalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    if (gpuDynInst->isLoad()) {
        wf->rdGmReqsInPipe--;
        wf->outstandingReqsRdGm++;
    } else if (gpuDynInst->isStore()) {
        wf->wrGmReqsInPipe--;
        wf->outstandingReqsWrGm++;
    } else {
        // Atomic, both read and write
        wf->rdGmReqsInPipe--;
        wf->outstandingReqsRdGm++;
        wf->wrGmReqsInPipe--;
        wf->outstandingReqsWrGm++;
    }

    wf->outstandingReqs++;
    wf->validateRequestCounters();

    gpuDynInst->setAccessTime(curTick());
    gpuDynInst->profileRoundTripTime(curTick(), InstMemoryHop::Initiate);
    gmIssuedRequests.push(gpuDynInst);
}
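
// Handles responses sent to this GM pipeline by the CU: marks the matching
// entry in the ordered response buffer as done so exec() can retire it in
// program order.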
void
GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
{
    auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
    // if we are getting a response for this mem request,
    // then it ought to already be in the ordered response
    // buffer
    assert(mem_req != gmOrderedRespBuffer.end());
    mem_req->second.second = true;
}

GlobalMemPipeline::
GlobalMemPipelineStats::GlobalMemPipelineStats(statistics::Group *parent)
    : statistics::Group(parent, "GlobalMemPipeline"),
      ADD_STAT(loadVrfBankConflictCycles, "total number of cycles GM data "
               "are delayed before updating the VRF")
{
}

} // namespace gem5