gem5 [DEVELOP-FOR-25.0]
global_memory_pipeline.cc
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
#define __STDC_FORMAT_MACROS
#include <cinttypes>
#include "debug/GPUCoalescer.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUReg.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"

namespace gem5
{

GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams &p,
                                     ComputeUnit &cu)
    : computeUnit(cu), _name(cu.name() + ".GlobalMemPipeline"),
      gmQueueSize(p.global_mem_queue_size),
      maxWaveRequests(p.max_wave_requests), inflightStores(0),
      inflightLoads(0), stats(&cu)
{
}

void
GlobalMemPipeline::init()
{
    globalMemSize = computeUnit.shader->globalMemSize;
}

bool
GlobalMemPipeline::coalescerReady(GPUDynInstPtr mp) const
{
    // System requests do not need GPU coalescer tokens. Make sure nothing
    // has bypassed the operand gather check stage.
    assert(!mp->isSystemReq());

    // We require one token from the coalescer's uncoalesced table to
    // proceed
    int token_count = 1;

    // Make sure the vector port has tokens. There is a single pool
    // of tokens so only one port in the vector port needs to be checked.
    // Lane 0 is chosen arbitrarily.
    DPRINTF(GPUCoalescer, "Checking for %d tokens\n", token_count);
    if (!mp->computeUnit()->getTokenManager()->haveTokens(token_count)) {
        DPRINTF(GPUCoalescer, "Stalling inst because coalescer is busy!\n");
        return false;
    }

    return true;
}

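// Unconditionally take one coalescer token. Callers are expected to have
// checked coalescerReady() first; the assert below enforces that a token
// is actually available.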
void
GlobalMemPipeline::acqCoalescerToken(GPUDynInstPtr mp)
{
    // We require one token from the coalescer's uncoalesced table to
    // proceed
    int token_count = 1;

    DPRINTF(GPUCoalescer, "Acquiring %d token(s)\n", token_count);
    assert(mp->computeUnit()->getTokenManager()->haveTokens(token_count));
    mp->computeUnit()->getTokenManager()->acquireTokens(token_count);
}

bool
GlobalMemPipeline::outstandingReqsCheck(GPUDynInstPtr mp) const
{
    // Ensure we haven't exceeded the maximum number of vmem requests
    // for this wavefront
    if ((mp->wavefront()->outstandingReqsRdGm
         + mp->wavefront()->outstandingReqsWrGm) >= maxWaveRequests) {
        return false;
    }

    return true;
}

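// exec() is called every cycle. It first tries to commit at most one
// completed response (the oldest entry in gmOrderedRespBuffer that has
// been marked done) back to the register file, then tries to issue at
// most one queued request from gmIssuedRequests to the memory system,
// subject to the per-type (load/store) in-flight limits.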
void
GlobalMemPipeline::exec()
{
    // apply any returned global memory operations
    GPUDynInstPtr m = getNextReadyResp();

    bool accessVrf = true;
    Wavefront *w = nullptr;

    // check the VRF to see if the operands of a load (or load component
    // of an atomic) are accessible
    if (m && (m->isLoad() || m->isAtomicRet())) {
        w = m->wavefront();

        accessVrf = w->computeUnit->vrf[w->simdId]->
            canScheduleWriteOperandsFromLoad(w, m);

    }

    if (m && m->latency.rdy() && computeUnit.glbMemToVrfBus.rdy() &&
        accessVrf && (computeUnit.shader->coissue_return ||
        computeUnit.vectorGlobalMemUnit.rdy())) {

        w = m->wavefront();

        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n",
                m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
        m->completeAcc(m);
        if (m->isFlat()) {
            w->decLGKMInstsIssued();
            w->untrackLGKMInst(m);
        }
        w->decVMemInstsIssued();
        w->untrackVMemInst(m);

        if (m->isLoad() || m->isAtomicRet()) {
            w->computeUnit->vrf[w->simdId]->
                scheduleWriteOperandsFromLoad(w, m);
        }

        completeRequest(m);

        Tick accessTime = curTick() - m->getAccessTime();

        // Decrement outstanding requests count
        computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
        if (m->isStore() || m->isAtomic() || m->isMemSync()) {
            computeUnit.shader->sampleStore(accessTime);
            computeUnit.shader->ScheduleAdd(&w->outstandingReqsWrGm,
                                            m->time, -1);
        }

        if (m->isLoad() || m->isAtomic() || m->isMemSync()) {
            computeUnit.shader->sampleLoad(accessTime);
            computeUnit.shader->ScheduleAdd(&w->outstandingReqsRdGm,
                                            m->time, -1);
        }

        w->validateRequestCounters();

        // Generate stats for round-trip time for vector memory insts
        // going all the way to memory and stats for individual cache
        // blocks generated by the instruction.
        m->profileRoundTripTime(curTick(), InstMemoryHop::Complete);
        computeUnit.shader->sampleInstRoundTrip(m->getRoundTripTime());
        computeUnit.shader->sampleLineRoundTrip(m->getLineAddressTime());

        // Mark write bus busy for appropriate amount of time
        computeUnit.glbMemToVrfBus.set(m->time);
        if (!computeUnit.shader->coissue_return)
            w->computeUnit->vectorGlobalMemUnit.set(m->time);
    }

    // If pipeline has executed a global memory instruction
    // execute global memory packets and issue global
    // memory packets to DTLB
    if (!gmIssuedRequests.empty()) {
        GPUDynInstPtr mp = gmIssuedRequests.front();
        if (mp->isLoad() || mp->isAtomic()) {
            if (inflightLoads >= gmQueueSize) {
                return;
            } else {
                ++inflightLoads;
            }
        } else if (mp->isStore()) {
            if (inflightStores >= gmQueueSize) {
                return;
            } else {
                ++inflightStores;
            }
        }

        DPRINTF(GPUCoalescer, "initiateAcc for %s seqNum %d\n",
                mp->disassemble(), mp->seqNum());
        mp->initiateAcc(mp);

        if (mp->isStore() && mp->isGlobalSeg()) {
            mp->wavefront()->decExpInstsIssued();
            mp->wavefront()->untrackExpInst(mp);
        }

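        // Every operation that expects a response is tracked in the
        // ordered response buffer, keyed by sequence number; the bool in
        // the pair flips to true once the response arrives. Only
        // end-of-kernel memory syncs skip this bookkeeping.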
        if (((mp->isMemSync() && !mp->isEndOfKernel()) || !mp->isMemSync())) {
            gmOrderedRespBuffer.insert(std::make_pair(mp->seqNum(),
                std::make_pair(mp, false)));
        }

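        // Accesses for which no lane actually issues a memory request
        // (e.g., every lane inactive or out of bounds) will never receive
        // a response callback, so mark the buffered entry done and hand
        // the coalescer token straight back.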
        if (!mp->isMemSync() && !mp->isEndOfKernel() && mp->allLanesZero()) {
            handleResponse(mp);
            computeUnit.getTokenManager()->recvTokens(1);
        }

        gmIssuedRequests.pop();

        DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
                computeUnit.cu_id, mp->simdId, mp->wfSlotId);
    }
}

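// Find the next ready response to service. Responses are committed in
// order: only the oldest entry is considered, and only once it has been
// marked done by handleResponse().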
GPUDynInstPtr
GlobalMemPipeline::getNextReadyResp()
{
    if (!gmOrderedRespBuffer.empty()) {
        auto mem_req = gmOrderedRespBuffer.begin();

        if (mem_req->second.second) {
            return mem_req->second.first;
        }
    }

    return nullptr;
}

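// Once a memory request is finished we remove it from the buffer. The
// entry must be the oldest outstanding one and must already be marked
// done.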
void
GlobalMemPipeline::completeRequest(GPUDynInstPtr gpuDynInst)
{
    if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
        assert(inflightLoads > 0);
        --inflightLoads;
    } else if (gpuDynInst->isStore()) {
        assert(inflightStores > 0);
        --inflightStores;
    }

    // we should only pop the oldest request, and it
    // should be marked as done if we are here
    assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
    assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
    assert(gmOrderedRespBuffer.begin()->second.second);
    // remove this instruction from the buffer by its
    // unique seq ID
    gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
}

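// Issues a request to the pipeline (i.e., enqueue it in the request
// buffer) and moves the wavefront's counters from "in pipe" to
// "outstanding".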
void
GlobalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    if (gpuDynInst->isLoad()) {
        wf->rdGmReqsInPipe--;
        wf->outstandingReqsRdGm++;
    } else if (gpuDynInst->isStore()) {
        wf->wrGmReqsInPipe--;
        wf->outstandingReqsWrGm++;
    } else {
        // Atomic, both read and write
        wf->rdGmReqsInPipe--;
        wf->outstandingReqsRdGm++;
        wf->wrGmReqsInPipe--;
        wf->outstandingReqsWrGm++;
    }

    wf->outstandingReqs++;
    wf->validateRequestCounters();

    gpuDynInst->setAccessTime(curTick());
    gpuDynInst->profileRoundTripTime(curTick(), InstMemoryHop::Initiate);
    gmIssuedRequests.push(gpuDynInst);
}

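// This method handles responses sent to this GM pipeline by the CU:
// it marks the matching entry in the ordered response buffer as done.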
void
GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
{
    auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
    // if we are getting a response for this mem request,
    // then it ought to already be in the ordered response
    // buffer
    assert(mem_req != gmOrderedRespBuffer.end());
    mem_req->second.second = true;
}

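// Debugging aid: dump the pipeline's in-flight load/store counts and the
// contents of the ordered response buffer.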
void
GlobalMemPipeline::printProgress()
{
    std::cout << "GMPipe inflight: " << inflightLoads << "/" << inflightStores
              << " issued: " << gmIssuedRequests.size() << " returned: "
              << gmOrderedRespBuffer.size() << " -- :\n";

    for (auto &pair : gmOrderedRespBuffer) {
        auto &inst_pair = pair.second;
        auto &inst = inst_pair.first;
        std::cout << "\t" << inst->disassemble() << " -- " << inst_pair.second
                  << "\n";
    }
}

GlobalMemPipeline::GlobalMemPipelineStats::GlobalMemPipelineStats(
    statistics::Group *parent)
    : statistics::Group(parent, "GlobalMemPipeline"),
      ADD_STAT(loadVrfBankConflictCycles, "total number of cycles GM data "
               "are delayed before updating the VRF")
{
}

} // namespace gem5