gem5  v22.1.0.0
global_memory_pipeline.cc
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#define __STDC_FORMAT_MACROS
#include <cinttypes>
#include "debug/GPUCoalescer.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUReg.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"

namespace gem5
{

GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams &p,
                                     ComputeUnit &cu)
    : computeUnit(cu), _name(cu.name() + ".GlobalMemPipeline"),
      gmQueueSize(p.global_mem_queue_size),
      maxWaveRequests(p.max_wave_requests), inflightStores(0),
      inflightLoads(0), stats(&cu)
{
}

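// Deferred setup: cache the shader's global memory segment size once
// the ComputeUnit's shader pointer is valid.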
void
GlobalMemPipeline::init()
{
    globalMemSize = computeUnit.shader->globalMemSize;
}

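// Flow control toward the coalescer: an instruction may leave this
// pipeline only if the coalescer can accept one more entry in its
// uncoalesced table. This check does not consume the token;
// acqCoalescerToken() performs the actual acquire.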
bool
GlobalMemPipeline::coalescerReady(GPUDynInstPtr mp) const
{
    // System requests do not need GPU coalescer tokens. Make sure nothing
    // has bypassed the operand gather check stage.
    assert(!mp->isSystemReq());

    // We require one token from the coalescer's uncoalesced table to
    // proceed
    int token_count = 1;

    // Make sure the vector port has tokens. There is a single pool
    // of tokens so only one port in the vector port needs to be checked.
    // Lane 0 is chosen arbitrarily.
    DPRINTF(GPUCoalescer, "Checking for %d tokens\n", token_count);
    if (!mp->computeUnit()->getTokenManager()->haveTokens(token_count)) {
        DPRINTF(GPUCoalescer, "Stalling inst because coalescer is busy!\n");
        return false;
    }

    return true;
}

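// Consume the coalescer token whose availability coalescerReady()
// verified; the assert documents that callers must check first.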
void
GlobalMemPipeline::acqCoalescerToken(GPUDynInstPtr mp)
{
    // We require one token from the coalescer's uncoalesced table to
    // proceed
    int token_count = 1;

    DPRINTF(GPUCoalescer, "Acquiring %d token(s)\n", token_count);
    assert(mp->computeUnit()->getTokenManager()->haveTokens(token_count));
    mp->computeUnit()->getTokenManager()->acquireTokens(token_count);
}

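// Per-wavefront back-pressure: a new vector memory request may issue
// only while the wavefront's combined outstanding global reads and
// writes remain below the max_wave_requests parameter.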
bool
GlobalMemPipeline::outstandingReqsCheck(GPUDynInstPtr mp) const
{
    // Ensure we haven't exceeded the maximum number of vmem requests
    // for this wavefront
    if ((mp->wavefront()->outstandingReqsRdGm
         + mp->wavefront()->outstandingReqsWrGm) >= maxWaveRequests) {
        return false;
    }

    return true;
}

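// Main per-cycle routine, driven by the ComputeUnit. Each invocation
// does at most two things:
//  1) retires the oldest completed response (in program order),
//     writing load/atomic return data back to the VRF when the
//     register file and the memory-to-VRF bus are ready; and
//  2) issues the oldest queued request to the memory system, subject
//     to the gmQueueSize cap on in-flight loads and stores.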
void
GlobalMemPipeline::exec()
{
    // apply any returned global memory operations
    GPUDynInstPtr m = getNextReadyResp();

    bool accessVrf = true;
    Wavefront *w = nullptr;

    // check the VRF to see if the operands of a load (or load component
    // of an atomic) are accessible
    if (m && (m->isLoad() || m->isAtomicRet())) {
        w = m->wavefront();

        accessVrf = w->computeUnit->vrf[w->simdId]->
            canScheduleWriteOperandsFromLoad(w, m);
    }

    if (m && m->latency.rdy() && computeUnit.glbMemToVrfBus.rdy() &&
        accessVrf && (computeUnit.shader->coissue_return ||
        computeUnit.vectorGlobalMemUnit.rdy())) {

        w = m->wavefront();

        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n",
                m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
        m->completeAcc(m);
        if (m->isFlat()) {
            w->decLGKMInstsIssued();
        }
        w->decVMemInstsIssued();

        if (m->isLoad() || m->isAtomicRet()) {
            w->computeUnit->vrf[w->simdId]->
                scheduleWriteOperandsFromLoad(w, m);
        }

        completeRequest(m);

        Tick accessTime = curTick() - m->getAccessTime();

        // Decrement outstanding requests count
        computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
        if (m->isStore() || m->isAtomic() || m->isMemSync()) {
            computeUnit.shader->sampleStore(accessTime);
            computeUnit.shader->ScheduleAdd(&w->outstandingReqsWrGm,
                                            m->time, -1);
        }

        if (m->isLoad() || m->isAtomic() || m->isMemSync()) {
            computeUnit.shader->sampleLoad(accessTime);
            computeUnit.shader->ScheduleAdd(&w->outstandingReqsRdGm,
                                            m->time, -1);
        }

        w->validateRequestCounters();

        // Generate stats for round-trip time for vector memory insts
        // going all the way to memory and stats for individual cache
        // blocks generated by the instruction.
        m->profileRoundTripTime(curTick(), InstMemoryHop::Complete);
        computeUnit.shader->sampleInstRoundTrip(m->getRoundTripTime());
        computeUnit.shader->sampleLineRoundTrip(m->getLineAddressTime());

        // Mark write bus busy for appropriate amount of time
        computeUnit.glbMemToVrfBus.set(m->time);
        if (!computeUnit.shader->coissue_return)
            w->computeUnit->vectorGlobalMemUnit.set(m->time);
    }

    // If pipeline has executed a global memory instruction
    // execute global memory packets and issue global
    // memory packets to DTLB
    if (!gmIssuedRequests.empty()) {
        GPUDynInstPtr mp = gmIssuedRequests.front();
        if (mp->isLoad() || mp->isAtomic()) {
            if (inflightLoads >= gmQueueSize) {
                return;
            } else {
                ++inflightLoads;
            }
        } else if (mp->isStore()) {
            if (inflightStores >= gmQueueSize) {
                return;
            } else {
                ++inflightStores;
            }
        }

        DPRINTF(GPUCoalescer, "initiateAcc for %s seqNum %d\n",
                mp->disassemble(), mp->seqNum());
        mp->initiateAcc(mp);

        if (mp->isStore() && mp->isGlobalSeg()) {
            mp->wavefront()->decExpInstsIssued();
        }

        if (((mp->isMemSync() && !mp->isEndOfKernel()) || !mp->isMemSync())) {
            /**
             * Responses are kept sorted in program order, so an entry
             * must be reserved in the resp buffer before the request is
             * issued to the memory system. Mem fence requests are not
             * stored here because, once issued from the GM pipeline,
             * they do not send any response back to it.
             */
            gmOrderedRespBuffer.insert(std::make_pair(mp->seqNum(),
                std::make_pair(mp, false)));
        }

        if (!mp->isMemSync() && !mp->isEndOfKernel() && mp->allLanesZero()) {
            /**
             * Memory access instructions that generate no memory requests
             * (such as out-of-bounds buffer accesses where all lanes are
             * out of bounds) will never trigger a completion callback, so
             * they are marked as completed as soon as they are issued.
             * Note that this still inserts an entry in the ordered return
             * FIFO above.
             */
            handleResponse(mp);
        }

        gmIssuedRequests.pop();

        DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
                computeUnit.cu_id, mp->simdId, mp->wfSlotId);
    }
}

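// Responses retire strictly in seqNum (program) order, so only the
// front entry of the ordered response buffer is ever a candidate, and
// only after handleResponse() has marked it done.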
GPUDynInstPtr
GlobalMemPipeline::getNextReadyResp()
{
    if (!gmOrderedRespBuffer.empty()) {
        auto mem_req = gmOrderedRespBuffer.begin();

        if (mem_req->second.second) {
            return mem_req->second.first;
        }
    }

    return nullptr;
}

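// Retire the oldest response: release its in-flight slot and erase it
// from the ordered response buffer. The asserts enforce that only the
// oldest, completed entry is ever removed.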
void
GlobalMemPipeline::completeRequest(GPUDynInstPtr gpuDynInst)
{
    if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
        assert(inflightLoads > 0);
        --inflightLoads;
    } else if (gpuDynInst->isStore()) {
        assert(inflightStores > 0);
        --inflightStores;
    }

    // we should only pop the oldest request, and it
    // should be marked as done if we are here
    assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
    assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
    assert(gmOrderedRespBuffer.begin()->second.second);
    // remove this instruction from the buffer by its
    // unique seq ID
    gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
}

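// Accept a request from the execute stage: shift it from the
// wavefront's "in pipe" counters to its "outstanding" counters
// (atomics count as both a read and a write), timestamp it for
// round-trip statistics, and queue it for issue by exec().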
void
GlobalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    if (gpuDynInst->isLoad()) {
        wf->rdGmReqsInPipe--;
        wf->outstandingReqsRdGm++;
    } else if (gpuDynInst->isStore()) {
        wf->wrGmReqsInPipe--;
        wf->outstandingReqsWrGm++;
    } else {
        // Atomic, both read and write
        wf->rdGmReqsInPipe--;
        wf->outstandingReqsRdGm++;
        wf->wrGmReqsInPipe--;
        wf->outstandingReqsWrGm++;
    }

    wf->outstandingReqs++;
    wf->validateRequestCounters();

    gpuDynInst->setAccessTime(curTick());
    gpuDynInst->profileRoundTripTime(curTick(), InstMemoryHop::Initiate);
    gmIssuedRequests.push(gpuDynInst);
}

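// Handle a response sent to this GM pipeline by the CU: mark the
// matching entry in the ordered response buffer as done so exec() can
// retire it once it reaches the front.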
void
GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
{
    auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
    // if we are getting a response for this mem request,
    // then it ought to already be in the ordered response
    // buffer
    assert(mem_req != gmOrderedRespBuffer.end());
    mem_req->second.second = true;
}

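// Stats for this pipeline live in their own group, named
// "GlobalMemPipeline" under the parent group (the ComputeUnit).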
GlobalMemPipeline::GlobalMemPipelineStats::
GlobalMemPipelineStats(statistics::Group *parent)
    : statistics::Group(parent, "GlobalMemPipeline"),
      ADD_STAT(loadVrfBankConflictCycles, "total number of cycles GM data "
               "are delayed before updating the VRF")
{
}

} // namespace gem5