gem5  v21.2.1.1
global_memory_pipeline.cc
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from this
17  * software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #define __STDC_FORMAT_MACROS
33 #include <cinttypes>
34 #include "debug/GPUCoalescer.hh"
35 #include "debug/GPUMem.hh"
36 #include "debug/GPUReg.hh"
40 #include "gpu-compute/shader.hh"
42 #include "gpu-compute/wavefront.hh"
43 
44 namespace gem5
45 {
46 
// Construct the global memory pipeline owned by compute unit `cu`.
//  - p:  compute unit parameters; supplies global_mem_queue_size (stored in
//        gmQueueSize) and max_wave_requests (stored in maxWaveRequests).
//  - cu: the owning compute unit; also used as the parent of the stats
//        group and as the prefix of this object's name
//        (cu.name() + ".GlobalMemPipeline").
// Both in-flight counters (loads and stores) start at zero.
47 GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams &p,
48  ComputeUnit &cu)
49  : computeUnit(cu), _name(cu.name() + ".GlobalMemPipeline"),
50  gmQueueSize(p.global_mem_queue_size),
51  maxWaveRequests(p.max_wave_requests), inflightStores(0),
52  inflightLoads(0), stats(&cu)
53 {
54 }
55 
// GlobalMemPipeline::init().
// NOTE(review): this listing appears to omit the signature (listing line 57)
// and the body statement (listing line 59); consult the original source
// file before editing this function.
56 void
58 {
60 }
61 
// GlobalMemPipeline::coalescerReady(GPUDynInstPtr mp) const
// Reports whether the token manager of mp's compute unit currently has the
// single token this instruction needs. Returns false (after logging a stall
// message) when the token is unavailable and true otherwise. This function
// only queries availability; it does not acquire the token.
62 bool
64 {
65  // We require one token from the coalescer's uncoalesced table to
66  // proceed
67  int token_count = 1;
68 
69  // Make sure the vector port has tokens. There is a single pool
70  // of tokens so only one port in the vector port needs to be checked.
71  // Lane 0 is chosen arbitrarily.
72  DPRINTF(GPUCoalescer, "Checking for %d tokens\n", token_count);
73  if (!mp->computeUnit()->getTokenManager()->haveTokens(token_count)) {
74  DPRINTF(GPUCoalescer, "Stalling inst because coalsr is busy!\n");
75  return false;
76  }
77 
78  return true;
79 }
80 
// GlobalMemPipeline::acqCoalescerToken(GPUDynInstPtr mp)
// Acquires one token from the token manager of mp's compute unit.
// The token must already be available: availability is asserted, not
// handled, so callers are presumably expected to have checked
// coalescerReady() first -- TODO confirm against the call sites.
81 void
83 {
84  // We require one token from the coalescer's uncoalesced table to
85  // proceed
86  int token_count = 1;
87 
88  DPRINTF(GPUCoalescer, "Acquiring %d token(s)\n", token_count);
89  assert(mp->computeUnit()->getTokenManager()->haveTokens(token_count));
90  mp->computeUnit()->getTokenManager()->acquireTokens(token_count);
91 }
92 
// GlobalMemPipeline::outstandingReqsCheck(GPUDynInstPtr mp) const
// Returns false when the wavefront that owns mp already has at least
// maxWaveRequests outstanding global memory requests (reads plus writes),
// and true when there is still room for another request.
93 bool
95 {
96  // Ensure we haven't exceeded the maximum number of vmem requests
97  // for this wavefront
98  if ((mp->wavefront()->outstandingReqsRdGm
99  + mp->wavefront()->outstandingReqsWrGm) >= maxWaveRequests) {
100  return false;
101  }
102 
103  return true;
104 }
105 
// GlobalMemPipeline::exec()
// Performs the pipeline's work for one invocation, in two phases:
//   1. Completion: if a returned instruction `m` is available and the
//      latency, bus, VRF and return-path conditions hold, complete it and
//      update the owning wavefront's counters and the shader's statistics.
//   2. Issue: if gmIssuedRequests is non-empty, start the access for
//      request `mp`, subject to the in-flight load/store limits.
// NOTE(review): this listing omits several lines of the function (listing
// lines 107, 110, 127, 143-144, 172-173, 181, 205-213 and 219-229), among
// them the definitions of `m` and `mp` and the tail of the completion
// condition. Consult the original source file before changing any logic.
106 void
108 {
109  // apply any returned global memory operations
111 
    // accessVrf stays true for instructions that are neither loads nor
    // returning atomics, so only those two kinds are gated on the VRF.
112  bool accessVrf = true;
113  Wavefront *w = nullptr;
114 
115  // check the VRF to see if the operands of a load (or load component
116  // of an atomic) are accessible
117  if (m && (m->isLoad() || m->isAtomicRet())) {
118  w = m->wavefront();
119 
120  accessVrf = w->computeUnit->vrf[w->simdId]->
121  canScheduleWriteOperandsFromLoad(w, m);
122 
123  }
124 
    // Completion phase. The remainder of this condition (listing line 127)
    // is not shown in this listing.
125  if (m && m->latency.rdy() && computeUnit.glbMemToVrfBus.rdy() &&
126  accessVrf && (computeUnit.shader->coissue_return ||
128 
129  w = m->wavefront();
130 
131  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n",
132  m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
133  m->completeAcc(m);
    // Flat instructions additionally release an LGKM issue count.
134  if (m->isFlat()) {
135  w->decLGKMInstsIssued();
136  }
137  w->decVMemInstsIssued();
138 
    // Loads and returning atomics write their result operands to the VRF.
139  if (m->isLoad() || m->isAtomicRet()) {
140  w->computeUnit->vrf[w->simdId]->
141  scheduleWriteOperandsFromLoad(w, m);
142  }
143 
145 
    // Elapsed ticks since the access time recorded on the instruction.
146  Tick accessTime = curTick() - m->getAccessTime();
147 
148  // Decrement outstanding requests count
149  computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
    // Atomics and memory syncs satisfy both this condition and the load
    // condition below, so they are sampled and decremented on both the
    // write and the read side.
150  if (m->isStore() || m->isAtomic() || m->isMemSync()) {
151  computeUnit.shader->sampleStore(accessTime);
152  computeUnit.shader->ScheduleAdd(&w->outstandingReqsWrGm,
153  m->time, -1);
154  }
155 
156  if (m->isLoad() || m->isAtomic() || m->isMemSync()) {
157  computeUnit.shader->sampleLoad(accessTime);
158  computeUnit.shader->ScheduleAdd(&w->outstandingReqsRdGm,
159  m->time, -1);
160  }
161 
162  w->validateRequestCounters();
163 
164  // Generate stats for round-trip time for vector memory insts
165  // going all the way to memory and stats for individual cache
166  // blocks generated by the instruction.
167  m->profileRoundTripTime(curTick(), InstMemoryHop::Complete);
168  computeUnit.shader->sampleInstRoundTrip(m->getRoundTripTime());
169  computeUnit.shader->sampleLineRoundTrip(m->getLineAddressTime());
170 
171  // Mark write bus busy for appropriate amount of time
174  w->computeUnit->vectorGlobalMemUnit.set(m->time);
175  }
176 
177  // If pipeline has executed a global memory instruction
178  // execute global memory packets and issue global
179  // memory packets to DTLB
180  if (!gmIssuedRequests.empty()) {
    // `mp` is defined on listing line 181, which is not shown; presumably
    // it is the front of gmIssuedRequests -- TODO confirm.
    // Loads and atomics count against inflightLoads, stores against
    // inflightStores. When the relevant counter has reached gmQueueSize
    // the function returns before gmIssuedRequests.pop(), so the request
    // stays queued.
182  if (mp->isLoad() || mp->isAtomic()) {
183  if (inflightLoads >= gmQueueSize) {
184  return;
185  } else {
186  ++inflightLoads;
187  }
188  } else if (mp->isStore()) {
189  if (inflightStores >= gmQueueSize) {
190  return;
191  } else {
192  ++inflightStores;
193  }
194  }
195 
196  DPRINTF(GPUCoalescer, "initiateAcc for %s seqNum %d\n",
197  mp->disassemble(), mp->seqNum());
198  mp->initiateAcc(mp);
199 
200  if (mp->isStore() && mp->isGlobalSeg()) {
201  mp->wavefront()->decExpInstsIssued();
202  }
203 
    // Every request except a memory sync that is also end-of-kernel is
    // entered into the ordered response buffer, keyed by sequence number
    // and initially marked not done (false).
204  if (((mp->isMemSync() && !mp->isEndOfKernel()) || !mp->isMemSync())) {
214  gmOrderedRespBuffer.insert(std::make_pair(mp->seqNum(),
215  std::make_pair(mp, false)));
216  }
217 
    // NOTE(review): the body of this branch (listing lines 219-229) is not
    // shown in this listing.
218  if (!mp->isMemSync() && !mp->isEndOfKernel() && mp->allLanesZero()) {
230  }
231 
232  gmIssuedRequests.pop();
233 
234  DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
235  computeUnit.cu_id, mp->simdId, mp->wfSlotId);
236  }
237 }
238 
// GlobalMemPipeline::getNextReadyResp()
// Find the next ready response to service. Only the first entry of
// gmOrderedRespBuffer is examined: its instruction is returned if the entry
// has been marked done, otherwise nullptr is returned. nullptr is also
// returned when the buffer is empty. Later entries are never returned ahead
// of the first one, even if they are already marked done.
// NOTE(review): the return type and signature (listing lines 239-240) are
// not shown in this listing.
241 {
242  if (!gmOrderedRespBuffer.empty()) {
243  auto mem_req = gmOrderedRespBuffer.begin();
244 
    // second.second is the "done" flag; second.first is the instruction.
245  if (mem_req->second.second) {
246  return mem_req->second.first;
247  }
248  }
249 
250  return nullptr;
251 }
252 
// GlobalMemPipeline::completeRequest(GPUDynInstPtr gpuDynInst)
// Once a memory request is finished, release its in-flight slot and remove
// it from the ordered response buffer. Loads and atomics release an
// inflightLoads slot; stores release an inflightStores slot. The request
// must be the first entry of gmOrderedRespBuffer and must already be marked
// done; both conditions are asserted rather than handled.
253 void
255 {
256  if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
257  assert(inflightLoads > 0);
258  --inflightLoads;
259  } else if (gpuDynInst->isStore()) {
260  assert(inflightStores > 0);
261  --inflightStores;
262  }
263 
264  // we should only pop the oldest request, and it
265  // should be marked as done if we are here
266  assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
267  assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
268  assert(gmOrderedRespBuffer.begin()->second.second);
269  // remove this instruction from the buffer by its
270  // unique seq ID
271  gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
272 }
273 
// GlobalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
// Issues a request to the pipeline, i.e. enqueues it in gmIssuedRequests.
// Before enqueueing, the owning wavefront's counters are updated: the
// request moves from the "in pipe" counts to the "outstanding" counts, the
// access time is stamped with the current tick, and the Initiate hop is
// recorded for round-trip profiling.
// NOTE(review): listing line 293 is not shown in this listing.
274 void
276 {
277  Wavefront *wf = gpuDynInst->wavefront();
278  if (gpuDynInst->isLoad()) {
279  wf->rdGmReqsInPipe--;
280  wf->outstandingReqsRdGm++;
281  } else if (gpuDynInst->isStore()) {
282  wf->wrGmReqsInPipe--;
283  wf->outstandingReqsWrGm++;
284  } else {
    // NOTE(review): anything that is neither a load nor a store takes this
    // branch and is counted as both a read and a write.
285  // Atomic, both read and write
286  wf->rdGmReqsInPipe--;
287  wf->outstandingReqsRdGm++;
288  wf->wrGmReqsInPipe--;
289  wf->outstandingReqsWrGm++;
290  }
291 
292  wf->outstandingReqs++;
294 
295  gpuDynInst->setAccessTime(curTick());
296  gpuDynInst->profileRoundTripTime(curTick(), InstMemoryHop::Initiate);
297  gmIssuedRequests.push(gpuDynInst);
298 }
299 
// GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
// Handles a response sent to this pipeline by the CU: looks up the request
// in gmOrderedRespBuffer by sequence number and marks it done. The entry
// is not removed here. A missing entry trips the assert.
300 void
302 {
303  auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
304  // if we are getting a response for this mem request,
305  // then it ought to already be in the ordered response
306  // buffer
307  assert(mem_req != gmOrderedRespBuffer.end());
308  mem_req->second.second = true;
309 }
310 
// GlobalMemPipeline::GlobalMemPipelineStats::GlobalMemPipelineStats(
//     statistics::Group *parent)
// Creates the statistics group named "GlobalMemPipeline" under `parent` and
// registers the loadVrfBankConflictCycles stat with its description.
// NOTE(review): the signature (listing lines 311-312) is not shown in this
// listing.
313  : statistics::Group(parent, "GlobalMemPipeline"),
314  ADD_STAT(loadVrfBankConflictCycles, "total number of cycles GM data "
315  "are delayed before updating the VRF")
316 {
317 }
318 
319 } // namespace gem5
gem5::curTick
Tick curTick()
The universal simulation clock.
Definition: cur_tick.hh:46
gem5::GlobalMemPipeline::issueRequest
void issueRequest(GPUDynInstPtr gpuDynInst)
Issues a request to the pipeline (i.e., enqueue it in the request buffer).
Definition: global_memory_pipeline.cc:275
gem5::MipsISA::w
Bitfield< 0 > w
Definition: pra_constants.hh:281
gem5::Shader::sampleLoad
void sampleLoad(const Tick accessTime)
Definition: shader.cc:460
shader.hh
gem5::GlobalMemPipeline::inflightStores
int inflightStores
Definition: global_memory_pipeline.hh:121
gem5::Shader::globalMemSize
int globalMemSize
Definition: shader.hh:206
global_memory_pipeline.hh
gem5::GlobalMemPipeline::gmIssuedRequests
std::queue< GPUDynInstPtr > gmIssuedRequests
Definition: global_memory_pipeline.hh:144
gem5::Wavefront
Definition: wavefront.hh:60
compute_unit.hh
gem5::Complete
@ Complete
Definition: misc.hh:57
gem5::ArmISA::mp
Bitfield< 11 > mp
Definition: misc_types.hh:769
gem5::Shader::sampleInstRoundTrip
void sampleInstRoundTrip(std::vector< Tick > roundTripTime)
Definition: shader.cc:467
gem5::GlobalMemPipeline::init
void init()
Definition: global_memory_pipeline.cc:57
gem5::ComputeUnit::shader
Shader * shader
Definition: compute_unit.hh:353
gem5::ComputeUnit::cu_id
int cu_id
Definition: compute_unit.hh:292
wavefront.hh
gem5::GlobalMemPipeline::inflightLoads
int inflightLoads
Definition: global_memory_pipeline.hh:122
gem5::ComputeUnit
Definition: compute_unit.hh:201
gem5::Wavefront::validateRequestCounters
void validateRequestCounters()
Definition: wavefront.cc:746
gem5::GlobalMemPipeline::GlobalMemPipelineStats::GlobalMemPipelineStats
GlobalMemPipelineStats(statistics::Group *parent)
Definition: global_memory_pipeline.cc:312
vector_register_file.hh
gem5::Initiate
@ Initiate
Definition: misc.hh:53
gem5::Wavefront::wrGmReqsInPipe
int wrGmReqsInPipe
Definition: wavefront.hh:187
gem5::Wavefront::outstandingReqs
int outstandingReqs
Definition: wavefront.hh:171
DPRINTF
#define DPRINTF(x,...)
Definition: trace.hh:186
ADD_STAT
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
Definition: group.hh:75
gem5::MipsISA::p
Bitfield< 0 > p
Definition: pra_constants.hh:326
gem5::Tick
uint64_t Tick
Tick count type.
Definition: types.hh:58
gpu_dyn_inst.hh
gem5::GlobalMemPipeline::acqCoalescerToken
void acqCoalescerToken(GPUDynInstPtr mp)
Definition: global_memory_pipeline.cc:82
gem5::ComputeUnit::getTokenManager
TokenManager * getTokenManager()
Definition: compute_unit.hh:837
name
const std::string & name()
Definition: trace.cc:49
gem5::GlobalMemPipeline::outstandingReqsCheck
bool outstandingReqsCheck(GPUDynInstPtr mp) const
Definition: global_memory_pipeline.cc:94
gem5::GPUDynInstPtr
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition: misc.hh:49
gem5::Wavefront::rdGmReqsInPipe
int rdGmReqsInPipe
Definition: wavefront.hh:185
gem5::GlobalMemPipeline::maxWaveRequests
int maxWaveRequests
Definition: global_memory_pipeline.hh:116
gem5::Shader::coissue_return
int coissue_return
Definition: shader.hh:197
gem5::WaitClass::set
void set(uint64_t i)
Definition: misc.hh:82
gem5::ArmISA::m
Bitfield< 0 > m
Definition: misc_types.hh:395
gem5::GlobalMemPipeline::coalescerReady
bool coalescerReady(GPUDynInstPtr mp) const
Definition: global_memory_pipeline.cc:63
gem5::GlobalMemPipeline::computeUnit
ComputeUnit & computeUnit
Definition: global_memory_pipeline.hh:113
gem5::Shader::ScheduleAdd
void ScheduleAdd(int *val, Tick when, int x)
Definition: shader.cc:356
gem5::Shader::sampleLineRoundTrip
void sampleLineRoundTrip(const std::map< Addr, std::vector< Tick >> &roundTripTime)
Definition: shader.cc:487
gem5::GlobalMemPipeline::gmOrderedRespBuffer
std::map< uint64_t, std::pair< GPUDynInstPtr, bool > > gmOrderedRespBuffer
Definition: global_memory_pipeline.hh:140
gem5::TokenManager::recvTokens
void recvTokens(int num_tokens)
Increment the number of available tokens by num_tokens.
Definition: token_port.cc:155
gem5::statistics::Group
Statistics container.
Definition: group.hh:93
gem5::Wavefront::outstandingReqsWrGm
int outstandingReqsWrGm
Definition: wavefront.hh:173
gem5::GlobalMemPipeline::completeRequest
void completeRequest(GPUDynInstPtr gpuDynInst)
once a memory request is finished we remove it from the buffer.
Definition: global_memory_pipeline.cc:254
gem5::Wavefront::outstandingReqsRdGm
int outstandingReqsRdGm
Definition: wavefront.hh:177
gem5::GlobalMemPipeline::globalMemSize
int globalMemSize
Definition: global_memory_pipeline.hh:125
gem5
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
Definition: tlb.cc:60
gem5::ComputeUnit::vectorGlobalMemUnit
WaitClass vectorGlobalMemUnit
Definition: compute_unit.hh:225
gem5::GlobalMemPipeline::gmQueueSize
int gmQueueSize
Definition: global_memory_pipeline.hh:115
gem5::Shader::sampleStore
void sampleStore(const Tick accessTime)
Definition: shader.cc:450
gem5::WaitClass::rdy
bool rdy(Cycles cycles=Cycles(0)) const
Definition: misc.hh:93
gem5::GlobalMemPipeline::exec
void exec()
Definition: global_memory_pipeline.cc:107
gem5::GlobalMemPipeline::getNextReadyResp
GPUDynInstPtr getNextReadyResp()
Find the next ready response to service.
Definition: global_memory_pipeline.cc:240
gem5::GlobalMemPipeline::handleResponse
void handleResponse(GPUDynInstPtr gpuDynInst)
This method handles responses sent to this GM pipeline by the CU.
Definition: global_memory_pipeline.cc:301
gem5::GlobalMemPipeline::GlobalMemPipeline
GlobalMemPipeline(const ComputeUnitParams &p, ComputeUnit &cu)
Definition: global_memory_pipeline.cc:47
gem5::ComputeUnit::glbMemToVrfBus
WaitClass glbMemToVrfBus
Definition: compute_unit.hh:221

Generated on Wed May 4 2022 12:13:58 for gem5 by doxygen 1.8.17