gem5  v21.1.0.2
global_memory_pipeline.cc
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#define __STDC_FORMAT_MACROS
#include <cinttypes>
#include "debug/GPUCoalescer.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUReg.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"

namespace gem5
{

GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams &p,
                                     ComputeUnit &cu)
    : computeUnit(cu), _name(cu.name() + ".GlobalMemPipeline"),
      gmQueueSize(p.global_mem_queue_size),
      maxWaveRequests(p.max_wave_requests), inflightStores(0),
      inflightLoads(0), stats(&cu)
{
}

void
GlobalMemPipeline::init()
{
    globalMemSize = computeUnit.shader->globalMemSize;
}

bool
GlobalMemPipeline::coalescerReady(GPUDynInstPtr mp) const
{
    // We require one token from the coalescer's uncoalesced table to
    // proceed
    int token_count = 1;

    // Make sure the vector port has tokens. There is a single pool
    // of tokens so only one port in the vector port needs to be checked.
    // Lane 0 is chosen arbitrarily.
    DPRINTF(GPUCoalescer, "Checking for %d tokens\n", token_count);
    if (!mp->computeUnit()->getTokenManager()->haveTokens(token_count)) {
        DPRINTF(GPUCoalescer, "Stalling inst because coalescer is busy!\n");
        return false;
    }

    return true;
}
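
// Illustrative sketch, not part of the original source: the coalescer's
// uncoalesced table is modelled as a pool of tokens owned by the compute
// unit's TokenManager. Each vector memory instruction is assumed to consume
// one token at issue and to hand one back when recvTokens() is called on
// its behalf (by the response path, or by exec() below for instructions
// that generate no packets):
//
//     if (haveTokens(1)) { acquireTokens(1); /* safe to issue */ }
//     ...
//     getTokenManager()->recvTokens(1);  // token returned on retirement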

void
GlobalMemPipeline::acqCoalescerToken(GPUDynInstPtr mp)
{
    // We require one token from the coalescer's uncoalesced table to
    // proceed
    int token_count = 1;

    DPRINTF(GPUCoalescer, "Acquiring %d token(s)\n", token_count);
    assert(mp->computeUnit()->getTokenManager()->haveTokens(token_count));
    mp->computeUnit()->getTokenManager()->acquireTokens(token_count);
}
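
// Minimal sketch of the assumed caller protocol (the actual call site is in
// the compute unit's issue logic, outside this file): coalescerReady() is
// checked first, and only then is a token acquired and the request handed to
// issueRequest() below, which is why the assert in acqCoalescerToken() holds.
//
//     if (gmPipe.coalescerReady(mp) && gmPipe.outstandingReqsCheck(mp)) {
//         gmPipe.acqCoalescerToken(mp);
//         gmPipe.issueRequest(mp);
//     }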

bool
GlobalMemPipeline::outstandingReqsCheck(GPUDynInstPtr mp) const
{
    // Ensure we haven't exceeded the maximum number of vmem requests
    // for this wavefront
    if ((mp->wavefront()->outstandingReqsRdGm
         + mp->wavefront()->outstandingReqsWrGm) >= maxWaveRequests) {
        return false;
    }

    return true;
}
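
// Worked example with hypothetical numbers: if maxWaveRequests == 4 and the
// wavefront already has outstandingReqsRdGm == 3 and outstandingReqsWrGm == 1,
// then 3 + 1 >= 4 and the check fails, throttling that wavefront until one of
// its global memory requests completes in exec() below.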

void
GlobalMemPipeline::exec()
{
    // apply any returned global memory operations
    GPUDynInstPtr m = getNextReadyResp();

    bool accessVrf = true;
    Wavefront *w = nullptr;

    // check the VRF to see if the operands of a load (or load component
    // of an atomic) are accessible
    if (m && (m->isLoad() || m->isAtomicRet())) {
        w = m->wavefront();

        accessVrf = w->computeUnit->vrf[w->simdId]->
            canScheduleWriteOperandsFromLoad(w, m);
    }

    if (m && m->latency.rdy() && computeUnit.glbMemToVrfBus.rdy() &&
        accessVrf && (computeUnit.shader->coissue_return ||
        computeUnit.vectorGlobalMemUnit.rdy())) {

        w = m->wavefront();

        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n",
                m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
        m->completeAcc(m);
        if (m->isFlat()) {
            w->decLGKMInstsIssued();
        }
        w->decVMemInstsIssued();

        if (m->isLoad() || m->isAtomicRet()) {
            w->computeUnit->vrf[w->simdId]->
                scheduleWriteOperandsFromLoad(w, m);
        }

        completeRequest(m);

        Tick accessTime = curTick() - m->getAccessTime();

        // Decrement outstanding requests count
        computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
        if (m->isStore() || m->isAtomic() || m->isMemSync()) {
            computeUnit.shader->sampleStore(accessTime);
            computeUnit.shader->ScheduleAdd(&w->outstandingReqsWrGm,
                                            m->time, -1);
        }

        if (m->isLoad() || m->isAtomic() || m->isMemSync()) {
            computeUnit.shader->sampleLoad(accessTime);
            computeUnit.shader->ScheduleAdd(&w->outstandingReqsRdGm,
                                            m->time, -1);
        }

        w->validateRequestCounters();

        // Generate stats for round-trip time for vector memory insts
        // going all the way to memory and stats for individual cache
        // blocks generated by the instruction.
        m->profileRoundTripTime(curTick(), InstMemoryHop::Complete);
        computeUnit.shader->sampleInstRoundTrip(m->getRoundTripTime());
        computeUnit.shader->sampleLineRoundTrip(m->getLineAddressTime());

        // Mark write bus busy for appropriate amount of time
        computeUnit.glbMemToVrfBus.set(m->time);
        if (!computeUnit.shader->coissue_return)
            w->computeUnit->vectorGlobalMemUnit.set(m->time);
    }

    // If the execute stage has handed this pipeline a global memory
    // instruction, start its access and issue its memory packets to the DTLB
    if (!gmIssuedRequests.empty()) {
        GPUDynInstPtr mp = gmIssuedRequests.front();
        if (mp->isLoad() || mp->isAtomic()) {
            if (inflightLoads >= gmQueueSize) {
                return;
            } else {
                ++inflightLoads;
            }
        } else if (mp->isStore()) {
            if (inflightStores >= gmQueueSize) {
                return;
            } else {
                ++inflightStores;
            }
        }

        DPRINTF(GPUCoalescer, "initiateAcc for %s seqNum %d\n",
                mp->disassemble(), mp->seqNum());
        mp->initiateAcc(mp);

        if (mp->isStore() && mp->isGlobalSeg()) {
            mp->wavefront()->decExpInstsIssued();
        }

        if (((mp->isMemSync() && !mp->isEndOfKernel()) || !mp->isMemSync())) {
            // Reserve a slot in the ordered response buffer (keyed by
            // seqNum, initially marked not done) for every instruction
            // except kernel-end memory syncs, so that responses can be
            // retired in program order.
            gmOrderedRespBuffer.insert(std::make_pair(mp->seqNum(),
                std::make_pair(mp, false)));
        }

        if (!mp->isMemSync() && !mp->isEndOfKernel() && mp->allLanesZero()) {
            // Instructions with no active lanes generate no memory packets
            // and would never receive a response callback, so mark them
            // complete immediately and return the coalescer token acquired
            // for them.
            handleResponse(mp);
            computeUnit.getTokenManager()->recvTokens(1);
        }

        gmIssuedRequests.pop();

        DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
                computeUnit.cu_id, mp->simdId, mp->wfSlotId);
    }
}
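
// Per-cycle summary of exec() above (a restatement of the code, not extra
// behaviour):
//
//     // completion side
//     m = getNextReadyResp();
//     if (m && the VRF/result bus can accept it) completeAcc(m), completeRequest(m)
//
//     // issue side
//     mp = gmIssuedRequests.front();
//     if (queue space available) initiateAcc(mp), reserve a resp-buffer slot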

GPUDynInstPtr
GlobalMemPipeline::getNextReadyResp()
{
    if (!gmOrderedRespBuffer.empty()) {
        auto mem_req = gmOrderedRespBuffer.begin();

        if (mem_req->second.second) {
            return mem_req->second.first;
        }
    }

    return nullptr;
}
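
// Worked example with hypothetical sequence numbers: if
//     gmOrderedRespBuffer == { 7: (load, done), 9: (store, not done) }
// the load with seqNum 7 is returned; if instead
//     gmOrderedRespBuffer == { 7: (load, not done), 9: (store, done) }
// nullptr is returned, because only the oldest entry may retire. This is how
// responses that arrive out of order are still completed in program order.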

void
GlobalMemPipeline::completeRequest(GPUDynInstPtr gpuDynInst)
{
    if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
        assert(inflightLoads > 0);
        --inflightLoads;
    } else if (gpuDynInst->isStore()) {
        assert(inflightStores > 0);
        --inflightStores;
    }

    // we should only pop the oldest request, and it
    // should be marked as done if we are here
    assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
    assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
    assert(gmOrderedRespBuffer.begin()->second.second);
    // remove this instruction from the buffer by its
    // unique seq ID
    gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
}
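
// Sketch of the assumed life of a request through this pipeline, stitching
// together the functions in this file (no new code path is implied):
//
//     issueRequest(i);   // counters updated, pushed on gmIssuedRequests
//     exec();            // initiateAcc(i), slot reserved in gmOrderedRespBuffer
//     handleResponse(i); // memory system done, "done" flag set
//     exec();            // getNextReadyResp() finds i, completeAcc(i),
//                        // completeRequest(i)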

void
GlobalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    if (gpuDynInst->isLoad()) {
        wf->rdGmReqsInPipe--;
        wf->outstandingReqsRdGm++;
    } else if (gpuDynInst->isStore()) {
        wf->wrGmReqsInPipe--;
        wf->outstandingReqsWrGm++;
    } else {
        // Atomic, both read and write
        wf->rdGmReqsInPipe--;
        wf->outstandingReqsRdGm++;
        wf->wrGmReqsInPipe--;
        wf->outstandingReqsWrGm++;
    }

    wf->outstandingReqs++;
    wf->validateRequestCounters();

    gpuDynInst->setAccessTime(curTick());
    gpuDynInst->profileRoundTripTime(curTick(), InstMemoryHop::Initiate);
    gmIssuedRequests.push(gpuDynInst);
}
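
// Bookkeeping note (an invariant implied by the code above, stated as an
// assumption rather than a guarantee): issueRequest() converts one "in pipe"
// unit of work into one "outstanding" unit, e.g. for a load
//
//     wf->rdGmReqsInPipe--;  wf->outstandingReqsRdGm++;  wf->outstandingReqs++;
//
// and exec() schedules the matching decrements when the load completes, so
// validateRequestCounters() can flag any mismatch between the two sides.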

void
GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
{
    auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
    // if we are getting a response for this mem request,
    // then it ought to already be in the ordered response
    // buffer
    assert(mem_req != gmOrderedRespBuffer.end());
    mem_req->second.second = true;
}
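
// Assumed caller, outside this file: the compute unit's memory response path
// (e.g. something like computeUnit.globalMemoryPipe.handleResponse(gpuDynInst))
// invokes this when the coalesced access finishes. Only the "done" flag is set
// here; the VRF writeback and counter updates are deferred until a later
// exec() finds this entry at the head of gmOrderedRespBuffer.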

GlobalMemPipeline::GlobalMemPipelineStats::GlobalMemPipelineStats(
    statistics::Group *parent)
    : statistics::Group(parent, "GlobalMemPipeline"),
      ADD_STAT(loadVrfBankConflictCycles, "total number of cycles GM data "
               "are delayed before updating the VRF")
{
}

} // namespace gem5