gem5  [DEVELOP-FOR-23.0]
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
global_memory_pipeline.cc
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from this
17  * software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #define __STDC_FORMAT_MACROS
33 #include <cinttypes>
34 #include "debug/GPUCoalescer.hh"
35 #include "debug/GPUMem.hh"
36 #include "debug/GPUReg.hh"
40 #include "gpu-compute/shader.hh"
42 #include "gpu-compute/wavefront.hh"
43 
44 namespace gem5
45 {
46 
/**
 * Construct the global memory pipeline owned by compute unit cu.
 *
 * The pipeline name is the CU name with ".GlobalMemPipeline" appended.
 * The queue size and the per-wavefront request limit are copied from the
 * compute unit parameters, and both in-flight counters start at zero.
 *
 * @param p  Compute unit parameters; global_mem_queue_size and
 *           max_wave_requests are read here.
 * @param cu Owning compute unit; its address is also handed to the stats
 *           group as the parent.
 */
47 GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams &p,
48  ComputeUnit &cu)
49  : computeUnit(cu), _name(cu.name() + ".GlobalMemPipeline"),
50  gmQueueSize(p.global_mem_queue_size),
51  maxWaveRequests(p.max_wave_requests), inflightStores(0),
52  inflightLoads(0), stats(&cu)
53 {
54 }
55 
// GlobalMemPipeline::init() -- name taken from the cross-reference index,
// which places this definition at listing line 57.
// NOTE(review): the signature (listing line 57) and the body statement
// (listing line 59) are absent from this listing; consult the original
// source before relying on this block.
56 void
58 {
60 }
61 
/**
 * GlobalMemPipeline::coalescerReady(GPUDynInstPtr mp) const -- name and
 * signature taken from the cross-reference index; the signature line
 * itself (listing line 63) is absent from this listing.
 *
 * Queries the token manager of mp's compute unit for the single token
 * this instruction needs. This function only checks availability; it
 * does not acquire the token.
 *
 * @param mp The memory instruction about to be issued; asserted not to be
 *           a system request.
 * @return true if the token is available, false if the instruction has
 *         to stall.
 */
62 bool
64 {
65  // System requests do not need GPU coalescer tokens. Make sure nothing
66  // has bypassed the operand gather check stage.
67  assert(!mp->isSystemReq());
68 
69  // We require one token from the coalescer's uncoalesced table to
70  // proceed
71  int token_count = 1;
72 
73  // Make sure the vector port has tokens. There is a single pool
74  // of tokens so only one port in the vector port needs to be checked.
75  // Lane 0 is chosen arbitrarily.
76  DPRINTF(GPUCoalescer, "Checking for %d tokens\n", token_count);
77  if (!mp->computeUnit()->getTokenManager()->haveTokens(token_count)) {
78  DPRINTF(GPUCoalescer, "Stalling inst because coalsr is busy!\n");
79  return false;
80  }
81 
82  return true;
83 }
84 
/**
 * GlobalMemPipeline::acqCoalescerToken(GPUDynInstPtr mp) -- name and
 * signature taken from the cross-reference index; the signature line
 * itself (listing line 86) is absent from this listing.
 *
 * Acquires one token from the token manager of mp's compute unit. The
 * token must already be available: this is asserted rather than handled.
 *
 * @param mp The memory instruction the token is acquired for.
 */
85 void
87 {
88  // We require one token from the coalescer's uncoalesced table to
89  // proceed
90  int token_count = 1;
91 
92  DPRINTF(GPUCoalescer, "Acquiring %d token(s)\n", token_count);
93  assert(mp->computeUnit()->getTokenManager()->haveTokens(token_count));
94  mp->computeUnit()->getTokenManager()->acquireTokens(token_count);
95 }
96 
/**
 * GlobalMemPipeline::outstandingReqsCheck(GPUDynInstPtr mp) const -- name
 * and signature taken from the cross-reference index; the signature line
 * itself (listing line 98) is absent from this listing.
 *
 * @param mp The memory instruction whose wavefront is checked.
 * @return false once the wavefront's outstanding global-memory reads plus
 *         writes have reached maxWaveRequests, true otherwise.
 */
97 bool
99 {
100  // Ensure we haven't exceeded the maximum number of vmem requests
101  // for this wavefront
102  if ((mp->wavefront()->outstandingReqsRdGm
103  + mp->wavefront()->outstandingReqsWrGm) >= maxWaveRequests) {
104  return false;
105  }
106 
107  return true;
108 }
109 
/**
 * GlobalMemPipeline::exec() -- name taken from the cross-reference index;
 * the signature line (listing line 111) is absent from this listing.
 *
 * NOTE(review): several statements are missing from this listing,
 * including the definitions of `m` and `mp`, the tail of the completion
 * condition, and the bodies around listing lines 209-217 and 223-233.
 * The description below covers only what is visible.
 *
 * Two phases are visible:
 *  1. Completion: when a returned instruction `m` is ready and the
 *     global-memory-to-VRF bus is free, completeAcc() is called, the
 *     wavefront's issued and outstanding counters are decremented, VRF
 *     writes are scheduled for loads and returning atomics, and latency
 *     statistics are sampled.
 *  2. Issue: if gmIssuedRequests is not empty, the request `mp` is
 *     admitted only while the matching in-flight counter is below
 *     gmQueueSize; initiateAcc() is then called, the request is recorded
 *     in gmOrderedRespBuffer (unless it is an end-of-kernel memory sync),
 *     and it is popped from gmIssuedRequests.
 */
110 void
112 {
113  // apply any returned global memory operations
115 
116  bool accessVrf = true;
117  Wavefront *w = nullptr;
118 
119  // check the VRF to see if the operands of a load (or load component
120  // of an atomic) are accessible
121  if (m && (m->isLoad() || m->isAtomicRet())) {
122  w = m->wavefront();
123 
124  accessVrf = w->computeUnit->vrf[w->simdId]->
125  canScheduleWriteOperandsFromLoad(w, m);
126 
127  }
128 
129  if (m && m->latency.rdy() && computeUnit.glbMemToVrfBus.rdy() &&
130  accessVrf && (computeUnit.shader->coissue_return ||
132 
133  w = m->wavefront();
134 
135  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n",
136  m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
137  m->completeAcc(m);
138  if (m->isFlat()) {
139  w->decLGKMInstsIssued();
140  }
141  w->decVMemInstsIssued();
142 
143  if (m->isLoad() || m->isAtomicRet()) {
144  w->computeUnit->vrf[w->simdId]->
145  scheduleWriteOperandsFromLoad(w, m);
146  }
147 
149 
150  Tick accessTime = curTick() - m->getAccessTime();
151 
152  // Decrement outstanding requests count
153  computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
154  if (m->isStore() || m->isAtomic() || m->isMemSync()) {
155  computeUnit.shader->sampleStore(accessTime);
156  computeUnit.shader->ScheduleAdd(&w->outstandingReqsWrGm,
157  m->time, -1);
158  }
159 
160  if (m->isLoad() || m->isAtomic() || m->isMemSync()) {
161  computeUnit.shader->sampleLoad(accessTime);
162  computeUnit.shader->ScheduleAdd(&w->outstandingReqsRdGm,
163  m->time, -1);
164  }
165 
166  w->validateRequestCounters();
167 
168  // Generate stats for round-trip time for vector memory insts
169  // going all the way to memory and stats for individual cache
170  // blocks generated by the instruction.
171  m->profileRoundTripTime(curTick(), InstMemoryHop::Complete);
172  computeUnit.shader->sampleInstRoundTrip(m->getRoundTripTime());
173  computeUnit.shader->sampleLineRoundTrip(m->getLineAddressTime());
174 
175  // Mark write bus busy for appropriate amount of time
178  w->computeUnit->vectorGlobalMemUnit.set(m->time);
179  }
180 
181  // If pipeline has executed a global memory instruction
182  // execute global memory packets and issue global
183  // memory packets to DTLB
184  if (!gmIssuedRequests.empty()) {
186  if (mp->isLoad() || mp->isAtomic()) {
187  if (inflightLoads >= gmQueueSize) {
 // load queue is full: return without popping, so the request
 // stays at the head of gmIssuedRequests
188  return;
189  } else {
190  ++inflightLoads;
191  }
192  } else if (mp->isStore()) {
193  if (inflightStores >= gmQueueSize) {
 // store queue is full: return without popping, so the request
 // stays at the head of gmIssuedRequests
194  return;
195  } else {
196  ++inflightStores;
197  }
198  }
199 
200  DPRINTF(GPUCoalescer, "initiateAcc for %s seqNum %d\n",
201  mp->disassemble(), mp->seqNum());
202  mp->initiateAcc(mp);
203 
204  if (mp->isStore() && mp->isGlobalSeg()) {
205  mp->wavefront()->decExpInstsIssued();
206  }
207 
 // Everything except an end-of-kernel memory sync is tracked in the
 // ordered response buffer, keyed by sequence number and initially
 // marked as not yet done.
208  if (((mp->isMemSync() && !mp->isEndOfKernel()) || !mp->isMemSync())) {
218  gmOrderedRespBuffer.insert(std::make_pair(mp->seqNum(),
219  std::make_pair(mp, false)));
220  }
221 
222  if (!mp->isMemSync() && !mp->isEndOfKernel() && mp->allLanesZero()) {
234  }
235 
236  gmIssuedRequests.pop();
237 
238  DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
239  computeUnit.cu_id, mp->simdId, mp->wfSlotId);
240  }
241 }
242 
// GPUDynInstPtr GlobalMemPipeline::getNextReadyResp() -- name and return
// type taken from the cross-reference index; the return-type and signature
// lines (listing lines 243-244) are absent from this listing.
//
// Looks only at the first entry of gmOrderedRespBuffer. That entry is
// returned if its done flag is set; if the buffer is empty or the first
// entry is not yet done, nullptr is returned. Later entries are never
// returned ahead of the first one.
245 {
246  if (!gmOrderedRespBuffer.empty()) {
247  auto mem_req = gmOrderedRespBuffer.begin();
248 
 // second.second is the done flag, second.first the instruction
249  if (mem_req->second.second) {
250  return mem_req->second.first;
251  }
252  }
253 
254  return nullptr;
255 }
256 
/**
 * GlobalMemPipeline::completeRequest(GPUDynInstPtr gpuDynInst) -- name and
 * signature taken from the cross-reference index; the signature line
 * itself (listing line 258) is absent from this listing.
 *
 * Releases the in-flight slot held by the instruction (loads and atomics
 * count against inflightLoads, stores against inflightStores) and removes
 * its entry from gmOrderedRespBuffer.
 *
 * @param gpuDynInst The finished instruction; asserted to be the first
 *                   entry of the ordered response buffer and marked done.
 */
257 void
259 {
260  if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
261  assert(inflightLoads > 0);
262  --inflightLoads;
263  } else if (gpuDynInst->isStore()) {
264  assert(inflightStores > 0);
265  --inflightStores;
266  }
267 
268  // we should only pop the oldest request, and it
269  // should be marked as done if we are here
270  assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
271  assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
272  assert(gmOrderedRespBuffer.begin()->second.second);
273  // remove this instruction from the buffer by its
274  // unique seq ID
275  gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
276 }
277 
/**
 * GlobalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst) -- name and
 * signature taken from the cross-reference index; the signature line
 * itself (listing line 279) is absent from this listing.
 *
 * Moves the instruction's accounting on its wavefront from "in pipe" to
 * "outstanding", stamps the access time, and appends the instruction to
 * gmIssuedRequests.
 *
 * NOTE(review): listing line 297 is also absent, so one statement between
 * the counter update and setAccessTime() is not shown here.
 *
 * @param gpuDynInst The instruction to enqueue.
 */
278 void
280 {
281  Wavefront *wf = gpuDynInst->wavefront();
282  if (gpuDynInst->isLoad()) {
283  wf->rdGmReqsInPipe--;
284  wf->outstandingReqsRdGm++;
285  } else if (gpuDynInst->isStore()) {
286  wf->wrGmReqsInPipe--;
287  wf->outstandingReqsWrGm++;
288  } else {
 // anything that is neither a load nor a store takes this branch
289  // Atomic, both read and write
290  wf->rdGmReqsInPipe--;
291  wf->outstandingReqsRdGm++;
292  wf->wrGmReqsInPipe--;
293  wf->outstandingReqsWrGm++;
294  }
295 
296  wf->outstandingReqs++;
298 
299  gpuDynInst->setAccessTime(curTick());
300  gpuDynInst->profileRoundTripTime(curTick(), InstMemoryHop::Initiate);
301  gmIssuedRequests.push(gpuDynInst);
302 }
303 
/**
 * GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst) -- name and
 * signature taken from the cross-reference index; the signature line
 * itself (listing line 305) is absent from this listing.
 *
 * Looks the instruction up in gmOrderedRespBuffer by sequence number and
 * sets its done flag. The entry is not removed here.
 *
 * @param gpuDynInst The instruction whose response arrived; asserted to
 *                   already be present in the ordered response buffer.
 */
304 void
306 {
307  auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
308  // if we are getting a response for this mem request,
309  // then it ought to already be in the ordered response
310  // buffer
311  assert(mem_req != gmOrderedRespBuffer.end());
312  mem_req->second.second = true;
313 }
314 
 // GlobalMemPipelineStats(statistics::Group *parent) -- name and signature
 // taken from the cross-reference index; the signature line itself
 // (listing line 316) is absent from this listing.
 // Creates the "GlobalMemPipeline" stats group under parent and adds the
 // loadVrfBankConflictCycles stat to it.
317  : statistics::Group(parent, "GlobalMemPipeline"),
318  ADD_STAT(loadVrfBankConflictCycles, "total number of cycles GM data "
319  "are delayed before updating the VRF")
320 {
321 }
322 
323 } // namespace gem5
gem5::curTick
Tick curTick()
The universal simulation clock.
Definition: cur_tick.hh:46
gem5::GlobalMemPipeline::issueRequest
void issueRequest(GPUDynInstPtr gpuDynInst)
Issues a request to the pipeline (i.e., enqueue it in the request buffer).
Definition: global_memory_pipeline.cc:279
gem5::VegaISA::m
m
Definition: pagetable.hh:52
gem5::Shader::sampleLoad
void sampleLoad(const Tick accessTime)
Definition: shader.cc:461
shader.hh
gem5::GlobalMemPipeline::inflightStores
int inflightStores
Definition: global_memory_pipeline.hh:121
gem5::Shader::globalMemSize
int globalMemSize
Definition: shader.hh:238
global_memory_pipeline.hh
gem5::GlobalMemPipeline::gmIssuedRequests
std::queue< GPUDynInstPtr > gmIssuedRequests
Definition: global_memory_pipeline.hh:144
gem5::Wavefront
Definition: wavefront.hh:60
compute_unit.hh
gem5::Complete
@ Complete
Definition: misc.hh:57
gem5::ArmISA::mp
Bitfield< 11 > mp
Definition: misc_types.hh:910
gem5::VegaISA::w
Bitfield< 6 > w
Definition: pagetable.hh:59
gem5::Shader::sampleInstRoundTrip
void sampleInstRoundTrip(std::vector< Tick > roundTripTime)
Definition: shader.cc:468
gem5::GlobalMemPipeline::init
void init()
Definition: global_memory_pipeline.cc:57
gem5::ComputeUnit::shader
Shader * shader
Definition: compute_unit.hh:353
gem5::ComputeUnit::cu_id
int cu_id
Definition: compute_unit.hh:292
wavefront.hh
gem5::GlobalMemPipeline::inflightLoads
int inflightLoads
Definition: global_memory_pipeline.hh:122
gem5::ComputeUnit
Definition: compute_unit.hh:201
gem5::Wavefront::validateRequestCounters
void validateRequestCounters()
Definition: wavefront.cc:778
gem5::GlobalMemPipeline::GlobalMemPipelineStats::GlobalMemPipelineStats
GlobalMemPipelineStats(statistics::Group *parent)
Definition: global_memory_pipeline.cc:316
vector_register_file.hh
gem5::Initiate
@ Initiate
Definition: misc.hh:53
gem5::Wavefront::wrGmReqsInPipe
int wrGmReqsInPipe
Definition: wavefront.hh:187
gem5::VegaISA::p
Bitfield< 54 > p
Definition: pagetable.hh:70
gem5::Wavefront::outstandingReqs
int outstandingReqs
Definition: wavefront.hh:171
DPRINTF
#define DPRINTF(x,...)
Definition: trace.hh:210
ADD_STAT
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
Definition: group.hh:75
gem5::Tick
uint64_t Tick
Tick count type.
Definition: types.hh:58
gpu_dyn_inst.hh
gem5::GlobalMemPipeline::acqCoalescerToken
void acqCoalescerToken(GPUDynInstPtr mp)
Definition: global_memory_pipeline.cc:86
gem5::ComputeUnit::getTokenManager
TokenManager * getTokenManager()
Definition: compute_unit.hh:890
name
const std::string & name()
Definition: trace.cc:48
gem5::GlobalMemPipeline::outstandingReqsCheck
bool outstandingReqsCheck(GPUDynInstPtr mp) const
Definition: global_memory_pipeline.cc:98
gem5::GPUDynInstPtr
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition: misc.hh:49
gem5::Wavefront::rdGmReqsInPipe
int rdGmReqsInPipe
Definition: wavefront.hh:185
gem5::GlobalMemPipeline::maxWaveRequests
int maxWaveRequests
Definition: global_memory_pipeline.hh:116
gem5::Shader::coissue_return
int coissue_return
Definition: shader.hh:229
gem5::WaitClass::set
void set(uint64_t i)
Definition: misc.hh:82
gem5::GlobalMemPipeline::coalescerReady
bool coalescerReady(GPUDynInstPtr mp) const
Definition: global_memory_pipeline.cc:63
gem5::GlobalMemPipeline::computeUnit
ComputeUnit & computeUnit
Definition: global_memory_pipeline.hh:113
gem5::Shader::ScheduleAdd
void ScheduleAdd(int *val, Tick when, int x)
Definition: shader.cc:357
gem5::Shader::sampleLineRoundTrip
void sampleLineRoundTrip(const std::map< Addr, std::vector< Tick >> &roundTripTime)
Definition: shader.cc:488
gem5::GlobalMemPipeline::gmOrderedRespBuffer
std::map< uint64_t, std::pair< GPUDynInstPtr, bool > > gmOrderedRespBuffer
Definition: global_memory_pipeline.hh:140
gem5::TokenManager::recvTokens
void recvTokens(int num_tokens)
Increment the number of available tokens by num_tokens.
Definition: token_port.cc:155
gem5::statistics::Group
Statistics container.
Definition: group.hh:92
gem5::Wavefront::outstandingReqsWrGm
int outstandingReqsWrGm
Definition: wavefront.hh:173
gem5::GlobalMemPipeline::completeRequest
void completeRequest(GPUDynInstPtr gpuDynInst)
once a memory request is finished we remove it from the buffer.
Definition: global_memory_pipeline.cc:258
gem5::Wavefront::outstandingReqsRdGm
int outstandingReqsRdGm
Definition: wavefront.hh:177
gem5::GlobalMemPipeline::globalMemSize
int globalMemSize
Definition: global_memory_pipeline.hh:125
gem5
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
Definition: gpu_translation_state.hh:37
gem5::ComputeUnit::vectorGlobalMemUnit
WaitClass vectorGlobalMemUnit
Definition: compute_unit.hh:225
gem5::GlobalMemPipeline::gmQueueSize
int gmQueueSize
Definition: global_memory_pipeline.hh:115
gem5::Shader::sampleStore
void sampleStore(const Tick accessTime)
Definition: shader.cc:451
gem5::WaitClass::rdy
bool rdy(Cycles cycles=Cycles(0)) const
Definition: misc.hh:93
gem5::GlobalMemPipeline::exec
void exec()
Definition: global_memory_pipeline.cc:111
gem5::GlobalMemPipeline::getNextReadyResp
GPUDynInstPtr getNextReadyResp()
Find the next ready response to service.
Definition: global_memory_pipeline.cc:244
gem5::GlobalMemPipeline::handleResponse
void handleResponse(GPUDynInstPtr gpuDynInst)
This method handles responses sent to this GM pipeline by the CU.
Definition: global_memory_pipeline.cc:305
gem5::GlobalMemPipeline::GlobalMemPipeline
GlobalMemPipeline(const ComputeUnitParams &p, ComputeUnit &cu)
Definition: global_memory_pipeline.cc:47
gem5::ComputeUnit::glbMemToVrfBus
WaitClass glbMemToVrfBus
Definition: compute_unit.hh:221

Generated on Sun Jul 30 2023 01:56:57 for gem5 by doxygen 1.8.17