gem5  v20.1.0.0
global_memory_pipeline.cc
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * For use for simulation and test purposes only
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright notice,
11  * this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  * this list of conditions and the following disclaimer in the documentation
15  * and/or other materials provided with the distribution.
16  *
17  * 3. Neither the name of the copyright holder nor the names of its
18  * contributors may be used to endorse or promote products derived from this
19  * software without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
#include "gpu-compute/global_memory_pipeline.hh"

#define __STDC_FORMAT_MACROS
#include <cinttypes>

#include "debug/GPUCoalescer.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUReg.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
45 
46 GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p,
47  ComputeUnit &cu)
48  : computeUnit(cu), _name(cu.name() + ".GlobalMemPipeline"),
49  gmQueueSize(p->global_mem_queue_size),
50  maxWaveRequests(p->max_wave_requests), inflightStores(0),
51  inflightLoads(0)
52 {
53 }
54 
55 void
57 {
59 }
60 
61 bool
63 {
64  // We require one token from the coalescer's uncoalesced table to
65  // proceed
66  int token_count = 1;
67 
68  // Make sure the vector port has tokens. There is a single pool
69  // of tokens so only one port in the vector port needs to be checked.
70  // Lane 0 is chosen arbirarily.
71  DPRINTF(GPUCoalescer, "Checking for %d tokens\n", token_count);
72  if (!mp->computeUnit()->getTokenManager()->haveTokens(token_count)) {
73  DPRINTF(GPUCoalescer, "Stalling inst because coalsr is busy!\n");
74  return false;
75  }
76 
77  return true;
78 }
79 
80 void
82 {
83  // We require one token from the coalescer's uncoalesced table to
84  // proceed
85  int token_count = 1;
86 
87  DPRINTF(GPUCoalescer, "Acquiring %d token(s)\n", token_count);
88  assert(mp->computeUnit()->getTokenManager()->haveTokens(token_count));
89  mp->computeUnit()->getTokenManager()->acquireTokens(token_count);
90 }
91 
92 bool
94 {
95  // Ensure we haven't exceeded the maximum number of vmem requests
96  // for this wavefront
97  if ((mp->wavefront()->outstandingReqsRdGm
98  + mp->wavefront()->outstandingReqsWrGm) >= maxWaveRequests) {
99  return false;
100  }
101 
102  return true;
103 }
104 
105 void
107 {
108  // apply any returned global memory operations
110 
111  bool accessVrf = true;
112  Wavefront *w = nullptr;
113 
114  // check the VRF to see if the operands of a load (or load component
115  // of an atomic) are accessible
116  if (m && (m->isLoad() || m->isAtomicRet())) {
117  w = m->wavefront();
118 
119  accessVrf = w->computeUnit->vrf[w->simdId]->
120  canScheduleWriteOperandsFromLoad(w, m);
121 
122  }
123 
124  if (m && m->latency.rdy() && computeUnit.glbMemToVrfBus.rdy() &&
125  accessVrf && (computeUnit.shader->coissue_return ||
127 
128  w = m->wavefront();
129 
130  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n",
131  m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
132  m->completeAcc(m);
133  w->decVMemInstsIssued();
134 
135  if (m->isLoad() || m->isAtomicRet()) {
136  w->computeUnit->vrf[w->simdId]->
137  scheduleWriteOperandsFromLoad(w, m);
138  }
139 
141 
142  Tick accessTime = curTick() - m->getAccessTime();
143 
144  // Decrement outstanding requests count
145  computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
146  if (m->isStore() || m->isAtomic() || m->isMemSync()) {
147  computeUnit.shader->sampleStore(accessTime);
148  computeUnit.shader->ScheduleAdd(&w->outstandingReqsWrGm,
149  m->time, -1);
150  }
151 
152  if (m->isLoad() || m->isAtomic() || m->isMemSync()) {
153  computeUnit.shader->sampleLoad(accessTime);
154  computeUnit.shader->ScheduleAdd(&w->outstandingReqsRdGm,
155  m->time, -1);
156  }
157 
158  w->validateRequestCounters();
159 
160  // Generate stats for round-trip time for vectory memory insts
161  // going all the way to memory and stats for individual cache
162  // blocks generated by the instruction.
163  m->profileRoundTripTime(curTick(), InstMemoryHop::Complete);
164  computeUnit.shader->sampleInstRoundTrip(m->getRoundTripTime());
165  computeUnit.shader->sampleLineRoundTrip(m->getLineAddressTime());
166 
167  // Mark write bus busy for appropriate amount of time
170  w->computeUnit->vectorGlobalMemUnit.set(m->time);
171  }
172 
173  // If pipeline has executed a global memory instruction
174  // execute global memory packets and issue global
175  // memory packets to DTLB
176  if (!gmIssuedRequests.empty()) {
178  if (mp->isLoad() || mp->isAtomic()) {
179  if (inflightLoads >= gmQueueSize) {
180  return;
181  } else {
182  ++inflightLoads;
183  }
184  } else if (mp->isStore()) {
185  if (inflightStores >= gmQueueSize) {
186  return;
187  } else {
188  ++inflightStores;
189  }
190  }
191 
192  DPRINTF(GPUCoalescer, "initiateAcc for %s seqNum %d\n",
193  mp->disassemble(), mp->seqNum());
194  mp->initiateAcc(mp);
195 
196  if (((mp->isMemSync() && !mp->isEndOfKernel()) || !mp->isMemSync())) {
206  gmOrderedRespBuffer.insert(std::make_pair(mp->seqNum(),
207  std::make_pair(mp, false)));
208  }
209 
210  if (!mp->isMemSync() && !mp->isEndOfKernel() && mp->allLanesZero()) {
222  }
223 
224  gmIssuedRequests.pop();
225 
226  DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
227  computeUnit.cu_id, mp->simdId, mp->wfSlotId);
228  }
229 }
230 
233 {
234  if (!gmOrderedRespBuffer.empty()) {
235  auto mem_req = gmOrderedRespBuffer.begin();
236 
237  if (mem_req->second.second) {
238  return mem_req->second.first;
239  }
240  }
241 
242  return nullptr;
243 }
244 
245 void
247 {
248  if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
249  assert(inflightLoads > 0);
250  --inflightLoads;
251  } else if (gpuDynInst->isStore()) {
252  assert(inflightStores > 0);
253  --inflightStores;
254  }
255 
256  // we should only pop the oldest requst, and it
257  // should be marked as done if we are here
258  assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
259  assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
260  assert(gmOrderedRespBuffer.begin()->second.second);
261  // remove this instruction from the buffer by its
262  // unique seq ID
263  gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
264 }
265 
266 void
268 {
269  gpuDynInst->setAccessTime(curTick());
270  gpuDynInst->profileRoundTripTime(curTick(), InstMemoryHop::Initiate);
271  gmIssuedRequests.push(gpuDynInst);
272 }
273 
274 void
276 {
277  auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
278  // if we are getting a response for this mem request,
279  // then it ought to already be in the ordered response
280  // buffer
281  assert(mem_req != gmOrderedRespBuffer.end());
282  mem_req->second.second = true;
283 }
284 
285 void
287 {
289  .name(name() + ".load_vrf_bank_conflict_cycles")
290  .desc("total number of cycles GM data are delayed before updating "
291  "the VRF")
292  ;
293 }
GlobalMemPipeline::issueRequest
void issueRequest(GPUDynInstPtr gpuDynInst)
Issues a request to the pipeline (i.e., enqueue it in the request buffer).
Definition: global_memory_pipeline.cc:267
GlobalMemPipeline::inflightLoads
int inflightLoads
Definition: global_memory_pipeline.hh:124
TokenManager::recvTokens
void recvTokens(int num_tokens)
Increment the number of available tokens by num_tokens.
Definition: token_port.cc:156
GlobalMemPipeline::acqCoalescerToken
void acqCoalescerToken(GPUDynInstPtr mp)
Definition: global_memory_pipeline.cc:81
shader.hh
global_memory_pipeline.hh
GPUCoalescer
Definition: GPUCoalescer.hh:201
compute_unit.hh
GlobalMemPipeline::gmIssuedRequests
std::queue< GPUDynInstPtr > gmIssuedRequests
Definition: global_memory_pipeline.hh:146
Shader::globalMemSize
int globalMemSize
Definition: shader.hh:231
Tick
uint64_t Tick
Tick count type.
Definition: types.hh:63
Shader::sampleLineRoundTrip
void sampleLineRoundTrip(const std::map< Addr, std::vector< Tick >> &roundTripTime)
Definition: shader.cc:562
ComputeUnit::cu_id
int cu_id
Definition: compute_unit.hh:289
ComputeUnit::vectorGlobalMemUnit
WaitClass vectorGlobalMemUnit
Definition: compute_unit.hh:222
GlobalMemPipeline::exec
void exec()
Definition: global_memory_pipeline.cc:106
Shader::coissue_return
int coissue_return
Definition: shader.hh:222
GlobalMemPipeline::regStats
void regStats()
Definition: global_memory_pipeline.cc:286
wavefront.hh
WaitClass::rdy
bool rdy(Cycles cycles=Cycles(0)) const
Definition: misc.hh:90
GlobalMemPipeline::loadVrfBankConflictCycles
Stats::Scalar loadVrfBankConflictCycles
Definition: global_memory_pipeline.hh:119
ComputeUnit::glbMemToVrfBus
WaitClass glbMemToVrfBus
Definition: compute_unit.hh:218
GlobalMemPipeline::outstandingReqsCheck
bool outstandingReqsCheck(GPUDynInstPtr mp) const
Definition: global_memory_pipeline.cc:93
ComputeUnit
Definition: compute_unit.hh:198
vector_register_file.hh
GlobalMemPipeline::maxWaveRequests
int maxWaveRequests
Definition: global_memory_pipeline.hh:114
GlobalMemPipeline::getNextReadyResp
GPUDynInstPtr getNextReadyResp()
Find the next ready response to service.
Definition: global_memory_pipeline.cc:232
MipsISA::w
Bitfield< 0 > w
Definition: pra_constants.hh:278
DPRINTF
#define DPRINTF(x,...)
Definition: trace.hh:234
GlobalMemPipeline::gmQueueSize
int gmQueueSize
Definition: global_memory_pipeline.hh:113
WaitClass::set
void set(uint64_t i)
Definition: misc.hh:79
Initiate
@ Initiate
Definition: misc.hh:51
GlobalMemPipeline::globalMemSize
int globalMemSize
Definition: global_memory_pipeline.hh:127
gpu_dyn_inst.hh
GlobalMemPipeline::GlobalMemPipeline
GlobalMemPipeline(const ComputeUnitParams *p, ComputeUnit &cu)
Definition: global_memory_pipeline.cc:46
Shader::sampleInstRoundTrip
void sampleInstRoundTrip(std::vector< Tick > roundTripTime)
Definition: shader.cc:542
Stats::DataWrap::name
Derived & name(const std::string &name)
Set the name and marks this stat to print at the end of simulation.
Definition: statistics.hh:274
name
const std::string & name()
Definition: trace.cc:50
GlobalMemPipeline::inflightStores
int inflightStores
Definition: global_memory_pipeline.hh:123
ComputeUnit::getTokenManager
TokenManager * getTokenManager()
Definition: compute_unit.hh:981
Shader::ScheduleAdd
void ScheduleAdd(int *val, Tick when, int x)
Definition: shader.cc:431
Wavefront
Definition: wavefront.hh:57
GPUDynInstPtr
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition: misc.hh:48
GlobalMemPipeline::computeUnit
ComputeUnit & computeUnit
Definition: global_memory_pipeline.hh:111
GlobalMemPipeline::name
const std::string & name() const
Definition: global_memory_pipeline.hh:97
Complete
@ Complete
Definition: misc.hh:55
GlobalMemPipeline::init
void init()
Definition: global_memory_pipeline.cc:56
ArmISA::mp
Bitfield< 11 > mp
Definition: miscregs_types.hh:762
MipsISA::p
Bitfield< 0 > p
Definition: pra_constants.hh:323
GlobalMemPipeline::completeRequest
void completeRequest(GPUDynInstPtr gpuDynInst)
once a memory request is finished we remove it from the buffer.
Definition: global_memory_pipeline.cc:246
GlobalMemPipeline::handleResponse
void handleResponse(GPUDynInstPtr gpuDynInst)
This method handles responses sent to this GM pipeline by the CU.
Definition: global_memory_pipeline.cc:275
GlobalMemPipeline::coalescerReady
bool coalescerReady(GPUDynInstPtr mp) const
Definition: global_memory_pipeline.cc:62
Stats::DataWrap::desc
Derived & desc(const std::string &_desc)
Set the description and marks this stat to print at the end of simulation.
Definition: statistics.hh:307
Shader::sampleLoad
void sampleLoad(const Tick accessTime)
Definition: shader.cc:535
GlobalMemPipeline::gmOrderedRespBuffer
std::map< uint64_t, std::pair< GPUDynInstPtr, bool > > gmOrderedRespBuffer
Definition: global_memory_pipeline.hh:142
Shader::sampleStore
void sampleStore(const Tick accessTime)
Definition: shader.cc:525
ComputeUnit::shader
Shader * shader
Definition: compute_unit.hh:356
ArmISA::m
Bitfield< 0 > m
Definition: miscregs_types.hh:389
curTick
Tick curTick()
The current simulated tick.
Definition: core.hh:45

Generated on Wed Sep 30 2020 14:02:12 for gem5 by doxygen 1.8.17