gem5  v21.0.1.0
global_memory_pipeline.cc
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * For use for simulation and test purposes only
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright notice,
11  * this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  * this list of conditions and the following disclaimer in the documentation
15  * and/or other materials provided with the distribution.
16  *
17  * 3. Neither the name of the copyright holder nor the names of its
18  * contributors may be used to endorse or promote products derived from this
19  * software without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
#define __STDC_FORMAT_MACROS
#include <cinttypes>

#include "debug/GPUCoalescer.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUReg.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
45 
46 GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams &p,
47  ComputeUnit &cu)
48  : computeUnit(cu), _name(cu.name() + ".GlobalMemPipeline"),
49  gmQueueSize(p.global_mem_queue_size),
50  maxWaveRequests(p.max_wave_requests), inflightStores(0),
51  inflightLoads(0), stats(&cu)
52 {
53 }
54 
55 void
57 {
59 }
60 
61 bool
63 {
64  // We require one token from the coalescer's uncoalesced table to
65  // proceed
66  int token_count = 1;
67 
68  // Make sure the vector port has tokens. There is a single pool
69  // of tokens so only one port in the vector port needs to be checked.
70  // Lane 0 is chosen arbirarily.
71  DPRINTF(GPUCoalescer, "Checking for %d tokens\n", token_count);
72  if (!mp->computeUnit()->getTokenManager()->haveTokens(token_count)) {
73  DPRINTF(GPUCoalescer, "Stalling inst because coalsr is busy!\n");
74  return false;
75  }
76 
77  return true;
78 }
79 
80 void
82 {
83  // We require one token from the coalescer's uncoalesced table to
84  // proceed
85  int token_count = 1;
86 
87  DPRINTF(GPUCoalescer, "Acquiring %d token(s)\n", token_count);
88  assert(mp->computeUnit()->getTokenManager()->haveTokens(token_count));
89  mp->computeUnit()->getTokenManager()->acquireTokens(token_count);
90 }
91 
92 bool
94 {
95  // Ensure we haven't exceeded the maximum number of vmem requests
96  // for this wavefront
97  if ((mp->wavefront()->outstandingReqsRdGm
98  + mp->wavefront()->outstandingReqsWrGm) >= maxWaveRequests) {
99  return false;
100  }
101 
102  return true;
103 }
104 
105 void
107 {
108  // apply any returned global memory operations
110 
111  bool accessVrf = true;
112  Wavefront *w = nullptr;
113 
114  // check the VRF to see if the operands of a load (or load component
115  // of an atomic) are accessible
116  if (m && (m->isLoad() || m->isAtomicRet())) {
117  w = m->wavefront();
118 
119  accessVrf = w->computeUnit->vrf[w->simdId]->
120  canScheduleWriteOperandsFromLoad(w, m);
121 
122  }
123 
124  if (m && m->latency.rdy() && computeUnit.glbMemToVrfBus.rdy() &&
125  accessVrf && (computeUnit.shader->coissue_return ||
127 
128  w = m->wavefront();
129 
130  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n",
131  m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
132  m->completeAcc(m);
133  if (m->isFlat()) {
134  w->decLGKMInstsIssued();
135  }
136  w->decVMemInstsIssued();
137 
138  if (m->isLoad() || m->isAtomicRet()) {
139  w->computeUnit->vrf[w->simdId]->
140  scheduleWriteOperandsFromLoad(w, m);
141  }
142 
144 
145  Tick accessTime = curTick() - m->getAccessTime();
146 
147  // Decrement outstanding requests count
148  computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
149  if (m->isStore() || m->isAtomic() || m->isMemSync()) {
150  computeUnit.shader->sampleStore(accessTime);
151  computeUnit.shader->ScheduleAdd(&w->outstandingReqsWrGm,
152  m->time, -1);
153  }
154 
155  if (m->isLoad() || m->isAtomic() || m->isMemSync()) {
156  computeUnit.shader->sampleLoad(accessTime);
157  computeUnit.shader->ScheduleAdd(&w->outstandingReqsRdGm,
158  m->time, -1);
159  }
160 
161  w->validateRequestCounters();
162 
163  // Generate stats for round-trip time for vectory memory insts
164  // going all the way to memory and stats for individual cache
165  // blocks generated by the instruction.
166  m->profileRoundTripTime(curTick(), InstMemoryHop::Complete);
167  computeUnit.shader->sampleInstRoundTrip(m->getRoundTripTime());
168  computeUnit.shader->sampleLineRoundTrip(m->getLineAddressTime());
169 
170  // Mark write bus busy for appropriate amount of time
173  w->computeUnit->vectorGlobalMemUnit.set(m->time);
174  }
175 
176  // If pipeline has executed a global memory instruction
177  // execute global memory packets and issue global
178  // memory packets to DTLB
179  if (!gmIssuedRequests.empty()) {
181  if (mp->isLoad() || mp->isAtomic()) {
182  if (inflightLoads >= gmQueueSize) {
183  return;
184  } else {
185  ++inflightLoads;
186  }
187  } else if (mp->isStore()) {
188  if (inflightStores >= gmQueueSize) {
189  return;
190  } else {
191  ++inflightStores;
192  }
193  }
194 
195  DPRINTF(GPUCoalescer, "initiateAcc for %s seqNum %d\n",
196  mp->disassemble(), mp->seqNum());
197  mp->initiateAcc(mp);
198 
199  if (mp->isStore() && mp->isGlobalSeg()) {
200  mp->wavefront()->decExpInstsIssued();
201  }
202 
203  if (((mp->isMemSync() && !mp->isEndOfKernel()) || !mp->isMemSync())) {
213  gmOrderedRespBuffer.insert(std::make_pair(mp->seqNum(),
214  std::make_pair(mp, false)));
215  }
216 
217  if (!mp->isMemSync() && !mp->isEndOfKernel() && mp->allLanesZero()) {
229  }
230 
231  gmIssuedRequests.pop();
232 
233  DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
234  computeUnit.cu_id, mp->simdId, mp->wfSlotId);
235  }
236 }
237 
240 {
241  if (!gmOrderedRespBuffer.empty()) {
242  auto mem_req = gmOrderedRespBuffer.begin();
243 
244  if (mem_req->second.second) {
245  return mem_req->second.first;
246  }
247  }
248 
249  return nullptr;
250 }
251 
252 void
254 {
255  if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
256  assert(inflightLoads > 0);
257  --inflightLoads;
258  } else if (gpuDynInst->isStore()) {
259  assert(inflightStores > 0);
260  --inflightStores;
261  }
262 
263  // we should only pop the oldest requst, and it
264  // should be marked as done if we are here
265  assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
266  assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
267  assert(gmOrderedRespBuffer.begin()->second.second);
268  // remove this instruction from the buffer by its
269  // unique seq ID
270  gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
271 }
272 
273 void
275 {
276  gpuDynInst->setAccessTime(curTick());
277  gpuDynInst->profileRoundTripTime(curTick(), InstMemoryHop::Initiate);
278  gmIssuedRequests.push(gpuDynInst);
279 }
280 
281 void
283 {
284  auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
285  // if we are getting a response for this mem request,
286  // then it ought to already be in the ordered response
287  // buffer
288  assert(mem_req != gmOrderedRespBuffer.end());
289  mem_req->second.second = true;
290 }
291 
294  : Stats::Group(parent, "GlobalMemPipeline"),
295  ADD_STAT(loadVrfBankConflictCycles, "total number of cycles GM data "
296  "are delayed before updating the VRF")
297 {
298 }
GlobalMemPipeline::issueRequest
void issueRequest(GPUDynInstPtr gpuDynInst)
Issues a request to the pipeline (i.e., enqueue it in the request buffer).
Definition: global_memory_pipeline.cc:274
GlobalMemPipeline::inflightLoads
int inflightLoads
Definition: global_memory_pipeline.hh:121
TokenManager::recvTokens
void recvTokens(int num_tokens)
Increment the number of available tokens by num_tokens.
Definition: token_port.cc:154
GlobalMemPipeline::acqCoalescerToken
void acqCoalescerToken(GPUDynInstPtr mp)
Definition: global_memory_pipeline.cc:81
shader.hh
global_memory_pipeline.hh
GPUCoalescer
Definition: GPUCoalescer.hh:209
compute_unit.hh
GlobalMemPipeline::gmIssuedRequests
std::queue< GPUDynInstPtr > gmIssuedRequests
Definition: global_memory_pipeline.hh:143
Shader::globalMemSize
int globalMemSize
Definition: shader.hh:211
Tick
uint64_t Tick
Tick count type.
Definition: types.hh:59
Shader::sampleLineRoundTrip
void sampleLineRoundTrip(const std::map< Addr, std::vector< Tick >> &roundTripTime)
Definition: shader.cc:487
ComputeUnit::cu_id
int cu_id
Definition: compute_unit.hh:291
ComputeUnit::vectorGlobalMemUnit
WaitClass vectorGlobalMemUnit
Definition: compute_unit.hh:224
GlobalMemPipeline::exec
void exec()
Definition: global_memory_pipeline.cc:106
Shader::coissue_return
int coissue_return
Definition: shader.hh:202
wavefront.hh
WaitClass::rdy
bool rdy(Cycles cycles=Cycles(0)) const
Definition: misc.hh:90
ComputeUnit::glbMemToVrfBus
WaitClass glbMemToVrfBus
Definition: compute_unit.hh:220
GlobalMemPipeline::outstandingReqsCheck
bool outstandingReqsCheck(GPUDynInstPtr mp) const
Definition: global_memory_pipeline.cc:93
ComputeUnit
Definition: compute_unit.hh:200
vector_register_file.hh
GlobalMemPipeline::maxWaveRequests
int maxWaveRequests
Definition: global_memory_pipeline.hh:115
GlobalMemPipeline::getNextReadyResp
GPUDynInstPtr getNextReadyResp()
Find the next ready response to service.
Definition: global_memory_pipeline.cc:239
MipsISA::w
Bitfield< 0 > w
Definition: pra_constants.hh:278
DPRINTF
#define DPRINTF(x,...)
Definition: trace.hh:237
ADD_STAT
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
Definition: group.hh:71
GlobalMemPipeline::gmQueueSize
int gmQueueSize
Definition: global_memory_pipeline.hh:114
WaitClass::set
void set(uint64_t i)
Definition: misc.hh:79
Initiate
@ Initiate
Definition: misc.hh:51
GlobalMemPipeline::globalMemSize
int globalMemSize
Definition: global_memory_pipeline.hh:124
gpu_dyn_inst.hh
Shader::sampleInstRoundTrip
void sampleInstRoundTrip(std::vector< Tick > roundTripTime)
Definition: shader.cc:467
name
const std::string & name()
Definition: trace.cc:48
GlobalMemPipeline::inflightStores
int inflightStores
Definition: global_memory_pipeline.hh:120
GlobalMemPipeline::GlobalMemPipeline
GlobalMemPipeline(const ComputeUnitParams &p, ComputeUnit &cu)
Definition: global_memory_pipeline.cc:46
ComputeUnit::getTokenManager
TokenManager * getTokenManager()
Definition: compute_unit.hh:836
GlobalMemPipeline::GlobalMemPipelineStats::GlobalMemPipelineStats
GlobalMemPipelineStats(Stats::Group *parent)
Definition: global_memory_pipeline.cc:293
Shader::ScheduleAdd
void ScheduleAdd(int *val, Tick when, int x)
Definition: shader.cc:356
Wavefront
Definition: wavefront.hh:59
Stats::Group
Statistics container.
Definition: group.hh:87
GPUDynInstPtr
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition: misc.hh:48
GlobalMemPipeline::computeUnit
ComputeUnit & computeUnit
Definition: global_memory_pipeline.hh:112
Stats
Definition: statistics.cc:53
Complete
@ Complete
Definition: misc.hh:55
curTick
Tick curTick()
The universal simulation clock.
Definition: cur_tick.hh:43
GlobalMemPipeline::init
void init()
Definition: global_memory_pipeline.cc:56
ArmISA::mp
Bitfield< 11 > mp
Definition: miscregs_types.hh:762
MipsISA::p
Bitfield< 0 > p
Definition: pra_constants.hh:323
GlobalMemPipeline::completeRequest
void completeRequest(GPUDynInstPtr gpuDynInst)
once a memory request is finished we remove it from the buffer.
Definition: global_memory_pipeline.cc:253
GlobalMemPipeline::handleResponse
void handleResponse(GPUDynInstPtr gpuDynInst)
This method handles responses sent to this GM pipeline by the CU.
Definition: global_memory_pipeline.cc:282
GlobalMemPipeline::coalescerReady
bool coalescerReady(GPUDynInstPtr mp) const
Definition: global_memory_pipeline.cc:62
Shader::sampleLoad
void sampleLoad(const Tick accessTime)
Definition: shader.cc:460
GlobalMemPipeline::gmOrderedRespBuffer
std::map< uint64_t, std::pair< GPUDynInstPtr, bool > > gmOrderedRespBuffer
Definition: global_memory_pipeline.hh:139
Shader::sampleStore
void sampleStore(const Tick accessTime)
Definition: shader.cc:450
ComputeUnit::shader
Shader * shader
Definition: compute_unit.hh:352
ArmISA::m
Bitfield< 0 > m
Definition: miscregs_types.hh:389

Generated on Tue Jun 22 2021 15:28:28 for gem5 by doxygen 1.8.17