gem5 [DEVELOP-FOR-25.0]
Loading...
Searching...
No Matches
dispatcher.cc
Go to the documentation of this file.
1/*
2 * Copyright (c) 2011-2015,2018 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32
34
35#include "debug/GPUAgentDisp.hh"
36#include "debug/GPUDisp.hh"
37#include "debug/GPUKernelInfo.hh"
38#include "debug/GPUWgLatency.hh"
41#include "gpu-compute/shader.hh"
43#include "sim/sim_exit.hh"
45#include "sim/system.hh"
46
47namespace gem5
48{
49
51 : SimObject(p), shader(nullptr), gpuCmdProc(nullptr),
// shader and gpuCmdProc start out null; they are assigned by the setter
// functions defined further down in this file.
// tickEvent wraps a call to exec() and is created with CPU_Tick_Pri priority.
52 tickEvent([this]{ exec(); },
53 "GPU Dispatcher tick", false, Event::CPU_Tick_Pri),
// kernelExitEvents is copied from the kernel_exit_events parameter.
54 dispatchActive(false), kernelExitEvents(p.kernel_exit_events),
55 stats(this)
56{
// Schedule the first dispatcher tick at tick 0.
57 schedule(&tickEvent, 0);
58}
59
63
66{
67 assert(hsaQueueEntries.find(disp_id) != hsaQueueEntries.end());
68 return hsaQueueEntries[disp_id];
69}
70
// setCommandProcessor: record the command processor pointer passed in as
// gpu_cmd_proc so later code can reach it through gpuCmdProc.
71void
73{
74 gpuCmdProc = gpu_cmd_proc;
75}
76
// setShader: record the shader pointer passed in as new_shader. Other
// functions in this file read shader->clockPeriod() when scheduling ticks.
77void
79{
80 shader = new_shader;
81}
82
// serialize: write the pending dispatcher tick time to the checkpoint.
// event_tick is stored as 0 when tickEvent is not scheduled; the matching
// unserialize code only re-schedules when the stored value is non-zero.
83void
85{
86 Tick event_tick = 0;
87
88 if (tickEvent.scheduled())
89 event_tick = tickEvent.when();
90
91 SERIALIZE_SCALAR(event_tick);
92}
93
94void
96{
97 Tick event_tick;
98
99 if (tickEvent.scheduled())
101
102 UNSERIALIZE_SCALAR(event_tick);
103
104 if (event_tick) {
105 schedule(&tickEvent, event_tick);
106 }
107}
108
// dispatch: accept a new task for execution. Bumps the numKernelLaunched
// stat, queues the task's dispatch ID on execIds, records the task in
// hsaQueueEntries under that ID, and makes sure a dispatcher tick is pending.
115void
117{
118 ++stats.numKernelLaunched;
119
120 DPRINTF(GPUDisp, "launching kernel: %s, dispatch ID: %d\n",
121 task->kernelName(), task->dispatchId());
122 DPRINTF(GPUAgentDisp, "launching kernel: %s, dispatch ID: %d\n",
123 task->kernelName(), task->dispatchId());
124
125 execIds.push(task->dispatchId());
126 dispatchActive = true;
// emplace() leaves an existing entry untouched if this dispatch ID is
// already present in the map.
127 hsaQueueEntries.emplace(task->dispatchId(), task);
128
// Wake the dispatcher one shader clock period from now unless a tick is
// already queued.
129 if (!tickEvent.scheduled()) {
130 schedule(&tickEvent, curTick() + shader->clockPeriod());
131 }
132}
133
134void
// Dispatcher tick body: tickEvent's callback calls exec(). Makes one pass
// over the dispatch IDs queued in execIds, attempting to dispatch workgroups
// for each one, and finally drains doneIds.
136{
// fail_count: number of kernels pushed back onto execIds during this pass.
// disp_count: number of kernels for which at least one dispatchWorkgroups()
// call succeeded during this pass.
137 int fail_count(0);
138 int disp_count(0);
139
145 DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());
146 DPRINTF(GPUAgentDisp, "Launching %d Kernels\n", execIds.size());
147
// Counted once per call to exec() in which execIds is non-empty.
148 if (execIds.size() > 0) {
149 ++stats.cyclesWaitingForDispatch;
150 }
151
// Every failed kernel is re-queued at the back and counted, so the loop ends
// once the only IDs left in the queue are ones that failed in this pass.
// NOTE(review): execIds.size() is unsigned while fail_count is an int; the
// comparison relies on fail_count never being negative.
157 while (execIds.size() > fail_count) {
158 int exec_id = execIds.front();
// NOTE(review): operator[] inserts a null entry if exec_id is not in the map.
159 auto task = hsaQueueEntries[exec_id];
160 bool launched(false);
161
162 // acq is needed before starting dispatch
163 if (shader->impl_kern_launch_acq) {
164 // try to invalidate cache
165 shader->prepareInvalidate(task);
166 } else {
167 // kern launch acquire is not set, skip invalidate
168 task->markInvDone();
169 }
170
// Invalidates still pending: move this ID from the front to the back of the
// queue (push a copy, then pop the front) and try the next one.
175 if (!task->isInvDone()){
176 execIds.push(exec_id);
177 ++fail_count;
178
179 DPRINTF(GPUDisp, "kernel %d failed to launch, due to [%d] pending"
180 " invalidate requests\n", exec_id, task->outstandingInvs());
181
182 // try the next kernel_id
183 execIds.pop();
184 continue;
185 }
186
187 // kernel invalidate is done, start workgroup dispatch
188 while (!task->dispComplete()) {
189 // update the thread context
190 shader->updateContext(task->contextId());
191
192 // attempt to dispatch workgroup
193 DPRINTF(GPUWgLatency, "Attempt Kernel Launch cycle:%d kernel:%d\n",
194 curTick(), exec_id);
195
// A failed dispatch re-queues the kernel at the back and leaves the inner
// loop; the pop below then removes the copy at the front.
196 if (!shader->dispatchWorkgroups(task)) {
202 DPRINTF(GPUDisp, "kernel %d failed to launch\n", exec_id);
203 execIds.push(exec_id);
204 ++fail_count;
205 break;
// First successful dispatch for this kernel in this pass: count it once.
206 } else if (!launched) {
207 launched = true;
208 disp_count++;
209 DPRINTF(GPUKernelInfo, "Launched kernel %d for WG %d\n",
210 exec_id, disp_count);
211 }
212 }
213
// Reached both when dispatch completed (the ID leaves the queue for good)
// and after a failed dispatch (only the front copy is removed).
214 // try the next kernel_id
215 execIds.pop();
216 }
217
218 DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());
219 DPRINTF(GPUWgLatency, "Kernel Wgs dispatched: %d | %d failures\n",
220 disp_count, fail_count);
221
// Drain doneIds; each entry is only logged here.
222 while (doneIds.size()) {
223 DPRINTF(GPUDisp, "Kernel %d completed\n", doneIds.front());
224 doneIds.pop();
225 }
226}
227
228bool
230{
231 int kern_id = wf->kernId;
232 assert(hsaQueueEntries.find(kern_id) != hsaQueueEntries.end());
233 auto task = hsaQueueEntries[kern_id];
234 assert(task->dispatchId() == kern_id);
235
240 return (task->numWgCompleted() + 1 == task->numWgTotal());
241}
242
248void
250 assert(val == -1 || val == 1);
251
252 auto task = hsaQueueEntries[kern_id];
253 task->updateOutstandingInvs(val);
254
255 // kernel invalidate is done, schedule dispatch work
256 if (task->isInvDone() && !tickEvent.scheduled()) {
257 schedule(&tickEvent, curTick() + shader->clockPeriod());
258 }
259}
260
268bool
270 assert(val == -1 || val == 1);
271
272 auto task = hsaQueueEntries[kern_id];
273 task->updateOutstandingWbs(val);
274
275 // true: WB is done, false: WB is still ongoing
276 return (task->outstandingWbs() == 0);
277}
278
282int
284 auto task = hsaQueueEntries[kernId];
285
286 return task->outstandingWbs();
287}
288
297void
299{
300 int kern_id = wf->kernId;
301 DPRINTF(GPUDisp, "notify WgCompl %d\n", wf->wgId);
302 auto task = hsaQueueEntries[kern_id];
303 assert(task->dispatchId() == kern_id);
304 task->notifyWgCompleted();
305
306 DPRINTF(GPUWgLatency, "WG Complete cycle:%d wg:%d kernel:%d cu:%d\n",
307 curTick(), wf->wgId, kern_id, wf->computeUnit->cu_id);
308
309 if (task->numWgCompleted() == task->numWgTotal()) {
310 // Notify the HSA PP that this kernel is complete
311 gpuCmdProc->hsaPacketProc()
312 .finishPkt(task->dispPktPtr(), task->queueId());
313 if (task->completionSignal()) {
314 DPRINTF(GPUDisp, "HSA AQL Kernel Complete with completion "
315 "signal! Addr: %d\n", task->completionSignal());
316
317 gpuCmdProc->sendCompletionSignal(task->completionSignal());
318 } else {
319 DPRINTF(GPUDisp, "HSA AQL Kernel Complete! No completion "
320 "signal\n");
321 }
322
323 DPRINTF(GPUWgLatency, "Kernel Complete ticks:%d kernel:%d\n",
324 curTick(), kern_id);
325 DPRINTF(GPUKernelInfo, "Completed kernel %d\n", kern_id);
326
327 if (kernelExitEvents) {
328 shader->requestKernelExitEvent(task->completionSignal());
329 }
330 }
331
332 if (!tickEvent.scheduled()) {
333 schedule(&tickEvent, curTick() + shader->clockPeriod());
334 }
335}
336
// Ensure a dispatcher tick is pending: if tickEvent is not already
// scheduled, schedule it one shader clock period after the current tick.
337void
339{
340 if (!tickEvent.scheduled()) {
341 schedule(&tickEvent, curTick() + shader->clockPeriod());
342 }
343}
344
346 statistics::Group *parent)
// Registers the dispatcher's two statistics under the given parent group:
// numKernelLaunched is incremented where kernels are queued for launch, and
// cyclesWaitingForDispatch is incremented in exec() when execIds is non-empty.
347 : statistics::Group(parent),
348 ADD_STAT(numKernelLaunched, "number of kernel launched"),
349 ADD_STAT(cyclesWaitingForDispatch, "number of cycles with outstanding "
350 "wavefronts that are waiting to be dispatched")
351{
// Nothing else to do: both stats are set up by the initializer list.
352}
353
354} // namespace gem5
#define DPRINTF(x,...)
Definition trace.hh:209
void serialize(CheckpointOut &cp) const override
Serialize an object.
Definition dispatcher.cc:84
void dispatch(HSAQueueEntry *task)
After all relevant HSA data structures have been traversed/extracted from memory by the CP,...
void updateInvCounter(int kern_id, int val=-1)
update the counter of outstanding inv requests for the kernel kern_id: kernel id val: +1/-1,...
EventFunctionWrapper tickEvent
Definition dispatcher.hh:87
bool isReachingKernelEnd(Wavefront *wf)
GPUDispatcherParams Params
Definition dispatcher.hh:65
int getOutstandingWbs(int kern_id)
get kernel's outstanding cache writeback requests
std::unordered_map< int, HSAQueueEntry * > hsaQueueEntries
Definition dispatcher.hh:88
gem5::GPUDispatcher::GPUDispatcherStats stats
bool updateWbCounter(int kern_id, int val=-1)
update the counter of outstanding wb requests for the kernel kern_id: kernel id val: +1/-1,...
HSAQueueEntry * hsaTask(int disp_id)
Definition dispatcher.cc:65
void unserialize(CheckpointIn &cp) override
Unserialize an object.
Definition dispatcher.cc:95
GPUCommandProcessor * gpuCmdProc
Definition dispatcher.hh:86
std::queue< int > execIds
Definition dispatcher.hh:90
GPUDispatcher(const Params &p)
Definition dispatcher.cc:50
void notifyWgCompl(Wavefront *wf)
When an end program instruction detects that the last WF in a WG has completed it will call this meth...
void setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc)
Definition dispatcher.cc:72
void setShader(Shader *new_shader)
Definition dispatcher.cc:78
std::queue< int > doneIds
Definition dispatcher.hh:92
const std::string & kernelName() const
ComputeUnit * computeUnit
Definition wavefront.hh:109
Statistics container.
Definition group.hh:93
The GPUDispatcher is the component of the shader that is responsible for creating and dispatching WGs...
The GPUCommandProcessor (CP) is responsible for accepting commands, in the form of HSA AQL packets,...
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
Definition group.hh:75
void deschedule(Event &event)
Definition eventq.hh:1021
void schedule(Event &event, Tick when)
Definition eventq.hh:1012
static const Priority CPU_Tick_Pri
CPU ticks must come after other associated CPU events (such as writebacks).
Definition eventq.hh:207
SimObject(const Params &p)
Definition sim_object.cc:58
HSAQueueEntry is the simulator's internal representation of an AQL queue entry (task).
Bitfield< 0 > p
Bitfield< 63 > val
Definition misc.hh:804
Copyright (c) 2024 Arm Limited All rights reserved.
Definition binary32.hh:36
Tick curTick()
The universal simulation clock.
Definition cur_tick.hh:46
std::ostream CheckpointOut
Definition serialize.hh:66
uint64_t Tick
Tick count type.
Definition types.hh:58
#define UNSERIALIZE_SCALAR(scalar)
Definition serialize.hh:575
#define SERIALIZE_SCALAR(scalar)
Definition serialize.hh:568
GPUDispatcherStats(statistics::Group *parent)
This file defines buffer classes used to handle pointer arguments in emulated syscalls.

Generated on Mon May 26 2025 09:19:10 for gem5 by doxygen 1.13.2