/*
 * Copyright (c) 2024 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "arch/amdgpu/vega/insts/instructions.hh"
#include "arch/amdgpu/vega/insts/inst_util.hh"
#include "debug/VEGA.hh"

namespace gem5
{

namespace VegaISA
{
    // --- Inst_VOP2__V_CNDMASK_B32 class methods ---

    Inst_VOP2__V_CNDMASK_B32::Inst_VOP2__V_CNDMASK_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_cndmask_b32")
    {
        setFlag(ALU);
        setFlag(ReadsVCC);
    } // Inst_VOP2__V_CNDMASK_B32

    Inst_VOP2__V_CNDMASK_B32::~Inst_VOP2__V_CNDMASK_B32()
    {
    } // ~Inst_VOP2__V_CNDMASK_B32

    // --- description from .arch file ---
    // D.u = (VCC[i] ? S1.u : S0.u) (i = threadID in wave); VOP3: specify VCC
    // as a scalar GPR in S2.
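    // VCC here is a 64-bit scalar operand holding one select bit per lane,
    // so a single v_cndmask_b32 performs up to 64 independent selects. For
    // example, with VCC = 0x5, lanes 0 and 2 take S1 and every other active
    // lane takes S0.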
    void
    Inst_VOP2__V_CNDMASK_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();
        vcc.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane]
                    = bits(vcc.rawData(), lane) ? src1[lane] : src0[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_ADD_F32 class methods ---

    Inst_VOP2__V_ADD_F32::Inst_VOP2__V_ADD_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_add_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_ADD_F32

    Inst_VOP2__V_ADD_F32::~Inst_VOP2__V_ADD_F32()
    {
    } // ~Inst_VOP2__V_ADD_F32

    // --- description from .arch file ---
    // D.f = S0.f + S1.f.
    void
    Inst_VOP2__V_ADD_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        VecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);

        if (isDPPInst()) {
            VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
            src0_dpp.read();

            DPRINTF(VEGA, "Handling V_ADD_F32 SRC DPP. SRC0: register v[%d], "
                    "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, "
                    "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, "
                    "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0,
                    extData.iFmt_VOP_DPP.DPP_CTRL,
                    extData.iFmt_VOP_DPP.SRC0_ABS,
                    extData.iFmt_VOP_DPP.SRC0_NEG,
                    extData.iFmt_VOP_DPP.SRC1_ABS,
                    extData.iFmt_VOP_DPP.SRC1_NEG,
                    extData.iFmt_VOP_DPP.BC,
                    extData.iFmt_VOP_DPP.BANK_MASK,
                    extData.iFmt_VOP_DPP.ROW_MASK);

            processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);
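            // processDPP permutes src0 across lanes according to DPP_CTRL
            // (row shifts, rotates, etc.), so the add below may combine
            // values that originated in different lanes of the wavefront.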

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0_dpp[lane] + src1[lane];
                }
            }
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0[lane] + src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_SUB_F32 class methods ---

    Inst_VOP2__V_SUB_F32::Inst_VOP2__V_SUB_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_sub_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_SUB_F32

    Inst_VOP2__V_SUB_F32::~Inst_VOP2__V_SUB_F32()
    {
    } // ~Inst_VOP2__V_SUB_F32

    // --- description from .arch file ---
    // D.f = S0.f - S1.f.
    // SQ translates to V_ADD_F32.
    void
    Inst_VOP2__V_SUB_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] - src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_SUBREV_F32 class methods ---

    Inst_VOP2__V_SUBREV_F32::Inst_VOP2__V_SUBREV_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_subrev_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_SUBREV_F32

    Inst_VOP2__V_SUBREV_F32::~Inst_VOP2__V_SUBREV_F32()
    {
    } // ~Inst_VOP2__V_SUBREV_F32

    // --- description from .arch file ---
    // D.f = S1.f - S0.f.
    // SQ translates to V_ADD_F32.
    void
    Inst_VOP2__V_SUBREV_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] - src0[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MUL_LEGACY_F32 class methods ---

    Inst_VOP2__V_MUL_LEGACY_F32::Inst_VOP2__V_MUL_LEGACY_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_legacy_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_MUL_LEGACY_F32

    Inst_VOP2__V_MUL_LEGACY_F32::~Inst_VOP2__V_MUL_LEGACY_F32()
    {
    } // ~Inst_VOP2__V_MUL_LEGACY_F32

    // --- description from .arch file ---
    // D.f = S0.f * S1.f (DX9 rules, 0.0*x = 0.0).
    void
    Inst_VOP2__V_MUL_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] * src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MUL_F32 class methods ---

    Inst_VOP2__V_MUL_F32::Inst_VOP2__V_MUL_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_MUL_F32

    Inst_VOP2__V_MUL_F32::~Inst_VOP2__V_MUL_F32()
    {
    } // ~Inst_VOP2__V_MUL_F32

    // --- description from .arch file ---
    // D.f = S0.f * S1.f.
    void
    Inst_VOP2__V_MUL_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

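        // The chain below spells out the IEEE edge cases: NaN inputs
        // propagate, 0 * inf produces NaN, and zero or denormal inputs
        // (denormals are flushed to zero here) yield a signed zero or
        // infinity whose sign is the XOR of the operand signs.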
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (std::isnan(src0[lane]) ||
                    std::isnan(src1[lane])) {
                    vdst[lane] = NAN;
                } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
                           std::fpclassify(src0[lane]) == FP_ZERO) &&
                           !std::signbit(src0[lane])) {
                    if (std::isinf(src1[lane])) {
                        vdst[lane] = NAN;
                    } else if (!std::signbit(src1[lane])) {
                        vdst[lane] = +0.0;
                    } else {
                        vdst[lane] = -0.0;
                    }
                } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
                           std::fpclassify(src0[lane]) == FP_ZERO) &&
                           std::signbit(src0[lane])) {
                    if (std::isinf(src1[lane])) {
                        vdst[lane] = NAN;
                    } else if (std::signbit(src1[lane])) {
                        vdst[lane] = +0.0;
                    } else {
                        vdst[lane] = -0.0;
                    }
                } else if (std::isinf(src0[lane]) &&
                           !std::signbit(src0[lane])) {
                    if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
                        std::fpclassify(src1[lane]) == FP_ZERO) {
                        vdst[lane] = NAN;
                    } else if (!std::signbit(src1[lane])) {
                        vdst[lane] = +INFINITY;
                    } else {
                        vdst[lane] = -INFINITY;
                    }
                } else if (std::isinf(src0[lane]) &&
                           std::signbit(src0[lane])) {
                    if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
                        std::fpclassify(src1[lane]) == FP_ZERO) {
                        vdst[lane] = NAN;
                    } else if (std::signbit(src1[lane])) {
                        vdst[lane] = +INFINITY;
                    } else {
                        vdst[lane] = -INFINITY;
                    }
                } else {
                    vdst[lane] = src0[lane] * src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MUL_I32_I24 class methods ---

    Inst_VOP2__V_MUL_I32_I24::Inst_VOP2__V_MUL_I32_I24(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_i32_i24")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MUL_I32_I24

    Inst_VOP2__V_MUL_I32_I24::~Inst_VOP2__V_MUL_I32_I24()
    {
    } // ~Inst_VOP2__V_MUL_I32_I24

    // --- description from .arch file ---
    // D.i = S0.i[23:0] * S1.i[23:0].
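    // Only the low 24 bits of each source participate, reinterpreted as a
    // signed 24-bit value. E.g. src0 = 0x00FFFFFF sign-extends via sext<24>
    // to -1, so multiplying by 2 yields 0xFFFFFFFE (-2), not 0x01FFFFFE.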
    void
    Inst_VOP2__V_MUL_I32_I24::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = sext<24>(bits(src0[lane], 23, 0))
                    * sext<24>(bits(src1[lane], 23, 0));
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MUL_HI_I32_I24 class methods ---

    Inst_VOP2__V_MUL_HI_I32_I24::Inst_VOP2__V_MUL_HI_I32_I24(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_hi_i32_i24")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MUL_HI_I32_I24

    Inst_VOP2__V_MUL_HI_I32_I24::~Inst_VOP2__V_MUL_HI_I32_I24()
    {
    } // ~Inst_VOP2__V_MUL_HI_I32_I24

    // --- description from .arch file ---
    // D.i = (S0.i[23:0] * S1.i[23:0])>>32.
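    // The product of two signed 24-bit values fits in 48 bits, so the
    // multiply is performed in 64-bit temporaries and the upper half of
    // the sign-extended product is returned.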
    void
    Inst_VOP2__V_MUL_HI_I32_I24::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                VecElemI64 tmp_src0
                    = (VecElemI64)sext<24>(bits(src0[lane], 23, 0));
                VecElemI64 tmp_src1
                    = (VecElemI64)sext<24>(bits(src1[lane], 23, 0));

                vdst[lane] = (VecElemI32)((tmp_src0 * tmp_src1) >> 32);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MUL_U32_U24 class methods ---

    Inst_VOP2__V_MUL_U32_U24::Inst_VOP2__V_MUL_U32_U24(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_u32_u24")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MUL_U32_U24

    Inst_VOP2__V_MUL_U32_U24::~Inst_VOP2__V_MUL_U32_U24()
    {
    } // ~Inst_VOP2__V_MUL_U32_U24

    // --- description from .arch file ---
    // D.u = S0.u[23:0] * S1.u[23:0].
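    // Rather than open-coding the operand reads and writeback, this
    // instruction passes a per-lane lambda to the generic vop2Helper,
    // which applies it to every active lane and writes vdst back.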
    void
    Inst_VOP2__V_MUL_U32_U24::execute(GPUDynInstPtr gpuDynInst)
    {
        auto opImpl = [](VecOperandU32& src0, VecOperandU32& src1,
                         VecOperandU32& vdst, Wavefront* wf) {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = bits(src0[lane], 23, 0) *
                                 bits(src1[lane], 23, 0);
                }
            }
        };

        vop2Helper<ConstVecOperandU32, VecOperandU32>(gpuDynInst, opImpl);
    } // execute
    // --- Inst_VOP2__V_MUL_HI_U32_U24 class methods ---

    Inst_VOP2__V_MUL_HI_U32_U24::Inst_VOP2__V_MUL_HI_U32_U24(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_hi_u32_u24")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MUL_HI_U32_U24

    Inst_VOP2__V_MUL_HI_U32_U24::~Inst_VOP2__V_MUL_HI_U32_U24()
    {
    } // ~Inst_VOP2__V_MUL_HI_U32_U24

    // --- description from .arch file ---
    // D.i = (S0.u[23:0] * S1.u[23:0])>>32.
    void
    Inst_VOP2__V_MUL_HI_U32_U24::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                VecElemU64 tmp_src0 = (VecElemU64)bits(src0[lane], 23, 0);
                VecElemU64 tmp_src1 = (VecElemU64)bits(src1[lane], 23, 0);
                vdst[lane] = (VecElemU32)((tmp_src0 * tmp_src1) >> 32);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MIN_F32 class methods ---

    Inst_VOP2__V_MIN_F32::Inst_VOP2__V_MIN_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_min_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_MIN_F32

    Inst_VOP2__V_MIN_F32::~Inst_VOP2__V_MIN_F32()
    {
    } // ~Inst_VOP2__V_MIN_F32

    // --- description from .arch file ---
    // D.f = (S0.f < S1.f ? S0.f : S1.f).
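    // Note std::fmin implements IEEE minNum semantics: if exactly one
    // input is a NaN it returns the other input, which differs from a
    // literal reading of the ternary in the description above.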
    void
    Inst_VOP2__V_MIN_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::fmin(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MAX_F32 class methods ---

    Inst_VOP2__V_MAX_F32::Inst_VOP2__V_MAX_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_max_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_MAX_F32

    Inst_VOP2__V_MAX_F32::~Inst_VOP2__V_MAX_F32()
    {
    } // ~Inst_VOP2__V_MAX_F32

    // --- description from .arch file ---
    // D.f = (S0.f >= S1.f ? S0.f : S1.f).
    void
    Inst_VOP2__V_MAX_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::fmax(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MIN_I32 class methods ---

    Inst_VOP2__V_MIN_I32::Inst_VOP2__V_MIN_I32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_min_i32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MIN_I32

    Inst_VOP2__V_MIN_I32::~Inst_VOP2__V_MIN_I32()
    {
    } // ~Inst_VOP2__V_MIN_I32

    // --- description from .arch file ---
    // D.i = min(S0.i, S1.i).
    void
    Inst_VOP2__V_MIN_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::min(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MAX_I32 class methods ---

    Inst_VOP2__V_MAX_I32::Inst_VOP2__V_MAX_I32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_max_i32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MAX_I32

    Inst_VOP2__V_MAX_I32::~Inst_VOP2__V_MAX_I32()
    {
    } // ~Inst_VOP2__V_MAX_I32

    // --- description from .arch file ---
    // D.i = max(S0.i, S1.i).
    void
    Inst_VOP2__V_MAX_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::max(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MIN_U32 class methods ---

    Inst_VOP2__V_MIN_U32::Inst_VOP2__V_MIN_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_min_u32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MIN_U32

    Inst_VOP2__V_MIN_U32::~Inst_VOP2__V_MIN_U32()
    {
    } // ~Inst_VOP2__V_MIN_U32

    // --- description from .arch file ---
    // D.u = min(S0.u, S1.u).
    void
    Inst_VOP2__V_MIN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::min(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MAX_U32 class methods ---

    Inst_VOP2__V_MAX_U32::Inst_VOP2__V_MAX_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_max_u32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MAX_U32

    Inst_VOP2__V_MAX_U32::~Inst_VOP2__V_MAX_U32()
    {
    } // ~Inst_VOP2__V_MAX_U32

    // --- description from .arch file ---
    // D.u = max(S0.u, S1.u).
    void
    Inst_VOP2__V_MAX_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::max(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_LSHRREV_B32 class methods ---

    Inst_VOP2__V_LSHRREV_B32::Inst_VOP2__V_LSHRREV_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_lshrrev_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_LSHRREV_B32

    Inst_VOP2__V_LSHRREV_B32::~Inst_VOP2__V_LSHRREV_B32()
    {
    } // ~Inst_VOP2__V_LSHRREV_B32

    // --- description from .arch file ---
    // D.u = S1.u >> S0.u[4:0].
    // The vacated bits are set to zero.
    // SQ translates this to an internal SP opcode.
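    // The "rev" opcodes swap the usual operand roles: the shift amount
    // comes from SRC0 (the only VOP2 operand that can encode an SGPR or
    // inline constant) and the value being shifted comes from VSRC1.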
    void
    Inst_VOP2__V_LSHRREV_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_ASHRREV_I32 class methods ---

    Inst_VOP2__V_ASHRREV_I32::Inst_VOP2__V_ASHRREV_I32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_ashrrev_i32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_ASHRREV_I32

    Inst_VOP2__V_ASHRREV_I32::~Inst_VOP2__V_ASHRREV_I32()
    {
    } // ~Inst_VOP2__V_ASHRREV_I32

    // --- description from .arch file ---
    // D.i = signext(S1.i) >> S0.i[4:0].
    // The vacated bits are set to the sign bit of the input value.
    // SQ translates this to an internal SP opcode.
    void
    Inst_VOP2__V_ASHRREV_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_LSHLREV_B32 class methods ---

    Inst_VOP2__V_LSHLREV_B32::Inst_VOP2__V_LSHLREV_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_lshlrev_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_LSHLREV_B32

    Inst_VOP2__V_LSHLREV_B32::~Inst_VOP2__V_LSHLREV_B32()
    {
    } // ~Inst_VOP2__V_LSHLREV_B32

    // --- description from .arch file ---
    // D.u = S1.u << S0.u[4:0].
    // SQ translates this to an internal SP opcode.
    void
    Inst_VOP2__V_LSHLREV_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        VecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        if (isSDWAInst()) {
            VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
            // use copies of original src0, src1, and vdst during selecting
            VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                        extData.iFmt_VOP_SDWA.SRC0);
            VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
            VecOperandU32 origVdst(gpuDynInst, instData.VDST);

            src0_sdwa.read();
            origSrc0_sdwa.read();
            origSrc1.read();

            DPRINTF(VEGA, "Handling V_LSHLREV_B32 SRC SDWA. SRC0: register "
                    "v[%d], DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: "
                    "%d, SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: "
                    "%d, SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
                    extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL,
                    extData.iFmt_VOP_SDWA.DST_U,
                    extData.iFmt_VOP_SDWA.CLMP,
                    extData.iFmt_VOP_SDWA.SRC0_SEL,
                    extData.iFmt_VOP_SDWA.SRC0_SEXT,
                    extData.iFmt_VOP_SDWA.SRC0_NEG,
                    extData.iFmt_VOP_SDWA.SRC0_ABS,
                    extData.iFmt_VOP_SDWA.SRC1_SEL,
                    extData.iFmt_VOP_SDWA.SRC1_SEXT,
                    extData.iFmt_VOP_SDWA.SRC1_NEG,
                    extData.iFmt_VOP_SDWA.SRC1_ABS);

            processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
                            src1, origSrc1);
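            // processSDWA_src applies the sub-dword selects (byte/word
            // extraction, sign-extension, abs/neg modifiers) to the
            // sources before the shift is performed; processSDWA_dst
            // below merges the result into the selected part of vdst.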

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src1[lane] << bits(src0_sdwa[lane], 4, 0);
                    origVdst[lane] = vdst[lane]; // keep copy consistent
                }
            }

            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src1[lane] << bits(src0[lane], 4, 0);
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_AND_B32 class methods ---

    Inst_VOP2__V_AND_B32::Inst_VOP2__V_AND_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_and_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_AND_B32

    Inst_VOP2__V_AND_B32::~Inst_VOP2__V_AND_B32()
    {
    } // ~Inst_VOP2__V_AND_B32

    // --- description from .arch file ---
    // D.u = S0.u & S1.u.
    // Input and output modifiers not supported.
    void
    Inst_VOP2__V_AND_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        VecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);

        if (isDPPInst()) {
            VecOperandU32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
            src0_dpp.read();

            DPRINTF(VEGA, "Handling V_AND_B32 SRC DPP. SRC0: register v[%d], "
                    "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, "
                    "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, "
                    "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0,
                    extData.iFmt_VOP_DPP.DPP_CTRL,
                    extData.iFmt_VOP_DPP.SRC0_ABS,
                    extData.iFmt_VOP_DPP.SRC0_NEG,
                    extData.iFmt_VOP_DPP.SRC1_ABS,
                    extData.iFmt_VOP_DPP.SRC1_NEG,
                    extData.iFmt_VOP_DPP.BC,
                    extData.iFmt_VOP_DPP.BANK_MASK,
                    extData.iFmt_VOP_DPP.ROW_MASK);

            processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0_dpp[lane] & src1[lane];
                }
            }
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0[lane] & src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_OR_B32 class methods ---

    Inst_VOP2__V_OR_B32::Inst_VOP2__V_OR_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_or_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_OR_B32

    Inst_VOP2__V_OR_B32::~Inst_VOP2__V_OR_B32()
    {
    } // ~Inst_VOP2__V_OR_B32

    // --- description from .arch file ---
    // D.u = S0.u | S1.u.
    // Input and output modifiers not supported.
    void
    Inst_VOP2__V_OR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        VecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        if (isSDWAInst()) {
            VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
            // use copies of original src0, src1, and dest during selecting
            VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                        extData.iFmt_VOP_SDWA.SRC0);
            VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
            VecOperandU32 origVdst(gpuDynInst, instData.VDST);

            src0_sdwa.read();
            origSrc0_sdwa.read();
            origSrc1.read();

            DPRINTF(VEGA, "Handling V_OR_B32 SRC SDWA. SRC0: register v[%d], "
                    "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, "
                    "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, "
                    "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
                    extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL,
                    extData.iFmt_VOP_SDWA.DST_U,
                    extData.iFmt_VOP_SDWA.CLMP,
                    extData.iFmt_VOP_SDWA.SRC0_SEL,
                    extData.iFmt_VOP_SDWA.SRC0_SEXT,
                    extData.iFmt_VOP_SDWA.SRC0_NEG,
                    extData.iFmt_VOP_SDWA.SRC0_ABS,
                    extData.iFmt_VOP_SDWA.SRC1_SEL,
                    extData.iFmt_VOP_SDWA.SRC1_SEXT,
                    extData.iFmt_VOP_SDWA.SRC1_NEG,
                    extData.iFmt_VOP_SDWA.SRC1_ABS);

            processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
                            src1, origSrc1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0_sdwa[lane] | src1[lane];
                    origVdst[lane] = vdst[lane]; // keep copy consistent
                }
            }

            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0[lane] | src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_XOR_B32 class methods ---

    Inst_VOP2__V_XOR_B32::Inst_VOP2__V_XOR_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_xor_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_XOR_B32

    Inst_VOP2__V_XOR_B32::~Inst_VOP2__V_XOR_B32()
    {
    } // ~Inst_VOP2__V_XOR_B32

    // --- description from .arch file ---
    // D.u = S0.u ^ S1.u.
    // Input and output modifiers not supported.
    void
    Inst_VOP2__V_XOR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] ^ src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MAC_F32 class methods ---

    Inst_VOP2__V_MAC_F32::Inst_VOP2__V_MAC_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mac_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(MAC);
    } // Inst_VOP2__V_MAC_F32

    Inst_VOP2__V_MAC_F32::~Inst_VOP2__V_MAC_F32()
    {
    } // ~Inst_VOP2__V_MAC_F32

    // --- description from .arch file ---
    // D.f = S0.f * S1.f + D.f.
    // SQ translates to V_MAD_F32.
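    // MAC reads the destination register as a third source operand (the
    // accumulator), which is why execute() below calls vdst.read() before
    // performing the fused multiply-add.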
    void
    Inst_VOP2__V_MAC_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        VecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();
        vdst.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);

        if (isDPPInst()) {
            VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
            src0_dpp.read();

            DPRINTF(VEGA, "Handling V_MAC_F32 SRC DPP. SRC0: register v[%d], "
                    "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, "
                    "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, "
                    "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0,
                    extData.iFmt_VOP_DPP.DPP_CTRL,
                    extData.iFmt_VOP_DPP.SRC0_ABS,
                    extData.iFmt_VOP_DPP.SRC0_NEG,
                    extData.iFmt_VOP_DPP.SRC1_ABS,
                    extData.iFmt_VOP_DPP.SRC1_NEG,
                    extData.iFmt_VOP_DPP.BC,
                    extData.iFmt_VOP_DPP.BANK_MASK,
                    extData.iFmt_VOP_DPP.ROW_MASK);

            processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = std::fma(src0_dpp[lane], src1[lane],
                                          vdst[lane]);
                }
            }
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]);
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MADMK_F32 class methods ---

    Inst_VOP2__V_MADMK_F32::Inst_VOP2__V_MADMK_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_madmk_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(MAD);
    } // Inst_VOP2__V_MADMK_F32

    Inst_VOP2__V_MADMK_F32::~Inst_VOP2__V_MADMK_F32()
    {
    } // ~Inst_VOP2__V_MADMK_F32

    // --- description from .arch file ---
    // D.f = S0.f * K + S1.f; K is a 32-bit inline constant.
    // This opcode cannot use the VOP3 encoding and cannot use input/output
    // --- modifiers.
    // SQ translates to V_MAD_F32.
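    // K is the 32-bit literal that follows the instruction in the
    // instruction stream; the decoder delivers it as extData.imm_f32.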
    void
    Inst_VOP2__V_MADMK_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);
        VecElemF32 k = extData.imm_f32;

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::fma(src0[lane], k, src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MADAK_F32 class methods ---

    Inst_VOP2__V_MADAK_F32::Inst_VOP2__V_MADAK_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_madak_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(MAD);
    } // Inst_VOP2__V_MADAK_F32

    Inst_VOP2__V_MADAK_F32::~Inst_VOP2__V_MADAK_F32()
    {
    } // ~Inst_VOP2__V_MADAK_F32

    // --- description from .arch file ---
    // D.f = S0.f * S1.f + K; K is a 32-bit inline constant.
    // This opcode cannot use the VOP3 encoding and cannot use input/output
    // --- modifiers.
    // SQ translates to V_MAD_F32.
    void
    Inst_VOP2__V_MADAK_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);
        VecElemF32 k = extData.imm_f32;

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::fma(src0[lane], src1[lane], k);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_ADD_CO_U32 class methods ---

    Inst_VOP2__V_ADD_CO_U32::Inst_VOP2__V_ADD_CO_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_add_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
    } // Inst_VOP2__V_ADD_CO_U32

    Inst_VOP2__V_ADD_CO_U32::~Inst_VOP2__V_ADD_CO_U32()
    {
    } // ~Inst_VOP2__V_ADD_CO_U32

    // --- description from .arch file ---
    // D.u = S0.u + S1.u;
    // VCC[threadId] = (S0.u + S1.u >= 0x800000000ULL ? 1 : 0) is an UNSIGNED
    // --- overflow or carry-out for V_ADDC_U32.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
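    // The carry-out is detected by widening both addends to 64 bits: the
    // 32-bit sum wraps exactly when the 64-bit sum is >= 2^32
    // (0x100000000ULL), and the per-lane result is recorded in VCC.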
    void
    Inst_VOP2__V_ADD_CO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        VecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        if (isSDWAInst()) {
            VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
            // use copies of original src0, src1, and dest during selecting
            VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                        extData.iFmt_VOP_SDWA.SRC0);
            VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
            VecOperandU32 origVdst(gpuDynInst, instData.VDST);

            src0_sdwa.read();
            origSrc0_sdwa.read();
            origSrc1.read();

            DPRINTF(VEGA, "Handling V_ADD_CO_U32 SRC SDWA. SRC0: register "
                    "v[%d], DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, "
                    "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, "
                    "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
                    extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL,
                    extData.iFmt_VOP_SDWA.DST_U,
                    extData.iFmt_VOP_SDWA.CLMP,
                    extData.iFmt_VOP_SDWA.SRC0_SEL,
                    extData.iFmt_VOP_SDWA.SRC0_SEXT,
                    extData.iFmt_VOP_SDWA.SRC0_NEG,
                    extData.iFmt_VOP_SDWA.SRC0_ABS,
                    extData.iFmt_VOP_SDWA.SRC1_SEL,
                    extData.iFmt_VOP_SDWA.SRC1_SEXT,
                    extData.iFmt_VOP_SDWA.SRC1_NEG,
                    extData.iFmt_VOP_SDWA.SRC1_ABS);

            processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
                            src1, origSrc1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0_sdwa[lane] + src1[lane];
                    origVdst[lane] = vdst[lane]; // keep copy consistent
                    vcc.setBit(lane, ((VecElemU64)src0_sdwa[lane]
                        + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0);
                }
            }

            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0[lane] + src1[lane];
                    vcc.setBit(lane, ((VecElemU64)src0[lane]
                        + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0);
                }
            }
        }

        vcc.write();
        vdst.write();
    } // execute
    // --- Inst_VOP2__V_SUB_CO_U32 class methods ---

    Inst_VOP2__V_SUB_CO_U32::Inst_VOP2__V_SUB_CO_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_sub_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
    } // Inst_VOP2__V_SUB_CO_U32

    Inst_VOP2__V_SUB_CO_U32::~Inst_VOP2__V_SUB_CO_U32()
    {
    } // ~Inst_VOP2__V_SUB_CO_U32

    // --- description from .arch file ---
    // D.u = S0.u - S1.u;
    // VCC[threadId] = (S1.u > S0.u ? 1 : 0) is an UNSIGNED overflow or
    // carry-out for V_SUBB_U32.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
    void
    Inst_VOP2__V_SUB_CO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] - src1[lane];
                vcc.setBit(lane, src1[lane] > src0[lane] ? 1 : 0);
            }
        }

        vdst.write();
        vcc.write();
    } // execute
    // --- Inst_VOP2__V_SUBREV_CO_U32 class methods ---

    Inst_VOP2__V_SUBREV_CO_U32::Inst_VOP2__V_SUBREV_CO_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_subrev_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
    } // Inst_VOP2__V_SUBREV_CO_U32

    Inst_VOP2__V_SUBREV_CO_U32::~Inst_VOP2__V_SUBREV_CO_U32()
    {
    } // ~Inst_VOP2__V_SUBREV_CO_U32

    // --- description from .arch file ---
    // D.u = S1.u - S0.u;
    // VCC[threadId] = (S0.u > S1.u ? 1 : 0) is an UNSIGNED overflow or
    // carry-out for V_SUBB_U32.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
    void
    Inst_VOP2__V_SUBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] - src0[lane];
                vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
            }
        }

        vdst.write();
        vcc.write();
    } // execute
    // --- Inst_VOP2__V_ADDC_CO_U32 class methods ---

    Inst_VOP2__V_ADDC_CO_U32::Inst_VOP2__V_ADDC_CO_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_addc_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(ReadsVCC);
    } // Inst_VOP2__V_ADDC_CO_U32

    Inst_VOP2__V_ADDC_CO_U32::~Inst_VOP2__V_ADDC_CO_U32()
    {
    } // ~Inst_VOP2__V_ADDC_CO_U32

    // --- description from .arch file ---
    // D.u = S0.u + S1.u + VCC[threadId];
    // VCC[threadId] = (S0.u + S1.u + VCC[threadId] >= 0x800000000ULL ? 1 : 0)
    // is an UNSIGNED overflow.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
    // source comes from the SGPR-pair at S2.u.
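    // Together with V_ADD_CO_U32 this forms multi-word addition: the low
    // dword uses v_add_co_u32 to produce a carry in VCC, and each higher
    // dword uses v_addc_co_u32 to consume and regenerate that carry.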
    void
    Inst_VOP2__V_ADDC_CO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();
        vcc.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] + src1[lane]
                    + bits(vcc.rawData(), lane);
                vcc.setBit(lane, ((VecElemU64)src0[lane]
                    + (VecElemU64)src1[lane]
                    + (VecElemU64)bits(vcc.rawData(), lane, lane))
                        >= 0x100000000 ? 1 : 0);
            }
        }

        vdst.write();
        vcc.write();
    } // execute
    // --- Inst_VOP2__V_SUBB_CO_U32 class methods ---

    Inst_VOP2__V_SUBB_CO_U32::Inst_VOP2__V_SUBB_CO_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_subb_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(ReadsVCC);
    } // Inst_VOP2__V_SUBB_CO_U32

    Inst_VOP2__V_SUBB_CO_U32::~Inst_VOP2__V_SUBB_CO_U32()
    {
    } // ~Inst_VOP2__V_SUBB_CO_U32

    // --- description from .arch file ---
    // D.u = S0.u - S1.u - VCC[threadId];
    // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED
    // --- overflow.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
    // --- source comes from the SGPR-pair at S2.u.
    void
    Inst_VOP2__V_SUBB_CO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();
        vcc.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane]
                    = src0[lane] - src1[lane] - bits(vcc.rawData(), lane);
                vcc.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane))
                    > src0[lane] ? 1 : 0);
            }
        }

        vdst.write();
        vcc.write();
    } // execute
    // --- Inst_VOP2__V_SUBBREV_CO_U32 class methods ---

    Inst_VOP2__V_SUBBREV_CO_U32::Inst_VOP2__V_SUBBREV_CO_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_subbrev_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(ReadsVCC);
    } // Inst_VOP2__V_SUBBREV_CO_U32

    Inst_VOP2__V_SUBBREV_CO_U32::~Inst_VOP2__V_SUBBREV_CO_U32()
    {
    } // ~Inst_VOP2__V_SUBBREV_CO_U32

    // --- description from .arch file ---
    // D.u = S1.u - S0.u - VCC[threadId];
    // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED
    // overflow.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
    // source comes from the SGPR-pair at S2.u. SQ translates to V_SUBB_U32.
    // SQ translates this to V_SUBREV_U32 with reversed operands.
    void
    Inst_VOP2__V_SUBBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();
        vcc.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane]
                    = src1[lane] - src0[lane] - bits(vcc.rawData(), lane);
                vcc.setBit(lane, (src0[lane] + bits(vcc.rawData(), lane))
                    > src1[lane] ? 1 : 0);
            }
        }

        vdst.write();
        vcc.write();
    } // execute
    // --- Inst_VOP2__V_ADD_F16 class methods ---

    Inst_VOP2__V_ADD_F16::Inst_VOP2__V_ADD_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_add_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP2__V_ADD_F16

    Inst_VOP2__V_ADD_F16::~Inst_VOP2__V_ADD_F16()
    {
    } // ~Inst_VOP2__V_ADD_F16

    // --- description from .arch file ---
    // D.f16 = S0.f16 + S1.f16.
    // Supports denormals, round mode, exception flags, saturation.
    void
    Inst_VOP2__V_ADD_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
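    // Like v_add_f16 above, the remaining 16-bit floating-point encodings
    // in this file are decoded but have no modeled datapath; executing any
    // of them terminates simulation via panicUnimplemented().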
    // --- Inst_VOP2__V_SUB_F16 class methods ---

    Inst_VOP2__V_SUB_F16::Inst_VOP2__V_SUB_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_sub_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP2__V_SUB_F16

    Inst_VOP2__V_SUB_F16::~Inst_VOP2__V_SUB_F16()
    {
    } // ~Inst_VOP2__V_SUB_F16

    // --- description from .arch file ---
    // D.f16 = S0.f16 - S1.f16.
    // Supports denormals, round mode, exception flags, saturation.
    // SQ translates to V_ADD_F16.
    void
    Inst_VOP2__V_SUB_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP2__V_SUBREV_F16 class methods ---

    Inst_VOP2__V_SUBREV_F16::Inst_VOP2__V_SUBREV_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_subrev_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP2__V_SUBREV_F16

    Inst_VOP2__V_SUBREV_F16::~Inst_VOP2__V_SUBREV_F16()
    {
    } // ~Inst_VOP2__V_SUBREV_F16

    // --- description from .arch file ---
    // D.f16 = S1.f16 - S0.f16.
    // Supports denormals, round mode, exception flags, saturation.
    // SQ translates to V_ADD_F16.
    void
    Inst_VOP2__V_SUBREV_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP2__V_MUL_F16 class methods ---

    Inst_VOP2__V_MUL_F16::Inst_VOP2__V_MUL_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP2__V_MUL_F16

    Inst_VOP2__V_MUL_F16::~Inst_VOP2__V_MUL_F16()
    {
    } // ~Inst_VOP2__V_MUL_F16

    // --- description from .arch file ---
    // D.f16 = S0.f16 * S1.f16.
    // Supports denormals, round mode, exception flags, saturation.
    void
    Inst_VOP2__V_MUL_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP2__V_MAC_F16 class methods ---

    Inst_VOP2__V_MAC_F16::Inst_VOP2__V_MAC_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mac_f16")
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(MAC);
    } // Inst_VOP2__V_MAC_F16

    Inst_VOP2__V_MAC_F16::~Inst_VOP2__V_MAC_F16()
    {
    } // ~Inst_VOP2__V_MAC_F16

    // --- description from .arch file ---
    // D.f16 = S0.f16 * S1.f16 + D.f16.
    // Supports round mode, exception flags, saturation.
    // SQ translates this to V_MAD_F16.
    void
    Inst_VOP2__V_MAC_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP2__V_MADMK_F16 class methods ---

    Inst_VOP2__V_MADMK_F16::Inst_VOP2__V_MADMK_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_madmk_f16")
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(MAD);
    } // Inst_VOP2__V_MADMK_F16

    Inst_VOP2__V_MADMK_F16::~Inst_VOP2__V_MADMK_F16()
    {
    } // ~Inst_VOP2__V_MADMK_F16

    // --- description from .arch file ---
    // D.f16 = S0.f16 * K.f16 + S1.f16; K is a 16-bit inline constant stored
    // in the following literal DWORD.
    // This opcode cannot use the VOP3 encoding and cannot use input/output
    // modifiers. Supports round mode, exception flags, saturation.
    // SQ translates this to V_MAD_F16.
    void
    Inst_VOP2__V_MADMK_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP2__V_MADAK_F16 class methods ---

    Inst_VOP2__V_MADAK_F16::Inst_VOP2__V_MADAK_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_madak_f16")
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(MAD);
    } // Inst_VOP2__V_MADAK_F16

    Inst_VOP2__V_MADAK_F16::~Inst_VOP2__V_MADAK_F16()
    {
    } // ~Inst_VOP2__V_MADAK_F16

    // --- description from .arch file ---
    // D.f16 = S0.f16 * S1.f16 + K.f16; K is a 16-bit inline constant stored
    // in the following literal DWORD.
    // This opcode cannot use the VOP3 encoding and cannot use input/output
    // modifiers. Supports round mode, exception flags, saturation.
    // SQ translates this to V_MAD_F16.
    void
    Inst_VOP2__V_MADAK_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP2__V_ADD_U16 class methods ---

    Inst_VOP2__V_ADD_U16::Inst_VOP2__V_ADD_U16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_add_u16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_ADD_U16

    Inst_VOP2__V_ADD_U16::~Inst_VOP2__V_ADD_U16()
    {
    } // ~Inst_VOP2__V_ADD_U16

    // --- description from .arch file ---
    // D.u16 = S0.u16 + S1.u16.
    // Supports saturation (unsigned 16-bit integer domain).
    void
    Inst_VOP2__V_ADD_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] + src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_SUB_U16 class methods ---

    Inst_VOP2__V_SUB_U16::Inst_VOP2__V_SUB_U16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_sub_u16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_SUB_U16

    Inst_VOP2__V_SUB_U16::~Inst_VOP2__V_SUB_U16()
    {
    } // ~Inst_VOP2__V_SUB_U16

    // --- description from .arch file ---
    // D.u16 = S0.u16 - S1.u16.
    // Supports saturation (unsigned 16-bit integer domain).
    void
    Inst_VOP2__V_SUB_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] - src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_SUBREV_U16 class methods ---

    Inst_VOP2__V_SUBREV_U16::Inst_VOP2__V_SUBREV_U16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_subrev_u16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_SUBREV_U16

    Inst_VOP2__V_SUBREV_U16::~Inst_VOP2__V_SUBREV_U16()
    {
    } // ~Inst_VOP2__V_SUBREV_U16

    // --- description from .arch file ---
    // D.u16 = S1.u16 - S0.u16.
    // Supports saturation (unsigned 16-bit integer domain).
    // SQ translates this to V_SUB_U16 with reversed operands.
    void
    Inst_VOP2__V_SUBREV_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] - src0[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MUL_LO_U16 class methods ---

    Inst_VOP2__V_MUL_LO_U16::Inst_VOP2__V_MUL_LO_U16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_lo_u16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MUL_LO_U16

    Inst_VOP2__V_MUL_LO_U16::~Inst_VOP2__V_MUL_LO_U16()
    {
    } // ~Inst_VOP2__V_MUL_LO_U16

    // --- description from .arch file ---
    // D.u16 = S0.u16 * S1.u16.
    // Supports saturation (unsigned 16-bit integer domain).
    void
    Inst_VOP2__V_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] * src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_LSHLREV_B16 class methods ---

    Inst_VOP2__V_LSHLREV_B16::Inst_VOP2__V_LSHLREV_B16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_lshlrev_b16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_LSHLREV_B16

    Inst_VOP2__V_LSHLREV_B16::~Inst_VOP2__V_LSHLREV_B16()
    {
    } // ~Inst_VOP2__V_LSHLREV_B16

    // --- description from .arch file ---
    // D.u[15:0] = S1.u[15:0] << S0.u[3:0].
    // SQ translates this to an internal SP opcode.
    void
    Inst_VOP2__V_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] << bits(src0[lane], 3, 0);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_LSHRREV_B16 class methods ---

    Inst_VOP2__V_LSHRREV_B16::Inst_VOP2__V_LSHRREV_B16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_lshrrev_b16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_LSHRREV_B16

    Inst_VOP2__V_LSHRREV_B16::~Inst_VOP2__V_LSHRREV_B16()
    {
    } // ~Inst_VOP2__V_LSHRREV_B16

    // --- description from .arch file ---
    // D.u[15:0] = S1.u[15:0] >> S0.u[3:0].
    // The vacated bits are set to zero.
    // SQ translates this to an internal SP opcode.
    void
    Inst_VOP2__V_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] >> src0[lane];
            }
        }

        vdst.write();
    } // execute
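    // Note the shift amount above is not masked to S0.u[3:0] as the .arch
    // description specifies; the same applies to v_ashrrev_i16 below.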
    // --- Inst_VOP2__V_ASHRREV_I16 class methods ---

    Inst_VOP2__V_ASHRREV_I16::Inst_VOP2__V_ASHRREV_I16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_ashrrev_i16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_ASHRREV_I16

    Inst_VOP2__V_ASHRREV_I16::~Inst_VOP2__V_ASHRREV_I16()
    {
    } // ~Inst_VOP2__V_ASHRREV_I16

    // --- description from .arch file ---
    // D.i[15:0] = signext(S1.i[15:0]) >> S0.i[3:0].
    // The vacated bits are set to the sign bit of the input value.
    // SQ translates this to an internal SP opcode.
    void
    Inst_VOP2__V_ASHRREV_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1);
        VecOperandI16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] >> src0[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MAX_F16 class methods ---

    Inst_VOP2__V_MAX_F16::Inst_VOP2__V_MAX_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_max_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP2__V_MAX_F16

    Inst_VOP2__V_MAX_F16::~Inst_VOP2__V_MAX_F16()
    {
    } // ~Inst_VOP2__V_MAX_F16

    // --- description from .arch file ---
    // D.f16 = max(S0.f16, S1.f16).
    // IEEE compliant. Supports denormals, round mode, exception flags,
    // saturation.
    void
    Inst_VOP2__V_MAX_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP2__V_MIN_F16 class methods ---

    Inst_VOP2__V_MIN_F16::Inst_VOP2__V_MIN_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_min_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP2__V_MIN_F16

    Inst_VOP2__V_MIN_F16::~Inst_VOP2__V_MIN_F16()
    {
    } // ~Inst_VOP2__V_MIN_F16

    // --- description from .arch file ---
    // D.f16 = min(S0.f16, S1.f16).
    // IEEE compliant. Supports denormals, round mode, exception flags,
    // saturation.
    void
    Inst_VOP2__V_MIN_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP2__V_MAX_U16 class methods ---

    Inst_VOP2__V_MAX_U16::Inst_VOP2__V_MAX_U16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_max_u16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MAX_U16

    Inst_VOP2__V_MAX_U16::~Inst_VOP2__V_MAX_U16()
    {
    } // ~Inst_VOP2__V_MAX_U16

    // --- description from .arch file ---
    // D.u[15:0] = max(S0.u[15:0], S1.u[15:0]).
    void
    Inst_VOP2__V_MAX_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::max(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MAX_I16 class methods ---

    Inst_VOP2__V_MAX_I16::Inst_VOP2__V_MAX_I16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_max_i16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MAX_I16

    Inst_VOP2__V_MAX_I16::~Inst_VOP2__V_MAX_I16()
    {
    } // ~Inst_VOP2__V_MAX_I16

    // --- description from .arch file ---
    // D.i[15:0] = max(S0.i[15:0], S1.i[15:0]).
    void
    Inst_VOP2__V_MAX_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1);
        VecOperandI16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::max(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MIN_U16 class methods ---

    Inst_VOP2__V_MIN_U16::Inst_VOP2__V_MIN_U16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_min_u16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MIN_U16

    Inst_VOP2__V_MIN_U16::~Inst_VOP2__V_MIN_U16()
    {
    } // ~Inst_VOP2__V_MIN_U16

    // --- description from .arch file ---
    // D.u[15:0] = min(S0.u[15:0], S1.u[15:0]).
    void
    Inst_VOP2__V_MIN_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::min(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MIN_I16 class methods ---

    Inst_VOP2__V_MIN_I16::Inst_VOP2__V_MIN_I16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_min_i16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MIN_I16

    Inst_VOP2__V_MIN_I16::~Inst_VOP2__V_MIN_I16()
    {
    } // ~Inst_VOP2__V_MIN_I16

    // --- description from .arch file ---
    // D.i[15:0] = min(S0.i[15:0], S1.i[15:0]).
    void
    Inst_VOP2__V_MIN_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1);
        VecOperandI16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::min(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_LDEXP_F16 class methods ---

    Inst_VOP2__V_LDEXP_F16::Inst_VOP2__V_LDEXP_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_ldexp_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP2__V_LDEXP_F16

    Inst_VOP2__V_LDEXP_F16::~Inst_VOP2__V_LDEXP_F16()
    {
    } // ~Inst_VOP2__V_LDEXP_F16

    // --- description from .arch file ---
    // D.f16 = S0.f16 * (2 ** S1.i16).
    void
    Inst_VOP2__V_LDEXP_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP2__V_ADD_U32 class methods ---

    Inst_VOP2__V_ADD_U32::Inst_VOP2__V_ADD_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_add_u32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_ADD_U32

    Inst_VOP2__V_ADD_U32::~Inst_VOP2__V_ADD_U32()
    {
    } // ~Inst_VOP2__V_ADD_U32

    // --- description from .arch file ---
    // D.u = S0.u + S1.u;
2145 {
2146 Wavefront *wf = gpuDynInst->wavefront();
2147 ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
2148 VecOperandU32 src1(gpuDynInst, instData.VSRC1);
2149 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2150
2151 src0.readSrc();
2152 src1.read();
2153
2154 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2155
2156 if (isSDWAInst()) {
2157 VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
2158 // use copies of original src0, src1, and dest during selecting
2159 VecOperandU32 origSrc0_sdwa(gpuDynInst,
2161 VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
2162 VecOperandU32 origVdst(gpuDynInst, instData.VDST);
2163
2164 src0_sdwa.read();
2165 origSrc0_sdwa.read();
2166 origSrc1.read();
2167
2168 DPRINTF(VEGA, "Handling V_ADD_U32 SRC SDWA. SRC0: register v[%d], "
2169 "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, "
2170 "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, "
2171 "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
2183
2184 processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
2185 src1, origSrc1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0_sdwa[lane] + src1[lane];
                    origVdst[lane] = vdst[lane]; // keep copy consistent
                }
            }

            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0[lane] + src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
2205 // --- Inst_VOP2__V_SUB_U32 class methods ---
2206
2208 : Inst_VOP2(iFmt, "v_sub_u32")
2209 {
2210 setFlag(ALU);
2211 } // Inst_VOP2__V_SUB_U32
2212
2214 {
2215 } // ~Inst_VOP2__V_SUB_U32
2216
2217 // --- description from .arch file ---
2218 // D.u = S0.u - S1.u;
2219 void
2221 {
2222 Wavefront *wf = gpuDynInst->wavefront();
2223 ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
2224 ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
2225 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2226
2227 src0.readSrc();
2228 src1.read();
2229
2230 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
2231 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2232
2233 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2234 if (wf->execMask(lane)) {
2235 vdst[lane] = src0[lane] - src1[lane];
2236 }
2237 }
2238
2239 vdst.write();
2240 } // execute
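// The unsigned subtraction wraps modulo 2^32: e.g. 5 - 7 yields
// 0xFFFFFFFE. No borrow is recorded by this no-carry variant.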
2241 // --- Inst_VOP2__V_SUBREV_U32 class methods ---
2242
2243 Inst_VOP2__V_SUBREV_U32::Inst_VOP2__V_SUBREV_U32(InFmt_VOP2 *iFmt)
2244 : Inst_VOP2(iFmt, "v_subrev_u32")
2245 {
2246 setFlag(ALU);
2247 } // Inst_VOP2__V_SUBREV_U32
2248
2249 Inst_VOP2__V_SUBREV_U32::~Inst_VOP2__V_SUBREV_U32()
2250 {
2251 } // ~Inst_VOP2__V_SUBREV_U32
2252
2253 // --- description from .arch file ---
2254 // D.u = S1.u - S0.u;
2255 void
2256 Inst_VOP2__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst)
2257 {
2258 Wavefront *wf = gpuDynInst->wavefront();
2259 ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
2260 ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
2261 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2262
2263 src0.readSrc();
2264 src1.read();
2265
2266 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
2267 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2268
2269 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2270 if (wf->execMask(lane)) {
2271 vdst[lane] = src1[lane] - src0[lane];
2272 }
2273 }
2274
2275 vdst.write();
2276 } // execute
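// The reversed form exists because only SRC0 of the VOP2 encoding can
// name a scalar register, inline constant, or literal; VSRC1 must be a
// VGPR. v_subrev therefore expresses "VGPR minus constant", e.g.
// (illustrative assembly) v_subrev_u32 v0, 16, v1  =>  v0 = v1 - 16.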
2277 // --- Inst_VOP2__V_FMAC_F32 class methods ---
2278
2279 Inst_VOP2__V_FMAC_F32::Inst_VOP2__V_FMAC_F32(InFmt_VOP2 *iFmt)
2280 : Inst_VOP2(iFmt, "v_fmac_f32")
2281 {
2282 setFlag(ALU);
2283 } // Inst_VOP2__V_FMAC_F32
2284
2285 Inst_VOP2__V_FMAC_F32::~Inst_VOP2__V_FMAC_F32()
2286 {
2287 } // ~Inst_VOP2__V_FMAC_F32
2288
2289 // --- description from .arch file ---
2290 // D.f = S0.f * S1.f + D.f.
2291 void
2292 Inst_VOP2__V_FMAC_F32::execute(GPUDynInstPtr gpuDynInst)
2293 {
2294 Wavefront *wf = gpuDynInst->wavefront();
2295 ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
2296 ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
2297 VecOperandF32 vdst(gpuDynInst, instData.VDST);
2298
2299 src0.readSrc();
2300 src1.read();
2301 vdst.read();
2302
2303 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
2304 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2305
2306 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2307 if (wf->execMask(lane)) {
2308 vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]);
2309 }
2310 }
2311
2312 vdst.write();
2313 } // execute
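// std::fma rounds once, matching fused multiply-accumulate hardware,
// which can differ from a separately rounded multiply-then-add. For
// example, with float a = 1 + 2^-23, the exact square is
// a*a = 1 + 2^-22 + 2^-46; fma(a, a, -(1 + 2^-22)) returns 2^-46
// exactly, while (a*a) + (-(1 + 2^-22)) rounds the product first and
// returns 0.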
2314 // --- Inst_VOP2__V_XNOR_B32 class methods ---
2315
2316 Inst_VOP2__V_XNOR_B32::Inst_VOP2__V_XNOR_B32(InFmt_VOP2 *iFmt)
2317 : Inst_VOP2(iFmt, "v_xnor_b32")
2318 {
2319 setFlag(ALU);
2320 } // Inst_VOP2__V_XNOR_B32
2321
2322 Inst_VOP2__V_XNOR_B32::~Inst_VOP2__V_XNOR_B32()
2323 {
2324 } // ~Inst_VOP2__V_XNOR_B32
2325
2326 // --- description from .arch file ---
2327 // D.u = ~(S0.u ^ S1.u).
2328 void
2329 Inst_VOP2__V_XNOR_B32::execute(GPUDynInstPtr gpuDynInst)
2330 {
2331 Wavefront *wf = gpuDynInst->wavefront();
2332 ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
2333 ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
2334 VecOperandU32 vdst(gpuDynInst, instData.VDST);
2335
2336 src0.readSrc();
2337 src1.read();
2338 vdst.read();
2339
2340 panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
2341 panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
2342
2343 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
2344 if (wf->execMask(lane)) {
2345 vdst[lane] = ~(src0[lane] ^ src1[lane]);
2346 }
2347 }
2348
2349 vdst.write();
2350 } // execute
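// Bitwise example: src0 = 0xF0F0F0F0 and src1 = 0xFF00FF00 give
// src0 ^ src1 = 0x0FF00FF0, so the XNOR result is 0xF00FF00F.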
2351} // namespace VegaISA
2352} // namespace gem5

Generated on Mon Jan 13 2025 04:28:00 for gem5 by doxygen 1.9.8