vop2.cc
/*
 * Copyright (c) 2024 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "arch/amdgpu/vega/insts/inst_util.hh"
#include "arch/amdgpu/vega/insts/instructions.hh"
#include "debug/VEGA.hh"

namespace gem5
{

namespace VegaISA
{
    // --- Inst_VOP2__V_CNDMASK_B32 class methods ---

    Inst_VOP2__V_CNDMASK_B32::Inst_VOP2__V_CNDMASK_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_cndmask_b32")
    {
        setFlag(ALU);
        setFlag(ReadsVCC);
    } // Inst_VOP2__V_CNDMASK_B32

    Inst_VOP2__V_CNDMASK_B32::~Inst_VOP2__V_CNDMASK_B32()
    {
    } // ~Inst_VOP2__V_CNDMASK_B32

    // --- description from .arch file ---
    // D.u = (VCC[i] ? S1.u : S0.u) (i = threadID in wave); VOP3: specify VCC
    // as a scalar GPR in S2.
    void
    Inst_VOP2__V_CNDMASK_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ConstScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();
        vcc.read();
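
        // Lane i of the select is steered by bit i of the 64-bit VCC mask:
        // a set bit picks src1, a clear bit picks src0.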
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane]
                    = bits(vcc.rawData(), lane) ? src1[lane] : src0[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_ADD_F32 class methods ---

    Inst_VOP2__V_ADD_F32::Inst_VOP2__V_ADD_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_add_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_ADD_F32

    Inst_VOP2__V_ADD_F32::~Inst_VOP2__V_ADD_F32()
    {
    } // ~Inst_VOP2__V_ADD_F32

    // --- description from .arch file ---
    // D.f = S0.f + S1.f.
    void
    Inst_VOP2__V_ADD_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        VecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();
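
        // DPP (Data Parallel Primitives) encodings let src0 be fetched from
        // a neighboring lane according to DPP_CTRL; processDPP performs that
        // cross-lane shuffle (and applies the abs/neg modifiers) before the
        // per-lane add below.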
        if (isDPPInst()) {
            VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
            src0_dpp.read();

            DPRINTF(VEGA, "Handling V_ADD_F32 SRC DPP. SRC0: register v[%d], "
                    "DPP_CTRL: %#x, SRC0_ABS: %d, SRC0_NEG: %d, "
                    "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, "
                    "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0,
                    extData.iFmt_VOP_DPP.DPP_CTRL,
                    extData.iFmt_VOP_DPP.SRC0_ABS,
                    extData.iFmt_VOP_DPP.SRC0_NEG,
                    extData.iFmt_VOP_DPP.SRC1_ABS,
                    extData.iFmt_VOP_DPP.SRC1_NEG,
                    extData.iFmt_VOP_DPP.BC,
                    extData.iFmt_VOP_DPP.BANK_MASK,
                    extData.iFmt_VOP_DPP.ROW_MASK);

            processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0_dpp[lane] + src1[lane];
                }
            }
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0[lane] + src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_SUB_F32 class methods ---

    Inst_VOP2__V_SUB_F32::Inst_VOP2__V_SUB_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_sub_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_SUB_F32

    Inst_VOP2__V_SUB_F32::~Inst_VOP2__V_SUB_F32()
    {
    } // ~Inst_VOP2__V_SUB_F32

    // --- description from .arch file ---
    // D.f = S0.f - S1.f.
    // SQ translates to V_ADD_F32.
    void
    Inst_VOP2__V_SUB_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] - src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_SUBREV_F32 class methods ---

    Inst_VOP2__V_SUBREV_F32::Inst_VOP2__V_SUBREV_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_subrev_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_SUBREV_F32

    Inst_VOP2__V_SUBREV_F32::~Inst_VOP2__V_SUBREV_F32()
    {
    } // ~Inst_VOP2__V_SUBREV_F32

    // --- description from .arch file ---
    // D.f = S1.f - S0.f.
    // SQ translates to V_ADD_F32.
    void
    Inst_VOP2__V_SUBREV_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] - src0[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MUL_LEGACY_F32 class methods ---

    Inst_VOP2__V_MUL_LEGACY_F32::Inst_VOP2__V_MUL_LEGACY_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_legacy_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_MUL_LEGACY_F32

    Inst_VOP2__V_MUL_LEGACY_F32::~Inst_VOP2__V_MUL_LEGACY_F32()
    {
    } // ~Inst_VOP2__V_MUL_LEGACY_F32

    // --- description from .arch file ---
    // D.f = S0.f * S1.f (DX9 rules, 0.0*x = 0.0).
    void
    Inst_VOP2__V_MUL_LEGACY_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] * src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MUL_F32 class methods ---

    Inst_VOP2__V_MUL_F32::Inst_VOP2__V_MUL_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_MUL_F32

    Inst_VOP2__V_MUL_F32::~Inst_VOP2__V_MUL_F32()
    {
    } // ~Inst_VOP2__V_MUL_F32

    // --- description from .arch file ---
    // D.f = S0.f * S1.f.
    void
    Inst_VOP2__V_MUL_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();
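
        // The chain below spells out the multiply's edge cases lane by lane:
        // any NaN input yields NaN, (+/-0 or denormal) times infinity yields
        // NaN, and zero/infinity results carry the sign of the product.
        // Denormal inputs are treated as zeros of the same sign; only the
        // final else performs an ordinary multiply.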
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                if (std::isnan(src0[lane]) ||
                    std::isnan(src1[lane])) {
                    vdst[lane] = NAN;
                } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
                           std::fpclassify(src0[lane]) == FP_ZERO) &&
                           !std::signbit(src0[lane])) {
                    if (std::isinf(src1[lane])) {
                        vdst[lane] = NAN;
                    } else if (!std::signbit(src1[lane])) {
                        vdst[lane] = +0.0;
                    } else {
                        vdst[lane] = -0.0;
                    }
                } else if ((std::fpclassify(src0[lane]) == FP_SUBNORMAL ||
                           std::fpclassify(src0[lane]) == FP_ZERO) &&
                           std::signbit(src0[lane])) {
                    if (std::isinf(src1[lane])) {
                        vdst[lane] = NAN;
                    } else if (std::signbit(src1[lane])) {
                        vdst[lane] = +0.0;
                    } else {
                        vdst[lane] = -0.0;
                    }
                } else if (std::isinf(src0[lane]) &&
                           !std::signbit(src0[lane])) {
                    if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
                        std::fpclassify(src1[lane]) == FP_ZERO) {
                        vdst[lane] = NAN;
                    } else if (!std::signbit(src1[lane])) {
                        vdst[lane] = +INFINITY;
                    } else {
                        vdst[lane] = -INFINITY;
                    }
                } else if (std::isinf(src0[lane]) &&
                           std::signbit(src0[lane])) {
                    if (std::fpclassify(src1[lane]) == FP_SUBNORMAL ||
                        std::fpclassify(src1[lane]) == FP_ZERO) {
                        vdst[lane] = NAN;
                    } else if (std::signbit(src1[lane])) {
                        vdst[lane] = +INFINITY;
                    } else {
                        vdst[lane] = -INFINITY;
                    }
                } else {
                    vdst[lane] = src0[lane] * src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MUL_I32_I24 class methods ---

    Inst_VOP2__V_MUL_I32_I24::Inst_VOP2__V_MUL_I32_I24(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_i32_i24")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MUL_I32_I24

    Inst_VOP2__V_MUL_I32_I24::~Inst_VOP2__V_MUL_I32_I24()
    {
    } // ~Inst_VOP2__V_MUL_I32_I24

    // --- description from .arch file ---
    // D.i = S0.i[23:0] * S1.i[23:0].
    void
    Inst_VOP2__V_MUL_I32_I24::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();
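
        // Each operand is truncated to its low 24 bits and sign-extended
        // before the multiply; the destination keeps the low 32 bits of the
        // product.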
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = sext<24>(bits(src0[lane], 23, 0))
                    * sext<24>(bits(src1[lane], 23, 0));
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MUL_HI_I32_I24 class methods ---

    Inst_VOP2__V_MUL_HI_I32_I24::Inst_VOP2__V_MUL_HI_I32_I24(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_hi_i32_i24")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MUL_HI_I32_I24

    Inst_VOP2__V_MUL_HI_I32_I24::~Inst_VOP2__V_MUL_HI_I32_I24()
    {
    } // ~Inst_VOP2__V_MUL_HI_I32_I24

    // --- description from .arch file ---
    // D.i = (S0.i[23:0] * S1.i[23:0])>>32.
    void
    Inst_VOP2__V_MUL_HI_I32_I24::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();
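
        // The 24-bit operands are widened to 64 bits so the (at most 48-bit)
        // product is computed exactly; the result keeps bits [63:32] of that
        // product.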
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                VecElemI64 tmp_src0
                    = (VecElemI64)sext<24>(bits(src0[lane], 23, 0));
                VecElemI64 tmp_src1
                    = (VecElemI64)sext<24>(bits(src1[lane], 23, 0));

                vdst[lane] = (VecElemI32)((tmp_src0 * tmp_src1) >> 32);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MUL_U32_U24 class methods ---

    Inst_VOP2__V_MUL_U32_U24::Inst_VOP2__V_MUL_U32_U24(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_u32_u24")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MUL_U32_U24

    Inst_VOP2__V_MUL_U32_U24::~Inst_VOP2__V_MUL_U32_U24()
    {
    } // ~Inst_VOP2__V_MUL_U32_U24

    // --- description from .arch file ---
    // D.u = S0.u[23:0] * S1.u[23:0].
    void
    Inst_VOP2__V_MUL_U32_U24::execute(GPUDynInstPtr gpuDynInst)
    {
        auto opImpl = [](VecOperandU32& src0, VecOperandU32& src1,
                         VecOperandU32& vdst, Wavefront* wf) {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = bits(src0[lane], 23, 0) *
                                 bits(src1[lane], 23, 0);
                }
            }
        };
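
        // Unlike the hand-rolled loops above, this opcode routes through
        // vop2Helper, which is expected to read the operands, handle the
        // SDWA/DPP encodings, apply opImpl for the per-lane arithmetic, and
        // write vdst back.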
        vop2Helper<VecOperandU32>(gpuDynInst, opImpl);
    } // execute
    // --- Inst_VOP2__V_MUL_HI_U32_U24 class methods ---

    Inst_VOP2__V_MUL_HI_U32_U24::Inst_VOP2__V_MUL_HI_U32_U24(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_hi_u32_u24")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MUL_HI_U32_U24

    Inst_VOP2__V_MUL_HI_U32_U24::~Inst_VOP2__V_MUL_HI_U32_U24()
    {
    } // ~Inst_VOP2__V_MUL_HI_U32_U24

    // --- description from .arch file ---
    // D.i = (S0.u[23:0] * S1.u[23:0])>>32.
    void
    Inst_VOP2__V_MUL_HI_U32_U24::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                VecElemU64 tmp_src0 = (VecElemU64)bits(src0[lane], 23, 0);
                VecElemU64 tmp_src1 = (VecElemU64)bits(src1[lane], 23, 0);
                vdst[lane] = (VecElemU32)((tmp_src0 * tmp_src1) >> 32);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MIN_F32 class methods ---

    Inst_VOP2__V_MIN_F32::Inst_VOP2__V_MIN_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_min_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_MIN_F32

    Inst_VOP2__V_MIN_F32::~Inst_VOP2__V_MIN_F32()
    {
    } // ~Inst_VOP2__V_MIN_F32

    // --- description from .arch file ---
    // D.f = (S0.f < S1.f ? S0.f : S1.f).
    void
    Inst_VOP2__V_MIN_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();
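
        // Note that std::fmin is not the raw ternary from the description:
        // when exactly one input is NaN it returns the other (non-NaN)
        // operand rather than propagating the NaN.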
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::fmin(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MAX_F32 class methods ---

    Inst_VOP2__V_MAX_F32::Inst_VOP2__V_MAX_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_max_f32")
    {
        setFlag(ALU);
        setFlag(F32);
    } // Inst_VOP2__V_MAX_F32

    Inst_VOP2__V_MAX_F32::~Inst_VOP2__V_MAX_F32()
    {
    } // ~Inst_VOP2__V_MAX_F32

    // --- description from .arch file ---
    // D.f = (S0.f >= S1.f ? S0.f : S1.f).
    void
    Inst_VOP2__V_MAX_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::fmax(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MIN_I32 class methods ---

    Inst_VOP2__V_MIN_I32::Inst_VOP2__V_MIN_I32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_min_i32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MIN_I32

    Inst_VOP2__V_MIN_I32::~Inst_VOP2__V_MIN_I32()
    {
    } // ~Inst_VOP2__V_MIN_I32

    // --- description from .arch file ---
    // D.i = min(S0.i, S1.i).
    void
    Inst_VOP2__V_MIN_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::min(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MAX_I32 class methods ---

    Inst_VOP2__V_MAX_I32::Inst_VOP2__V_MAX_I32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_max_i32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MAX_I32

    Inst_VOP2__V_MAX_I32::~Inst_VOP2__V_MAX_I32()
    {
    } // ~Inst_VOP2__V_MAX_I32

    // --- description from .arch file ---
    // D.i = max(S0.i, S1.i).
    void
    Inst_VOP2__V_MAX_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::max(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MIN_U32 class methods ---

    Inst_VOP2__V_MIN_U32::Inst_VOP2__V_MIN_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_min_u32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MIN_U32

    Inst_VOP2__V_MIN_U32::~Inst_VOP2__V_MIN_U32()
    {
    } // ~Inst_VOP2__V_MIN_U32

    // --- description from .arch file ---
    // D.u = min(S0.u, S1.u).
    void
    Inst_VOP2__V_MIN_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::min(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MAX_U32 class methods ---

    Inst_VOP2__V_MAX_U32::Inst_VOP2__V_MAX_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_max_u32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MAX_U32

    Inst_VOP2__V_MAX_U32::~Inst_VOP2__V_MAX_U32()
    {
    } // ~Inst_VOP2__V_MAX_U32

    // --- description from .arch file ---
    // D.u = max(S0.u, S1.u).
    void
    Inst_VOP2__V_MAX_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::max(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_LSHRREV_B32 class methods ---

    Inst_VOP2__V_LSHRREV_B32::Inst_VOP2__V_LSHRREV_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_lshrrev_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_LSHRREV_B32

    Inst_VOP2__V_LSHRREV_B32::~Inst_VOP2__V_LSHRREV_B32()
    {
    } // ~Inst_VOP2__V_LSHRREV_B32

    // --- description from .arch file ---
    // D.u = S1.u >> S0.u[4:0].
    // The vacated bits are set to zero.
    // SQ translates this to an internal SP opcode.
    void
    Inst_VOP2__V_LSHRREV_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();
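
        // "rev" shifts take the shift amount from SRC0, which may be an SGPR
        // or an inline constant, while the value being shifted stays in a
        // VGPR (VSRC1) -- the reverse of the non-rev operand order.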
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_ASHRREV_I32 class methods ---

    Inst_VOP2__V_ASHRREV_I32::Inst_VOP2__V_ASHRREV_I32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_ashrrev_i32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_ASHRREV_I32

    Inst_VOP2__V_ASHRREV_I32::~Inst_VOP2__V_ASHRREV_I32()
    {
    } // ~Inst_VOP2__V_ASHRREV_I32

    // --- description from .arch file ---
    // D.i = signext(S1.i) >> S0.i[4:0].
    // The vacated bits are set to the sign bit of the input value.
    // SQ translates this to an internal SP opcode.
    void
    Inst_VOP2__V_ASHRREV_I32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI32 src1(gpuDynInst, instData.VSRC1);
        VecOperandI32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] >> bits(src0[lane], 4, 0);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_LSHLREV_B32 class methods ---

    Inst_VOP2__V_LSHLREV_B32::Inst_VOP2__V_LSHLREV_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_lshlrev_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_LSHLREV_B32

    Inst_VOP2__V_LSHLREV_B32::~Inst_VOP2__V_LSHLREV_B32()
    {
    } // ~Inst_VOP2__V_LSHLREV_B32

    // --- description from .arch file ---
    // D.u = S1.u << S0.u[4:0].
    // SQ translates this to an internal SP opcode.
    void
    Inst_VOP2__V_LSHLREV_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        VecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();
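
        // SDWA (Sub-DWORD Addressing) encodings operate on selected bytes or
        // words of the 32-bit operands. The untouched orig* copies let
        // processSDWA_src build the selected source values, and
        // processSDWA_dst merges the result back into the unselected parts
        // of the destination.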
        if (isSDWAInst()) {
            VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
            // use copies of original src0, src1, and vdst during selecting
            VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                        extData.iFmt_VOP_SDWA.SRC0);
            VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
            VecOperandU32 origVdst(gpuDynInst, instData.VDST);

            src0_sdwa.read();
            origSrc0_sdwa.read();
            origSrc1.read();

            DPRINTF(VEGA, "Handling V_LSHLREV_B32 SRC SDWA. SRC0: register "
                    "v[%d], DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: "
                    "%d, SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: "
                    "%d, SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
                    extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL,
                    extData.iFmt_VOP_SDWA.DST_U,
                    extData.iFmt_VOP_SDWA.CLMP,
                    extData.iFmt_VOP_SDWA.SRC0_SEL,
                    extData.iFmt_VOP_SDWA.SRC0_SEXT,
                    extData.iFmt_VOP_SDWA.SRC0_NEG,
                    extData.iFmt_VOP_SDWA.SRC0_ABS,
                    extData.iFmt_VOP_SDWA.SRC1_SEL,
                    extData.iFmt_VOP_SDWA.SRC1_SEXT,
                    extData.iFmt_VOP_SDWA.SRC1_NEG,
                    extData.iFmt_VOP_SDWA.SRC1_ABS);

            processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
                            src1, origSrc1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src1[lane] << bits(src0_sdwa[lane], 4, 0);
                    origVdst[lane] = vdst[lane]; // keep copy consistent
                }
            }

            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src1[lane] << bits(src0[lane], 4, 0);
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_AND_B32 class methods ---

    Inst_VOP2__V_AND_B32::Inst_VOP2__V_AND_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_and_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_AND_B32

    Inst_VOP2__V_AND_B32::~Inst_VOP2__V_AND_B32()
    {
    } // ~Inst_VOP2__V_AND_B32

    // --- description from .arch file ---
    // D.u = S0.u & S1.u.
    // Input and output modifiers not supported.
    void
    Inst_VOP2__V_AND_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        VecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        if (isDPPInst()) {
            VecOperandU32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
            src0_dpp.read();

            DPRINTF(VEGA, "Handling V_AND_B32 SRC DPP. SRC0: register v[%d], "
                    "DPP_CTRL: %#x, SRC0_ABS: %d, SRC0_NEG: %d, "
                    "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, "
                    "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0,
                    extData.iFmt_VOP_DPP.DPP_CTRL,
                    extData.iFmt_VOP_DPP.SRC0_ABS,
                    extData.iFmt_VOP_DPP.SRC0_NEG,
                    extData.iFmt_VOP_DPP.SRC1_ABS,
                    extData.iFmt_VOP_DPP.SRC1_NEG,
                    extData.iFmt_VOP_DPP.BC,
                    extData.iFmt_VOP_DPP.BANK_MASK,
                    extData.iFmt_VOP_DPP.ROW_MASK);

            processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0_dpp[lane] & src1[lane];
                }
            }
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0[lane] & src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_OR_B32 class methods ---

    Inst_VOP2__V_OR_B32::Inst_VOP2__V_OR_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_or_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_OR_B32

    Inst_VOP2__V_OR_B32::~Inst_VOP2__V_OR_B32()
    {
    } // ~Inst_VOP2__V_OR_B32

    // --- description from .arch file ---
    // D.u = S0.u | S1.u.
    // Input and output modifiers not supported.
    void
    Inst_VOP2__V_OR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        VecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        if (isSDWAInst()) {
            VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
            // use copies of original src0, src1, and dest during selecting
            VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                        extData.iFmt_VOP_SDWA.SRC0);
            VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
            VecOperandU32 origVdst(gpuDynInst, instData.VDST);

            src0_sdwa.read();
            origSrc0_sdwa.read();
            origSrc1.read();

            DPRINTF(VEGA, "Handling V_OR_B32 SRC SDWA. SRC0: register v[%d], "
                    "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, "
                    "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, "
                    "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
                    extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL,
                    extData.iFmt_VOP_SDWA.DST_U,
                    extData.iFmt_VOP_SDWA.CLMP,
                    extData.iFmt_VOP_SDWA.SRC0_SEL,
                    extData.iFmt_VOP_SDWA.SRC0_SEXT,
                    extData.iFmt_VOP_SDWA.SRC0_NEG,
                    extData.iFmt_VOP_SDWA.SRC0_ABS,
                    extData.iFmt_VOP_SDWA.SRC1_SEL,
                    extData.iFmt_VOP_SDWA.SRC1_SEXT,
                    extData.iFmt_VOP_SDWA.SRC1_NEG,
                    extData.iFmt_VOP_SDWA.SRC1_ABS);

            processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
                            src1, origSrc1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0_sdwa[lane] | src1[lane];
                    origVdst[lane] = vdst[lane]; // keep copy consistent
                }
            }

            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0[lane] | src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_XOR_B32 class methods ---

    Inst_VOP2__V_XOR_B32::Inst_VOP2__V_XOR_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_xor_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_XOR_B32

    Inst_VOP2__V_XOR_B32::~Inst_VOP2__V_XOR_B32()
    {
    } // ~Inst_VOP2__V_XOR_B32

    // --- description from .arch file ---
    // D.u = S0.u ^ S1.u.
    // Input and output modifiers not supported.
    void
    Inst_VOP2__V_XOR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] ^ src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MAC_F32 class methods ---

    Inst_VOP2__V_MAC_F32::Inst_VOP2__V_MAC_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mac_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(MAC);
    } // Inst_VOP2__V_MAC_F32

    Inst_VOP2__V_MAC_F32::~Inst_VOP2__V_MAC_F32()
    {
    } // ~Inst_VOP2__V_MAC_F32

    // --- description from .arch file ---
    // D.f = S0.f * S1.f + D.f.
    // SQ translates to V_MAD_F32.
    void
    Inst_VOP2__V_MAC_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        VecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();
        vdst.read();
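
        // MAC reads vdst as a third source operand (the accumulator);
        // std::fma computes src0 * src1 + vdst with a single rounding step.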
        if (isDPPInst()) {
            VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
            src0_dpp.read();

            DPRINTF(VEGA, "Handling V_MAC_F32 SRC DPP. SRC0: register v[%d], "
                    "DPP_CTRL: %#x, SRC0_ABS: %d, SRC0_NEG: %d, "
                    "SRC1_ABS: %d, SRC1_NEG: %d, BC: %d, "
                    "BANK_MASK: %d, ROW_MASK: %d\n", extData.iFmt_VOP_DPP.SRC0,
                    extData.iFmt_VOP_DPP.DPP_CTRL,
                    extData.iFmt_VOP_DPP.SRC0_ABS,
                    extData.iFmt_VOP_DPP.SRC0_NEG,
                    extData.iFmt_VOP_DPP.SRC1_ABS,
                    extData.iFmt_VOP_DPP.SRC1_NEG,
                    extData.iFmt_VOP_DPP.BC,
                    extData.iFmt_VOP_DPP.BANK_MASK,
                    extData.iFmt_VOP_DPP.ROW_MASK);

            processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = std::fma(src0_dpp[lane], src1[lane],
                                          vdst[lane]);
                }
            }
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]);
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MADMK_F32 class methods ---

    Inst_VOP2__V_MADMK_F32::Inst_VOP2__V_MADMK_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_madmk_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(MAD);
    } // Inst_VOP2__V_MADMK_F32

    Inst_VOP2__V_MADMK_F32::~Inst_VOP2__V_MADMK_F32()
    {
    } // ~Inst_VOP2__V_MADMK_F32

    // --- description from .arch file ---
    // D.f = S0.f * K + S1.f; K is a 32-bit inline constant.
    // This opcode cannot use the VOP3 encoding and cannot use input/output
    // modifiers.
    // SQ translates to V_MAD_F32.
    void
    Inst_VOP2__V_MADMK_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);
        VecElemF32 k = extData.imm_f32;

        src0.readSrc();
        src1.read();
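
        // K is the literal constant in the DWORD that follows the
        // instruction (imm_f32 in extData); as noted above, it is not
        // subject to input/output modifiers.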
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::fma(src0[lane], k, src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MADAK_F32 class methods ---

    Inst_VOP2__V_MADAK_F32::Inst_VOP2__V_MADAK_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_madak_f32")
    {
        setFlag(ALU);
        setFlag(F32);
        setFlag(MAD);
    } // Inst_VOP2__V_MADAK_F32

    Inst_VOP2__V_MADAK_F32::~Inst_VOP2__V_MADAK_F32()
    {
    } // ~Inst_VOP2__V_MADAK_F32

    // --- description from .arch file ---
    // D.f = S0.f * S1.f + K; K is a 32-bit inline constant.
    // This opcode cannot use the VOP3 encoding and cannot use input/output
    // modifiers.
    // SQ translates to V_MAD_F32.
    void
    Inst_VOP2__V_MADAK_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);
        VecElemF32 k = extData.imm_f32;

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::fma(src0[lane], src1[lane], k);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_ADD_CO_U32 class methods ---

    Inst_VOP2__V_ADD_CO_U32::Inst_VOP2__V_ADD_CO_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_add_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
    } // Inst_VOP2__V_ADD_CO_U32

    Inst_VOP2__V_ADD_CO_U32::~Inst_VOP2__V_ADD_CO_U32()
    {
    } // ~Inst_VOP2__V_ADD_CO_U32

    // --- description from .arch file ---
    // D.u = S0.u + S1.u;
    // VCC[threadId] = (S0.u + S1.u >= 0x100000000ULL ? 1 : 0) is an UNSIGNED
    // overflow or carry-out for V_ADDC_U32.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
    void
    Inst_VOP2__V_ADD_CO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        VecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();
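
        // The carry-out is detected by redoing the addition in 64 bits and
        // checking whether the sum reaches 2^32; each lane's carry lands in
        // the corresponding bit of VCC.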
        if (isSDWAInst()) {
            VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
            // use copies of original src0, src1, and dest during selecting
            VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                        extData.iFmt_VOP_SDWA.SRC0);
            VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
            VecOperandU32 origVdst(gpuDynInst, instData.VDST);

            src0_sdwa.read();
            origSrc0_sdwa.read();
            origSrc1.read();

            DPRINTF(VEGA, "Handling V_ADD_CO_U32 SRC SDWA. SRC0: register "
                    "v[%d], DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, "
                    "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, "
                    "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
                    extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL,
                    extData.iFmt_VOP_SDWA.DST_U,
                    extData.iFmt_VOP_SDWA.CLMP,
                    extData.iFmt_VOP_SDWA.SRC0_SEL,
                    extData.iFmt_VOP_SDWA.SRC0_SEXT,
                    extData.iFmt_VOP_SDWA.SRC0_NEG,
                    extData.iFmt_VOP_SDWA.SRC0_ABS,
                    extData.iFmt_VOP_SDWA.SRC1_SEL,
                    extData.iFmt_VOP_SDWA.SRC1_SEXT,
                    extData.iFmt_VOP_SDWA.SRC1_NEG,
                    extData.iFmt_VOP_SDWA.SRC1_ABS);

            processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
                            src1, origSrc1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0_sdwa[lane] + src1[lane];
                    origVdst[lane] = vdst[lane]; // keep copy consistent
                    vcc.setBit(lane, ((VecElemU64)src0_sdwa[lane]
                        + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0);
                }
            }

            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0[lane] + src1[lane];
                    vcc.setBit(lane, ((VecElemU64)src0[lane]
                        + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0);
                }
            }
        }

        vcc.write();
        vdst.write();
    } // execute
    // --- Inst_VOP2__V_SUB_CO_U32 class methods ---

    Inst_VOP2__V_SUB_CO_U32::Inst_VOP2__V_SUB_CO_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_sub_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
    } // Inst_VOP2__V_SUB_CO_U32

    Inst_VOP2__V_SUB_CO_U32::~Inst_VOP2__V_SUB_CO_U32()
    {
    } // ~Inst_VOP2__V_SUB_CO_U32

    // --- description from .arch file ---
    // D.u = S0.u - S1.u;
    // VCC[threadId] = (S1.u > S0.u ? 1 : 0) is an UNSIGNED overflow or
    // carry-out for V_SUBB_U32.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
    void
    Inst_VOP2__V_SUB_CO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();
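
        // Unsigned borrow-out: the VCC bit is set exactly when the
        // subtrahend exceeds the minuend and the 32-bit difference wraps.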
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] - src1[lane];
                vcc.setBit(lane, src1[lane] > src0[lane] ? 1 : 0);
            }
        }

        vdst.write();
        vcc.write();
    } // execute
    // --- Inst_VOP2__V_SUBREV_CO_U32 class methods ---

    Inst_VOP2__V_SUBREV_CO_U32::Inst_VOP2__V_SUBREV_CO_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_subrev_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
    } // Inst_VOP2__V_SUBREV_CO_U32

    Inst_VOP2__V_SUBREV_CO_U32::~Inst_VOP2__V_SUBREV_CO_U32()
    {
    } // ~Inst_VOP2__V_SUBREV_CO_U32

    // --- description from .arch file ---
    // D.u = S1.u - S0.u;
    // VCC[threadId] = (S0.u > S1.u ? 1 : 0) is an UNSIGNED overflow or
    // carry-out for V_SUBB_U32.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
    void
    Inst_VOP2__V_SUBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] - src0[lane];
                vcc.setBit(lane, src0[lane] > src1[lane] ? 1 : 0);
            }
        }

        vdst.write();
        vcc.write();
    } // execute
    // --- Inst_VOP2__V_ADDC_CO_U32 class methods ---

    Inst_VOP2__V_ADDC_CO_U32::Inst_VOP2__V_ADDC_CO_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_addc_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(ReadsVCC);
    } // Inst_VOP2__V_ADDC_CO_U32

    Inst_VOP2__V_ADDC_CO_U32::~Inst_VOP2__V_ADDC_CO_U32()
    {
    } // ~Inst_VOP2__V_ADDC_CO_U32

    // --- description from .arch file ---
    // D.u = S0.u + S1.u + VCC[threadId];
    // VCC[threadId] = (S0.u + S1.u + VCC[threadId] >= 0x100000000ULL ? 1 : 0)
    // is an UNSIGNED overflow.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
    // source comes from the SGPR-pair at S2.u.
    void
    Inst_VOP2__V_ADDC_CO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();
        vcc.read();
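
        // Add-with-carry: each lane consumes its current VCC bit as carry-in
        // and produces a new carry-out, again computed via a 64-bit sum.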
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] + src1[lane]
                    + bits(vcc.rawData(), lane);
                vcc.setBit(lane, ((VecElemU64)src0[lane]
                    + (VecElemU64)src1[lane]
                    + (VecElemU64)bits(vcc.rawData(), lane, lane))
                    >= 0x100000000 ? 1 : 0);
            }
        }

        vdst.write();
        vcc.write();
    } // execute
    // --- Inst_VOP2__V_SUBB_CO_U32 class methods ---

    Inst_VOP2__V_SUBB_CO_U32::Inst_VOP2__V_SUBB_CO_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_subb_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(ReadsVCC);
    } // Inst_VOP2__V_SUBB_CO_U32

    Inst_VOP2__V_SUBB_CO_U32::~Inst_VOP2__V_SUBB_CO_U32()
    {
    } // ~Inst_VOP2__V_SUBB_CO_U32

    // --- description from .arch file ---
    // D.u = S0.u - S1.u - VCC[threadId];
    // VCC[threadId] = (S1.u + VCC[threadId] > S0.u ? 1 : 0) is an UNSIGNED
    // overflow.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
    // source comes from the SGPR-pair at S2.u.
    void
    Inst_VOP2__V_SUBB_CO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();
        vcc.read();
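
        // Subtract-with-borrow: the incoming VCC bit joins the subtrahend,
        // and the new borrow is set when src1 plus the borrow exceeds src0.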
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane]
                    = src0[lane] - src1[lane] - bits(vcc.rawData(), lane);
                vcc.setBit(lane, (src1[lane] + bits(vcc.rawData(), lane))
                    > src0[lane] ? 1 : 0);
            }
        }

        vdst.write();
        vcc.write();
    } // execute
    // --- Inst_VOP2__V_SUBBREV_CO_U32 class methods ---

    Inst_VOP2__V_SUBBREV_CO_U32::Inst_VOP2__V_SUBBREV_CO_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_subbrev_co_u32")
    {
        setFlag(ALU);
        setFlag(WritesVCC);
        setFlag(ReadsVCC);
    } // Inst_VOP2__V_SUBBREV_CO_U32

    Inst_VOP2__V_SUBBREV_CO_U32::~Inst_VOP2__V_SUBBREV_CO_U32()
    {
    } // ~Inst_VOP2__V_SUBBREV_CO_U32

    // --- description from .arch file ---
    // D.u = S1.u - S0.u - VCC[threadId];
    // VCC[threadId] = (S0.u + VCC[threadId] > S1.u ? 1 : 0) is an UNSIGNED
    // overflow.
    // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
    // source comes from the SGPR-pair at S2.u.
    // SQ translates this to V_SUBB_U32 with reversed operands.
    void
    Inst_VOP2__V_SUBBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);
        ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);

        src0.readSrc();
        src1.read();
        vcc.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane]
                    = src1[lane] - src0[lane] - bits(vcc.rawData(), lane);
                vcc.setBit(lane, (src0[lane] + bits(vcc.rawData(), lane))
                    > src1[lane] ? 1 : 0);
            }
        }

        vdst.write();
        vcc.write();
    } // execute
    // --- Inst_VOP2__V_ADD_F16 class methods ---

    Inst_VOP2__V_ADD_F16::Inst_VOP2__V_ADD_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_add_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP2__V_ADD_F16

    Inst_VOP2__V_ADD_F16::~Inst_VOP2__V_ADD_F16()
    {
    } // ~Inst_VOP2__V_ADD_F16

    // --- description from .arch file ---
    // D.f16 = S0.f16 + S1.f16.
    // Supports denormals, round mode, exception flags, saturation.
    void
    Inst_VOP2__V_ADD_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP2__V_SUB_F16 class methods ---

    Inst_VOP2__V_SUB_F16::Inst_VOP2__V_SUB_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_sub_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP2__V_SUB_F16

    Inst_VOP2__V_SUB_F16::~Inst_VOP2__V_SUB_F16()
    {
    } // ~Inst_VOP2__V_SUB_F16

    // --- description from .arch file ---
    // D.f16 = S0.f16 - S1.f16.
    // Supports denormals, round mode, exception flags, saturation.
    // SQ translates to V_ADD_F16.
    void
    Inst_VOP2__V_SUB_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP2__V_SUBREV_F16 class methods ---

    Inst_VOP2__V_SUBREV_F16::Inst_VOP2__V_SUBREV_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_subrev_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP2__V_SUBREV_F16

    Inst_VOP2__V_SUBREV_F16::~Inst_VOP2__V_SUBREV_F16()
    {
    } // ~Inst_VOP2__V_SUBREV_F16

    // --- description from .arch file ---
    // D.f16 = S1.f16 - S0.f16.
    // Supports denormals, round mode, exception flags, saturation.
    // SQ translates to V_ADD_F16.
    void
    Inst_VOP2__V_SUBREV_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP2__V_MUL_F16 class methods ---

    Inst_VOP2__V_MUL_F16::Inst_VOP2__V_MUL_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP2__V_MUL_F16

    Inst_VOP2__V_MUL_F16::~Inst_VOP2__V_MUL_F16()
    {
    } // ~Inst_VOP2__V_MUL_F16

    // --- description from .arch file ---
    // D.f16 = S0.f16 * S1.f16.
    // Supports denormals, round mode, exception flags, saturation.
    void
    Inst_VOP2__V_MUL_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP2__V_MAC_F16 class methods ---

    Inst_VOP2__V_MAC_F16::Inst_VOP2__V_MAC_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mac_f16")
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(MAC);
    } // Inst_VOP2__V_MAC_F16

    Inst_VOP2__V_MAC_F16::~Inst_VOP2__V_MAC_F16()
    {
    } // ~Inst_VOP2__V_MAC_F16

    // --- description from .arch file ---
    // D.f16 = S0.f16 * S1.f16 + D.f16.
    // Supports round mode, exception flags, saturation.
    // SQ translates this to V_MAD_F16.
    void
    Inst_VOP2__V_MAC_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP2__V_MADMK_F16 class methods ---

    Inst_VOP2__V_MADMK_F16::Inst_VOP2__V_MADMK_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_madmk_f16")
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(MAD);
    } // Inst_VOP2__V_MADMK_F16

    Inst_VOP2__V_MADMK_F16::~Inst_VOP2__V_MADMK_F16()
    {
    } // ~Inst_VOP2__V_MADMK_F16

    // --- description from .arch file ---
    // D.f16 = S0.f16 * K.f16 + S1.f16; K is a 16-bit inline constant stored
    // in the following literal DWORD.
    // This opcode cannot use the VOP3 encoding and cannot use input/output
    // modifiers. Supports round mode, exception flags, saturation.
    // SQ translates this to V_MAD_F16.
    void
    Inst_VOP2__V_MADMK_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP2__V_MADAK_F16 class methods ---

    Inst_VOP2__V_MADAK_F16::Inst_VOP2__V_MADAK_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_madak_f16")
    {
        setFlag(ALU);
        setFlag(F16);
        setFlag(MAD);
    } // Inst_VOP2__V_MADAK_F16

    Inst_VOP2__V_MADAK_F16::~Inst_VOP2__V_MADAK_F16()
    {
    } // ~Inst_VOP2__V_MADAK_F16

    // --- description from .arch file ---
    // D.f16 = S0.f16 * S1.f16 + K.f16; K is a 16-bit inline constant stored
    // in the following literal DWORD.
    // This opcode cannot use the VOP3 encoding and cannot use input/output
    // modifiers. Supports round mode, exception flags, saturation.
    // SQ translates this to V_MAD_F16.
    void
    Inst_VOP2__V_MADAK_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP2__V_ADD_U16 class methods ---

    Inst_VOP2__V_ADD_U16::Inst_VOP2__V_ADD_U16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_add_u16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_ADD_U16

    Inst_VOP2__V_ADD_U16::~Inst_VOP2__V_ADD_U16()
    {
    } // ~Inst_VOP2__V_ADD_U16

    // --- description from .arch file ---
    // D.u16 = S0.u16 + S1.u16.
    // Supports saturation (unsigned 16-bit integer domain).
    void
    Inst_VOP2__V_ADD_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] + src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_SUB_U16 class methods ---

    Inst_VOP2__V_SUB_U16::Inst_VOP2__V_SUB_U16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_sub_u16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_SUB_U16

    Inst_VOP2__V_SUB_U16::~Inst_VOP2__V_SUB_U16()
    {
    } // ~Inst_VOP2__V_SUB_U16

    // --- description from .arch file ---
    // D.u16 = S0.u16 - S1.u16.
    // Supports saturation (unsigned 16-bit integer domain).
    void
    Inst_VOP2__V_SUB_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] - src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_SUBREV_U16 class methods ---

    Inst_VOP2__V_SUBREV_U16::Inst_VOP2__V_SUBREV_U16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_subrev_u16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_SUBREV_U16

    Inst_VOP2__V_SUBREV_U16::~Inst_VOP2__V_SUBREV_U16()
    {
    } // ~Inst_VOP2__V_SUBREV_U16

    // --- description from .arch file ---
    // D.u16 = S1.u16 - S0.u16.
    // Supports saturation (unsigned 16-bit integer domain).
    // SQ translates this to V_SUB_U16 with reversed operands.
    void
    Inst_VOP2__V_SUBREV_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] - src0[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MUL_LO_U16 class methods ---

    Inst_VOP2__V_MUL_LO_U16::Inst_VOP2__V_MUL_LO_U16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_mul_lo_u16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MUL_LO_U16

    Inst_VOP2__V_MUL_LO_U16::~Inst_VOP2__V_MUL_LO_U16()
    {
    } // ~Inst_VOP2__V_MUL_LO_U16

    // --- description from .arch file ---
    // D.u16 = S0.u16 * S1.u16.
    // Supports saturation (unsigned 16-bit integer domain).
    void
    Inst_VOP2__V_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] * src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_LSHLREV_B16 class methods ---

    Inst_VOP2__V_LSHLREV_B16::Inst_VOP2__V_LSHLREV_B16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_lshlrev_b16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_LSHLREV_B16

    Inst_VOP2__V_LSHLREV_B16::~Inst_VOP2__V_LSHLREV_B16()
    {
    } // ~Inst_VOP2__V_LSHLREV_B16

    // --- description from .arch file ---
    // D.u[15:0] = S1.u[15:0] << S0.u[3:0].
    // SQ translates this to an internal SP opcode.
    void
    Inst_VOP2__V_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] << bits(src0[lane], 3, 0);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_LSHRREV_B16 class methods ---

    Inst_VOP2__V_LSHRREV_B16::Inst_VOP2__V_LSHRREV_B16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_lshrrev_b16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_LSHRREV_B16

    Inst_VOP2__V_LSHRREV_B16::~Inst_VOP2__V_LSHRREV_B16()
    {
    } // ~Inst_VOP2__V_LSHRREV_B16

    // --- description from .arch file ---
    // D.u[15:0] = S1.u[15:0] >> S0.u[3:0].
    // The vacated bits are set to zero.
    // SQ translates this to an internal SP opcode.
    void
    Inst_VOP2__V_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // only S0.u[3:0] supplies the shift amount per the spec above
                vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_ASHRREV_I16 class methods ---

    Inst_VOP2__V_ASHRREV_I16::Inst_VOP2__V_ASHRREV_I16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_ashrrev_i16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_ASHRREV_I16

    Inst_VOP2__V_ASHRREV_I16::~Inst_VOP2__V_ASHRREV_I16()
    {
    } // ~Inst_VOP2__V_ASHRREV_I16

    // --- description from .arch file ---
    // D.i[15:0] = signext(S1.i[15:0]) >> S0.i[3:0].
    // The vacated bits are set to the sign bit of the input value.
    // SQ translates this to an internal SP opcode.
    void
    Inst_VOP2__V_ASHRREV_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1);
        VecOperandI16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // only S0.i[3:0] supplies the shift amount per the spec above
                vdst[lane] = src1[lane] >> bits(src0[lane], 3, 0);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MAX_F16 class methods ---

    Inst_VOP2__V_MAX_F16::Inst_VOP2__V_MAX_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_max_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP2__V_MAX_F16

    Inst_VOP2__V_MAX_F16::~Inst_VOP2__V_MAX_F16()
    {
    } // ~Inst_VOP2__V_MAX_F16

    // --- description from .arch file ---
    // D.f16 = max(S0.f16, S1.f16).
    // IEEE compliant. Supports denormals, round mode, exception flags,
    // saturation.
    void
    Inst_VOP2__V_MAX_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP2__V_MIN_F16 class methods ---

    Inst_VOP2__V_MIN_F16::Inst_VOP2__V_MIN_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_min_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP2__V_MIN_F16

    Inst_VOP2__V_MIN_F16::~Inst_VOP2__V_MIN_F16()
    {
    } // ~Inst_VOP2__V_MIN_F16

    // --- description from .arch file ---
    // D.f16 = min(S0.f16, S1.f16).
    // IEEE compliant. Supports denormals, round mode, exception flags,
    // saturation.
    void
    Inst_VOP2__V_MIN_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP2__V_MAX_U16 class methods ---

    Inst_VOP2__V_MAX_U16::Inst_VOP2__V_MAX_U16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_max_u16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MAX_U16

    Inst_VOP2__V_MAX_U16::~Inst_VOP2__V_MAX_U16()
    {
    } // ~Inst_VOP2__V_MAX_U16

    // --- description from .arch file ---
    // D.u[15:0] = max(S0.u[15:0], S1.u[15:0]).
    void
    Inst_VOP2__V_MAX_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::max(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MAX_I16 class methods ---

    Inst_VOP2__V_MAX_I16::Inst_VOP2__V_MAX_I16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_max_i16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MAX_I16

    Inst_VOP2__V_MAX_I16::~Inst_VOP2__V_MAX_I16()
    {
    } // ~Inst_VOP2__V_MAX_I16

    // --- description from .arch file ---
    // D.i[15:0] = max(S0.i[15:0], S1.i[15:0]).
    void
    Inst_VOP2__V_MAX_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1);
        VecOperandI16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::max(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MIN_U16 class methods ---

    Inst_VOP2__V_MIN_U16::Inst_VOP2__V_MIN_U16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_min_u16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MIN_U16

    Inst_VOP2__V_MIN_U16::~Inst_VOP2__V_MIN_U16()
    {
    } // ~Inst_VOP2__V_MIN_U16

    // --- description from .arch file ---
    // D.u[15:0] = min(S0.u[15:0], S1.u[15:0]).
    void
    Inst_VOP2__V_MIN_U16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU16 src1(gpuDynInst, instData.VSRC1);
        VecOperandU16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::min(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_MIN_I16 class methods ---

    Inst_VOP2__V_MIN_I16::Inst_VOP2__V_MIN_I16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_min_i16")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_MIN_I16

    Inst_VOP2__V_MIN_I16::~Inst_VOP2__V_MIN_I16()
    {
    } // ~Inst_VOP2__V_MIN_I16

    // --- description from .arch file ---
    // D.i[15:0] = min(S0.i[15:0], S1.i[15:0]).
    void
    Inst_VOP2__V_MIN_I16::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandI16 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandI16 src1(gpuDynInst, instData.VSRC1);
        VecOperandI16 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::min(src0[lane], src1[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_LDEXP_F16 class methods ---

    Inst_VOP2__V_LDEXP_F16::Inst_VOP2__V_LDEXP_F16(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_ldexp_f16")
    {
        setFlag(ALU);
        setFlag(F16);
    } // Inst_VOP2__V_LDEXP_F16

    Inst_VOP2__V_LDEXP_F16::~Inst_VOP2__V_LDEXP_F16()
    {
    } // ~Inst_VOP2__V_LDEXP_F16

    // --- description from .arch file ---
    // D.f16 = S0.f16 * (2 ** S1.i16).
    void
    Inst_VOP2__V_LDEXP_F16::execute(GPUDynInstPtr gpuDynInst)
    {
        panicUnimplemented();
    } // execute
    // --- Inst_VOP2__V_ADD_U32 class methods ---

    Inst_VOP2__V_ADD_U32::Inst_VOP2__V_ADD_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_add_u32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_ADD_U32

    Inst_VOP2__V_ADD_U32::~Inst_VOP2__V_ADD_U32()
    {
    } // ~Inst_VOP2__V_ADD_U32

    // --- description from .arch file ---
    // D.u = S0.u + S1.u;
    void
    Inst_VOP2__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        VecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        if (isSDWAInst()) {
            VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
            // use copies of original src0, src1, and dest during selecting
            VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                        extData.iFmt_VOP_SDWA.SRC0);
            VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
            VecOperandU32 origVdst(gpuDynInst, instData.VDST);

            src0_sdwa.read();
            origSrc0_sdwa.read();
            origSrc1.read();

            DPRINTF(VEGA, "Handling V_ADD_U32 SRC SDWA. SRC0: register v[%d], "
                    "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, "
                    "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, "
                    "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
                    extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL,
                    extData.iFmt_VOP_SDWA.DST_U,
                    extData.iFmt_VOP_SDWA.CLMP,
                    extData.iFmt_VOP_SDWA.SRC0_SEL,
                    extData.iFmt_VOP_SDWA.SRC0_SEXT,
                    extData.iFmt_VOP_SDWA.SRC0_NEG,
                    extData.iFmt_VOP_SDWA.SRC0_ABS,
                    extData.iFmt_VOP_SDWA.SRC1_SEL,
                    extData.iFmt_VOP_SDWA.SRC1_SEXT,
                    extData.iFmt_VOP_SDWA.SRC1_NEG,
                    extData.iFmt_VOP_SDWA.SRC1_ABS);

            processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
                            src1, origSrc1);

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0_sdwa[lane] + src1[lane];
                    origVdst[lane] = vdst[lane]; // keep copy consistent
                }
            }

            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
        } else {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0[lane] + src1[lane];
                }
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_SUB_U32 class methods ---

    Inst_VOP2__V_SUB_U32::Inst_VOP2__V_SUB_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_sub_u32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_SUB_U32

    Inst_VOP2__V_SUB_U32::~Inst_VOP2__V_SUB_U32()
    {
    } // ~Inst_VOP2__V_SUB_U32

    // --- description from .arch file ---
    // D.u = S0.u - S1.u;
    void
    Inst_VOP2__V_SUB_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src0[lane] - src1[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_SUBREV_U32 class methods ---

    Inst_VOP2__V_SUBREV_U32::Inst_VOP2__V_SUBREV_U32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_subrev_u32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_SUBREV_U32

    Inst_VOP2__V_SUBREV_U32::~Inst_VOP2__V_SUBREV_U32()
    {
    } // ~Inst_VOP2__V_SUBREV_U32

    // --- description from .arch file ---
    // D.u = S1.u - S0.u;
    void
    Inst_VOP2__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = src1[lane] - src0[lane];
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_FMAC_F32 class methods ---

    Inst_VOP2__V_FMAC_F32::Inst_VOP2__V_FMAC_F32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_fmac_f32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_FMAC_F32

    Inst_VOP2__V_FMAC_F32::~Inst_VOP2__V_FMAC_F32()
    {
    } // ~Inst_VOP2__V_FMAC_F32

    // --- description from .arch file ---
    // D.f = S0.f * S1.f + D.f.
    void
    Inst_VOP2__V_FMAC_F32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
        VecOperandF32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();
        vdst.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]);
            }
        }

        vdst.write();
    } // execute
    // --- Inst_VOP2__V_XNOR_B32 class methods ---

    Inst_VOP2__V_XNOR_B32::Inst_VOP2__V_XNOR_B32(InFmt_VOP2 *iFmt)
        : Inst_VOP2(iFmt, "v_xnor_b32")
    {
        setFlag(ALU);
    } // Inst_VOP2__V_XNOR_B32

    Inst_VOP2__V_XNOR_B32::~Inst_VOP2__V_XNOR_B32()
    {
    } // ~Inst_VOP2__V_XNOR_B32

    // --- description from .arch file ---
    // D.u = ~(S0.u ^ S1.u).
    void
    Inst_VOP2__V_XNOR_B32::execute(GPUDynInstPtr gpuDynInst)
    {
        Wavefront *wf = gpuDynInst->wavefront();
        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
        VecOperandU32 vdst(gpuDynInst, instData.VDST);

        src0.readSrc();
        src1.read();
        vdst.read();

        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                vdst[lane] = ~(src0[lane] ^ src1[lane]);
            }
        }

        vdst.write();
    } // execute
} // namespace VegaISA
} // namespace gem5