gem5 [DEVELOP-FOR-25.1]
Loading...
Searching...
No Matches
vop3p.cc
Go to the documentation of this file.
1/*
2 * Copyright (c) 2023 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
33
36
37namespace gem5
38{
39
40namespace VegaISA
41{
42
43using half = uint16_t;
44
45// Helper functions
// Saturate a signed accumulator to the range of an N-bit two's-complement
// integer, [-2^(N-1), 2^(N-1) - 1]. When the clamp flag is clear the value
// is passed through unchanged.
template<int N>
int32_t
dotClampI(int32_t value, bool clamp)
{
    // The shifts below are only well-defined for sub-word widths.
    static_assert(N < 32);

    if (!clamp) {
        return value;
    }

    const int32_t lo = -(1 << (N - 1));
    const int32_t hi = (1 << (N - 1)) - 1;
    return std::clamp<int32_t>(value, lo, hi);
}
61
// Saturate an unsigned accumulator to the range of an N-bit unsigned
// integer, [0, 2^N - 1]. When the clamp flag is clear the value is passed
// through unchanged.
//
// NOTE: the clamping must be performed in unsigned arithmetic. The previous
// implementation used std::clamp<int32_t>, which misinterprets values above
// INT32_MAX (e.g. a 16-bit x 16-bit product such as 0xFFFF * 0xFFFF =
// 0xFFFE0001) as negative and clamps them to 0 instead of the unsigned
// maximum. It also returned the unclamped value through an int32_t cast.
template<int N>
uint32_t
dotClampU(uint32_t value, bool clamp)
{
    // Only valid for N < 32
    static_assert(N < 32);

    if (!clamp) {
        return value;
    }

    constexpr uint32_t min = 0;
    constexpr uint32_t max = (1u << N) - 1;
    return std::clamp<uint32_t>(value, min, max);
}
77
// Saturate a 32-bit signed value to the int16_t range when the clamp flag
// is set; otherwise truncate the value to 16 bits.
int16_t
clampI16(int32_t value, bool clamp)
{
    if (clamp) {
        constexpr int32_t lo = std::numeric_limits<int16_t>::min();
        constexpr int32_t hi = std::numeric_limits<int16_t>::max();
        return std::clamp(value, lo, hi);
    }

    return static_cast<int16_t>(value);
}
89
// Saturate a 32-bit unsigned value to the uint16_t range when the clamp
// flag is set; otherwise truncate the value to 16 bits.
uint16_t
clampU16(uint32_t value, bool clamp)
{
    if (clamp) {
        constexpr uint32_t lo = std::numeric_limits<uint16_t>::min();
        constexpr uint32_t hi = std::numeric_limits<uint16_t>::max();
        return std::clamp(value, lo, hi);
    }

    return static_cast<uint16_t>(value);
}
101
102uint16_t
103clampF16(uint16_t value, bool clamp)
104{
105 if (!clamp) {
106 return value;
107 }
108
109 // Values of one and zero in fp16.
110 constexpr uint16_t one = 0x3c00;
111 constexpr uint16_t zero = 0x0;
112 ArmISA::FPSCR fpscr1, fpscr2;
113
114 // If value > one, set to one, then if value < zero set to zero.
115 uint16_t imm = fplibMin(value, one, fpscr1);
116 return fplibMax(imm, zero, fpscr2);
117}
118
// Clamp a 32-bit float to [0.0, 1.0] when the clamp flag is set; otherwise
// return the value unmodified.
float
clampF32(float value, bool clamp)
{
    return clamp ? std::clamp(value, 0.0f, 1.0f) : value;
}
128
129
130
131
132// Begin instruction execute definitions
134{
135 auto opImpl =
136 [](int16_t S0, int16_t S1, int16_t S2, bool clamp) -> int16_t
137 {
138 return clampI16(S0 * S1 + S2, clamp);
139 };
140
141 vop3pHelper<int16_t>(gpuDynInst, opImpl);
142}
143
144void
146{
147 auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t
148 {
149 // Only return lower 16 bits of result - This operation cannot clamp.
150 uint32_t D = S0 * S1;
151 uint16_t Dh = D & 0xFFFF;
152 return Dh;
153 };
154
155 vop3pHelper<uint16_t>(gpuDynInst, opImpl);
156}
157
159{
160 auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t
161 {
162 return clampI16(S0 + S1, clamp);
163 };
164
165 vop3pHelper<int16_t>(gpuDynInst, opImpl);
166}
167
169{
170 auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t
171 {
172 return clampI16(S0 - S1, clamp);
173 };
174
175 vop3pHelper<int16_t>(gpuDynInst, opImpl);
176}
177
179{
180 auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t
181 {
182 unsigned shift_val = bits(S0, 3, 0);
183
184 // Shift does not clamp
185 return S1 << shift_val;
186 };
187
188 vop3pHelper<uint16_t>(gpuDynInst, opImpl);
189}
190
192{
193 auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t
194 {
195 unsigned shift_val = bits(S0, 3, 0);
196
197 return S1 >> shift_val;
198 };
199
200 vop3pHelper<uint16_t>(gpuDynInst, opImpl);
201}
202
204{
205 auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t
206 {
207 // Sign extend to larger type to ensure we don't lose sign bits when
208 // shifting.
209 int32_t S1e = S1;
210 unsigned shift_val = bits(S0, 3, 0);
211
212 return S1e >> shift_val;
213 };
214
215 vop3pHelper<int16_t>(gpuDynInst, opImpl);
216}
217
219{
220 auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t
221 {
222 return clampI16((S0 >= S1) ? S0 : S1, clamp);
223 };
224
225 vop3pHelper<int16_t>(gpuDynInst, opImpl);
226}
227
229{
230 auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t
231 {
232 return clampI16((S0 < S1) ? S0 : S1, clamp);
233 };
234
235 vop3pHelper<int16_t>(gpuDynInst, opImpl);
236}
237
239{
240 auto opImpl =
241 [](uint16_t S0, uint16_t S1, uint16_t S2, bool clamp) -> uint16_t
242 {
243 return clampU16(S0 * S1 + S2, clamp);
244 };
245
246 vop3pHelper<uint16_t>(gpuDynInst, opImpl);
247}
248
250{
251 auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t
252 {
253 return clampU16(S0 + S1, clamp);
254 };
255
256 vop3pHelper<uint16_t>(gpuDynInst, opImpl);
257}
258
260{
261 auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t
262 {
263 return clampU16(S0 - S1, clamp);
264 };
265
266 vop3pHelper<uint16_t>(gpuDynInst, opImpl);
267}
268
270{
271 auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t
272 {
273 return clampU16((S0 >= S1) ? S0 : S1, clamp);
274 };
275
276 vop3pHelper<uint16_t>(gpuDynInst, opImpl);
277}
278
280{
281 auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t
282 {
283 return clampU16((S0 < S1) ? S0 : S1, clamp);
284 };
285
286 vop3pHelper<uint16_t>(gpuDynInst, opImpl);
287}
288
290{
291 auto opImpl = [](half S0, half S1, half S2, bool clamp) -> half
292 {
293 ArmISA::FPSCR fpscr;
294 return clampF16(fplibMulAdd(S2, S0, S1, fpscr), clamp);
295 };
296
297 vop3pHelper<half>(gpuDynInst, opImpl);
298}
299
301{
302 auto opImpl = [](half S0, half S1, bool clamp) -> half
303 {
304 ArmISA::FPSCR fpscr;
305 return clampF16(fplibAdd(S0, S1, fpscr), clamp);
306 };
307
308 vop3pHelper<half>(gpuDynInst, opImpl);
309}
310
312{
313 auto opImpl = [](half S0, half S1, bool clamp) -> half
314 {
315 ArmISA::FPSCR fpscr;
316 return clampF16(fplibMul(S0, S1, fpscr), clamp);
317 };
318
319 vop3pHelper<half>(gpuDynInst, opImpl);
320}
321
323{
324 auto opImpl = [](half S0, half S1, bool clamp) -> half
325 {
326 ArmISA::FPSCR fpscr;
327 return clampF16(fplibMin(S0, S1, fpscr), clamp);
328 };
329
330 vop3pHelper<half>(gpuDynInst, opImpl);
331}
332
334{
335 auto opImpl = [](half S0, half S1, bool clamp) -> half
336 {
337 ArmISA::FPSCR fpscr;
338 return clampF16(fplibMax(S0, S1, fpscr), clamp);
339 };
340
341 vop3pHelper<half>(gpuDynInst, opImpl);
342}
343
345{
346 auto opImpl =
347 [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t
348 {
349 constexpr unsigned INBITS = 16;
350
351 constexpr unsigned elems = 32 / INBITS;
352 half S0[elems];
353 half S1[elems];
354
355 for (int i = 0; i < elems; ++i) {
356 S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
357 S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
358 }
359
360 float S2 = *reinterpret_cast<float*>(&S2r);
361
362 // Compute components individually to prevent overflow across packing
363 half C[elems];
364 float Csum = 0.0f;
365
366 for (int i = 0; i < elems; ++i) {
367 ArmISA::FPSCR fpscr;
368 C[i] = fplibMul(S0[i], S1[i], fpscr);
369 uint32_t conv =
371 C[i], ArmISA::FPRounding_TIEEVEN, fpscr);
372 Csum += clampF32(*reinterpret_cast<float*>(&conv), clamp);
373 }
374
375 Csum += S2;
376 uint32_t rv = *reinterpret_cast<uint32_t*>(&Csum);
377
378 return rv;
379 };
380
381 dotHelper(gpuDynInst, opImpl);
382}
383
385{
386 // Do not use dotHelper here as OPSEL is ignored for this instruction.
387 Wavefront *wf = gpuDynInst->wavefront();
388 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
389 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
390 ConstVecOperandU32 src2(gpuDynInst, extData.SRC2);
391 VecOperandU32 vdst(gpuDynInst, instData.VDST);
392
393 src0.readSrc();
394 src1.readSrc();
395 src2.readSrc();
396 vdst.read();
397
398 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
399 if (wf->execMask(lane)) {
401 a1.data = uint16_t(bits(src0[lane], 15, 0));
402 a2.data = uint16_t(bits(src0[lane], 31, 16));
403 b1.data = uint16_t(bits(src1[lane], 15, 0));
404 b2.data = uint16_t(bits(src1[lane], 31, 16));
405
406 if (instData.NEG_HI & 0x1) a2 = -a2;
407 if (instData.NEG_HI & 0x2) b2 = -b2;
408 if (extData.NEG & 0x1) a1 = -a1;
409 if (extData.NEG & 0x2) b1 = -b1;
410
411 vdst[lane] += float(a1) * float(b1);
412 vdst[lane] += float(a2) * float(b2);
413 vdst[lane] += src2[lane];
414
415 clampF32(vdst[lane], (bool)instData.CLMP);
416 }
417 }
418
419 vdst.write();
420}
421
423{
424 auto opImpl =
425 [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t
426 {
427 constexpr unsigned INBITS = 16;
428
429 constexpr unsigned elems = 32 / INBITS;
430 uint32_t S0[elems];
431 uint32_t S1[elems];
432
433 for (int i = 0; i < elems; ++i) {
434 S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
435 S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
436 }
437
438 int32_t S2 = *reinterpret_cast<int32_t*>(&S2r);
439
440 // Compute components individually to prevent overflow across packing
441 int32_t C[elems];
442 int32_t Csum = 0;
443
444 for (int i = 0; i < elems; ++i) {
445 C[i] = sext<INBITS>(S0[i]) * sext<INBITS>(S1[i]);
446 C[i] = sext<INBITS>(dotClampI<INBITS>(C[i], clamp) & mask(INBITS));
447 Csum += C[i];
448 }
449
450 Csum += S2;
451 uint32_t rv = *reinterpret_cast<uint32_t*>(&Csum);
452
453 return rv;
454 };
455
456 dotHelper(gpuDynInst, opImpl);
457}
458
460{
461 auto opImpl =
462 [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t
463 {
464 constexpr unsigned INBITS = 16;
465
466 constexpr unsigned elems = 32 / INBITS;
467 uint32_t S0[elems];
468 uint32_t S1[elems];
469
470 for (int i = 0; i < elems; ++i) {
471 S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
472 S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
473 }
474
475 // Compute components individually to prevent overflow across packing
476 uint32_t C[elems];
477 uint32_t Csum = 0;
478
479 for (int i = 0; i < elems; ++i) {
480 C[i] = S0[i] * S1[i];
481 C[i] = dotClampU<INBITS>(C[i], clamp);
482 Csum += C[i];
483 }
484
485 Csum += S2;
486
487 return Csum;
488 };
489
490 dotHelper(gpuDynInst, opImpl);
491}
492
494{
495 auto opImpl =
496 [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t
497 {
498 constexpr unsigned INBITS = 8;
499
500 constexpr unsigned elems = 32 / INBITS;
501 uint32_t S0[elems];
502 uint32_t S1[elems];
503
504 for (int i = 0; i < elems; ++i) {
505 S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
506 S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
507 }
508
509 int32_t S2 = *reinterpret_cast<int32_t*>(&S2r);
510
511 // Compute components individually to prevent overflow across packing
512 int32_t C[elems];
513 int32_t Csum = 0;
514
515 for (int i = 0; i < elems; ++i) {
516 C[i] = sext<INBITS>(S0[i]) * sext<INBITS>(S1[i]);
517 C[i] = sext<INBITS>(dotClampI<INBITS>(C[i], clamp) & mask(INBITS));
518 Csum += C[i];
519 }
520
521 Csum += S2;
522 uint32_t rv = *reinterpret_cast<uint32_t*>(&Csum);
523
524 return rv;
525 };
526
527 dotHelper(gpuDynInst, opImpl);
528}
529
531{
532 auto opImpl =
533 [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t
534 {
535 constexpr unsigned INBITS = 8;
536
537 constexpr unsigned elems = 32 / INBITS;
538 uint32_t S0[elems];
539 uint32_t S1[elems];
540
541 for (int i = 0; i < elems; ++i) {
542 S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
543 S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
544 }
545
546 // Compute components individually to prevent overflow across packing
547 uint32_t C[elems];
548 uint32_t Csum = 0;
549
550 for (int i = 0; i < elems; ++i) {
551 C[i] = S0[i] * S1[i];
552 C[i] = dotClampU<INBITS>(C[i], clamp);
553 Csum += C[i];
554 }
555
556 Csum += S2;
557
558 return Csum;
559 };
560
561 dotHelper(gpuDynInst, opImpl);
562}
563
565{
566 auto opImpl =
567 [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t
568 {
569 constexpr unsigned INBITS = 4;
570
571 constexpr unsigned elems = 32 / INBITS;
572 uint32_t S0[elems];
573 uint32_t S1[elems];
574
575 for (int i = 0; i < elems; ++i) {
576 S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
577 S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
578 }
579
580 int32_t S2 = *reinterpret_cast<int32_t*>(&S2r);
581
582 // Compute components individually to prevent overflow across packing
583 int32_t C[elems];
584 int32_t Csum = 0;
585
586 for (int i = 0; i < elems; ++i) {
587 C[i] = sext<INBITS>(S0[i]) * sext<INBITS>(S1[i]);
588 C[i] = sext<INBITS>(dotClampI<INBITS>(C[i], clamp) & mask(INBITS));
589 Csum += C[i];
590 }
591
592 Csum += S2;
593 uint32_t rv = *reinterpret_cast<uint32_t*>(&Csum);
594
595 return rv;
596 };
597
598 dotHelper(gpuDynInst, opImpl);
599}
600
602{
603 auto opImpl =
604 [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t
605 {
606 constexpr unsigned INBITS = 4;
607
608 constexpr unsigned elems = 32 / INBITS;
609 uint32_t S0[elems];
610 uint32_t S1[elems];
611
612 for (int i = 0; i < elems; ++i) {
613 S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
614 S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
615 }
616
617 // Compute components individually to prevent overflow across packing
618 uint32_t C[elems];
619 uint32_t Csum = 0;
620
621 for (int i = 0; i < elems; ++i) {
622 C[i] = S0[i] * S1[i];
623 C[i] = dotClampU<INBITS>(C[i], clamp);
624 Csum += C[i];
625 }
626
627 Csum += S2;
628
629 return Csum;
630 };
631
632 dotHelper(gpuDynInst, opImpl);
633}
634
636{
637 Wavefront *wf = gpuDynInst->wavefront();
638 unsigned accum_offset = wf->accumOffset;
639
640 ConstVecOperandU32 src(gpuDynInst, extData.SRC0+accum_offset);
641 VecOperandU32 vdst(gpuDynInst, instData.VDST);
642
643 src.readSrc();
644
645 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
646 if (wf->execMask(lane)) {
647 vdst[lane] = src[lane];
648 }
649 }
650
651 vdst.write();
652}
653
655{
656 Wavefront *wf = gpuDynInst->wavefront();
657 unsigned accum_offset = wf->accumOffset;
658
659 ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
660 VecOperandU32 vdst(gpuDynInst, instData.VDST+accum_offset);
661
662 src.readSrc();
663
664 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
665 if (wf->execMask(lane)) {
666 vdst[lane] = src[lane];
667 }
668 }
669
670 vdst.write();
671}
672
673// --- Inst_VOP3P__V_PK_FMA_F32 class methods ---
674
676 : Inst_VOP3P(iFmt, "v_pk_fma_f32")
677{
678 setFlag(ALU);
679} // Inst_VOP3P__V_PK_FMA_F32
680
682{
683} // ~Inst_VOP3P__V_PK_FMA_F32
684
685// D.f[63:32] = S0.f[63:32] * S1.f[63:32] + S2.f[63:32] . D.f[31:0] =
686// S0.f[31:0] * S1.f[31:0] + S2.f[31:0] .
687void
689{
690 // This is a special case of packed instructions which operates on
691 // 64-bit inputs/outputs and not 32-bit. U64 is used here as float
692 // values cannot use bitwise operations. Consider the U64 to imply
693 // untyped 64-bits of data.
694 Wavefront *wf = gpuDynInst->wavefront();
695 ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
696 ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
697 ConstVecOperandU64 src2(gpuDynInst, extData.SRC2);
698 VecOperandU64 vdst(gpuDynInst, instData.VDST);
699
700 src0.readSrc();
701 src1.readSrc();
702 src2.readSrc();
703
704 int opsel = instData.OPSEL;
705 int opsel_hi = extData.OPSEL_HI | (instData.OPSEL_HI2 << 2);
706
707 int neg = extData.NEG;
708 int neg_hi = instData.NEG_HI;
709
710 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
711 if (wf->execMask(lane)) {
712 uint32_t s0l = (opsel & 1) ? bits(src0[lane], 63, 32)
713 : bits(src0[lane], 31, 0);
714 uint32_t s1l = (opsel & 2) ? bits(src1[lane], 63, 32)
715 : bits(src1[lane], 31, 0);
716 uint32_t s2l = (opsel & 4) ? bits(src2[lane], 63, 32)
717 : bits(src2[lane], 31, 0);
718
719 float s0lf = *reinterpret_cast<float*>(&s0l);
720 float s1lf = *reinterpret_cast<float*>(&s1l);
721 float s2lf = *reinterpret_cast<float*>(&s2l);
722
723 if (neg & 1) s0lf = -s0lf;
724 if (neg & 1) s1lf = -s1lf;
725 if (neg & 1) s2lf = -s2lf;
726
727 float dword1 = std::fma(s0lf, s1lf, s2lf);
728
729 uint32_t s0h = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
730 : bits(src0[lane], 31, 0);
731 uint32_t s1h = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
732 : bits(src1[lane], 31, 0);
733 uint32_t s2h = (opsel_hi & 4) ? bits(src2[lane], 63, 32)
734 : bits(src2[lane], 31, 0);
735
736 float s0hf = *reinterpret_cast<float*>(&s0h);
737 float s1hf = *reinterpret_cast<float*>(&s1h);
738 float s2hf = *reinterpret_cast<float*>(&s2h);
739
740 if (neg_hi & 1) s0hf = -s0hf;
741 if (neg_hi & 1) s1hf = -s1hf;
742 if (neg_hi & 1) s2hf = -s2hf;
743
744 float dword2 = std::fma(s0hf, s1hf, s2hf);
745
746 uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
747 uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);
748
749 vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1;
750 }
751 }
752
753 vdst.write();
754} // execute
755// --- Inst_VOP3P__V_PK_MUL_F32 class methods ---
756
758 : Inst_VOP3P(iFmt, "v_pk_mul_f32")
759{
760 setFlag(ALU);
761} // Inst_VOP3P__V_PK_MUL_F32
762
764{
765} // ~Inst_VOP3P__V_PK_MUL_F32
766
767// D.f[63:32] = S0.f[63:32] * S1.f[63:32] . D.f[31:0] = S0.f[31:0] *
768// S1.f[31:0]
769void
771{
772 // This is a special case of packed instructions which operates on
773 // 64-bit inputs/outputs and not 32-bit. U64 is used here as float
774 // values cannot use bitwise operations. Consider the U64 to imply
775 // untyped 64-bits of data.
776 Wavefront *wf = gpuDynInst->wavefront();
777 ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
778 ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
779 VecOperandU64 vdst(gpuDynInst, instData.VDST);
780
781 src0.readSrc();
782 src1.readSrc();
783
784 int opsel = instData.OPSEL;
785 int opsel_hi = extData.OPSEL_HI;
786
787 int neg = extData.NEG;
788 int neg_hi = instData.NEG_HI;
789
790 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
791 if (wf->execMask(lane)) {
792 uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
793 : bits(src0[lane], 31, 0);
794 uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
795 : bits(src1[lane], 31, 0);
796
797 float ldwordf = *reinterpret_cast<float*>(&lower_dword);
798 float udwordf = *reinterpret_cast<float*>(&upper_dword);
799
800 if (neg & 1) ldwordf = -ldwordf;
801 if (neg & 2) udwordf = -udwordf;
802
803 float dword1 = ldwordf * udwordf;
804
805 lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
806 : bits(src0[lane], 31, 0);
807 upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
808 : bits(src1[lane], 31, 0);
809
810 ldwordf = *reinterpret_cast<float*>(&lower_dword);
811 udwordf = *reinterpret_cast<float*>(&upper_dword);
812
813 if (neg_hi & 1) ldwordf = -ldwordf;
814 if (neg_hi & 2) udwordf = -udwordf;
815
816 float dword2 = ldwordf * udwordf;
817
818 uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
819 uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);
820
821 vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1;
822 }
823 }
824
825 vdst.write();
826} // execute
827// --- Inst_VOP3P__V_PK_ADD_F32 class methods ---
828
830 : Inst_VOP3P(iFmt, "v_pk_add_f32")
831{
832 setFlag(ALU);
833} // Inst_VOP3P__V_PK_ADD_F32
834
836{
837} // ~Inst_VOP3P__V_PK_ADD_F32
838
839// D.f[63:32] = S0.f[63:32] + S1.f[63:32] . D.f[31:0] = S0.f[31:0] +
840// S1.f[31:0]
841void
843{
844 // This is a special case of packed instructions which operates on
845 // 64-bit inputs/outputs and not 32-bit. U64 is used here as float
846 // values cannot use bitwise operations. Consider the U64 to imply
847 // untyped 64-bits of data.
848 Wavefront *wf = gpuDynInst->wavefront();
849 ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
850 ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
851 VecOperandU64 vdst(gpuDynInst, instData.VDST);
852
853 src0.readSrc();
854 src1.readSrc();
855
856 panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
857 panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
858
859 int opsel = instData.OPSEL;
860 int opsel_hi = extData.OPSEL_HI;
861
862 int neg = extData.NEG;
863 int neg_hi = instData.NEG_HI;
864
865 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
866 if (wf->execMask(lane)) {
867 uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
868 : bits(src0[lane], 31, 0);
869 uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
870 : bits(src1[lane], 31, 0);
871
872 float ldwordf = *reinterpret_cast<float*>(&lower_dword);
873 float udwordf = *reinterpret_cast<float*>(&upper_dword);
874
875 if (neg & 1) ldwordf = -ldwordf;
876 if (neg & 2) udwordf = -udwordf;
877
878 float dword1 = ldwordf + udwordf;
879
880 lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
881 : bits(src0[lane], 31, 0);
882 upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
883 : bits(src1[lane], 31, 0);
884
885 ldwordf = *reinterpret_cast<float*>(&lower_dword);
886 udwordf = *reinterpret_cast<float*>(&upper_dword);
887
888 if (neg_hi & 1) ldwordf = -ldwordf;
889 if (neg_hi & 2) udwordf = -udwordf;
890
891 float dword2 = ldwordf + udwordf;
892
893 uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
894 uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);
895
896 vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1;
897 }
898 }
899
900 vdst.write();
901} // execute
902// --- Inst_VOP3P__V_PK_MOV_B32 class methods ---
903
905 : Inst_VOP3P(iFmt, "v_pk_mov_b32")
906{
907 setFlag(ALU);
908} // Inst_VOP3P__V_PK_MOV_B32
909
911{
912} // ~Inst_VOP3P__V_PK_MOV_B32
913
914// D.u[63:32] = S1.u[31:0]; D.u[31:0] = S0.u[31:0].
915void
917{
918 // This is a special case of packed instructions which operates on
919 // 64-bit inputs/outputs and not 32-bit.
920 Wavefront *wf = gpuDynInst->wavefront();
921 ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
922 ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
923 VecOperandU64 vdst(gpuDynInst, instData.VDST);
924
925 src0.readSrc();
926 src1.readSrc();
927
928 // Only OPSEL[1:0] are used
929 // OPSEL[0] 0/1: Lower dest dword = lower/upper dword of src0
930 int opsel = instData.OPSEL;
931
932 warn_if(instData.NEG_HI || extData.NEG,
933 "Negative modifier undefined for %s", _opcode);
934
935 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
936 if (wf->execMask(lane)) {
937 // OPSEL[1] 0/1: Lower dest dword = lower/upper dword of src1
938 uint64_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
939 : bits(src0[lane], 31, 0);
940 uint64_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
941 : bits(src1[lane], 31, 0);
942
943 vdst[lane] = upper_dword << 32 | lower_dword;
944 }
945 }
946
947 vdst.write();
948} // execute
949// --- Inst_VOP3P__V_MFMA_LOAD_SCALE class methods ---
950
952 : Inst_VOP3P(iFmt, "v_mfma_load_scale")
953{
954 setFlag(ALU);
955} // Inst_VOP3P__V_MFMA_LOAD_SCALE
956
959
960void
962{
963 // This is implemented differently in gem5 to avoid needing to change a
964 // large amount of code to handle a 4-dword instruction. Instead, we
965 // implement a fake VOP3P instruction which is assumed to come before an
966 // MFMA instruction.
967 //
968 // See https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/
969 // instruction-set-architectures/
970 // amd-instinct-cdna4-instruction-set-architecture.pdf
971 // section 7.2.1 for details.
972 Wavefront *wf = gpuDynInst->wavefront();
973 ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
974 ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
975
976 src0.readSrc();
977 src1.readSrc();
978
979 if (isVectorReg(extData.SRC0)) {
980 int opsel = ((extData.OPSEL_HI & 1) << 1) | (instData.OPSEL & 1);
981
982 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
983 wf->setMfmaAScale(lane,
984 bits(src0[lane], opsel * 8 + 7, opsel * 8));
985 }
986 } else {
987 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
988 wf->setMfmaAScale(lane, bits(src0[lane], 30, 23));
989 }
990 }
991
992 if (isVectorReg(extData.SRC1)) {
993 int opsel = ((extData.OPSEL_HI & 2) << 1) | (instData.OPSEL & 2);
994
995 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
996 wf->setMfmaBScale(lane,
997 bits(src1[lane], opsel * 8 + 7, opsel * 8));
998 }
999 } else {
1000 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1001 wf->setMfmaBScale(lane, bits(src1[lane], 30, 23));
1002 }
1003 }
1004}
1005
1006} // namespace VegaISA
1007} // namespace gem5
uint32_t data
Definition mxfp.hh:112
void setFlag(Flags flag)
const std::string _opcode
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:635
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:654
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:384
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:344
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:422
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:459
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:493
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:530
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:564
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:601
void execute(GPUDynInstPtr) override
Definition vop3p.cc:961
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:300
void execute(GPUDynInstPtr) override
Definition vop3p.cc:842
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:158
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:249
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:203
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:289
void execute(GPUDynInstPtr) override
Definition vop3p.cc:688
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:178
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:191
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:133
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:238
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:333
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:218
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:269
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:322
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:228
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:279
void execute(GPUDynInstPtr) override
Definition vop3p.cc:916
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:311
void execute(GPUDynInstPtr) override
Definition vop3p.cc:770
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:145
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:168
void execute(GPUDynInstPtr gpuDynInst) override
Definition vop3p.cc:259
void dotHelper(GPUDynInstPtr gpuDynInst, uint32_t(*fOpImpl)(uint32_t, uint32_t, uint32_t, bool))
void vop3pHelper(GPUDynInstPtr gpuDynInst, T(*fOpImpl)(T, T, bool))
Inst_VOP3P(InFmt_VOP3P *, const std::string &opcode)
void read() override
read from the vrf.
Definition operand.hh:148
void readSrc()
certain vector operands can read from the vrf/srf or constants.
Definition operand.hh:132
void write() override
write to the vrf.
Definition operand.hh:203
uint32_t accumOffset
Definition wavefront.hh:138
void setMfmaBScale(int idx, uint8_t value)
VectorMask & execMask()
void setMfmaAScale(int idx, uint8_t value)
Floating-point library code, which will gradually replace vfp.hh.
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
Definition bitfield.hh:79
constexpr uint64_t sext(uint64_t val)
Sign-extend an N-bit value to 64 bits.
Definition bitfield.hh:129
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Definition logging.hh:246
#define warn_if(cond,...)
Conditional warning macro that checks the supplied condition and only prints a warning if the conditi...
Definition logging.hh:315
mxfp< fp16_e8m7_info > mxbfloat16
Definition mxfp_types.hh:49
Bitfield< 22 > a1
Bitfield< 3, 0 > mask
Definition pcstate.hh:63
Bitfield< 7, 0 > imm
Definition types.hh:132
uint16_t fplibConvert(uint32_t op, FPRounding rounding, FPSCR &fpscr, FPCR fpcr)
Definition fplib.cc:3597
uint16_t fplibMul(uint16_t op1, uint16_t op2, FPSCR &fpscr, FPCR fpcr)
Definition fplib.cc:4820
uint16_t fplibMulAdd(uint16_t addend, uint16_t op1, uint16_t op2, FPSCR &fpscr, FPCR fpcr)
Definition fplib.cc:3876
Bitfield< 7 > i
Definition misc_types.hh:67
uint16_t fplibMax(uint16_t op1, uint16_t op2, FPSCR &fpscr, FPCR fpcr)
Definition fplib.cc:4646
uint16_t fplibMin(uint16_t op1, uint16_t op2, FPSCR &fpscr, FPCR fpcr)
Definition fplib.cc:4733
uint16_t fplibAdd(uint16_t op1, uint16_t op2, FPSCR &fpscr, FPCR fpcr)
Definition fplib.cc:3346
@ FPRounding_TIEEVEN
Definition fplib.hh:62
Bitfield< 7, 4 > b1
Definition qarma.hh:65
Bitfield< 11, 8 > b2
Definition qarma.hh:64
classes that represnt vector/scalar operands in VEGA ISA.
Definition faults.cc:39
int16_t clampI16(int32_t value, bool clamp)
Definition vop3p.cc:79
bool isVectorReg(int opIdx)
int32_t dotClampI(int32_t value, bool clamp)
Definition vop3p.cc:48
float clampF32(float value, bool clamp)
Definition vop3p.cc:120
VecOperand< VecElemU32, false > VecOperandU32
Definition operand.hh:829
VecOperand< VecElemU32, true > ConstVecOperandU32
Definition operand.hh:844
uint16_t clampF16(uint16_t value, bool clamp)
Definition vop3p.cc:103
uint16_t half
Definition vop3p.cc:43
const int NumVecElemPerVecReg(64)
VecOperand< VecElemU64, false > VecOperandU64
Definition operand.hh:832
uint32_t dotClampU(uint32_t value, bool clamp)
Definition vop3p.cc:64
VecOperand< VecElemU64, true > ConstVecOperandU64
Definition operand.hh:847
uint16_t clampU16(uint32_t value, bool clamp)
Definition vop3p.cc:91
Copyright (c) 2024 Arm Limited All rights reserved.
Definition binary32.hh:36
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition misc.hh:49

Generated on Mon Oct 27 2025 04:12:51 for gem5 by doxygen 1.14.0