gem5 v24.0.0.0
vop3p.cc
/*
 * Copyright (c) 2023 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "arch/amdgpu/vega/insts/instructions.hh"

#include "arch/arm/insts/fplib.hh"
#include "base/bitfield.hh"

namespace gem5
{

namespace VegaISA
{

using half = uint16_t;

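// fp16 operands are carried through this file as raw uint16_t bit patterns
// and evaluated with the Arm soft-float library (fplibAdd, fplibMul, etc.)
// rather than with a host half-precision type.
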
// Helper functions
template<int N>
int32_t
dotClampI(int32_t value, bool clamp)
{
    // Only valid for N < 32
    static_assert(N < 32);

    if (!clamp) {
        return static_cast<int32_t>(value);
    }

    int32_t min = -(1 << (N - 1));
    int32_t max = (1 << (N - 1)) - 1;
    return std::clamp<int32_t>(value, min, max);
}

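// For example, dotClampI<8>(200, true) saturates to 127, the largest
// signed 8-bit value, while dotClampI<8>(200, false) passes 200 through.
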
template<int N>
uint32_t
dotClampU(uint32_t value, bool clamp)
{
    // Only valid for N < 32
    static_assert(N < 32);

    if (!clamp) {
        return value;
    }

    uint32_t min = 0;
    uint32_t max = (1 << N) - 1;
    return std::clamp<uint32_t>(value, min, max);
}

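// For example, dotClampU<8>(300, true) saturates to 255 (0xff), while
// dotClampU<8>(300, false) returns 300 unchanged.
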
int16_t
clampI16(int32_t value, bool clamp)
{
    if (!clamp) {
        return static_cast<int16_t>(value);
    }

    return std::clamp(value,
        static_cast<int32_t>(std::numeric_limits<int16_t>::min()),
        static_cast<int32_t>(std::numeric_limits<int16_t>::max()));
}

uint16_t
clampU16(uint32_t value, bool clamp)
{
    if (!clamp) {
        return static_cast<uint16_t>(value);
    }

    return std::clamp(value,
        static_cast<uint32_t>(std::numeric_limits<uint16_t>::min()),
        static_cast<uint32_t>(std::numeric_limits<uint16_t>::max()));
}

uint16_t
clampF16(uint16_t value, bool clamp)
{
    if (!clamp) {
        return value;
    }

    // Values of one and zero in fp16.
    constexpr uint16_t one = 0x3c00;
    constexpr uint16_t zero = 0x0;
    ArmISA::FPSCR fpscr1, fpscr2;

    // If value > one, set to one, then if value < zero set to zero.
    uint16_t imm = fplibMin(value, one, fpscr1);
    return fplibMax(imm, zero, fpscr2);
}
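
// For example, clampF16(0xc000, true) clamps fp16 -2.0 to 0.0 (0x0000),
// and clampF16(0x4000, true) clamps fp16 2.0 to 1.0 (0x3c00).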

float
clampF32(float value, bool clamp)
{
    if (!clamp) {
        return value;
    }

    return std::clamp(value, 0.0f, 1.0f);
}

// Begin instruction execute definitions
void
Inst_VOP3P__V_PK_MAD_I16::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl =
        [](int16_t S0, int16_t S1, int16_t S2, bool clamp) -> int16_t
    {
        return clampI16(S0 * S1 + S2, clamp);
    };

    vop3pHelper<int16_t>(gpuDynInst, opImpl);
}
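
// vop3pHelper applies the scalar lambda above independently to the low and
// high 16-bit halves of each lane's packed 32-bit operands, handling the
// instruction's source-select and negation modifiers before the lambda
// sees the half-width values.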

void
Inst_VOP3P__V_PK_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t
    {
        // Only return lower 16 bits of result - This operation cannot clamp.
        uint32_t D = S0 * S1;
        uint16_t Dh = D & 0xFFFF;
        return Dh;
    };

    vop3pHelper<uint16_t>(gpuDynInst, opImpl);
}

void
Inst_VOP3P__V_PK_ADD_I16::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t
    {
        return clampI16(S0 + S1, clamp);
    };

    vop3pHelper<int16_t>(gpuDynInst, opImpl);
}
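
// For example, v_pk_add_i16 with S0 = 0x00020001 (hi 2, lo 1) and
// S1 = 0x00040003 (hi 4, lo 3) produces D = 0x00060004 (hi 6, lo 4).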

void
Inst_VOP3P__V_PK_SUB_I16::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t
    {
        return clampI16(S0 - S1, clamp);
    };

    vop3pHelper<int16_t>(gpuDynInst, opImpl);
}

void
Inst_VOP3P__V_PK_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t
    {
        unsigned shift_val = bits(S0, 3, 0);

        // Shift does not clamp
        return S1 << shift_val;
    };

    vop3pHelper<uint16_t>(gpuDynInst, opImpl);
}
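
// The "rev" shifts take their shift amount from the low four bits of S0
// and shift S1, so a half with S0 = 3 and S1 = 0x0001 produces 0x0008.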

void
Inst_VOP3P__V_PK_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t
    {
        unsigned shift_val = bits(S0, 3, 0);

        return S1 >> shift_val;
    };

    vop3pHelper<uint16_t>(gpuDynInst, opImpl);
}

void
Inst_VOP3P__V_PK_ASHRREV_I16::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t
    {
        // Sign extend to larger type to ensure we don't lose sign bits when
        // shifting.
        int32_t S1e = S1;
        unsigned shift_val = bits(S0, 3, 0);

        return S1e >> shift_val;
    };

    vop3pHelper<int16_t>(gpuDynInst, opImpl);
}

void
Inst_VOP3P__V_PK_MAX_I16::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t
    {
        return clampI16((S0 >= S1) ? S0 : S1, clamp);
    };

    vop3pHelper<int16_t>(gpuDynInst, opImpl);
}

void
Inst_VOP3P__V_PK_MIN_I16::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t
    {
        return clampI16((S0 < S1) ? S0 : S1, clamp);
    };

    vop3pHelper<int16_t>(gpuDynInst, opImpl);
}

void
Inst_VOP3P__V_PK_MAD_U16::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl =
        [](uint16_t S0, uint16_t S1, uint16_t S2, bool clamp) -> uint16_t
    {
        return clampU16(S0 * S1 + S2, clamp);
    };

    vop3pHelper<uint16_t>(gpuDynInst, opImpl);
}

void
Inst_VOP3P__V_PK_ADD_U16::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t
    {
        return clampU16(S0 + S1, clamp);
    };

    vop3pHelper<uint16_t>(gpuDynInst, opImpl);
}

void
Inst_VOP3P__V_PK_SUB_U16::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t
    {
        return clampU16(S0 - S1, clamp);
    };

    vop3pHelper<uint16_t>(gpuDynInst, opImpl);
}

void
Inst_VOP3P__V_PK_MAX_U16::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t
    {
        return clampU16((S0 >= S1) ? S0 : S1, clamp);
    };

    vop3pHelper<uint16_t>(gpuDynInst, opImpl);
}

void
Inst_VOP3P__V_PK_MIN_U16::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t
    {
        return clampU16((S0 < S1) ? S0 : S1, clamp);
    };

    vop3pHelper<uint16_t>(gpuDynInst, opImpl);
}

void
Inst_VOP3P__V_PK_FMA_F16::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl = [](half S0, half S1, half S2, bool clamp) -> half
    {
        ArmISA::FPSCR fpscr;
        return clampF16(fplibMulAdd(S2, S0, S1, fpscr), clamp);
    };

    vop3pHelper<half>(gpuDynInst, opImpl);
}

void
Inst_VOP3P__V_PK_ADD_F16::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl = [](half S0, half S1, bool clamp) -> half
    {
        ArmISA::FPSCR fpscr;
        return clampF16(fplibAdd(S0, S1, fpscr), clamp);
    };

    vop3pHelper<half>(gpuDynInst, opImpl);
}

void
Inst_VOP3P__V_PK_MUL_F16::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl = [](half S0, half S1, bool clamp) -> half
    {
        ArmISA::FPSCR fpscr;
        return clampF16(fplibMul(S0, S1, fpscr), clamp);
    };

    vop3pHelper<half>(gpuDynInst, opImpl);
}

void
Inst_VOP3P__V_PK_MIN_F16::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl = [](half S0, half S1, bool clamp) -> half
    {
        ArmISA::FPSCR fpscr;
        return clampF16(fplibMin(S0, S1, fpscr), clamp);
    };

    vop3pHelper<half>(gpuDynInst, opImpl);
}

void
Inst_VOP3P__V_PK_MAX_F16::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl = [](half S0, half S1, bool clamp) -> half
    {
        ArmISA::FPSCR fpscr;
        return clampF16(fplibMax(S0, S1, fpscr), clamp);
    };

    vop3pHelper<half>(gpuDynInst, opImpl);
}

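// D.f32 = S0.f16[0] * S1.f16[0] + S0.f16[1] * S1.f16[1] + S2.f32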
void
Inst_VOP3P__V_DOT2_F32_F16::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl =
        [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t
    {
        constexpr unsigned INBITS = 16;

        constexpr unsigned elems = 32 / INBITS;
        half S0[elems];
        half S1[elems];

        for (int i = 0; i < elems; ++i) {
            S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
            S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
        }

        float S2 = *reinterpret_cast<float*>(&S2r);

        // Compute components individually to prevent overflow across packing
        half C[elems];
        float Csum = 0.0f;

        for (int i = 0; i < elems; ++i) {
            ArmISA::FPSCR fpscr;
            C[i] = fplibMul(S0[i], S1[i], fpscr);
            uint32_t conv =
                ArmISA::fplibConvert<uint32_t>(
                    C[i], ArmISA::FPRounding_TIEEVEN, fpscr);
            Csum += clampF32(*reinterpret_cast<float*>(&conv), clamp);
        }

        Csum += S2;
        uint32_t rv = *reinterpret_cast<uint32_t*>(&Csum);

        return rv;
    };

    dotHelper(gpuDynInst, opImpl);
}

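// D.i32 = sext(S0.i16[0]) * sext(S1.i16[0])
//       + sext(S0.i16[1]) * sext(S1.i16[1]) + S2.i32
// For example, S0 = 0xffff0002 (hi -1, lo 2) and S1 = 0x00030004
// (hi 3, lo 4) give 2*4 + (-1)*3 + S2 = 5 + S2.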
void
Inst_VOP3P__V_DOT2_I32_I16::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl =
        [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t
    {
        constexpr unsigned INBITS = 16;

        constexpr unsigned elems = 32 / INBITS;
        uint32_t S0[elems];
        uint32_t S1[elems];

        for (int i = 0; i < elems; ++i) {
            S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
            S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
        }

        int32_t S2 = *reinterpret_cast<int32_t*>(&S2r);

        // Compute components individually to prevent overflow across packing
        int32_t C[elems];
        int32_t Csum = 0;

        for (int i = 0; i < elems; ++i) {
            C[i] = sext<INBITS>(S0[i]) * sext<INBITS>(S1[i]);
            C[i] = sext<INBITS>(dotClampI<INBITS>(C[i], clamp) & mask(INBITS));
            Csum += C[i];
        }

        Csum += S2;
        uint32_t rv = *reinterpret_cast<uint32_t*>(&Csum);

        return rv;
    };

    dotHelper(gpuDynInst, opImpl);
}

void
Inst_VOP3P__V_DOT2_U32_U16::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl =
        [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t
    {
        constexpr unsigned INBITS = 16;

        constexpr unsigned elems = 32 / INBITS;
        uint32_t S0[elems];
        uint32_t S1[elems];

        for (int i = 0; i < elems; ++i) {
            S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
            S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
        }

        // Compute components individually to prevent overflow across packing
        uint32_t C[elems];
        uint32_t Csum = 0;

        for (int i = 0; i < elems; ++i) {
            C[i] = S0[i] * S1[i];
            C[i] = dotClampU<INBITS>(C[i], clamp);
            Csum += C[i];
        }

        Csum += S2;

        return Csum;
    };

    dotHelper(gpuDynInst, opImpl);
}

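// D.i32 = S2.i32 + sum of sext(S0.i8[i]) * sext(S1.i8[i]) for i = 0..3.
// For example, S0 = 0x01020304 and S1 = 0x01010101 give
// 4 + 3 + 2 + 1 + S2 = 10 + S2.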
void
Inst_VOP3P__V_DOT4_I32_I8::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl =
        [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t
    {
        constexpr unsigned INBITS = 8;

        constexpr unsigned elems = 32 / INBITS;
        uint32_t S0[elems];
        uint32_t S1[elems];

        for (int i = 0; i < elems; ++i) {
            S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
            S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
        }

        int32_t S2 = *reinterpret_cast<int32_t*>(&S2r);

        // Compute components individually to prevent overflow across packing
        int32_t C[elems];
        int32_t Csum = 0;

        for (int i = 0; i < elems; ++i) {
            C[i] = sext<INBITS>(S0[i]) * sext<INBITS>(S1[i]);
            C[i] = sext<INBITS>(dotClampI<INBITS>(C[i], clamp) & mask(INBITS));
            Csum += C[i];
        }

        Csum += S2;
        uint32_t rv = *reinterpret_cast<uint32_t*>(&Csum);

        return rv;
    };

    dotHelper(gpuDynInst, opImpl);
}

void
Inst_VOP3P__V_DOT4_U32_U8::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl =
        [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t
    {
        constexpr unsigned INBITS = 8;

        constexpr unsigned elems = 32 / INBITS;
        uint32_t S0[elems];
        uint32_t S1[elems];

        for (int i = 0; i < elems; ++i) {
            S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
            S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
        }

        // Compute components individually to prevent overflow across packing
        uint32_t C[elems];
        uint32_t Csum = 0;

        for (int i = 0; i < elems; ++i) {
            C[i] = S0[i] * S1[i];
            C[i] = dotClampU<INBITS>(C[i], clamp);
            Csum += C[i];
        }

        Csum += S2;

        return Csum;
    };

    dotHelper(gpuDynInst, opImpl);
}

void
Inst_VOP3P__V_DOT8_I32_I4::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl =
        [](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t
    {
        constexpr unsigned INBITS = 4;

        constexpr unsigned elems = 32 / INBITS;
        uint32_t S0[elems];
        uint32_t S1[elems];

        for (int i = 0; i < elems; ++i) {
            S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
            S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
        }

        int32_t S2 = *reinterpret_cast<int32_t*>(&S2r);

        // Compute components individually to prevent overflow across packing
        int32_t C[elems];
        int32_t Csum = 0;

        for (int i = 0; i < elems; ++i) {
            C[i] = sext<INBITS>(S0[i]) * sext<INBITS>(S1[i]);
            C[i] = sext<INBITS>(dotClampI<INBITS>(C[i], clamp) & mask(INBITS));
            Csum += C[i];
        }

        Csum += S2;
        uint32_t rv = *reinterpret_cast<uint32_t*>(&Csum);

        return rv;
    };

    dotHelper(gpuDynInst, opImpl);
}

void
Inst_VOP3P__V_DOT8_U32_U4::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl =
        [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t
    {
        constexpr unsigned INBITS = 4;

        constexpr unsigned elems = 32 / INBITS;
        uint32_t S0[elems];
        uint32_t S1[elems];

        for (int i = 0; i < elems; ++i) {
            S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
            S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
        }

        // Compute components individually to prevent overflow across packing
        uint32_t C[elems];
        uint32_t Csum = 0;

        for (int i = 0; i < elems; ++i) {
            C[i] = S0[i] * S1[i];
            C[i] = dotClampU<INBITS>(C[i], clamp);
            Csum += C[i];
        }

        Csum += S2;

        return Csum;
    };

    dotHelper(gpuDynInst, opImpl);
}

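// The accumulation (ACC) VGPRs used by the MAI instructions are modeled as
// architectural VGPRs at an offset into the wavefront's register
// allocation, so ACC VGPR n maps to VGPR n + accumOffset.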
void
Inst_VOP3P_MAI__V_ACCVGPR_READ::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    unsigned accum_offset = wf->accumOffset;

    ConstVecOperandU32 src(gpuDynInst, extData.SRC0+accum_offset);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);

    src.readSrc();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = src[lane];
        }
    }

    vdst.write();
}

void
Inst_VOP3P_MAI__V_ACCVGPR_WRITE::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    unsigned accum_offset = wf->accumOffset;

    ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
    VecOperandU32 vdst(gpuDynInst, instData.VDST+accum_offset);

    src.readSrc();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = src[lane];
        }
    }

    vdst.write();
}

// --- Inst_VOP3P__V_PK_FMA_F32 class methods ---

Inst_VOP3P__V_PK_FMA_F32::Inst_VOP3P__V_PK_FMA_F32(InFmt_VOP3P *iFmt)
    : Inst_VOP3P(iFmt, "v_pk_fma_f32")
{
    setFlag(ALU);
} // Inst_VOP3P__V_PK_FMA_F32

Inst_VOP3P__V_PK_FMA_F32::~Inst_VOP3P__V_PK_FMA_F32()
{
} // ~Inst_VOP3P__V_PK_FMA_F32

// D.f[63:32] = S0.f[63:32] * S1.f[63:32] + S2.f[63:32];
// D.f[31:0] = S0.f[31:0] * S1.f[31:0] + S2.f[31:0].
void
Inst_VOP3P__V_PK_FMA_F32::execute(GPUDynInstPtr gpuDynInst)
{
    // This is a special case of packed instructions which operates on
    // 64-bit inputs/outputs and not 32-bit. U64 is used here as float
    // values cannot use bitwise operations. Consider the U64 to imply
    // untyped 64-bits of data.
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
    ConstVecOperandU64 src2(gpuDynInst, extData.SRC2);
    VecOperandU64 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();
    src2.readSrc();

    int opsel = instData.OPSEL;
    int opsel_hi = extData.OPSEL_HI | (instData.OPSEL_HI2 << 2);

    int neg = extData.NEG;
    int neg_hi = instData.NEG_HI;
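
    // OPSEL bit k selects the high (1) or low (0) dword of source k for
    // the low result dword; OPSEL_HI makes the same selection for the high
    // result dword. NEG/NEG_HI bit k negates source k in the corresponding
    // half.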

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            uint32_t s0l = (opsel & 1) ? bits(src0[lane], 63, 32)
                                       : bits(src0[lane], 31, 0);
            uint32_t s1l = (opsel & 2) ? bits(src1[lane], 63, 32)
                                       : bits(src1[lane], 31, 0);
            uint32_t s2l = (opsel & 4) ? bits(src2[lane], 63, 32)
                                       : bits(src2[lane], 31, 0);

            float s0lf = *reinterpret_cast<float*>(&s0l);
            float s1lf = *reinterpret_cast<float*>(&s1l);
            float s2lf = *reinterpret_cast<float*>(&s2l);

            if (neg & 1) s0lf = -s0lf;
            if (neg & 2) s1lf = -s1lf;
            if (neg & 4) s2lf = -s2lf;

            float dword1 = std::fma(s0lf, s1lf, s2lf);

            uint32_t s0h = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
                                          : bits(src0[lane], 31, 0);
            uint32_t s1h = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
                                          : bits(src1[lane], 31, 0);
            uint32_t s2h = (opsel_hi & 4) ? bits(src2[lane], 63, 32)
                                          : bits(src2[lane], 31, 0);

            float s0hf = *reinterpret_cast<float*>(&s0h);
            float s1hf = *reinterpret_cast<float*>(&s1h);
            float s2hf = *reinterpret_cast<float*>(&s2h);

            if (neg_hi & 1) s0hf = -s0hf;
            if (neg_hi & 2) s1hf = -s1hf;
            if (neg_hi & 4) s2hf = -s2hf;

            float dword2 = std::fma(s0hf, s1hf, s2hf);

            uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
            uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);

            vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1;
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP3P__V_PK_MUL_F32 class methods ---

Inst_VOP3P__V_PK_MUL_F32::Inst_VOP3P__V_PK_MUL_F32(InFmt_VOP3P *iFmt)
    : Inst_VOP3P(iFmt, "v_pk_mul_f32")
{
    setFlag(ALU);
} // Inst_VOP3P__V_PK_MUL_F32

Inst_VOP3P__V_PK_MUL_F32::~Inst_VOP3P__V_PK_MUL_F32()
{
} // ~Inst_VOP3P__V_PK_MUL_F32

// D.f[63:32] = S0.f[63:32] * S1.f[63:32];
// D.f[31:0] = S0.f[31:0] * S1.f[31:0].
void
Inst_VOP3P__V_PK_MUL_F32::execute(GPUDynInstPtr gpuDynInst)
{
    // This is a special case of packed instructions which operates on
    // 64-bit inputs/outputs and not 32-bit. U64 is used here as float
    // values cannot use bitwise operations. Consider the U64 to imply
    // untyped 64-bits of data.
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
    VecOperandU64 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    int opsel = instData.OPSEL;
    int opsel_hi = extData.OPSEL_HI;

    int neg = extData.NEG;
    int neg_hi = instData.NEG_HI;

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
                                               : bits(src0[lane], 31, 0);
            uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
                                               : bits(src1[lane], 31, 0);

            float ldwordf = *reinterpret_cast<float*>(&lower_dword);
            float udwordf = *reinterpret_cast<float*>(&upper_dword);

            if (neg & 1) ldwordf = -ldwordf;
            if (neg & 2) udwordf = -udwordf;

            float dword1 = ldwordf * udwordf;

            lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
                                         : bits(src0[lane], 31, 0);
            upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
                                         : bits(src1[lane], 31, 0);

            ldwordf = *reinterpret_cast<float*>(&lower_dword);
            udwordf = *reinterpret_cast<float*>(&upper_dword);

            if (neg_hi & 1) ldwordf = -ldwordf;
            if (neg_hi & 2) udwordf = -udwordf;

            float dword2 = ldwordf * udwordf;

            uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
            uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);

            vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1;
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP3P__V_PK_ADD_F32 class methods ---

Inst_VOP3P__V_PK_ADD_F32::Inst_VOP3P__V_PK_ADD_F32(InFmt_VOP3P *iFmt)
    : Inst_VOP3P(iFmt, "v_pk_add_f32")
{
    setFlag(ALU);
} // Inst_VOP3P__V_PK_ADD_F32

Inst_VOP3P__V_PK_ADD_F32::~Inst_VOP3P__V_PK_ADD_F32()
{
} // ~Inst_VOP3P__V_PK_ADD_F32

// D.f[63:32] = S0.f[63:32] + S1.f[63:32];
// D.f[31:0] = S0.f[31:0] + S1.f[31:0].
void
Inst_VOP3P__V_PK_ADD_F32::execute(GPUDynInstPtr gpuDynInst)
{
    // This is a special case of packed instructions which operates on
    // 64-bit inputs/outputs and not 32-bit. U64 is used here as float
    // values cannot use bitwise operations. Consider the U64 to imply
    // untyped 64-bits of data.
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
    VecOperandU64 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
    panic_if(isDPPInst(), "DPP not supported for %s", _opcode);

    int opsel = instData.OPSEL;
    int opsel_hi = extData.OPSEL_HI;

    int neg = extData.NEG;
    int neg_hi = instData.NEG_HI;

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
                                               : bits(src0[lane], 31, 0);
            uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
                                               : bits(src1[lane], 31, 0);

            float ldwordf = *reinterpret_cast<float*>(&lower_dword);
            float udwordf = *reinterpret_cast<float*>(&upper_dword);

            if (neg & 1) ldwordf = -ldwordf;
            if (neg & 2) udwordf = -udwordf;

            float dword1 = ldwordf + udwordf;

            lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
                                         : bits(src0[lane], 31, 0);
            upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
                                         : bits(src1[lane], 31, 0);

            ldwordf = *reinterpret_cast<float*>(&lower_dword);
            udwordf = *reinterpret_cast<float*>(&upper_dword);

            if (neg_hi & 1) ldwordf = -ldwordf;
            if (neg_hi & 2) udwordf = -udwordf;

            float dword2 = ldwordf + udwordf;

            uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
            uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);

            vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1;
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP3P__V_PK_MOV_B32 class methods ---

Inst_VOP3P__V_PK_MOV_B32::Inst_VOP3P__V_PK_MOV_B32(InFmt_VOP3P *iFmt)
    : Inst_VOP3P(iFmt, "v_pk_mov_b32")
{
    setFlag(ALU);
} // Inst_VOP3P__V_PK_MOV_B32

Inst_VOP3P__V_PK_MOV_B32::~Inst_VOP3P__V_PK_MOV_B32()
{
} // ~Inst_VOP3P__V_PK_MOV_B32

// D.u[63:32] = S1.u[31:0]; D.u[31:0] = S0.u[31:0].
void
Inst_VOP3P__V_PK_MOV_B32::execute(GPUDynInstPtr gpuDynInst)
{
    // This is a special case of packed instructions which operates on
    // 64-bit inputs/outputs and not 32-bit.
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
    VecOperandU64 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // Only OPSEL[1:0] are used
    // OPSEL[0] 0/1: Lower dest dword = lower/upper dword of src0
    int opsel = instData.OPSEL;

    warn_if(extData.NEG || instData.NEG_HI,
            "Negative modifier undefined for %s", _opcode);

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            // OPSEL[1] 0/1: Upper dest dword = lower/upper dword of src1
            uint64_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
                                               : bits(src0[lane], 31, 0);
            uint64_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
                                               : bits(src1[lane], 31, 0);

            vdst[lane] = upper_dword << 32 | lower_dword;
        }
    }

    vdst.write();
} // execute

} // namespace VegaISA
} // namespace gem5