gem5 v24.0.0.0
Loading...
Searching...
No Matches
inst_util.hh
Go to the documentation of this file.
1/*
2 * Copyright (c) 2015-2021 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#ifndef __ARCH_VEGA_INSTS_INST_UTIL_HH__
33#define __ARCH_VEGA_INSTS_INST_UTIL_HH__
34
35#include <cmath>
36
39
40namespace gem5
41{
42
43// values for SDWA select operations
/**
 * Selector values for SDWA (sub-dword addressing) operations. Each value
 * names the slice of a 32-bit datum that is selected.
 */
enum SDWASelVals : int
{
    SDWA_BYTE_0 = 0, // byte 0: data[7:0]
    SDWA_BYTE_1 = 1, // byte 1: data[15:8]
    SDWA_BYTE_2 = 2, // byte 2: data[23:16]
    SDWA_BYTE_3 = 3, // byte 3: data[31:24]
    SDWA_WORD_0 = 4, // low word: data[15:0]
    SDWA_WORD_1 = 5, // high word: data[31:16]
    SDWA_DWORD = 6   // whole dword: data[31:0]
};
54
55// values for format of destination bits for SDWA operations
/**
 * Values describing how the destination bits that are NOT selected by an
 * SDWA operation are filled in.
 */
enum SDWADstVals : int
{
    SDWA_UNUSED_PAD = 0,     /* Pad all unused bits with 0 */
    SDWA_UNUSED_SEXT = 1,    /* Sign-extend upper bits; pad lower bits w/ 0 */
    SDWA_UNUSED_PRESERVE = 2 /* Keep the unused bits as they are */
};
62
63// values for DPP operations
// Geometry constants used by the DPP helpers below.
static const int ROW_SIZE{16};  // a row holds 16 registers
static const int NUM_BANKS{4};  // 64 registers in total, 16 per bank
85
86namespace VegaISA
87{
/**
 * Expands a mask to whole 4-bit groups: every 4-bit group of val that has
 * at least one bit set becomes 0xF in the result; all other groups are 0.
 *
 * @param val input mask.
 * @return the group-expanded mask.
 */
template<typename T>
inline T
wholeQuadMode(T val)
{
    T wqm = 0;

    // The group mask walks upwards four bits at a time; the loop ends when
    // the mask has been shifted completely out of T.
    for (T mask = 0xF; mask != 0; mask <<= 4) {
        if ((val & mask) != 0) {
            wqm |= mask;
        }
    }

    return wqm;
}
101
/**
 * Compresses a mask to one bit per 4-bit group: bit i of the result is set
 * when the i-th 4-bit group of val (counting from the least significant
 * group) has at least one bit set.
 *
 * @param val input mask.
 * @return the per-group summary mask.
 */
template<typename T>
inline T
quadMask(T val)
{
    T qmsk = 0;
    T group_bit = 0x1;

    // group_bit advances one position for every four positions of mask, so
    // it always names the group currently being inspected.
    for (T mask = 0xF; mask != 0; mask <<= 4, group_bit <<= 1) {
        if (val & mask) {
            qmsk |= group_bit;
        }
    }

    return qmsk;
}
118
119 template<typename T>
120 inline ScalarRegI32
122 {
123 ScalarRegI32 num_zeros
124 = std::numeric_limits<T>::digits - popCount(val);
125
126 return num_zeros;
127 }
128
129 template<typename T>
130 inline ScalarRegI32
132 {
133 if (val == ~T(0)) {
134 return -1;
135 }
136
137 return findLsbSet(~val);
138 }
139
140 template<typename T>
141 inline ScalarRegI32
143 {
144 if (!val) {
145 return -1;
146 }
147
148 return findLsbSet(val);
149 }
150
151 template<typename T>
152 inline ScalarRegI32
154 {
155 if (!val) {
156 return -1;
157 }
158
159 return findMsbSet(val);
160 }
161
162 template<typename T>
163 inline ScalarRegI32
165 {
166 if (!val) {
167 return -1;
168 }
169
170 return std::numeric_limits<T>::digits - 1 - findMsbSet(val);
171 }
172
173 inline ScalarRegI32
175 {
176 bool found(false);
177 bool sign_bit = (val & 0x80000000) != 0;
178 ScalarRegU32 tmp_val(0);
179 int count(0);
180
181 if (!val || val == -1) {
182 return -1;
183 }
184
185 for (int i = 0; i < std::numeric_limits<ScalarRegU32>::digits; ++i) {
186 tmp_val = val & (0x80000000 >> i);
187
188 if (!sign_bit) {
189 if (tmp_val) {
190 found = true;
191 break;
192 }
193 } else {
194 if (!tmp_val) {
195 found = true;
196 break;
197 }
198 }
199 ++count;
200 }
201
202 if (found) {
203 return count;
204 } else {
205 return -1;
206 }
207 }
208
209 inline ScalarRegI32
211 {
212 bool found(false);
213 bool sign_bit = (val & 0x8000000000000000ULL) != 0;
214 ScalarRegU64 tmp_val(0);
215 int count(0);
216
217 if (!val || val == -1) {
218 return -1;
219 }
220
221 for (int i = 0; i < std::numeric_limits<ScalarRegU64>::digits; ++i) {
222 tmp_val = val & (0x8000000000000000ULL >> i);
223
224 if (!sign_bit) {
225 if (tmp_val) {
226 found = true;
227 break;
228 }
229 } else {
230 if (!tmp_val) {
231 found = true;
232 break;
233 }
234 }
235 ++count;
236 }
237
238 if (found) {
239 return count;
240 } else {
241 return -1;
242 }
243 }
244
/**
 * Returns the median (middle value) of three values.
 *
 * Floating-point types go through std::fmin/std::fmax; every other type
 * uses std::min/std::max. The choice is made at compile time so that only
 * the branch matching T is instantiated.
 *
 * @param val_0 first value.
 * @param val_1 second value.
 * @param val_2 third value.
 * @return the value that lies between the other two.
 */
template<typename T>
inline T
median(T val_0, T val_1, T val_2)
{
    if constexpr (std::is_floating_point_v<T>) {
        return std::fmax(std::fmin(val_0, val_1),
                         std::fmin(std::fmax(val_0, val_1), val_2));
    } else {
        return std::max(std::min(val_0, val_1),
                        std::min(std::max(val_0, val_1), val_2));
    }
}
257
/**
 * Rounds val to the nearest integral value, resolving exact .5 ties
 * towards the even neighbour.
 *
 * The parity of floor(val) is tested with std::fmod instead of an integer
 * cast so that ties whose magnitude exceeds the range of int are still
 * rounded correctly.
 *
 * @param val floating-point value to round.
 * @return val rounded half-to-even, in the same type.
 */
template <typename T>
inline T
roundNearestEven(T val)
{
    T int_part = 0;
    // round-half-up candidate; only wrong for ties above an even integer
    T nearest_round = std::floor(val + 0.5);

    const bool is_tie = (std::modf(std::abs(val), &int_part) == 0.5);

    if (is_tie && std::fmod(std::floor(val), T(2)) == 0) {
        // floor(val) is even, so the tie must go down to it
        nearest_round = nearest_round - 1;
    }

    return nearest_round;
}
270
271 inline VecElemU32
273 VecElemU64 val_2)
274 {
275 __uint128_t u0 = (__uint128_t)val_0;
276 __uint128_t u1 = (__uint128_t)val_1;
277 __uint128_t u2 = (__uint128_t)val_2;
278 __uint128_t result = u0 * u1 + u2;
279
280 dst = (VecElemU64)result;
281
282 return (VecElemU32)(result >> 64) ? 1 : 0;
283 }
284
285 inline VecElemU32
287 VecElemI64 val_2)
288 {
289 __int128_t u0 = (__int128_t)val_0;
290 __int128_t u1 = (__int128_t)val_1;
291 __int128_t u2 = (__int128_t)val_2;
292 __int128_t result = u0 * u1 + u2;
293
294 dst = (VecElemI64)result;
295
296 return (VecElemU32)(result >> 64) ? 1 : 0;
297 }
298
319 inline int
320 dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum,
321 int rowOffset, bool & outOfBounds)
322 {
323 // local variables
324 // newLane will be the same as the input lane unless swizzling happens
325 int newLane = currLane;
326 // for shift/rotate permutations; positive values are LEFT rotates
327 // shift/rotate left means lane n -> lane n-1 (e.g., lane 1 -> lane 0)
328 int count = 0;
329 int localRowOffset = rowOffset;
330 int localRowNum = rowNum;
331
332 if (dppCtrl <= SQ_DPP_QUAD_PERM_MAX) { // DPP_QUAD_PERM{00:FF}
333 int quadBase = (currLane & ~(3));
334 int quadPix = (currLane & 3);
335 quadPix = ((dppCtrl >> (2 * quadPix)) & 3);
336 newLane = (quadBase | quadPix);
337 } else if (dppCtrl == SQ_DPP_RESERVED) {
338 panic("ERROR: instruction using reserved DPP_CTRL value\n");
339 } else if ((dppCtrl >= SQ_DPP_ROW_SL1) &&
340 (dppCtrl <= SQ_DPP_ROW_SL15)) { // DPP_ROW_SL{1:15}
341 count = (dppCtrl - SQ_DPP_ROW_SL1 + 1);
342 if ((localRowOffset + count >= 0) &&
343 (localRowOffset + count < ROW_SIZE)) {
344 localRowOffset += count;
345 newLane = ((rowNum * ROW_SIZE) | localRowOffset);
346 } else {
347 outOfBounds = true;
348 }
349 } else if ((dppCtrl >= SQ_DPP_ROW_SR1) &&
350 (dppCtrl <= SQ_DPP_ROW_SR15)) { // DPP_ROW_SR{1:15}
351 count = -(dppCtrl - SQ_DPP_ROW_SR1 + 1);
352 if ((localRowOffset + count >= 0) &&
353 (localRowOffset + count < ROW_SIZE)) {
354 localRowOffset += count;
355 newLane = ((rowNum * ROW_SIZE) | localRowOffset);
356 } else {
357 outOfBounds = true;
358 }
359 } else if ((dppCtrl >= SQ_DPP_ROW_RR1) &&
360 (dppCtrl <= SQ_DPP_ROW_RR15)) { // DPP_ROW_RR{1:15}
361 count = -(dppCtrl - SQ_DPP_ROW_RR1 + 1);
362 localRowOffset = (localRowOffset + count + ROW_SIZE) % ROW_SIZE;
363 newLane = ((rowNum * ROW_SIZE) | localRowOffset);
364 } else if (dppCtrl == SQ_DPP_WF_SL1) { // DPP_WF_SL1
365 if ((currLane >= 0) && (currLane < NumVecElemPerVecReg)) {
366 newLane += 1;
367 } else {
368 outOfBounds = true;
369 }
370 } else if (dppCtrl == SQ_DPP_WF_RL1) { // DPP_WF_RL1
371 newLane = (currLane - 1 + NumVecElemPerVecReg) %
373 } else if (dppCtrl == SQ_DPP_WF_SR1) { // DPP_WF_SR1
374 int currVal = (currLane - 1);
375 if ((currVal >= 0) && (currVal < NumVecElemPerVecReg)) {
376 newLane -= 1;
377 } else {
378 outOfBounds = true;
379 }
380 } else if (dppCtrl == SQ_DPP_WF_RR1) { // DPP_WF_RR1
381 newLane = (currLane - 1 + NumVecElemPerVecReg) %
383 } else if (dppCtrl == SQ_DPP_ROW_MIRROR) { // DPP_ROW_MIRROR
384 localRowOffset = (15 - localRowOffset);
385 newLane = (rowNum | localRowOffset);
386 } else if (dppCtrl == SQ_DPP_ROW_HALF_MIRROR) { // DPP_ROW_HALF_MIRROR
387 localRowNum = (currLane & -0x7);
388 localRowOffset = (currLane & 0x7);
389 localRowOffset = (7 - localRowNum);
390 newLane = (localRowNum | localRowOffset);
391 } else if (dppCtrl == SQ_DPP_ROW_BCAST15) { // DPP_ROW_BCAST15
392 count = 15;
393 if (currLane > count) {
394 // 0x30 selects which set of 16 lanes to use. We broadcast the
395 // last lane of one set to all lanes of the next set (e.g.,
396 // lane 15 is written to 16-31, 31 to 32-47, 47 to 48-63).
397 newLane = (currLane & 0x30) - 1;
398 } else {
399 outOfBounds = true;
400 }
401 } else if (dppCtrl == SQ_DPP_ROW_BCAST31) { // DPP_ROW_BCAST31
402 count = 31;
403 if (currLane > count) {
404 // 0x20 selects either the upper 32 or lower 32 lanes and
405 // broadcasts the last lane of one set to all lanes of the
406 // next set (e.g., lane 31 is written to 32-63).
407 newLane = (currLane & 0x20) - 1;
408 } else {
409 outOfBounds = true;
410 }
411 } else {
412 panic("Unimplemented DPP control operation: %d\n", dppCtrl);
413 }
414
415 return newLane;
416 }
417
423 template<typename T>
424 void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst,
425 T & src0)
426 {
427 // local variables
428 SqDPPVals dppCtrl = (SqDPPVals)dppInst.DPP_CTRL;
429 int boundCtrl = dppInst.BC;
430 int bankMask = dppInst.BANK_MASK;
431 int rowMask = dppInst.ROW_MASK;
432 // row, bank info to be calculated per lane
433 int rowNum = 0, bankNum = 0, rowOffset = 0;
434 // outLane will be the same as the input lane unless swizzling happens
435 int outLane = 0;
436 bool laneDisabled = false;
437 // flags used for determining if a lane should be written to/reset/etc.
438 bool outOfBounds = false, zeroSrc = false;
439 long long threadValid = 0;
440
447 if (dppInst.SRC0_NEG) {
448 src0.negModifier();
449 }
450
451 if (dppInst.SRC0_ABS) {
452 src0.absModifier();
453 }
454
455 // Need a copy of the original data since we update one lane at a time
456 T src0_copy = src0;
457
458 // iterate over all register lanes, performing steps 2-4
459 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
460 threadValid = (0x1LL << lane);
466 rowNum = (lane / ROW_SIZE);
467 rowOffset = (lane % ROW_SIZE);
468 bankNum = (rowOffset / NUM_BANKS);
469
470 if (((rowMask & (0x1 << rowNum)) == 0) /* row mask */ ||
471 ((bankMask & (0x1 << bankNum)) == 0) /* bank mask */) {
472 laneDisabled = true;
473 }
474
491 if (!laneDisabled) {
492 outLane = dppInstImpl(dppCtrl, lane, rowNum, rowOffset,
493 outOfBounds);
494 }
495
501 if (laneDisabled) {
502 threadValid = 0;
503 } else if (outOfBounds) {
504 if (boundCtrl == 1) {
505 zeroSrc = true;
506 } else {
507 threadValid = 0;
508 }
509 } else if (!gpuDynInst->wavefront()->execMask(lane)) {
510 if (boundCtrl == 1) {
511 zeroSrc = true;
512 } else {
513 threadValid = 0;
514 }
515 }
516
517 if (threadValid != 0 && !outOfBounds && !zeroSrc) {
518 assert(!laneDisabled);
519 src0[lane] = src0_copy[outLane];
520 } else if (zeroSrc) {
521 src0[lane] = 0;
522 }
523
524 // reset for next iteration
525 laneDisabled = false;
526 outOfBounds = false;
527 zeroSrc = false;
528 }
529 }
530
536 template<typename T>
537 void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst,
538 T & src0, T & src1)
539 {
546 if (dppInst.SRC1_NEG) {
547 src1.negModifier();
548 }
549
550 if (dppInst.SRC1_ABS) {
551 src1.absModifier();
552 }
553
554 // Since only difference for VOP1 and VOP2/VOPC instructions is SRC1,
555 // which is only used for negation/absolute value, call other version
556 // to do everything else.
557 processDPP(gpuDynInst, dppInst, src0);
558 }
559
566 template<typename T>
567 T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal,
568 const SDWASelVals sel, const bool signExt)
569 {
570 // local variables
571 int low_bit = 0, high_bit = 0;
572 bool signExt_local = signExt;
573 T retVal = 0;
574
575 // if we're preserving all of the bits, then we can immediately return
576 if (sel == SDWA_DWORD) {
577 return currOperVal;
578 }
579
580 if (sel < SDWA_WORD_0) { // we are selecting 1 byte
581 /*
582 Process byte 0 first. This code eiter selects the original bits
583 of byte 0, or makes the bits of the selected byte be byte 0 (and
584 next either sign extends or zero's out upper bits).
585 */
586 low_bit = (sel * VegaISA::BITS_PER_BYTE);
587 high_bit = low_bit + VegaISA::MSB_PER_BYTE;
588 retVal = bits(currOperVal, high_bit, low_bit);
589
590 // make sure update propagated, since used next
592 bits(origOperVal, high_bit),
593 "ERROR: SDWA byte update not propagated: retVal: %d, "
594 "orig: %d\n", bits(retVal, VegaISA::MSB_PER_BYTE),
595 bits(origOperVal, high_bit));
596 // sign extended value depends on upper-most bit of the new byte 0
597 signExt_local = (signExt &&
598 (bits(retVal, VegaISA::MSB_PER_BYTE, 0) & 0x80));
599
600 // process all other bytes -- if sign extending, make them 1, else
601 // all 0's so leave as is
602 if (signExt_local) {
603 retVal = (uint32_t)sext<VegaISA::MSB_PER_BYTE>(retVal);
604 }
605 } else if (sel < SDWA_DWORD) { // we are selecting 1 word
606 /*
607 Process word 0 first. This code eiter selects the original bits
608 of word 0, or makes the bits of the selected word be word 0 (and
609 next either sign extends or zero's out upper bits).
610 */
611 low_bit = (sel & 1) * VegaISA::BITS_PER_WORD;
612 high_bit = low_bit + VegaISA::MSB_PER_WORD;
613 retVal = bits(currOperVal, high_bit, low_bit);
614
615 // make sure update propagated, since used next
617 bits(origOperVal, high_bit),
618 "ERROR: SDWA word update not propagated: retVal: %d, "
619 "orig: %d\n",
621 bits(origOperVal, high_bit));
622 // sign extended value depends on upper-most bit of the new word 0
623 signExt_local = (signExt &&
624 (bits(retVal, VegaISA::MSB_PER_WORD, 0) &
625 0x8000));
626
627 // process other word -- if sign extending, make them 1, else all
628 // 0's so leave as is
629 if (signExt_local) {
630 retVal = (uint32_t)sext<VegaISA::MSB_PER_WORD>(retVal);
631 }
632 } else {
633 assert(sel != SDWA_DWORD); // should have returned earlier
634 panic("Unimplemented SDWA select operation: %d\n", sel);
635 }
636
637 return retVal;
638 }
639
640
659 template<typename T>
660 void sdwaInstSrcImpl(T & currOper, T & origCurrOper,
661 const SDWASelVals sel, const bool signExt)
662 {
663 // iterate over all lanes, setting appropriate, selected value
664 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
665 currOper[lane] = sdwaInstSrcImpl_helper(currOper[lane],
666 origCurrOper[lane], sel,
667 signExt);
668 }
669 }
670
671
678 template<typename T>
679 T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal,
680 const bool clamp, const SDWASelVals sel,
681 const SDWADstVals unusedBits_format)
682 {
683 // local variables
684 int low_bit = 0, high_bit = 0;
685 bool signExt = (unusedBits_format == SDWA_UNUSED_SEXT);
686 //bool pad = (unusedBits_format == SDWA_UNUSED_PAD);
687 bool preserve = (unusedBits_format == SDWA_UNUSED_PRESERVE);
688 T retVal = 0, origBits_thisByte = 0, currBits_thisByte = 0,
689 origBits_thisWord = 0, currBits_thisWord = 0, newBits = 0;
690
691 // if we're preserving all of the bits, then we can immediately return
692 if (unusedBits_format == SDWA_UNUSED_PRESERVE) {
693 assert(sel == SDWA_DWORD);
694 return currDstVal;
695 } else if (sel == SDWA_DWORD) {
696 // NOTE: users may set the unused bits variable to anything in this
697 // scenario, because it will be ignored
698 return currDstVal;
699 }
700
701 if (sel < SDWA_WORD_0) { // we are selecting 1 byte
702 // if we sign extended depends on upper-most bit of byte 0
703 signExt = (signExt &&
704 (bits(currDstVal, VegaISA::MSB_PER_BYTE, 0) & 0x80));
705
706 for (int byte = 0; byte < 4; ++byte) {
707 low_bit = byte * VegaISA::BITS_PER_BYTE;
708 high_bit = low_bit + VegaISA::MSB_PER_BYTE;
709 /*
710 Options:
711 1. byte == sel: we are keeping all bits in this byte
712 2. preserve is set: keep this byte as is because the
713 output preserve flag is set
714 3. byte > sel && signExt: we're sign extending and
715 this byte is one of the bytes we need to sign extend
716 */
717 origBits_thisByte = bits(origDstVal, VegaISA::MSB_PER_BYTE, 0);
718 currBits_thisByte = bits(currDstVal, high_bit, low_bit);
719 newBits = ((byte == sel) ? origBits_thisByte :
720 ((preserve) ? currBits_thisByte :
721 (((byte > sel) && signExt) ? 0xff : 0)));
722 retVal = insertBits(retVal, high_bit, low_bit, newBits);
723 }
724 } else if (sel < SDWA_DWORD) { // we are selecting 1 word
725 low_bit = 0;
726 high_bit = low_bit + VegaISA::MSB_PER_WORD;
727 // if we sign extended depends on upper-most bit of word 0
728 signExt = (signExt &&
729 (bits(currDstVal, high_bit, low_bit) & 0x8000));
730
731 for (int word = 0; word < 2; ++word) {
732 low_bit = word * VegaISA::BITS_PER_WORD;
733 high_bit = low_bit + VegaISA::MSB_PER_WORD;
734 /*
735 Options:
736 1. word == sel & 1: we are keeping all bits in this word
737 2. preserve is set: keep this word as is because the
738 output preserve flag is set
739 3. word > (sel & 1) && signExt: we're sign extending and
740 this word is one of the words we need to sign extend
741 */
742 origBits_thisWord = bits(origDstVal, VegaISA::MSB_PER_WORD, 0);
743 currBits_thisWord = bits(currDstVal, high_bit, low_bit);
744 newBits = ((word == (sel & 0x1)) ? origBits_thisWord :
745 ((preserve) ? currBits_thisWord :
746 (((word > (sel & 0x1)) && signExt) ? 0xffff : 0)));
747 retVal = insertBits(retVal, high_bit, low_bit, newBits);
748 }
749 } else {
750 assert(sel != SDWA_DWORD); // should have returned earlier
751 panic("Unimplemented SDWA select operation: %d\n", sel);
752 }
753
754 return retVal;
755 }
756
757
779 template<typename T>
780 void sdwaInstDstImpl(T & dstOper, T & origDstOper, const bool clamp,
781 const SDWASelVals sel,
782 const SDWADstVals unusedBits_format)
783 {
784 // iterate over all lanes, setting appropriate, selected value
785 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
786 dstOper[lane] = sdwaInstDstImpl_helper(dstOper[lane],
787 origDstOper[lane], clamp,
788 sel, unusedBits_format);
789 }
790 }
791
792
800 template<typename T>
801 void processSDWA_src_helper(T & currSrc, T & origCurrSrc,
802 const SDWASelVals src_sel,
803 const bool src_signExt, const bool src_abs,
804 const bool src_neg)
805 {
813 if (src_neg) {
814 currSrc.negModifier();
815 }
816
817 if (src_abs) {
818 currSrc.absModifier();
819 }
820
824 sdwaInstSrcImpl(currSrc, origCurrSrc, src_sel, src_signExt);
825 }
826
827
835 template<typename T>
836 void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0)
837 {
838 // local variables
839 const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
840 const bool src0_signExt = sdwaInst.SRC0_SEXT;
841 const bool src0_neg = sdwaInst.SRC0_NEG;
842 const bool src0_abs = sdwaInst.SRC0_ABS;
843
844 // NOTE: difference between VOP1 and VOP2/VOPC is that there is no src1
845 // operand. So ensure that SRC1 fields are not set, then call helper
846 // function only on src0.
847 assert(!sdwaInst.SRC1_SEXT);
848 assert(!sdwaInst.SRC1_NEG);
849 assert(!sdwaInst.SRC1_ABS);
850
851 processSDWA_src_helper(src0, origSrc0, src0_sel, src0_signExt,
852 src0_abs, src0_neg);
853 }
854
855
863 template<typename T>
864 void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0,
865 T & src1, T & origSrc1)
866 {
867 // local variables
868 const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
869 const bool src0_signExt = sdwaInst.SRC0_SEXT;
870 const bool src0_neg = sdwaInst.SRC0_NEG;
871 const bool src0_abs = sdwaInst.SRC0_ABS;
872 const SDWASelVals src1_sel = (SDWASelVals)sdwaInst.SRC1_SEL;
873 const bool src1_signExt = sdwaInst.SRC1_SEXT;
874 const bool src1_neg = sdwaInst.SRC1_NEG;
875 const bool src1_abs = sdwaInst.SRC1_ABS;
876
877 processSDWA_src_helper(src0, origSrc0, src0_sel, src0_signExt,
878 src0_abs, src0_neg);
879 processSDWA_src_helper(src1, origSrc1, src1_sel, src1_signExt,
880 src1_abs, src1_neg);
881 }
882
883
891 template<typename T>
892 void processSDWA_dst(InFmt_VOP_SDWA sdwaInst, T & dst, T & origDst)
893 {
894 // local variables
895 const SDWADstVals dst_unusedBits_format =
896 (SDWADstVals)sdwaInst.DST_U;
897 const SDWASelVals dst_sel = (SDWASelVals)sdwaInst.DST_SEL;
898 const bool clamp = sdwaInst.CLMP;
899
904 sdwaInstDstImpl(dst, origDst, clamp, dst_sel, dst_unusedBits_format);
905 }
906} // namespace VegaISA
907} // namespace gem5
908
909#endif // __ARCH_VEGA_INSTS_INST_UTIL_HH__
constexpr int findMsbSet(uint64_t val)
Returns the bit position of the MSB that is set in the input.
Definition bitfield.hh:279
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
Definition bitfield.hh:79
constexpr int popCount(uint64_t val)
Returns the number of set ones in the provided value.
Definition bitfield.hh:415
constexpr T insertBits(T val, unsigned first, unsigned last, B bit_val)
Returns val with bits first to last set to the LSBs of bit_val.
Definition bitfield.hh:185
constexpr uint64_t sext(uint64_t val)
Sign-extend an N-bit value to 64 bits.
Definition bitfield.hh:129
constexpr int findLsbSet(uint64_t val)
Returns the bit position of the LSB that is set in the input That function will either use a builtin ...
Definition bitfield.hh:369
#define panic(...)
This implements a cprintf based panic() function.
Definition logging.hh:188
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Definition logging.hh:214
Bitfield< 3, 0 > mask
Definition pcstate.hh:63
Bitfield< 7 > i
Definition misc_types.hh:67
constexpr unsigned NumVecElemPerVecReg
Definition vec.hh:61
ScalarRegI32 countZeroBitsMsb(T val)
Definition inst_util.hh:164
T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal, const bool clamp, const SDWASelVals sel, const SDWADstVals unusedBits_format)
sdwaInstDstImpl_helper contains the per-lane code for selecting the appropriate bytes/words of the la...
Definition inst_util.hh:679
T quadMask(T val)
Definition inst_util.hh:104
ScalarRegI32 firstOppositeSignBit(ScalarRegI32 val)
Definition inst_util.hh:174
ScalarRegI32 findFirstZero(T val)
Definition inst_util.hh:131
uint64_t ScalarRegU64
ScalarRegI32 findFirstOne(T val)
Definition inst_util.hh:142
const int BITS_PER_WORD
T median(T val_0, T val_1, T val_2)
Definition inst_util.hh:247
ScalarRegI32 findFirstOneMsb(T val)
Definition inst_util.hh:153
const int MSB_PER_BYTE
T roundNearestEven(T val)
Definition inst_util.hh:259
void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T &src0, T &origSrc0)
processSDWA_src is a helper function for implementing sub d-word addressing instructions for the src ...
Definition inst_util.hh:836
uint32_t VecElemU32
int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum, int rowOffset, bool &outOfBounds)
dppInstImpl is a helper function that performs the inputted operation on the inputted vector register...
Definition inst_util.hh:320
void sdwaInstSrcImpl(T &currOper, T &origCurrOper, const SDWASelVals sel, const bool signExt)
sdwaInstSrcImpl is a helper function that selects the appropriate bits/bytes for each lane of the inp...
Definition inst_util.hh:660
void processSDWA_dst(InFmt_VOP_SDWA sdwaInst, T &dst, T &origDst)
processSDWA_dst is a helper function for implementing sub d-word addressing instructions for the dst ...
Definition inst_util.hh:892
T wholeQuadMode(T val)
Definition inst_util.hh:90
uint64_t VecElemU64
const int MSB_PER_WORD
VecElemU32 muladd(VecElemU64 &dst, VecElemU32 val_0, VecElemU32 val_1, VecElemU64 val_2)
Definition inst_util.hh:272
const int BITS_PER_BYTE
ScalarRegI32 countZeroBits(T val)
Definition inst_util.hh:121
T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal, const SDWASelVals sel, const bool signExt)
sdwaInstSrcImpl_helper contains the per-lane code for selecting the appropriate bytes/words of the la...
Definition inst_util.hh:567
void processSDWA_src_helper(T &currSrc, T &origCurrSrc, const SDWASelVals src_sel, const bool src_signExt, const bool src_abs, const bool src_neg)
processSDWA_srcHelper is a helper function for implementing sub d-word addressing instructions for th...
Definition inst_util.hh:801
uint32_t ScalarRegU32
void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst, T &src0)
processDPP is a helper function for implementing Data Parallel Primitive instructions.
Definition inst_util.hh:424
void sdwaInstDstImpl(T &dstOper, T &origDstOper, const bool clamp, const SDWASelVals sel, const SDWADstVals unusedBits_format)
sdwaInstDestImpl is a helper function that selects the appropriate bits/bytes for the inputted dest o...
Definition inst_util.hh:780
Bitfield< 63 > val
Definition misc.hh:804
Copyright (c) 2024 - Pranith Kumar Copyright (c) 2020 Inria All rights reserved.
Definition binary32.hh:36
static const int NUM_BANKS
Definition inst_util.hh:84
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition misc.hh:49
static const int ROW_SIZE
Definition inst_util.hh:83
SDWADstVals
Definition inst_util.hh:57
@ SDWA_UNUSED_PRESERVE
Definition inst_util.hh:60
@ SDWA_UNUSED_SEXT
Definition inst_util.hh:59
@ SDWA_UNUSED_PAD
Definition inst_util.hh:58
SDWASelVals
Definition inst_util.hh:45
@ SDWA_BYTE_2
Definition inst_util.hh:48
@ SDWA_WORD_0
Definition inst_util.hh:50
@ SDWA_BYTE_0
Definition inst_util.hh:46
@ SDWA_BYTE_3
Definition inst_util.hh:49
@ SDWA_BYTE_1
Definition inst_util.hh:47
@ SDWA_DWORD
Definition inst_util.hh:52
@ SDWA_WORD_1
Definition inst_util.hh:51
SqDPPVals
Definition inst_util.hh:65
@ SQ_DPP_ROW_RR15
Definition inst_util.hh:73
@ SQ_DPP_ROW_MIRROR
Definition inst_util.hh:78
@ SQ_DPP_ROW_SR1
Definition inst_util.hh:70
@ SQ_DPP_ROW_SL15
Definition inst_util.hh:69
@ SQ_DPP_QUAD_PERM_MAX
Definition inst_util.hh:66
@ SQ_DPP_WF_RL1
Definition inst_util.hh:75
@ SQ_DPP_ROW_BCAST15
Definition inst_util.hh:80
@ SQ_DPP_ROW_SL1
Definition inst_util.hh:68
@ SQ_DPP_ROW_HALF_MIRROR
Definition inst_util.hh:79
@ SQ_DPP_ROW_RR1
Definition inst_util.hh:72
@ SQ_DPP_WF_SL1
Definition inst_util.hh:74
@ SQ_DPP_WF_SR1
Definition inst_util.hh:76
@ SQ_DPP_RESERVED
Definition inst_util.hh:67
@ SQ_DPP_ROW_SR15
Definition inst_util.hh:71
@ SQ_DPP_WF_RR1
Definition inst_util.hh:77
@ SQ_DPP_ROW_BCAST31
Definition inst_util.hh:81

Generated on Tue Jun 18 2024 16:23:41 for gem5 by doxygen 1.11.0