gem5 v23.0.0.1
Loading...
Searching...
No Matches
inst_util.hh
Go to the documentation of this file.
1/*
2 * Copyright (c) 2015-2021 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#ifndef __ARCH_VEGA_INSTS_INST_UTIL_HH__
33#define __ARCH_VEGA_INSTS_INST_UTIL_HH__
34
35#include <cmath>
36
38
39namespace gem5
40{
41
/**
 * Select encodings for SDWA (sub-dword addressing) operations: which byte
 * or 16-bit word of a 32-bit operand is used.
 */
enum SDWASelVals : int
{
    SDWA_BYTE_0 = 0, /* select data[7:0] */
    SDWA_BYTE_1 = 1, /* select data[15:8] */
    SDWA_BYTE_2 = 2, /* select data[23:16] */
    SDWA_BYTE_3 = 3, /* select data[31:24] */
    SDWA_WORD_0 = 4, /* select data[15:0] */
    SDWA_WORD_1 = 5, /* select data[31:16] */
    SDWA_DWORD  = 6  /* select data[31:0] */
};
53
/**
 * Encodings that say how an SDWA operation fills the destination bits that
 * were not selected by the destination select.
 */
enum SDWADstVals : int
{
    SDWA_UNUSED_PAD = 0,      /* Pad all unused bits with 0 */
    SDWA_UNUSED_SEXT = 1,     /* Sign-extend upper bits; pad lower bits w/ 0 */
    SDWA_UNUSED_PRESERVE = 2  /* Leave the unused bits as they are */
};
61
/**
 * DPP_CTRL encodings for DPP (data parallel primitive) operations.
 * Every value from 0 up to and including SQ_DPP_QUAD_PERM_MAX is a quad
 * permute whose four 2-bit fields pick the source lane inside each quad.
 */
enum SqDPPVals : int
{
    SQ_DPP_QUAD_PERM_MAX = 0xFF,     /* DPP_QUAD_PERM{00:FF} */
    SQ_DPP_RESERVED = 0x100,
    SQ_DPP_ROW_SL1 = 0x101,          /* row shift left by 1 ... */
    SQ_DPP_ROW_SL15 = 0x10F,         /* ... up to 15 */
    SQ_DPP_ROW_SR1 = 0x111,          /* row shift right by 1 ... */
    SQ_DPP_ROW_SR15 = 0x11F,         /* ... up to 15 */
    SQ_DPP_ROW_RR1 = 0x121,          /* row rotate right by 1 ... */
    SQ_DPP_ROW_RR15 = 0x12F,         /* ... up to 15 */
    SQ_DPP_WF_SL1 = 0x130,           /* wavefront shift left by 1 */
    SQ_DPP_WF_RL1 = 0x134,           /* wavefront rotate left by 1 */
    SQ_DPP_WF_SR1 = 0x138,           /* wavefront shift right by 1 */
    SQ_DPP_WF_RR1 = 0x13C,           /* wavefront rotate right by 1 */
    SQ_DPP_ROW_MIRROR = 0x140,       /* mirror lanes within a row */
    SQ_DPP_ROW_HALF_MIRROR = 0x141,  /* mirror lanes within a half row */
    SQ_DPP_ROW_BCAST15 = 0x142,      /* broadcast lane 15 of each row */
    SQ_DPP_ROW_BCAST31 = 0x143       /* broadcast lane 31 */
};
// Geometry used by the DPP helpers: a lane's row is lane / ROW_SIZE and its
// bank is (lane % ROW_SIZE) / NUM_BANKS.
static const int ROW_SIZE = 16; /* 16 registers per row */
static const int NUM_BANKS = 4; /* 64 registers, 16/bank */
84
85namespace VegaISA
86{
/**
 * Expand a mask to whole quads: every 4-bit group of val that has at
 * least one bit set becomes 0xF in the result, every other group is 0.
 *
 * @param val input mask; intended for unsigned integer types
 * @return the quad-expanded mask
 */
template<typename T>
inline T
wholeQuadMode(T val)
{
    T wqm = 0;

    // the nibble mask is shifted out of T after the last quad, which
    // ends the loop
    for (T mask = 0xF; mask != 0; mask <<= 4) {
        if ((val & mask) != 0) {
            wqm |= mask;
        }
    }

    return wqm;
}
100
/**
 * Collapse a mask to one bit per quad: bit i of the result is set when
 * any bit of the i-th 4-bit group of val is set.
 *
 * @param val input mask; intended for unsigned integer types
 * @return the per-quad summary mask
 */
template<typename T>
inline T
quadMask(T val)
{
    T qmsk = 0;
    T qbit = 0x1;

    // walk the nibbles from least to most significant; qbit tracks the
    // result bit that belongs to the current nibble
    for (T mask = 0xF; mask != 0; mask <<= 4, qbit <<= 1) {
        if ((val & mask) != 0) {
            qmsk |= qbit;
        }
    }

    return qmsk;
}
117
118 template<typename T>
119 inline ScalarRegI32
121 {
122 ScalarRegI32 num_zeros
123 = std::numeric_limits<T>::digits - popCount(val);
124
125 return num_zeros;
126 }
127
128 template<typename T>
129 inline ScalarRegI32
131 {
132 if (val == ~T(0)) {
133 return -1;
134 }
135
136 return findLsbSet(~val);
137 }
138
139 template<typename T>
140 inline ScalarRegI32
142 {
143 if (!val) {
144 return -1;
145 }
146
147 return findLsbSet(val);
148 }
149
150 template<typename T>
151 inline ScalarRegI32
153 {
154 if (!val) {
155 return -1;
156 }
157
158 return findMsbSet(val);
159 }
160
161 template<typename T>
162 inline ScalarRegI32
164 {
165 if (!val) {
166 return -1;
167 }
168
169 return std::numeric_limits<T>::digits - 1 - findMsbSet(val);
170 }
171
172 inline ScalarRegI32
174 {
175 bool found(false);
176 bool sign_bit = (val & 0x80000000) != 0;
177 ScalarRegU32 tmp_val(0);
178 int count(0);
179
180 if (!val || val == -1) {
181 return -1;
182 }
183
184 for (int i = 0; i < std::numeric_limits<ScalarRegU32>::digits; ++i) {
185 tmp_val = val & (0x80000000 >> i);
186
187 if (!sign_bit) {
188 if (tmp_val) {
189 found = true;
190 break;
191 }
192 } else {
193 if (!tmp_val) {
194 found = true;
195 break;
196 }
197 }
198 ++count;
199 }
200
201 if (found) {
202 return count;
203 } else {
204 return -1;
205 }
206 }
207
208 inline ScalarRegI32
210 {
211 bool found(false);
212 bool sign_bit = (val & 0x8000000000000000ULL) != 0;
213 ScalarRegU64 tmp_val(0);
214 int count(0);
215
216 if (!val || val == -1) {
217 return -1;
218 }
219
220 for (int i = 0; i < std::numeric_limits<ScalarRegU64>::digits; ++i) {
221 tmp_val = val & (0x8000000000000000ULL >> i);
222
223 if (!sign_bit) {
224 if (tmp_val) {
225 found = true;
226 break;
227 }
228 } else {
229 if (!tmp_val) {
230 found = true;
231 break;
232 }
233 }
234 ++count;
235 }
236
237 if (found) {
238 return count;
239 } else {
240 return -1;
241 }
242 }
243
/**
 * Return the median (middle value) of three values.
 *
 * Floating-point types go through std::fmin/std::fmax; all other types
 * use std::min/std::max. The choice is made at compile time so that
 * integer instantiations never route their operands through the
 * floating-point overloads.
 *
 * @param val_0 first value
 * @param val_1 second value
 * @param val_2 third value
 * @return the value that is neither the smallest nor the largest
 */
template<typename T>
inline T
median(T val_0, T val_1, T val_2)
{
    if constexpr (std::is_floating_point_v<T>) {
        const T low = std::fmin(val_0, val_1);
        const T high = std::fmax(val_0, val_1);

        return std::fmax(low, std::fmin(high, val_2));
    } else {
        const T low = std::min(val_0, val_1);
        const T high = std::max(val_0, val_1);

        return std::max(low, std::min(high, val_2));
    }
}
256
/**
 * Round a floating-point value to the nearest integral value, with exact
 * halfway cases going to the even neighbour.
 *
 * The result is derived from floor(val) and the exact fractional part,
 * so it does not depend on converting val to int (undefined for values
 * outside the int range) and does not add 0.5 to val (which rounds up
 * values just below one half).
 *
 * @param val floating-point value to round
 * @return the rounded value; NaN and infinities are returned unchanged
 */
template <typename T>
inline T
roundNearestEven(T val)
{
    const T lower = std::floor(val);
    const T frac = val - lower;

    if (frac < T(0.5)) {
        return lower;
    }

    if (frac > T(0.5)) {
        return lower + T(1);
    }

    // Exact tie: keep whichever neighbour is even. NaN and infinities
    // also arrive here because both comparisons above are false for
    // them; lower + 1 then yields NaN or the same infinity.
    return (std::fmod(lower, T(2)) == T(0)) ? lower : lower + T(1);
}
269
270 inline VecElemU32
272 VecElemU64 val_2)
273 {
274 __uint128_t u0 = (__uint128_t)val_0;
275 __uint128_t u1 = (__uint128_t)val_1;
276 __uint128_t u2 = (__uint128_t)val_2;
277 __uint128_t result = u0 * u1 + u2;
278
279 dst = (VecElemU64)result;
280
281 return (VecElemU32)(result >> 64) ? 1 : 0;
282 }
283
284 inline VecElemU32
286 VecElemI64 val_2)
287 {
288 __int128_t u0 = (__int128_t)val_0;
289 __int128_t u1 = (__int128_t)val_1;
290 __int128_t u2 = (__int128_t)val_2;
291 __int128_t result = u0 * u1 + u2;
292
293 dst = (VecElemI64)result;
294
295 return (VecElemU32)(result >> 64) ? 1 : 0;
296 }
297
318 int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum,
319 int rowOffset, bool & outOfBounds)
320 {
321 // local variables
322 // newLane will be the same as the input lane unless swizzling happens
323 int newLane = currLane;
324 // for shift/rotate permutations; positive values are LEFT rotates
325 // shift/rotate left means lane n -> lane n-1 (e.g., lane 1 -> lane 0)
326 int count = 0;
327 int localRowOffset = rowOffset;
328 int localRowNum = rowNum;
329
330 if (dppCtrl <= SQ_DPP_QUAD_PERM_MAX) { // DPP_QUAD_PERM{00:FF}
331 int quadBase = (currLane & ~(3));
332 int quadPix = (currLane & 3);
333 quadPix = ((dppCtrl >> (2 * quadPix)) & 3);
334 newLane = (quadBase | quadPix);
335 } else if (dppCtrl == SQ_DPP_RESERVED) {
336 panic("ERROR: instruction using reserved DPP_CTRL value\n");
337 } else if ((dppCtrl >= SQ_DPP_ROW_SL1) &&
338 (dppCtrl <= SQ_DPP_ROW_SL15)) { // DPP_ROW_SL{1:15}
339 count = (dppCtrl - SQ_DPP_ROW_SL1 + 1);
340 if ((localRowOffset + count >= 0) &&
341 (localRowOffset + count < ROW_SIZE)) {
342 localRowOffset += count;
343 newLane = ((rowNum * ROW_SIZE) | localRowOffset);
344 } else {
345 outOfBounds = true;
346 }
347 } else if ((dppCtrl >= SQ_DPP_ROW_SR1) &&
348 (dppCtrl <= SQ_DPP_ROW_SR15)) { // DPP_ROW_SR{1:15}
349 count = -(dppCtrl - SQ_DPP_ROW_SR1 + 1);
350 if ((localRowOffset + count >= 0) &&
351 (localRowOffset + count < ROW_SIZE)) {
352 localRowOffset += count;
353 newLane = ((rowNum * ROW_SIZE) | localRowOffset);
354 } else {
355 outOfBounds = true;
356 }
357 } else if ((dppCtrl >= SQ_DPP_ROW_RR1) &&
358 (dppCtrl <= SQ_DPP_ROW_RR15)) { // DPP_ROW_RR{1:15}
359 count = -(dppCtrl - SQ_DPP_ROW_RR1 + 1);
360 localRowOffset = (localRowOffset + count + ROW_SIZE) % ROW_SIZE;
361 newLane = ((rowNum * ROW_SIZE) | localRowOffset);
362 } else if (dppCtrl == SQ_DPP_WF_SL1) { // DPP_WF_SL1
363 if ((currLane >= 0) && (currLane < NumVecElemPerVecReg)) {
364 newLane += 1;
365 } else {
366 outOfBounds = true;
367 }
368 } else if (dppCtrl == SQ_DPP_WF_RL1) { // DPP_WF_RL1
369 newLane = (currLane - 1 + NumVecElemPerVecReg) %
371 } else if (dppCtrl == SQ_DPP_WF_SR1) { // DPP_WF_SR1
372 int currVal = (currLane - 1);
373 if ((currVal >= 0) && (currVal < NumVecElemPerVecReg)) {
374 newLane -= 1;
375 } else {
376 outOfBounds = true;
377 }
378 } else if (dppCtrl == SQ_DPP_WF_RR1) { // DPP_WF_RR1
379 newLane = (currLane - 1 + NumVecElemPerVecReg) %
381 } else if (dppCtrl == SQ_DPP_ROW_MIRROR) { // DPP_ROW_MIRROR
382 localRowOffset = (15 - localRowOffset);
383 newLane = (rowNum | localRowOffset);
384 } else if (dppCtrl == SQ_DPP_ROW_HALF_MIRROR) { // DPP_ROW_HALF_MIRROR
385 localRowNum = (currLane & -0x7);
386 localRowOffset = (currLane & 0x7);
387 localRowOffset = (7 - localRowNum);
388 newLane = (localRowNum | localRowOffset);
389 } else if (dppCtrl == SQ_DPP_ROW_BCAST15) { // DPP_ROW_BCAST15
390 count = 15;
391 if (currLane > count) {
392 // 0x30 selects which set of 16 lanes to use. We broadcast the
393 // last lane of one set to all lanes of the next set (e.g.,
394 // lane 15 is written to 16-31, 31 to 32-47, 47 to 48-63).
395 newLane = (currLane & 0x30) - 1;
396 } else {
397 outOfBounds = true;
398 }
399 } else if (dppCtrl == SQ_DPP_ROW_BCAST31) { // DPP_ROW_BCAST31
400 count = 31;
401 if (currLane > count) {
402 // 0x20 selects either the upper 32 or lower 32 lanes and
403 // broadcasts the last lane of one set to all lanes of the
404 // next set (e.g., lane 31 is written to 32-63).
405 newLane = (currLane & 0x20) - 1;
406 } else {
407 outOfBounds = true;
408 }
409 } else {
410 panic("Unimplemented DPP control operation: %d\n", dppCtrl);
411 }
412
413 return newLane;
414 }
415
421 template<typename T>
422 void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst,
423 T & src0)
424 {
425 // local variables
426 SqDPPVals dppCtrl = (SqDPPVals)dppInst.DPP_CTRL;
427 int boundCtrl = dppInst.BC;
428 int bankMask = dppInst.BANK_MASK;
429 int rowMask = dppInst.ROW_MASK;
430 // row, bank info to be calculated per lane
431 int rowNum = 0, bankNum = 0, rowOffset = 0;
432 // outLane will be the same as the input lane unless swizzling happens
433 int outLane = 0;
434 bool laneDisabled = false;
435 // flags used for determining if a lane should be written to/reset/etc.
436 bool outOfBounds = false, zeroSrc = false;
437 long long threadValid = 0;
438
445 if (dppInst.SRC0_NEG) {
446 src0.negModifier();
447 }
448
449 if (dppInst.SRC0_ABS) {
450 src0.absModifier();
451 }
452
453 // Need a copy of the original data since we update one lane at a time
454 T src0_copy = src0;
455
456 // iterate over all register lanes, performing steps 2-4
457 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
458 threadValid = (0x1LL << lane);
464 rowNum = (lane / ROW_SIZE);
465 rowOffset = (lane % ROW_SIZE);
466 bankNum = (rowOffset / NUM_BANKS);
467
468 if (((rowMask & (0x1 << rowNum)) == 0) /* row mask */ ||
469 ((bankMask & (0x1 << bankNum)) == 0) /* bank mask */) {
470 laneDisabled = true;
471 }
472
489 if (!laneDisabled) {
490 outLane = dppInstImpl(dppCtrl, lane, rowNum, rowOffset,
491 outOfBounds);
492 }
493
499 if (laneDisabled) {
500 threadValid = 0;
501 } else if (outOfBounds) {
502 if (boundCtrl == 1) {
503 zeroSrc = true;
504 } else {
505 threadValid = 0;
506 }
507 } else if (!gpuDynInst->wavefront()->execMask(lane)) {
508 if (boundCtrl == 1) {
509 zeroSrc = true;
510 } else {
511 threadValid = 0;
512 }
513 }
514
515 if (threadValid != 0 && !outOfBounds && !zeroSrc) {
516 assert(!laneDisabled);
517 src0[lane] = src0_copy[outLane];
518 } else if (zeroSrc) {
519 src0[lane] = 0;
520 }
521
522 // reset for next iteration
523 laneDisabled = false;
524 outOfBounds = false;
525 zeroSrc = false;
526 }
527 }
528
534 template<typename T>
535 void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst,
536 T & src0, T & src1)
537 {
544 if (dppInst.SRC1_NEG) {
545 src1.negModifier();
546 }
547
548 if (dppInst.SRC1_ABS) {
549 src1.absModifier();
550 }
551
552 // Since only difference for VOP1 and VOP2/VOPC instructions is SRC1,
553 // which is only used for negation/absolute value, call other version
554 // to do everything else.
555 processDPP(gpuDynInst, dppInst, src0);
556 }
557
564 template<typename T>
565 T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal,
566 const SDWASelVals sel, const bool signExt)
567 {
568 // local variables
569 int low_bit = 0, high_bit = 0;
570 bool signExt_local = signExt;
571 T retVal = 0;
572
573 // if we're preserving all of the bits, then we can immediately return
574 if (sel == SDWA_DWORD) {
575 return currOperVal;
576 }
577
578 if (sel < SDWA_WORD_0) { // we are selecting 1 byte
579 /*
580 Process byte 0 first. This code eiter selects the original bits
581 of byte 0, or makes the bits of the selected byte be byte 0 (and
582 next either sign extends or zero's out upper bits).
583 */
584 low_bit = (sel * VegaISA::BITS_PER_BYTE);
585 high_bit = low_bit + VegaISA::MSB_PER_BYTE;
586 retVal = bits(currOperVal, high_bit, low_bit);
587
588 // make sure update propagated, since used next
590 bits(origOperVal, high_bit),
591 "ERROR: SDWA byte update not propagated: retVal: %d, "
592 "orig: %d\n", bits(retVal, VegaISA::MSB_PER_BYTE),
593 bits(origOperVal, high_bit));
594 // sign extended value depends on upper-most bit of the new byte 0
595 signExt_local = (signExt &&
596 (bits(retVal, VegaISA::MSB_PER_BYTE, 0) & 0x80));
597
598 // process all other bytes -- if sign extending, make them 1, else
599 // all 0's so leave as is
600 if (signExt_local) {
601 retVal = (uint32_t)sext<VegaISA::MSB_PER_BYTE>(retVal);
602 }
603 } else if (sel < SDWA_DWORD) { // we are selecting 1 word
604 /*
605 Process word 0 first. This code eiter selects the original bits
606 of word 0, or makes the bits of the selected word be word 0 (and
607 next either sign extends or zero's out upper bits).
608 */
609 low_bit = (sel & 1) * VegaISA::BITS_PER_WORD;
610 high_bit = low_bit + VegaISA::MSB_PER_WORD;
611 retVal = bits(currOperVal, high_bit, low_bit);
612
613 // make sure update propagated, since used next
615 bits(origOperVal, high_bit),
616 "ERROR: SDWA word update not propagated: retVal: %d, "
617 "orig: %d\n",
619 bits(origOperVal, high_bit));
620 // sign extended value depends on upper-most bit of the new word 0
621 signExt_local = (signExt &&
622 (bits(retVal, VegaISA::MSB_PER_WORD, 0) &
623 0x8000));
624
625 // process other word -- if sign extending, make them 1, else all
626 // 0's so leave as is
627 if (signExt_local) {
628 retVal = (uint32_t)sext<VegaISA::MSB_PER_WORD>(retVal);
629 }
630 } else {
631 assert(sel != SDWA_DWORD); // should have returned earlier
632 panic("Unimplemented SDWA select operation: %d\n", sel);
633 }
634
635 return retVal;
636 }
637
638
657 template<typename T>
658 void sdwaInstSrcImpl(T & currOper, T & origCurrOper,
659 const SDWASelVals sel, const bool signExt)
660 {
661 // iterate over all lanes, setting appropriate, selected value
662 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
663 currOper[lane] = sdwaInstSrcImpl_helper(currOper[lane],
664 origCurrOper[lane], sel,
665 signExt);
666 }
667 }
668
669
676 template<typename T>
677 T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal,
678 const bool clamp, const SDWASelVals sel,
679 const SDWADstVals unusedBits_format)
680 {
681 // local variables
682 int low_bit = 0, high_bit = 0;
683 bool signExt = (unusedBits_format == SDWA_UNUSED_SEXT);
684 //bool pad = (unusedBits_format == SDWA_UNUSED_PAD);
685 bool preserve = (unusedBits_format == SDWA_UNUSED_PRESERVE);
686 T retVal = 0, origBits_thisByte = 0, currBits_thisByte = 0,
687 origBits_thisWord = 0, currBits_thisWord = 0, newBits = 0;
688
689 // if we're preserving all of the bits, then we can immediately return
690 if (unusedBits_format == SDWA_UNUSED_PRESERVE) {
691 assert(sel == SDWA_DWORD);
692 return currDstVal;
693 } else if (sel == SDWA_DWORD) {
694 // NOTE: users may set the unused bits variable to anything in this
695 // scenario, because it will be ignored
696 return currDstVal;
697 }
698
699 if (sel < SDWA_WORD_0) { // we are selecting 1 byte
700 // if we sign extended depends on upper-most bit of byte 0
701 signExt = (signExt &&
702 (bits(currDstVal, VegaISA::MSB_PER_WORD, 0) & 0x80));
703
704 for (int byte = 0; byte < 4; ++byte) {
705 low_bit = byte * VegaISA::BITS_PER_BYTE;
706 high_bit = low_bit + VegaISA::MSB_PER_BYTE;
707 /*
708 Options:
709 1. byte == sel: we are keeping all bits in this byte
710 2. preserve is set: keep this byte as is because the
711 output preserve flag is set
712 3. byte > sel && signExt: we're sign extending and
713 this byte is one of the bytes we need to sign extend
714 */
715 origBits_thisByte = bits(origDstVal, high_bit, low_bit);
716 currBits_thisByte = bits(currDstVal, high_bit, low_bit);
717 newBits = ((byte == sel) ? origBits_thisByte :
718 ((preserve) ? currBits_thisByte :
719 (((byte > sel) && signExt) ? 0xff : 0)));
720 retVal = insertBits(retVal, high_bit, low_bit, newBits);
721 }
722 } else if (sel < SDWA_DWORD) { // we are selecting 1 word
723 low_bit = 0;
724 high_bit = low_bit + VegaISA::MSB_PER_WORD;
725 // if we sign extended depends on upper-most bit of word 0
726 signExt = (signExt &&
727 (bits(currDstVal, high_bit, low_bit) & 0x8000));
728
729 for (int word = 0; word < 2; ++word) {
730 low_bit = word * VegaISA::BITS_PER_WORD;
731 high_bit = low_bit + VegaISA::MSB_PER_WORD;
732 /*
733 Options:
734 1. word == sel & 1: we are keeping all bits in this word
735 2. preserve is set: keep this word as is because the
736 output preserve flag is set
737 3. word > (sel & 1) && signExt: we're sign extending and
738 this word is one of the words we need to sign extend
739 */
740 origBits_thisWord = bits(origDstVal, high_bit, low_bit);
741 currBits_thisWord = bits(currDstVal, high_bit, low_bit);
742 newBits = ((word == (sel & 0x1)) ? origBits_thisWord :
743 ((preserve) ? currBits_thisWord :
744 (((word > (sel & 0x1)) && signExt) ? 0xffff : 0)));
745 retVal = insertBits(retVal, high_bit, low_bit, newBits);
746 }
747 } else {
748 assert(sel != SDWA_DWORD); // should have returned earlier
749 panic("Unimplemented SDWA select operation: %d\n", sel);
750 }
751
752 return retVal;
753 }
754
755
777 template<typename T>
778 void sdwaInstDstImpl(T & dstOper, T & origDstOper, const bool clamp,
779 const SDWASelVals sel,
780 const SDWADstVals unusedBits_format)
781 {
782 // iterate over all lanes, setting appropriate, selected value
783 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
784 dstOper[lane] = sdwaInstDstImpl_helper(dstOper[lane],
785 origDstOper[lane], clamp,
786 sel, unusedBits_format);
787 }
788 }
789
790
798 template<typename T>
799 void processSDWA_src_helper(T & currSrc, T & origCurrSrc,
800 const SDWASelVals src_sel,
801 const bool src_signExt, const bool src_abs,
802 const bool src_neg)
803 {
811 if (src_neg) {
812 currSrc.negModifier();
813 }
814
815 if (src_abs) {
816 currSrc.absModifier();
817 }
818
822 sdwaInstSrcImpl(currSrc, origCurrSrc, src_sel, src_signExt);
823 }
824
825
833 template<typename T>
834 void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0)
835 {
836 // local variables
837 const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
838 const bool src0_signExt = sdwaInst.SRC0_SEXT;
839 const bool src0_neg = sdwaInst.SRC0_NEG;
840 const bool src0_abs = sdwaInst.SRC0_ABS;
841
842 // NOTE: difference between VOP1 and VOP2/VOPC is that there is no src1
843 // operand. So ensure that SRC1 fields are not set, then call helper
844 // function only on src0.
845 assert(!sdwaInst.SRC1_SEXT);
846 assert(!sdwaInst.SRC1_NEG);
847 assert(!sdwaInst.SRC1_ABS);
848
849 processSDWA_src_helper(src0, origSrc0, src0_sel, src0_signExt,
850 src0_abs, src0_neg);
851 }
852
853
861 template<typename T>
862 void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0,
863 T & src1, T & origSrc1)
864 {
865 // local variables
866 const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
867 const bool src0_signExt = sdwaInst.SRC0_SEXT;
868 const bool src0_neg = sdwaInst.SRC0_NEG;
869 const bool src0_abs = sdwaInst.SRC0_ABS;
870 const SDWASelVals src1_sel = (SDWASelVals)sdwaInst.SRC1_SEL;
871 const bool src1_signExt = sdwaInst.SRC1_SEXT;
872 const bool src1_neg = sdwaInst.SRC1_NEG;
873 const bool src1_abs = sdwaInst.SRC1_ABS;
874
875 processSDWA_src_helper(src0, origSrc0, src0_sel, src0_signExt,
876 src0_abs, src0_neg);
877 processSDWA_src_helper(src1, origSrc1, src1_sel, src1_signExt,
878 src1_abs, src1_neg);
879 }
880
881
889 template<typename T>
890 void processSDWA_dst(InFmt_VOP_SDWA sdwaInst, T & dst, T & origDst)
891 {
892 // local variables
893 const SDWADstVals dst_unusedBits_format =
894 (SDWADstVals)sdwaInst.DST_U;
895 const SDWASelVals dst_sel = (SDWASelVals)sdwaInst.DST_SEL;
896 const bool clamp = sdwaInst.CLMP;
897
902 sdwaInstDstImpl(dst, origDst, clamp, dst_sel, dst_unusedBits_format);
903 }
904} // namespace VegaISA
905} // namespace gem5
906
907#endif // __ARCH_VEGA_INSTS_INST_UTIL_HH__
constexpr int findMsbSet(uint64_t val)
Returns the bit position of the MSB that is set in the input.
Definition bitfield.hh:276
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
Definition bitfield.hh:76
constexpr int popCount(uint64_t val)
Returns the number of set ones in the provided value.
Definition bitfield.hh:350
constexpr T insertBits(T val, unsigned first, unsigned last, B bit_val)
Returns val with bits first to last set to the LSBs of bit_val.
Definition bitfield.hh:182
constexpr int findLsbSet(uint64_t val)
Returns the bit position of the LSB that is set in the input.
Definition bitfield.hh:312
#define panic(...)
This implements a cprintf based panic() function.
Definition logging.hh:188
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Definition logging.hh:214
Bitfield< 3, 0 > mask
Definition pcstate.hh:63
Bitfield< 7 > i
Definition misc_types.hh:67
constexpr unsigned NumVecElemPerVecReg
Definition vec.hh:61
ScalarRegI32 countZeroBitsMsb(T val)
Definition inst_util.hh:163
T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal, const bool clamp, const SDWASelVals sel, const SDWADstVals unusedBits_format)
sdwaInstDstImpl_helper contains the per-lane code for selecting the appropriate bytes/words of the la...
Definition inst_util.hh:677
T quadMask(T val)
Definition inst_util.hh:103
ScalarRegI32 firstOppositeSignBit(ScalarRegI32 val)
Definition inst_util.hh:173
ScalarRegI32 findFirstZero(T val)
Definition inst_util.hh:130
uint64_t ScalarRegU64
ScalarRegI32 findFirstOne(T val)
Definition inst_util.hh:141
const int BITS_PER_WORD
T median(T val_0, T val_1, T val_2)
Definition inst_util.hh:246
ScalarRegI32 findFirstOneMsb(T val)
Definition inst_util.hh:152
const int MSB_PER_BYTE
T roundNearestEven(T val)
Definition inst_util.hh:258
void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T &src0, T &origSrc0)
processSDWA_src is a helper function for implementing sub d-word addressing instructions for the src ...
Definition inst_util.hh:834
uint32_t VecElemU32
int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum, int rowOffset, bool &outOfBounds)
dppInstImpl is a helper function that performs the inputted operation on the inputted vector register...
Definition inst_util.hh:318
void sdwaInstSrcImpl(T &currOper, T &origCurrOper, const SDWASelVals sel, const bool signExt)
sdwaInstSrcImpl is a helper function that selects the appropriate bits/bytes for each lane of the inp...
Definition inst_util.hh:658
void processSDWA_dst(InFmt_VOP_SDWA sdwaInst, T &dst, T &origDst)
processSDWA_dst is a helper function for implementing sub d-word addressing instructions for the dst ...
Definition inst_util.hh:890
T wholeQuadMode(T val)
Definition inst_util.hh:89
uint64_t VecElemU64
const int MSB_PER_WORD
VecElemU32 muladd(VecElemU64 &dst, VecElemU32 val_0, VecElemU32 val_1, VecElemU64 val_2)
Definition inst_util.hh:271
const int BITS_PER_BYTE
ScalarRegI32 countZeroBits(T val)
Definition inst_util.hh:120
T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal, const SDWASelVals sel, const bool signExt)
sdwaInstSrcImpl_helper contains the per-lane code for selecting the appropriate bytes/words of the la...
Definition inst_util.hh:565
void processSDWA_src_helper(T &currSrc, T &origCurrSrc, const SDWASelVals src_sel, const bool src_signExt, const bool src_abs, const bool src_neg)
processSDWA_srcHelper is a helper function for implementing sub d-word addressing instructions for th...
Definition inst_util.hh:799
uint32_t ScalarRegU32
void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst, T &src0)
processDPP is a helper function for implementing Data Parallel Primitive instructions.
Definition inst_util.hh:422
void sdwaInstDstImpl(T &dstOper, T &origDstOper, const bool clamp, const SDWASelVals sel, const SDWADstVals unusedBits_format)
sdwaInstDestImpl is a helper function that selects the appropriate bits/bytes for the inputted dest o...
Definition inst_util.hh:778
Bitfield< 63 > val
Definition misc.hh:776
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
static const int NUM_BANKS
Definition inst_util.hh:83
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition misc.hh:49
static const int ROW_SIZE
Definition inst_util.hh:82
SDWADstVals
Definition inst_util.hh:56
@ SDWA_UNUSED_PRESERVE
Definition inst_util.hh:59
@ SDWA_UNUSED_PAD
Definition inst_util.hh:57
@ SDWA_UNUSED_SEXT
Definition inst_util.hh:58
SDWASelVals
Definition inst_util.hh:44
@ SDWA_BYTE_1
Definition inst_util.hh:46
@ SDWA_BYTE_3
Definition inst_util.hh:48
@ SDWA_DWORD
Definition inst_util.hh:51
@ SDWA_WORD_1
Definition inst_util.hh:50
@ SDWA_BYTE_2
Definition inst_util.hh:47
@ SDWA_WORD_0
Definition inst_util.hh:49
@ SDWA_BYTE_0
Definition inst_util.hh:45
SqDPPVals
Definition inst_util.hh:64
@ SQ_DPP_WF_RL1
Definition inst_util.hh:74
@ SQ_DPP_ROW_SR1
Definition inst_util.hh:69
@ SQ_DPP_ROW_BCAST31
Definition inst_util.hh:80
@ SQ_DPP_ROW_SL15
Definition inst_util.hh:68
@ SQ_DPP_ROW_HALF_MIRROR
Definition inst_util.hh:78
@ SQ_DPP_QUAD_PERM_MAX
Definition inst_util.hh:65
@ SQ_DPP_ROW_SL1
Definition inst_util.hh:67
@ SQ_DPP_ROW_MIRROR
Definition inst_util.hh:77
@ SQ_DPP_RESERVED
Definition inst_util.hh:66
@ SQ_DPP_ROW_BCAST15
Definition inst_util.hh:79
@ SQ_DPP_ROW_RR1
Definition inst_util.hh:71
@ SQ_DPP_ROW_SR15
Definition inst_util.hh:70
@ SQ_DPP_ROW_RR15
Definition inst_util.hh:72
@ SQ_DPP_WF_RR1
Definition inst_util.hh:76
@ SQ_DPP_WF_SL1
Definition inst_util.hh:73
@ SQ_DPP_WF_SR1
Definition inst_util.hh:75

Generated on Mon Jul 10 2023 15:30:56 for gem5 by doxygen 1.9.7