gem5  v21.1.0.2
inst_util.hh
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015-2021 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * For use for simulation and test purposes only
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright notice,
11  * this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  * this list of conditions and the following disclaimer in the documentation
15  * and/or other materials provided with the distribution.
16  *
17  * 3. Neither the name of the copyright holder nor the names of its
18  * contributors may be used to endorse or promote products derived from this
19  * software without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #ifndef __ARCH_VEGA_INSTS_INST_UTIL_HH__
35 #define __ARCH_VEGA_INSTS_INST_UTIL_HH__
36 
37 #include <cmath>
38 
40 
41 namespace gem5
42 {
43 
/**
 * Values of the SDWA (sub-dword addressing) select fields. Each value picks
 * which byte or word of a 32-bit operand is used.
 */
enum SDWASelVals : int
{
    SDWA_BYTE_0 = 0, /* select data[7:0] */
    SDWA_BYTE_1 = 1, /* select data[15:8] */
    SDWA_BYTE_2 = 2, /* select data[23:16] */
    SDWA_BYTE_3 = 3, /* select data[31:24] */
    SDWA_WORD_0 = 4, /* select data[15:0] */
    SDWA_WORD_1 = 5, /* select data[31:16] */
    SDWA_DWORD  = 6  /* select data[31:0] */
};
55 
/**
 * Values describing how the destination bits that are NOT selected by an
 * SDWA operation are filled in.
 */
enum SDWADstVals : int
{
    SDWA_UNUSED_PAD = 0,     /* Pad all unused bits with 0 */
    SDWA_UNUSED_SEXT = 1,    /* Sign-extend upper bits; pad lower bits w/ 0 */
    SDWA_UNUSED_PRESERVE = 2 /* Leave the unused bits unchanged */
};
63 
/**
 * Values of the DPP (data parallel primitive) control field. Values up to
 * SQ_DPP_QUAD_PERM_MAX encode a quad permutation; the remaining values name
 * the first/last entry of each shift/rotate range or a single operation.
 */
enum SqDPPVals : int
{
    SQ_DPP_QUAD_PERM_MAX = 0xFF,
    SQ_DPP_RESERVED = 0x100,
    SQ_DPP_ROW_SL1 = 0x101,
    SQ_DPP_ROW_SL15 = 0x10F,
    SQ_DPP_ROW_SR1 = 0x111,
    SQ_DPP_ROW_SR15 = 0x11F,
    SQ_DPP_ROW_RR1 = 0x121,
    SQ_DPP_ROW_RR15 = 0x12F,
    SQ_DPP_WF_SL1 = 0x130,
    SQ_DPP_WF_RL1 = 0x134,
    SQ_DPP_WF_SR1 = 0x138,
    SQ_DPP_WF_RR1 = 0x13C,
    SQ_DPP_ROW_MIRROR = 0x140,
    SQ_DPP_ROW_HALF_MIRROR = 0x141,
    SQ_DPP_ROW_BCAST15 = 0x142,
    SQ_DPP_ROW_BCAST31 = 0x143
};
/* Number of vector register lanes in one DPP row. */
static const int ROW_SIZE = 16;
/*
 * Number of banks in one DPP row. processDPP also divides a lane's row
 * offset by this value to get its bank index, i.e. each bank covers
 * ROW_SIZE / NUM_BANKS = 4 consecutive lanes.
 */
static const int NUM_BANKS = 4;
86 
87 namespace VegaISA
88 {
/**
 * Expand a bit mask to whole quads: every aligned group of four bits that
 * contains at least one set bit in val is returned with all four bits set;
 * groups that are entirely zero stay zero.
 *
 * @param val input mask.
 * @return the quad-expanded mask, same width as val.
 */
template<typename T>
inline T
wholeQuadMode(T val)
{
    T wqm = 0;

    // Walk a 4-bit window across val; the loop ends when the window is
    // shifted out of T.
    for (T quad = 0xF; quad != 0; quad <<= 4) {
        if ((val & quad) != 0) {
            wqm |= quad;
        }
    }

    return wqm;
}
102 
/**
 * Compress a bit mask to one bit per quad: bit i of the result is set when
 * any of bits [4*i+3 : 4*i] of val is set.
 *
 * @param val input mask.
 * @return the per-quad mask, returned in the same type as val.
 */
template<typename T>
inline T
quadMask(T val)
{
    T qmsk = 0;
    T qbit = 0x1;

    // Advance the 4-bit window and the matching result bit together.
    for (T quad = 0xF; quad != 0; quad <<= 4, qbit <<= 1) {
        if (val & quad) {
            qmsk |= qbit;
        }
    }

    return qmsk;
}
119 
120  template<typename T>
121  inline ScalarRegI32
123  {
124  ScalarRegI32 num_zeros
125  = std::numeric_limits<T>::digits - popCount(val);
126 
127  return num_zeros;
128  }
129 
130  template<typename T>
131  inline ScalarRegI32
133  {
134  if (val == ~T(0)) {
135  return -1;
136  }
137 
138  return findLsbSet(~val);
139  }
140 
141  template<typename T>
142  inline ScalarRegI32
144  {
145  if (!val) {
146  return -1;
147  }
148 
149  return findLsbSet(val);
150  }
151 
152  template<typename T>
153  inline ScalarRegI32
155  {
156  if (!val) {
157  return -1;
158  }
159 
160  return findMsbSet(val);
161  }
162 
163  template<typename T>
164  inline ScalarRegI32
166  {
167  if (!val) {
168  return -1;
169  }
170 
171  return std::numeric_limits<T>::digits - 1 - findMsbSet(val);
172  }
173 
174  inline ScalarRegI32
176  {
177  bool found(false);
178  bool sign_bit = (val & 0x80000000) != 0;
179  ScalarRegU32 tmp_val(0);
180  int count(0);
181 
182  if (!val || val == -1) {
183  return -1;
184  }
185 
186  for (int i = 0; i < std::numeric_limits<ScalarRegU32>::digits; ++i) {
187  tmp_val = val & (0x80000000 >> i);
188 
189  if (!sign_bit) {
190  if (tmp_val) {
191  found = true;
192  break;
193  }
194  } else {
195  if (!tmp_val) {
196  found = true;
197  break;
198  }
199  }
200  ++count;
201  }
202 
203  if (found) {
204  return count;
205  } else {
206  return -1;
207  }
208  }
209 
210  inline ScalarRegI32
212  {
213  bool found(false);
214  bool sign_bit = (val & 0x8000000000000000ULL) != 0;
215  ScalarRegU64 tmp_val(0);
216  int count(0);
217 
218  if (!val || val == -1) {
219  return -1;
220  }
221 
222  for (int i = 0; i < std::numeric_limits<ScalarRegU64>::digits; ++i) {
223  tmp_val = val & (0x8000000000000000ULL >> i);
224 
225  if (!sign_bit) {
226  if (tmp_val) {
227  found = true;
228  break;
229  }
230  } else {
231  if (!tmp_val) {
232  found = true;
233  break;
234  }
235  }
236  ++count;
237  }
238 
239  if (found) {
240  return count;
241  } else {
242  return -1;
243  }
244  }
245 
/**
 * Return the median (middle value) of three values.
 *
 * Floating-point types go through std::fmin/std::fmax; all other types use
 * std::min/std::max.
 *
 * @param val_0 first value.
 * @param val_1 second value.
 * @param val_2 third value.
 * @return the value that is neither the smallest nor the largest.
 */
template<typename T>
inline T
median(T val_0, T val_1, T val_2)
{
    // median = max(min(a, b), min(max(a, b), c))
    if (std::is_floating_point<T>::value) {
        return std::fmax(std::fmin(val_0, val_1),
                         std::fmin(std::fmax(val_0, val_1), val_2));
    } else {
        return std::max(std::min(val_0, val_1),
                        std::min(std::max(val_0, val_1), val_2));
    }
}
258 
/**
 * Round a floating-point value to the nearest integer, resolving exact
 * halfway cases towards the even integer.
 *
 * @param val value to round.
 * @return the rounded value, as type T.
 */
template <typename T>
inline T roundNearestEven(T val)
{
    T int_part = 0;
    // Round-half-up candidate; corrected below for ties whose lower
    // neighbour is even.
    T nearest_round = std::floor(val + 0.5);

    // Test parity in floating point: casting floor(val) to int is undefined
    // once the value no longer fits in an int.
    const bool floor_is_even = (std::fmod(std::floor(val), T(2)) == 0);

    if (floor_is_even
        && std::modf(std::abs(val), &int_part) == 0.5) {
        nearest_round = nearest_round - 1;
    }

    return nearest_round;
}
271 
272  inline VecElemU32
274  VecElemU64 val_2)
275  {
276  __uint128_t u0 = (__uint128_t)val_0;
277  __uint128_t u1 = (__uint128_t)val_1;
278  __uint128_t u2 = (__uint128_t)val_2;
279  __uint128_t result = u0 * u1 + u2;
280 
281  dst = (VecElemU64)result;
282 
283  return (VecElemU32)(result >> 64) ? 1 : 0;
284  }
285 
286  inline VecElemU32
288  VecElemI64 val_2)
289  {
290  __int128_t u0 = (__int128_t)val_0;
291  __int128_t u1 = (__int128_t)val_1;
292  __int128_t u2 = (__int128_t)val_2;
293  __int128_t result = u0 * u1 + u2;
294 
295  dst = (VecElemI64)result;
296 
297  return (VecElemU32)(result >> 64) ? 1 : 0;
298  }
299 
320  int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum,
321  int rowOffset, bool & outOfBounds)
322  {
323  // local variables
324  // newLane will be the same as the input lane unless swizzling happens
325  int newLane = currLane;
326  // for shift/rotate permutations; positive values are LEFT rotates
327  int count = 1;
328  int localRowOffset = rowOffset;
329  int localRowNum = rowNum;
330 
331  if (dppCtrl <= SQ_DPP_QUAD_PERM_MAX) { // DPP_QUAD_PERM{00:FF}
332  int quadBase = (currLane & ~(3));
333  int quadPix = (currLane & 3);
334  quadPix = ((dppCtrl >> (2 * quadPix)) & 3);
335  newLane = (quadBase | quadPix);
336  } else if (dppCtrl == SQ_DPP_RESERVED) {
337  panic("ERROR: instruction using reserved DPP_CTRL value\n");
338  } else if ((dppCtrl >= SQ_DPP_ROW_SL1) &&
339  (dppCtrl <= SQ_DPP_ROW_SL15)) { // DPP_ROW_SL{1:15}
340  count -= (dppCtrl - SQ_DPP_ROW_SL1 + 1);
341  if ((localRowOffset + count >= 0) &&
342  (localRowOffset + count < ROW_SIZE)) {
343  localRowOffset += count;
344  newLane = (rowNum | localRowOffset);
345  } else {
346  outOfBounds = true;
347  }
348  } else if ((dppCtrl >= SQ_DPP_ROW_SR1) &&
349  (dppCtrl <= SQ_DPP_ROW_SR15)) { // DPP_ROW_SR{1:15}
350  count -= (dppCtrl - SQ_DPP_ROW_SR1 + 1);
351  if ((localRowOffset + count >= 0) &&
352  (localRowOffset + count < ROW_SIZE)) {
353  localRowOffset += count;
354  newLane = (rowNum | localRowOffset);
355  } else {
356  outOfBounds = true;
357  }
358  } else if ((dppCtrl >= SQ_DPP_ROW_RR1) &&
359  (dppCtrl <= SQ_DPP_ROW_RR15)) { // DPP_ROW_RR{1:15}
360  count -= (dppCtrl - SQ_DPP_ROW_RR1 + 1);
361  localRowOffset = (localRowOffset + count + ROW_SIZE) % ROW_SIZE;
362  newLane = (rowNum | localRowOffset);
363  } else if (dppCtrl == SQ_DPP_WF_SL1) { // DPP_WF_SL1
364  count = 1;
365  if ((currLane >= 0) && (currLane < NumVecElemPerVecReg)) {
366  newLane += count;
367  } else {
368  outOfBounds = true;
369  }
370  } else if (dppCtrl == SQ_DPP_WF_RL1) { // DPP_WF_RL1
371  count = 1;
372  newLane = (currLane + count + NumVecElemPerVecReg) %
374  } else if (dppCtrl == SQ_DPP_WF_SR1) { // DPP_WF_SR1
375  count = -1;
376  int currVal = (currLane + count);
377  if ((currVal >= 0) && (currVal < NumVecElemPerVecReg)) {
378  newLane += count;
379  } else {
380  outOfBounds = true;
381  }
382  } else if (dppCtrl == SQ_DPP_WF_RR1) { // DPP_WF_RR1
383  count = -1;
384  newLane = (currLane + count + NumVecElemPerVecReg) %
386  } else if (dppCtrl == SQ_DPP_ROW_MIRROR) { // DPP_ROW_MIRROR
387  localRowOffset = (15 - localRowOffset);
388  newLane = (rowNum | localRowOffset);
389  } else if (dppCtrl == SQ_DPP_ROW_HALF_MIRROR) { // DPP_ROW_HALF_MIRROR
390  localRowNum = (currLane & -0x7);
391  localRowOffset = (currLane & 0x7);
392  localRowOffset = (7 - localRowNum);
393  newLane = (localRowNum | localRowOffset);
394  } else if (dppCtrl == SQ_DPP_ROW_BCAST15) { // DPP_ROW_BCAST15
395  count = 15;
396  if (currLane > count) {
397  newLane = (currLane & ~count) - 1;
398  }
399  } else if (dppCtrl == SQ_DPP_ROW_BCAST31) { // DPP_ROW_BCAST31
400  count = 31;
401  if (currLane > count) {
402  newLane = (currLane & ~count) - 1;
403  }
404  } else {
405  panic("Unimplemented DPP control operation: %d\n", dppCtrl);
406  }
407 
408  return newLane;
409  }
410 
416  template<typename T>
417  void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst,
418  T & src0)
419  {
420  // local variables
421  SqDPPVals dppCtrl = (SqDPPVals)dppInst.DPP_CTRL;
422  int boundCtrl = dppInst.BC;
423  int bankMask = dppInst.BANK_MASK;
424  int rowMask = dppInst.ROW_MASK;
425  // row, bank info to be calculated per lane
426  int rowNum = 0, bankNum = 0, rowOffset = 0;
427  // outLane will be the same as the input lane unless swizzling happens
428  int outLane = 0;
429  bool laneDisabled = false;
430  // flags used for determining if a lane should be written to/reset/etc.
431  bool outOfBounds = false, zeroSrc = false;
432  long long threadValid = 0;
433 
440  if (dppInst.SRC0_NEG) {
441  src0.negModifier();
442  }
443 
444  if (dppInst.SRC0_ABS) {
445  src0.absModifier();
446  }
447 
448  // iterate over all register lanes, performing steps 2-4
449  for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
450  threadValid = (0x1LL << lane);
456  rowNum = (lane / ROW_SIZE);
457  rowOffset = (lane % ROW_SIZE);
458  bankNum = (rowOffset / NUM_BANKS);
459 
460  if (((rowMask & (0x1 << rowNum)) == 0) /* row mask */ ||
461  ((bankMask & (0x1 << bankNum)) == 0) /* bank mask */) {
462  laneDisabled = true;
463  continue;
464  }
465 
482  if (!laneDisabled) {
483  outLane = dppInstImpl(dppCtrl, lane, rowNum, rowOffset,
484  outOfBounds);
485  }
486 
492  if (laneDisabled) {
493  threadValid = 0;
494  } else if (outOfBounds) {
495  if (boundCtrl == 1) {
496  zeroSrc = true;
497  } else {
498  threadValid = 0;
499  }
500  } else if (!gpuDynInst->exec_mask[lane]) {
501  if (boundCtrl == 1) {
502  zeroSrc = true;
503  } else {
504  threadValid = 0;
505  }
506  }
507 
508  if (threadValid != 0 && !outOfBounds && !zeroSrc) {
509  assert(!laneDisabled);
510  src0[outLane] = src0[lane];
511  } else if (zeroSrc) {
512  src0[lane] = 0;
513  }
514 
515  // reset for next iteration
516  laneDisabled = false;
517  }
518  }
519 
525  template<typename T>
526  void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst,
527  T & src0, T & src1)
528  {
535  if (dppInst.SRC1_NEG) {
536  src1.negModifier();
537  }
538 
539  if (dppInst.SRC1_ABS) {
540  src1.absModifier();
541  }
542 
543  // Since only difference for VOP1 and VOP2/VOPC instructions is SRC1,
544  // which is only used for negation/absolute value, call other version
545  // to do everything else.
546  processDPP(gpuDynInst, dppInst, src0);
547  }
548 
555  template<typename T>
556  T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal,
557  const SDWASelVals sel, const bool signExt)
558  {
559  // local variables
560  int low_bit = 0, high_bit = 0;
561  bool signExt_local = signExt;
562  T retVal = 0;
563 
564  // if we're preserving all of the bits, then we can immediately return
565  if (sel == SDWA_DWORD) {
566  return currOperVal;
567  }
568 
569  if (sel < SDWA_WORD_0) { // we are selecting 1 byte
570  /*
571  Process byte 0 first. This code eiter selects the original bits
572  of byte 0, or makes the bits of the selected byte be byte 0 (and
573  next either sign extends or zero's out upper bits).
574  */
575  low_bit = (sel * VegaISA::BITS_PER_BYTE);
576  high_bit = low_bit + VegaISA::MSB_PER_BYTE;
577  retVal = bits(currOperVal, high_bit, low_bit);
578 
579  // make sure update propagated, since used next
581  bits(origOperVal, high_bit),
582  "ERROR: SDWA byte update not propagated: retVal: %d, "
583  "orig: %d\n", bits(retVal, VegaISA::MSB_PER_BYTE),
584  bits(origOperVal, high_bit));
585  // sign extended value depends on upper-most bit of the new byte 0
586  signExt_local = (signExt &&
587  (bits(retVal, VegaISA::MSB_PER_BYTE, 0) & 0x80));
588 
589  // process all other bytes -- if sign extending, make them 1, else
590  // all 0's so leave as is
591  if (signExt_local) {
592  retVal = (uint32_t)sext<VegaISA::MSB_PER_BYTE>(retVal);
593  }
594  } else if (sel < SDWA_DWORD) { // we are selecting 1 word
595  /*
596  Process word 0 first. This code eiter selects the original bits
597  of word 0, or makes the bits of the selected word be word 0 (and
598  next either sign extends or zero's out upper bits).
599  */
600  low_bit = (sel & 1) * VegaISA::BITS_PER_WORD;
601  high_bit = low_bit + VegaISA::MSB_PER_WORD;
602  retVal = bits(currOperVal, high_bit, low_bit);
603 
604  // make sure update propagated, since used next
606  bits(origOperVal, high_bit),
607  "ERROR: SDWA word update not propagated: retVal: %d, "
608  "orig: %d\n",
609  bits(retVal, VegaISA::MSB_PER_WORD),
610  bits(origOperVal, high_bit));
611  // sign extended value depends on upper-most bit of the new word 0
612  signExt_local = (signExt &&
613  (bits(retVal, VegaISA::MSB_PER_WORD, 0) &
614  0x8000));
615 
616  // process other word -- if sign extending, make them 1, else all
617  // 0's so leave as is
618  if (signExt_local) {
619  retVal = (uint32_t)sext<VegaISA::MSB_PER_WORD>(retVal);
620  }
621  } else {
622  assert(sel != SDWA_DWORD); // should have returned earlier
623  panic("Unimplemented SDWA select operation: %d\n", sel);
624  }
625 
626  return retVal;
627  }
628 
629 
648  template<typename T>
649  void sdwaInstSrcImpl(T & currOper, T & origCurrOper,
650  const SDWASelVals sel, const bool signExt)
651  {
652  // iterate over all lanes, setting appropriate, selected value
653  for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
654  currOper[lane] = sdwaInstSrcImpl_helper(currOper[lane],
655  origCurrOper[lane], sel,
656  signExt);
657  }
658  }
659 
660 
667  template<typename T>
668  T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal,
669  const bool clamp, const SDWASelVals sel,
670  const SDWADstVals unusedBits_format)
671  {
672  // local variables
673  int low_bit = 0, high_bit = 0;
674  bool signExt = (unusedBits_format == SDWA_UNUSED_SEXT);
675  //bool pad = (unusedBits_format == SDWA_UNUSED_PAD);
676  bool preserve = (unusedBits_format == SDWA_UNUSED_PRESERVE);
677  T retVal = 0, origBits_thisByte = 0, currBits_thisByte = 0,
678  origBits_thisWord = 0, currBits_thisWord = 0, newBits = 0;
679 
680  // if we're preserving all of the bits, then we can immediately return
681  if (unusedBits_format == SDWA_UNUSED_PRESERVE) {
682  assert(sel == SDWA_DWORD);
683  return currDstVal;
684  } else if (sel == SDWA_DWORD) {
685  // NOTE: users may set the unused bits variable to anything in this
686  // scenario, because it will be ignored
687  return currDstVal;
688  }
689 
690  if (sel < SDWA_WORD_0) { // we are selecting 1 byte
691  // if we sign extended depends on upper-most bit of byte 0
692  signExt = (signExt &&
693  (bits(currDstVal, VegaISA::MSB_PER_WORD, 0) & 0x80));
694 
695  for (int byte = 0; byte < 4; ++byte) {
696  low_bit = byte * VegaISA::BITS_PER_BYTE;
697  high_bit = low_bit + VegaISA::MSB_PER_BYTE;
698  /*
699  Options:
700  1. byte == sel: we are keeping all bits in this byte
701  2. preserve is set: keep this byte as is because the
702  output preserve flag is set
703  3. byte > sel && signExt: we're sign extending and
704  this byte is one of the bytes we need to sign extend
705  */
706  origBits_thisByte = bits(origDstVal, high_bit, low_bit);
707  currBits_thisByte = bits(currDstVal, high_bit, low_bit);
708  newBits = ((byte == sel) ? origBits_thisByte :
709  ((preserve) ? currBits_thisByte :
710  (((byte > sel) && signExt) ? 0xff : 0)));
711  retVal = insertBits(retVal, high_bit, low_bit, newBits);
712  }
713  } else if (sel < SDWA_DWORD) { // we are selecting 1 word
714  low_bit = 0;
715  high_bit = low_bit + VegaISA::MSB_PER_WORD;
716  // if we sign extended depends on upper-most bit of word 0
717  signExt = (signExt &&
718  (bits(currDstVal, high_bit, low_bit) & 0x8000));
719 
720  for (int word = 0; word < 2; ++word) {
721  low_bit = word * VegaISA::BITS_PER_WORD;
722  high_bit = low_bit + VegaISA::MSB_PER_WORD;
723  /*
724  Options:
725  1. word == sel & 1: we are keeping all bits in this word
726  2. preserve is set: keep this word as is because the
727  output preserve flag is set
728  3. word > (sel & 1) && signExt: we're sign extending and
729  this word is one of the words we need to sign extend
730  */
731  origBits_thisWord = bits(origDstVal, high_bit, low_bit);
732  currBits_thisWord = bits(currDstVal, high_bit, low_bit);
733  newBits = ((word == (sel & 0x1)) ? origBits_thisWord :
734  ((preserve) ? currBits_thisWord :
735  (((word > (sel & 0x1)) && signExt) ? 0xffff : 0)));
736  retVal = insertBits(retVal, high_bit, low_bit, newBits);
737  }
738  } else {
739  assert(sel != SDWA_DWORD); // should have returned earlier
740  panic("Unimplemented SDWA select operation: %d\n", sel);
741  }
742 
743  return retVal;
744  }
745 
746 
768  template<typename T>
769  void sdwaInstDstImpl(T & dstOper, T & origDstOper, const bool clamp,
770  const SDWASelVals sel,
771  const SDWADstVals unusedBits_format)
772  {
773  // iterate over all lanes, setting appropriate, selected value
774  for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
775  dstOper[lane] = sdwaInstDstImpl_helper(dstOper[lane],
776  origDstOper[lane], clamp,
777  sel, unusedBits_format);
778  }
779  }
780 
781 
789  template<typename T>
790  void processSDWA_src_helper(T & currSrc, T & origCurrSrc,
791  const SDWASelVals src_sel,
792  const bool src_signExt, const bool src_abs,
793  const bool src_neg)
794  {
802  if (src_neg) {
803  currSrc.negModifier();
804  }
805 
806  if (src_abs) {
807  currSrc.absModifier();
808  }
809 
813  sdwaInstSrcImpl(currSrc, origCurrSrc, src_sel, src_signExt);
814  }
815 
816 
824  template<typename T>
825  void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0)
826  {
827  // local variables
828  const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
829  const bool src0_signExt = sdwaInst.SRC0_SEXT;
830  const bool src0_neg = sdwaInst.SRC0_NEG;
831  const bool src0_abs = sdwaInst.SRC0_ABS;
832 
833  // NOTE: difference between VOP1 and VOP2/VOPC is that there is no src1
834  // operand. So ensure that SRC1 fields are not set, then call helper
835  // function only on src0.
836  assert(!sdwaInst.SRC1_SEXT);
837  assert(!sdwaInst.SRC1_NEG);
838  assert(!sdwaInst.SRC1_ABS);
839 
840  processSDWA_src_helper(src0, origSrc0, src0_sel, src0_signExt,
841  src0_abs, src0_neg);
842  }
843 
844 
852  template<typename T>
853  void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0,
854  T & src1, T & origSrc1)
855  {
856  // local variables
857  const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
858  const bool src0_signExt = sdwaInst.SRC0_SEXT;
859  const bool src0_neg = sdwaInst.SRC0_NEG;
860  const bool src0_abs = sdwaInst.SRC0_ABS;
861  const SDWASelVals src1_sel = (SDWASelVals)sdwaInst.SRC1_SEL;
862  const bool src1_signExt = sdwaInst.SRC1_SEXT;
863  const bool src1_neg = sdwaInst.SRC1_NEG;
864  const bool src1_abs = sdwaInst.SRC1_ABS;
865 
866  processSDWA_src_helper(src0, origSrc0, src0_sel, src0_signExt,
867  src0_abs, src0_neg);
868  processSDWA_src_helper(src1, origSrc1, src1_sel, src1_signExt,
869  src1_abs, src1_neg);
870  }
871 
872 
880  template<typename T>
881  void processSDWA_dst(InFmt_VOP_SDWA sdwaInst, T & dst, T & origDst)
882  {
883  // local variables
884  const SDWADstVals dst_unusedBits_format =
885  (SDWADstVals)sdwaInst.DST_U;
886  const SDWASelVals dst_sel = (SDWASelVals)sdwaInst.DST_SEL;
887  const bool clamp = sdwaInst.CLMP;
888 
893  sdwaInstDstImpl(dst, origDst, clamp, dst_sel, dst_unusedBits_format);
894  }
895 } // namespace VegaISA
896 } // namespace gem5
897 
898 #endif // __ARCH_VEGA_INSTS_INST_UTIL_HH__
gem5::SQ_DPP_WF_RL1
@ SQ_DPP_WF_RL1
Definition: inst_util.hh:76
gem5::VegaISA::BITS_PER_WORD
const int BITS_PER_WORD
Definition: gpu_registers.hh:146
gem5::SDWADstVals
SDWADstVals
Definition: inst_util.hh:57
gem5::VegaISA::sdwaInstDstImpl
void sdwaInstDstImpl(T &dstOper, T &origDstOper, const bool clamp, const SDWASelVals sel, const SDWADstVals unusedBits_format)
sdwaInstDestImpl is a helper function that selects the appropriate bits/bytes for the inputted dest o...
Definition: inst_util.hh:769
gem5::NUM_BANKS
static const int NUM_BANKS
Definition: inst_util.hh:85
sc_dt::word
unsigned int word
Definition: scfx_mant.hh:96
gem5::VegaISA::MSB_PER_BYTE
const int MSB_PER_BYTE
Definition: gpu_registers.hh:147
gem5::VegaISA::VecElemU64
uint64_t VecElemU64
Definition: gpu_registers.hh:170
gem5::VegaISA::InFmt_VOP_DPP::ROW_MASK
unsigned int ROW_MASK
Definition: gpu_decoder.hh:1844
gem5::VegaISA::InFmt_VOP_SDWA::SRC0_NEG
unsigned int SRC0_NEG
Definition: gpu_decoder.hh:1855
gem5::VegaISA::InFmt_VOP_DPP::DPP_CTRL
unsigned int DPP_CTRL
Definition: gpu_decoder.hh:1836
gem5::VegaISA::findFirstOneMsb
ScalarRegI32 findFirstOneMsb(T val)
Definition: inst_util.hh:154
gem5::VegaISA::sdwaInstSrcImpl
void sdwaInstSrcImpl(T &currOper, T &origCurrOper, const SDWASelVals sel, const bool signExt)
sdwaInstSrcImpl is a helper function that selects the appropriate bits/bytes for each lane of the inp...
Definition: inst_util.hh:649
gem5::SQ_DPP_ROW_SL1
@ SQ_DPP_ROW_SL1
Definition: inst_util.hh:69
gem5::VegaISA::NumVecElemPerVecReg
const int NumVecElemPerVecReg(64)
gem5::SQ_DPP_ROW_SR15
@ SQ_DPP_ROW_SR15
Definition: inst_util.hh:72
gem5::VegaISA::sdwaInstDstImpl_helper
T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal, const bool clamp, const SDWASelVals sel, const SDWADstVals unusedBits_format)
sdwaInstDstImpl_helper contains the per-lane code for selecting the appropriate bytes/words of the la...
Definition: inst_util.hh:668
gem5::X86ISA::val
Bitfield< 63 > val
Definition: misc.hh:775
gem5::SQ_DPP_ROW_MIRROR
@ SQ_DPP_ROW_MIRROR
Definition: inst_util.hh:79
gem5::SQ_DPP_QUAD_PERM_MAX
@ SQ_DPP_QUAD_PERM_MAX
Definition: inst_util.hh:67
gem5::SQ_DPP_ROW_BCAST31
@ SQ_DPP_ROW_BCAST31
Definition: inst_util.hh:82
gem5::ArmISA::i
Bitfield< 7 > i
Definition: misc_types.hh:66
gem5::SDWA_DWORD
@ SDWA_DWORD
Definition: inst_util.hh:53
gem5::VegaISA::InFmt_VOP_SDWA::SRC0_SEXT
unsigned int SRC0_SEXT
Definition: gpu_decoder.hh:1854
gem5::VegaISA::roundNearestEven
T roundNearestEven(T val)
Definition: inst_util.hh:260
gem5::SDWA_BYTE_1
@ SDWA_BYTE_1
Definition: inst_util.hh:48
gem5::SQ_DPP_ROW_BCAST15
@ SQ_DPP_ROW_BCAST15
Definition: inst_util.hh:81
gem5::SQ_DPP_WF_SL1
@ SQ_DPP_WF_SL1
Definition: inst_util.hh:75
gem5::VegaISA::InFmt_VOP_SDWA
Definition: gpu_decoder.hh:1847
gem5::mask
constexpr uint64_t mask(unsigned nbits)
Generate a 64-bit mask of 'nbits' 1s, right justified.
Definition: bitfield.hh:63
gem5::VegaISA::InFmt_VOP_DPP::SRC1_NEG
unsigned int SRC1_NEG
Definition: gpu_decoder.hh:1841
gem5::VegaISA::firstOppositeSignBit
ScalarRegI32 firstOppositeSignBit(ScalarRegI32 val)
Definition: inst_util.hh:175
gem5::SQ_DPP_WF_RR1
@ SQ_DPP_WF_RR1
Definition: inst_util.hh:78
gem5::VegaISA::InFmt_VOP_DPP::SRC0_NEG
unsigned int SRC0_NEG
Definition: gpu_decoder.hh:1839
gem5::VegaISA::InFmt_VOP_DPP::SRC0_ABS
unsigned int SRC0_ABS
Definition: gpu_decoder.hh:1840
gem5::VegaISA::countZeroBitsMsb
ScalarRegI32 countZeroBitsMsb(T val)
Definition: inst_util.hh:165
gem5::SQ_DPP_ROW_RR1
@ SQ_DPP_ROW_RR1
Definition: inst_util.hh:73
gem5::VegaISA::InFmt_VOP_SDWA::CLMP
unsigned int CLMP
Definition: gpu_decoder.hh:1851
gem5::VegaISA::median
T median(T val_0, T val_1, T val_2)
Definition: inst_util.hh:248
gem5::SQ_DPP_RESERVED
@ SQ_DPP_RESERVED
Definition: inst_util.hh:68
gem5::VegaISA::InFmt_VOP_DPP::SRC1_ABS
unsigned int SRC1_ABS
Definition: gpu_decoder.hh:1842
gem5::findLsbSet
constexpr int findLsbSet(uint64_t val)
Returns the bit position of the LSB that is set in the input.
Definition: bitfield.hh:299
gem5::SQ_DPP_WF_SR1
@ SQ_DPP_WF_SR1
Definition: inst_util.hh:77
gem5::X86ISA::count
count
Definition: misc.hh:709
gem5::popCount
constexpr int popCount(uint64_t val)
Returns the number of set ones in the provided value.
Definition: bitfield.hh:337
gem5::SDWA_BYTE_2
@ SDWA_BYTE_2
Definition: inst_util.hh:49
gem5::VegaISA::InFmt_VOP_DPP::BANK_MASK
unsigned int BANK_MASK
Definition: gpu_decoder.hh:1843
gem5::VegaISA::InFmt_VOP_SDWA::SRC1_ABS
unsigned int SRC1_ABS
Definition: gpu_decoder.hh:1862
gem5::SQ_DPP_ROW_SR1
@ SQ_DPP_ROW_SR1
Definition: inst_util.hh:71
gem5::VegaISA::findFirstZero
ScalarRegI32 findFirstZero(T val)
Definition: inst_util.hh:132
gpu_registers.hh
gem5::SQ_DPP_ROW_RR15
@ SQ_DPP_ROW_RR15
Definition: inst_util.hh:74
gem5::VegaISA::wholeQuadMode
T wholeQuadMode(T val)
Definition: inst_util.hh:91
gem5::insertBits
constexpr T insertBits(T val, unsigned first, unsigned last, B bit_val)
Returns val with bits first to last set to the LSBs of bit_val.
Definition: bitfield.hh:166
gem5::bits
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
Definition: bitfield.hh:76
gem5::SDWASelVals
SDWASelVals
Definition: inst_util.hh:45
gem5::VegaISA::quadMask
T quadMask(T val)
Definition: inst_util.hh:105
gem5::VegaISA::InFmt_VOP_SDWA::SRC0_SEL
unsigned int SRC0_SEL
Definition: gpu_decoder.hh:1853
gem5::VegaISA::VecElemI64
int64_t VecElemI64
Definition: gpu_registers.hh:171
gem5::SDWA_BYTE_3
@ SDWA_BYTE_3
Definition: inst_util.hh:50
gem5::ROW_SIZE
static const int ROW_SIZE
Definition: inst_util.hh:84
gem5::SQ_DPP_ROW_SL15
@ SQ_DPP_ROW_SL15
Definition: inst_util.hh:70
gem5::VegaISA::ScalarRegU64
uint64_t ScalarRegU64
Definition: gpu_registers.hh:158
gem5::VegaISA::VecElemI32
int32_t VecElemI32
Definition: gpu_registers.hh:168
gem5::GPUDynInstPtr
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition: misc.hh:51
gem5::VegaISA::ScalarRegI64
int64_t ScalarRegI64
Definition: gpu_registers.hh:159
gem5::SDWA_UNUSED_PAD
@ SDWA_UNUSED_PAD
Definition: inst_util.hh:59
gem5::VegaISA::VecElemU32
uint32_t VecElemU32
Definition: gpu_registers.hh:167
gem5::VegaISA::dppInstImpl
int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum, int rowOffset, bool &outOfBounds)
dppInstImpl is a helper function that performs the inputted operation on the inputted vector register...
Definition: inst_util.hh:320
gem5::VegaISA::ScalarRegI32
int32_t ScalarRegI32
Definition: gpu_registers.hh:156
gem5::VegaISA::InFmt_VOP_SDWA::DST_U
unsigned int DST_U
Definition: gpu_decoder.hh:1850
panic_if
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Definition: logging.hh:203
gem5::SDWA_WORD_1
@ SDWA_WORD_1
Definition: inst_util.hh:52
gem5::VegaISA::BITS_PER_BYTE
const int BITS_PER_BYTE
Definition: gpu_registers.hh:145
gem5::VegaISA::countZeroBits
ScalarRegI32 countZeroBits(T val)
Definition: inst_util.hh:122
gem5::VegaISA::processDPP
void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst, T &src0)
processDPP is a helper function for implementing Data Parallel Primitive instructions.
Definition: inst_util.hh:417
gem5::VegaISA::sdwaInstSrcImpl_helper
T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal, const SDWASelVals sel, const bool signExt)
sdwaInstSrcImpl_helper contains the per-lane code for selecting the appropriate bytes/words of the la...
Definition: inst_util.hh:556
gem5::VegaISA::MSB_PER_WORD
const int MSB_PER_WORD
Definition: gpu_registers.hh:148
gem5::SDWA_WORD_0
@ SDWA_WORD_0
Definition: inst_util.hh:51
gem5::VegaISA::InFmt_VOP_SDWA::SRC0_ABS
unsigned int SRC0_ABS
Definition: gpu_decoder.hh:1856
gem5::VegaISA::muladd
VecElemU32 muladd(VecElemU64 &dst, VecElemU32 val_0, VecElemU32 val_1, VecElemU64 val_2)
Definition: inst_util.hh:273
gem5::VegaISA::InFmt_VOP_SDWA::SRC1_NEG
unsigned int SRC1_NEG
Definition: gpu_decoder.hh:1861
gem5::ArmISA::sel
sel
Definition: misc_types.hh:650
gem5::SDWA_UNUSED_PRESERVE
@ SDWA_UNUSED_PRESERVE
Definition: inst_util.hh:61
gem5::findMsbSet
constexpr int findMsbSet(uint64_t val)
Returns the bit position of the MSB that is set in the input.
Definition: bitfield.hh:263
gem5::VegaISA::ScalarRegU32
uint32_t ScalarRegU32
Definition: gpu_registers.hh:155
gem5::SQ_DPP_ROW_HALF_MIRROR
@ SQ_DPP_ROW_HALF_MIRROR
Definition: inst_util.hh:80
gem5::VegaISA::InFmt_VOP_SDWA::SRC1_SEL
unsigned int SRC1_SEL
Definition: gpu_decoder.hh:1859
gem5
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
Definition: decoder.cc:40
gem5::SDWA_BYTE_0
@ SDWA_BYTE_0
Definition: inst_util.hh:47
gem5::VegaISA::InFmt_VOP_SDWA::DST_SEL
unsigned int DST_SEL
Definition: gpu_decoder.hh:1849
gem5::SDWA_UNUSED_SEXT
@ SDWA_UNUSED_SEXT
Definition: inst_util.hh:60
gem5::VegaISA::InFmt_VOP_SDWA::SRC1_SEXT
unsigned int SRC1_SEXT
Definition: gpu_decoder.hh:1860
gem5::VegaISA::processSDWA_dst
void processSDWA_dst(InFmt_VOP_SDWA sdwaInst, T &dst, T &origDst)
processSDWA_dst is a helper function for implementing sub d-word addressing instructions for the dst ...
Definition: inst_util.hh:881
panic
#define panic(...)
This implements a cprintf based panic() function.
Definition: logging.hh:177
gem5::SqDPPVals
SqDPPVals
Definition: inst_util.hh:65
gem5::VegaISA::processSDWA_src_helper
void processSDWA_src_helper(T &currSrc, T &origCurrSrc, const SDWASelVals src_sel, const bool src_signExt, const bool src_abs, const bool src_neg)
processSDWA_srcHelper is a helper function for implementing sub d-word addressing instructions for th...
Definition: inst_util.hh:790
gem5::VegaISA::InFmt_VOP_DPP::BC
unsigned int BC
Definition: gpu_decoder.hh:1838
gem5::VegaISA::findFirstOne
ScalarRegI32 findFirstOne(T val)
Definition: inst_util.hh:143
gem5::VegaISA::processSDWA_src
void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T &src0, T &origSrc0)
processSDWA_src is a helper function for implementing sub d-word addressing instructions for the src ...
Definition: inst_util.hh:825
gem5::VegaISA::InFmt_VOP_DPP
Definition: gpu_decoder.hh:1834

Generated on Tue Sep 21 2021 12:24:02 for gem5 by doxygen 1.8.17