gem5  [DEVELOP-FOR-23.0]
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
inst_util.hh
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015-2021 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from this
17  * software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #ifndef __ARCH_VEGA_INSTS_INST_UTIL_HH__
33 #define __ARCH_VEGA_INSTS_INST_UTIL_HH__
34 
35 #include <cmath>
36 
38 
39 namespace gem5
40 {
41 
/**
 * Encodings of the SDWA select field: which byte, word, or the whole dword
 * of a 32-bit value is selected.
 */
enum SDWASelVals : int
{
    SDWA_BYTE_0 = 0, // data[7:0]
    SDWA_BYTE_1 = 1, // data[15:8]
    SDWA_BYTE_2 = 2, // data[23:16]
    SDWA_BYTE_3 = 3, // data[31:24]
    SDWA_WORD_0 = 4, // data[15:0]
    SDWA_WORD_1 = 5, // data[31:16]
    SDWA_DWORD = 6   // data[31:0]
};
53 
/**
 * Encodings that say how the destination bits NOT covered by the SDWA
 * destination select are produced.
 */
enum SDWADstVals : int
{
    SDWA_UNUSED_PAD = 0,     // fill every unused bit with 0
    SDWA_UNUSED_SEXT = 1,    // sign-extend upper bits; lower bits become 0
    SDWA_UNUSED_PRESERVE = 2 // leave the unused destination bits as they are
};
61 
/**
 * Encodings of the DPP control field. Values up to SQ_DPP_QUAD_PERM_MAX are
 * quad permutations; the remaining values mark single operations or the
 * first/last member of a contiguous range (e.g. ROW_SL1..ROW_SL15).
 */
enum SqDPPVals : int
{
    // quad permute: every value in [0x00, 0xFF]
    SQ_DPP_QUAD_PERM_MAX = 0xFF,
    SQ_DPP_RESERVED = 0x100,
    // row shift left by 1..15
    SQ_DPP_ROW_SL1 = 0x101,
    SQ_DPP_ROW_SL15 = 0x10F,
    // row shift right by 1..15
    SQ_DPP_ROW_SR1 = 0x111,
    SQ_DPP_ROW_SR15 = 0x11F,
    // row rotate right by 1..15
    SQ_DPP_ROW_RR1 = 0x121,
    SQ_DPP_ROW_RR15 = 0x12F,
    // wavefront-wide shift/rotate by one lane
    SQ_DPP_WF_SL1 = 0x130,
    SQ_DPP_WF_RL1 = 0x134,
    SQ_DPP_WF_SR1 = 0x138,
    SQ_DPP_WF_RR1 = 0x13C,
    // mirrors and broadcasts
    SQ_DPP_ROW_MIRROR = 0x140,
    SQ_DPP_ROW_HALF_MIRROR = 0x141,
    SQ_DPP_ROW_BCAST15 = 0x142,
    SQ_DPP_ROW_BCAST31 = 0x143
};
// Number of lanes that make up one DPP row (processDPP derives the row
// number as lane / ROW_SIZE and the row offset as lane % ROW_SIZE).
static constexpr int ROW_SIZE = 16;
// Divisor processDPP applies to a row offset to obtain the bank index that
// is tested against the instruction's bank mask.
static constexpr int NUM_BANKS = 4;
84 
85 namespace VegaISA
86 {
/**
 * Expand a bit mask to quad (4-bit group) granularity.
 *
 * @param val Input mask.
 * @return A mask in which every aligned 4-bit group is all ones if the
 *         matching group of val has at least one bit set, and all zeros
 *         otherwise.
 */
template<typename T>
inline T
wholeQuadMode(T val)
{
    T wqm = 0;

    // Visit one 4-bit group per iteration. Shifting the group mask past the
    // top of T leaves it 0, which ends the loop.
    for (T quad = 0xF; quad != 0; quad <<= 4) {
        if ((val & quad) != 0) {
            wqm |= quad;
        }
    }

    return wqm;
}
100 
/**
 * Reduce a bit mask to one bit per quad (4-bit group).
 *
 * @param val Input mask.
 * @return A value whose bit i is set iff the i-th aligned 4-bit group of
 *         val has at least one bit set.
 */
template<typename T>
inline T
quadMask(T val)
{
    T qmsk = 0;
    T quad = 0xF;  // selects the 4-bit group under inspection
    T qbit = 0x1;  // result bit that represents that group

    // Both masks advance together; the loop ends once the group mask has
    // been shifted out of T.
    while (quad != 0) {
        if (val & quad) {
            qmsk |= qbit;
        }
        quad <<= 4;
        qbit <<= 1;
    }

    return qmsk;
}
117 
118  template<typename T>
119  inline ScalarRegI32
121  {
122  ScalarRegI32 num_zeros
123  = std::numeric_limits<T>::digits - popCount(val);
124 
125  return num_zeros;
126  }
127 
128  template<typename T>
129  inline ScalarRegI32
131  {
132  if (val == ~T(0)) {
133  return -1;
134  }
135 
136  return findLsbSet(~val);
137  }
138 
139  template<typename T>
140  inline ScalarRegI32
142  {
143  if (!val) {
144  return -1;
145  }
146 
147  return findLsbSet(val);
148  }
149 
150  template<typename T>
151  inline ScalarRegI32
153  {
154  if (!val) {
155  return -1;
156  }
157 
158  return findMsbSet(val);
159  }
160 
161  template<typename T>
162  inline ScalarRegI32
164  {
165  if (!val) {
166  return -1;
167  }
168 
169  return std::numeric_limits<T>::digits - 1 - findMsbSet(val);
170  }
171 
172  inline ScalarRegI32
174  {
175  bool found(false);
176  bool sign_bit = (val & 0x80000000) != 0;
177  ScalarRegU32 tmp_val(0);
178  int count(0);
179 
180  if (!val || val == -1) {
181  return -1;
182  }
183 
184  for (int i = 0; i < std::numeric_limits<ScalarRegU32>::digits; ++i) {
185  tmp_val = val & (0x80000000 >> i);
186 
187  if (!sign_bit) {
188  if (tmp_val) {
189  found = true;
190  break;
191  }
192  } else {
193  if (!tmp_val) {
194  found = true;
195  break;
196  }
197  }
198  ++count;
199  }
200 
201  if (found) {
202  return count;
203  } else {
204  return -1;
205  }
206  }
207 
208  inline ScalarRegI32
210  {
211  bool found(false);
212  bool sign_bit = (val & 0x8000000000000000ULL) != 0;
213  ScalarRegU64 tmp_val(0);
214  int count(0);
215 
216  if (!val || val == -1) {
217  return -1;
218  }
219 
220  for (int i = 0; i < std::numeric_limits<ScalarRegU64>::digits; ++i) {
221  tmp_val = val & (0x8000000000000000ULL >> i);
222 
223  if (!sign_bit) {
224  if (tmp_val) {
225  found = true;
226  break;
227  }
228  } else {
229  if (!tmp_val) {
230  found = true;
231  break;
232  }
233  }
234  ++count;
235  }
236 
237  if (found) {
238  return count;
239  } else {
240  return -1;
241  }
242  }
243 
/**
 * Return the median of three values.
 *
 * Floating-point types go through std::fmin/std::fmax; every other type
 * uses std::min/std::max. The choice is made at compile time, so only the
 * branch that matches T is instantiated.
 *
 * @param val_0 First value.
 * @param val_1 Second value.
 * @param val_2 Third value.
 * @return max(min(val_0, val_1), min(max(val_0, val_1), val_2)).
 */
template<typename T>
inline T
median(T val_0, T val_1, T val_2)
{
    if constexpr (std::is_floating_point_v<T>) {
        const T low = std::fmin(val_0, val_1);
        const T high = std::fmax(val_0, val_1);
        return std::fmax(low, std::fmin(high, val_2));
    } else {
        const T low = std::min(val_0, val_1);
        const T high = std::max(val_0, val_1);
        return std::max(low, std::min(high, val_2));
    }
}
256 
/**
 * Round a floating-point value to the nearest integer, resolving exact
 * .5 ties towards the even neighbour.
 *
 * The result is derived from floor(val) and the fractional distance to it,
 * so it neither adds 0.5 to val (which itself rounds and mis-handles values
 * such as 0.49999999999999994 or odd integers near 2^52) nor converts to
 * int (which is undefined for values outside the int range).
 *
 * @param val Value to round; T is expected to be a floating-point type.
 * @return The rounded value. NaN and infinities are returned unchanged.
 */
template <typename T>
inline T roundNearestEven(T val)
{
    const T lower = std::floor(val);
    const T frac = val - lower;

    if (frac < T(0.5)) {
        return lower;
    }

    if (frac > T(0.5)) {
        return lower + T(1);
    }

    // Exact tie: keep whichever neighbour is even. NaN and infinite inputs
    // also arrive here (frac is NaN) and come back out as NaN / infinity.
    return (std::fmod(lower, T(2)) == T(0)) ? lower : lower + T(1);
}
269 
270  inline VecElemU32
272  VecElemU64 val_2)
273  {
274  __uint128_t u0 = (__uint128_t)val_0;
275  __uint128_t u1 = (__uint128_t)val_1;
276  __uint128_t u2 = (__uint128_t)val_2;
277  __uint128_t result = u0 * u1 + u2;
278 
279  dst = (VecElemU64)result;
280 
281  return (VecElemU32)(result >> 64) ? 1 : 0;
282  }
283 
284  inline VecElemU32
286  VecElemI64 val_2)
287  {
288  __int128_t u0 = (__int128_t)val_0;
289  __int128_t u1 = (__int128_t)val_1;
290  __int128_t u2 = (__int128_t)val_2;
291  __int128_t result = u0 * u1 + u2;
292 
293  dst = (VecElemI64)result;
294 
295  return (VecElemU32)(result >> 64) ? 1 : 0;
296  }
297 
318  int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum,
319  int rowOffset, bool & outOfBounds)
320  {
321  // local variables
322  // newLane will be the same as the input lane unless swizzling happens
323  int newLane = currLane;
324  // for shift/rotate permutations; positive values are LEFT rotates
325  // shift/rotate left means lane n -> lane n-1 (e.g., lane 1 -> lane 0)
326  int count = 0;
327  int localRowOffset = rowOffset;
328  int localRowNum = rowNum;
329 
330  if (dppCtrl <= SQ_DPP_QUAD_PERM_MAX) { // DPP_QUAD_PERM{00:FF}
331  int quadBase = (currLane & ~(3));
332  int quadPix = (currLane & 3);
333  quadPix = ((dppCtrl >> (2 * quadPix)) & 3);
334  newLane = (quadBase | quadPix);
335  } else if (dppCtrl == SQ_DPP_RESERVED) {
336  panic("ERROR: instruction using reserved DPP_CTRL value\n");
337  } else if ((dppCtrl >= SQ_DPP_ROW_SL1) &&
338  (dppCtrl <= SQ_DPP_ROW_SL15)) { // DPP_ROW_SL{1:15}
339  count = (dppCtrl - SQ_DPP_ROW_SL1 + 1);
340  if ((localRowOffset + count >= 0) &&
341  (localRowOffset + count < ROW_SIZE)) {
342  localRowOffset += count;
343  newLane = ((rowNum * ROW_SIZE) | localRowOffset);
344  } else {
345  outOfBounds = true;
346  }
347  } else if ((dppCtrl >= SQ_DPP_ROW_SR1) &&
348  (dppCtrl <= SQ_DPP_ROW_SR15)) { // DPP_ROW_SR{1:15}
349  count = -(dppCtrl - SQ_DPP_ROW_SR1 + 1);
350  if ((localRowOffset + count >= 0) &&
351  (localRowOffset + count < ROW_SIZE)) {
352  localRowOffset += count;
353  newLane = ((rowNum * ROW_SIZE) | localRowOffset);
354  } else {
355  outOfBounds = true;
356  }
357  } else if ((dppCtrl >= SQ_DPP_ROW_RR1) &&
358  (dppCtrl <= SQ_DPP_ROW_RR15)) { // DPP_ROW_RR{1:15}
359  count = -(dppCtrl - SQ_DPP_ROW_RR1 + 1);
360  localRowOffset = (localRowOffset + count + ROW_SIZE) % ROW_SIZE;
361  newLane = ((rowNum * ROW_SIZE) | localRowOffset);
362  } else if (dppCtrl == SQ_DPP_WF_SL1) { // DPP_WF_SL1
363  if ((currLane >= 0) && (currLane < NumVecElemPerVecReg)) {
364  newLane += 1;
365  } else {
366  outOfBounds = true;
367  }
368  } else if (dppCtrl == SQ_DPP_WF_RL1) { // DPP_WF_RL1
369  newLane = (currLane - 1 + NumVecElemPerVecReg) %
371  } else if (dppCtrl == SQ_DPP_WF_SR1) { // DPP_WF_SR1
372  int currVal = (currLane - 1);
373  if ((currVal >= 0) && (currVal < NumVecElemPerVecReg)) {
374  newLane -= 1;
375  } else {
376  outOfBounds = true;
377  }
378  } else if (dppCtrl == SQ_DPP_WF_RR1) { // DPP_WF_RR1
379  newLane = (currLane - 1 + NumVecElemPerVecReg) %
381  } else if (dppCtrl == SQ_DPP_ROW_MIRROR) { // DPP_ROW_MIRROR
382  localRowOffset = (15 - localRowOffset);
383  newLane = (rowNum | localRowOffset);
384  } else if (dppCtrl == SQ_DPP_ROW_HALF_MIRROR) { // DPP_ROW_HALF_MIRROR
385  localRowNum = (currLane & -0x7);
386  localRowOffset = (currLane & 0x7);
387  localRowOffset = (7 - localRowNum);
388  newLane = (localRowNum | localRowOffset);
389  } else if (dppCtrl == SQ_DPP_ROW_BCAST15) { // DPP_ROW_BCAST15
390  count = 15;
391  if (currLane > count) {
392  // 0x30 selects which set of 16 lanes to use. We broadcast the
393  // last lane of one set to all lanes of the next set (e.g.,
394  // lane 15 is written to 16-31, 31 to 32-47, 47 to 48-63).
395  newLane = (currLane & 0x30) - 1;
396  } else {
397  outOfBounds = true;
398  }
399  } else if (dppCtrl == SQ_DPP_ROW_BCAST31) { // DPP_ROW_BCAST31
400  count = 31;
401  if (currLane > count) {
402  // 0x20 selects either the upper 32 or lower 32 lanes and
403  // broadcasts the last lane of one set to all lanes of the
404  // next set (e.g., lane 31 is written to 32-63).
405  newLane = (currLane & 0x20) - 1;
406  } else {
407  outOfBounds = true;
408  }
409  } else {
410  panic("Unimplemented DPP control operation: %d\n", dppCtrl);
411  }
412 
413  return newLane;
414  }
415 
421  template<typename T>
422  void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst,
423  T & src0)
424  {
425  // local variables
426  SqDPPVals dppCtrl = (SqDPPVals)dppInst.DPP_CTRL;
427  int boundCtrl = dppInst.BC;
428  int bankMask = dppInst.BANK_MASK;
429  int rowMask = dppInst.ROW_MASK;
430  // row, bank info to be calculated per lane
431  int rowNum = 0, bankNum = 0, rowOffset = 0;
432  // outLane will be the same as the input lane unless swizzling happens
433  int outLane = 0;
434  bool laneDisabled = false;
435  // flags used for determining if a lane should be written to/reset/etc.
436  bool outOfBounds = false, zeroSrc = false;
437  long long threadValid = 0;
438 
445  if (dppInst.SRC0_NEG) {
446  src0.negModifier();
447  }
448 
449  if (dppInst.SRC0_ABS) {
450  src0.absModifier();
451  }
452 
453  // Need a copy of the original data since we update one lane at a time
454  T src0_copy = src0;
455 
456  // iterate over all register lanes, performing steps 2-4
457  for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
458  threadValid = (0x1LL << lane);
464  rowNum = (lane / ROW_SIZE);
465  rowOffset = (lane % ROW_SIZE);
466  bankNum = (rowOffset / NUM_BANKS);
467 
468  if (((rowMask & (0x1 << rowNum)) == 0) /* row mask */ ||
469  ((bankMask & (0x1 << bankNum)) == 0) /* bank mask */) {
470  laneDisabled = true;
471  }
472 
489  if (!laneDisabled) {
490  outLane = dppInstImpl(dppCtrl, lane, rowNum, rowOffset,
491  outOfBounds);
492  }
493 
499  if (laneDisabled) {
500  threadValid = 0;
501  } else if (outOfBounds) {
502  if (boundCtrl == 1) {
503  zeroSrc = true;
504  } else {
505  threadValid = 0;
506  }
507  } else if (!gpuDynInst->wavefront()->execMask(lane)) {
508  if (boundCtrl == 1) {
509  zeroSrc = true;
510  } else {
511  threadValid = 0;
512  }
513  }
514 
515  if (threadValid != 0 && !outOfBounds && !zeroSrc) {
516  assert(!laneDisabled);
517  src0[lane] = src0_copy[outLane];
518  } else if (zeroSrc) {
519  src0[lane] = 0;
520  }
521 
522  // reset for next iteration
523  laneDisabled = false;
524  outOfBounds = false;
525  zeroSrc = false;
526  }
527  }
528 
534  template<typename T>
535  void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst,
536  T & src0, T & src1)
537  {
544  if (dppInst.SRC1_NEG) {
545  src1.negModifier();
546  }
547 
548  if (dppInst.SRC1_ABS) {
549  src1.absModifier();
550  }
551 
552  // Since only difference for VOP1 and VOP2/VOPC instructions is SRC1,
553  // which is only used for negation/absolute value, call other version
554  // to do everything else.
555  processDPP(gpuDynInst, dppInst, src0);
556  }
557 
564  template<typename T>
565  T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal,
566  const SDWASelVals sel, const bool signExt)
567  {
568  // local variables
569  int low_bit = 0, high_bit = 0;
570  bool signExt_local = signExt;
571  T retVal = 0;
572 
573  // if we're preserving all of the bits, then we can immediately return
574  if (sel == SDWA_DWORD) {
575  return currOperVal;
576  }
577 
578  if (sel < SDWA_WORD_0) { // we are selecting 1 byte
579  /*
580  Process byte 0 first. This code eiter selects the original bits
581  of byte 0, or makes the bits of the selected byte be byte 0 (and
582  next either sign extends or zero's out upper bits).
583  */
584  low_bit = (sel * VegaISA::BITS_PER_BYTE);
585  high_bit = low_bit + VegaISA::MSB_PER_BYTE;
586  retVal = bits(currOperVal, high_bit, low_bit);
587 
588  // make sure update propagated, since used next
590  bits(origOperVal, high_bit),
591  "ERROR: SDWA byte update not propagated: retVal: %d, "
592  "orig: %d\n", bits(retVal, VegaISA::MSB_PER_BYTE),
593  bits(origOperVal, high_bit));
594  // sign extended value depends on upper-most bit of the new byte 0
595  signExt_local = (signExt &&
596  (bits(retVal, VegaISA::MSB_PER_BYTE, 0) & 0x80));
597 
598  // process all other bytes -- if sign extending, make them 1, else
599  // all 0's so leave as is
600  if (signExt_local) {
601  retVal = (uint32_t)sext<VegaISA::MSB_PER_BYTE>(retVal);
602  }
603  } else if (sel < SDWA_DWORD) { // we are selecting 1 word
604  /*
605  Process word 0 first. This code eiter selects the original bits
606  of word 0, or makes the bits of the selected word be word 0 (and
607  next either sign extends or zero's out upper bits).
608  */
609  low_bit = (sel & 1) * VegaISA::BITS_PER_WORD;
610  high_bit = low_bit + VegaISA::MSB_PER_WORD;
611  retVal = bits(currOperVal, high_bit, low_bit);
612 
613  // make sure update propagated, since used next
615  bits(origOperVal, high_bit),
616  "ERROR: SDWA word update not propagated: retVal: %d, "
617  "orig: %d\n",
618  bits(retVal, VegaISA::MSB_PER_WORD),
619  bits(origOperVal, high_bit));
620  // sign extended value depends on upper-most bit of the new word 0
621  signExt_local = (signExt &&
622  (bits(retVal, VegaISA::MSB_PER_WORD, 0) &
623  0x8000));
624 
625  // process other word -- if sign extending, make them 1, else all
626  // 0's so leave as is
627  if (signExt_local) {
628  retVal = (uint32_t)sext<VegaISA::MSB_PER_WORD>(retVal);
629  }
630  } else {
631  assert(sel != SDWA_DWORD); // should have returned earlier
632  panic("Unimplemented SDWA select operation: %d\n", sel);
633  }
634 
635  return retVal;
636  }
637 
638 
657  template<typename T>
658  void sdwaInstSrcImpl(T & currOper, T & origCurrOper,
659  const SDWASelVals sel, const bool signExt)
660  {
661  // iterate over all lanes, setting appropriate, selected value
662  for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
663  currOper[lane] = sdwaInstSrcImpl_helper(currOper[lane],
664  origCurrOper[lane], sel,
665  signExt);
666  }
667  }
668 
669 
676  template<typename T>
677  T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal,
678  const bool clamp, const SDWASelVals sel,
679  const SDWADstVals unusedBits_format)
680  {
681  // local variables
682  int low_bit = 0, high_bit = 0;
683  bool signExt = (unusedBits_format == SDWA_UNUSED_SEXT);
684  //bool pad = (unusedBits_format == SDWA_UNUSED_PAD);
685  bool preserve = (unusedBits_format == SDWA_UNUSED_PRESERVE);
686  T retVal = 0, origBits_thisByte = 0, currBits_thisByte = 0,
687  origBits_thisWord = 0, currBits_thisWord = 0, newBits = 0;
688 
689  // if we're preserving all of the bits, then we can immediately return
690  if (unusedBits_format == SDWA_UNUSED_PRESERVE) {
691  assert(sel == SDWA_DWORD);
692  return currDstVal;
693  } else if (sel == SDWA_DWORD) {
694  // NOTE: users may set the unused bits variable to anything in this
695  // scenario, because it will be ignored
696  return currDstVal;
697  }
698 
699  if (sel < SDWA_WORD_0) { // we are selecting 1 byte
700  // if we sign extended depends on upper-most bit of byte 0
701  signExt = (signExt &&
702  (bits(currDstVal, VegaISA::MSB_PER_WORD, 0) & 0x80));
703 
704  for (int byte = 0; byte < 4; ++byte) {
705  low_bit = byte * VegaISA::BITS_PER_BYTE;
706  high_bit = low_bit + VegaISA::MSB_PER_BYTE;
707  /*
708  Options:
709  1. byte == sel: we are keeping all bits in this byte
710  2. preserve is set: keep this byte as is because the
711  output preserve flag is set
712  3. byte > sel && signExt: we're sign extending and
713  this byte is one of the bytes we need to sign extend
714  */
715  origBits_thisByte = bits(origDstVal, high_bit, low_bit);
716  currBits_thisByte = bits(currDstVal, high_bit, low_bit);
717  newBits = ((byte == sel) ? origBits_thisByte :
718  ((preserve) ? currBits_thisByte :
719  (((byte > sel) && signExt) ? 0xff : 0)));
720  retVal = insertBits(retVal, high_bit, low_bit, newBits);
721  }
722  } else if (sel < SDWA_DWORD) { // we are selecting 1 word
723  low_bit = 0;
724  high_bit = low_bit + VegaISA::MSB_PER_WORD;
725  // if we sign extended depends on upper-most bit of word 0
726  signExt = (signExt &&
727  (bits(currDstVal, high_bit, low_bit) & 0x8000));
728 
729  for (int word = 0; word < 2; ++word) {
730  low_bit = word * VegaISA::BITS_PER_WORD;
731  high_bit = low_bit + VegaISA::MSB_PER_WORD;
732  /*
733  Options:
734  1. word == sel & 1: we are keeping all bits in this word
735  2. preserve is set: keep this word as is because the
736  output preserve flag is set
737  3. word > (sel & 1) && signExt: we're sign extending and
738  this word is one of the words we need to sign extend
739  */
740  origBits_thisWord = bits(origDstVal, high_bit, low_bit);
741  currBits_thisWord = bits(currDstVal, high_bit, low_bit);
742  newBits = ((word == (sel & 0x1)) ? origBits_thisWord :
743  ((preserve) ? currBits_thisWord :
744  (((word > (sel & 0x1)) && signExt) ? 0xffff : 0)));
745  retVal = insertBits(retVal, high_bit, low_bit, newBits);
746  }
747  } else {
748  assert(sel != SDWA_DWORD); // should have returned earlier
749  panic("Unimplemented SDWA select operation: %d\n", sel);
750  }
751 
752  return retVal;
753  }
754 
755 
777  template<typename T>
778  void sdwaInstDstImpl(T & dstOper, T & origDstOper, const bool clamp,
779  const SDWASelVals sel,
780  const SDWADstVals unusedBits_format)
781  {
782  // iterate over all lanes, setting appropriate, selected value
783  for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
784  dstOper[lane] = sdwaInstDstImpl_helper(dstOper[lane],
785  origDstOper[lane], clamp,
786  sel, unusedBits_format);
787  }
788  }
789 
790 
798  template<typename T>
799  void processSDWA_src_helper(T & currSrc, T & origCurrSrc,
800  const SDWASelVals src_sel,
801  const bool src_signExt, const bool src_abs,
802  const bool src_neg)
803  {
811  if (src_neg) {
812  currSrc.negModifier();
813  }
814 
815  if (src_abs) {
816  currSrc.absModifier();
817  }
818 
822  sdwaInstSrcImpl(currSrc, origCurrSrc, src_sel, src_signExt);
823  }
824 
825 
833  template<typename T>
834  void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0)
835  {
836  // local variables
837  const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
838  const bool src0_signExt = sdwaInst.SRC0_SEXT;
839  const bool src0_neg = sdwaInst.SRC0_NEG;
840  const bool src0_abs = sdwaInst.SRC0_ABS;
841 
842  // NOTE: difference between VOP1 and VOP2/VOPC is that there is no src1
843  // operand. So ensure that SRC1 fields are not set, then call helper
844  // function only on src0.
845  assert(!sdwaInst.SRC1_SEXT);
846  assert(!sdwaInst.SRC1_NEG);
847  assert(!sdwaInst.SRC1_ABS);
848 
849  processSDWA_src_helper(src0, origSrc0, src0_sel, src0_signExt,
850  src0_abs, src0_neg);
851  }
852 
853 
861  template<typename T>
862  void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0,
863  T & src1, T & origSrc1)
864  {
865  // local variables
866  const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
867  const bool src0_signExt = sdwaInst.SRC0_SEXT;
868  const bool src0_neg = sdwaInst.SRC0_NEG;
869  const bool src0_abs = sdwaInst.SRC0_ABS;
870  const SDWASelVals src1_sel = (SDWASelVals)sdwaInst.SRC1_SEL;
871  const bool src1_signExt = sdwaInst.SRC1_SEXT;
872  const bool src1_neg = sdwaInst.SRC1_NEG;
873  const bool src1_abs = sdwaInst.SRC1_ABS;
874 
875  processSDWA_src_helper(src0, origSrc0, src0_sel, src0_signExt,
876  src0_abs, src0_neg);
877  processSDWA_src_helper(src1, origSrc1, src1_sel, src1_signExt,
878  src1_abs, src1_neg);
879  }
880 
881 
889  template<typename T>
890  void processSDWA_dst(InFmt_VOP_SDWA sdwaInst, T & dst, T & origDst)
891  {
892  // local variables
893  const SDWADstVals dst_unusedBits_format =
894  (SDWADstVals)sdwaInst.DST_U;
895  const SDWASelVals dst_sel = (SDWASelVals)sdwaInst.DST_SEL;
896  const bool clamp = sdwaInst.CLMP;
897 
902  sdwaInstDstImpl(dst, origDst, clamp, dst_sel, dst_unusedBits_format);
903  }
904 } // namespace VegaISA
905 } // namespace gem5
906 
907 #endif // __ARCH_VEGA_INSTS_INST_UTIL_HH__
gem5::SQ_DPP_WF_RL1
@ SQ_DPP_WF_RL1
Definition: inst_util.hh:74
gem5::VegaISA::BITS_PER_WORD
const int BITS_PER_WORD
Definition: gpu_registers.hh:144
gem5::SDWADstVals
SDWADstVals
Definition: inst_util.hh:55
gem5::VegaISA::sdwaInstDstImpl
void sdwaInstDstImpl(T &dstOper, T &origDstOper, const bool clamp, const SDWASelVals sel, const SDWADstVals unusedBits_format)
sdwaInstDestImpl is a helper function that selects the appropriate bits/bytes for the inputted dest o...
Definition: inst_util.hh:778
gem5::NUM_BANKS
static const int NUM_BANKS
Definition: inst_util.hh:83
sc_dt::word
unsigned int word
Definition: scfx_mant.hh:96
gem5::VegaISA::MSB_PER_BYTE
const int MSB_PER_BYTE
Definition: gpu_registers.hh:145
gem5::VegaISA::VecElemU64
uint64_t VecElemU64
Definition: gpu_registers.hh:168
gem5::VegaISA::InFmt_VOP_DPP::ROW_MASK
unsigned int ROW_MASK
Definition: gpu_decoder.hh:1852
gem5::VegaISA::InFmt_VOP_SDWA::SRC0_NEG
unsigned int SRC0_NEG
Definition: gpu_decoder.hh:1863
gem5::VegaISA::InFmt_VOP_DPP::DPP_CTRL
unsigned int DPP_CTRL
Definition: gpu_decoder.hh:1844
gem5::VegaISA::findFirstOneMsb
ScalarRegI32 findFirstOneMsb(T val)
Definition: inst_util.hh:152
gem5::VegaISA::sdwaInstSrcImpl
void sdwaInstSrcImpl(T &currOper, T &origCurrOper, const SDWASelVals sel, const bool signExt)
sdwaInstSrcImpl is a helper function that selects the appropriate bits/bytes for each lane of the inp...
Definition: inst_util.hh:658
gem5::SQ_DPP_ROW_SL1
@ SQ_DPP_ROW_SL1
Definition: inst_util.hh:67
gem5::VegaISA::NumVecElemPerVecReg
const int NumVecElemPerVecReg(64)
gem5::SQ_DPP_ROW_SR15
@ SQ_DPP_ROW_SR15
Definition: inst_util.hh:70
gem5::VegaISA::sdwaInstDstImpl_helper
T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal, const bool clamp, const SDWASelVals sel, const SDWADstVals unusedBits_format)
sdwaInstDstImpl_helper contains the per-lane code for selecting the appropriate bytes/words of the la...
Definition: inst_util.hh:677
gem5::X86ISA::val
Bitfield< 63 > val
Definition: misc.hh:776
gem5::SQ_DPP_ROW_MIRROR
@ SQ_DPP_ROW_MIRROR
Definition: inst_util.hh:77
gem5::SQ_DPP_QUAD_PERM_MAX
@ SQ_DPP_QUAD_PERM_MAX
Definition: inst_util.hh:65
gem5::SQ_DPP_ROW_BCAST31
@ SQ_DPP_ROW_BCAST31
Definition: inst_util.hh:80
gem5::ArmISA::i
Bitfield< 7 > i
Definition: misc_types.hh:67
gem5::SDWA_DWORD
@ SDWA_DWORD
Definition: inst_util.hh:51
gem5::VegaISA::InFmt_VOP_SDWA::SRC0_SEXT
unsigned int SRC0_SEXT
Definition: gpu_decoder.hh:1862
gem5::VegaISA::roundNearestEven
T roundNearestEven(T val)
Definition: inst_util.hh:258
gem5::SDWA_BYTE_1
@ SDWA_BYTE_1
Definition: inst_util.hh:46
gem5::SQ_DPP_ROW_BCAST15
@ SQ_DPP_ROW_BCAST15
Definition: inst_util.hh:79
gem5::SQ_DPP_WF_SL1
@ SQ_DPP_WF_SL1
Definition: inst_util.hh:73
gem5::VegaISA::InFmt_VOP_SDWA
Definition: gpu_decoder.hh:1855
gem5::mask
constexpr uint64_t mask(unsigned nbits)
Generate a 64-bit mask of 'nbits' 1s, right justified.
Definition: bitfield.hh:63
gem5::VegaISA::InFmt_VOP_DPP::SRC1_NEG
unsigned int SRC1_NEG
Definition: gpu_decoder.hh:1849
gem5::VegaISA::firstOppositeSignBit
ScalarRegI32 firstOppositeSignBit(ScalarRegI32 val)
Definition: inst_util.hh:173
gem5::SQ_DPP_WF_RR1
@ SQ_DPP_WF_RR1
Definition: inst_util.hh:76
gem5::VegaISA::InFmt_VOP_DPP::SRC0_NEG
unsigned int SRC0_NEG
Definition: gpu_decoder.hh:1847
gem5::VegaISA::InFmt_VOP_DPP::SRC0_ABS
unsigned int SRC0_ABS
Definition: gpu_decoder.hh:1848
gem5::VegaISA::countZeroBitsMsb
ScalarRegI32 countZeroBitsMsb(T val)
Definition: inst_util.hh:163
gem5::SQ_DPP_ROW_RR1
@ SQ_DPP_ROW_RR1
Definition: inst_util.hh:71
gem5::VegaISA::InFmt_VOP_SDWA::CLMP
unsigned int CLMP
Definition: gpu_decoder.hh:1859
gem5::VegaISA::median
T median(T val_0, T val_1, T val_2)
Definition: inst_util.hh:246
gem5::SQ_DPP_RESERVED
@ SQ_DPP_RESERVED
Definition: inst_util.hh:66
gem5::VegaISA::InFmt_VOP_DPP::SRC1_ABS
unsigned int SRC1_ABS
Definition: gpu_decoder.hh:1850
gem5::findLsbSet
constexpr int findLsbSet(uint64_t val)
Returns the bit position of the LSB that is set in the input.
Definition: bitfield.hh:312
gem5::SQ_DPP_WF_SR1
@ SQ_DPP_WF_SR1
Definition: inst_util.hh:75
gem5::X86ISA::count
count
Definition: misc.hh:710
gem5::popCount
constexpr int popCount(uint64_t val)
Returns the number of set ones in the provided value.
Definition: bitfield.hh:350
gem5::SDWA_BYTE_2
@ SDWA_BYTE_2
Definition: inst_util.hh:47
gem5::VegaISA::InFmt_VOP_DPP::BANK_MASK
unsigned int BANK_MASK
Definition: gpu_decoder.hh:1851
gem5::VegaISA::InFmt_VOP_SDWA::SRC1_ABS
unsigned int SRC1_ABS
Definition: gpu_decoder.hh:1870
gem5::SQ_DPP_ROW_SR1
@ SQ_DPP_ROW_SR1
Definition: inst_util.hh:69
gem5::VegaISA::findFirstZero
ScalarRegI32 findFirstZero(T val)
Definition: inst_util.hh:130
gpu_registers.hh
gem5::SQ_DPP_ROW_RR15
@ SQ_DPP_ROW_RR15
Definition: inst_util.hh:72
gem5::VegaISA::wholeQuadMode
T wholeQuadMode(T val)
Definition: inst_util.hh:89
gem5::insertBits
constexpr T insertBits(T val, unsigned first, unsigned last, B bit_val)
Returns val with bits first to last set to the LSBs of bit_val.
Definition: bitfield.hh:182
gem5::bits
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
Definition: bitfield.hh:76
gem5::SDWASelVals
SDWASelVals
Definition: inst_util.hh:43
gem5::VegaISA::quadMask
T quadMask(T val)
Definition: inst_util.hh:103
gem5::VegaISA::InFmt_VOP_SDWA::SRC0_SEL
unsigned int SRC0_SEL
Definition: gpu_decoder.hh:1861
gem5::VegaISA::VecElemI64
int64_t VecElemI64
Definition: gpu_registers.hh:169
gem5::SDWA_BYTE_3
@ SDWA_BYTE_3
Definition: inst_util.hh:48
gem5::ROW_SIZE
static const int ROW_SIZE
Definition: inst_util.hh:82
gem5::SQ_DPP_ROW_SL15
@ SQ_DPP_ROW_SL15
Definition: inst_util.hh:68
gem5::VegaISA::ScalarRegU64
uint64_t ScalarRegU64
Definition: gpu_registers.hh:156
gem5::VegaISA::VecElemI32
int32_t VecElemI32
Definition: gpu_registers.hh:166
gem5::GPUDynInstPtr
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition: misc.hh:49
gem5::VegaISA::ScalarRegI64
int64_t ScalarRegI64
Definition: gpu_registers.hh:157
gem5::SDWA_UNUSED_PAD
@ SDWA_UNUSED_PAD
Definition: inst_util.hh:57
gem5::VegaISA::VecElemU32
uint32_t VecElemU32
Definition: gpu_registers.hh:165
gem5::VegaISA::dppInstImpl
int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum, int rowOffset, bool &outOfBounds)
dppInstImpl is a helper function that performs the inputted operation on the inputted vector register...
Definition: inst_util.hh:318
gem5::VegaISA::ScalarRegI32
int32_t ScalarRegI32
Definition: gpu_registers.hh:154
gem5::VegaISA::InFmt_VOP_SDWA::DST_U
unsigned int DST_U
Definition: gpu_decoder.hh:1858
panic_if
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Definition: logging.hh:214
gem5::SDWA_WORD_1
@ SDWA_WORD_1
Definition: inst_util.hh:50
gem5::VegaISA::BITS_PER_BYTE
const int BITS_PER_BYTE
Definition: gpu_registers.hh:143
gem5::VegaISA::countZeroBits
ScalarRegI32 countZeroBits(T val)
Definition: inst_util.hh:120
gem5::VegaISA::processDPP
void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst, T &src0)
processDPP is a helper function for implementing Data Parallel Primitive instructions.
Definition: inst_util.hh:422
gem5::VegaISA::sdwaInstSrcImpl_helper
T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal, const SDWASelVals sel, const bool signExt)
sdwaInstSrcImpl_helper contains the per-lane code for selecting the appropriate bytes/words of the la...
Definition: inst_util.hh:565
gem5::VegaISA::MSB_PER_WORD
const int MSB_PER_WORD
Definition: gpu_registers.hh:146
gem5::SDWA_WORD_0
@ SDWA_WORD_0
Definition: inst_util.hh:49
gem5::VegaISA::InFmt_VOP_SDWA::SRC0_ABS
unsigned int SRC0_ABS
Definition: gpu_decoder.hh:1864
gem5::VegaISA::muladd
VecElemU32 muladd(VecElemU64 &dst, VecElemU32 val_0, VecElemU32 val_1, VecElemU64 val_2)
Definition: inst_util.hh:271
gem5::VegaISA::InFmt_VOP_SDWA::SRC1_NEG
unsigned int SRC1_NEG
Definition: gpu_decoder.hh:1869
gem5::ArmISA::sel
sel
Definition: misc_types.hh:708
gem5::SDWA_UNUSED_PRESERVE
@ SDWA_UNUSED_PRESERVE
Definition: inst_util.hh:59
gem5::findMsbSet
constexpr int findMsbSet(uint64_t val)
Returns the bit position of the MSB that is set in the input.
Definition: bitfield.hh:276
gem5::VegaISA::ScalarRegU32
uint32_t ScalarRegU32
Definition: gpu_registers.hh:153
gem5::SQ_DPP_ROW_HALF_MIRROR
@ SQ_DPP_ROW_HALF_MIRROR
Definition: inst_util.hh:78
gem5::VegaISA::InFmt_VOP_SDWA::SRC1_SEL
unsigned int SRC1_SEL
Definition: gpu_decoder.hh:1867
gem5
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
Definition: gpu_translation_state.hh:37
gem5::SDWA_BYTE_0
@ SDWA_BYTE_0
Definition: inst_util.hh:45
gem5::VegaISA::InFmt_VOP_SDWA::DST_SEL
unsigned int DST_SEL
Definition: gpu_decoder.hh:1857
gem5::SDWA_UNUSED_SEXT
@ SDWA_UNUSED_SEXT
Definition: inst_util.hh:58
gem5::VegaISA::InFmt_VOP_SDWA::SRC1_SEXT
unsigned int SRC1_SEXT
Definition: gpu_decoder.hh:1868
gem5::VegaISA::processSDWA_dst
void processSDWA_dst(InFmt_VOP_SDWA sdwaInst, T &dst, T &origDst)
processSDWA_dst is a helper function for implementing sub d-word addressing instructions for the dst ...
Definition: inst_util.hh:890
panic
#define panic(...)
This implements a cprintf based panic() function.
Definition: logging.hh:188
gem5::SqDPPVals
SqDPPVals
Definition: inst_util.hh:63
gem5::VegaISA::processSDWA_src_helper
void processSDWA_src_helper(T &currSrc, T &origCurrSrc, const SDWASelVals src_sel, const bool src_signExt, const bool src_abs, const bool src_neg)
processSDWA_srcHelper is a helper function for implementing sub d-word addressing instructions for th...
Definition: inst_util.hh:799
gem5::VegaISA::InFmt_VOP_DPP::BC
unsigned int BC
Definition: gpu_decoder.hh:1846
gem5::VegaISA::findFirstOne
ScalarRegI32 findFirstOne(T val)
Definition: inst_util.hh:141
gem5::VegaISA::processSDWA_src
void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T &src0, T &origSrc0)
processSDWA_src is a helper function for implementing sub d-word addressing instructions for the src ...
Definition: inst_util.hh:834
gem5::VegaISA::InFmt_VOP_DPP
Definition: gpu_decoder.hh:1842

Generated on Sun Jul 30 2023 01:56:33 for gem5 by doxygen 1.8.17