gem5  v21.0.1.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
inst_util.hh
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * For use for simulation and test purposes only
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright notice,
11  * this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  * this list of conditions and the following disclaimer in the documentation
15  * and/or other materials provided with the distribution.
16  *
17  * 3. Neither the name of the copyright holder nor the names of its
18  * contributors may be used to endorse or promote products derived from this
19  * software without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #ifndef __ARCH_GCN3_INSTS_INST_UTIL_HH__
35 #define __ARCH_GCN3_INSTS_INST_UTIL_HH__
36 
37 #include <cmath>
38 
39 #include "arch/gcn3/registers.hh"
40 
/**
 * Encodings of the SDWA (sub-dword addressing) select fields. Each value
 * names the slice of a 32-bit lane value that an operand refers to.
 */
enum SDWASelVals : int
{
    // single-byte slices
    SDWA_BYTE_0 = 0, // data[7:0]
    SDWA_BYTE_1 = 1, // data[15:8]
    SDWA_BYTE_2 = 2, // data[23:16]
    SDWA_BYTE_3 = 3, // data[31:24]
    // 16-bit word slices
    SDWA_WORD_0 = 4, // data[15:0]
    SDWA_WORD_1 = 5, // data[31:16]
    // the whole dword
    SDWA_DWORD = 6   // data[31:0]
};
52 
/**
 * Encodings of the SDWA destination "unused bits" field: what happens to
 * the destination bits that lie outside the selected byte/word.
 */
enum SDWADstVals : int
{
    SDWA_UNUSED_PAD = 0,      // fill every unused bit with 0
    SDWA_UNUSED_SEXT = 1,     // sign-extend upper bits; lower bits get 0
    SDWA_UNUSED_PRESERVE = 2  // leave the unused bits as they are
};
60 
/**
 * Encodings of the DPP_CTRL field of DPP (data parallel primitive)
 * instructions. Values 0x00-0xFF are quad permutes whose four 2-bit fields
 * select the source lane within each quad; the remaining values select row
 * or wavefront shifts, rotates, mirrors and broadcasts.
 */
enum SqDPPVals : int
{
    SQ_DPP_QUAD_PERM_MAX = 0xFF,    // last quad-permute encoding
    SQ_DPP_RESERVED = 0x100,
    SQ_DPP_ROW_SL1 = 0x101,         // row shift left by 1 ...
    SQ_DPP_ROW_SL15 = 0x10F,        // ... through 15
    SQ_DPP_ROW_SR1 = 0x111,         // row shift right by 1 ...
    SQ_DPP_ROW_SR15 = 0x11F,        // ... through 15
    SQ_DPP_ROW_RR1 = 0x121,         // row rotate right by 1 ...
    SQ_DPP_ROW_RR15 = 0x12F,        // ... through 15
    SQ_DPP_WF_SL1 = 0x130,          // wavefront shift left by 1
    SQ_DPP_WF_RL1 = 0x134,          // wavefront rotate left by 1
    SQ_DPP_WF_SR1 = 0x138,          // wavefront shift right by 1
    SQ_DPP_WF_RR1 = 0x13C,          // wavefront rotate right by 1
    SQ_DPP_ROW_MIRROR = 0x140,      // reverse the lanes of each row
    SQ_DPP_ROW_HALF_MIRROR = 0x141, // reverse the lanes of each half row
    SQ_DPP_ROW_BCAST15 = 0x142,     // broadcast lane 15 of each row
    SQ_DPP_ROW_BCAST31 = 0x143      // broadcast lane 31
};
// Lane geometry used by the DPP helpers.
static constexpr int ROW_SIZE = 16; // 16 registers per row
static constexpr int NUM_BANKS = 4; // 64 registers, 16/bank
83 
84 namespace Gcn3ISA
85 {
/**
 * Expand a bit mask to whole-quad granularity: every aligned group of four
 * bits (a quad) that has at least one bit set in val becomes 0xF in the
 * result; quads that are all zero stay zero.
 *
 * @param val input mask.
 * @return val with every non-empty quad fully set.
 */
template<typename T>
inline T
wholeQuadMode(T val)
{
    T wqm = 0;

    // walk the quads from least to most significant; the loop ends once the
    // 4-bit window has been shifted out of T
    for (T mask = 0xF; mask != 0; mask <<= 4) {
        if ((val & mask) != 0) {
            wqm |= mask;
        }
    }

    return wqm;
}
99 
/**
 * Collapse a bit mask to one bit per quad: bit i of the result is set when
 * any of bits [4*i+3 : 4*i] of val is set.
 *
 * @param val input mask.
 * @return the per-quad summary mask.
 */
template<typename T>
inline T
quadMask(T val)
{
    T qmsk = 0;
    T qbit = 0x1;

    // mask selects the current quad, qbit the result bit that represents it
    for (T mask = 0xF; mask != 0; mask <<= 4, qbit <<= 1) {
        if (val & mask) {
            qmsk |= qbit;
        }
    }

    return qmsk;
}
116 
117  template<typename T>
118  inline ScalarRegI32
120  {
121  ScalarRegI32 num_zeros
122  = std::numeric_limits<T>::digits - popCount(val);
123 
124  return num_zeros;
125  }
126 
127  template<typename T>
128  inline ScalarRegI32
130  {
131  if (val == ~T(0)) {
132  return -1;
133  }
134 
135  return findLsbSet(~val);
136  }
137 
138  template<typename T>
139  inline ScalarRegI32
141  {
142  if (!val) {
143  return -1;
144  }
145 
146  return findLsbSet(val);
147  }
148 
149  template<typename T>
150  inline ScalarRegI32
152  {
153  if (!val) {
154  return -1;
155  }
156 
157  return findMsbSet(val);
158  }
159 
160  template<typename T>
161  inline ScalarRegI32
163  {
164  if (!val) {
165  return -1;
166  }
167 
168  return std::numeric_limits<T>::digits - 1 - findMsbSet(val);
169  }
170 
171  inline ScalarRegI32
173  {
174  bool found(false);
175  bool sign_bit = (val & 0x80000000) != 0;
176  ScalarRegU32 tmp_val(0);
177  int count(0);
178 
179  if (!val || val == -1) {
180  return -1;
181  }
182 
183  for (int i = 0; i < std::numeric_limits<ScalarRegU32>::digits; ++i) {
184  tmp_val = val & (0x80000000 >> i);
185 
186  if (!sign_bit) {
187  if (tmp_val) {
188  found = true;
189  break;
190  }
191  } else {
192  if (!tmp_val) {
193  found = true;
194  break;
195  }
196  }
197  ++count;
198  }
199 
200  if (found) {
201  return count;
202  } else {
203  return -1;
204  }
205  }
206 
207  inline ScalarRegI32
209  {
210  bool found(false);
211  bool sign_bit = (val & 0x8000000000000000ULL) != 0;
212  ScalarRegU64 tmp_val(0);
213  int count(0);
214 
215  if (!val || val == -1) {
216  return -1;
217  }
218 
219  for (int i = 0; i < std::numeric_limits<ScalarRegU64>::digits; ++i) {
220  tmp_val = val & (0x8000000000000000ULL >> i);
221 
222  if (!sign_bit) {
223  if (tmp_val) {
224  found = true;
225  break;
226  }
227  } else {
228  if (!tmp_val) {
229  found = true;
230  break;
231  }
232  }
233  ++count;
234  }
235 
236  if (found) {
237  return count;
238  } else {
239  return -1;
240  }
241  }
242 
/**
 * Return the median (middle value) of three values.
 *
 * Floating-point types go through std::fmin/std::fmax, all other types
 * through std::min/std::max.
 *
 * @param val_0 first value.
 * @param val_1 second value.
 * @param val_2 third value.
 * @return the middle of the three values.
 */
template<typename T>
inline T
median(T val_0, T val_1, T val_2)
{
    if (!std::is_floating_point<T>::value) {
        // order the first pair, then clamp the third value into it
        const T lower = std::min(val_0, val_1);
        const T upper = std::max(val_0, val_1);
        return std::max(lower, std::min(upper, val_2));
    }

    const auto lower = std::fmin(val_0, val_1);
    const auto upper = std::fmax(val_0, val_1);
    return std::fmax(lower, std::fmin(upper, val_2));
}
255 
/**
 * Round a floating-point value to the nearest integral value, resolving
 * exact ties (fraction of one half) towards the even neighbour.
 *
 * The computation stays in floating point throughout, so it is valid for
 * magnitudes that do not fit in an int, and it does not add 0.5 to the
 * input (which would round values just below one half the wrong way).
 *
 * @param val floating-point value to round.
 * @return the integral value nearest to val, ties to even; NaN and
 *         infinities are returned unchanged.
 */
template <typename T>
inline T roundNearestEven(T val)
{
    // exact tie: the fractional part is precisely one half
    if (std::fabs(val - std::trunc(val)) == T(0.5)) {
        // val / 2 then lies on a quarter, never on a tie, so rounding it
        // and doubling lands on the even neighbour of val
        return T(2) * std::round(val / T(2));
    }

    // every other finite value has a unique nearest integer
    return std::round(val);
}
268 
269  inline VecElemU32
271  VecElemU64 val_2)
272  {
273  __uint128_t u0 = (__uint128_t)val_0;
274  __uint128_t u1 = (__uint128_t)val_1;
275  __uint128_t u2 = (__uint128_t)val_2;
276  __uint128_t result = u0 * u1 + u2;
277 
278  dst = (VecElemU64)result;
279 
280  return (VecElemU32)(result >> 64) ? 1 : 0;
281  }
282 
283  inline VecElemU32
285  VecElemI64 val_2)
286  {
287  __int128_t u0 = (__int128_t)val_0;
288  __int128_t u1 = (__int128_t)val_1;
289  __int128_t u2 = (__int128_t)val_2;
290  __int128_t result = u0 * u1 + u2;
291 
292  dst = (VecElemI64)result;
293 
294  return (VecElemU32)(result >> 64) ? 1 : 0;
295  }
296 
317  int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum,
318  int rowOffset, bool & outOfBounds)
319  {
320  // local variables
321  // newLane will be the same as the input lane unless swizzling happens
322  int newLane = currLane;
323  // for shift/rotate permutations; positive values are LEFT rotates
324  int count = 1;
325  int localRowOffset = rowOffset;
326  int localRowNum = rowNum;
327 
328  if (dppCtrl <= SQ_DPP_QUAD_PERM_MAX) { // DPP_QUAD_PERM{00:FF}
329  int quadBase = (currLane & ~(3));
330  int quadPix = (currLane & 3);
331  quadPix = ((dppCtrl >> (2 * quadPix)) & 3);
332  newLane = (quadBase | quadPix);
333  } else if (dppCtrl == SQ_DPP_RESERVED) {
334  panic("ERROR: instruction using reserved DPP_CTRL value\n");
335  } else if ((dppCtrl >= SQ_DPP_ROW_SL1) &&
336  (dppCtrl <= SQ_DPP_ROW_SL15)) { // DPP_ROW_SL{1:15}
337  count -= (dppCtrl - SQ_DPP_ROW_SL1 + 1);
338  if ((localRowOffset + count >= 0) &&
339  (localRowOffset + count < ROW_SIZE)) {
340  localRowOffset += count;
341  newLane = (rowNum | localRowOffset);
342  } else {
343  outOfBounds = true;
344  }
345  } else if ((dppCtrl >= SQ_DPP_ROW_SR1) &&
346  (dppCtrl <= SQ_DPP_ROW_SR15)) { // DPP_ROW_SR{1:15}
347  count -= (dppCtrl - SQ_DPP_ROW_SR1 + 1);
348  if ((localRowOffset + count >= 0) &&
349  (localRowOffset + count < ROW_SIZE)) {
350  localRowOffset += count;
351  newLane = (rowNum | localRowOffset);
352  } else {
353  outOfBounds = true;
354  }
355  } else if ((dppCtrl >= SQ_DPP_ROW_RR1) &&
356  (dppCtrl <= SQ_DPP_ROW_RR15)) { // DPP_ROW_RR{1:15}
357  count -= (dppCtrl - SQ_DPP_ROW_RR1 + 1);
358  localRowOffset = (localRowOffset + count + ROW_SIZE) % ROW_SIZE;
359  newLane = (rowNum | localRowOffset);
360  } else if (dppCtrl == SQ_DPP_WF_SL1) { // DPP_WF_SL1
361  count = 1;
362  if ((currLane >= 0) && (currLane < NumVecElemPerVecReg)) {
363  newLane += count;
364  } else {
365  outOfBounds = true;
366  }
367  } else if (dppCtrl == SQ_DPP_WF_RL1) { // DPP_WF_RL1
368  count = 1;
369  newLane = (currLane + count + NumVecElemPerVecReg) %
371  } else if (dppCtrl == SQ_DPP_WF_SR1) { // DPP_WF_SR1
372  count = -1;
373  int currVal = (currLane + count);
374  if ((currVal >= 0) && (currVal < NumVecElemPerVecReg)) {
375  newLane += count;
376  } else {
377  outOfBounds = true;
378  }
379  } else if (dppCtrl == SQ_DPP_WF_RR1) { // DPP_WF_RR1
380  count = -1;
381  newLane = (currLane + count + NumVecElemPerVecReg) %
383  } else if (dppCtrl == SQ_DPP_ROW_MIRROR) { // DPP_ROW_MIRROR
384  localRowOffset = (15 - localRowOffset);
385  newLane = (rowNum | localRowOffset);
386  } else if (dppCtrl == SQ_DPP_ROW_HALF_MIRROR) { // DPP_ROW_HALF_MIRROR
387  localRowNum = (currLane & -0x7);
388  localRowOffset = (currLane & 0x7);
389  localRowOffset = (7 - localRowNum);
390  newLane = (localRowNum | localRowOffset);
391  } else if (dppCtrl == SQ_DPP_ROW_BCAST15) { // DPP_ROW_BCAST15
392  count = 15;
393  if (currLane > count) {
394  newLane = (currLane & ~count) - 1;
395  }
396  } else if (dppCtrl == SQ_DPP_ROW_BCAST31) { // DPP_ROW_BCAST31
397  count = 31;
398  if (currLane > count) {
399  newLane = (currLane & ~count) - 1;
400  }
401  } else {
402  panic("Unimplemented DPP control operation: %d\n", dppCtrl);
403  }
404 
405  return newLane;
406  }
407 
413  template<typename T>
414  void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst,
415  T & src0)
416  {
417  // local variables
418  SqDPPVals dppCtrl = (SqDPPVals)dppInst.DPP_CTRL;
419  int boundCtrl = dppInst.BOUND_CTRL;
420  int bankMask = dppInst.BANK_MASK;
421  int rowMask = dppInst.ROW_MASK;
422  // row, bank info to be calculated per lane
423  int rowNum = 0, bankNum = 0, rowOffset = 0;
424  // outLane will be the same as the input lane unless swizzling happens
425  int outLane = 0;
426  bool laneDisabled = false;
427  // flags used for determining if a lane should be written to/reset/etc.
428  bool outOfBounds = false, zeroSrc = false;
429  long long threadValid = 0;
430 
437  if (dppInst.SRC0_NEG) {
438  src0.negModifier();
439  }
440 
441  if (dppInst.SRC0_ABS) {
442  src0.absModifier();
443  }
444 
445  // iterate over all register lanes, performing steps 2-4
446  for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
447  threadValid = (0x1LL << lane);
453  rowNum = (lane / ROW_SIZE);
454  rowOffset = (lane % ROW_SIZE);
455  bankNum = (rowOffset / NUM_BANKS);
456 
457  if (((rowMask & (0x1 << rowNum)) == 0) /* row mask */ ||
458  ((bankMask & (0x1 << bankNum)) == 0) /* bank mask */) {
459  laneDisabled = true;
460  continue;
461  }
462 
479  if (!laneDisabled) {
480  outLane = dppInstImpl(dppCtrl, lane, rowNum, rowOffset,
481  outOfBounds);
482  }
483 
489  if (laneDisabled) {
490  threadValid = 0;
491  } else if (outOfBounds) {
492  if (boundCtrl == 1) {
493  zeroSrc = true;
494  } else {
495  threadValid = 0;
496  }
497  } else if (!gpuDynInst->exec_mask[lane]) {
498  if (boundCtrl == 1) {
499  zeroSrc = true;
500  } else {
501  threadValid = 0;
502  }
503  }
504 
505  if (threadValid != 0 && !outOfBounds && !zeroSrc) {
506  assert(!laneDisabled);
507  src0[outLane] = src0[lane];
508  } else if (zeroSrc) {
509  src0[lane] = 0;
510  }
511 
512  // reset for next iteration
513  laneDisabled = false;
514  }
515  }
516 
522  template<typename T>
523  void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst,
524  T & src0, T & src1)
525  {
532  if (dppInst.SRC1_NEG) {
533  src1.negModifier();
534  }
535 
536  if (dppInst.SRC1_ABS) {
537  src1.absModifier();
538  }
539 
540  // Since only difference for VOP1 and VOP2/VOPC instructions is SRC1,
541  // which is only used for negation/absolute value, call other version
542  // to do everything else.
543  processDPP(gpuDynInst, dppInst, src0);
544  }
545 
552  template<typename T>
553  T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal,
554  const SDWASelVals sel, const bool signExt)
555  {
556  // local variables
557  int low_bit = 0, high_bit = 0;
558  bool signExt_local = signExt;
559  T retVal = 0;
560 
561  // if we're preserving all of the bits, then we can immediately return
562  if (sel == SDWA_DWORD) {
563  return currOperVal;
564  }
565 
566  if (sel < SDWA_WORD_0) { // we are selecting 1 byte
567  /*
568  Process byte 0 first. This code eiter selects the original bits
569  of byte 0, or makes the bits of the selected byte be byte 0 (and
570  next either sign extends or zero's out upper bits).
571  */
572  low_bit = (sel * Gcn3ISA::BITS_PER_BYTE);
573  high_bit = low_bit + Gcn3ISA::MSB_PER_BYTE;
574  retVal = bits(currOperVal, high_bit, low_bit);
575 
576  // make sure update propagated, since used next
578  bits(origOperVal, high_bit),
579  "ERROR: SDWA byte update not propagated: retVal: %d, "
580  "orig: %d\n", bits(retVal, Gcn3ISA::MSB_PER_BYTE),
581  bits(origOperVal, high_bit));
582  // sign extended value depends on upper-most bit of the new byte 0
583  signExt_local = (signExt &&
584  (bits(retVal, Gcn3ISA::MSB_PER_BYTE, 0) & 0x80));
585 
586  // process all other bytes -- if sign extending, make them 1, else
587  // all 0's so leave as is
588  if (signExt_local) {
589  retVal = (uint32_t)sext<Gcn3ISA::MSB_PER_BYTE>(retVal);
590  }
591  } else if (sel < SDWA_DWORD) { // we are selecting 1 word
592  /*
593  Process word 0 first. This code eiter selects the original bits
594  of word 0, or makes the bits of the selected word be word 0 (and
595  next either sign extends or zero's out upper bits).
596  */
597  low_bit = (sel & 1) * Gcn3ISA::BITS_PER_WORD;
598  high_bit = low_bit + Gcn3ISA::MSB_PER_WORD;
599  retVal = bits(currOperVal, high_bit, low_bit);
600 
601  // make sure update propagated, since used next
603  bits(origOperVal, high_bit),
604  "ERROR: SDWA word update not propagated: retVal: %d, "
605  "orig: %d\n",
606  bits(retVal, Gcn3ISA::MSB_PER_WORD),
607  bits(origOperVal, high_bit));
608  // sign extended value depends on upper-most bit of the new word 0
609  signExt_local = (signExt &&
610  (bits(retVal, Gcn3ISA::MSB_PER_WORD, 0) &
611  0x8000));
612 
613  // process other word -- if sign extending, make them 1, else all
614  // 0's so leave as is
615  if (signExt_local) {
616  retVal = (uint32_t)sext<Gcn3ISA::MSB_PER_WORD>(retVal);
617  }
618  } else {
619  assert(sel != SDWA_DWORD); // should have returned earlier
620  panic("Unimplemented SDWA select operation: %d\n", sel);
621  }
622 
623  return retVal;
624  }
625 
626 
645  template<typename T>
646  void sdwaInstSrcImpl(T & currOper, T & origCurrOper,
647  const SDWASelVals sel, const bool signExt)
648  {
649  // iterate over all lanes, setting appropriate, selected value
650  for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
651  currOper[lane] = sdwaInstSrcImpl_helper(currOper[lane],
652  origCurrOper[lane], sel,
653  signExt);
654  }
655  }
656 
657 
664  template<typename T>
665  T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal,
666  const bool clamp, const SDWASelVals sel,
667  const SDWADstVals unusedBits_format)
668  {
669  // local variables
670  int low_bit = 0, high_bit = 0;
671  bool signExt = (unusedBits_format == SDWA_UNUSED_SEXT);
672  //bool pad = (unusedBits_format == SDWA_UNUSED_PAD);
673  bool preserve = (unusedBits_format == SDWA_UNUSED_PRESERVE);
674  T retVal = 0, origBits_thisByte = 0, currBits_thisByte = 0,
675  origBits_thisWord = 0, currBits_thisWord = 0, newBits = 0;
676 
677  // if we're preserving all of the bits, then we can immediately return
678  if (unusedBits_format == SDWA_UNUSED_PRESERVE) {
679  assert(sel == SDWA_DWORD);
680  return currDstVal;
681  } else if (sel == SDWA_DWORD) {
682  // NOTE: users may set the unused bits variable to anything in this
683  // scenario, because it will be ignored
684  return currDstVal;
685  }
686 
687  if (sel < SDWA_WORD_0) { // we are selecting 1 byte
688  // if we sign extended depends on upper-most bit of byte 0
689  signExt = (signExt &&
690  (bits(currDstVal, Gcn3ISA::MSB_PER_WORD, 0) & 0x80));
691 
692  for (int byte = 0; byte < 4; ++byte) {
693  low_bit = byte * Gcn3ISA::BITS_PER_BYTE;
694  high_bit = low_bit + Gcn3ISA::MSB_PER_BYTE;
695  /*
696  Options:
697  1. byte == sel: we are keeping all bits in this byte
698  2. preserve is set: keep this byte as is because the
699  output preserve flag is set
700  3. byte > sel && signExt: we're sign extending and
701  this byte is one of the bytes we need to sign extend
702  */
703  origBits_thisByte = bits(origDstVal, high_bit, low_bit);
704  currBits_thisByte = bits(currDstVal, high_bit, low_bit);
705  newBits = ((byte == sel) ? origBits_thisByte :
706  ((preserve) ? currBits_thisByte :
707  (((byte > sel) && signExt) ? 0xff : 0)));
708  retVal = insertBits(retVal, high_bit, low_bit, newBits);
709  }
710  } else if (sel < SDWA_DWORD) { // we are selecting 1 word
711  low_bit = 0;
712  high_bit = low_bit + Gcn3ISA::MSB_PER_WORD;
713  // if we sign extended depends on upper-most bit of word 0
714  signExt = (signExt &&
715  (bits(currDstVal, high_bit, low_bit) & 0x8000));
716 
717  for (int word = 0; word < 2; ++word) {
718  low_bit = word * Gcn3ISA::BITS_PER_WORD;
719  high_bit = low_bit + Gcn3ISA::MSB_PER_WORD;
720  /*
721  Options:
722  1. word == sel & 1: we are keeping all bits in this word
723  2. preserve is set: keep this word as is because the
724  output preserve flag is set
725  3. word > (sel & 1) && signExt: we're sign extending and
726  this word is one of the words we need to sign extend
727  */
728  origBits_thisWord = bits(origDstVal, high_bit, low_bit);
729  currBits_thisWord = bits(currDstVal, high_bit, low_bit);
730  newBits = ((word == (sel & 0x1)) ? origBits_thisWord :
731  ((preserve) ? currBits_thisWord :
732  (((word > (sel & 0x1)) && signExt) ? 0xffff : 0)));
733  retVal = insertBits(retVal, high_bit, low_bit, newBits);
734  }
735  } else {
736  assert(sel != SDWA_DWORD); // should have returned earlier
737  panic("Unimplemented SDWA select operation: %d\n", sel);
738  }
739 
740  return retVal;
741  }
742 
743 
765  template<typename T>
766  void sdwaInstDstImpl(T & dstOper, T & origDstOper, const bool clamp,
767  const SDWASelVals sel,
768  const SDWADstVals unusedBits_format)
769  {
770  // iterate over all lanes, setting appropriate, selected value
771  for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
772  dstOper[lane] = sdwaInstDstImpl_helper(dstOper[lane],
773  origDstOper[lane], clamp,
774  sel, unusedBits_format);
775  }
776  }
777 
778 
786  template<typename T>
787  void processSDWA_src_helper(T & currSrc, T & origCurrSrc,
788  const SDWASelVals src_sel,
789  const bool src_signExt, const bool src_abs,
790  const bool src_neg)
791  {
799  if (src_neg) {
800  currSrc.negModifier();
801  }
802 
803  if (src_abs) {
804  currSrc.absModifier();
805  }
806 
810  sdwaInstSrcImpl(currSrc, origCurrSrc, src_sel, src_signExt);
811  }
812 
813 
821  template<typename T>
822  void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0)
823  {
824  // local variables
825  const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
826  const bool src0_signExt = sdwaInst.SRC0_SEXT;
827  const bool src0_neg = sdwaInst.SRC0_NEG;
828  const bool src0_abs = sdwaInst.SRC0_ABS;
829 
830  // NOTE: difference between VOP1 and VOP2/VOPC is that there is no src1
831  // operand. So ensure that SRC1 fields are not set, then call helper
832  // function only on src0.
833  assert(!sdwaInst.SRC1_SEXT);
834  assert(!sdwaInst.SRC1_NEG);
835  assert(!sdwaInst.SRC1_ABS);
836 
837  processSDWA_src_helper(src0, origSrc0, src0_sel, src0_signExt,
838  src0_abs, src0_neg);
839  }
840 
841 
849  template<typename T>
850  void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0,
851  T & src1, T & origSrc1)
852  {
853  // local variables
854  const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
855  const bool src0_signExt = sdwaInst.SRC0_SEXT;
856  const bool src0_neg = sdwaInst.SRC0_NEG;
857  const bool src0_abs = sdwaInst.SRC0_ABS;
858  const SDWASelVals src1_sel = (SDWASelVals)sdwaInst.SRC1_SEL;
859  const bool src1_signExt = sdwaInst.SRC1_SEXT;
860  const bool src1_neg = sdwaInst.SRC1_NEG;
861  const bool src1_abs = sdwaInst.SRC1_ABS;
862 
863  processSDWA_src_helper(src0, origSrc0, src0_sel, src0_signExt,
864  src0_abs, src0_neg);
865  processSDWA_src_helper(src1, origSrc1, src1_sel, src1_signExt,
866  src1_abs, src1_neg);
867  }
868 
869 
877  template<typename T>
878  void processSDWA_dst(InFmt_VOP_SDWA sdwaInst, T & dst, T & origDst)
879  {
880  // local variables
881  const SDWADstVals dst_unusedBits_format =
882  (SDWADstVals)sdwaInst.DST_UNUSED;
883  const SDWASelVals dst_sel = (SDWASelVals)sdwaInst.DST_SEL;
884  const bool clamp = sdwaInst.CLAMP;
885 
890  sdwaInstDstImpl(dst, origDst, clamp, dst_sel, dst_unusedBits_format);
891  }
892 } // namespace Gcn3ISA
893 
894 #endif // __ARCH_GCN3_INSTS_INST_UTIL_HH__
Gcn3ISA::NumVecElemPerVecReg
const int NumVecElemPerVecReg(64)
Gcn3ISA::InFmt_VOP_SDWA::DST_SEL
unsigned int DST_SEL
Definition: gpu_decoder.hh:1594
insertBits
constexpr T insertBits(T val, unsigned first, unsigned last, B bit_val)
Returns val with bits first to last set to the LSBs of bit_val.
Definition: bitfield.hh:143
LL
#define LL(N)
int64_t constant
Definition: types.hh:48
SQ_DPP_ROW_SL1
@ SQ_DPP_ROW_SL1
Definition: inst_util.hh:66
sc_dt::word
unsigned int word
Definition: scfx_mant.hh:96
Gcn3ISA::sdwaInstSrcImpl
void sdwaInstSrcImpl(T &currOper, T &origCurrOper, const SDWASelVals sel, const bool signExt)
sdwaInstSrcImpl is a helper function that selects the appropriate bits/bytes for each lane of the inp...
Definition: inst_util.hh:646
SDWA_UNUSED_SEXT
@ SDWA_UNUSED_SEXT
Definition: inst_util.hh:57
popCount
constexpr int popCount(uint64_t val)
Returns the number of set ones in the provided value.
Definition: bitfield.hh:314
Gcn3ISA::InFmt_VOP_SDWA::CLAMP
unsigned int CLAMP
Definition: gpu_decoder.hh:1596
Gcn3ISA::median
T median(T val_0, T val_1, T val_2)
Definition: inst_util.hh:245
SQ_DPP_ROW_SL15
@ SQ_DPP_ROW_SL15
Definition: inst_util.hh:67
Gcn3ISA::ScalarRegI64
int64_t ScalarRegI64
Definition: registers.hh:156
ArmISA::i
Bitfield< 7 > i
Definition: miscregs_types.hh:63
ArmISA::sel
sel
Definition: miscregs_types.hh:644
Gcn3ISA::BITS_PER_BYTE
const int BITS_PER_BYTE
Definition: registers.hh:142
SDWA_BYTE_2
@ SDWA_BYTE_2
Definition: inst_util.hh:46
Gcn3ISA::countZeroBits
ScalarRegI32 countZeroBits(T val)
Definition: inst_util.hh:119
Gcn3ISA::InFmt_VOP_DPP::BANK_MASK
unsigned int BANK_MASK
Definition: gpu_decoder.hh:1588
Gcn3ISA::findFirstZero
ScalarRegI32 findFirstZero(T val)
Definition: inst_util.hh:129
Gcn3ISA::ScalarRegU64
uint64_t ScalarRegU64
Definition: registers.hh:155
Gcn3ISA::InFmt_VOP_SDWA::SRC1_ABS
unsigned int SRC1_ABS
Definition: gpu_decoder.hh:1606
X86ISA::count
count
Definition: misc.hh:703
Gcn3ISA::InFmt_VOP_DPP::SRC0_ABS
unsigned int SRC0_ABS
Definition: gpu_decoder.hh:1585
Gcn3ISA::firstOppositeSignBit
ScalarRegI32 firstOppositeSignBit(ScalarRegI32 val)
Definition: inst_util.hh:172
registers.hh
Gcn3ISA::InFmt_VOP_DPP::SRC1_ABS
unsigned int SRC1_ABS
Definition: gpu_decoder.hh:1587
Gcn3ISA::roundNearestEven
T roundNearestEven(T val)
Definition: inst_util.hh:257
SQ_DPP_ROW_RR1
@ SQ_DPP_ROW_RR1
Definition: inst_util.hh:70
Gcn3ISA::VecElemU32
uint32_t VecElemU32
Definition: registers.hh:164
Gcn3ISA::findFirstOne
ScalarRegI32 findFirstOne(T val)
Definition: inst_util.hh:140
SQ_DPP_ROW_MIRROR
@ SQ_DPP_ROW_MIRROR
Definition: inst_util.hh:76
Gcn3ISA::wholeQuadMode
T wholeQuadMode(T val)
Definition: inst_util.hh:88
Gcn3ISA::InFmt_VOP_SDWA::SRC1_SEXT
unsigned int SRC1_SEXT
Definition: gpu_decoder.hh:1604
SDWASelVals
SDWASelVals
Definition: inst_util.hh:42
Gcn3ISA::sdwaInstDstImpl
void sdwaInstDstImpl(T &dstOper, T &origDstOper, const bool clamp, const SDWASelVals sel, const SDWADstVals unusedBits_format)
sdwaInstDestImpl is a helper function that selects the appropriate bits/bytes for the inputted dest o...
Definition: inst_util.hh:766
Gcn3ISA::dppInstImpl
int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum, int rowOffset, bool &outOfBounds)
dppInstImpl is a helper function that performs the inputted operation on the inputted vector register...
Definition: inst_util.hh:317
SDWA_BYTE_3
@ SDWA_BYTE_3
Definition: inst_util.hh:47
Gcn3ISA::InFmt_VOP_SDWA::DST_UNUSED
unsigned int DST_UNUSED
Definition: gpu_decoder.hh:1595
SQ_DPP_WF_RR1
@ SQ_DPP_WF_RR1
Definition: inst_util.hh:75
SqDPPVals
SqDPPVals
Definition: inst_util.hh:62
Gcn3ISA::ScalarRegI32
int32_t ScalarRegI32
Definition: registers.hh:153
SQ_DPP_ROW_SR15
@ SQ_DPP_ROW_SR15
Definition: inst_util.hh:69
Gcn3ISA
classes that represent vector/scalar operands in GCN3 ISA.
Definition: decoder.cc:41
Gcn3ISA::InFmt_VOP_DPP::ROW_MASK
unsigned int ROW_MASK
Definition: gpu_decoder.hh:1589
Gcn3ISA::quadMask
T quadMask(T val)
Definition: inst_util.hh:102
Gcn3ISA::findFirstOneMsb
ScalarRegI32 findFirstOneMsb(T val)
Definition: inst_util.hh:151
Gcn3ISA::MSB_PER_WORD
const int MSB_PER_WORD
Definition: registers.hh:145
SQ_DPP_QUAD_PERM_MAX
@ SQ_DPP_QUAD_PERM_MAX
Definition: inst_util.hh:64
SQ_DPP_WF_RL1
@ SQ_DPP_WF_RL1
Definition: inst_util.hh:73
Gcn3ISA::InFmt_VOP_DPP::SRC1_NEG
unsigned int SRC1_NEG
Definition: gpu_decoder.hh:1586
Gcn3ISA::VecElemI64
int64_t VecElemI64
Definition: registers.hh:168
Gcn3ISA::InFmt_VOP_DPP::BOUND_CTRL
unsigned int BOUND_CTRL
Definition: gpu_decoder.hh:1583
SQ_DPP_WF_SR1
@ SQ_DPP_WF_SR1
Definition: inst_util.hh:74
X86ISA::val
Bitfield< 63 > val
Definition: misc.hh:769
SQ_DPP_ROW_HALF_MIRROR
@ SQ_DPP_ROW_HALF_MIRROR
Definition: inst_util.hh:77
ROW_SIZE
static const int ROW_SIZE
Definition: inst_util.hh:81
Gcn3ISA::countZeroBitsMsb
ScalarRegI32 countZeroBitsMsb(T val)
Definition: inst_util.hh:162
Gcn3ISA::InFmt_VOP_SDWA::SRC0_SEL
unsigned int SRC0_SEL
Definition: gpu_decoder.hh:1598
Gcn3ISA::InFmt_VOP_DPP::DPP_CTRL
unsigned int DPP_CTRL
Definition: gpu_decoder.hh:1581
Gcn3ISA::InFmt_VOP_SDWA::SRC0_SEXT
unsigned int SRC0_SEXT
Definition: gpu_decoder.hh:1599
Gcn3ISA::BITS_PER_WORD
const int BITS_PER_WORD
Definition: registers.hh:143
SDWA_UNUSED_PAD
@ SDWA_UNUSED_PAD
Definition: inst_util.hh:56
Gcn3ISA::muladd
VecElemU32 muladd(VecElemU64 &dst, VecElemU32 val_0, VecElemU32 val_1, VecElemU64 val_2)
Definition: inst_util.hh:270
SDWA_BYTE_0
@ SDWA_BYTE_0
Definition: inst_util.hh:44
findMsbSet
constexpr int findMsbSet(uint64_t val)
Returns the bit position of the MSB that is set in the input.
Definition: bitfield.hh:240
SQ_DPP_WF_SL1
@ SQ_DPP_WF_SL1
Definition: inst_util.hh:72
SDWA_UNUSED_PRESERVE
@ SDWA_UNUSED_PRESERVE
Definition: inst_util.hh:58
Gcn3ISA::processSDWA_src
void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T &src0, T &origSrc0)
processSDWA_src is a helper function for implementing sub d-word addressing instructions for the src ...
Definition: inst_util.hh:822
Gcn3ISA::sdwaInstSrcImpl_helper
T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal, const SDWASelVals sel, const bool signExt)
sdwaInstSrcImpl_helper contains the per-lane code for selecting the appropriate bytes/words of the la...
Definition: inst_util.hh:553
Gcn3ISA::InFmt_VOP_SDWA::SRC1_NEG
unsigned int SRC1_NEG
Definition: gpu_decoder.hh:1605
Gcn3ISA::InFmt_VOP_SDWA::SRC0_ABS
unsigned int SRC0_ABS
Definition: gpu_decoder.hh:1601
SQ_DPP_ROW_SR1
@ SQ_DPP_ROW_SR1
Definition: inst_util.hh:68
NUM_BANKS
static const int NUM_BANKS
Definition: inst_util.hh:82
GPUDynInstPtr
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition: misc.hh:48
Gcn3ISA::sdwaInstDstImpl_helper
T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal, const bool clamp, const SDWASelVals sel, const SDWADstVals unusedBits_format)
sdwaInstDstImpl_helper contains the per-lane code for selecting the appropriate bytes/words of the la...
Definition: inst_util.hh:665
Gcn3ISA::ScalarRegU32
uint32_t ScalarRegU32
Definition: registers.hh:152
bits
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
Definition: bitfield.hh:73
Gcn3ISA::processSDWA_dst
void processSDWA_dst(InFmt_VOP_SDWA sdwaInst, T &dst, T &origDst)
processSDWA_dst is a helper function for implementing sub d-word addressing instructions for the dst ...
Definition: inst_util.hh:878
findLsbSet
constexpr int findLsbSet(uint64_t val)
Returns the bit position of the LSB that is set in the input.
Definition: bitfield.hh:276
Gcn3ISA::VecElemI32
int32_t VecElemI32
Definition: registers.hh:165
SDWA_BYTE_1
@ SDWA_BYTE_1
Definition: inst_util.hh:45
SDWA_DWORD
@ SDWA_DWORD
Definition: inst_util.hh:50
Gcn3ISA::InFmt_VOP_SDWA::SRC1_SEL
unsigned int SRC1_SEL
Definition: gpu_decoder.hh:1603
SQ_DPP_RESERVED
@ SQ_DPP_RESERVED
Definition: inst_util.hh:65
Gcn3ISA::processSDWA_src_helper
void processSDWA_src_helper(T &currSrc, T &origCurrSrc, const SDWASelVals src_sel, const bool src_signExt, const bool src_abs, const bool src_neg)
processSDWA_srcHelper is a helper function for implementing sub d-word addressing instructions for th...
Definition: inst_util.hh:787
Gcn3ISA::VecElemU64
uint64_t VecElemU64
Definition: registers.hh:167
Gcn3ISA::MSB_PER_BYTE
const int MSB_PER_BYTE
Definition: registers.hh:144
SDWA_WORD_0
@ SDWA_WORD_0
Definition: inst_util.hh:48
fatal_if
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
Definition: logging.hh:219
Gcn3ISA::InFmt_VOP_DPP
Definition: gpu_decoder.hh:1579
SDWA_WORD_1
@ SDWA_WORD_1
Definition: inst_util.hh:49
Gcn3ISA::InFmt_VOP_DPP::SRC0_NEG
unsigned int SRC0_NEG
Definition: gpu_decoder.hh:1584
SQ_DPP_ROW_RR15
@ SQ_DPP_ROW_RR15
Definition: inst_util.hh:71
Gcn3ISA::InFmt_VOP_SDWA
Definition: gpu_decoder.hh:1592
Gcn3ISA::InFmt_VOP_SDWA::SRC0_NEG
unsigned int SRC0_NEG
Definition: gpu_decoder.hh:1600
ULL
#define ULL(N)
uint64_t constant
Definition: types.hh:46
SQ_DPP_ROW_BCAST31
@ SQ_DPP_ROW_BCAST31
Definition: inst_util.hh:79
ArmISA::mask
Bitfield< 28, 24 > mask
Definition: miscregs_types.hh:711
Gcn3ISA::processDPP
void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst, T &src0)
processDPP is a helper function for implementing Data Parallel Primitive instructions.
Definition: inst_util.hh:414
SDWADstVals
SDWADstVals
Definition: inst_util.hh:54
panic
#define panic(...)
This implements a cprintf based panic() function.
Definition: logging.hh:171
SQ_DPP_ROW_BCAST15
@ SQ_DPP_ROW_BCAST15
Definition: inst_util.hh:78

Generated on Tue Jun 22 2021 15:28:21 for gem5 by doxygen 1.8.17