gem5  v20.1.0.0
inst_util.hh
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * For use for simulation and test purposes only
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright notice,
11  * this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  * this list of conditions and the following disclaimer in the documentation
15  * and/or other materials provided with the distribution.
16  *
17  * 3. Neither the name of the copyright holder nor the names of its
18  * contributors may be used to endorse or promote products derived from this
19  * software without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  *
33  * Authors: Anthony Gutierrez
34  */
35 
36 #ifndef __ARCH_GCN3_INSTS_INST_UTIL_HH__
37 #define __ARCH_GCN3_INSTS_INST_UTIL_HH__
38 
39 #include <cmath>
40 
41 #include "arch/gcn3/registers.hh"
42 
// Source/destination select encodings for SDWA operations. Each
// enumerator names the slice of a 32-bit datum that is selected.
enum SDWASelVals : int
{
    SDWA_BYTE_0 = 0, // byte 0: data[7:0]
    SDWA_BYTE_1,     // byte 1: data[15:8]
    SDWA_BYTE_2,     // byte 2: data[23:16]
    SDWA_BYTE_3,     // byte 3: data[31:24]
    SDWA_WORD_0,     // word 0: data[15:0]
    SDWA_WORD_1,     // word 1: data[31:16]
    SDWA_DWORD       // whole dword: data[31:0]
};
54 
// Encodings that say how the destination bits NOT covered by the SDWA
// destination select are produced.
enum SDWADstVals : int
{
    SDWA_UNUSED_PAD = 0,  // fill every unused bit with 0
    SDWA_UNUSED_SEXT,     // sign-extend into upper bits; lower bits get 0
    SDWA_UNUSED_PRESERVE  // keep the unused bits as they are
};
62 
// DPP_CTRL encodings for DPP (data parallel primitive) operations.
// Values 0x00-0xFF are quad permutes: each 2-bit field selects the source
// position inside a group of four lanes. Ranges such as ROW_SL1..ROW_SL15
// are contiguous; only their first and last members are named.
enum SqDPPVals : int
{
    SQ_DPP_QUAD_PERM_MAX = 0xFF,    // last quad-permute encoding
    SQ_DPP_RESERVED = 0x100,        // reserved, must not be used
    SQ_DPP_ROW_SL1 = 0x101,         // row shift left by 1 ...
    SQ_DPP_ROW_SL15 = 0x10F,        // ... up to 15
    SQ_DPP_ROW_SR1 = 0x111,         // row shift right by 1 ...
    SQ_DPP_ROW_SR15 = 0x11F,        // ... up to 15
    SQ_DPP_ROW_RR1 = 0x121,         // row rotate right by 1 ...
    SQ_DPP_ROW_RR15 = 0x12F,        // ... up to 15
    SQ_DPP_WF_SL1 = 0x130,          // wavefront shift left by 1
    SQ_DPP_WF_RL1 = 0x134,          // wavefront rotate left by 1
    SQ_DPP_WF_SR1 = 0x138,          // wavefront shift right by 1
    SQ_DPP_WF_RR1 = 0x13C,          // wavefront rotate right by 1
    SQ_DPP_ROW_MIRROR = 0x140,      // mirror lanes within a row
    SQ_DPP_ROW_HALF_MIRROR = 0x141, // mirror lanes within a half row
    SQ_DPP_ROW_BCAST15 = 0x142,     // broadcast lane 15 of each row
    SQ_DPP_ROW_BCAST31 = 0x143      // broadcast lane 31
};
// Number of lanes that make up one DPP row; lane / ROW_SIZE is the row
// index and lane % ROW_SIZE the offset inside the row.
static constexpr int ROW_SIZE = 16;
// Divisor that turns a row offset into the bank index tested against the
// DPP bank mask (row offset / NUM_BANKS).
static constexpr int NUM_BANKS = 4;
85 
86 namespace Gcn3ISA
87 {
/**
 * Expand a mask to whole-quad granularity: every aligned 4-bit group of
 * val that contains at least one set bit is returned with all four bits
 * set; groups that are entirely zero stay zero.
 *
 * @param val input bit mask.
 * @return the quad-expanded mask.
 */
template<typename T>
inline T
wholeQuadMode(T val)
{
    T wqm = 0;

    // Slide a nibble-wide window across val; the loop stops once the
    // window has been shifted out of T.
    for (T mask = 0xF; mask != 0; mask <<= 4) {
        if ((val & mask) != 0) {
            wqm |= mask;
        }
    }

    return wqm;
}
101 
/**
 * Reduce a mask to one bit per quad: bit i of the result is set when the
 * i-th aligned 4-bit group of val contains at least one set bit.
 *
 * @param val input bit mask.
 * @return the per-quad summary mask.
 */
template<typename T>
inline T
quadMask(T val)
{
    T qmsk = 0;
    T qbit = 0x1;

    // mask selects the current nibble, qbit is the result bit that
    // stands for it; both advance together.
    for (T mask = 0xF; mask != 0; mask <<= 4, qbit <<= 1) {
        if (val & mask) {
            qmsk |= qbit;
        }
    }

    return qmsk;
}
118 
119  template<typename T>
120  inline ScalarRegI32
122  {
123  ScalarRegI32 num_zeros
124  = std::numeric_limits<T>::digits - popCount(val);
125 
126  return num_zeros;
127  }
128 
129  template<typename T>
130  inline ScalarRegI32
132  {
133  if (val == ~T(0)) {
134  return -1;
135  }
136 
137  return findLsbSet(~val);
138  }
139 
140  template<typename T>
141  inline ScalarRegI32
143  {
144  if (!val) {
145  return -1;
146  }
147 
148  return findLsbSet(val);
149  }
150 
151  template<typename T>
152  inline ScalarRegI32
154  {
155  if (!val) {
156  return -1;
157  }
158 
159  return findMsbSet(val);
160  }
161 
162  template<typename T>
163  inline ScalarRegI32
165  {
166  if (!val) {
167  return -1;
168  }
169 
170  return std::numeric_limits<T>::digits - 1 - findMsbSet(val);
171  }
172 
173  inline ScalarRegI32
175  {
176  bool found(false);
177  bool sign_bit = (val & 0x80000000) != 0;
178  ScalarRegU32 tmp_val(0);
179  int count(0);
180 
181  if (!val || val == -1) {
182  return -1;
183  }
184 
185  for (int i = 0; i < std::numeric_limits<ScalarRegU32>::digits; ++i) {
186  tmp_val = val & (0x80000000 >> i);
187 
188  if (!sign_bit) {
189  if (tmp_val) {
190  found = true;
191  break;
192  }
193  } else {
194  if (!tmp_val) {
195  found = true;
196  break;
197  }
198  }
199  ++count;
200  }
201 
202  if (found) {
203  return count;
204  } else {
205  return -1;
206  }
207  }
208 
209  inline ScalarRegI32
211  {
212  bool found(false);
213  bool sign_bit = (val & 0x8000000000000000ULL) != 0;
214  ScalarRegU64 tmp_val(0);
215  int count(0);
216 
217  if (!val || val == -1) {
218  return -1;
219  }
220 
221  for (int i = 0; i < std::numeric_limits<ScalarRegU64>::digits; ++i) {
222  tmp_val = val & (0x8000000000000000ULL >> i);
223 
224  if (!sign_bit) {
225  if (tmp_val) {
226  found = true;
227  break;
228  }
229  } else {
230  if (!tmp_val) {
231  found = true;
232  break;
233  }
234  }
235  ++count;
236  }
237 
238  if (found) {
239  return count;
240  } else {
241  return -1;
242  }
243  }
244 
/**
 * Return the middle value of three inputs.
 *
 * Floating-point types go through std::fmin/std::fmax, every other type
 * through std::min/std::max.
 */
template<typename T>
inline T
median(T val_0, T val_1, T val_2)
{
    if (!std::is_floating_point<T>::value) {
        const T smaller = std::min(val_0, val_1);
        const T larger = std::max(val_0, val_1);
        // The median is the larger of: the smaller of the first pair, and
        // whichever of {larger, val_2} is smaller.
        return std::max(smaller, std::min(larger, val_2));
    }

    const auto smaller = std::fmin(val_0, val_1);
    const auto larger = std::fmax(val_0, val_1);
    return std::fmax(smaller, std::fmin(larger, val_2));
}
257 
/**
 * Round val to the nearest integral value, resolving exact .5 ties
 * towards the even neighbour.
 *
 * The decision is made on val - floor(val) rather than on
 * floor(val + 0.5), so inputs just below a tie (0.49999999999999994) and
 * large odd integers are not pushed to the wrong neighbour, and no
 * conversion to int is involved (which would be undefined for values
 * outside the int range, NaN and infinity). NaN and infinities are
 * returned unchanged.
 *
 * @param val value to round.
 * @return the rounded value, as type T.
 */
template <typename T>
inline T roundNearestEven(T val)
{
    const T lower = std::floor(val);
    const T frac = val - lower;

    if (frac < T(0.5)) {
        return lower;
    }

    if (frac > T(0.5)) {
        return lower + T(1);
    }

    // Exact tie (or a non-finite input, for which both comparisons above
    // are false): keep the lower neighbour only when it is even.
    return (std::fmod(lower, T(2)) == 0) ? lower : lower + T(1);
}
270 
271  inline VecElemU32
273  VecElemU64 val_2)
274  {
275  __uint128_t u0 = (__uint128_t)val_0;
276  __uint128_t u1 = (__uint128_t)val_1;
277  __uint128_t u2 = (__uint128_t)val_2;
278  __uint128_t result = u0 * u1 + u2;
279 
280  dst = (VecElemU64)result;
281 
282  return (VecElemU32)(result >> 64) ? 1 : 0;
283  }
284 
285  inline VecElemU32
287  VecElemI64 val_2)
288  {
289  __int128_t u0 = (__int128_t)val_0;
290  __int128_t u1 = (__int128_t)val_1;
291  __int128_t u2 = (__int128_t)val_2;
292  __int128_t result = u0 * u1 + u2;
293 
294  dst = (VecElemI64)result;
295 
296  return (VecElemU32)(result >> 64) ? 1 : 0;
297  }
298 
319  int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum,
320  int rowOffset, bool & outOfBounds)
321  {
322  // local variables
323  // newLane will be the same as the input lane unless swizzling happens
324  int newLane = currLane;
325  // for shift/rotate permutations; positive values are LEFT rotates
326  int count = 1;
327  int localRowOffset = rowOffset;
328  int localRowNum = rowNum;
329 
330  if (dppCtrl <= SQ_DPP_QUAD_PERM_MAX) { // DPP_QUAD_PERM{00:FF}
331  int quadBase = (currLane & ~(3));
332  int quadPix = (currLane & 3);
333  quadPix = ((dppCtrl >> (2 * quadPix)) & 3);
334  newLane = (quadBase | quadPix);
335  } else if (dppCtrl == SQ_DPP_RESERVED) {
336  panic("ERROR: instruction using reserved DPP_CTRL value\n");
337  } else if ((dppCtrl >= SQ_DPP_ROW_SL1) &&
338  (dppCtrl <= SQ_DPP_ROW_SL15)) { // DPP_ROW_SL{1:15}
339  count -= (dppCtrl - SQ_DPP_ROW_SL1 + 1);
340  if ((localRowOffset + count >= 0) &&
341  (localRowOffset + count < ROW_SIZE)) {
342  localRowOffset += count;
343  newLane = (rowNum | localRowOffset);
344  } else {
345  outOfBounds = true;
346  }
347  } else if ((dppCtrl >= SQ_DPP_ROW_SR1) &&
348  (dppCtrl <= SQ_DPP_ROW_SR15)) { // DPP_ROW_SR{1:15}
349  count -= (dppCtrl - SQ_DPP_ROW_SR1 + 1);
350  if ((localRowOffset + count >= 0) &&
351  (localRowOffset + count < ROW_SIZE)) {
352  localRowOffset += count;
353  newLane = (rowNum | localRowOffset);
354  } else {
355  outOfBounds = true;
356  }
357  } else if ((dppCtrl >= SQ_DPP_ROW_RR1) &&
358  (dppCtrl <= SQ_DPP_ROW_RR15)) { // DPP_ROW_RR{1:15}
359  count -= (dppCtrl - SQ_DPP_ROW_RR1 + 1);
360  localRowOffset = (localRowOffset + count + ROW_SIZE) % ROW_SIZE;
361  newLane = (rowNum | localRowOffset);
362  } else if (dppCtrl == SQ_DPP_WF_SL1) { // DPP_WF_SL1
363  count = 1;
364  if ((currLane >= 0) && (currLane < NumVecElemPerVecReg)) {
365  newLane += count;
366  } else {
367  outOfBounds = true;
368  }
369  } else if (dppCtrl == SQ_DPP_WF_RL1) { // DPP_WF_RL1
370  count = 1;
371  newLane = (currLane + count + NumVecElemPerVecReg) %
373  } else if (dppCtrl == SQ_DPP_WF_SR1) { // DPP_WF_SR1
374  count = -1;
375  int currVal = (currLane + count);
376  if ((currVal >= 0) && (currVal < NumVecElemPerVecReg)) {
377  newLane += count;
378  } else {
379  outOfBounds = true;
380  }
381  } else if (dppCtrl == SQ_DPP_WF_RR1) { // DPP_WF_RR1
382  count = -1;
383  newLane = (currLane + count + NumVecElemPerVecReg) %
385  } else if (dppCtrl == SQ_DPP_ROW_MIRROR) { // DPP_ROW_MIRROR
386  localRowOffset = (15 - localRowOffset);
387  newLane = (rowNum | localRowOffset);
388  } else if (dppCtrl == SQ_DPP_ROW_HALF_MIRROR) { // DPP_ROW_HALF_MIRROR
389  localRowNum = (currLane & -0x7);
390  localRowOffset = (currLane & 0x7);
391  localRowOffset = (7 - localRowNum);
392  newLane = (localRowNum | localRowOffset);
393  } else if (dppCtrl == SQ_DPP_ROW_BCAST15) { // DPP_ROW_BCAST15
394  count = 15;
395  if (currLane > count) {
396  newLane = (currLane & ~count) - 1;
397  }
398  } else if (dppCtrl == SQ_DPP_ROW_BCAST31) { // DPP_ROW_BCAST31
399  count = 31;
400  if (currLane > count) {
401  newLane = (currLane & ~count) - 1;
402  }
403  } else {
404  panic("Unimplemented DPP control operation: %d\n", dppCtrl);
405  }
406 
407  return newLane;
408  }
409 
415  template<typename T>
416  void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst,
417  T & src0)
418  {
419  // local variables
420  SqDPPVals dppCtrl = (SqDPPVals)dppInst.DPP_CTRL;
421  int boundCtrl = dppInst.BOUND_CTRL;
422  int bankMask = dppInst.BANK_MASK;
423  int rowMask = dppInst.ROW_MASK;
424  // row, bank info to be calculated per lane
425  int rowNum = 0, bankNum = 0, rowOffset = 0;
426  // outLane will be the same as the input lane unless swizzling happens
427  int outLane = 0;
428  bool laneDisabled = false;
429  // flags used for determining if a lane should be written to/reset/etc.
430  bool outOfBounds = false, zeroSrc = false;
431  long long threadValid = 0;
432 
439  if (dppInst.SRC0_NEG) {
440  src0.negModifier();
441  }
442 
443  if (dppInst.SRC0_ABS) {
444  src0.absModifier();
445  }
446 
447  // iterate over all register lanes, performing steps 2-4
448  for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
449  threadValid = (0x1LL << lane);
455  rowNum = (lane / ROW_SIZE);
456  rowOffset = (lane % ROW_SIZE);
457  bankNum = (rowOffset / NUM_BANKS);
458 
459  if (((rowMask & (0x1 << rowNum)) == 0) /* row mask */ ||
460  ((bankMask & (0x1 << bankNum)) == 0) /* bank mask */) {
461  laneDisabled = true;
462  continue;
463  }
464 
481  if (!laneDisabled) {
482  outLane = dppInstImpl(dppCtrl, lane, rowNum, rowOffset,
483  outOfBounds);
484  }
485 
491  if (laneDisabled) {
492  threadValid = 0;
493  } else if (outOfBounds) {
494  if (boundCtrl == 1) {
495  zeroSrc = true;
496  } else {
497  threadValid = 0;
498  }
499  } else if (!gpuDynInst->exec_mask[lane]) {
500  if (boundCtrl == 1) {
501  zeroSrc = true;
502  } else {
503  threadValid = 0;
504  }
505  }
506 
507  if (threadValid != 0 && !outOfBounds && !zeroSrc) {
508  assert(!laneDisabled);
509  src0[outLane] = src0[lane];
510  } else if (zeroSrc) {
511  src0[lane] = 0;
512  }
513 
514  // reset for next iteration
515  laneDisabled = false;
516  }
517  }
518 
524  template<typename T>
525  void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst,
526  T & src0, T & src1)
527  {
534  if (dppInst.SRC1_NEG) {
535  src1.negModifier();
536  }
537 
538  if (dppInst.SRC1_ABS) {
539  src1.absModifier();
540  }
541 
542  // Since only difference for VOP1 and VOP2/VOPC instructions is SRC1,
543  // which is only used for negation/absolute value, call other version
544  // to do everything else.
545  processDPP(gpuDynInst, dppInst, src0);
546  }
547 
554  template<typename T>
555  T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal,
556  const SDWASelVals sel, const bool signExt)
557  {
558  // local variables
559  int low_bit = 0, high_bit = 0;
560  bool signExt_local = signExt;
561  T retVal = 0;
562 
563  // if we're preserving all of the bits, then we can immediately return
564  if (sel == SDWA_DWORD) {
565  return currOperVal;
566  }
567 
568  if (sel < SDWA_WORD_0) { // we are selecting 1 byte
569  /*
570  Process byte 0 first. This code eiter selects the original bits
571  of byte 0, or makes the bits of the selected byte be byte 0 (and
572  next either sign extends or zero's out upper bits).
573  */
574  low_bit = (sel * Gcn3ISA::BITS_PER_BYTE);
575  high_bit = low_bit + Gcn3ISA::MSB_PER_BYTE;
576  retVal = bits(currOperVal, high_bit, low_bit);
577 
578  // make sure update propagated, since used next
580  bits(origOperVal, high_bit),
581  "ERROR: SDWA byte update not propagated: retVal: %d, "
582  "orig: %d\n", bits(retVal, Gcn3ISA::MSB_PER_BYTE),
583  bits(origOperVal, high_bit));
584  // sign extended value depends on upper-most bit of the new byte 0
585  signExt_local = (signExt &&
586  (bits(retVal, Gcn3ISA::MSB_PER_BYTE, 0) & 0x80));
587 
588  // process all other bytes -- if sign extending, make them 1, else
589  // all 0's so leave as is
590  if (signExt_local) {
591  retVal = (uint32_t)sext<Gcn3ISA::MSB_PER_BYTE>(retVal);
592  }
593  } else if (sel < SDWA_DWORD) { // we are selecting 1 word
594  /*
595  Process word 0 first. This code eiter selects the original bits
596  of word 0, or makes the bits of the selected word be word 0 (and
597  next either sign extends or zero's out upper bits).
598  */
599  low_bit = (sel & 1) * Gcn3ISA::BITS_PER_WORD;
600  high_bit = low_bit + Gcn3ISA::MSB_PER_WORD;
601  retVal = bits(currOperVal, high_bit, low_bit);
602 
603  // make sure update propagated, since used next
605  bits(origOperVal, high_bit),
606  "ERROR: SDWA word update not propagated: retVal: %d, "
607  "orig: %d\n",
608  bits(retVal, Gcn3ISA::MSB_PER_WORD),
609  bits(origOperVal, high_bit));
610  // sign extended value depends on upper-most bit of the new word 0
611  signExt_local = (signExt &&
612  (bits(retVal, Gcn3ISA::MSB_PER_WORD, 0) &
613  0x8000));
614 
615  // process other word -- if sign extending, make them 1, else all
616  // 0's so leave as is
617  if (signExt_local) {
618  retVal = (uint32_t)sext<Gcn3ISA::MSB_PER_WORD>(retVal);
619  }
620  } else {
621  assert(sel != SDWA_DWORD); // should have returned earlier
622  panic("Unimplemented SDWA select operation: %d\n", sel);
623  }
624 
625  return retVal;
626  }
627 
628 
647  template<typename T>
648  void sdwaInstSrcImpl(T & currOper, T & origCurrOper,
649  const SDWASelVals sel, const bool signExt)
650  {
651  // iterate over all lanes, setting appropriate, selected value
652  for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
653  currOper[lane] = sdwaInstSrcImpl_helper(currOper[lane],
654  origCurrOper[lane], sel,
655  signExt);
656  }
657  }
658 
659 
666  template<typename T>
667  T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal,
668  const bool clamp, const SDWASelVals sel,
669  const SDWADstVals unusedBits_format)
670  {
671  // local variables
672  int low_bit = 0, high_bit = 0;
673  bool signExt = (unusedBits_format == SDWA_UNUSED_SEXT);
674  //bool pad = (unusedBits_format == SDWA_UNUSED_PAD);
675  bool preserve = (unusedBits_format == SDWA_UNUSED_PRESERVE);
676  T retVal = 0, origBits_thisByte = 0, currBits_thisByte = 0,
677  origBits_thisWord = 0, currBits_thisWord = 0, newBits = 0;
678 
679  // if we're preserving all of the bits, then we can immediately return
680  if (unusedBits_format == SDWA_UNUSED_PRESERVE) {
681  assert(sel == SDWA_DWORD);
682  return currDstVal;
683  } else if (sel == SDWA_DWORD) {
684  // NOTE: users may set the unused bits variable to anything in this
685  // scenario, because it will be ignored
686  return currDstVal;
687  }
688 
689  if (sel < SDWA_WORD_0) { // we are selecting 1 byte
690  // if we sign extended depends on upper-most bit of byte 0
691  signExt = (signExt &&
692  (bits(currDstVal, Gcn3ISA::MSB_PER_WORD, 0) & 0x80));
693 
694  for (int byte = 0; byte < 4; ++byte) {
695  low_bit = byte * Gcn3ISA::BITS_PER_BYTE;
696  high_bit = low_bit + Gcn3ISA::MSB_PER_BYTE;
697  /*
698  Options:
699  1. byte == sel: we are keeping all bits in this byte
700  2. preserve is set: keep this byte as is because the
701  output preserve flag is set
702  3. byte > sel && signExt: we're sign extending and
703  this byte is one of the bytes we need to sign extend
704  */
705  origBits_thisByte = bits(origDstVal, high_bit, low_bit);
706  currBits_thisByte = bits(currDstVal, high_bit, low_bit);
707  newBits = ((byte == sel) ? origBits_thisByte :
708  ((preserve) ? currBits_thisByte :
709  (((byte > sel) && signExt) ? 0xff : 0)));
710  retVal = insertBits(retVal, high_bit, low_bit, newBits);
711  }
712  } else if (sel < SDWA_DWORD) { // we are selecting 1 word
713  low_bit = 0;
714  high_bit = low_bit + Gcn3ISA::MSB_PER_WORD;
715  // if we sign extended depends on upper-most bit of word 0
716  signExt = (signExt &&
717  (bits(currDstVal, high_bit, low_bit) & 0x8000));
718 
719  for (int word = 0; word < 2; ++word) {
720  low_bit = word * Gcn3ISA::BITS_PER_WORD;
721  high_bit = low_bit + Gcn3ISA::MSB_PER_WORD;
722  /*
723  Options:
724  1. word == sel & 1: we are keeping all bits in this word
725  2. preserve is set: keep this word as is because the
726  output preserve flag is set
727  3. word > (sel & 1) && signExt: we're sign extending and
728  this word is one of the words we need to sign extend
729  */
730  origBits_thisWord = bits(origDstVal, high_bit, low_bit);
731  currBits_thisWord = bits(currDstVal, high_bit, low_bit);
732  newBits = ((word == (sel & 0x1)) ? origBits_thisWord :
733  ((preserve) ? currBits_thisWord :
734  (((word > (sel & 0x1)) && signExt) ? 0xffff : 0)));
735  retVal = insertBits(retVal, high_bit, low_bit, newBits);
736  }
737  } else {
738  assert(sel != SDWA_DWORD); // should have returned earlier
739  panic("Unimplemented SDWA select operation: %d\n", sel);
740  }
741 
742  return retVal;
743  }
744 
745 
767  template<typename T>
768  void sdwaInstDstImpl(T & dstOper, T & origDstOper, const bool clamp,
769  const SDWASelVals sel,
770  const SDWADstVals unusedBits_format)
771  {
772  // iterate over all lanes, setting appropriate, selected value
773  for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
774  dstOper[lane] = sdwaInstDstImpl_helper(dstOper[lane],
775  origDstOper[lane], clamp,
776  sel, unusedBits_format);
777  }
778  }
779 
780 
788  template<typename T>
789  void processSDWA_src_helper(T & currSrc, T & origCurrSrc,
790  const SDWASelVals src_sel,
791  const bool src_signExt, const bool src_abs,
792  const bool src_neg)
793  {
801  if (src_neg) {
802  currSrc.negModifier();
803  }
804 
805  if (src_abs) {
806  currSrc.absModifier();
807  }
808 
812  sdwaInstSrcImpl(currSrc, origCurrSrc, src_sel, src_signExt);
813  }
814 
815 
823  template<typename T>
824  void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0)
825  {
826  // local variables
827  const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
828  const bool src0_signExt = sdwaInst.SRC0_SEXT;
829  const bool src0_neg = sdwaInst.SRC0_NEG;
830  const bool src0_abs = sdwaInst.SRC0_ABS;
831 
832  // NOTE: difference between VOP1 and VOP2/VOPC is that there is no src1
833  // operand. So ensure that SRC1 fields are not set, then call helper
834  // function only on src0.
835  assert(!sdwaInst.SRC1_SEXT);
836  assert(!sdwaInst.SRC1_NEG);
837  assert(!sdwaInst.SRC1_ABS);
838 
839  processSDWA_src_helper(src0, origSrc0, src0_sel, src0_signExt,
840  src0_abs, src0_neg);
841  }
842 
843 
851  template<typename T>
852  void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0,
853  T & src1, T & origSrc1)
854  {
855  // local variables
856  const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
857  const bool src0_signExt = sdwaInst.SRC0_SEXT;
858  const bool src0_neg = sdwaInst.SRC0_NEG;
859  const bool src0_abs = sdwaInst.SRC0_ABS;
860  const SDWASelVals src1_sel = (SDWASelVals)sdwaInst.SRC1_SEL;
861  const bool src1_signExt = sdwaInst.SRC1_SEXT;
862  const bool src1_neg = sdwaInst.SRC1_NEG;
863  const bool src1_abs = sdwaInst.SRC1_ABS;
864 
865  processSDWA_src_helper(src0, origSrc0, src0_sel, src0_signExt,
866  src0_abs, src0_neg);
867  processSDWA_src_helper(src1, origSrc1, src1_sel, src1_signExt,
868  src1_abs, src1_neg);
869  }
870 
871 
879  template<typename T>
880  void processSDWA_dst(InFmt_VOP_SDWA sdwaInst, T & dst, T & origDst)
881  {
882  // local variables
883  const SDWADstVals dst_unusedBits_format =
884  (SDWADstVals)sdwaInst.DST_UNUSED;
885  const SDWASelVals dst_sel = (SDWASelVals)sdwaInst.DST_SEL;
886  const bool clamp = sdwaInst.CLAMP;
887 
892  sdwaInstDstImpl(dst, origDst, clamp, dst_sel, dst_unusedBits_format);
893  }
894 } // namespace Gcn3ISA
895 
896 #endif // __ARCH_GCN3_INSTS_INST_UTIL_HH__
Gcn3ISA::NumVecElemPerVecReg
const int NumVecElemPerVecReg(64)
Gcn3ISA::InFmt_VOP_SDWA::DST_SEL
unsigned int DST_SEL
Definition: gpu_decoder.hh:1597
LL
#define LL(N)
int64_t constant
Definition: types.hh:52
SQ_DPP_ROW_SL1
@ SQ_DPP_ROW_SL1
Definition: inst_util.hh:68
sc_dt::word
unsigned int word
Definition: scfx_mant.hh:96
Gcn3ISA::sdwaInstSrcImpl
void sdwaInstSrcImpl(T &currOper, T &origCurrOper, const SDWASelVals sel, const bool signExt)
sdwaInstSrcImpl is a helper function that selects the appropriate bits/bytes for each lane of the inp...
Definition: inst_util.hh:648
SDWA_UNUSED_SEXT
@ SDWA_UNUSED_SEXT
Definition: inst_util.hh:59
Gcn3ISA::InFmt_VOP_SDWA::CLAMP
unsigned int CLAMP
Definition: gpu_decoder.hh:1599
Gcn3ISA::median
T median(T val_0, T val_1, T val_2)
Definition: inst_util.hh:247
SQ_DPP_ROW_SL15
@ SQ_DPP_ROW_SL15
Definition: inst_util.hh:69
Gcn3ISA::ScalarRegI64
int64_t ScalarRegI64
Definition: registers.hh:158
ArmISA::i
Bitfield< 7 > i
Definition: miscregs_types.hh:63
findMsbSet
int findMsbSet(uint64_t val)
Returns the bit position of the MSB that is set in the input.
Definition: bitfield.hh:234
ArmISA::sel
sel
Definition: miscregs_types.hh:644
Gcn3ISA::BITS_PER_BYTE
const int BITS_PER_BYTE
Definition: registers.hh:144
SDWA_BYTE_2
@ SDWA_BYTE_2
Definition: inst_util.hh:48
Gcn3ISA::countZeroBits
ScalarRegI32 countZeroBits(T val)
Definition: inst_util.hh:121
Gcn3ISA::InFmt_VOP_DPP::BANK_MASK
unsigned int BANK_MASK
Definition: gpu_decoder.hh:1591
Gcn3ISA::findFirstZero
ScalarRegI32 findFirstZero(T val)
Definition: inst_util.hh:131
Gcn3ISA::ScalarRegU64
uint64_t ScalarRegU64
Definition: registers.hh:157
Gcn3ISA::InFmt_VOP_SDWA::SRC1_ABS
unsigned int SRC1_ABS
Definition: gpu_decoder.hh:1609
X86ISA::count
count
Definition: misc.hh:703
Gcn3ISA::InFmt_VOP_DPP::SRC0_ABS
unsigned int SRC0_ABS
Definition: gpu_decoder.hh:1588
Gcn3ISA::firstOppositeSignBit
ScalarRegI32 firstOppositeSignBit(ScalarRegI32 val)
Definition: inst_util.hh:174
registers.hh
Gcn3ISA::InFmt_VOP_DPP::SRC1_ABS
unsigned int SRC1_ABS
Definition: gpu_decoder.hh:1590
Gcn3ISA::roundNearestEven
T roundNearestEven(T val)
Definition: inst_util.hh:259
SQ_DPP_ROW_RR1
@ SQ_DPP_ROW_RR1
Definition: inst_util.hh:72
Gcn3ISA::VecElemU32
uint32_t VecElemU32
Definition: registers.hh:166
Gcn3ISA::findFirstOne
ScalarRegI32 findFirstOne(T val)
Definition: inst_util.hh:142
SQ_DPP_ROW_MIRROR
@ SQ_DPP_ROW_MIRROR
Definition: inst_util.hh:78
Gcn3ISA::wholeQuadMode
T wholeQuadMode(T val)
Definition: inst_util.hh:90
popCount
int popCount(uint64_t val)
Returns the number of set ones in the provided value.
Definition: bitfield.hh:285
Gcn3ISA::InFmt_VOP_SDWA::SRC1_SEXT
unsigned int SRC1_SEXT
Definition: gpu_decoder.hh:1607
SDWASelVals
SDWASelVals
Definition: inst_util.hh:44
Gcn3ISA::sdwaInstDstImpl
void sdwaInstDstImpl(T &dstOper, T &origDstOper, const bool clamp, const SDWASelVals sel, const SDWADstVals unusedBits_format)
sdwaInstDestImpl is a helper function that selects the appropriate bits/bytes for the inputted dest o...
Definition: inst_util.hh:768
Gcn3ISA::dppInstImpl
int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum, int rowOffset, bool &outOfBounds)
dppInstImpl is a helper function that performs the inputted operation on the inputted vector register...
Definition: inst_util.hh:319
SDWA_BYTE_3
@ SDWA_BYTE_3
Definition: inst_util.hh:49
Gcn3ISA::InFmt_VOP_SDWA::DST_UNUSED
unsigned int DST_UNUSED
Definition: gpu_decoder.hh:1598
SQ_DPP_WF_RR1
@ SQ_DPP_WF_RR1
Definition: inst_util.hh:77
SqDPPVals
SqDPPVals
Definition: inst_util.hh:64
Gcn3ISA::ScalarRegI32
int32_t ScalarRegI32
Definition: registers.hh:155
SQ_DPP_ROW_SR15
@ SQ_DPP_ROW_SR15
Definition: inst_util.hh:71
Gcn3ISA
classes that represent vector/scalar operands in GCN3 ISA.
Definition: decoder.cc:44
Gcn3ISA::InFmt_VOP_DPP::ROW_MASK
unsigned int ROW_MASK
Definition: gpu_decoder.hh:1592
Gcn3ISA::quadMask
T quadMask(T val)
Definition: inst_util.hh:104
Gcn3ISA::findFirstOneMsb
ScalarRegI32 findFirstOneMsb(T val)
Definition: inst_util.hh:153
Gcn3ISA::MSB_PER_WORD
const int MSB_PER_WORD
Definition: registers.hh:147
SQ_DPP_QUAD_PERM_MAX
@ SQ_DPP_QUAD_PERM_MAX
Definition: inst_util.hh:66
SQ_DPP_WF_RL1
@ SQ_DPP_WF_RL1
Definition: inst_util.hh:75
Gcn3ISA::InFmt_VOP_DPP::SRC1_NEG
unsigned int SRC1_NEG
Definition: gpu_decoder.hh:1589
Gcn3ISA::VecElemI64
int64_t VecElemI64
Definition: registers.hh:170
Gcn3ISA::InFmt_VOP_DPP::BOUND_CTRL
unsigned int BOUND_CTRL
Definition: gpu_decoder.hh:1586
SQ_DPP_WF_SR1
@ SQ_DPP_WF_SR1
Definition: inst_util.hh:76
X86ISA::val
Bitfield< 63 > val
Definition: misc.hh:769
SQ_DPP_ROW_HALF_MIRROR
@ SQ_DPP_ROW_HALF_MIRROR
Definition: inst_util.hh:79
ROW_SIZE
static const int ROW_SIZE
Definition: inst_util.hh:83
Gcn3ISA::countZeroBitsMsb
ScalarRegI32 countZeroBitsMsb(T val)
Definition: inst_util.hh:164
Gcn3ISA::InFmt_VOP_SDWA::SRC0_SEL
unsigned int SRC0_SEL
Definition: gpu_decoder.hh:1601
Gcn3ISA::InFmt_VOP_DPP::DPP_CTRL
unsigned int DPP_CTRL
Definition: gpu_decoder.hh:1584
Gcn3ISA::InFmt_VOP_SDWA::SRC0_SEXT
unsigned int SRC0_SEXT
Definition: gpu_decoder.hh:1602
Gcn3ISA::BITS_PER_WORD
const int BITS_PER_WORD
Definition: registers.hh:145
SDWA_UNUSED_PAD
@ SDWA_UNUSED_PAD
Definition: inst_util.hh:58
Gcn3ISA::muladd
VecElemU32 muladd(VecElemU64 &dst, VecElemU32 val_0, VecElemU32 val_1, VecElemU64 val_2)
Definition: inst_util.hh:272
SDWA_BYTE_0
@ SDWA_BYTE_0
Definition: inst_util.hh:46
SQ_DPP_WF_SL1
@ SQ_DPP_WF_SL1
Definition: inst_util.hh:74
SDWA_UNUSED_PRESERVE
@ SDWA_UNUSED_PRESERVE
Definition: inst_util.hh:60
Gcn3ISA::processSDWA_src
void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T &src0, T &origSrc0)
processSDWA_src is a helper function for implementing sub d-word addressing instructions for the src ...
Definition: inst_util.hh:824
Gcn3ISA::sdwaInstSrcImpl_helper
T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal, const SDWASelVals sel, const bool signExt)
sdwaInstSrcImpl_helper contains the per-lane code for selecting the appropriate bytes/words of the la...
Definition: inst_util.hh:555
Gcn3ISA::InFmt_VOP_SDWA::SRC1_NEG
unsigned int SRC1_NEG
Definition: gpu_decoder.hh:1608
Gcn3ISA::InFmt_VOP_SDWA::SRC0_ABS
unsigned int SRC0_ABS
Definition: gpu_decoder.hh:1604
insertBits
T insertBits(T val, int first, int last, B bit_val)
Returns val with bits first to last set to the LSBs of bit_val.
Definition: bitfield.hh:147
SQ_DPP_ROW_SR1
@ SQ_DPP_ROW_SR1
Definition: inst_util.hh:70
NUM_BANKS
static const int NUM_BANKS
Definition: inst_util.hh:84
GPUDynInstPtr
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition: misc.hh:48
Gcn3ISA::sdwaInstDstImpl_helper
T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal, const bool clamp, const SDWASelVals sel, const SDWADstVals unusedBits_format)
sdwaInstDstImpl_helper contains the per-lane code for selecting the appropriate bytes/words of the la...
Definition: inst_util.hh:667
Gcn3ISA::ScalarRegU32
uint32_t ScalarRegU32
Definition: registers.hh:154
Gcn3ISA::processSDWA_dst
void processSDWA_dst(InFmt_VOP_SDWA sdwaInst, T &dst, T &origDst)
processSDWA_dst is a helper function for implementing sub d-word addressing instructions for the dst ...
Definition: inst_util.hh:880
Gcn3ISA::VecElemI32
int32_t VecElemI32
Definition: registers.hh:167
SDWA_BYTE_1
@ SDWA_BYTE_1
Definition: inst_util.hh:47
findLsbSet
int findLsbSet(uint64_t val)
Returns the bit position of the LSB that is set in the input.
Definition: bitfield.hh:253
SDWA_DWORD
@ SDWA_DWORD
Definition: inst_util.hh:52
Gcn3ISA::InFmt_VOP_SDWA::SRC1_SEL
unsigned int SRC1_SEL
Definition: gpu_decoder.hh:1606
SQ_DPP_RESERVED
@ SQ_DPP_RESERVED
Definition: inst_util.hh:67
Gcn3ISA::processSDWA_src_helper
void processSDWA_src_helper(T &currSrc, T &origCurrSrc, const SDWASelVals src_sel, const bool src_signExt, const bool src_abs, const bool src_neg)
processSDWA_srcHelper is a helper function for implementing sub d-word addressing instructions for th...
Definition: inst_util.hh:789
Gcn3ISA::VecElemU64
uint64_t VecElemU64
Definition: registers.hh:169
Gcn3ISA::MSB_PER_BYTE
const int MSB_PER_BYTE
Definition: registers.hh:146
SDWA_WORD_0
@ SDWA_WORD_0
Definition: inst_util.hh:50
fatal_if
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
Definition: logging.hh:219
Gcn3ISA::InFmt_VOP_DPP
Definition: gpu_decoder.hh:1582
SDWA_WORD_1
@ SDWA_WORD_1
Definition: inst_util.hh:51
Gcn3ISA::InFmt_VOP_DPP::SRC0_NEG
unsigned int SRC0_NEG
Definition: gpu_decoder.hh:1587
SQ_DPP_ROW_RR15
@ SQ_DPP_ROW_RR15
Definition: inst_util.hh:73
Gcn3ISA::InFmt_VOP_SDWA
Definition: gpu_decoder.hh:1595
Gcn3ISA::InFmt_VOP_SDWA::SRC0_NEG
unsigned int SRC0_NEG
Definition: gpu_decoder.hh:1603
ULL
#define ULL(N)
uint64_t constant
Definition: types.hh:50
SQ_DPP_ROW_BCAST31
@ SQ_DPP_ROW_BCAST31
Definition: inst_util.hh:81
ArmISA::mask
Bitfield< 28, 24 > mask
Definition: miscregs_types.hh:711
Gcn3ISA::processDPP
void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst, T &src0)
processDPP is a helper function for implementing Data Parallel Primitive instructions.
Definition: inst_util.hh:416
SDWADstVals
SDWADstVals
Definition: inst_util.hh:56
panic
#define panic(...)
This implements a cprintf based panic() function.
Definition: logging.hh:171
bits
T bits(T val, int first, int last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
Definition: bitfield.hh:75
SQ_DPP_ROW_BCAST15
@ SQ_DPP_ROW_BCAST15
Definition: inst_util.hh:80

Generated on Wed Sep 30 2020 14:02:01 for gem5 by doxygen 1.8.17