gem5  v20.0.0.3
inst_util.hh
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * For use for simulation and test purposes only
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright notice,
11  * this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  * this list of conditions and the following disclaimer in the documentation
15  * and/or other materials provided with the distribution.
16  *
17  * 3. Neither the name of the copyright holder nor the names of its
18  * contributors may be used to endorse or promote products derived from this
19  * software without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  *
33  * Authors: Anthony Gutierrez
34  */
35 
36 #ifndef __ARCH_GCN3_INSTS_INST_UTIL_HH__
37 #define __ARCH_GCN3_INSTS_INST_UTIL_HH__
38 
#include <algorithm>
#include <cmath>
#include <limits>
#include <type_traits>

#include "arch/gcn3/registers.hh"
42 
// Values for SDWA select operations: each value names the byte, word, or
// dword of a 32-bit datum that is selected.
enum SDWASelVals : int
{
    SDWA_BYTE_0 = 0, /* select data[7:0] */
    SDWA_BYTE_1 = 1, /* select data[15:8] */
    SDWA_BYTE_2 = 2, /* select data[23:16] */
    SDWA_BYTE_3 = 3, /* select data[31:24] */
    SDWA_WORD_0 = 4, /* select data[15:0] */
    SDWA_WORD_1 = 5, /* select data[31:16] */
    SDWA_DWORD = 6 /* select data[31:0] */
};
54 
// Values for the format of the destination bits that an SDWA operation does
// not select.
enum SDWADstVals : int
{
    SDWA_UNUSED_PAD = 0, /* Pad all unused bits with 0 */
    SDWA_UNUSED_SEXT = 1, /* Sign-extend upper bits; pad lower bits w/ 0 */
    SDWA_UNUSED_PRESERVE = 2 /* Keep the unused bits as they are */
};
62 
// Values for DPP operations (the DPP_CTRL field). Controls at or below
// SQ_DPP_QUAD_PERM_MAX are quad permutes; each *1/*15 pair bounds a range
// of shift/rotate amounts.
enum SqDPPVals : int
{
    SQ_DPP_QUAD_PERM_MAX = 0xFF,
    SQ_DPP_RESERVED = 0x100,
    SQ_DPP_ROW_SL1 = 0x101,
    SQ_DPP_ROW_SL15 = 0x10F,
    SQ_DPP_ROW_SR1 = 0x111,
    SQ_DPP_ROW_SR15 = 0x11F,
    SQ_DPP_ROW_RR1 = 0x121,
    SQ_DPP_ROW_RR15 = 0x12F,
    SQ_DPP_WF_SL1 = 0x130,
    SQ_DPP_WF_RL1 = 0x134,
    SQ_DPP_WF_SR1 = 0x138,
    SQ_DPP_WF_RR1 = 0x13C,
    SQ_DPP_ROW_MIRROR = 0x140,
    SQ_DPP_ROW_HALF_MIRROR = 0x141,
    SQ_DPP_ROW_BCAST15 = 0x142,
    SQ_DPP_ROW_BCAST31 = 0x143
};
static const int ROW_SIZE = 16; /* 16 lanes per DPP row */
static const int NUM_BANKS = 4; /* divisor turning a row offset into a bank */
85 
86 namespace Gcn3ISA
87 {
/**
 * Expand a bit mask to 4-bit-group granularity: every aligned group of
 * four bits that has at least one bit set in val is fully set in the
 * result; every other group is cleared.
 *
 * @param val Input mask.
 * @return val with each non-empty 4-bit group filled with ones.
 */
template<typename T>
inline T
wholeQuadMode(T val)
{
    T wqm = 0;

    // Visit one 4-bit group per iteration; the loop ends once the group
    // mask has been shifted out of the top of T.
    for (T quad = 0xF; quad != 0; quad <<= 4) {
        if ((val & quad) != 0) {
            wqm |= quad;
        }
    }

    return wqm;
}
101 
/**
 * Compress a bit mask to one bit per 4-bit group: bit i of the result is
 * set when bits [4i+3:4i] of val contain at least one set bit.
 *
 * @param val Input mask.
 * @return One summary bit per 4-bit group of val.
 */
template<typename T>
inline T
quadMask(T val)
{
    T qmsk = 0;
    T quad = 0xF;

    // quad walks the 4-bit groups while qbit walks the matching result
    // bits; the loop ends once quad has been shifted out of T.
    for (T qbit = 0x1; quad != 0; quad <<= 4, qbit <<= 1) {
        if (val & quad) {
            qmsk |= qbit;
        }
    }

    return qmsk;
}
118 
119  template<typename T>
120  inline ScalarRegI32
122  {
123  ScalarRegI32 num_zeros
124  = std::numeric_limits<T>::digits - popCount(val);
125 
126  return num_zeros;
127  }
128 
129  template<typename T>
130  inline ScalarRegI32
132  {
133  if (val == ~T(0)) {
134  return -1;
135  }
136 
137  return findLsbSet(~val);
138  }
139 
140  template<typename T>
141  inline ScalarRegI32
143  {
144  if (!val) {
145  return -1;
146  }
147 
148  return findLsbSet(val);
149  }
150 
151  template<typename T>
152  inline ScalarRegI32
154  {
155  if (!val) {
156  return -1;
157  }
158 
159  return findMsbSet(val);
160  }
161 
162  template<typename T>
163  inline ScalarRegI32
165  {
166  if (!val) {
167  return -1;
168  }
169 
170  return std::numeric_limits<T>::digits - 1 - findMsbSet(val);
171  }
172 
173  inline ScalarRegI32
175  {
176  bool found(false);
177  bool sign_bit = (val & 0x80000000) != 0;
178  ScalarRegU32 tmp_val(0);
179  int count(0);
180 
181  if (!val || val == -1) {
182  return -1;
183  }
184 
185  for (int i = 0; i < std::numeric_limits<ScalarRegU32>::digits; ++i) {
186  tmp_val = val & (0x80000000 >> i);
187 
188  if (!sign_bit) {
189  if (tmp_val) {
190  found = true;
191  break;
192  }
193  } else {
194  if (!tmp_val) {
195  found = true;
196  break;
197  }
198  }
199  ++count;
200  }
201 
202  if (found) {
203  return count;
204  } else {
205  return -1;
206  }
207  }
208 
209  inline ScalarRegI32
211  {
212  bool found(false);
213  bool sign_bit = (val & 0x8000000000000000ULL) != 0;
214  ScalarRegU64 tmp_val(0);
215  int count(0);
216 
217  if (!val || val == -1) {
218  return -1;
219  }
220 
221  for (int i = 0; i < std::numeric_limits<ScalarRegU64>::digits; ++i) {
222  tmp_val = val & (0x8000000000000000ULL >> i);
223 
224  if (!sign_bit) {
225  if (tmp_val) {
226  found = true;
227  break;
228  }
229  } else {
230  if (!tmp_val) {
231  found = true;
232  break;
233  }
234  }
235  ++count;
236  }
237 
238  if (found) {
239  return count;
240  } else {
241  return -1;
242  }
243  }
244 
/**
 * Return the median (middle value) of three values.
 *
 * Floating-point types go through std::fmin/std::fmax; all other types
 * go through std::min/std::max.
 *
 * @param val_0 First value.
 * @param val_1 Second value.
 * @param val_2 Third value.
 * @return The middle of the three values.
 */
template<typename T>
inline T
median(T val_0, T val_1, T val_2)
{
    // median = max(min(a, b), min(max(a, b), c))
    if (std::is_floating_point<T>::value) {
        return std::fmax(std::fmin(val_0, val_1),
                         std::fmin(std::fmax(val_0, val_1), val_2));
    } else {
        return std::max(std::min(val_0, val_1),
                        std::min(std::max(val_0, val_1), val_2));
    }
}
257 
/**
 * Round val to the nearest integral value, resolving exact .5 ties
 * toward the even neighbour (0.5 -> 0, 1.5 -> 2, 2.5 -> 2).
 *
 * The result does not depend on the current floating-point rounding
 * mode. Infinities and NaNs are returned unchanged, and the sign of val
 * is kept on a zero result.
 *
 * @param val Value to round.
 * @return val rounded to nearest, ties to even.
 */
template <typename T>
inline T roundNearestEven(T val)
{
    // std::remainder() below would turn an infinity into NaN
    if (!std::isfinite(val)) {
        return val;
    }

    // std::remainder(val, 1) is val - n, where n is the integral value
    // nearest to val with ties going to even n; subtracting it leaves n.
    const T nearest_round = val - std::remainder(val, T(1));

    // the subtraction yields +0 for small negative inputs
    return std::copysign(nearest_round, val);
}
264 
265  inline VecElemU32
267  VecElemU64 val_2)
268  {
269  __uint128_t u0 = (__uint128_t)val_0;
270  __uint128_t u1 = (__uint128_t)val_1;
271  __uint128_t u2 = (__uint128_t)val_2;
272  __uint128_t result = u0 * u1 + u2;
273 
274  dst = (VecElemU64)result;
275 
276  return (VecElemU32)(result >> 64) ? 1 : 0;
277  }
278 
279  inline VecElemU32
281  VecElemI64 val_2)
282  {
283  __int128_t u0 = (__int128_t)val_0;
284  __int128_t u1 = (__int128_t)val_1;
285  __int128_t u2 = (__int128_t)val_2;
286  __int128_t result = u0 * u1 + u2;
287 
288  dst = (VecElemI64)result;
289 
290  return (VecElemU32)(result >> 64) ? 1 : 0;
291  }
292 
313  int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum,
314  int rowOffset, bool & outOfBounds)
315  {
316  // local variables
317  // newLane will be the same as the input lane unless swizzling happens
318  int newLane = currLane;
319  // for shift/rotate permutations; positive values are LEFT rotates
320  int count = 1;
321  int localRowOffset = rowOffset;
322  int localRowNum = rowNum;
323 
324  if (dppCtrl <= SQ_DPP_QUAD_PERM_MAX) { // DPP_QUAD_PERM{00:FF}
325  int quadBase = (currLane & ~(3));
326  int quadPix = (currLane & 3);
327  quadPix = ((dppCtrl >> (2 * quadPix)) & 3);
328  newLane = (quadBase | quadPix);
329  } else if (dppCtrl == SQ_DPP_RESERVED) {
330  panic("ERROR: instruction using reserved DPP_CTRL value\n");
331  } else if ((dppCtrl >= SQ_DPP_ROW_SL1) &&
332  (dppCtrl <= SQ_DPP_ROW_SL15)) { // DPP_ROW_SL{1:15}
333  count -= (dppCtrl - SQ_DPP_ROW_SL1 + 1);
334  if ((localRowOffset + count >= 0) &&
335  (localRowOffset + count < ROW_SIZE)) {
336  localRowOffset += count;
337  newLane = (rowNum | localRowOffset);
338  } else {
339  outOfBounds = true;
340  }
341  } else if ((dppCtrl >= SQ_DPP_ROW_SR1) &&
342  (dppCtrl <= SQ_DPP_ROW_SR15)) { // DPP_ROW_SR{1:15}
343  count -= (dppCtrl - SQ_DPP_ROW_SR1 + 1);
344  if ((localRowOffset + count >= 0) &&
345  (localRowOffset + count < ROW_SIZE)) {
346  localRowOffset += count;
347  newLane = (rowNum | localRowOffset);
348  } else {
349  outOfBounds = true;
350  }
351  } else if ((dppCtrl >= SQ_DPP_ROW_RR1) &&
352  (dppCtrl <= SQ_DPP_ROW_RR15)) { // DPP_ROW_RR{1:15}
353  count -= (dppCtrl - SQ_DPP_ROW_RR1 + 1);
354  localRowOffset = (localRowOffset + count + ROW_SIZE) % ROW_SIZE;
355  newLane = (rowNum | localRowOffset);
356  } else if (dppCtrl == SQ_DPP_WF_SL1) { // DPP_WF_SL1
357  count = 1;
358  if ((currLane >= 0) && (currLane < NumVecElemPerVecReg)) {
359  newLane += count;
360  } else {
361  outOfBounds = true;
362  }
363  } else if (dppCtrl == SQ_DPP_WF_RL1) { // DPP_WF_RL1
364  count = 1;
365  newLane = (currLane + count + NumVecElemPerVecReg) %
367  } else if (dppCtrl == SQ_DPP_WF_SR1) { // DPP_WF_SR1
368  count = -1;
369  int currVal = (currLane + count);
370  if ((currVal >= 0) && (currVal < NumVecElemPerVecReg)) {
371  newLane += count;
372  } else {
373  outOfBounds = true;
374  }
375  } else if (dppCtrl == SQ_DPP_WF_RR1) { // DPP_WF_RR1
376  count = -1;
377  newLane = (currLane + count + NumVecElemPerVecReg) %
379  } else if (dppCtrl == SQ_DPP_ROW_MIRROR) { // DPP_ROW_MIRROR
380  localRowOffset = (15 - localRowOffset);
381  newLane = (rowNum | localRowOffset);
382  } else if (dppCtrl == SQ_DPP_ROW_HALF_MIRROR) { // DPP_ROW_HALF_MIRROR
383  localRowNum = (currLane & -0x7);
384  localRowOffset = (currLane & 0x7);
385  localRowOffset = (7 - localRowNum);
386  newLane = (localRowNum | localRowOffset);
387  } else if (dppCtrl == SQ_DPP_ROW_BCAST15) { // DPP_ROW_BCAST15
388  count = 15;
389  if (currLane > count) {
390  newLane = (currLane & ~count) - 1;
391  }
392  } else if (dppCtrl == SQ_DPP_ROW_BCAST31) { // DPP_ROW_BCAST31
393  count = 31;
394  if (currLane > count) {
395  newLane = (currLane & ~count) - 1;
396  }
397  } else {
398  panic("Unimplemented DPP control operation: %d\n", dppCtrl);
399  }
400 
401  return newLane;
402  }
403 
409  template<typename T>
410  void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst,
411  T & src0)
412  {
413  // local variables
414  SqDPPVals dppCtrl = (SqDPPVals)dppInst.DPP_CTRL;
415  int boundCtrl = dppInst.BOUND_CTRL;
416  int bankMask = dppInst.BANK_MASK;
417  int rowMask = dppInst.ROW_MASK;
418  // row, bank info to be calculated per lane
419  int rowNum = 0, bankNum = 0, rowOffset = 0;
420  // outLane will be the same as the input lane unless swizzling happens
421  int outLane = 0;
422  bool laneDisabled = false;
423  // flags used for determining if a lane should be written to/reset/etc.
424  bool outOfBounds = false, zeroSrc = false;
425  long long threadValid = 0;
426 
433  if (dppInst.SRC0_NEG) {
434  src0.negModifier();
435  }
436 
437  if (dppInst.SRC0_ABS) {
438  src0.absModifier();
439  }
440 
441  // iterate over all register lanes, performing steps 2-4
442  for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
443  threadValid = (0x1LL << lane);
449  rowNum = (lane / ROW_SIZE);
450  rowOffset = (lane % ROW_SIZE);
451  bankNum = (rowOffset / NUM_BANKS);
452 
453  if (((rowMask & (0x1 << rowNum)) == 0) /* row mask */ ||
454  ((bankMask & (0x1 << bankNum)) == 0) /* bank mask */) {
455  laneDisabled = true;
456  continue;
457  }
458 
475  if (!laneDisabled) {
476  outLane = dppInstImpl(dppCtrl, lane, rowNum, rowOffset,
477  outOfBounds);
478  }
479 
485  if (laneDisabled) {
486  threadValid = 0;
487  } else if (outOfBounds) {
488  if (boundCtrl == 1) {
489  zeroSrc = true;
490  } else {
491  threadValid = 0;
492  }
493  } else if (!gpuDynInst->exec_mask[lane]) {
494  if (boundCtrl == 1) {
495  zeroSrc = true;
496  } else {
497  threadValid = 0;
498  }
499  }
500 
501  if (threadValid != 0 && !outOfBounds && !zeroSrc) {
502  assert(!laneDisabled);
503  src0[outLane] = src0[lane];
504  } else if (zeroSrc) {
505  src0[lane] = 0;
506  }
507 
508  src0.write();
509  // reset for next iteration
510  laneDisabled = false;
511  }
512  }
513 
519  template<typename T>
520  void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst,
521  T & src0, T & src1)
522  {
529  if (dppInst.SRC1_NEG) {
530  src1.negModifier();
531  }
532 
533  if (dppInst.SRC1_ABS) {
534  src1.absModifier();
535  }
536 
537  // Since only difference for VOP1 and VOP2/VOPC instructions is SRC1,
538  // which is only used for negation/absolute value, call other version
539  // to do everything else.
540  processDPP(gpuDynInst, dppInst, src0);
541  }
542 
549  template<typename T>
550  T sdwaInstSrcImpl_helper(T currOperVal, T origOperVal, SDWASelVals sel,
551  bool signExt)
552  {
553  // local variables
554  int first_bit = 0, last_bit = 0;
555  bool signExt_local = signExt;
556  T retVal = 0;
557 
558  // if we're preserving all of the bits, then we can immediately return
559  if (sel == SDWA_DWORD) {
560  return currOperVal;
561  }
562 
563  if (sel < SDWA_WORD_0) { // we are selecting 1 byte
564  /*
565  Process byte 0 first. This code eiter selects the original bits
566  of byte 0, or makes the bits of the selected byte be byte 0 (and
567  next either sign extends or zero's out upper bits).
568  */
569  first_bit = (sel * Gcn3ISA::BITS_PER_BYTE);
570  last_bit = first_bit + Gcn3ISA::MSB_PER_BYTE;
571  retVal = bits(currOperVal, first_bit, last_bit);
572 
573  // make sure update propagated, since used next
574  assert(bits(retVal, Gcn3ISA::MSB_PER_BYTE) ==
575  bits(origOperVal, (sel * Gcn3ISA::BITS_PER_BYTE) +
576  Gcn3ISA::MSB_PER_BYTE));
577  // sign extended value depends on upper-most bit of the new byte 0
578  signExt_local = (signExt &&
579  (bits(retVal, 0, Gcn3ISA::MSB_PER_BYTE) & 0x80));
580 
581  // process all other bytes -- if sign extending, make them 1, else
582  // all 0's so leave as is
583  if (signExt_local) {
584  retVal = (uint32_t)sext<Gcn3ISA::MSB_PER_BYTE>(retVal);
585  }
586  } else if (sel < SDWA_DWORD) { // we are selecting 1 word
587  /*
588  Process word 0 first. This code eiter selects the original bits
589  of word 0, or makes the bits of the selected word be word 0 (and
590  next either sign extends or zero's out upper bits).
591  */
592  first_bit = (sel & 1) * Gcn3ISA::BITS_PER_WORD;
593  last_bit = first_bit + Gcn3ISA::MSB_PER_WORD;
594  retVal = bits(currOperVal, first_bit, last_bit);
595 
596  // make sure update propagated, since used next
597  assert(bits(retVal, Gcn3ISA::MSB_PER_WORD) ==
598  bits(origOperVal, ((sel & 1) * Gcn3ISA::BITS_PER_WORD) +
599  Gcn3ISA::MSB_PER_WORD));
600  // sign extended value depends on upper-most bit of the new word 0
601  signExt_local = (signExt &&
602  (bits(retVal, 0, Gcn3ISA::MSB_PER_WORD) &
603  0x8000));
604 
605  // process other word -- if sign extending, make them 1, else all
606  // 0's so leave as is
607  if (signExt_local) {
608  retVal = (uint32_t)sext<Gcn3ISA::MSB_PER_WORD>(retVal);
609  }
610  } else {
611  assert(sel != SDWA_DWORD); // should have returned earlier
612  panic("Unimplemented SDWA select operation: %d\n", sel);
613  }
614 
615  return retVal;
616  }
617 
618 
637  template<typename T>
638  void sdwaInstSrcImpl(T & currOper, T & origCurrOper, SDWASelVals sel,
639  bool signExt)
640  {
641  // iterate over all lanes, setting appropriate, selected value
642  currOper.read();
643  origCurrOper.read();
644  for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
645  currOper[lane] = sdwaInstSrcImpl_helper(currOper[lane],
646  origCurrOper[lane], sel,
647  signExt);
648  }
649  }
650 
651 
658  template<typename T>
659  T sdwaInstDstImpl_helper(T currDstVal, T origDstVal, bool clamp,
660  SDWASelVals sel, SDWADstVals unusedBits_format)
661  {
662  // local variables
663  int first_bit = 0, last_bit = 0;
664  bool signExt = (unusedBits_format == SDWA_UNUSED_SEXT);
665  //bool pad = (unusedBits_format == SDWA_UNUSED_PAD);
666  bool preserve = (unusedBits_format == SDWA_UNUSED_PRESERVE);
667  T retVal = 0, origBits_thisByte = 0, currBits_thisByte = 0,
668  origBits_thisWord = 0, currBits_thisWord = 0, newBits = 0;
669 
670  // if we're preserving all of the bits, then we can immediately return
671  if (unusedBits_format == SDWA_UNUSED_PRESERVE) {
672  assert(sel == SDWA_DWORD);
673  return currDstVal;
674  } else if (sel == SDWA_DWORD) {
675  // NOTE: users may set the unused bits variable to anything in this
676  // scenario, because it will be ignored
677  return currDstVal;
678  }
679 
680  if (sel < SDWA_WORD_0) { // we are selecting 1 byte
681  // if we sign extended depends on upper-most bit of byte 0
682  signExt = (signExt &&
683  (bits(currDstVal, 0, Gcn3ISA::MSB_PER_WORD) & 0x80));
684 
685  for (int byte = 0; byte < 4; ++byte) {
686  first_bit = byte * Gcn3ISA::BITS_PER_BYTE;
687  last_bit = first_bit + Gcn3ISA::MSB_PER_BYTE;
688  /*
689  Options:
690  1. byte == sel: we are keeping all bits in this byte
691  2. preserve is set: keep this byte as is because the
692  output preserve flag is set
693  3. byte > sel && signExt: we're sign extending and
694  this byte is one of the bytes we need to sign extend
695  */
696  origBits_thisByte = bits(origDstVal, first_bit, last_bit);
697  currBits_thisByte = bits(currDstVal, first_bit, last_bit);
698  newBits = ((byte == sel) ? origBits_thisByte :
699  ((preserve) ? currBits_thisByte :
700  (((byte > sel) && signExt) ? 0xff : 0)));
701  retVal = insertBits(retVal, first_bit, last_bit, newBits);
702  }
703  } else if (sel < SDWA_DWORD) { // we are selecting 1 word
704  first_bit = 0;
705  last_bit = first_bit + Gcn3ISA::MSB_PER_WORD;
706  // if we sign extended depends on upper-most bit of word 0
707  signExt = (signExt &&
708  (bits(currDstVal, first_bit, last_bit) & 0x8000));
709 
710  for (int word = 0; word < 2; ++word) {
711  first_bit = word * Gcn3ISA::BITS_PER_WORD;
712  last_bit = first_bit + Gcn3ISA::MSB_PER_WORD;
713  /*
714  Options:
715  1. word == sel & 1: we are keeping all bits in this word
716  2. preserve is set: keep this word as is because the
717  output preserve flag is set
718  3. word > (sel & 1) && signExt: we're sign extending and
719  this word is one of the words we need to sign extend
720  */
721  origBits_thisWord = bits(origDstVal, first_bit, last_bit);
722  currBits_thisWord = bits(currDstVal, first_bit, last_bit);
723  newBits = ((word == (sel & 0x1)) ? origBits_thisWord :
724  ((preserve) ? currBits_thisWord :
725  (((word > (sel & 0x1)) && signExt) ? 0xffff : 0)));
726  retVal = insertBits(retVal, first_bit, last_bit, newBits);
727  }
728  } else {
729  assert(sel != SDWA_DWORD); // should have returned earlier
730  panic("Unimplemented SDWA select operation: %d\n", sel);
731  }
732 
733  return retVal;
734  }
735 
736 
758  template<typename T>
759  void sdwaInstDstImpl(T & dstOper, T & origDstOper, bool clamp,
760  SDWASelVals sel, SDWADstVals unusedBits_format)
761  {
762  // iterate over all lanes, setting appropriate, selected value
763  dstOper.read();
764  origDstOper.read();
765  for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
766  dstOper[lane] = sdwaInstDstImpl_helper(dstOper[lane],
767  origDstOper[lane], clamp,
768  sel, unusedBits_format);
769  }
770  }
771 
772 
780  template<typename T>
781  void processSDWA_src_helper(T & currSrc, T & origCurrSrc,
782  SDWASelVals src_sel, bool src_signExt,
783  bool src_abs, bool src_neg)
784  {
792  if (src_neg) {
793  currSrc.negModifier();
794  }
795 
796  if (src_abs) {
797  currSrc.absModifier();
798  }
799 
803  sdwaInstSrcImpl(currSrc, origCurrSrc, src_sel, src_signExt);
804  }
805 
806 
814  template<typename T>
815  void processSDWA_src(GPUDynInstPtr gpuDynInst, InFmt_VOP_SDWA sdwaInst,
816  T & src0, T & origSrc0)
817  {
818  // local variables
819  SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
820  bool src0_signExt = sdwaInst.SRC0_SEXT;
821  bool src0_neg = sdwaInst.SRC0_NEG;
822  bool src0_abs = sdwaInst.SRC0_ABS;
823 
824  // NOTE: difference between VOP1 and VOP2/VOPC is that there is no src1
825  // operand. So ensure that SRC1 fields are not set, then call helper
826  // function only on src0.
827  assert(!sdwaInst.SRC1_SEXT);
828  assert(!sdwaInst.SRC1_NEG);
829  assert(!sdwaInst.SRC1_ABS);
830 
831  processSDWA_src_helper(src0, origSrc0, src0_sel, src0_signExt,
832  src0_abs, src0_neg);
833  }
834 
835 
843  template<typename T>
844  void processSDWA_src(GPUDynInstPtr gpuDynInst, InFmt_VOP_SDWA sdwaInst,
845  T & src0, T & origSrc0, T & src1, T & origSrc1)
846  {
847  // local variables
848  SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
849  bool src0_signExt = sdwaInst.SRC0_SEXT;
850  bool src0_neg = sdwaInst.SRC0_NEG;
851  bool src0_abs = sdwaInst.SRC0_ABS;
852  SDWASelVals src1_sel = (SDWASelVals)sdwaInst.SRC1_SEL;
853  bool src1_signExt = sdwaInst.SRC1_SEXT;
854  bool src1_neg = sdwaInst.SRC1_NEG;
855  bool src1_abs = sdwaInst.SRC1_ABS;
856 
857  processSDWA_src_helper(src0, origSrc0, src0_sel, src0_signExt,
858  src0_abs, src0_neg);
859  processSDWA_src_helper(src1, origSrc1, src1_sel, src1_signExt,
860  src1_abs, src1_neg);
861  }
862 
863 
871  template<typename T>
872  void processSDWA_dst(GPUDynInstPtr gpuDynInst, InFmt_VOP_SDWA sdwaInst,
873  T & dst, T & origDst)
874  {
875  // local variables
876  SDWADstVals dst_unusedBits_format = (SDWADstVals)sdwaInst.DST_UNUSED;
877  SDWASelVals dst_sel = (SDWASelVals)sdwaInst.DST_SEL;
878  bool clamp = sdwaInst.CLAMP;
879 
884  sdwaInstDstImpl(dst, origDst, clamp, dst_sel, dst_unusedBits_format);
885  }
886 } // namespace Gcn3ISA
887 
888 #endif // __ARCH_GCN3_INSTS_INST_UTIL_HH__
count
Definition: misc.hh:703
#define panic(...)
This implements a cprintf based panic() function.
Definition: logging.hh:163
int findLsbSet(uint64_t val)
Returns the bit position of the LSB that is set in the input.
Definition: bitfield.hh:220
int64_t ScalarRegI64
Definition: registers.hh:158
Bitfield< 7 > i
T sdwaInstDstImpl_helper(T currDstVal, T origDstVal, bool clamp, SDWASelVals sel, SDWADstVals unusedBits_format)
sdwaInstDstImpl_helper contains the per-lane code for selecting the appropriate bytes/words of the la...
Definition: inst_util.hh:659
#define LL(N)
int64_t constant
Definition: types.hh:50
T median(T val_0, T val_1, T val_2)
Definition: inst_util.hh:247
static const int NUM_BANKS
Definition: inst_util.hh:84
SDWASelVals
Definition: inst_util.hh:44
const int BITS_PER_BYTE
Definition: registers.hh:144
ScalarRegI32 countZeroBits(T val)
Definition: inst_util.hh:121
ScalarRegI32 findFirstZero(T val)
Definition: inst_util.hh:131
ScalarRegI32 firstOppositeSignBit(ScalarRegI32 val)
Definition: inst_util.hh:174
int popCount(uint64_t val)
Returns the number of set ones in the provided value.
Definition: bitfield.hh:248
void sdwaInstSrcImpl(T &currOper, T &origCurrOper, SDWASelVals sel, bool signExt)
sdwaInstSrcImpl is a helper function that selects the appropriate bits/bytes for each lane of the inp...
Definition: inst_util.hh:638
void sdwaInstDstImpl(T &dstOper, T &origDstOper, bool clamp, SDWASelVals sel, SDWADstVals unusedBits_format)
sdwaInstDestImpl is a helper function that selects the appropriate bits/bytes for the inputted dest o...
Definition: inst_util.hh:759
T wholeQuadMode(T val)
Definition: inst_util.hh:90
uint64_t ScalarRegU64
Definition: registers.hh:157
void processSDWA_src_helper(T &currSrc, T &origCurrSrc, SDWASelVals src_sel, bool src_signExt, bool src_abs, bool src_neg)
processSDWA_srcHelper is a helper function for implementing sub d-word addressing instructions for th...
Definition: inst_util.hh:781
T sdwaInstSrcImpl_helper(T currOperVal, T origOperVal, SDWASelVals sel, bool signExt)
sdwaInstSrcImpl_helper contains the per-lane code for selecting the appropriate bytes/words of the la...
Definition: inst_util.hh:550
void processSDWA_dst(GPUDynInstPtr gpuDynInst, InFmt_VOP_SDWA sdwaInst, T &dst, T &origDst)
processSDWA_dst is a helper function for implementing sub d-word addressing instructions for the dst ...
Definition: inst_util.hh:872
int dppInstImpl(SqDPPVals dppCtrl, int currLane, int rowNum, int rowOffset, bool &outOfBounds)
dppInstImpl is a helper function that performs the inputted operation on the inputted vector register...
Definition: inst_util.hh:313
uint32_t VecElemU32
Definition: registers.hh:166
Bitfield< 63 > val
Definition: misc.hh:769
T roundNearestEven(T val)
Definition: inst_util.hh:259
ScalarRegI32 findFirstOne(T val)
Definition: inst_util.hh:142
SDWADstVals
Definition: inst_util.hh:56
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition: misc.hh:46
int32_t ScalarRegI32
Definition: registers.hh:155
const int MSB_PER_WORD
Definition: registers.hh:147
T quadMask(T val)
Definition: inst_util.hh:104
ScalarRegI32 findFirstOneMsb(T val)
Definition: inst_util.hh:153
classes that represent vector/scalar operands in GCN3 ISA.
Definition: decoder.cc:44
int64_t VecElemI64
Definition: registers.hh:170
T insertBits(T val, int first, int last, B bit_val)
Returns val with bits first to last set to the LSBs of bit_val.
Definition: bitfield.hh:131
void processSDWA_src(GPUDynInstPtr gpuDynInst, InFmt_VOP_SDWA sdwaInst, T &src0, T &origSrc0)
processSDWA_src is a helper function for implementing sub d-word addressing instructions for the src ...
Definition: inst_util.hh:815
ScalarRegI32 countZeroBitsMsb(T val)
Definition: inst_util.hh:164
#define ULL(N)
uint64_t constant
Definition: types.hh:48
unsigned int word
Definition: scfx_mant.hh:64
const int BITS_PER_WORD
Definition: registers.hh:145
VecElemU32 muladd(VecElemU64 &dst, VecElemU32 val_0, VecElemU32 val_1, VecElemU64 val_2)
Definition: inst_util.hh:266
uint32_t ScalarRegU32
Definition: registers.hh:154
static const int ROW_SIZE
Definition: inst_util.hh:83
int32_t VecElemI32
Definition: registers.hh:167
int findMsbSet(uint64_t val)
Returns the bit position of the MSB that is set in the input.
Definition: bitfield.hh:203
SqDPPVals
Definition: inst_util.hh:64
const int MSB_PER_BYTE
Definition: registers.hh:146
Bitfield< 3, 0 > mask
Definition: types.hh:62
T bits(T val, int first, int last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it...
Definition: bitfield.hh:71
void processDPP(GPUDynInstPtr gpuDynInst, InFmt_VOP_DPP dppInst, T &src0)
processDPP is a helper function for implementing Data Parallel Primitive instructions.
Definition: inst_util.hh:410
uint64_t VecElemU64
Definition: registers.hh:169
const int NumVecElemPerVecReg(64)

Generated on Fri Jul 3 2020 15:42:40 for gem5 by doxygen 1.8.13