op_encodings.hh
1/*
2 * Copyright (c) 2016-2021 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#ifndef __ARCH_VEGA_INSTS_OP_ENCODINGS_HH__
33#define __ARCH_VEGA_INSTS_OP_ENCODINGS_HH__
34
40#include "debug/GPUExec.hh"
41#include "debug/VEGA.hh"
43
44namespace gem5
45{
46
47namespace VegaISA
48{
49 struct BufferRsrcDescriptor
50 {
51 uint64_t baseAddr : 48;
52 uint32_t stride : 14;
53 uint32_t cacheSwizzle : 1;
54 uint32_t swizzleEn : 1;
55 uint32_t numRecords : 32;
56 uint32_t dstSelX : 3;
57 uint32_t dstSelY : 3;
58 uint32_t dstSelZ : 3;
59 uint32_t dstSelW : 3;
60 uint32_t numFmt : 3;
61 uint32_t dataFmt : 4;
62 uint32_t elemSize : 2;
63 uint32_t idxStride : 2;
64 uint32_t addTidEn : 1;
65 uint32_t atc : 1;
66 uint32_t hashEn : 1;
67 uint32_t heap : 1;
68 uint32_t mType : 3;
69 uint32_t type : 2;
70 };
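// Illustrative example (values are assumptions, not from the source): a
// linear, unswizzled buffer of 256 records of 16 bytes each would set
// stride = 16, numRecords = 256, swizzleEn = 0 and addTidEn = 0; the MUBUF
// address math below then resolves a lane's address roughly as
// baseAddr + s_offset + inst_offset + v_off + stride * v_idx
// (see Inst_MUBUF::calcAddr).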
71
72 // --- purely virtual instruction classes ---
73
74 class Inst_SOP2 : public VEGAGPUStaticInst
75 {
76 public:
77 Inst_SOP2(InFmt_SOP2*, const std::string &opcode);
78
79 int instSize() const override;
80 void generateDisassembly() override;
81
82 void initOperandInfo() override;
83
84 protected:
85 // first instruction DWORD
87 // possible second DWORD
89 uint32_t varSize;
90
91 private:
93 }; // Inst_SOP2
94
95 class Inst_SOPK : public VEGAGPUStaticInst
96 {
97 public:
98 Inst_SOPK(InFmt_SOPK*, const std::string &opcode);
99 ~Inst_SOPK();
100
101 int instSize() const override;
102 void generateDisassembly() override;
103
104 void initOperandInfo() override;
105
106 protected:
107 // first instruction DWORD
109 // possible second DWORD
111 uint32_t varSize;
112
113 private:
115 }; // Inst_SOPK
116
117 class Inst_SOP1 : public VEGAGPUStaticInst
118 {
119 public:
120 Inst_SOP1(InFmt_SOP1*, const std::string &opcode);
121 ~Inst_SOP1();
122
123 int instSize() const override;
124 void generateDisassembly() override;
125
126 void initOperandInfo() override;
127
128 protected:
129 // first instruction DWORD
131 // possible second DWORD
133 uint32_t varSize;
134
135 private:
137 }; // Inst_SOP1
138
139 class Inst_SOPC : public VEGAGPUStaticInst
140 {
141 public:
142 Inst_SOPC(InFmt_SOPC*, const std::string &opcode);
143 ~Inst_SOPC();
144
145 int instSize() const override;
146 void generateDisassembly() override;
147
148 void initOperandInfo() override;
149
150 protected:
151 // first instruction DWORD
153 // possible second DWORD
155 uint32_t varSize;
156
157 private:
159 }; // Inst_SOPC
160
161 class Inst_SOPP : public VEGAGPUStaticInst
162 {
163 public:
164 Inst_SOPP(InFmt_SOPP*, const std::string &opcode);
165 ~Inst_SOPP();
166
167 int instSize() const override;
168 void generateDisassembly() override;
169
170 void initOperandInfo() override;
171
172 protected:
173 // first instruction DWORD
175 }; // Inst_SOPP
176
177 class Inst_SMEM : public VEGAGPUStaticInst
178 {
179 public:
180 Inst_SMEM(InFmt_SMEM*, const std::string &opcode);
181 ~Inst_SMEM();
182
183 int instSize() const override;
184 void generateDisassembly() override;
185
186 void initOperandInfo() override;
187
188 protected:
192 template<int N>
193 void
194 initMemRead(GPUDynInstPtr gpuDynInst)
195 {
196 initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,
197 MemCmd::ReadReq);
198 }
199
203 template<int N>
204 void
205 initMemWrite(GPUDynInstPtr gpuDynInst)
206 {
207 initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,
208 MemCmd::WriteReq);
209 }
210
214 void
215 calcAddr(GPUDynInstPtr gpu_dyn_inst, ConstScalarOperandU64 &addr,
216 ScalarRegU32 offset)
217 {
218 Addr vaddr = ((addr.rawData() + offset) & ~0x3);
219 gpu_dyn_inst->scalarAddr = vaddr;
220 }
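// Example: the low two bits are masked off so the scalar address is
// always dword aligned, e.g. a base of 0x1001 with offset 6 resolves to
// 0x1004.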
221
227 void
228 calcAddr(GPUDynInstPtr gpu_dyn_inst, ConstScalarOperandU128 &s_rsrc_desc,
229 ScalarRegU32 offset)
230 {
231 BufferRsrcDescriptor rsrc_desc;
232 ScalarRegU32 clamped_offset(offset);
233 std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
234 sizeof(BufferRsrcDescriptor));
235
241 if (!rsrc_desc.stride && offset >= rsrc_desc.numRecords) {
242 clamped_offset = rsrc_desc.numRecords;
243 } else if (rsrc_desc.stride && offset
244 > (rsrc_desc.stride * rsrc_desc.numRecords)) {
245 clamped_offset = (rsrc_desc.stride * rsrc_desc.numRecords);
246 }
247
248 Addr vaddr = ((rsrc_desc.baseAddr + clamped_offset) & ~0x3);
249 gpu_dyn_inst->scalarAddr = vaddr;
250 }
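// Worked example (assumed values): with numRecords = 64, stride = 0 and
// offset = 100, the offset is clamped to 64 before being added to
// baseAddr; as above, the final address is then forced to dword
// alignment by clearing the low two bits.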
251
252 // first instruction DWORD
254 // second instruction DWORD
256 }; // Inst_SMEM
257
258 class Inst_VOP2 : public VEGAGPUStaticInst
259 {
260 public:
261 Inst_VOP2(InFmt_VOP2*, const std::string &opcode);
262 ~Inst_VOP2();
263
264 int instSize() const override;
265 void generateDisassembly() override;
266
267 void initOperandInfo() override;
268
269 protected:
270 // first instruction DWORD
272 // possible second DWORD
274 uint32_t varSize;
275
276 template<typename T>
277 T sdwaSrcHelper(GPUDynInstPtr gpuDynInst, T & src1)
278 {
279 T src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
280 // use copies of original src0, src1, and dest during selecting
281 T origSrc0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
282 T origSrc1(gpuDynInst, instData.VSRC1);
283
284 src0_sdwa.read();
285 origSrc0_sdwa.read();
286 origSrc1.read();
287
288 DPRINTF(VEGA, "Handling %s SRC SDWA. SRC0: register v[%d], "
289 "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, SRC0_SEXT: "
290 "%d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, SRC1_SEXT: %d, "
291 "SRC1_NEG: %d, SRC1_ABS: %d\n",
292 opcode().c_str(), extData.iFmt_VOP_SDWA.SRC0,
301
302 processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
303 src1, origSrc1);
304
305 return src0_sdwa;
306 }
307
308 template<typename T>
309 void sdwaDstHelper(GPUDynInstPtr gpuDynInst, T & vdst)
310 {
311 T origVdst(gpuDynInst, instData.VDST);
312
313 Wavefront *wf = gpuDynInst->wavefront();
314 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
315 if (wf->execMask(lane)) {
316 origVdst[lane] = vdst[lane]; // keep copy consistent
317 }
318 }
319
320 processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
321 }
322
323 template<typename T>
324 T dppHelper(GPUDynInstPtr gpuDynInst, T & src1)
325 {
326 T src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
327 src0_dpp.read();
328
329 DPRINTF(VEGA, "Handling %s SRC DPP. SRC0: register v[%d], "
330 "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, SRC1_ABS: %d, "
331 "SRC1_NEG: %d, BC: %d, BANK_MASK: %d, ROW_MASK: %d\n",
332 opcode().c_str(), extData.iFmt_VOP_DPP.SRC0,
337
338 processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);
339
340 return src0_dpp;
341 }
342
343 template<typename ConstT, typename T>
344 void vop2Helper(GPUDynInstPtr gpuDynInst,
345 void (*fOpImpl)(T&, T&, T&, Wavefront*))
346 {
347 Wavefront *wf = gpuDynInst->wavefront();
348 T src0(gpuDynInst, instData.SRC0);
349 T src1(gpuDynInst, instData.VSRC1);
350 T vdst(gpuDynInst, instData.VDST);
351
352 src0.readSrc();
353 src1.read();
354
355 if (isSDWAInst()) {
356 T src0_sdwa = sdwaSrcHelper(gpuDynInst, src1);
357 fOpImpl(src0_sdwa, src1, vdst, wf);
358 sdwaDstHelper(gpuDynInst, vdst);
359 } else if (isDPPInst()) {
360 T src0_dpp = dppHelper(gpuDynInst, src1);
361 fOpImpl(src0_dpp, src1, vdst, wf);
362 } else {
363 // src0 is unmodified. We need to use the const container
364 // type to allow reading scalar operands from src0. Only
365 // src0 can index scalar operands. We copy this to vdst
366 // temporarily to pass to the lambda so the instruction
367 // does not need to write two lambda functions (one for
368 // a const src0 and one for a mutable src0).
369 ConstT const_src0(gpuDynInst, instData.SRC0);
370 const_src0.readSrc();
371
372 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
373 vdst[lane] = const_src0[lane];
374 }
375 fOpImpl(vdst, src1, vdst, wf);
376 }
377
378 vdst.write();
379 }
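// Usage sketch (hypothetical, not part of this header): a simple VOP2
// add implementation could forward to vop2Helper so that the SDWA and
// DPP variants are handled uniformly, e.g.
//
//   vop2Helper<ConstVecOperandU32, VecOperandU32>(gpuDynInst,
//       [](VecOperandU32 &src0, VecOperandU32 &src1,
//          VecOperandU32 &vdst, Wavefront *wf) {
//           for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
//               if (wf->execMask(lane)) {
//                   vdst[lane] = src0[lane] + src1[lane];
//               }
//           }
//       });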
380
381 private:
383 }; // Inst_VOP2
384
385 class Inst_VOP1 : public VEGAGPUStaticInst
386 {
387 public:
388 Inst_VOP1(InFmt_VOP1*, const std::string &opcode);
389 ~Inst_VOP1();
390
391 int instSize() const override;
392 void generateDisassembly() override;
393
394 void initOperandInfo() override;
395
396 protected:
397 // first instruction DWORD
399 // possible second DWORD
401 uint32_t varSize;
402
403 private:
405 }; // Inst_VOP1
406
407 class Inst_VOPC : public VEGAGPUStaticInst
408 {
409 public:
410 Inst_VOPC(InFmt_VOPC*, const std::string &opcode);
411 ~Inst_VOPC();
412
413 int instSize() const override;
414 void generateDisassembly() override;
415
416 void initOperandInfo() override;
417
418 protected:
419 // first instruction DWORD
421 // possible second DWORD
423 uint32_t varSize;
424
425 template<typename T>
426 uint32_t
427 sdwabSelect(uint32_t dword, const SDWASelVals sel,
428 bool sign_ext, bool neg, bool abs)
429 {
430 // Use the gem5 bits() helper to select a sub region from the
431 // dword based on the select. Return a 32-bit unsigned which will
432 // be cast to the appropriate compare type in the lambda passed to
433 // sdwabHelper.
434 int low_bit = 0, high_bit = 0;
435 uint32_t rv = dword;
436
437 if (sel < SDWA_WORD_0) {
438 // Selecting a sub-dword value smaller than a word (i.e., a
439 // byte). These values are 0-3 so multiplying by BITS_PER_BYTE
440 // gives the lower and upper bit easily.
441 low_bit = sel * VegaISA::BITS_PER_BYTE;
442 high_bit = low_bit + VegaISA::BITS_PER_BYTE - 1;
443 } else if (sel < SDWA_DWORD) {
444 // Selecting a sub-dword value of word size. Enum value is 4
445 // or 5, so selecting the LSb and multiplying gives the lower
446 // and upper bit.
447 low_bit = (sel & 1) * VegaISA::BITS_PER_WORD;
448 high_bit = low_bit + VegaISA::MSB_PER_WORD - 1;
449 } else {
450 // We are selecting the whole dword. Assert that is true and
451 // set the bit locations for lower and upper based on dword
452 // size.
453 assert(sel == SDWA_DWORD);
454 low_bit = 0;
455 high_bit = sizeof(uint32_t) * VegaISA::BITS_PER_BYTE - 1;
456 }
457
458 rv = bits(dword, high_bit, low_bit);
459
460 uint32_t sign_bit = 1 << high_bit;
461
462 // Panic on combinations which do not make sense.
463 if (std::is_integral_v<T> && std::is_unsigned_v<T>) {
464 panic_if(neg, "SWDAB negation operation on unsigned type!\n");
465 panic_if(sign_ext, "SWDAB sign extend on unsigned type!\n");
466 }
467
468 // Apply ABS, then NEG, then SEXT.
469 if (abs) {
470 if (std::is_integral_v<T>) {
471 // If sign is set, sign extend first then call std::abs.
472 if ((rv & sign_bit) && std::is_signed_v<T>) {
473 rv = sext(rv, high_bit + 1) & 0xFFFFFFFF;
474 rv = std::abs(static_cast<long long>(rv)) & 0xFFFFFFFF;
475 }
476 } else {
477 // Clear sign bit for FP types.
478 rv = rv & mask(high_bit);
479 }
480 }
481
482 if (neg) {
483 if (std::is_integral_v<T>) {
484 // If sign is set, sign extend first then call unary-.
485 if (rv & sign_bit) {
486 rv = sext(rv, high_bit + 1) & 0xFFFFFFFF;
487 rv = -rv;
488 }
489 } else {
490 // Flip sign bit for FP types.
491 rv = rv ^ mask(high_bit);
492 }
493 }
494
495 if (sign_ext) {
496 if (std::is_integral_v<T>) {
497 if (rv & sign_bit) {
498 rv = sext(rv, high_bit + 1) & 0xFFFFFFFF;
499 }
500 } else {
501 // It is not entirely clear what to do here. Literal
502 // extensions for FP operands append zeros to the mantissa,
503 // but the specification does not state anything for SDWAB.
504 panic("SDWAB sign extend set for non-integral type!\n");
505 }
506 }
507
508 return rv;
509 }
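// Worked example (illustrative): with dword = 0xAABBCCDD, a byte-1
// select (sel == 1) extracts bits [15:8] and returns 0xCC, while
// sel == SDWA_DWORD returns the full 0xAABBCCDD; the abs/neg/sign-extend
// modifiers are applied afterwards on the selected value.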
510
511 template<typename T>
512 void
513 sdwabHelper(GPUDynInstPtr gpuDynInst, int (*cmpFunc)(T, T))
514 {
515 DPRINTF(VEGA, "Handling %s SRC SDWA. SRC0: register %s[%d], "
516 "sDst s[%d], sDst type %s, SRC0_SEL: %d, SRC0_SEXT: %d "
517 "SRC0_NEG: %d, SRC0_ABS: %d, SRC1: register %s[%d], "
518 "SRC1_SEL: %d, SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: "
519 "%d\n", _opcode.c_str(),
520 (extData.iFmt_VOP_SDWAB.S0 ? "s" : "v"),
523 (extData.iFmt_VOP_SDWAB.SD ? "SGPR" : "VCC"),
528 (extData.iFmt_VOP_SDWAB.S1 ? "s" : "v"),
534
535 // Start with SRC0 and insert 9th bit for VGPR source (S0 == 0).
536 int src0_idx = extData.iFmt_VOP_SDWAB.SRC0;
537 src0_idx += (extData.iFmt_VOP_SDWAB.S0 == 0) ? 0x100 : 0;
538
539 // Start with VSRC1[7:0], insert 9th bit for VGPR source (S1 == 0).
540 int src1_idx = instData.VSRC1;
541 src1_idx += (extData.iFmt_VOP_SDWAB.S1 == 0) ? 0x100 : 0;
542
543 // SD == 0 if VCC is dest, else use SDST index.
544 int sdst_idx = (extData.iFmt_VOP_SDWAB.SD == 1) ?
546
547 ConstVecOperandU32 src0(gpuDynInst, src0_idx);
548 ConstVecOperandU32 src1(gpuDynInst, src1_idx);
549 ScalarOperandU64 sdst(gpuDynInst, sdst_idx);
550
551 // Use readSrc in case of scalar const register.
552 src0.readSrc();
553 src1.readSrc();
554
555 // Select bits first, then cast to type, then apply modifiers.
556 const SDWASelVals src0_sel =
558 const SDWASelVals src1_sel =
560
561 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
562 if (gpuDynInst->wavefront()->execMask(lane)) {
563 T a = sdwabSelect<T>(src0[lane], src0_sel,
567 T b = sdwabSelect<T>(src1[lane], src1_sel,
571 sdst.setBit(lane, cmpFunc(a, b));
572 }
573 }
574
575 sdst.write();
576 }
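// Usage sketch (hypothetical): a signed "less than" VOPC compare could
// be routed through this helper as
//
//   sdwabHelper<int32_t>(gpuDynInst,
//       [](int32_t a, int32_t b) { return a < b ? 1 : 0; });
//
// so each active lane sets one bit of the scalar destination (VCC or an
// SGPR pair).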
577
578 private:
580 }; // Inst_VOPC
581
582 class Inst_VINTRP : public VEGAGPUStaticInst
583 {
584 public:
585 Inst_VINTRP(InFmt_VINTRP*, const std::string &opcode);
586 ~Inst_VINTRP();
587
588 int instSize() const override;
589
590 protected:
591 // first instruction DWORD
593 }; // Inst_VINTRP
594
595 class Inst_VOP3A : public VEGAGPUStaticInst
596 {
597 public:
598 Inst_VOP3A(InFmt_VOP3A*, const std::string &opcode, bool sgpr_dst);
599 ~Inst_VOP3A();
600
601 int instSize() const override;
602 void generateDisassembly() override;
603
604 void initOperandInfo() override;
605
606 protected:
607 // first instruction DWORD
609 // second instruction DWORD
611
612 // Output modifier for VOP3 instructions. This 2-bit field can be set
613 // to "0" to do nothing, "1" to multiply output value by 2, "2" to
614 // multiply output value by 4, or "3" to divide output value by 2. If
615 // the instruction supports clamping, this is applied *before* clamp
616 // but after the abs and neg modifiers.
617 template<typename T>
618 T omodModifier(T val, unsigned omod)
619 {
620 assert(omod < 4);
621
622 if constexpr (std::is_floating_point_v<T>) {
623 if (omod == 1) return val * T(2.0f);
624 if (omod == 2) return val * T(4.0f);
625 if (omod == 3) return val / T(2.0f);
626 } else {
627 assert(std::is_integral_v<T>);
628 if (omod == 1) return val * T(2);
629 if (omod == 2) return val * T(4);
630 if (omod == 3) return val / T(2);
631 }
632
633 return val;
634 }
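// Example: omodModifier(1.5f, 1) returns 3.0f, omodModifier(1.5f, 3)
// returns 0.75f, and omod == 0 leaves the value untouched; integral
// types use the same scale factors with integer arithmetic.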
635 private:
647 const bool sgprDst;
648 }; // Inst_VOP3A
649
650 class Inst_VOP3B : public VEGAGPUStaticInst
651 {
652 public:
653 Inst_VOP3B(InFmt_VOP3B*, const std::string &opcode);
654 ~Inst_VOP3B();
655
656 int instSize() const override;
657 void generateDisassembly() override;
658
659 void initOperandInfo() override;
660
661 protected:
662 // first instruction DWORD
664 // second instruction DWORD
666
667 private:
669 }; // Inst_VOP3B
670
671 class Inst_VOP3P : public VEGAGPUStaticInst
672 {
673 public:
674 Inst_VOP3P(InFmt_VOP3P*, const std::string &opcode);
675 ~Inst_VOP3P();
676
677 int instSize() const override;
678 void generateDisassembly() override;
679
680 void initOperandInfo() override;
681
682 protected:
683 // first instruction DWORD
685 // second instruction DWORD
687
688 template<typename T>
689 void vop3pHelper(GPUDynInstPtr gpuDynInst,
690 T (*fOpImpl)(T, T, bool))
691 {
692 Wavefront *wf = gpuDynInst->wavefront();
693 ConstVecOperandU32 S0(gpuDynInst, extData.SRC0);
694 ConstVecOperandU32 S1(gpuDynInst, extData.SRC1);
695 VecOperandU32 D(gpuDynInst, instData.VDST);
696
697 S0.readSrc();
698 S1.readSrc();
699
700 int opLo = instData.OPSEL;
701 int opHi = instData.OPSEL_HI2 << 2 | extData.OPSEL_HI;
702 int negLo = extData.NEG;
703 int negHi = instData.NEG_HI;
704 bool clamp = instData.CLMP;
705 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
706 if (wf->execMask(lane)) {
707 T upper_val = fOpImpl(word<T>(S0[lane], opHi, negHi, 0),
708 word<T>(S1[lane], opHi, negHi, 1),
709 clamp);
710 T lower_val = fOpImpl(word<T>(S0[lane], opLo, negLo, 0),
711 word<T>(S1[lane], opLo, negLo, 1),
712 clamp);
713
714 uint16_t upper_raw =
715 *reinterpret_cast<uint16_t*>(&upper_val);
716 uint16_t lower_raw =
717 *reinterpret_cast<uint16_t*>(&lower_val);
718
719 D[lane] = upper_raw << 16 | lower_raw;
720 }
721 }
722
723 D.write();
724 }
725
726 template<typename T>
727 void vop3pHelper(GPUDynInstPtr gpuDynInst,
728 T (*fOpImpl)(T, T, T, bool))
729 {
730 Wavefront *wf = gpuDynInst->wavefront();
731 ConstVecOperandU32 S0(gpuDynInst, extData.SRC0);
732 ConstVecOperandU32 S1(gpuDynInst, extData.SRC1);
733 ConstVecOperandU32 S2(gpuDynInst, extData.SRC2);
734 VecOperandU32 D(gpuDynInst, instData.VDST);
735
736 S0.readSrc();
737 S1.readSrc();
738 S2.readSrc();
739
740 int opLo = instData.OPSEL;
741 int opHi = instData.OPSEL_HI2 << 2 | extData.OPSEL_HI;
742 int negLo = extData.NEG;
743 int negHi = instData.NEG_HI;
744 bool clamp = instData.CLMP;
745 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
746 if (wf->execMask(lane)) {
747 T upper_val = fOpImpl(word<T>(S0[lane], opHi, negHi, 0),
748 word<T>(S1[lane], opHi, negHi, 1),
749 word<T>(S2[lane], opHi, negHi, 2),
750 clamp);
751 T lower_val = fOpImpl(word<T>(S0[lane], opLo, negLo, 0),
752 word<T>(S1[lane], opLo, negLo, 1),
753 word<T>(S2[lane], opLo, negLo, 2),
754 clamp);
755
756 uint16_t upper_raw =
757 *reinterpret_cast<uint16_t*>(&upper_val);
758 uint16_t lower_raw =
759 *reinterpret_cast<uint16_t*>(&lower_val);
760
761 D[lane] = upper_raw << 16 | lower_raw;
762 }
763 }
764
765 D.write();
766 }
767
768 void
769 dotHelper(GPUDynInstPtr gpuDynInst,
770 uint32_t (*fOpImpl)(uint32_t, uint32_t, uint32_t, bool))
771 {
772 Wavefront *wf = gpuDynInst->wavefront();
773 ConstVecOperandU32 S0(gpuDynInst, extData.SRC0);
774 ConstVecOperandU32 S1(gpuDynInst, extData.SRC1);
775 ConstVecOperandU32 S2(gpuDynInst, extData.SRC2);
776 VecOperandU32 D(gpuDynInst, instData.VDST);
777
778 S0.readSrc();
779 S1.readSrc();
780 S2.readSrc();
781
782 // OPSEL[2] and OPSEL_HI2 are unused. Craft two dwords where:
783 // dword1[15:0] is upper/lower 16b of src0 based on opsel[0]
784 // dword1[31:16] is upper/lower 16b of src0 based on opsel_hi[0]
785 // dword2[15:0] is upper/lower 16b of src1 based on opsel[1]
786 // dword2[31:16] is upper/lower 16b of src1 based on opsel_hi[1]
787 int opLo = instData.OPSEL;
788 int opHi = extData.OPSEL_HI;
789 int negLo = extData.NEG;
790 int negHi = instData.NEG_HI;
791 bool clamp = instData.CLMP;
792
793 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
794 if (wf->execMask(lane)) {
795 uint32_t dword1l =
796 word<uint16_t>(S0[lane], opLo, negLo, 0);
797 uint32_t dword1h =
798 word<uint16_t>(S0[lane], opHi, negHi, 0);
799 uint32_t dword2l =
800 word<uint16_t>(S1[lane], opLo, negLo, 1);
801 uint32_t dword2h =
802 word<uint16_t>(S1[lane], opHi, negHi, 1);
803
804 uint32_t dword1 = (dword1h << 16) | dword1l;
805 uint32_t dword2 = (dword2h << 16) | dword2l;
806
807 // Take in two uint32_t dwords and one src2 dword. The
808 // function will need to call bits to break up to the
809 // correct size and then reinterpret cast to the correct
810 // value.
811 D[lane] = fOpImpl(dword1, dword2, S2[lane], clamp);
812 }
813 }
814
815 D.write();
816 }
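// Usage sketch (hypothetical, clamp handling omitted): a dot2-style
// operation could unpack the two 16-bit halves of each crafted dword,
// multiply them pairwise and accumulate into src2, e.g.
//
//   dotHelper(gpuDynInst,
//       [](uint32_t S0, uint32_t S1, uint32_t S2, bool clamp) -> uint32_t {
//           int32_t sum = int32_t(S2);
//           sum += int16_t(bits(S0, 15, 0)) * int16_t(bits(S1, 15, 0));
//           sum += int16_t(bits(S0, 31, 16)) * int16_t(bits(S1, 31, 16));
//           return uint32_t(sum);
//       });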
817
818 private:
820
821 template<typename T>
822 T
823 word(uint32_t data, int opSel, int neg, int opSelBit)
824 {
825 // This method assumes two words packed into a dword
826 static_assert(sizeof(T) == 2);
827
828 bool select = bits(opSel, opSelBit, opSelBit);
829 uint16_t raw = select ? bits(data, 31, 16)
830 : bits(data, 15, 0);
831
832 // Apply input modifiers. This may seem odd, but the hardware
833 // just flips the MSb instead of doing unary negation.
834 bool negate = bits(neg, opSelBit, opSelBit);
835 if (negate) {
836 raw ^= 0x8000;
837 }
838
839 return *reinterpret_cast<T*>(&raw);
840 }
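// Worked example (illustrative): word<uint16_t>(0x12345678, opSel, neg, 0)
// returns 0x5678 when bit 0 of opSel is clear and 0x1234 when it is set;
// if bit 0 of neg is also set, the MSb of the selected half is flipped,
// e.g. 0x5678 becomes 0xD678.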
841 }; // Inst_VOP3P
842
843 class Inst_VOP3P_MAI : public VEGAGPUStaticInst
844 {
845 public:
846 Inst_VOP3P_MAI(InFmt_VOP3P_MAI*, const std::string &opcode);
848
849 int instSize() const override;
850 void generateDisassembly() override;
851
852 void initOperandInfo() override;
853
854 protected:
855 // first instruction DWORD
857 // second instruction DWORD
859
860 private:
862 }; // Inst_VOP3P_MAI
863
864 class Inst_DS : public VEGAGPUStaticInst
865 {
866 public:
867 Inst_DS(InFmt_DS*, const std::string &opcode);
868 ~Inst_DS();
869
870 int instSize() const override;
871 void generateDisassembly() override;
872
873 void initOperandInfo() override;
874
875 protected:
876 template<typename T>
877 void
878 initMemRead(GPUDynInstPtr gpuDynInst, Addr offset)
879 {
880 Wavefront *wf = gpuDynInst->wavefront();
881
882 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
883 if (gpuDynInst->exec_mask[lane]) {
884 Addr vaddr = gpuDynInst->addr[lane] + offset;
885
886 (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
887 = wf->ldsChunk->read<T>(vaddr);
888 }
889 }
890 }
891
892 template<int N>
893 void
894 initMemRead(GPUDynInstPtr gpuDynInst, Addr offset)
895 {
896 Wavefront *wf = gpuDynInst->wavefront();
897
898 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
899 if (gpuDynInst->exec_mask[lane]) {
900 Addr vaddr = gpuDynInst->addr[lane] + offset;
901 for (int i = 0; i < N; ++i) {
902 (reinterpret_cast<VecElemU32*>(
903 gpuDynInst->d_data))[lane * N + i]
904 = wf->ldsChunk->read<VecElemU32>(
905 vaddr + i*sizeof(VecElemU32));
906 }
907 }
908 }
909 }
910
911 template<typename T>
912 void
913 initDualMemRead(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)
914 {
915 Wavefront *wf = gpuDynInst->wavefront();
916
917 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
918 if (gpuDynInst->exec_mask[lane]) {
919 Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
920 Addr vaddr1 = gpuDynInst->addr[lane] + offset1;
921
922 (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2]
923 = wf->ldsChunk->read<T>(vaddr0);
924 (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2 + 1]
925 = wf->ldsChunk->read<T>(vaddr1);
926 }
927 }
928 }
929
930 template<typename T>
931 void
932 initMemWrite(GPUDynInstPtr gpuDynInst, Addr offset)
933 {
934 Wavefront *wf = gpuDynInst->wavefront();
935
936 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
937 if (gpuDynInst->exec_mask[lane]) {
938 Addr vaddr = gpuDynInst->addr[lane] + offset;
939 wf->ldsChunk->write<T>(vaddr,
940 (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
941 }
942 }
943 }
944
945 template<int N>
946 void
947 initMemWrite(GPUDynInstPtr gpuDynInst, Addr offset)
948 {
949 Wavefront *wf = gpuDynInst->wavefront();
950
951 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
952 if (gpuDynInst->exec_mask[lane]) {
953 Addr vaddr = gpuDynInst->addr[lane] + offset;
954 for (int i = 0; i < N; ++i) {
955 wf->ldsChunk->write<VecElemU32>(
956 vaddr + i*sizeof(VecElemU32),
957 (reinterpret_cast<VecElemU32*>(
958 gpuDynInst->d_data))[lane * N + i]);
959 }
960 }
961 }
962 }
963
964 template<typename T>
965 void
966 initDualMemWrite(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)
967 {
968 Wavefront *wf = gpuDynInst->wavefront();
969
970 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
971 if (gpuDynInst->exec_mask[lane]) {
972 Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
973 Addr vaddr1 = gpuDynInst->addr[lane] + offset1;
974 wf->ldsChunk->write<T>(vaddr0, (reinterpret_cast<T*>(
975 gpuDynInst->d_data))[lane * 2]);
976 wf->ldsChunk->write<T>(vaddr1, (reinterpret_cast<T*>(
977 gpuDynInst->d_data))[lane * 2 + 1]);
978 }
979 }
980 }
981
982 template<typename T>
983 void
984 initAtomicAccess(GPUDynInstPtr gpuDynInst, Addr offset)
985 {
986 Wavefront *wf = gpuDynInst->wavefront();
987
988 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
989 if (gpuDynInst->exec_mask[lane]) {
990 Addr vaddr = gpuDynInst->addr[lane] + offset;
991
992 AtomicOpFunctorPtr amo_op =
993 gpuDynInst->makeAtomicOpFunctor<T>(
994 &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
995 &(reinterpret_cast<T*>(gpuDynInst->x_data))[lane]);
996
997 (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
998 = wf->ldsChunk->atomic<T>(vaddr, std::move(amo_op));
999 }
1000 }
1001 }
1002
1003 void
1004 calcAddr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &addr)
1005 {
1006 Wavefront *wf = gpuDynInst->wavefront();
1007
1008 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1009 if (wf->execMask(lane)) {
1010 gpuDynInst->addr.at(lane) = (Addr)addr[lane];
1011 }
1012 }
1013 }
1014
1015 // first instruction DWORD
1017 // second instruction DWORD
1019 }; // Inst_DS
1020
1021 class Inst_MUBUF : public VEGAGPUStaticInst
1022 {
1023 public:
1024 Inst_MUBUF(InFmt_MUBUF*, const std::string &opcode);
1025 ~Inst_MUBUF();
1026
1027 int instSize() const override;
1028 void generateDisassembly() override;
1029
1030 void initOperandInfo() override;
1031
1032 protected:
1033 template<typename T>
1034 void
1035 initMemRead(GPUDynInstPtr gpuDynInst)
1036 {
1037 // temporarily modify exec_mask to suppress memory accesses to oob
1038 // regions. Only issue memory requests for lanes that have their
1039 // exec_mask set and are not out of bounds.
1040 VectorMask old_exec_mask = gpuDynInst->exec_mask;
1041 gpuDynInst->exec_mask &= ~oobMask;
1042 initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
1043 gpuDynInst->exec_mask = old_exec_mask;
1044 }
1045
1046
1047 template<int N>
1048 void
1049 initMemRead(GPUDynInstPtr gpuDynInst)
1050 {
1051 // temporarily modify exec_mask to suppress memory accesses to oob
1052 // regions. Only issue memory requests for lanes that have their
1053 // exec_mask set and are not out of bounds.
1054 VectorMask old_exec_mask = gpuDynInst->exec_mask;
1055 gpuDynInst->exec_mask &= ~oobMask;
1056 initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
1057 gpuDynInst->exec_mask = old_exec_mask;
1058 }
1059
1060 template<typename T>
1061 void
1062 initMemWrite(GPUDynInstPtr gpuDynInst)
1063 {
1064 // temporarily modify exec_mask to suppress memory accesses to oob
1065 // regions. Only issue memory requests for lanes that have their
1066 // exec_mask set and are not out of bounds.
1067 VectorMask old_exec_mask = gpuDynInst->exec_mask;
1068 gpuDynInst->exec_mask &= ~oobMask;
1069 initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
1070 gpuDynInst->exec_mask = old_exec_mask;
1071 }
1072
1073 template<int N>
1074 void
1075 initMemWrite(GPUDynInstPtr gpuDynInst)
1076 {
1077 // temporarily modify exec_mask to suppress memory accesses to oob
1078 // regions. Only issue memory requests for lanes that have their
1079 // exec_mask set and are not out of bounds.
1080 VectorMask old_exec_mask = gpuDynInst->exec_mask;
1081 gpuDynInst->exec_mask &= ~oobMask;
1082 initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
1083 gpuDynInst->exec_mask = old_exec_mask;
1084 }
1085
1086 template<typename T>
1087 void
1088 initAtomicAccess(GPUDynInstPtr gpuDynInst)
1089 {
1090 // temporarily modify exec_mask to suppress memory accesses to oob
1091 // regions. Only issue memory requests for lanes that have their
1092 // exec_mask set and are not out of bounds.
1093 VectorMask old_exec_mask = gpuDynInst->exec_mask;
1094 gpuDynInst->exec_mask &= ~oobMask;
1095 initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
1096 gpuDynInst->exec_mask = old_exec_mask;
1097 }
1098
1099 void
1100 injectGlobalMemFence(GPUDynInstPtr gpuDynInst)
1101 {
1102 // create request and set flags
1103 gpuDynInst->resetEntireStatusVector();
1104 gpuDynInst->setStatusVector(0, 1);
1105 RequestPtr req = std::make_shared<Request>(0, 0, 0,
1106 gpuDynInst->computeUnit()->
1107 requestorId(), 0,
1108 gpuDynInst->wfDynId);
1109 gpuDynInst->setRequestFlags(req);
1110 gpuDynInst->computeUnit()->
1111 injectGlobalMemFence(gpuDynInst, false, req);
1112 }
1113
1134 template<typename VOFF, typename VIDX, typename SRSRC, typename SOFF>
1135 void
1136 calcAddr(GPUDynInstPtr gpuDynInst, VOFF v_off, VIDX v_idx,
1137 SRSRC s_rsrc_desc, SOFF s_offset, int inst_offset)
1138 {
1139 Addr vaddr = 0;
1140 Addr base_addr = 0;
1141 Addr stride = 0;
1142 Addr buf_idx = 0;
1143 Addr buf_off = 0;
1144 Addr buffer_offset = 0;
1145 BufferRsrcDescriptor rsrc_desc;
1146
1147 std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
1148 sizeof(BufferRsrcDescriptor));
1149
1150 base_addr = rsrc_desc.baseAddr;
1151
1152 stride = rsrc_desc.addTidEn ? ((rsrc_desc.dataFmt << 14)
1153 + rsrc_desc.stride) : rsrc_desc.stride;
1154
1155 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1156 if (gpuDynInst->exec_mask[lane]) {
1157 vaddr = base_addr + s_offset.rawData();
1163 buf_idx = v_idx[lane] + (rsrc_desc.addTidEn ? lane : 0);
1164
1165 buf_off = v_off[lane] + inst_offset;
1166
1167 if (rsrc_desc.swizzleEn) {
1168 Addr idx_stride = 8 << rsrc_desc.idxStride;
1169 Addr elem_size = 2 << rsrc_desc.elemSize;
1170 Addr idx_msb = buf_idx / idx_stride;
1171 Addr idx_lsb = buf_idx % idx_stride;
1172 Addr off_msb = buf_off / elem_size;
1173 Addr off_lsb = buf_off % elem_size;
1174 DPRINTF(VEGA, "mubuf swizzled lane %d: "
1175 "idx_stride = %llx, elem_size = %llx, "
1176 "idx_msb = %llx, idx_lsb = %llx, "
1177 "off_msb = %llx, off_lsb = %llx\n",
1178 lane, idx_stride, elem_size, idx_msb, idx_lsb,
1179 off_msb, off_lsb);
1180
1181 buffer_offset = (idx_msb * stride + off_msb * elem_size)
1182 * idx_stride + idx_lsb * elem_size + off_lsb;
1183 } else {
1184 buffer_offset = buf_off + stride * buf_idx;
1185 }
1186
1187
1195 if (rsrc_desc.stride == 0 || !rsrc_desc.swizzleEn) {
1196 if (buffer_offset >=
1197 rsrc_desc.numRecords - s_offset.rawData()) {
1198 DPRINTF(VEGA, "mubuf out-of-bounds condition 1: "
1199 "lane = %d, buffer_offset = %llx, "
1200 "const_stride = %llx, "
1201 "const_num_records = %llx\n",
1202 lane, buf_off + stride * buf_idx,
1203 stride, rsrc_desc.numRecords);
1204 oobMask.set(lane);
1205 continue;
1206 }
1207 }
1208
1209 if (rsrc_desc.stride != 0 && rsrc_desc.swizzleEn) {
1210 if (buf_idx >= rsrc_desc.numRecords ||
1211 buf_off >= stride) {
1212 DPRINTF(VEGA, "mubuf out-of-bounds condition 2: "
1213 "lane = %d, offset = %llx, "
1214 "index = %llx, "
1215 "const_num_records = %llx\n",
1216 lane, buf_off, buf_idx,
1217 rsrc_desc.numRecords);
1218 oobMask.set(lane);
1219 continue;
1220 }
1221 }
1222
1223 vaddr += buffer_offset;
1224
1225 DPRINTF(VEGA, "Calculating mubuf address for lane %d: "
1226 "vaddr = %llx, base_addr = %llx, "
1227 "stride = %llx, buf_idx = %llx, buf_off = %llx\n",
1228 lane, vaddr, base_addr, stride,
1229 buf_idx, buf_off);
1230 gpuDynInst->addr.at(lane) = vaddr;
1231 }
1232 }
1233 }
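// Worked example (assumed values, unswizzled case): with
// baseAddr = 0x1000, stride = 16, swizzleEn = 0, addTidEn = 0,
// s_offset = 0, inst_offset = 4, v_idx[lane] = 3 and v_off[lane] = 8,
// the buffer offset is (8 + 4) + 16 * 3 = 60 and the lane's address is
// 0x103C, provided 60 stays below numRecords so the lane is not marked
// in oobMask.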
1234
1235 // first instruction DWORD
1237 // second instruction DWORD
1239 // Mask of lanes with out-of-bounds accesses. Needs to be tracked
1240 // separately from the exec_mask so that we remember to write zero
1241 // to the registers associated with out of bounds lanes.
1242 VectorMask oobMask;
1243 }; // Inst_MUBUF
1244
1245 class Inst_MTBUF : public VEGAGPUStaticInst
1246 {
1247 public:
1248 Inst_MTBUF(InFmt_MTBUF*, const std::string &opcode);
1249 ~Inst_MTBUF();
1250
1251 int instSize() const override;
1252 void initOperandInfo() override;
1253
1254 protected:
1255 // first instruction DWORD
1257 // second instruction DWORD
1259
1260 private:
1262 }; // Inst_MTBUF
1263
1264 class Inst_MIMG : public VEGAGPUStaticInst
1265 {
1266 public:
1267 Inst_MIMG(InFmt_MIMG*, const std::string &opcode);
1268 ~Inst_MIMG();
1269
1270 int instSize() const override;
1271 void initOperandInfo() override;
1272
1273 protected:
1274 // first instruction DWORD
1276 // second instruction DWORD
1278 }; // Inst_MIMG
1279
1280 class Inst_EXP : public VEGAGPUStaticInst
1281 {
1282 public:
1283 Inst_EXP(InFmt_EXP*, const std::string &opcode);
1284 ~Inst_EXP();
1285
1286 int instSize() const override;
1287 void initOperandInfo() override;
1288
1289 protected:
1290 // first instruction DWORD
1292 // second instruction DWORD
1294 }; // Inst_EXP
1295
1296 class Inst_FLAT : public VEGAGPUStaticInst
1297 {
1298 public:
1299 Inst_FLAT(InFmt_FLAT*, const std::string &opcode);
1300 ~Inst_FLAT();
1301
1302 int instSize() const override;
1303 void generateDisassembly() override;
1304
1305 void initOperandInfo() override;
1306
1307 protected:
1308 template<typename T>
1309 void
1310 initMemRead(GPUDynInstPtr gpuDynInst)
1311 {
1312 if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
1313 initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
1314 } else if (gpuDynInst->executedAs() == enums::SC_PRIVATE) {
1315 // Accesses with more than one dword need to be swizzled and
1316 // should use the template<int N> version of this method.
1317 static_assert(sizeof(T) <= 4);
1318 initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
1319 } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
1320 Wavefront *wf = gpuDynInst->wavefront();
1321 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1322 if (gpuDynInst->exec_mask[lane]) {
1323 Addr vaddr = gpuDynInst->addr[lane];
1324 (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
1325 = wf->ldsChunk->read<T>(vaddr);
1326 }
1327 }
1328 }
1329 }
1330
1331 template<int N>
1332 void
1333 initMemRead(GPUDynInstPtr gpuDynInst)
1334 {
1335 if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
1336 initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
1337 } else if (gpuDynInst->executedAs() == enums::SC_PRIVATE) {
1338 initScratchReqHelper<N>(gpuDynInst, MemCmd::ReadReq);
1339 } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
1340 Wavefront *wf = gpuDynInst->wavefront();
1341 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1342 if (gpuDynInst->exec_mask[lane]) {
1343 Addr vaddr = gpuDynInst->addr[lane];
1344 for (int i = 0; i < N; ++i) {
1345 (reinterpret_cast<VecElemU32*>(
1346 gpuDynInst->d_data))[lane * N + i]
1347 = wf->ldsChunk->read<VecElemU32>(
1348 vaddr + i*sizeof(VecElemU32));
1349 }
1350 }
1351 }
1352 }
1353 }
1354
1355 template<typename T>
1356 void
1357 initMemWrite(GPUDynInstPtr gpuDynInst)
1358 {
1359 if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
1360 initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
1361 } else if (gpuDynInst->executedAs() == enums::SC_PRIVATE) {
1362 // Stores with more than one dword need to be swizzled and
1363 // should use the template<int N> version of this method.
1364 static_assert(sizeof(T) <= 4);
1365 initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
1366 } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
1367 Wavefront *wf = gpuDynInst->wavefront();
1368 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1369 if (gpuDynInst->exec_mask[lane]) {
1370 Addr vaddr = gpuDynInst->addr[lane];
1371 wf->ldsChunk->write<T>(vaddr,
1372 (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
1373 }
1374 }
1375 }
1376 }
1377
1378 template<int N>
1379 void
1380 initMemWrite(GPUDynInstPtr gpuDynInst)
1381 {
1382 if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
1383 initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
1384 } else if (gpuDynInst->executedAs() == enums::SC_PRIVATE) {
1385 swizzleData<N>(gpuDynInst);
1386 initScratchReqHelper<N>(gpuDynInst, MemCmd::WriteReq);
1387 } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
1388 Wavefront *wf = gpuDynInst->wavefront();
1389 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1390 if (gpuDynInst->exec_mask[lane]) {
1391 Addr vaddr = gpuDynInst->addr[lane];
1392 for (int i = 0; i < N; ++i) {
1393 wf->ldsChunk->write<VecElemU32>(
1394 vaddr + i*sizeof(VecElemU32),
1395 (reinterpret_cast<VecElemU32*>(
1396 gpuDynInst->d_data))[lane * N + i]);
1397 }
1398 }
1399 }
1400 }
1401 }
1402
1403 template<typename T>
1404 void
1405 initAtomicAccess(GPUDynInstPtr gpuDynInst)
1406 {
1407 // Flat scratch requests may not be atomic according to ISA manual
1408 // up to MI200. See MI200 manual Table 45.
1409 assert(gpuDynInst->executedAs() != enums::SC_PRIVATE);
1410
1411 if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
1412 initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
1413 } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
1414 Wavefront *wf = gpuDynInst->wavefront();
1415 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1416 if (gpuDynInst->exec_mask[lane]) {
1417 Addr vaddr = gpuDynInst->addr[lane];
1418 auto amo_op =
1419 gpuDynInst->makeAtomicOpFunctor<T>(
1420 &(reinterpret_cast<T*>(
1421 gpuDynInst->a_data))[lane],
1422 &(reinterpret_cast<T*>(
1423 gpuDynInst->x_data))[lane]);
1424
1425 T tmp = wf->ldsChunk->read<T>(vaddr);
1426 (*amo_op)(reinterpret_cast<uint8_t *>(&tmp));
1427 wf->ldsChunk->write<T>(vaddr, tmp);
1428 (reinterpret_cast<T*>(gpuDynInst->d_data))[lane] = tmp;
1429 }
1430 }
1431 }
1432 }
1433
1434 void
1435 calcAddr(GPUDynInstPtr gpuDynInst, ScalarRegU32 vaddr,
1436 ScalarRegU32 saddr, ScalarRegI32 offset)
1437 {
1438 // Offset is a 13-bit field w/the following meanings:
1439 // In Flat instructions, offset is a 12-bit unsigned number
1440 // In Global/Scratch instructions, offset is a 13-bit signed number
1441 if (isFlat()) {
1442 offset = offset & 0xfff;
1443 } else {
1444 offset = (ScalarRegI32)sext<13>(offset);
1445 }
1446 // If saddr = 0x7f there is no scalar reg to read and address will
1447 // be a 64-bit address. Otherwise, saddr is the reg index for a
1448 // scalar reg used as the base address for a 32-bit address.
1449 if ((saddr == 0x7f && isFlatGlobal()) || isFlat()) {
1450 ConstVecOperandU64 vbase(gpuDynInst, vaddr);
1451 vbase.read();
1452
1453 calcAddrVgpr(gpuDynInst, vbase, offset);
1454 } else if (isFlatGlobal()) {
1455 // Assume we are operating in 64-bit mode and read a pair of
1456 // SGPRs for the address base.
1457 ConstScalarOperandU64 sbase(gpuDynInst, saddr);
1458 sbase.read();
1459
1460 ConstVecOperandU32 voffset(gpuDynInst, vaddr);
1461 voffset.read();
1462
1463 calcAddrSgpr(gpuDynInst, voffset, sbase, offset);
1464 // For scratch, saddr = 0x7f there is no scalar reg to read and
1465 // a vgpr will be used for address offset. Otherwise, saddr is
1466 // the sgpr index holding the address offset. For scratch
1467 // instructions the offset GPR is always 32-bits.
1468 } else if (saddr != 0x7f) {
1469 assert(isFlatScratch());
1470
1471 ConstScalarOperandU32 soffset(gpuDynInst, saddr);
1472 soffset.read();
1473
1474 ConstVecOperandU32 voffset(gpuDynInst, vaddr);
1475 if (instData.SVE) {
1476 voffset.read();
1477 }
1478
1479 Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
1480
1481 int elemSize;
1482 auto staticInst = gpuDynInst->staticInstruction();
1483 if (gpuDynInst->isLoad()) {
1484 elemSize = staticInst->getOperandSize(2);
1485 } else {
1486 assert(gpuDynInst->isStore());
1487 elemSize = staticInst->getOperandSize(1);
1488 }
1489
1490 unsigned swizzleOffset = soffset.rawData() + offset;
1491 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1492 if (gpuDynInst->exec_mask[lane]) {
1493 swizzleOffset += instData.SVE ? voffset[lane] : 0;
1494 gpuDynInst->addr.at(lane) = flat_scratch_addr
1495 + swizzleAddr(swizzleOffset, lane, elemSize);
1496 }
1497 }
1498 } else {
1499 assert(isFlatScratch());
1500
1501 ConstVecOperandU32 voffset(gpuDynInst, vaddr);
1502 if (instData.SVE) {
1503 voffset.read();
1504 }
1505
1506 Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
1507
1508 int elemSize;
1509 auto staticInst = gpuDynInst->staticInstruction();
1510 if (gpuDynInst->isLoad()) {
1511 elemSize = staticInst->getOperandSize(2);
1512 } else {
1513 assert(gpuDynInst->isStore());
1514 elemSize = staticInst->getOperandSize(1);
1515 }
1516
1517 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1518 if (gpuDynInst->exec_mask[lane]) {
1519 VecElemU32 vgpr_offset =
1520 instData.SVE ? voffset[lane] : 0;
1521
1522 gpuDynInst->addr.at(lane) = flat_scratch_addr
1523 + swizzleAddr(vgpr_offset+offset, lane, elemSize);
1524 }
1525 }
1526 }
1527
1528 if (isFlat()) {
1529 gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
1530 } else if (isFlatGlobal()) {
1531 gpuDynInst->staticInstruction()->executed_as =
1532 enums::SC_GLOBAL;
1533 } else {
1534 assert(isFlatScratch());
1535 gpuDynInst->staticInstruction()->executed_as =
1536 enums::SC_PRIVATE;
1537 gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
1538 }
1539 }
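// Summary example (illustrative): for a global access with SADDR == 0x7f
// the 64-bit VGPR pair supplies the whole address and the sign-extended
// 13-bit offset is simply added; with a valid SADDR the SGPR pair is the
// base and the 32-bit VGPR only contributes a per-lane offset; scratch
// accesses additionally swizzle the offset and add the flat scratch base.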
1540
1541 void
1542 issueRequestHelper(GPUDynInstPtr gpuDynInst)
1543 {
1544 if ((gpuDynInst->executedAs() == enums::SC_GLOBAL && isFlat())
1545 || isFlatGlobal()) {
1546 gpuDynInst->computeUnit()->globalMemoryPipe
1547 .issueRequest(gpuDynInst);
1548 } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
1549 assert(isFlat());
1550 gpuDynInst->computeUnit()->localMemoryPipe
1551 .issueRequest(gpuDynInst);
1552 } else {
1553 assert(gpuDynInst->executedAs() == enums::SC_PRIVATE);
1554 gpuDynInst->computeUnit()->globalMemoryPipe
1555 .issueRequest(gpuDynInst);
1556 }
1557 }
1558
1559 // Execute for atomics is identical aside from the flag set in the
1560 // constructor, except for cmpswap. For cmpswap, the offset to the "cmp"
1561 // register is needed. For all other operations this offset is zero
1562 // and implies the atomic is not a cmpswap.
1563 // RegT defines the type of GPU register (e.g., ConstVecOperandU32)
1564 // LaneT defines the type of the register elements (e.g., VecElemU32)
1565 template<typename RegT, typename LaneT, int CmpRegOffset = 0>
1566 void
1567 atomicExecute(GPUDynInstPtr gpuDynInst)
1568 {
1569 Wavefront *wf = gpuDynInst->wavefront();
1570
1571 if (gpuDynInst->exec_mask.none()) {
1572 wf->decVMemInstsIssued();
1573 if (isFlat()) {
1574 wf->decLGKMInstsIssued();
1575 }
1576 return;
1577 }
1578
1579 gpuDynInst->execUnitId = wf->execUnitId;
1580 gpuDynInst->latency.init(gpuDynInst->computeUnit());
1581 gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
1582
1583 RegT data(gpuDynInst, extData.DATA);
1584 RegT cmp(gpuDynInst, extData.DATA + CmpRegOffset);
1585
1586 data.read();
1587 if constexpr (CmpRegOffset) {
1588 cmp.read();
1589 }
1590
1592
1593 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1594 if (gpuDynInst->exec_mask[lane]) {
1595 if constexpr (CmpRegOffset) {
1596 (reinterpret_cast<VecElemU32*>(
1597 gpuDynInst->x_data))[lane] = data[lane];
1598 (reinterpret_cast<VecElemU32*>(
1599 gpuDynInst->a_data))[lane] = cmp[lane];
1600 } else {
1601 (reinterpret_cast<LaneT*>(gpuDynInst->a_data))[lane]
1602 = data[lane];
1603 }
1604 }
1605 }
1606
1607 issueRequestHelper(gpuDynInst);
1608 }
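// Usage sketch (hypothetical): a 32-bit flat atomic would typically call
// atomicExecute<ConstVecOperandU32, VecElemU32>(gpuDynInst) from its
// execute() method, a 32-bit cmpswap would pass CmpRegOffset = 1 to also
// read the "cmp" register, and the returning variant would later call
// atomicComplete<VecOperandU32, VecElemU32>(gpuDynInst) to write the old
// memory value back to VDST.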
1609
1610 // RegT defines the type of GPU register (e.g., ConstVecOperandU32)
1611 // LaneT defines the type of the register elements (e.g., VecElemU32)
1612 template<typename RegT, typename LaneT>
1613 void
1614 atomicComplete(GPUDynInstPtr gpuDynInst)
1615 {
1616 if (isAtomicRet()) {
1617 RegT vdst(gpuDynInst, extData.VDST);
1618
1619 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1620 if (gpuDynInst->exec_mask[lane]) {
1621 vdst[lane] = (reinterpret_cast<LaneT*>(
1622 gpuDynInst->d_data))[lane];
1623 }
1624 }
1625
1626 vdst.write();
1627 }
1628 }
1629
1630 // Swizzle memory such that dwords from each lane are interleaved.
1631 // For example, a global_store_dwordx2 where every lane has two dwords
1632 // A and B would write A B A B, A B ... A B in contiguous memory while
1633 // scratch should write A A ... A B B ... B for 64 x2 total dwords.
1634 // Only applies to >1 dword.
1635 template<int N>
1636 void
1637 swizzleData(GPUDynInstPtr gpuDynInst)
1638 {
1639 static_assert(N > 1);
1640
1641 uint32_t data[N * NumVecElemPerVecReg];
1642 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1643 for (int dword = 0; dword < N; ++dword) {
1644 data[dword * NumVecElemPerVecReg + lane] =
1645 (reinterpret_cast<VecElemU32*>(
1646 gpuDynInst->d_data))[lane * N + dword];
1647 }
1648 }
1649 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1650 for (int dword = 0; dword < N; ++dword) {
1651 (reinterpret_cast<VecElemU32*>(
1652 gpuDynInst->d_data))[lane * N + dword] =
1653 data[lane * N + dword];
1654 }
1655 }
1656 }
1657
1658 bool
1660 {
1661 return (extData.SADDR != 0x7f);
1662 }
1663
1664 // first instruction DWORD
1666 // second instruction DWORD
1668
1669 private:
1670 void initFlatOperandInfo();
1672
1675
1676 void
1677 calcAddrSgpr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &vaddr,
1678 ConstScalarOperandU64 &saddr, ScalarRegI32 offset)
1679 {
1680 // Use SGPR pair as a base address and add VGPR-offset and
1681 // instruction offset. The VGPR-offset is always 32-bits so we
1682 // mask any upper bits from the vaddr.
1683 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1684 if (gpuDynInst->exec_mask[lane]) {
1685 ScalarRegI32 voffset = vaddr[lane];
1686 gpuDynInst->addr.at(lane) =
1687 saddr.rawData() + voffset + offset;
1688 }
1689 }
1690 }
1691
1692 void
1693 calcAddrVgpr(GPUDynInstPtr gpuDynInst, ConstVecOperandU64 &addr,
1694 ScalarRegI32 offset)
1695 {
1696 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1697 if (gpuDynInst->exec_mask[lane]) {
1698 gpuDynInst->addr.at(lane) = addr[lane] + offset;
1699 }
1700 }
1701 }
1702
1703 VecElemU32
1704 swizzleAddr(VecElemU32 offset, int lane, int elem_size)
1705 {
1706 // This is not described in the spec. We use the swizzle from
1707 // buffer memory instructions and fix the stride to 4. Multiply
1708 // the thread ID by the storage size to avoid threads clobbering
1709 // their data.
1710 return ((offset / 4) * 4 * 64)
1711 + (offset % 4) + (lane * elem_size);
1712 }
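// Worked example: swizzleAddr(5, 2, 4) = (5 / 4) * 4 * 64 + (5 % 4)
// + 2 * 4 = 256 + 1 + 8 = 265, i.e. scratch bytes are interleaved in
// 4-byte chunks across the 64 lanes of a wavefront.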
1713
1714 Addr
1715 readFlatScratch(GPUDynInstPtr gpuDynInst)
1716 {
1717 return gpuDynInst->computeUnit()->shader->getScratchBase();
1718 }
1719 }; // Inst_FLAT
1720} // namespace VegaISA
1721} // namespace gem5
1722
1723#endif // __ARCH_VEGA_INSTS_OP_ENCODINGS_HH__