gem5 v24.0.0.0
op_encodings.hh
1/*
2 * Copyright (c) 2016-2021 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#ifndef __ARCH_VEGA_INSTS_OP_ENCODINGS_HH__
33#define __ARCH_VEGA_INSTS_OP_ENCODINGS_HH__
34
39#include "debug/GPUExec.hh"
40#include "debug/VEGA.hh"
42
43namespace gem5
44{
45
46namespace VegaISA
47{
48 struct BufferRsrcDescriptor
49 {
50 uint64_t baseAddr : 48;
51 uint32_t stride : 14;
52 uint32_t cacheSwizzle : 1;
53 uint32_t swizzleEn : 1;
54 uint32_t numRecords : 32;
55 uint32_t dstSelX : 3;
56 uint32_t dstSelY : 3;
57 uint32_t dstSelZ : 3;
58 uint32_t dstSelW : 3;
59 uint32_t numFmt : 3;
60 uint32_t dataFmt : 4;
61 uint32_t elemSize : 2;
62 uint32_t idxStride : 2;
63 uint32_t addTidEn : 1;
64 uint32_t atc : 1;
65 uint32_t hashEn : 1;
66 uint32_t heap : 1;
67 uint32_t mType : 3;
68 uint32_t type : 2;
69 };
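// The bit fields above add up to 128 bits: this struct mirrors the buffer
// resource descriptor (V#) that buffer instructions read from four
// consecutive SGPRs. Callers below unpack it with a raw copy, a sketch of
// the pattern used in the calcAddr helpers further down:
//
//   BufferRsrcDescriptor rsrc_desc;
//   std::memcpy(&rsrc_desc, s_rsrc_desc.rawDataPtr(),
//               sizeof(BufferRsrcDescriptor));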
70
71 // --- purely virtual instruction classes ---
72
74 {
75 public:
76 Inst_SOP2(InFmt_SOP2*, const std::string &opcode);
77
78 int instSize() const override;
79 void generateDisassembly() override;
80
81 void initOperandInfo() override;
82
83 protected:
84 // first instruction DWORD
86 // possible second DWORD
88 uint32_t varSize;
89
90 private:
92 }; // Inst_SOP2
93
95 {
96 public:
97 Inst_SOPK(InFmt_SOPK*, const std::string &opcode);
98 ~Inst_SOPK();
99
100 int instSize() const override;
101 void generateDisassembly() override;
102
103 void initOperandInfo() override;
104
105 protected:
106 // first instruction DWORD
108 // possible second DWORD
110 uint32_t varSize;
111
112 private:
114 }; // Inst_SOPK
115
117 {
118 public:
119 Inst_SOP1(InFmt_SOP1*, const std::string &opcode);
120 ~Inst_SOP1();
121
122 int instSize() const override;
123 void generateDisassembly() override;
124
125 void initOperandInfo() override;
126
127 protected:
128 // first instruction DWORD
130 // possible second DWORD
132 uint32_t varSize;
133
134 private:
136 }; // Inst_SOP1
137
139 {
140 public:
141 Inst_SOPC(InFmt_SOPC*, const std::string &opcode);
142 ~Inst_SOPC();
143
144 int instSize() const override;
145 void generateDisassembly() override;
146
147 void initOperandInfo() override;
148
149 protected:
150 // first instruction DWORD
152 // possible second DWORD
154 uint32_t varSize;
155
156 private:
158 }; // Inst_SOPC
159
161 {
162 public:
163 Inst_SOPP(InFmt_SOPP*, const std::string &opcode);
164 ~Inst_SOPP();
165
166 int instSize() const override;
167 void generateDisassembly() override;
168
169 void initOperandInfo() override;
170
171 protected:
172 // first instruction DWORD
174 }; // Inst_SOPP
175
177 {
178 public:
179 Inst_SMEM(InFmt_SMEM*, const std::string &opcode);
180 ~Inst_SMEM();
181
182 int instSize() const override;
183 void generateDisassembly() override;
184
185 void initOperandInfo() override;
186
187 protected:
191 template<int N>
192 void
193 initMemRead(GPUDynInstPtr gpuDynInst)
194 {
195 initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,
196 MemCmd::ReadReq);
197 }
198
202 template<int N>
203 void
204 initMemWrite(GPUDynInstPtr gpuDynInst)
205 {
206 initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,
207 MemCmd::WriteReq);
208 }
209
213 void
214 calcAddr(GPUDynInstPtr gpu_dyn_inst, ConstScalarOperandU64 &addr,
215 ScalarRegU32 offset)
216 {
217 Addr vaddr = ((addr.rawData() + offset) & ~0x3);
218 gpu_dyn_inst->scalarAddr = vaddr;
219 }
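// Example with illustrative values: addr = 0x1000 and offset = 0x6 give
// (0x1000 + 0x6) & ~0x3 = 0x1004, i.e. the scalar address is always
// rounded down to a dword boundary.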
220
226 void
227 calcAddr(GPUDynInstPtr gpu_dyn_inst,
228 ConstScalarOperandU128 &s_rsrc_desc, ScalarRegU32 offset)
229 {
230 BufferRsrcDescriptor rsrc_desc;
231 ScalarRegU32 clamped_offset(offset);
232 std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
233 sizeof(BufferRsrcDescriptor));
234
240 if (!rsrc_desc.stride && offset >= rsrc_desc.numRecords) {
241 clamped_offset = rsrc_desc.numRecords;
242 } else if (rsrc_desc.stride && offset
243 > (rsrc_desc.stride * rsrc_desc.numRecords)) {
244 clamped_offset = (rsrc_desc.stride * rsrc_desc.numRecords);
245 }
246
247 Addr vaddr = ((rsrc_desc.baseAddr + clamped_offset) & ~0x3);
248 gpu_dyn_inst->scalarAddr = vaddr;
249 }
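// Example with illustrative values: stride = 0, numRecords = 64 and
// offset = 100 clamp the offset to 64, so the request goes to
// (baseAddr + 64) & ~0x3 rather than past the end of the buffer.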
250
251 // first instruction DWORD
253 // second instruction DWORD
255 }; // Inst_SMEM
256
258 {
259 public:
260 Inst_VOP2(InFmt_VOP2*, const std::string &opcode);
261 ~Inst_VOP2();
262
263 int instSize() const override;
264 void generateDisassembly() override;
265
266 void initOperandInfo() override;
267
268 protected:
269 // first instruction DWORD
271 // possible second DWORD
273 uint32_t varSize;
274
275 template<typename T>
276 T sdwaSrcHelper(GPUDynInstPtr gpuDynInst, T & src1)
277 {
278 T src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
279 // use copies of original src0, src1, and dest during selecting
280 T origSrc0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
281 T origSrc1(gpuDynInst, instData.VSRC1);
282
283 src0_sdwa.read();
284 origSrc0_sdwa.read();
285 origSrc1.read();
286
287 DPRINTF(VEGA, "Handling %s SRC SDWA. SRC0: register v[%d], "
288 "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, SRC0_SEXT: "
289 "%d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, SRC1_SEXT: %d, "
290 "SRC1_NEG: %d, SRC1_ABS: %d\n",
291 opcode().c_str(), extData.iFmt_VOP_SDWA.SRC0,
300
301 processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
302 src1, origSrc1);
303
304 return src0_sdwa;
305 }
306
307 template<typename T>
308 void sdwaDstHelper(GPUDynInstPtr gpuDynInst, T & vdst)
309 {
310 T origVdst(gpuDynInst, instData.VDST);
311
312 Wavefront *wf = gpuDynInst->wavefront();
313 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
314 if (wf->execMask(lane)) {
315 origVdst[lane] = vdst[lane]; // keep copy consistent
316 }
317 }
318
319 processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
320 }
321
322 template<typename T>
323 T dppHelper(GPUDynInstPtr gpuDynInst, T & src1)
324 {
325 T src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
326 src0_dpp.read();
327
328 DPRINTF(VEGA, "Handling %s SRC DPP. SRC0: register v[%d], "
329 "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, SRC1_ABS: %d, "
330 "SRC1_NEG: %d, BC: %d, BANK_MASK: %d, ROW_MASK: %d\n",
331 opcode().c_str(), extData.iFmt_VOP_DPP.SRC0,
336
337 processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);
338
339 return src0_dpp;
340 }
341
342 template<typename ConstT, typename T>
343 void vop2Helper(GPUDynInstPtr gpuDynInst,
344 void (*fOpImpl)(T&, T&, T&, Wavefront*))
345 {
346 Wavefront *wf = gpuDynInst->wavefront();
347 T src0(gpuDynInst, instData.SRC0);
348 T src1(gpuDynInst, instData.VSRC1);
349 T vdst(gpuDynInst, instData.VDST);
350
351 src0.readSrc();
352 src1.read();
353
354 if (isSDWAInst()) {
355 T src0_sdwa = sdwaSrcHelper(gpuDynInst, src1);
356 fOpImpl(src0_sdwa, src1, vdst, wf);
357 sdwaDstHelper(gpuDynInst, vdst);
358 } else if (isDPPInst()) {
359 T src0_dpp = dppHelper(gpuDynInst, src1);
360 fOpImpl(src0_dpp, src1, vdst, wf);
361 } else {
362 // src0 is unmodified. We need to use the const container
363 // type to allow reading scalar operands from src0. Only
364 // src0 can index scalar operands. We copy this to vdst
365 // temporarily to pass to the lambda so the instruction
366 // does not need to write two lambda functions (one for
367 // a const src0 and one for a mutable src0).
368 ConstT const_src0(gpuDynInst, instData.SRC0);
369 const_src0.readSrc();
370
371 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
372 vdst[lane] = const_src0[lane];
373 }
374 fOpImpl(vdst, src1, vdst, wf);
375 }
376
377 vdst.write();
378 }
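// Hypothetical sketch (not a real instruction in this file) of how a VOP2
// op would use the helper above; a captureless lambda converts to the
// plain function pointer expected by fOpImpl:
//
//   void
//   Inst_V_EXAMPLE_ADD_U32::execute(GPUDynInstPtr gpuDynInst)
//   {
//       vop2Helper<ConstVecOperandU32, VecOperandU32>(gpuDynInst,
//           [](VecOperandU32 &src0, VecOperandU32 &src1,
//              VecOperandU32 &vdst, Wavefront *wf) {
//               for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
//                   if (wf->execMask(lane)) {
//                       vdst[lane] = src0[lane] + src1[lane];
//                   }
//               }
//           });
//   }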
379
380 private:
382 }; // Inst_VOP2
383
385 {
386 public:
387 Inst_VOP1(InFmt_VOP1*, const std::string &opcode);
388 ~Inst_VOP1();
389
390 int instSize() const override;
391 void generateDisassembly() override;
392
393 void initOperandInfo() override;
394
395 protected:
396 // first instruction DWORD
398 // possible second DWORD
400 uint32_t varSize;
401
402 private:
404 }; // Inst_VOP1
405
407 {
408 public:
409 Inst_VOPC(InFmt_VOPC*, const std::string &opcode);
410 ~Inst_VOPC();
411
412 int instSize() const override;
413 void generateDisassembly() override;
414
415 void initOperandInfo() override;
416
417 protected:
418 // first instruction DWORD
420 // possible second DWORD
422 uint32_t varSize;
423
424 private:
426 }; // Inst_VOPC
427
429 {
430 public:
431 Inst_VINTRP(InFmt_VINTRP*, const std::string &opcode);
432 ~Inst_VINTRP();
433
434 int instSize() const override;
435
436 protected:
437 // first instruction DWORD
439 }; // Inst_VINTRP
440
442 {
443 public:
444 Inst_VOP3A(InFmt_VOP3A*, const std::string &opcode, bool sgpr_dst);
445 ~Inst_VOP3A();
446
447 int instSize() const override;
448 void generateDisassembly() override;
449
450 void initOperandInfo() override;
451
452 protected:
453 // first instruction DWORD
455 // second instruction DWORD
457
458 // Output modifier for VOP3 instructions. This 2-bit field can be set
459 // to "0" to do nothing, "1" to multiply output value by 2, "2" to
460 // multiply output value by 4, or "3" to divide output value by 2. If
461 // the instruction supports clamping, this is applied *before* clamp
462 // but after the abs and neg modifiers.
463 template<typename T>
464 T omodModifier(T val, unsigned omod)
465 {
466 assert(omod < 4);
467
468 if constexpr (std::is_floating_point_v<T>) {
469 if (omod == 1) return val * T(2.0f);
470 if (omod == 2) return val * T(4.0f);
471 if (omod == 3) return val / T(2.0f);
472 } else {
473 assert(std::is_integral_v<T>);
474 if (omod == 1) return val * T(2);
475 if (omod == 2) return val * T(4);
476 if (omod == 3) return val / T(2);
477 }
478
479 return val;
480 }
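// Example with illustrative values: omodModifier(1.5f, 3) returns 0.75f
// (divide by 2), omodModifier(6, 1) returns 12 for an integral type
// (multiply by 2), and omod == 0 leaves the value untouched.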
481 private:
493 const bool sgprDst;
494 }; // Inst_VOP3A
495
497 {
498 public:
499 Inst_VOP3B(InFmt_VOP3B*, const std::string &opcode);
500 ~Inst_VOP3B();
501
502 int instSize() const override;
503 void generateDisassembly() override;
504
505 void initOperandInfo() override;
506
507 protected:
508 // first instruction DWORD
510 // second instruction DWORD
512
513 private:
515 }; // Inst_VOP3B
516
518 {
519 public:
520 Inst_VOP3P(InFmt_VOP3P*, const std::string &opcode);
521 ~Inst_VOP3P();
522
523 int instSize() const override;
524 void generateDisassembly() override;
525
526 void initOperandInfo() override;
527
528 protected:
529 // first instruction DWORD
531 // second instruction DWORD
533
534 template<typename T>
535 void vop3pHelper(GPUDynInstPtr gpuDynInst,
536 T (*fOpImpl)(T, T, bool))
537 {
538 Wavefront *wf = gpuDynInst->wavefront();
539 ConstVecOperandU32 S0(gpuDynInst, extData.SRC0);
540 ConstVecOperandU32 S1(gpuDynInst, extData.SRC1);
541 VecOperandU32 D(gpuDynInst, instData.VDST);
542
543 S0.readSrc();
544 S1.readSrc();
545
546 int opLo = instData.OPSEL;
547 int opHi = instData.OPSEL_HI2 << 2 | extData.OPSEL_HI;
548 int negLo = extData.NEG;
549 int negHi = instData.NEG_HI;
550 bool clamp = instData.CLMP;
551 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
552 if (wf->execMask(lane)) {
553 T upper_val = fOpImpl(word<T>(S0[lane], opHi, negHi, 0),
554 word<T>(S1[lane], opHi, negHi, 1),
555 clamp);
556 T lower_val = fOpImpl(word<T>(S0[lane], opLo, negLo, 0),
557 word<T>(S1[lane], opLo, negLo, 1),
558 clamp);
559
560 uint16_t upper_raw =
561 *reinterpret_cast<uint16_t*>(&upper_val);
562 uint16_t lower_raw =
563 *reinterpret_cast<uint16_t*>(&lower_val);
564
565 D[lane] = upper_raw << 16 | lower_raw;
566 }
567 }
568
569 D.write();
570 }
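// Hypothetical sketch of a packed 16-bit op built on the helper above
// (the operation shown is illustrative): each 32-bit lane holds two
// 16-bit values, fOpImpl runs once on the OPSEL-selected halves and once
// on the OPSEL_HI-selected halves, and the two results are repacked.
//
//   vop3pHelper<uint16_t>(gpuDynInst,
//       [](uint16_t s0, uint16_t s1, bool clamp) -> uint16_t {
//           uint32_t sum = uint32_t(s0) + uint32_t(s1);
//           return uint16_t(clamp && sum > 0xffff ? 0xffff : sum);
//       });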
571
572 template<typename T>
573 void vop3pHelper(GPUDynInstPtr gpuDynInst,
574 T (*fOpImpl)(T, T, T, bool))
575 {
576 Wavefront *wf = gpuDynInst->wavefront();
577 ConstVecOperandU32 S0(gpuDynInst, extData.SRC0);
578 ConstVecOperandU32 S1(gpuDynInst, extData.SRC1);
579 ConstVecOperandU32 S2(gpuDynInst, extData.SRC2);
580 VecOperandU32 D(gpuDynInst, instData.VDST);
581
582 S0.readSrc();
583 S1.readSrc();
584 S2.readSrc();
585
586 int opLo = instData.OPSEL;
587 int opHi = instData.OPSEL_HI2 << 2 | extData.OPSEL_HI;
588 int negLo = extData.NEG;
589 int negHi = instData.NEG_HI;
590 bool clamp = instData.CLMP;
591 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
592 if (wf->execMask(lane)) {
593 T upper_val = fOpImpl(word<T>(S0[lane], opHi, negHi, 0),
594 word<T>(S1[lane], opHi, negHi, 1),
595 word<T>(S2[lane], opHi, negHi, 2),
596 clamp);
597 T lower_val = fOpImpl(word<T>(S0[lane], opLo, negLo, 0),
598 word<T>(S1[lane], opLo, negLo, 1),
599 word<T>(S2[lane], opLo, negLo, 2),
600 clamp);
601
602 uint16_t upper_raw =
603 *reinterpret_cast<uint16_t*>(&upper_val);
604 uint16_t lower_raw =
605 *reinterpret_cast<uint16_t*>(&lower_val);
606
607 D[lane] = upper_raw << 16 | lower_raw;
608 }
609 }
610
611 D.write();
612 }
613
614 void
615 dotHelper(GPUDynInstPtr gpuDynInst,
616 uint32_t (*fOpImpl)(uint32_t, uint32_t, uint32_t, bool))
617 {
618 Wavefront *wf = gpuDynInst->wavefront();
619 ConstVecOperandU32 S0(gpuDynInst, extData.SRC0);
620 ConstVecOperandU32 S1(gpuDynInst, extData.SRC1);
621 ConstVecOperandU32 S2(gpuDynInst, extData.SRC2);
622 VecOperandU32 D(gpuDynInst, instData.VDST);
623
624 S0.readSrc();
625 S1.readSrc();
626 S2.readSrc();
627
628 // OPSEL[2] and OPSEL_HI2 are unused. Craft two dwords where:
629 // dword1[15:0] is upper/lower 16b of src0 based on opsel[0]
630 // dword1[31:16] is upper/lower 16b of src0 based on opsel_hi[0]
631 // dword2[15:0] is upper/lower 16b of src1 based on opsel[1]
632 // dword2[31:16] is upper/lower 16b of src1 based on opsel_hi[1]
633 int opLo = instData.OPSEL;
634 int opHi = extData.OPSEL_HI;
635 int negLo = extData.NEG;
636 int negHi = instData.NEG_HI;
637 bool clamp = instData.CLMP;
638
639 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
640 if (wf->execMask(lane)) {
641 uint32_t dword1l =
642 word<uint16_t>(S0[lane], opLo, negLo, 0);
643 uint32_t dword1h =
644 word<uint16_t>(S0[lane], opHi, negHi, 0);
645 uint32_t dword2l =
646 word<uint16_t>(S1[lane], opLo, negLo, 1);
647 uint32_t dword2h =
648 word<uint16_t>(S1[lane], opHi, negHi, 1);
649
650 uint32_t dword1 = (dword1h << 16) | dword1l;
651 uint32_t dword2 = (dword2h << 16) | dword2l;
652
653 // Take in two uint32_t dwords and one src2 dword. The
654 // function will need to call bits to break up to the
655 // correct size and then reinterpret cast to the correct
656 // value.
657 D[lane] = fOpImpl(dword1, dword2, S2[lane], clamp);
658 }
659 }
660
661 D.write();
662 }
663
664 private:
666
667 template<typename T>
668 T
669 word(uint32_t data, int opSel, int neg, int opSelBit)
670 {
671 // This method assumes two words packed into a dword
672 static_assert(sizeof(T) == 2);
673
674 bool select = bits(opSel, opSelBit, opSelBit);
675 uint16_t raw = select ? bits(data, 31, 16)
676 : bits(data, 15, 0);
677
678 // Apply input modifiers. This may seem odd, but the hardware
679 // just flips the MSb instead of doing unary negation.
680 bool negate = bits(neg, opSelBit, opSelBit);
681 if (negate) {
682 raw ^= 0x8000;
683 }
684
685 return *reinterpret_cast<T*>(&raw);
686 }
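// Example with illustrative values: for data = 0xbeef1234, a clear opSel
// bit selects the lower half (0x1234) and a set bit selects the upper
// half (0xbeef); a set neg bit then flips the MSb, e.g. 0x1234 -> 0x9234,
// which negates the value when T is a 16-bit float.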
687 }; // Inst_VOP3P
688
690 {
691 public:
692 Inst_VOP3P_MAI(InFmt_VOP3P_MAI*, const std::string &opcode);
694
695 int instSize() const override;
696 void generateDisassembly() override;
697
698 void initOperandInfo() override;
699
700 protected:
701 // first instruction DWORD
703 // second instruction DWORD
705
706 private:
708 }; // Inst_VOP3P_MAI
709
711 {
712 public:
713 Inst_DS(InFmt_DS*, const std::string &opcode);
714 ~Inst_DS();
715
716 int instSize() const override;
717 void generateDisassembly() override;
718
719 void initOperandInfo() override;
720
721 protected:
722 template<typename T>
723 void
724 initMemRead(GPUDynInstPtr gpuDynInst, Addr offset)
725 {
726 Wavefront *wf = gpuDynInst->wavefront();
727
728 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
729 if (gpuDynInst->exec_mask[lane]) {
730 Addr vaddr = gpuDynInst->addr[lane] + offset;
731
732 (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
733 = wf->ldsChunk->read<T>(vaddr);
734 }
735 }
736 }
737
738 template<int N>
739 void
740 initMemRead(GPUDynInstPtr gpuDynInst, Addr offset)
741 {
742 Wavefront *wf = gpuDynInst->wavefront();
743
744 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
745 if (gpuDynInst->exec_mask[lane]) {
746 Addr vaddr = gpuDynInst->addr[lane] + offset;
747 for (int i = 0; i < N; ++i) {
748 (reinterpret_cast<VecElemU32*>(
749 gpuDynInst->d_data))[lane * N + i]
750 = wf->ldsChunk->read<VecElemU32>(
751 vaddr + i*sizeof(VecElemU32));
752 }
753 }
754 }
755 }
756
757 template<typename T>
758 void
759 initDualMemRead(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)
760 {
761 Wavefront *wf = gpuDynInst->wavefront();
762
763 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
764 if (gpuDynInst->exec_mask[lane]) {
765 Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
766 Addr vaddr1 = gpuDynInst->addr[lane] + offset1;
767
768 (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2]
769 = wf->ldsChunk->read<T>(vaddr0);
770 (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2 + 1]
771 = wf->ldsChunk->read<T>(vaddr1);
772 }
773 }
774 }
775
776 template<typename T>
777 void
778 initMemWrite(GPUDynInstPtr gpuDynInst, Addr offset)
779 {
780 Wavefront *wf = gpuDynInst->wavefront();
781
782 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
783 if (gpuDynInst->exec_mask[lane]) {
784 Addr vaddr = gpuDynInst->addr[lane] + offset;
785 wf->ldsChunk->write<T>(vaddr,
786 (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
787 }
788 }
789 }
790
791 template<int N>
792 void
793 initMemWrite(GPUDynInstPtr gpuDynInst, Addr offset)
794 {
795 Wavefront *wf = gpuDynInst->wavefront();
796
797 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
798 if (gpuDynInst->exec_mask[lane]) {
799 Addr vaddr = gpuDynInst->addr[lane] + offset;
800 for (int i = 0; i < N; ++i) {
801 wf->ldsChunk->write<VecElemU32>(
802 vaddr + i*sizeof(VecElemU32),
803 (reinterpret_cast<VecElemU32*>(
804 gpuDynInst->d_data))[lane * N + i]);
805 }
806 }
807 }
808 }
809
810 template<typename T>
811 void
812 initDualMemWrite(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)
813 {
814 Wavefront *wf = gpuDynInst->wavefront();
815
816 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
817 if (gpuDynInst->exec_mask[lane]) {
818 Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
819 Addr vaddr1 = gpuDynInst->addr[lane] + offset1;
820 wf->ldsChunk->write<T>(vaddr0, (reinterpret_cast<T*>(
821 gpuDynInst->d_data))[lane * 2]);
822 wf->ldsChunk->write<T>(vaddr1, (reinterpret_cast<T*>(
823 gpuDynInst->d_data))[lane * 2 + 1]);
824 }
825 }
826 }
827
828 template<typename T>
829 void
830 initAtomicAccess(GPUDynInstPtr gpuDynInst, Addr offset)
831 {
832 Wavefront *wf = gpuDynInst->wavefront();
833
834 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
835 if (gpuDynInst->exec_mask[lane]) {
836 Addr vaddr = gpuDynInst->addr[lane] + offset;
837
838 AtomicOpFunctorPtr amo_op =
839 gpuDynInst->makeAtomicOpFunctor<T>(
840 &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
841 &(reinterpret_cast<T*>(gpuDynInst->x_data))[lane]);
842
843 (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
844 = wf->ldsChunk->atomic<T>(vaddr, std::move(amo_op));
845 }
846 }
847 }
848
849 void
850 calcAddr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &addr)
851 {
852 Wavefront *wf = gpuDynInst->wavefront();
853
854 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
855 if (wf->execMask(lane)) {
856 gpuDynInst->addr.at(lane) = (Addr)addr[lane];
857 }
858 }
859 }
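// The addresses stored here are byte offsets into the workgroup's LDS
// allocation; the init*/atomic helpers above turn them into accesses
// through wf->ldsChunk rather than into memory requests.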
860
861 // first instruction DWORD
863 // second instruction DWORD
865 }; // Inst_DS
866
868 {
869 public:
870 Inst_MUBUF(InFmt_MUBUF*, const std::string &opcode);
871 ~Inst_MUBUF();
872
873 int instSize() const override;
874 void generateDisassembly() override;
875
876 void initOperandInfo() override;
877
878 protected:
879 template<typename T>
880 void
881 initMemRead(GPUDynInstPtr gpuDynInst)
882 {
883 // temporarily modify exec_mask to suppress memory accesses to oob
884 // regions. Only issue memory requests for lanes that have their
885 // exec_mask set and are not out of bounds.
886 VectorMask old_exec_mask = gpuDynInst->exec_mask;
887 gpuDynInst->exec_mask &= ~oobMask;
888 initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
889 gpuDynInst->exec_mask = old_exec_mask;
890 }
891
892
893 template<int N>
894 void
895 initMemRead(GPUDynInstPtr gpuDynInst)
896 {
897 // temporarily modify exec_mask to suppress memory accesses to oob
898 // regions. Only issue memory requests for lanes that have their
899 // exec_mask set and are not out of bounds.
900 VectorMask old_exec_mask = gpuDynInst->exec_mask;
901 gpuDynInst->exec_mask &= ~oobMask;
902 initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
903 gpuDynInst->exec_mask = old_exec_mask;
904 }
905
906 template<typename T>
907 void
908 initMemWrite(GPUDynInstPtr gpuDynInst)
909 {
910 // temporarily modify exec_mask to suppress memory accesses to oob
911 // regions. Only issue memory requests for lanes that have their
912 // exec_mask set and are not out of bounds.
913 VectorMask old_exec_mask = gpuDynInst->exec_mask;
914 gpuDynInst->exec_mask &= ~oobMask;
915 initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
916 gpuDynInst->exec_mask = old_exec_mask;
917 }
918
919 template<int N>
920 void
921 initMemWrite(GPUDynInstPtr gpuDynInst)
922 {
923 // temporarily modify exec_mask to suppress memory accesses to oob
924 // regions. Only issue memory requests for lanes that have their
925 // exec_mask set and are not out of bounds.
926 VectorMask old_exec_mask = gpuDynInst->exec_mask;
927 gpuDynInst->exec_mask &= ~oobMask;
928 initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
929 gpuDynInst->exec_mask = old_exec_mask;
930 }
931
932 template<typename T>
933 void
934 initAtomicAccess(GPUDynInstPtr gpuDynInst)
935 {
936 // temporarily modify exec_mask to suppress memory accesses to oob
937 // regions. Only issue memory requests for lanes that have their
938 // exec_mask set and are not out of bounds.
939 VectorMask old_exec_mask = gpuDynInst->exec_mask;
940 gpuDynInst->exec_mask &= ~oobMask;
941 initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
942 gpuDynInst->exec_mask = old_exec_mask;
943 }
944
945 void
947 {
948 // create request and set flags
949 gpuDynInst->resetEntireStatusVector();
950 gpuDynInst->setStatusVector(0, 1);
951 RequestPtr req = std::make_shared<Request>(0, 0, 0,
952 gpuDynInst->computeUnit()->
953 requestorId(), 0,
954 gpuDynInst->wfDynId);
955 gpuDynInst->setRequestFlags(req);
956 gpuDynInst->computeUnit()->
957 injectGlobalMemFence(gpuDynInst, false, req);
958 }
959
980 template<typename VOFF, typename VIDX, typename SRSRC, typename SOFF>
981 void
982 calcAddr(GPUDynInstPtr gpuDynInst, VOFF v_off, VIDX v_idx,
983 SRSRC s_rsrc_desc, SOFF s_offset, int inst_offset)
984 {
985 Addr vaddr = 0;
986 Addr base_addr = 0;
987 Addr stride = 0;
988 Addr buf_idx = 0;
989 Addr buf_off = 0;
990 Addr buffer_offset = 0;
991 BufferRsrcDescriptor rsrc_desc;
992
993 std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
994 sizeof(BufferRsrcDescriptor));
995
996 base_addr = rsrc_desc.baseAddr;
997
998 stride = rsrc_desc.addTidEn ? ((rsrc_desc.dataFmt << 14)
999 + rsrc_desc.stride) : rsrc_desc.stride;
1000
1001 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1002 if (gpuDynInst->exec_mask[lane]) {
1003 vaddr = base_addr + s_offset.rawData();
1009 buf_idx = v_idx[lane] + (rsrc_desc.addTidEn ? lane : 0);
1010
1011 buf_off = v_off[lane] + inst_offset;
1012
1013 if (rsrc_desc.swizzleEn) {
1014 Addr idx_stride = 8 << rsrc_desc.idxStride;
1015 Addr elem_size = 2 << rsrc_desc.elemSize;
1016 Addr idx_msb = buf_idx / idx_stride;
1017 Addr idx_lsb = buf_idx % idx_stride;
1018 Addr off_msb = buf_off / elem_size;
1019 Addr off_lsb = buf_off % elem_size;
1020 DPRINTF(VEGA, "mubuf swizzled lane %d: "
1021 "idx_stride = %llx, elem_size = %llx, "
1022 "idx_msb = %llx, idx_lsb = %llx, "
1023 "off_msb = %llx, off_lsb = %llx\n",
1024 lane, idx_stride, elem_size, idx_msb, idx_lsb,
1025 off_msb, off_lsb);
1026
1027 buffer_offset = (idx_msb * stride + off_msb * elem_size)
1028 * idx_stride + idx_lsb * elem_size + off_lsb;
1029 } else {
1030 buffer_offset = buf_off + stride * buf_idx;
1031 }
1032
1033
1041 if (rsrc_desc.stride == 0 || !rsrc_desc.swizzleEn) {
1042 if (buffer_offset >=
1043 rsrc_desc.numRecords - s_offset.rawData()) {
1044 DPRINTF(VEGA, "mubuf out-of-bounds condition 1: "
1045 "lane = %d, buffer_offset = %llx, "
1046 "const_stride = %llx, "
1047 "const_num_records = %llx\n",
1048 lane, buf_off + stride * buf_idx,
1049 stride, rsrc_desc.numRecords);
1050 oobMask.set(lane);
1051 continue;
1052 }
1053 }
1054
1055 if (rsrc_desc.stride != 0 && rsrc_desc.swizzleEn) {
1056 if (buf_idx >= rsrc_desc.numRecords ||
1057 buf_off >= stride) {
1058 DPRINTF(VEGA, "mubuf out-of-bounds condition 2: "
1059 "lane = %d, offset = %llx, "
1060 "index = %llx, "
1061 "const_num_records = %llx\n",
1062 lane, buf_off, buf_idx,
1063 rsrc_desc.numRecords);
1064 oobMask.set(lane);
1065 continue;
1066 }
1067 }
1068
1069 vaddr += buffer_offset;
1070
1071 DPRINTF(VEGA, "Calculating mubuf address for lane %d: "
1072 "vaddr = %llx, base_addr = %llx, "
1073 "stride = %llx, buf_idx = %llx, buf_off = %llx\n",
1074 lane, vaddr, base_addr, stride,
1075 buf_idx, buf_off);
1076 gpuDynInst->addr.at(lane) = vaddr;
1077 }
1078 }
1079 }
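// Example with illustrative values for the unswizzled path: with
// baseAddr = 0x1000, s_offset = 0, inst_offset = 0, stride = 16,
// v_off[lane] = 8 and v_idx[lane] = 2, buffer_offset = 8 + 16 * 2 = 0x28,
// so the lane's address becomes 0x1028 (assuming the lane is in bounds);
// out-of-bounds lanes are recorded in oobMask instead.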
1080
1081 // first instruction DWORD
1083 // second instruction DWORD
1085 // Mask of lanes with out-of-bounds accesses. Needs to be tracked
1086 // separately from the exec_mask so that we remember to write zero
1087 // to the registers associated with out of bounds lanes.
1089 }; // Inst_MUBUF
1090
1092 {
1093 public:
1094 Inst_MTBUF(InFmt_MTBUF*, const std::string &opcode);
1095 ~Inst_MTBUF();
1096
1097 int instSize() const override;
1098 void initOperandInfo() override;
1099
1100 protected:
1101 // first instruction DWORD
1103 // second instruction DWORD
1105
1106 private:
1108 }; // Inst_MTBUF
1109
1111 {
1112 public:
1113 Inst_MIMG(InFmt_MIMG*, const std::string &opcode);
1114 ~Inst_MIMG();
1115
1116 int instSize() const override;
1117 void initOperandInfo() override;
1118
1119 protected:
1120 // first instruction DWORD
1122 // second instruction DWORD
1124 }; // Inst_MIMG
1125
1127 {
1128 public:
1129 Inst_EXP(InFmt_EXP*, const std::string &opcode);
1130 ~Inst_EXP();
1131
1132 int instSize() const override;
1133 void initOperandInfo() override;
1134
1135 protected:
1136 // first instruction DWORD
1138 // second instruction DWORD
1140 }; // Inst_EXP
1141
1143 {
1144 public:
1145 Inst_FLAT(InFmt_FLAT*, const std::string &opcode);
1146 ~Inst_FLAT();
1147
1148 int instSize() const override;
1149 void generateDisassembly() override;
1150
1151 void initOperandInfo() override;
1152
1153 protected:
1154 template<typename T>
1155 void
1156 initMemRead(GPUDynInstPtr gpuDynInst)
1157 {
1158 if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
1159 gpuDynInst->executedAs() == enums::SC_PRIVATE) {
1160 initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
1161 } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
1162 Wavefront *wf = gpuDynInst->wavefront();
1163 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1164 if (gpuDynInst->exec_mask[lane]) {
1165 Addr vaddr = gpuDynInst->addr[lane];
1166 (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
1167 = wf->ldsChunk->read<T>(vaddr);
1168 }
1169 }
1170 }
1171 }
1172
1173 template<int N>
1174 void
1175 initMemRead(GPUDynInstPtr gpuDynInst)
1176 {
1177 if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
1178 gpuDynInst->executedAs() == enums::SC_PRIVATE) {
1179 initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
1180 } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
1181 Wavefront *wf = gpuDynInst->wavefront();
1182 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1183 if (gpuDynInst->exec_mask[lane]) {
1184 Addr vaddr = gpuDynInst->addr[lane];
1185 for (int i = 0; i < N; ++i) {
1186 (reinterpret_cast<VecElemU32*>(
1187 gpuDynInst->d_data))[lane * N + i]
1188 = wf->ldsChunk->read<VecElemU32>(
1189 vaddr + i*sizeof(VecElemU32));
1190 }
1191 }
1192 }
1193 }
1194 }
1195
1196 template<typename T>
1197 void
1198 initMemWrite(GPUDynInstPtr gpuDynInst)
1199 {
1200 if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
1201 gpuDynInst->executedAs() == enums::SC_PRIVATE) {
1202 initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
1203 } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
1204 Wavefront *wf = gpuDynInst->wavefront();
1205 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1206 if (gpuDynInst->exec_mask[lane]) {
1207 Addr vaddr = gpuDynInst->addr[lane];
1208 wf->ldsChunk->write<T>(vaddr,
1209 (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
1210 }
1211 }
1212 }
1213 }
1214
1215 template<int N>
1216 void
1217 initMemWrite(GPUDynInstPtr gpuDynInst)
1218 {
1219 if (gpuDynInst->executedAs() == enums::SC_GLOBAL ||
1220 gpuDynInst->executedAs() == enums::SC_PRIVATE) {
1221 initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
1222 } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
1223 Wavefront *wf = gpuDynInst->wavefront();
1224 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1225 if (gpuDynInst->exec_mask[lane]) {
1226 Addr vaddr = gpuDynInst->addr[lane];
1227 for (int i = 0; i < N; ++i) {
1228 wf->ldsChunk->write<VecElemU32>(
1229 vaddr + i*sizeof(VecElemU32),
1230 (reinterpret_cast<VecElemU32*>(
1231 gpuDynInst->d_data))[lane * N + i]);
1232 }
1233 }
1234 }
1235 }
1236 }
1237
1238 template<typename T>
1239 void
1240 initAtomicAccess(GPUDynInstPtr gpuDynInst)
1241 {
1242 // Flat scratch requests may not be atomic according to ISA manual
1243 // up to MI200. See MI200 manual Table 45.
1244 assert(gpuDynInst->executedAs() != enums::SC_PRIVATE);
1245
1246 if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
1247 initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
1248 } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
1249 Wavefront *wf = gpuDynInst->wavefront();
1250 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1251 if (gpuDynInst->exec_mask[lane]) {
1252 Addr vaddr = gpuDynInst->addr[lane];
1253 auto amo_op =
1254 gpuDynInst->makeAtomicOpFunctor<T>(
1255 &(reinterpret_cast<T*>(
1256 gpuDynInst->a_data))[lane],
1257 &(reinterpret_cast<T*>(
1258 gpuDynInst->x_data))[lane]);
1259
1260 T tmp = wf->ldsChunk->read<T>(vaddr);
1261 (*amo_op)(reinterpret_cast<uint8_t *>(&tmp));
1262 wf->ldsChunk->write<T>(vaddr, tmp);
1263 (reinterpret_cast<T*>(gpuDynInst->d_data))[lane] = tmp;
1264 }
1265 }
1266 }
1267 }
1268
1269 void
1270 calcAddr(GPUDynInstPtr gpuDynInst, ScalarRegU32 vaddr,
1271 ScalarRegU32 saddr, ScalarRegI32 offset)
1272 {
1273 // Offset is a 13-bit field w/the following meanings:
1274 // In Flat instructions, offset is a 12-bit unsigned number
1275 // In Global/Scratch instructions, offset is a 13-bit signed number
1276 if (isFlat()) {
1277 offset = offset & 0xfff;
1278 } else {
1279 offset = sext<13>(offset);
1280 }
1281 // If saddr = 0x7f there is no scalar reg to read and address will
1282 // be a 64-bit address. Otherwise, saddr is the reg index for a
1283 // scalar reg used as the base address for a 32-bit address.
1284 if ((saddr == 0x7f && isFlatGlobal()) || isFlat()) {
1285 ConstVecOperandU64 vbase(gpuDynInst, vaddr);
1286 vbase.read();
1287
1288 calcAddrVgpr(gpuDynInst, vbase, offset);
1289 } else if (isFlatGlobal()) {
1290 // Assume we are operating in 64-bit mode and read a pair of
1291 // SGPRs for the address base.
1292 ConstScalarOperandU64 sbase(gpuDynInst, saddr);
1293 sbase.read();
1294
1295 ConstVecOperandU32 voffset(gpuDynInst, vaddr);
1296 voffset.read();
1297
1298 calcAddrSgpr(gpuDynInst, voffset, sbase, offset);
1299 // For scratch, saddr = 0x7f there is no scalar reg to read and
1300 // a vgpr will be used for address offset. Otherwise, saddr is
1301 // the sgpr index holding the address offset. For scratch
1302 // instructions the offset GPR is always 32-bits.
1303 } else if (saddr != 0x7f) {
1304 assert(isFlatScratch());
1305
1306 ConstScalarOperandU32 soffset(gpuDynInst, saddr);
1307 soffset.read();
1308
1309 ConstVecOperandU32 voffset(gpuDynInst, vaddr);
1310 if (instData.SVE) {
1311 voffset.read();
1312 }
1313
1314 Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
1315
1316 int elemSize;
1317 auto staticInst = gpuDynInst->staticInstruction();
1318 if (gpuDynInst->isLoad()) {
1319 elemSize = staticInst->getOperandSize(2);
1320 } else {
1321 assert(gpuDynInst->isStore());
1322 elemSize = staticInst->getOperandSize(1);
1323 }
1324
1325 unsigned swizzleOffset = soffset.rawData() + offset;
1326 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1327 if (gpuDynInst->exec_mask[lane]) {
1328 swizzleOffset += instData.SVE ? voffset[lane] : 0;
1329 gpuDynInst->addr.at(lane) = flat_scratch_addr
1330 + swizzle(swizzleOffset, lane, elemSize);
1331 }
1332 }
1333 } else {
1334 assert(isFlatScratch());
1335
1336 ConstVecOperandU32 voffset(gpuDynInst, vaddr);
1337 if (instData.SVE) {
1338 voffset.read();
1339 }
1340
1341 Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
1342
1343 int elemSize;
1344 auto staticInst = gpuDynInst->staticInstruction();
1345 if (gpuDynInst->isLoad()) {
1346 elemSize = staticInst->getOperandSize(2);
1347 } else {
1348 assert(gpuDynInst->isStore());
1349 elemSize = staticInst->getOperandSize(1);
1350 }
1351
1352 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1353 if (gpuDynInst->exec_mask[lane]) {
1354 VecElemU32 vgpr_offset =
1355 instData.SVE ? voffset[lane] : 0;
1356
1357 gpuDynInst->addr.at(lane) = flat_scratch_addr
1358 + swizzle(vgpr_offset + offset, lane, elemSize);
1359 }
1360 }
1361 }
1362
1363 if (isFlat()) {
1364 gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
1365 } else if (isFlatGlobal()) {
1366 gpuDynInst->staticInstruction()->executed_as =
1367 enums::SC_GLOBAL;
1368 } else {
1369 assert(isFlatScratch());
1370 gpuDynInst->staticInstruction()->executed_as =
1371 enums::SC_PRIVATE;
1372 gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
1373 }
1374 }
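// Summary of the paths above: FLAT (and GLOBAL with SADDR == 0x7f)
// addresses come from a 64-bit VGPR pair; GLOBAL with a valid SADDR adds
// a 32-bit VGPR offset to an SGPR-pair base; SCRATCH addresses are built
// from the flat scratch base plus a swizzled per-lane offset. FLAT and
// SCRATCH then resolve the memory segment per lane.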
1375
1376 void
1377 issueRequestHelper(GPUDynInstPtr gpuDynInst)
1378 {
1379 if ((gpuDynInst->executedAs() == enums::SC_GLOBAL && isFlat())
1380 || isFlatGlobal()) {
1381 gpuDynInst->computeUnit()->globalMemoryPipe
1382 .issueRequest(gpuDynInst);
1383 } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
1384 assert(isFlat());
1385 gpuDynInst->computeUnit()->localMemoryPipe
1386 .issueRequest(gpuDynInst);
1387 } else {
1388 assert(gpuDynInst->executedAs() == enums::SC_PRIVATE);
1389 gpuDynInst->computeUnit()->globalMemoryPipe
1390 .issueRequest(gpuDynInst);
1391 }
1392 }
1393
1394 // Execute for atomics is identical besides the flag set in the
1395 // constructor, except cmpswap. For cmpswap, the offset to the "cmp"
1396 // register is needed. For all other operations this offset is zero
1397 // and implies the atomic is not a cmpswap.
1398 // RegT defines the type of GPU register (e.g., ConstVecOperandU32)
1399 // LaneT defines the type of the register elements (e.g., VecElemU32)
1400 template<typename RegT, typename LaneT, int CmpRegOffset = 0>
1401 void
1402 atomicExecute(GPUDynInstPtr gpuDynInst)
1403 {
1404 Wavefront *wf = gpuDynInst->wavefront();
1405
1406 if (gpuDynInst->exec_mask.none()) {
1407 wf->decVMemInstsIssued();
1408 if (isFlat()) {
1409 wf->decLGKMInstsIssued();
1410 }
1411 return;
1412 }
1413
1414 gpuDynInst->execUnitId = wf->execUnitId;
1415 gpuDynInst->latency.init(gpuDynInst->computeUnit());
1416 gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
1417
1418 RegT data(gpuDynInst, extData.DATA);
1419 RegT cmp(gpuDynInst, extData.DATA + CmpRegOffset);
1420
1421 data.read();
1422 if constexpr (CmpRegOffset) {
1423 cmp.read();
1424 }
1425
1427
1428 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1429 if (gpuDynInst->exec_mask[lane]) {
1430 if constexpr (CmpRegOffset) {
1431 (reinterpret_cast<VecElemU32*>(
1432 gpuDynInst->x_data))[lane] = data[lane];
1433 (reinterpret_cast<VecElemU32*>(
1434 gpuDynInst->a_data))[lane] = cmp[lane];
1435 } else {
1436 (reinterpret_cast<LaneT*>(gpuDynInst->a_data))[lane]
1437 = data[lane];
1438 }
1439 }
1440 }
1441
1442 issueRequestHelper(gpuDynInst);
1443 }
1444
1445 // RegT defines the type of GPU register (e.g., ConstVecOperandU32)
1446 // LaneT defines the type of the register elements (e.g., VecElemU32)
1447 template<typename RegT, typename LaneT>
1448 void
1449 atomicComplete(GPUDynInstPtr gpuDynInst)
1450 {
1451 if (isAtomicRet()) {
1452 RegT vdst(gpuDynInst, extData.VDST);
1453
1454 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1455 if (gpuDynInst->exec_mask[lane]) {
1456 vdst[lane] = (reinterpret_cast<LaneT*>(
1457 gpuDynInst->d_data))[lane];
1458 }
1459 }
1460
1461 vdst.write();
1462 }
1463 }
1464
1465 bool
1467 {
1468 return (extData.SADDR != 0x7f);
1469 }
1470
1471 // first instruction DWORD
1473 // second instruction DWORD
1475
1476 private:
1477 void initFlatOperandInfo();
1479
1482
1483 void
1484 calcAddrSgpr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &vaddr,
1485 ConstScalarOperandU64 &saddr, ScalarRegI32 offset)
1486 {
1487 // Use SGPR pair as a base address and add VGPR-offset and
1488 // instruction offset. The VGPR-offset is always 32-bits so we
1489 // mask any upper bits from the vaddr.
1490 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1491 if (gpuDynInst->exec_mask[lane]) {
1492 ScalarRegI32 voffset = vaddr[lane];
1493 gpuDynInst->addr.at(lane) =
1494 saddr.rawData() + voffset + offset;
1495 }
1496 }
1497 }
1498
1499 void
1500 calcAddrVgpr(GPUDynInstPtr gpuDynInst, ConstVecOperandU64 &addr,
1501 ScalarRegI32 offset)
1502 {
1503 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
1504 if (gpuDynInst->exec_mask[lane]) {
1505 gpuDynInst->addr.at(lane) = addr[lane] + offset;
1506 }
1507 }
1508 }
1509
1510 VecElemU32
1511 swizzle(VecElemU32 offset, int lane, int elem_size)
1512 {
1513 // This is not described in the spec. We use the swizzle from
1514 // buffer memory instructions and fix the stride to 4. Multiply
1515 // the thread ID by the storage size to avoid threads clobbering
1516 // their data.
1517 return ((offset / 4) * 4 * 64)
1518 + (offset % 4) + (lane * elem_size);
1519 }
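// Example with illustrative values: offset = 5, lane = 2 and
// elem_size = 4 give (5 / 4) * 4 * 64 + (5 % 4) + 2 * 4 = 256 + 1 + 8
// = 265, i.e. each lane gets its own 4-byte slot within a 256-byte
// (4 bytes x 64 lanes) row of scratch.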
1520
1521 Addr
1522 readFlatScratch(GPUDynInstPtr gpuDynInst)
1523 {
1524 return gpuDynInst->computeUnit()->shader->getScratchBase();
1525 }
1526 }; // Inst_FLAT
1527} // namespace VegaISA
1528} // namespace gem5
1529
1530#endif // __ARCH_VEGA_INSTS_OP_ENCODINGS_HH__