#ifndef __ARCH_VEGA_INSTS_OP_ENCODINGS_HH__
#define __ARCH_VEGA_INSTS_OP_ENCODINGS_HH__

#include "debug/GPUExec.hh"
#include "debug/VEGA.hh"
// Inst_SMEM::calcAddr for normal s_load_dword/s_store_dword addresses
// (scalar base plus immediate offset):
gpu_dyn_inst->scalarAddr = vaddr;

// Inst_SMEM::calcAddr for s_buffer_load_dword/s_buffer_store_dword
// addresses: copy the 128-bit buffer resource descriptor out of the SGPRs.
std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
            sizeof(rsrc_desc));
// ...
gpu_dyn_inst->scalarAddr = vaddr;
// Inst_VOP2::sdwaSrcHelper: set up the SRC operands for the SDWA
// sub-encoding.
T src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
T origSrc0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
T origSrc1(gpuDynInst, instData.VSRC1);
// ...
origSrc0_sdwa.read();
// ...
DPRINTF(VEGA, "Handling %s SRC SDWA. SRC0: register v[%d], "
        "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, SRC0_SEXT: "
        "%d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, SRC1_SEXT: %d, "
        "SRC1_NEG: %d, SRC1_ABS: %d\n",
        _opcode, extData.iFmt_VOP_SDWA.SRC0,
        extData.iFmt_VOP_SDWA.DST_SEL, extData.iFmt_VOP_SDWA.DST_U,
        extData.iFmt_VOP_SDWA.CLMP, extData.iFmt_VOP_SDWA.SRC0_SEL,
        extData.iFmt_VOP_SDWA.SRC0_SEXT,
        extData.iFmt_VOP_SDWA.SRC0_NEG, extData.iFmt_VOP_SDWA.SRC0_ABS,
        extData.iFmt_VOP_SDWA.SRC1_SEL,
        extData.iFmt_VOP_SDWA.SRC1_SEXT,
        extData.iFmt_VOP_SDWA.SRC1_NEG,
        extData.iFmt_VOP_SDWA.SRC1_ABS);
// Inst_VOP2::sdwaDstHelper: keep a copy of VDST so unselected sub-dword
// bits can be merged back after the operation.
T origVdst(gpuDynInst, instData.VDST);
// ...
origVdst[lane] = vdst[lane];
// Inst_VOP2::dppHelper: set up SRC0 for the DPP sub-encoding.
T src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
// ...
DPRINTF(VEGA, "Handling %s SRC DPP. SRC0: register v[%d], "
        "DPP_CTRL: %#x, SRC0_ABS: %d, SRC0_NEG: %d, SRC1_ABS: %d, "
        "SRC1_NEG: %d, BC: %d, BANK_MASK: %d, ROW_MASK: %d\n",
        _opcode, extData.iFmt_VOP_DPP.SRC0,
        extData.iFmt_VOP_DPP.DPP_CTRL,
        extData.iFmt_VOP_DPP.SRC0_ABS, extData.iFmt_VOP_DPP.SRC0_NEG,
        extData.iFmt_VOP_DPP.SRC1_ABS, extData.iFmt_VOP_DPP.SRC1_NEG,
        extData.iFmt_VOP_DPP.BC, extData.iFmt_VOP_DPP.BANK_MASK,
        extData.iFmt_VOP_DPP.ROW_MASK);
// Inst_VOP2::vop2Helper: dispatch to the SDWA, DPP, or plain VOP2 path.
template<typename ConstT, typename T>
// ...
fOpImpl(src0_sdwa, src1, vdst, wf);
// ...
T src0_dpp = dppHelper(gpuDynInst, src1);
fOpImpl(src0_dpp, src1, vdst, wf);
// ...
ConstT const_src0(gpuDynInst, instData.SRC0);
const_src0.readSrc();
// ...
vdst[lane] = const_src0[lane];
// ...
fOpImpl(vdst, src1, vdst, wf);
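// A sketch of how an instruction's execute() might drive vop2Helper: the
// per-op arithmetic comes in as a captureless lambda (convertible to the
// fOpImpl function pointer) while the helper handles operand reads, the
// SDWA/DPP sub-encodings, and the vdst write. Hypothetical example, not
// gem5's actual V_ADD_U32 implementation.
void
Inst_VOP2__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst)
{
    vop2Helper<ConstVecOperandU32, VecOperandU32>(gpuDynInst,
        [](VecOperandU32 &src0, VecOperandU32 &src1, VecOperandU32 &vdst,
           Wavefront *wf) {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    vdst[lane] = src0[lane] + src1[lane];
                }
            }
        });
}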
// Inst_VOPC::sdwabSelect: extract, sign-extend, negate, and/or take the
// absolute value of a sub-dword slice of a 32-bit operand.
template<typename T>
uint32_t
sdwabSelect(uint32_t dword, const SDWASelVals sel, bool sign_ext,
            bool neg, bool abs)
{
    int low_bit = 0, high_bit = 0;
    uint32_t rv = 0;
    // ...
    rv = bits(dword, high_bit, low_bit);
    // ...
    uint32_t sign_bit = 1 << high_bit;
    // ...
    if (std::is_integral_v<T> && std::is_unsigned_v<T>) {
        panic_if(neg, "SDWAB negation operation on unsigned type!\n");
        panic_if(sign_ext, "SDWAB sign extend on unsigned type!\n");
    // ...
    if (std::is_integral_v<T>) {
        // ...
        if ((rv & sign_bit) && std::is_signed_v<T>) {
            rv = sext(rv, high_bit + 1) & 0xFFFFFFFF;
            rv = std::abs(static_cast<long long>(rv)) & 0xFFFFFFFF;
    // ...
    rv = rv & mask(high_bit);
    // ...
    if (std::is_integral_v<T>) {
        // ...
        rv = sext(rv, high_bit + 1) & 0xFFFFFFFF;
    // ...
    rv = rv ^ mask(high_bit);
    // ...
    if (std::is_integral_v<T>) {
        // ...
        rv = sext(rv, high_bit + 1) & 0xFFFFFFFF;
    // ...
    panic("SDWAB sign extend set for non-integral type!\n");
// Inst_VOPC::sdwabHelper: trace the full SDWAB field set.
DPRINTF(VEGA, "Handling %s SRC SDWA. SRC0: register %s[%d], "
        "sDst s[%d], sDst type %s, SRC0_SEL: %d, SRC0_SEXT: %d "
        "SRC0_NEG: %d, SRC0_ABS: %d, SRC1: register %s[%d], "
        "SRC1_SEL: %d, SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: "
        "%d\n", _opcode,
        (extData.iFmt_VOP_SDWAB.S0 ? "s" : "v"),
        extData.iFmt_VOP_SDWAB.SRC0,
        extData.iFmt_VOP_SDWAB.SDST,
        (extData.iFmt_VOP_SDWAB.SD ? "SGPR" : "VCC"),
        extData.iFmt_VOP_SDWAB.SRC0_SEL,
        extData.iFmt_VOP_SDWAB.SRC0_SEXT,
        extData.iFmt_VOP_SDWAB.SRC0_NEG,
        extData.iFmt_VOP_SDWAB.SRC0_ABS,
        (extData.iFmt_VOP_SDWAB.S1 ? "s" : "v"),
        extData.iFmt_VOP_SDWAB.SRC1,
        extData.iFmt_VOP_SDWAB.SRC1_SEL,
        extData.iFmt_VOP_SDWAB.SRC1_SEXT,
        extData.iFmt_VOP_SDWAB.SRC1_NEG,
        extData.iFmt_VOP_SDWAB.SRC1_ABS);
// Map the encoded operand numbers into the unified register index space:
// VGPR operands sit at index 0x100 and above, SGPRs below.
int src0_idx = extData.iFmt_VOP_SDWAB.SRC0;
src0_idx += (extData.iFmt_VOP_SDWAB.S0 == 0) ? 0x100 : 0;
// ...
src1_idx += (extData.iFmt_VOP_SDWAB.S1 == 0) ? 0x100 : 0;
// ...
int sdst_idx = (extData.iFmt_VOP_SDWAB.SD == 1) ? /* ... */;
if (gpuDynInst->wavefront()->execMask(lane)) {
    // trailing arguments of the sdwabSelect<T>() calls for src0 and src1:
    // ...
    extData.iFmt_VOP_SDWAB.SRC0_SEXT,
    extData.iFmt_VOP_SDWAB.SRC0_NEG,
    extData.iFmt_VOP_SDWAB.SRC0_ABS);
    // ...
    extData.iFmt_VOP_SDWAB.SRC1_SEXT,
    extData.iFmt_VOP_SDWAB.SRC1_NEG,
    extData.iFmt_VOP_SDWAB.SRC1_ABS);
// Inst_VOP3A::omodModifier: apply the VOP3 output modifier (OMOD).
template<typename T>
T
omodModifier(T val, unsigned omod)
{
    if constexpr (std::is_floating_point_v<T>) {
        if (omod == 1) return val * T(2.0f);
        if (omod == 2) return val * T(4.0f);
        if (omod == 3) return val / T(2.0f);
    } else {
        assert(std::is_integral_v<T>);
        if (omod == 1) return val * T(2);
        if (omod == 2) return val * T(4);
        if (omod == 3) return val / T(2);
    }
    return val;
}
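// Worked example (hypothetical values): OMOD scales the result after the
// main operation, and omod == 0 leaves it untouched.
//   omodModifier(1.5f, 0) == 1.5f
//   omodModifier(1.5f, 1) == 3.0f   // * 2
//   omodModifier(1.5f, 2) == 6.0f   // * 4
//   omodModifier(1.5f, 3) == 0.75f  // / 2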
// Inst_VOP3P::vop3pHelper (two-source form): run fOpImpl separately on the
// high and low half-words of each packed operand, then reassemble D[lane].
template<typename T>
void
vop3pHelper(GPUDynInstPtr gpuDynInst, T (*fOpImpl)(T, T, bool))
{
    // ...
    T upper_val = fOpImpl(word<T>(S0[lane], opHi, negHi, 0),
                          word<T>(S1[lane], opHi, negHi, 1),
                          clamp);
    T lower_val = fOpImpl(word<T>(S0[lane], opLo, negLo, 0),
                          word<T>(S1[lane], opLo, negLo, 1),
                          clamp);
    uint16_t upper_raw = *reinterpret_cast<uint16_t*>(&upper_val);
    uint16_t lower_raw = *reinterpret_cast<uint16_t*>(&lower_val);

    D[lane] = upper_raw << 16 | lower_raw;
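// A sketch of a two-source functor in the spirit of a packed f16 add
// (hypothetical; Float16 is an assumed half-precision type, and this is
// not gem5's v_pk_add_f16 code). The helper applies it once to the high
// half-words and once to the low half-words, then packs both results:
//
//   vop3pHelper<Float16>(gpuDynInst,
//       [](Float16 a, Float16 b, bool clamp) -> Float16 {
//           Float16 sum = a + b;
//           // VOP3P CLMP clamps float results to [0.0, 1.0]
//           return clamp ? std::clamp(sum, Float16(0), Float16(1)) : sum;
//       });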
// Inst_VOP3P::vop3pHelper (three-source form), e.g. for packed
// FMA-style operations.
template<typename T>
void
vop3pHelper(GPUDynInstPtr gpuDynInst, T (*fOpImpl)(T, T, T, bool))
{
    // ...
    T upper_val = fOpImpl(word<T>(S0[lane], opHi, negHi, 0),
                          word<T>(S1[lane], opHi, negHi, 1),
                          word<T>(S2[lane], opHi, negHi, 2),
                          clamp);
    T lower_val = fOpImpl(word<T>(S0[lane], opLo, negLo, 0),
                          word<T>(S1[lane], opLo, negLo, 1),
                          word<T>(S2[lane], opLo, negLo, 2),
                          clamp);
    uint16_t upper_raw = *reinterpret_cast<uint16_t*>(&upper_val);
    uint16_t lower_raw = *reinterpret_cast<uint16_t*>(&lower_val);

    D[lane] = upper_raw << 16 | lower_raw;
// Inst_VOP3P::dotHelper: reassemble the packed half-word pieces of each
// source into full dwords before applying the dot-product functor.
void
dotHelper(GPUDynInstPtr gpuDynInst,
          uint32_t (*fOpImpl)(uint32_t, uint32_t, uint32_t, bool))
{
    // ...
    uint32_t dword1 = (dword1h << 16) | dword1l;
    uint32_t dword2 = (dword2h << 16) | dword2l;
    // ...
    D[lane] = fOpImpl(dword1, dword2, S2[lane], clamp);
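// The functor sees the two reassembled source dwords plus the 32-bit
// accumulator S2[lane]. Conceptually, a dot2-style functor splits dword1
// and dword2 back into half-words, multiplies them pairwise, and returns
// (hi1 * hi2) + (lo1 * lo2) + S2[lane], optionally clamping the result.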
// Inst_VOP3P::word: pick the high or low 16-bit half of a packed operand
// according to the OPSEL bit, optionally negating it.
template<typename T>
T
word(uint32_t data, int opSel, int neg, int opSelBit)
{
    static_assert(sizeof(T) == 2);

    bool select = bits(opSel, opSelBit, opSelBit);
    uint16_t raw = select ? bits(data, 31, 16)
                          : bits(data, 15, 0);

    bool negate = bits(neg, opSelBit, opSelBit);
    // ...
    return *reinterpret_cast<T*>(&raw);
}
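// Worked example (hypothetical value): data = 0x3C004000 packs the halves
// {hi = 0x3C00 (1.0 in f16), lo = 0x4000 (2.0 in f16)}. Then
// word<uint16_t>(data, /*opSel=*/0b10, /*neg=*/0, /*opSelBit=*/1) tests
// bit 1 of opSel, finds it set, and returns the high half, 0x3C00.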
// Inst_DS::initMemRead: per-lane reads from the wavefront's LDS chunk,
// plus the N-dword variant.
if (gpuDynInst->exec_mask[lane]) {
    // ...
    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
    // ...
}

if (gpuDynInst->exec_mask[lane]) {
    // ...
    for (int i = 0; i < N; ++i) {
        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * N + i]
// Inst_DS::initDualMemRead: two reads per lane at offset0 and offset1.
if (gpuDynInst->exec_mask[lane]) {
    Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
    Addr vaddr1 = gpuDynInst->addr[lane] + offset1;
    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2]
    // ...
    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2 + 1]
// Inst_DS::initMemWrite, plus the N-dword variant.
if (gpuDynInst->exec_mask[lane]) {
    // ...
    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
    // ...
}

if (gpuDynInst->exec_mask[lane]) {
    // ...
    for (int i = 0; i < N; ++i) {
        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * N + i]);
// Inst_DS::initDualMemWrite.
if (gpuDynInst->exec_mask[lane]) {
    Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
    Addr vaddr1 = gpuDynInst->addr[lane] + offset1;
    // ...
    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2]);
    // ...
    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2 + 1]);
// Inst_DS::initAtomicAccess: build a per-lane atomic functor over the
// operand (a_data) and compare/swap (x_data) buffers; the old value is
// returned through d_data.
if (gpuDynInst->exec_mask[lane]) {
    // ...
    AtomicOpFunctorPtr amo_op =
        gpuDynInst->makeAtomicOpFunctor<T>(
            &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
            &(reinterpret_cast<T*>(gpuDynInst->x_data))[lane]);
    // ...
    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
// Inst_DS::calcAddr: DS addresses are byte offsets taken directly from
// the address VGPR.
Wavefront *wf = gpuDynInst->wavefront();
// ...
gpuDynInst->addr.at(lane) = (Addr)addr[lane];
// Inst_MUBUF memory helpers: each path saves the exec mask, issues the
// request helper, and restores the mask afterwards.
template<typename T>
// ...
VectorMask old_exec_mask = gpuDynInst->exec_mask;
// ...
gpuDynInst->exec_mask = old_exec_mask;
// ...
VectorMask old_exec_mask = gpuDynInst->exec_mask;
// ...
gpuDynInst->exec_mask = old_exec_mask;

template<typename T>
// ...
VectorMask old_exec_mask = gpuDynInst->exec_mask;
// ...
gpuDynInst->exec_mask = old_exec_mask;
// ...
VectorMask old_exec_mask = gpuDynInst->exec_mask;
// ...
gpuDynInst->exec_mask = old_exec_mask;

template<typename T>
// ...
VectorMask old_exec_mask = gpuDynInst->exec_mask;
// ...
gpuDynInst->exec_mask = old_exec_mask;
// Inst_MUBUF::injectGlobalMemFence: issue a zero-size fence request.
gpuDynInst->resetEntireStatusVector();
gpuDynInst->setStatusVector(0, 1);
RequestPtr req = std::make_shared<Request>(0, 0, 0,
                                           gpuDynInst->computeUnit()->
                                           requestorId(), 0,
                                           gpuDynInst->wfDynId);
gpuDynInst->setRequestFlags(req);
gpuDynInst->computeUnit()->
    injectGlobalMemFence(gpuDynInst, false, req);
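// The zero-size Request (paddr 0, size 0) carries no data; it serves as a
// placeholder that setRequestFlags() can tag with the instruction's
// ordering/fence semantics before the compute unit injects it into the
// global memory pipeline.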
// Inst_MUBUF LDS-return path (cf. ldsComplete): copy the returned data
// into LDS; M0 supplies the LDS base offset.
template<int NumDwords, int SignBit = 0>
// ...
Wavefront *wf = gpuDynInst->wavefront();
// ...
uint32_t m0_offset = bits(lds_offset.rawData(), 17, 2);
uint32_t lds_addr = m0_offset * 4 + inst_offset;
// ...
if (gpuDynInst->exec_mask[lane]) {
    uint32_t chunk_addr =
        lds_addr + lane * NumDwords * sizeof(VecElemU32);
    // ...
    for (int i = 0; i < NumDwords; ++i) {
        /* ... = */ (reinterpret_cast<VecElemU32*>(
            gpuDynInst->d_data))[lane * NumDwords + i];
        if constexpr (SignBit != 0) {
// Inst_MUBUF::calcAddr: MUBUF instructions calculate their addresses from
// the buffer resource descriptor, the scalar offset, and the per-lane
// index/offset VGPRs, with optional swizzling and out-of-bounds checks.
template<typename VOFF, typename VIDX, typename SRSRC, typename SOFF>
void
calcAddr(GPUDynInstPtr gpuDynInst, VOFF v_off, VIDX v_idx,
         SRSRC s_rsrc_desc, SOFF s_offset, int inst_offset)
{
    // ...
    Addr buffer_offset = 0;
    // ...
    std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
                sizeof(rsrc_desc));
    // ...
    if (gpuDynInst->exec_mask[lane]) {
        vaddr = base_addr + s_offset.rawData();
        // ...
        buf_idx = v_idx[lane] + (rsrc_desc.addTidEn ? lane : 0);
        buf_off = v_off[lane] + inst_offset;
        // ...
        // swizzled addressing:
        Addr idx_msb = buf_idx / idx_stride;
        Addr idx_lsb = buf_idx % idx_stride;
        Addr off_msb = buf_off / elem_size;
        Addr off_lsb = buf_off % elem_size;
        DPRINTF(VEGA, "mubuf swizzled lane %d: "
                "idx_stride = %llx, elem_size = %llx, "
                "idx_msb = %llx, idx_lsb = %llx, "
                "off_msb = %llx, off_lsb = %llx\n",
                lane, idx_stride, elem_size, idx_msb, idx_lsb,
                off_msb, off_lsb);

        buffer_offset = (idx_msb * stride + off_msb * elem_size)
            * idx_stride + idx_lsb * elem_size + off_lsb;
        // ...
        // linear addressing:
        buffer_offset = buf_off + stride * buf_idx;
        // ...
        if (buffer_offset >= /* ... num_records limit ... */) {
            DPRINTF(VEGA, "mubuf out-of-bounds condition 1: "
                    "lane = %d, buffer_offset = %llx, "
                    "const_stride = %llx, "
                    "const_num_records = %llx\n",
                    lane, buf_off + stride * buf_idx,
                    stride, rsrc_desc.numRecords);
            // ...
        }

        // ...
        DPRINTF(VEGA, "mubuf out-of-bounds condition 2: "
                "lane = %d, offset = %llx, "
                "index = %llx, "
                "const_num_records = %llx\n",
                lane, buf_off, buf_idx,
                rsrc_desc.numRecords);
        // ...

        vaddr += buffer_offset;

        DPRINTF(VEGA, "Calculating mubuf address for lane %d: "
                "vaddr = %llx, base_addr = %llx, "
                "stride = %llx, buf_idx = %llx, buf_off = %llx\n",
                lane, vaddr, base_addr, stride, buf_idx, buf_off);
        gpuDynInst->addr.at(lane) = vaddr;
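// Worked example of the swizzled formula above (hypothetical values):
// with idx_stride = 8, elem_size = 4, stride = 64, buf_idx = 10, and
// buf_off = 6:
//   idx_msb = 10 / 8 = 1, idx_lsb = 10 % 8 = 2
//   off_msb = 6 / 4 = 1,  off_lsb = 6 % 4 = 2
//   buffer_offset = (1 * 64 + 1 * 4) * 8 + 2 * 4 + 2 = 554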
// Inst_FLAT::initMemRead: dispatch on the resolved memory segment.
template<typename T>
// ...
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
    // ...
} else if (gpuDynInst->executedAs() == enums::SC_PRIVATE) {
    // ...
    static_assert(sizeof(T) <= 4);
    // ...
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
    Wavefront *wf = gpuDynInst->wavefront();
    // ...
    if (gpuDynInst->exec_mask[lane]) {
        // ...
        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
// Inst_FLAT::initMemRead<N>: N-dword variant.
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
    // ...
} else if (gpuDynInst->executedAs() == enums::SC_PRIVATE) {
    // ...
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
    Wavefront *wf = gpuDynInst->wavefront();
    // ...
    if (gpuDynInst->exec_mask[lane]) {
        // ...
        for (int i = 0; i < N; ++i) {
            (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * N + i]
// Inst_FLAT::initMemWrite: same segment dispatch as initMemRead.
template<typename T>
// ...
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
    // ...
} else if (gpuDynInst->executedAs() == enums::SC_PRIVATE) {
    // ...
    static_assert(sizeof(T) <= 4);
    // ...
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
    Wavefront *wf = gpuDynInst->wavefront();
    // ...
    if (gpuDynInst->exec_mask[lane]) {
        // ...
        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
    // ...
} else if (gpuDynInst->executedAs() == enums::SC_PRIVATE) {
    // ...
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
    Wavefront *wf = gpuDynInst->wavefront();
    // ...
    if (gpuDynInst->exec_mask[lane]) {
        // ...
        for (int i = 0; i < N; ++i) {
            (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * N + i]);
// Inst_FLAT::initAtomicAccess: flat atomics may target global memory or
// LDS, never the private segment.
template<typename T>
// ...
assert(gpuDynInst->executedAs() != enums::SC_PRIVATE);
// ...
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
    // ...
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
    Wavefront *wf = gpuDynInst->wavefront();
    // ...
    if (gpuDynInst->exec_mask[lane]) {
        // ...
        AtomicOpFunctorPtr amo_op =
            gpuDynInst->makeAtomicOpFunctor<T>(
                &(reinterpret_cast<T*>(
                    gpuDynInst->a_data))[lane],
                &(reinterpret_cast<T*>(
                    gpuDynInst->x_data))[lane]);
        // ...
        (*amo_op)(reinterpret_cast<uint8_t *>(&tmp));
        // ...
        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane] = tmp;
} else if (saddr != 0x7f) {
    // (SADDR == 0x7f encodes "no scalar base register".)
// Inst_FLAT scratch addressing: scratch accesses are swizzled per thread;
// the element size comes from the operand being moved.
[[maybe_unused]] int elemSize;
[[maybe_unused]] auto staticInst = gpuDynInst->staticInstruction();
if (gpuDynInst->isLoad()) {
    elemSize = staticInst->getOperandSize(2);
} else {
    assert(gpuDynInst->isStore());
    elemSize = staticInst->getOperandSize(1);
}
// ...
assert((offset % elemSize) == 0);
assert((swizzleOffset % 4) == 0);
// ...
if (gpuDynInst->exec_mask[lane]) {
    swizzleOffset += instData.SVE ? voffset[lane] : 0;
    gpuDynInst->addr.at(lane) = flat_scratch_addr
if (gpuDynInst->exec_mask[lane]) {
    // ...
    gpuDynInst->addr.at(lane) = flat_scratch_addr
    // ...
}

// Segment resolution: FLAT resolves per lane at runtime; GLOBAL and
// SCRATCH encodings are known statically.
gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
// ...
gpuDynInst->staticInstruction()->executed_as = enums::SC_GLOBAL;
// ...
gpuDynInst->staticInstruction()->executed_as = enums::SC_PRIVATE;
// ...
gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
// Inst_FLAT::issueRequestHelper: route the request to the right pipe.
// Private (scratch) accesses also go through the global memory pipe.
if ((gpuDynInst->executedAs() == enums::SC_GLOBAL && isFlat())
    || isFlatGlobal()) {
    gpuDynInst->computeUnit()->globalMemoryPipe
        .issueRequest(gpuDynInst);
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
    gpuDynInst->computeUnit()->localMemoryPipe
        .issueRequest(gpuDynInst);
} else {
    assert(gpuDynInst->executedAs() == enums::SC_PRIVATE);
    gpuDynInst->computeUnit()->globalMemoryPipe
        .issueRequest(gpuDynInst);
}
// Inst_FLAT atomic source gather: stage the DATA (and, for CAS, CMP)
// registers into the dyn inst's operand buffers.
template<typename RegT, typename LaneT, int CmpRegOffset = 0>
// ...
Wavefront *wf = gpuDynInst->wavefront();
// ...
if (gpuDynInst->exec_mask.none()) {
    // ...
}

gpuDynInst->latency.init(gpuDynInst->computeUnit());
gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
// ...
RegT cmp(gpuDynInst, extData.DATA + CmpRegOffset);
// ...
if constexpr (CmpRegOffset) {
    // ...
}
// ...
if (gpuDynInst->exec_mask[lane]) {
    if constexpr (CmpRegOffset) {
        (reinterpret_cast<LaneT*>(
            gpuDynInst->x_data))[lane] = data[lane];
        // ...
        (reinterpret_cast<LaneT*>(
            gpuDynInst->a_data))[lane] = cmp[lane];
    // ...
    (reinterpret_cast<LaneT*>(gpuDynInst->a_data))[lane]
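// Interpretation of the staging above: for compare-and-swap
// (CmpRegOffset != 0) the swap data lands in x_data and the compare value
// in a_data, while plain atomics stage their single operand in a_data.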
// Inst_FLAT load completion: copy returned data into the destination VGPRs.
template<typename RegT, typename LaneT>
// ...
RegT vdst(gpuDynInst, extData.VDST);
// ...
if (gpuDynInst->exec_mask[lane]) {
    vdst[lane] = (reinterpret_cast<LaneT*>(
        gpuDynInst->d_data))[lane];
// Inst_FLAT::swizzleData: gather the lane data through a staging array,
// then write it back in the swizzled order.
static_assert(N > 1);
// ...
for (int dword = 0; dword < N; ++dword) {
    /* ... = */ (reinterpret_cast<VecElemU32*>(
        gpuDynInst->d_data))[lane * N + dword];
}
// ...
for (int dword = 0; dword < N; ++dword) {
    (reinterpret_cast<VecElemU32*>(
        gpuDynInst->d_data))[lane * N + dword] =
            data[lane * N + dword];
}
// Inst_FLAT::ldsComplete: mirror of the MUBUF LDS-return path above; the
// LDS offset again comes from M0.
template<int NumDwords, int SignBit = 0>
// ...
Wavefront *wf = gpuDynInst->wavefront();
// ...
uint32_t m0_offset = bits(lds_offset.rawData(), 17, 2);
uint32_t lds_addr = m0_offset * 4 + inst_offset;
// ...
if (gpuDynInst->exec_mask[lane]) {
    uint32_t chunk_addr =
        lds_addr + lane * NumDwords * sizeof(VecElemU32);
    // ...
    for (int i = 0; i < NumDwords; ++i) {
        /* ... = */ (reinterpret_cast<VecElemU32*>(
            gpuDynInst->d_data))[lane * NumDwords + i];
        if constexpr (SignBit != 0) {
return (extData.SADDR != 0x7f);
if (gpuDynInst->exec_mask[lane]) {
    // ...
    gpuDynInst->addr.at(lane) = /* ... */;
}

if (gpuDynInst->exec_mask[lane]) {
    gpuDynInst->addr.at(lane) = addr[lane] + offset;
// Inst_FLAT::swizzleAddr: scratch memory interleaves each thread's dwords
// across the 64 lanes of a wave.
VecElemI32
swizzleAddr(VecElemI32 offset, int tid)
{
    return ((offset / 4) * 4 * 64) + (offset % 4) + (tid * 4);
}

Addr
readFlatScratch(GPUDynInstPtr gpuDynInst)
{
    return gpuDynInst->computeUnit()->shader->getScratchBase();
}
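// Worked example (hypothetical values): scratch interleaves each thread's
// dwords across the 64 lanes of a wave, 4 bytes per lane column. For
// offset = 5 and tid = 2:
//   ((5 / 4) * 4 * 64) + (5 % 4) + (2 * 4) = 256 + 1 + 8 = 265
// i.e. byte 1 of the thread's second dword, in lane 2's column.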