#ifndef __ARCH_VEGA_INSTS_OP_ENCODINGS_HH__
#define __ARCH_VEGA_INSTS_OP_ENCODINGS_HH__

#include "debug/GPUExec.hh"
#include "debug/VEGA.hh"
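// Op-encoding helper classes for the VEGA GPU ISA: scalar (SMEM) address
// setup, VOP2 SDWA/DPP source handling, and per-lane initiation of LDS,
// buffer (MUBUF), and flat memory accesses.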
            initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,
            initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,
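            // Scalar (SMEM) accesses funnel through initMemReqScalarHelper,
            // templated on the scalar register type and the access width in
            // dwords (N); one call initiates the read path and one the write
            // path. The address calculations below then set the instruction's
            // scalarAddr, either directly or through a buffer resource
            // descriptor.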
            gpu_dyn_inst->scalarAddr = vaddr;
            std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
                        sizeof(rsrc_desc));
            if (!rsrc_desc.stride && offset >= rsrc_desc.numRecords) {
                clamped_offset = rsrc_desc.numRecords;
            } else if (rsrc_desc.stride && offset
                       > (rsrc_desc.stride * rsrc_desc.numRecords)) {
                clamped_offset = (rsrc_desc.stride * rsrc_desc.numRecords);
            }
            Addr vaddr = ((rsrc_desc.baseAddr + clamped_offset) & ~0x3);
            gpu_dyn_inst->scalarAddr = vaddr;
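            // The base address comes from the buffer resource descriptor,
            // the offset is clamped against the descriptor's extent, and the
            // result is aligned down to a dword. For example (assumed
            // values): baseAddr = 0x1000, stride = 0, numRecords = 0x100 and
            // offset = 0x1ff clamp the offset to 0x100 and give
            // vaddr = 0x1100.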
            origSrc0_sdwa.read();

            DPRINTF(VEGA, "Handling %s SRC SDWA. SRC0: register v[%d], "
                "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, SRC0_SEXT: "
                "%d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, SRC1_SEXT: %d, "
                "SRC1_NEG: %d, SRC1_ABS: %d\n",
                origVdst[lane] = vdst[lane];
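            // DPP (data parallel primitives) permutes src0 across lanes
            // before the ALU operation: DPP_CTRL encodes the cross-lane
            // pattern, while the bank/row masks and bound control (BC)
            // determine which lanes participate.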
            DPRINTF(VEGA, "Handling %s SRC DPP. SRC0: register v[%d], "
                "DPP_CTRL: %#x, SRC0_ABS: %d, SRC0_NEG: %d, SRC1_ABS: %d, "
                "SRC1_NEG: %d, BC: %d, BANK_MASK: %d, ROW_MASK: %d\n",
                fOpImpl(src0_sdwa, src1, vdst, wf);

                T src0_dpp = dppHelper(gpuDynInst, src1);
                fOpImpl(src0_dpp, src1, vdst, wf);

                fOpImpl(src0, src1, vdst, wf);
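        // Per-lane memory access initiation: data is staged in the
        // instruction's d_data buffer, and only lanes whose exec_mask bit is
        // set participate.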
                if (gpuDynInst->exec_mask[lane]) {
                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
                if (gpuDynInst->exec_mask[lane]) {
                    for (int i = 0; i < N; ++i) {
                            gpuDynInst->d_data))[lane * N + i]
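        // Dual accesses (the read2/write2 style helpers below) take two
        // independent offsets per lane and pack the two values at
        // d_data[lane * 2] and d_data[lane * 2 + 1].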
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
                    Addr vaddr1 = gpuDynInst->addr[lane] + offset1;

                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2]

                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2 + 1]
                if (gpuDynInst->exec_mask[lane]) {
                        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
                if (gpuDynInst->exec_mask[lane]) {
                    for (int i = 0; i < N; ++i) {
                            gpuDynInst->d_data))[lane * N + i]);
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
                    Addr vaddr1 = gpuDynInst->addr[lane] + offset1;

                        gpuDynInst->d_data))[lane * 2]);

                        gpuDynInst->d_data))[lane * 2 + 1]);
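        // Atomic access initiation: a per-lane AtomicOpFunctor is created
        // over the a_data and x_data operand buffers for each active lane,
        // and the lane's result is staged in d_data.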
                if (gpuDynInst->exec_mask[lane]) {
                        gpuDynInst->makeAtomicOpFunctor<T>(
                            &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
                            &(reinterpret_cast<T*>(gpuDynInst->x_data))[lane]);

                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
                gpuDynInst->addr.at(lane) = (Addr)addr[lane];
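        // Buffer (MUBUF) accesses temporarily clear the exec_mask bits of
        // lanes flagged out-of-bounds in oobMask, initiate the access, and
        // then restore the original mask.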
            VectorMask old_exec_mask = gpuDynInst->exec_mask;
            gpuDynInst->exec_mask &= ~oobMask;
            gpuDynInst->exec_mask = old_exec_mask;

            VectorMask old_exec_mask = gpuDynInst->exec_mask;
            gpuDynInst->exec_mask &= ~oobMask;
            gpuDynInst->exec_mask = old_exec_mask;

            VectorMask old_exec_mask = gpuDynInst->exec_mask;
            gpuDynInst->exec_mask &= ~oobMask;
            gpuDynInst->exec_mask = old_exec_mask;

            VectorMask old_exec_mask = gpuDynInst->exec_mask;
            gpuDynInst->exec_mask &= ~oobMask;
            gpuDynInst->exec_mask = old_exec_mask;
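        // Global memory fence injection: the status vector is reset to a
        // single outstanding entry, and a zero-size request tagged with the
        // wavefront's dynamic ID is flagged and handed to the compute unit.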
            gpuDynInst->resetEntireStatusVector();
            gpuDynInst->setStatusVector(0, 1);
            RequestPtr req = std::make_shared<Request>(0, 0, 0,
                                       gpuDynInst->computeUnit()->
                                       requestorId(), 0,
                                       gpuDynInst->wfDynId);
            gpuDynInst->setRequestFlags(req);
            gpuDynInst->computeUnit()->
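        // Buffer address calculation: for every active lane the final
        // address is base_addr from the buffer resource descriptor, plus the
        // scalar offset, plus a per-lane buffer_offset derived from the
        // lane's index and offset operands. Swizzled descriptors interleave
        // index and offset bits, and lanes that fall outside the
        // descriptor's extent are the ones masked off via oobMask above.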
        template<typename VOFF, typename VIDX, typename SRSRC, typename SOFF>
                 SRSRC s_rsrc_desc, SOFF s_offset, int inst_offset)
            Addr buffer_offset = 0;

            std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
                        sizeof(rsrc_desc));

            base_addr = rsrc_desc.baseAddr;

            stride = rsrc_desc.addTidEn ? ((rsrc_desc.dataFmt << 14)
                + rsrc_desc.stride) : rsrc_desc.stride;
                if (gpuDynInst->exec_mask[lane]) {
                    vaddr = base_addr + s_offset.rawData();

                    buf_idx = v_idx[lane] + (rsrc_desc.addTidEn ? lane : 0);

                    buf_off = v_off[lane] + inst_offset;
                    if (rsrc_desc.swizzleEn) {
                        Addr idx_stride = 8 << rsrc_desc.idxStride;
                        Addr elem_size = 2 << rsrc_desc.elemSize;
                        Addr idx_msb = buf_idx / idx_stride;
                        Addr idx_lsb = buf_idx % idx_stride;
                        Addr off_msb = buf_off / elem_size;
                        Addr off_lsb = buf_off % elem_size;
                        DPRINTF(VEGA, "mubuf swizzled lane %d: "
                                "idx_stride = %llx, elem_size = %llx, "
                                "idx_msb = %llx, idx_lsb = %llx, "
                                "off_msb = %llx, off_lsb = %llx\n",
                                lane, idx_stride, elem_size, idx_msb, idx_lsb,
                                off_msb, off_lsb);

                        buffer_offset = (idx_msb * stride + off_msb * elem_size)
                            * idx_stride + idx_lsb * elem_size + off_lsb;
                    } else {
                        buffer_offset = buf_off + stride * buf_idx;
                    }
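                    // Worked example for the swizzled case (assumed values):
                    // with idx_stride = 8, elem_size = 4, stride = 16,
                    // buf_idx = 10 and buf_off = 6, we get idx_msb = 1,
                    // idx_lsb = 2, off_msb = 1, off_lsb = 2, so
                    // buffer_offset = (1*16 + 1*4)*8 + 2*4 + 2 = 170,
                    // versus 6 + 16*10 = 166 for the unswizzled form.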
                    if (rsrc_desc.stride == 0 || !rsrc_desc.swizzleEn) {
                            rsrc_desc.numRecords - s_offset.rawData()) {
                            DPRINTF(VEGA, "mubuf out-of-bounds condition 1: "
                                    "lane = %d, buffer_offset = %llx, "
                                    "const_stride = %llx, "
                                    "const_num_records = %llx\n",
                                    lane, buf_off + stride * buf_idx,
                                    stride, rsrc_desc.numRecords);
                    if (rsrc_desc.stride != 0 && rsrc_desc.swizzleEn) {
                        if (buf_idx >= rsrc_desc.numRecords ||
                            DPRINTF(VEGA, "mubuf out-of-bounds condition 2: "
                                    "lane = %d, offset = %llx, "
                                    "const_num_records = %llx\n",
                                    lane, buf_off, buf_idx,
                                    rsrc_desc.numRecords);
                    vaddr += buffer_offset;

                    DPRINTF(VEGA, "Calculating mubuf address for lane %d: "
                            "vaddr = %llx, base_addr = %llx, "
                            "stride = %llx, buf_idx = %llx, buf_off = %llx\n",
                            lane, vaddr, base_addr, stride, buf_idx, buf_off);

                    gpuDynInst->addr.at(lane) = vaddr;
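        // Flat accesses dispatch on the segment the address resolved to:
        // SC_GLOBAL goes through the global memory request path, while
        // SC_GROUP moves data per lane between d_data and the wavefront's
        // LDS in the loops below.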
            if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
                    if (gpuDynInst->exec_mask[lane]) {
                        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
            if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
                    if (gpuDynInst->exec_mask[lane]) {
                        for (int i = 0; i < N; ++i) {
                                gpuDynInst->d_data))[lane * N + i]
            if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
                    if (gpuDynInst->exec_mask[lane]) {
                            (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
            if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
                    if (gpuDynInst->exec_mask[lane]) {
                        for (int i = 0; i < N; ++i) {
                                gpuDynInst->d_data))[lane * N + i]);
            if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
                Wavefront *wf = gpuDynInst->wavefront();
                    if (gpuDynInst->exec_mask[lane]) {
                            gpuDynInst->makeAtomicOpFunctor<T>(
                                &(reinterpret_cast<T*>(
                                    gpuDynInst->a_data))[lane],
                                &(reinterpret_cast<T*>(
                                    gpuDynInst->x_data))[lane]);

                        (*amo_op)(reinterpret_cast<uint8_t *>(&tmp));

                        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane] = tmp;
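        // Request issue: resolveFlatSegment decides whether the flat address
        // targets global memory or the LDS, and the request is handed to the
        // corresponding memory pipeline; any other segment is fatal.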
            gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);

            gpuDynInst->staticInstruction()->executed_as =

            if ((gpuDynInst->executedAs() == enums::SC_GLOBAL && isFlat())
                gpuDynInst->computeUnit()->globalMemoryPipe
                    .issueRequest(gpuDynInst);
            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
                gpuDynInst->computeUnit()->localMemoryPipe
                    .issueRequest(gpuDynInst);
            } else {
                fatal("Unsupported scope for flat instruction.\n");
                if (gpuDynInst->exec_mask[lane]) {
                    gpuDynInst->addr.at(lane) =
                if (gpuDynInst->exec_mask[lane]) {
                    gpuDynInst->addr.at(lane) = addr[lane] + offset;
#endif // __ARCH_VEGA_INSTS_OP_ENCODINGS_HH__