#ifndef __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__
#define __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__

#include "debug/GCN3.hh"
#include "debug/GPUExec.hh"
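        // Scalar (SMEM) loads and stores both funnel through
        // initMemReqScalarHelper; the read and write variants differ only
        // in the MemCmd they pass (ReadReq vs. WriteReq).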
        initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,
                                                MemCmd::ReadReq);

        initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,
                                                MemCmd::WriteReq);
        gpu_dyn_inst->scalarAddr = vaddr;
        std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
                    sizeof(BufferRsrcDescriptor));
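        // The offset is clamped when it would run past the end of the
        // buffer: with a zero stride, any offset at or beyond numRecords
        // clamps to numRecords; with a non-zero stride, anything beyond
        // stride * numRecords clamps to stride * numRecords.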
        if (!rsrc_desc.stride && offset >= rsrc_desc.numRecords) {
            clamped_offset = rsrc_desc.numRecords;
        } else if (rsrc_desc.stride && offset
                   > (rsrc_desc.stride * rsrc_desc.numRecords)) {
            clamped_offset = (rsrc_desc.stride * rsrc_desc.numRecords);
        }
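        // Clearing the two low-order bits forces the scalar address to
        // dword alignment before it reaches the memory pipeline.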
        Addr vaddr = ((rsrc_desc.baseAddr + clamped_offset) & ~0x3);
        gpu_dyn_inst->scalarAddr = vaddr;
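            // DS (LDS) reads: for every lane whose exec_mask bit is set,
            // the value at the lane's LDS address plus the instruction
            // offset is loaded into that lane's slot of d_data (the
            // multi-dword variant loads N consecutive dwords per lane).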
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr = gpuDynInst->addr[lane] + offset;

                (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
                    = wf->ldsChunk->read<T>(vaddr);
            }
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr = gpuDynInst->addr[lane] + offset;
                for (int i = 0; i < N; ++i) {
                    (reinterpret_cast<VecElemU32*>(
                        gpuDynInst->d_data))[lane * N + i]
                        = wf->ldsChunk->read<VecElemU32>(
                            vaddr + i * sizeof(VecElemU32));
                }
            }
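            // Dual DS reads (the ds_read2 family): two independent offsets
            // per lane, with the two results placed in adjacent d_data
            // slots.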
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
                Addr vaddr1 = gpuDynInst->addr[lane] + offset1;

                (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2]
                    = wf->ldsChunk->read<T>(vaddr0);
                (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2 + 1]
                    = wf->ldsChunk->read<T>(vaddr1);
            }
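            // DS writes mirror the reads: each active lane writes its
            // d_data value (or N dwords, or a pair of values for the
            // ds_write2 family) back to LDS at its address plus the
            // offset(s).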
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr = gpuDynInst->addr[lane] + offset;
                wf->ldsChunk->write<T>(vaddr,
                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
            }
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr = gpuDynInst->addr[lane] + offset;
                for (int i = 0; i < N; ++i) {
                    wf->ldsChunk->write<VecElemU32>(
                        vaddr + i * sizeof(VecElemU32),
                        (reinterpret_cast<VecElemU32*>(
                            gpuDynInst->d_data))[lane * N + i]);
                }
            }
            if (gpuDynInst->exec_mask[lane]) {
                Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
                Addr vaddr1 = gpuDynInst->addr[lane] + offset1;
                wf->ldsChunk->write<T>(vaddr0,
                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2]);
                wf->ldsChunk->write<T>(vaddr1,
                    (reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane * 2 + 1]);
            }
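        // DS address calculation: the per-lane LDS address is simply the
        // lane's vector-register value cast to Addr.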
                gpuDynInst->addr.at(lane) = (Addr)addr[lane];
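        // Buffer (MUBUF) requests temporarily clear the exec_mask bits of
        // lanes that calcAddr flagged as out of bounds (oobMask), so no
        // memory request is issued for those lanes, then restore the
        // original mask.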
        VectorMask old_exec_mask = gpuDynInst->exec_mask;
        gpuDynInst->exec_mask &= ~oobMask;
        initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
        gpuDynInst->exec_mask = old_exec_mask;
        VectorMask old_exec_mask = gpuDynInst->exec_mask;
        gpuDynInst->exec_mask &= ~oobMask;
        initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
        gpuDynInst->exec_mask = old_exec_mask;
        VectorMask old_exec_mask = gpuDynInst->exec_mask;
        gpuDynInst->exec_mask &= ~oobMask;
        initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
        gpuDynInst->exec_mask = old_exec_mask;
        VectorMask old_exec_mask = gpuDynInst->exec_mask;
        gpuDynInst->exec_mask &= ~oobMask;
        initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
        gpuDynInst->exec_mask = old_exec_mask;
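        // Memory fence: a zero-sized request tagged with the compute
        // unit's ID and the wavefront's dynamic ID is built, flagged, and
        // handed back to the compute unit to inject a global memory fence.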
        gpuDynInst->resetEntireStatusVector();
        gpuDynInst->setStatusVector(0, 1);
        RequestPtr req = std::make_shared<Request>(0, 0, 0,
                                                   gpuDynInst->computeUnit()->
                                                   masterId(), 0,
                                                   gpuDynInst->wfDynId);
        gpuDynInst->setRequestFlags(req);
        gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false,
                                                        req);
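    // MUBUF address calculation: combines the vector offset and index
    // operands, the scalar buffer resource descriptor, the scalar offset,
    // and the instruction's immediate offset into a per-lane address.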
    template<typename VOFF, typename VIDX, typename SRSRC, typename SOFF>
    void
    calcAddr(GPUDynInstPtr gpuDynInst, VOFF v_off, VIDX v_idx,
             SRSRC s_rsrc_desc, SOFF s_offset, int inst_offset)
    {
        std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
                    sizeof(BufferRsrcDescriptor));

        base_addr = rsrc_desc.baseAddr;
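        // When addTidEn is set, the dataFmt field supplies the upper bits
        // of the record stride (shifted up by 14); otherwise the stride
        // field is used as-is.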
        stride = rsrc_desc.addTidEn ? ((rsrc_desc.dataFmt << 14)
            + rsrc_desc.stride) : rsrc_desc.stride;
            if (gpuDynInst->exec_mask[lane]) {
                vaddr = base_addr + s_offset.rawData();
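                // The buffer index comes from the vector index operand
                // (plus the lane ID when addTidEn is set); the buffer
                // offset is the vector offset plus the instruction's
                // immediate offset.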
                buf_idx = v_idx[lane] + (rsrc_desc.addTidEn ? lane : 0);

                buf_off = v_off[lane] + inst_offset;
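                // Range checking: lanes whose computed offset or index runs
                // past the buffer described by the resource descriptor are
                // recorded in oobMask and skipped (out-of-range loads
                // return 0 and out-of-range stores are ignored).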
                if (rsrc_desc.stride == 0 || !rsrc_desc.swizzleEn) {
                    if (buf_off + stride * buf_idx >=
                        rsrc_desc.numRecords - s_offset.rawData()) {
                        DPRINTF(GCN3, "mubuf out-of-bounds condition 1: "
                                "lane = %d, buffer_offset = %llx, "
                                "const_stride = %llx, "
                                "const_num_records = %llx\n",
                                lane, buf_off + stride * buf_idx,
                                rsrc_desc.stride, rsrc_desc.numRecords);
                        oobMask.set(lane);
                        continue;
                    }
                }
                if (rsrc_desc.stride != 0 && rsrc_desc.swizzleEn) {
                    if (buf_idx >= rsrc_desc.numRecords ||
                        buf_off >= stride) {
                        DPRINTF(GCN3, "mubuf out-of-bounds condition 2: "
                                "lane = %d, offset = %llx, "
                                "index = %llx, "
                                "const_num_records = %llx\n",
                                lane, buf_off, buf_idx,
                                rsrc_desc.numRecords);
                        oobMask.set(lane);
                        continue;
                    }
                }
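                // Swizzled buffers interleave records: the index and offset
                // are each split into MSB/LSB parts around idx_stride and
                // elem_size and recombined according to the buffer resource
                // descriptor's swizzle equation.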
                if (rsrc_desc.swizzleEn) {
                    Addr idx_stride = 8 << rsrc_desc.idxStride;
                    Addr elem_size = 2 << rsrc_desc.elemSize;
                    Addr idx_msb = buf_idx / idx_stride;
                    Addr idx_lsb = buf_idx % idx_stride;
                    Addr off_msb = buf_off / elem_size;
                    Addr off_lsb = buf_off % elem_size;
                    DPRINTF(GCN3, "mubuf swizzled lane %d: "
                            "idx_stride = %llx, elem_size = %llx, "
                            "idx_msb = %llx, idx_lsb = %llx, "
                            "off_msb = %llx, off_lsb = %llx\n",
                            lane, idx_stride, elem_size, idx_msb, idx_lsb,
                            off_msb, off_lsb);

                    vaddr += ((idx_msb * stride + off_msb * elem_size)
                        * idx_stride + idx_lsb * elem_size + off_lsb);
                } else {
                    vaddr += buf_off + stride * buf_idx;
                }
                DPRINTF(GCN3, "Calculating mubuf address for lane %d: "
                        "vaddr = %llx, base_addr = %llx, "
                        "stride = %llx, buf_idx = %llx, buf_off = %llx\n",
                        lane, vaddr, base_addr, stride, buf_idx, buf_off);

                gpuDynInst->addr.at(lane) = vaddr;
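    // FLAT accesses are resolved at execute time: an access that executed
    // as a global access goes through the normal vector memory-request
    // helpers, while one that executed as a group (LDS) access is serviced
    // directly from the wavefront's LDS chunk, lane by lane.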
        if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
            Wavefront *wf = gpuDynInst->wavefront();
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];
                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
                        = wf->ldsChunk->read<T>(vaddr);
                }
            }
        }
        if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
            Wavefront *wf = gpuDynInst->wavefront();
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];
                    for (int i = 0; i < N; ++i) {
                        (reinterpret_cast<VecElemU32*>(
                            gpuDynInst->d_data))[lane * N + i]
                            = wf->ldsChunk->read<VecElemU32>(
                                vaddr + i * sizeof(VecElemU32));
                    }
                }
            }
        }
        if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
            Wavefront *wf = gpuDynInst->wavefront();
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];
                    wf->ldsChunk->write<T>(vaddr,
                        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
                }
            }
        }
        if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
            Wavefront *wf = gpuDynInst->wavefront();
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];
                    for (int i = 0; i < N; ++i) {
                        wf->ldsChunk->write<VecElemU32>(
                            vaddr + i * sizeof(VecElemU32),
                            (reinterpret_cast<VecElemU32*>(
                                gpuDynInst->d_data))[lane * N + i]);
                    }
                }
            }
        }
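    // FLAT atomics that execute in the group segment are emulated in LDS:
    // an atomic-op functor built from the instruction's a_data/x_data
    // operands is applied to the value read from LDS, the result is
    // written back, and a copy is placed in d_data.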
        if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
            initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
            Wavefront *wf = gpuDynInst->wavefront();
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];
                    AtomicOpFunctor *amo_op =
                        gpuDynInst->makeAtomicOpFunctor<T>(
                            &(reinterpret_cast<T*>(
                                gpuDynInst->a_data))[lane],
                            &(reinterpret_cast<T*>(
                                gpuDynInst->x_data))[lane]).get();

                    T tmp = wf->ldsChunk->read<T>(vaddr);
                    (*amo_op)(reinterpret_cast<uint8_t *>(&tmp));
                    wf->ldsChunk->write<T>(vaddr, tmp);
                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane] = tmp;
                }
            }
        }
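    // FLAT address calculation: copy each active lane's 64-bit address and
    // then let resolveFlatSegment() decide whether the access targets the
    // global or the group (LDS) segment.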
            if (gpuDynInst->exec_mask[lane]) {
                gpuDynInst->addr.at(lane) = addr[lane];
            }
        }

        gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
#endif // __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__