#ifndef __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__
#define __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__

#include "debug/GCN3.hh"
#include "debug/GPUExec.hh"
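
// Scalar (SMEM) accesses funnel through initMemReqScalarHelper, templated on
// the register type and the number of dwords (N) per request.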
initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,

initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,
gpu_dyn_inst->scalarAddr = vaddr;
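
// A second calcAddr variant takes a buffer resource descriptor operand; its
// raw bits are copied into rsrc_desc before the scalar address is formed.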
std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
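
// The offset is clamped when it would run past the buffer:
//   - stride == 0 and offset >= numRecords
//   - stride != 0 and offset > stride * numRecords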
if (!rsrc_desc.stride && offset >= rsrc_desc.numRecords) {
    clamped_offset = rsrc_desc.numRecords;
} else if (rsrc_desc.stride &&
           offset > (rsrc_desc.stride * rsrc_desc.numRecords)) {
    clamped_offset = (rsrc_desc.stride * rsrc_desc.numRecords);
}
Addr vaddr = ((rsrc_desc.baseAddr + clamped_offset) & ~0x3);
gpu_dyn_inst->scalarAddr = vaddr;
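
// The DS (LDS) helpers below work per lane: only lanes whose exec_mask bit
// is set touch local memory.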
if (gpuDynInst->exec_mask[lane]) {
    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
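
// Multi-dword variant: each active lane transfers N consecutive dwords,
// packed into d_data at [lane * N + i].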
if (gpuDynInst->exec_mask[lane]) {
    for (int i = 0; i < N; ++i) {
        gpuDynInst->d_data))[lane * N + i]
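
// Dual-access variant: offset0 and offset1 give two addresses per lane, with
// the results placed in d_data[lane * 2] and d_data[lane * 2 + 1].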
if (gpuDynInst->exec_mask[lane]) {
    Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
    Addr vaddr1 = gpuDynInst->addr[lane] + offset1;

    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2]
    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2 + 1]
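
// The write helpers mirror the reads: each active lane's d_data element(s)
// are written back out to its local memory address(es).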
if (gpuDynInst->exec_mask[lane]) {
    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
if (gpuDynInst->exec_mask[lane]) {
    for (int i = 0; i < N; ++i) {
        gpuDynInst->d_data))[lane * N + i]);
if (gpuDynInst->exec_mask[lane]) {
    Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
    Addr vaddr1 = gpuDynInst->addr[lane] + offset1;
    gpuDynInst->d_data))[lane * 2]);
    gpuDynInst->d_data))[lane * 2 + 1]);
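
// DS address calculation: each active lane's address operand is cast to Addr
// and stored in the per-lane address array.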
gpuDynInst->addr.at(lane) = (Addr)addr[lane];
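
// The MUBUF read/write helpers all follow the same pattern: save the exec
// mask, clear the lanes flagged in oobMask so no request is issued for
// out-of-bounds accesses, then restore the original mask.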
VectorMask old_exec_mask = gpuDynInst->exec_mask;
gpuDynInst->exec_mask &= ~oobMask;
gpuDynInst->exec_mask = old_exec_mask;

VectorMask old_exec_mask = gpuDynInst->exec_mask;
gpuDynInst->exec_mask &= ~oobMask;
gpuDynInst->exec_mask = old_exec_mask;

VectorMask old_exec_mask = gpuDynInst->exec_mask;
gpuDynInst->exec_mask &= ~oobMask;
gpuDynInst->exec_mask = old_exec_mask;

VectorMask old_exec_mask = gpuDynInst->exec_mask;
gpuDynInst->exec_mask &= ~oobMask;
gpuDynInst->exec_mask = old_exec_mask;
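
// Global memory fence injection: a request is built, tagged with the
// instruction's request flags, and handed to the compute unit.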
gpuDynInst->resetEntireStatusVector();
gpuDynInst->setStatusVector(0, 1);
RequestPtr req = std::make_shared<Request>(0, 0, 0,
                                           gpuDynInst->computeUnit()->
                                           gpuDynInst->wfDynId);
gpuDynInst->setRequestFlags(req);
gpuDynInst->computeUnit()->
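
// MUBUF address calculation: the per-lane address combines the descriptor's
// base address, the scalar offset, the vector offset/index operands, and the
// instruction's immediate offset.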
template<typename VOFF, typename VIDX, typename SRSRC, typename SOFF>
         SRSRC s_rsrc_desc, SOFF s_offset, int inst_offset)
Addr buffer_offset = 0;
std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
base_addr = rsrc_desc.baseAddr;
stride = rsrc_desc.addTidEn ? ((rsrc_desc.dataFmt << 14)
    + rsrc_desc.stride) : rsrc_desc.stride;
if (gpuDynInst->exec_mask[lane]) {
    vaddr = base_addr + s_offset.rawData();
    buf_idx = v_idx[lane] + (rsrc_desc.addTidEn ? lane : 0);
    buf_off = v_off[lane] + inst_offset;
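
// Swizzled addressing splits the index and offset into MSB/LSB parts using
// the descriptor's index stride and element size before recombining them
// into the buffer offset.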
if (rsrc_desc.swizzleEn) {
    Addr idx_stride = 8 << rsrc_desc.idxStride;
    Addr elem_size = 2 << rsrc_desc.elemSize;
    Addr idx_msb = buf_idx / idx_stride;
    Addr idx_lsb = buf_idx % idx_stride;
    Addr off_msb = buf_off / elem_size;
    Addr off_lsb = buf_off % elem_size;
    DPRINTF(GCN3, "mubuf swizzled lane %d: "
            "idx_stride = %llx, elem_size = %llx, "
            "idx_msb = %llx, idx_lsb = %llx, "
            "off_msb = %llx, off_lsb = %llx\n",
            lane, idx_stride, elem_size, idx_msb, idx_lsb,
    buffer_offset = (idx_msb * stride + off_msb * elem_size)
        * idx_stride + idx_lsb * elem_size + off_lsb;
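
    // Illustrative example with hypothetical values (not taken from the
    // source): idxStride = 0 and elemSize = 1 give idx_stride = 8 and
    // elem_size = 4; with stride = 16, buf_idx = 10, buf_off = 6 we get
    // idx_msb = 1, idx_lsb = 2, off_msb = 1, off_lsb = 2, so
    // buffer_offset = (1 * 16 + 1 * 4) * 8 + 2 * 4 + 2 = 170.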
} else {
    buffer_offset = buf_off + stride * buf_idx;
}
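
// Out-of-bounds checks: condition 1 applies to zero-stride or unswizzled
// buffers, condition 2 to swizzled buffers with a non-zero stride; offending
// lanes are recorded in oobMask.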
if (rsrc_desc.stride == 0 || !rsrc_desc.swizzleEn) {
        rsrc_desc.numRecords - s_offset.rawData()) {
        DPRINTF(GCN3, "mubuf out-of-bounds condition 1: "
                "lane = %d, buffer_offset = %llx, "
                "const_stride = %llx, "
                "const_num_records = %llx\n",
                lane, buf_off + stride * buf_idx,
                rsrc_desc.stride, rsrc_desc.numRecords);
if (rsrc_desc.stride != 0 && rsrc_desc.swizzleEn) {
    if (buf_idx >= rsrc_desc.numRecords ||
        DPRINTF(GCN3, "mubuf out-of-bounds condition 2: "
                "lane = %d, offset = %llx, "
                "const_num_records = %llx\n",
                lane, buf_off, buf_idx,
                rsrc_desc.numRecords);
vaddr += buffer_offset;

DPRINTF(GCN3, "Calculating mubuf address for lane %d: "
        "vaddr = %llx, base_addr = %llx, "
        "stride = %llx, buf_idx = %llx, buf_off = %llx\n",

gpuDynInst->addr.at(lane) = vaddr;
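
// FLAT accesses are steered by the resolved segment: SC_GLOBAL requests go
// to the memory system, while SC_GROUP requests are serviced per lane from
// the wavefront's LDS.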
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
    if (gpuDynInst->exec_mask[lane]) {
        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
    if (gpuDynInst->exec_mask[lane]) {
        for (int i = 0; i < N; ++i) {
            gpuDynInst->d_data))[lane * N + i]
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
    if (gpuDynInst->exec_mask[lane]) {
        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
    if (gpuDynInst->exec_mask[lane]) {
        for (int i = 0; i < N; ++i) {
            gpuDynInst->d_data))[lane * N + i]);
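
// FLAT atomics on LDS build an atomic-op functor from the lane's a_data and
// x_data operands, apply it in place to a temporary value, and store the
// updated temporary back to the lane's d_data slot.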
if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
} else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
    if (gpuDynInst->exec_mask[lane]) {
        gpuDynInst->makeAtomicOpFunctor<T>(
            &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
            &(reinterpret_cast<T*>(gpuDynInst->x_data))[lane]);

        (*amo_op)(reinterpret_cast<uint8_t *>(&tmp));
        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane] = tmp;
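
// FLAT address calculation: copy each active lane's address, then resolve
// which memory segment the access targets.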
if (gpuDynInst->exec_mask[lane]) {
    gpuDynInst->addr.at(lane) = addr[lane];

gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
#endif // __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__