#ifndef __ARCH_HSAIL_INSTS_MEM_HH__
#define __ARCH_HSAIL_INSTS_MEM_HH__

#include <type_traits>

    class MemInst
    {
      public:
        // derive the access size in bytes from the memory data type
        MemInst(Enums::MemType m_type)
        {
            if (m_type == Enums::M_U64 ||
                m_type == Enums::M_S64 ||
                m_type == Enums::M_F64) {
                size = 8;
            } else if (m_type == Enums::M_U32 ||
                       m_type == Enums::M_S32 ||
                       m_type == Enums::M_F32) {
                size = 4;
            } else if (m_type == Enums::M_U16 ||
                       m_type == Enums::M_S16 ||
                       m_type == Enums::M_F16) {
                size = 2;
            } else {
                size = 1;
            }
        }

        void init_addr(AddrOperandBase *_addr_operand)
        {
            addr_operand = _addr_operand;
        }

      private:
        int size;
        AddrOperandBase *addr_operand;
    };
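    // Illustrative sketch (not part of the original header): the same
    // MemType-to-size mapping written as a standalone helper, assuming the
    // Enums::MemType values used above; sub-word types are assumed to fall
    // through to a one-byte access.
    inline int
    memTypeSize(Enums::MemType t)
    {
        switch (t) {
          case Enums::M_U64: case Enums::M_S64: case Enums::M_F64:
            return 8;
          case Enums::M_U32: case Enums::M_S32: case Enums::M_F32:
            return 4;
          case Enums::M_U16: case Enums::M_S16: case Enums::M_F16:
            return 2;
          default:
            return 1;
        }
    }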
    template<typename DestOperandType, typename AddrOperandType>
    class LdaInstBase : public HsailGPUStaticInst
    {
      public:
        typename DestOperandType::DestOperand dest;
        AddrOperandType addr;

        LdaInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                    const char *_opcode)
            : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            dest.init(op_offs, obj);
            op_offs = obj->getOperandPtr(ib->operands, 1);
            addr.init(op_offs, obj);
        }

        int numSrcRegOperands() override
        { return(this->addr.isVectorRegister()); }
        int numDstRegOperands() override
        { return dest.isVectorRegister(); }
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isVectorRegister() :
                   this->addr.isVectorRegister());
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isCondRegister() :
                   this->addr.isCondRegister());
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isScalarRegister() :
                   this->addr.isScalarRegister());
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex > 0)
                return(this->addr.isVectorRegister());
            return false;
        }
        bool isDstOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return(operandIndex == 0);
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.opSize() :
                   this->addr.opSize());
        }
        int getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst)
            override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.regIndex() :
                   this->addr.regIndex());
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister())
                return 2;
            return 1;
        }
    };
    template<typename DestDataType, typename AddrOperandType>
    class LdaInst :
        public LdaInstBase<typename DestDataType::OperandType,
                           AddrOperandType>,
        public MemInst
    {
      public:
        void generateDisassembly();

        LdaInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
                const char *_opcode)
            : LdaInstBase<typename DestDataType::OperandType,
                          AddrOperandType>(ib, obj, _opcode)
        {
            init_addr(&this->addr);
        }
    };
    template<typename DataType>
    GPUStaticInst*
    decodeLda(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
        BrigRegOperandInfo regDataType = findRegDataType(op_offs, obj);

        if (regDataType.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return new LdaInst<DataType, NoRegAddrOperand>(ib, obj, "ldas");
        } else if (regDataType.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            switch (regDataType.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return new LdaInst<DataType, SRegAddrOperand>(ib, obj, "ldas");
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return new LdaInst<DataType, DRegAddrOperand>(ib, obj, "ldas");
              default:
                fatal("Bad ldas register operand type %d\n", regDataType.type);
            }
        } else {
            fatal("Bad ldas register operand kind %d\n", regDataType.kind);
        }
    }
    template<typename MemOperandType, typename DestOperandType,
             typename AddrOperandType>
    class LdInstBase : public HsailGPUStaticInst
    {
      public:
        typename DestOperandType::DestOperand dest;
        AddrOperandType addr;
        Brig::BrigSegment segment;
        Brig::BrigMemoryOrder memoryOrder;
        Brig::BrigMemoryScope memoryScope;

        LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                   const char *_opcode)
            : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            if (ib->opcode == BRIG_OPCODE_LD) {
                const BrigInstMem *ldst = (const BrigInstMem*)ib;
                segment = (BrigSegment)ldst->segment;
                memoryOrder = BRIG_MEMORY_ORDER_NONE;
                memoryScope = BRIG_MEMORY_SCOPE_NONE;
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                dest.init(op_offs, obj);
                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);
            } else {
                const BrigInstAtomic *at = (const BrigInstAtomic*)ib;
                segment = (BrigSegment)at->segment;
                memoryOrder = (BrigMemoryOrder)at->memoryOrder;
                memoryScope = (BrigMemoryScope)at->memoryScope;
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                dest.init(op_offs, obj);
                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);
            }

            switch (memoryOrder) {
              case BRIG_MEMORY_ORDER_NONE: break;
              case BRIG_MEMORY_ORDER_RELAXED: setFlag(RelaxedOrder); break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
                setFlag(AcquireRelease); break;
              default: fatal("LdInst has bad memory order type\n");
            }

            switch (memoryScope) {
              case BRIG_MEMORY_SCOPE_NONE: break;
              case BRIG_MEMORY_SCOPE_WORKITEM: setFlag(WorkitemScope); break;
              case BRIG_MEMORY_SCOPE_WORKGROUP: setFlag(WorkgroupScope); break;
              case BRIG_MEMORY_SCOPE_AGENT: setFlag(DeviceScope); break;
              case BRIG_MEMORY_SCOPE_SYSTEM: setFlag(SystemScope); break;
              default: fatal("LdInst has bad memory scope type\n");
            }

            switch (segment) {
              case BRIG_SEGMENT_GLOBAL: setFlag(GlobalSegment); break;
              case BRIG_SEGMENT_GROUP: setFlag(GroupSegment); break;
              case BRIG_SEGMENT_PRIVATE: setFlag(PrivateSegment); break;
              case BRIG_SEGMENT_READONLY: setFlag(ReadOnlySegment); break;
              case BRIG_SEGMENT_SPILL: setFlag(SpillSegment); break;
              case BRIG_SEGMENT_KERNARG: setFlag(KernArgSegment); break;
              default: panic("Ld: segment %d not supported\n", segment);
            }
        }

        int numSrcRegOperands() override
        { return(this->addr.isVectorRegister()); }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister())
                return 2;
            return 1;
        }
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isVectorRegister() :
                   this->addr.isVectorRegister());
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isCondRegister() :
                   this->addr.isCondRegister());
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isScalarRegister() :
                   this->addr.isScalarRegister());
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex > 0)
                return(this->addr.isVectorRegister());
            return false;
        }
        bool isDstOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return(operandIndex == 0);
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.opSize() :
                   this->addr.opSize());
        }
        int getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst)
            override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.regIndex() :
                   this->addr.regIndex());
        }
    };
    template<typename MemDataType, typename DestDataType,
             typename AddrOperandType>
    class LdInst :
        public LdInstBase<typename MemDataType::CType,
                          typename DestDataType::OperandType, AddrOperandType>,
        public MemInst
    {
        typename DestDataType::OperandType::DestOperand dest_vect[4];
        uint16_t num_dest_operands;
        void generateDisassembly() override;

      public:
        LdInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
               const char *_opcode)
            : LdInstBase<typename MemDataType::CType,
                         typename DestDataType::OperandType,
                         AddrOperandType>(ib, obj, _opcode),
              MemInst(MemDataType::memType)
        {
            init_addr(&this->addr);

            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);

            if (brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
                // the destination is a vector of registers (ld_v2/ld_v4)
                const Brig::BrigOperandOperandList *brigVecOp =
                    (const Brig::BrigOperandOperandList*)brigOp;
                num_dest_operands =
                    *((unsigned*)obj->getData(brigVecOp->elements)) / 4;
                assert(num_dest_operands <= 4);
            } else {
                num_dest_operands = 1;
            }

            if (num_dest_operands > 1) {
                for (int i = 0; i < num_dest_operands; ++i) {
                    dest_vect[i].init_from_vect(op_offs, obj, i);
                }
            }
        }
        void
        initiateAcc(GPUDynInstPtr gpuDynInst) override
        {
            typedef typename MemDataType::CType c0;

            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            if (num_dest_operands > 1) {
                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
                    if (gpuDynInst->exec_mask[i])
                        gpuDynInst->statusVector.push_back(num_dest_operands);
                    else
                        gpuDynInst->statusVector.push_back(0);
            }

            for (int k = 0; k < num_dest_operands; ++k) {
                c0 *d = &((c0*)gpuDynInst->d_data)
                    [k * gpuDynInst->computeUnit()->wfSize()];

                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);

                        if (this->isLocalMem()) {
                            // load from the LDS chunk of this workgroup
                            *d = gpuDynInst->wavefront()->ldsChunk->
                                read<c0>(vaddr);
                        } else {
                            RequestPtr req = std::make_shared<Request>(0,
                                vaddr, sizeof(c0), 0,
                                gpuDynInst->computeUnit()->masterId(),
                                0, gpuDynInst->wfDynId);

                            gpuDynInst->setRequestFlags(req);
                            PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
                            pkt->dataStatic(d);

                            if (gpuDynInst->computeUnit()->shader->
                                separate_acquire_release &&
                                gpuDynInst->isAcquire()) {
                                // acquire semantics: issue the acquire via a
                                // continuation once the load completes
                                gpuDynInst->execContinuation =
                                    &GPUStaticInst::execLdAcq;
                                gpuDynInst->useContinuation = true;
                            } else {
                                gpuDynInst->useContinuation = false;
                            }
                            // translation is performed in sendRequest()
                            gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
                                                                   i, pkt);
                        }
                    }
                    ++d;
                }
            }

            gpuDynInst->updateStats();
        }
        void
        completeAcc(GPUDynInstPtr gpuDynInst) override
        {
            typedef typename MemDataType::CType c1;

            constexpr bool is_vt_32 = DestDataType::vgprType == VT_32;

            /**
             * select the size (single/double precision) and type (floating
             * point/integer, signed/unsigned) of the destination register so
             * the loaded value can be cast correctly on writeback; the loaded
             * value and the destination register type may differ.
             */
            typedef typename std::conditional<is_vt_32,
                typename std::conditional<std::is_floating_point<c1>::value,
                    float, typename std::conditional<std::is_signed<c1>::value,
                    int32_t, uint32_t>::type>::type,
                typename std::conditional<std::is_floating_point<c1>::value,
                    double, typename std::conditional<std::is_signed<c1>::value,
                    int64_t, uint64_t>::type>::type>::type c0;
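            // Illustrative check (added for exposition, not in the original
            // source): c0 is always exactly as wide as the selected VGPR
            // flavor -- 4 bytes for 32-bit VGPRs, 8 bytes otherwise.
            static_assert(sizeof(c0) == (is_vt_32 ? 4 : 8),
                          "writeback type must match the VGPR width");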
            Wavefront *w = gpuDynInst->wavefront();

            std::vector<uint32_t> regVec;
            // iterate over the destination register operands of this load
            for (int k = 0; k < num_dest_operands; ++k) {
                assert((sizeof(c1) * num_dest_operands)
                       <= MAX_WIDTH_FOR_MEM_INST);

                int dst = this->dest.regIndex() + k;
                if (num_dest_operands > MAX_REGS_FOR_NON_VEC_MEM_INST)
                    dst = dest_vect[k].regIndex();
                // virtual->physical VGPR mapping
                int physVgpr = w->remap(dst, sizeof(c0), 1);
                // save the physical VGPR index
                regVec.push_back(physVgpr);

                c1 *p1 =
                    &((c1*)gpuDynInst->d_data)[k * w->computeUnit->wfSize()];

                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                                "$%s%d <- %d global ld done (src = wavefront "
                                "ld inst)\n", w->computeUnit->cu_id, w->simdId,
                                w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d",
                                dst, *p1);
                        // write the loaded value into the physical VGPR;
                        // purely functional, no timing is modeled here
                        w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
                                                                  *p1, i);
                    }
                    ++p1;
                }
            }

            // model the timing of the VRF writeback of the load data; this
            // does not modify the physical VGPRs
            int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
                vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
                                     sizeof(c0), gpuDynInst->time);

            if (this->isGlobalMem()) {
                gpuDynInst->computeUnit()->globalMemoryPipe
                    .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
            } else {
                assert(this->isLocalMem());
                gpuDynInst->computeUnit()->localMemoryPipe
                    .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
            }
        }
      private:
        void
        execLdAcq(GPUDynInstPtr gpuDynInst) override
        {
            // after the load completes, issue an acquire request if the load
            // has acquire semantics
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && gpuDynInst->isAcquire()) {
                    gpuDynInst->statusBitVector = VectorMask(1);
                    gpuDynInst->useContinuation = false;
                    RequestPtr req = std::make_shared<Request>(0, 0, 0, 0,
                        gpuDynInst->computeUnit()->masterId(),
                        0, gpuDynInst->wfDynId);
                    req->setFlags(Request::ACQUIRE);
                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst,
                                                                    false, req);
                }
            }
        }
      public:
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isVectorRegister());
            if (num_dest_operands > 1) {
                return dest_vect[operandIndex].isVectorRegister();
            } else if (num_dest_operands == 1) {
                return LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.isVectorRegister();
            }
            return false;
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isCondRegister());
            if (num_dest_operands > 1)
                return dest_vect[operandIndex].isCondRegister();
            else if (num_dest_operands == 1)
                return LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.isCondRegister();
            return false;
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isScalarRegister());
            if (num_dest_operands > 1)
                return dest_vect[operandIndex].isScalarRegister();
            else if (num_dest_operands == 1)
                return LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.isScalarRegister();
            return false;
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isVectorRegister());
            return false;
        }
        bool isDstOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return false;
            return true;
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.opSize());
            if (num_dest_operands > 1)
                return(dest_vect[operandIndex].opSize());
            else if (num_dest_operands == 1)
                return(LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.opSize());
            return 0;
        }
        int getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst)
            override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.regIndex());
            if (num_dest_operands > 1)
                return(dest_vect[operandIndex].regIndex());
            else if (num_dest_operands == 1)
                return(LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.regIndex());
            return -1;
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                return(num_dest_operands+1);
            return(num_dest_operands);
        }
    };
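    // Illustrative note (assumption, not in the original source): for a
    // vector load such as ld_v4_f32 the decoder sets num_dest_operands to 4,
    // so when the address lives in a register getNumOperands() above reports
    // 5 -- the four destination registers plus the address operand, which the
    // accessors treat as the last operand index.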
    template<typename MemDT, typename DestDT>
    GPUStaticInst*
    decodeLd2(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return new LdInst<MemDT, DestDT, NoRegAddrOperand>(ib, obj, "ld");
        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            switch (tmp.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return new LdInst<MemDT, DestDT,
                                  SRegAddrOperand>(ib, obj, "ld");
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return new LdInst<MemDT, DestDT,
                                  DRegAddrOperand>(ib, obj, "ld");
              default:
                fatal("Bad ld register operand type %d\n", tmp.regKind);
            }
        } else {
            fatal("Bad ld register operand kind %d\n", tmp.kind);
        }
    }
    template<typename MemDT>
    GPUStaticInst*
    decodeLd(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
        BrigRegOperandInfo dest = findRegDataType(op_offs, obj);

        switch (dest.regKind) {
          case Brig::BRIG_REGISTER_KIND_SINGLE:
            switch (ib->type) {
              case Brig::BRIG_TYPE_B32:
                return decodeLd2<MemDT, B32>(ib, obj);
              case Brig::BRIG_TYPE_U32:
                return decodeLd2<MemDT, U32>(ib, obj);
              case Brig::BRIG_TYPE_S32:
                return decodeLd2<MemDT, S32>(ib, obj);
              case Brig::BRIG_TYPE_F32:
                return decodeLd2<MemDT, U32>(ib, obj);
              default:
                fatal("Bad ld register operand type %d, %d\n",
                      dest.regKind, ib->type);
            }
          case Brig::BRIG_REGISTER_KIND_DOUBLE:
            switch (ib->type) {
              case Brig::BRIG_TYPE_B64:
                return decodeLd2<MemDT, B64>(ib, obj);
              case Brig::BRIG_TYPE_U64:
                return decodeLd2<MemDT, U64>(ib, obj);
              case Brig::BRIG_TYPE_S64:
                return decodeLd2<MemDT, S64>(ib, obj);
              case Brig::BRIG_TYPE_F64:
                return decodeLd2<MemDT, U64>(ib, obj);
              default:
                fatal("Bad ld register operand type %d, %d\n",
                      dest.regKind, ib->type);
            }
          default:
            fatal("Bad ld register operand type %d, %d\n", dest.regKind,
                  ib->type);
        }
    }
    template<typename MemDataType, typename SrcOperandType,
             typename AddrOperandType>
    class StInstBase : public HsailGPUStaticInst
    {
      public:
        typename SrcOperandType::SrcOperand src;
        AddrOperandType addr;
        Brig::BrigSegment segment;
        Brig::BrigMemoryScope memoryScope;
        Brig::BrigMemoryOrder memoryOrder;

        StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                   const char *_opcode)
            : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            if (ib->opcode == BRIG_OPCODE_ST) {
                const BrigInstMem *ldst = (const BrigInstMem*)ib;
                segment = (BrigSegment)ldst->segment;
                memoryOrder = BRIG_MEMORY_ORDER_NONE;
                memoryScope = BRIG_MEMORY_SCOPE_NONE;
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                src.init(op_offs, obj);
                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);
            } else {
                const BrigInstAtomic *at = (const BrigInstAtomic*)ib;
                segment = (BrigSegment)at->segment;
                memoryScope = (BrigMemoryScope)at->memoryScope;
                memoryOrder = (BrigMemoryOrder)at->memoryOrder;
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                addr.init(op_offs, obj);
                op_offs = obj->getOperandPtr(ib->operands, 1);
                src.init(op_offs, obj);
            }

            switch (memoryOrder) {
              case BRIG_MEMORY_ORDER_NONE: break;
              case BRIG_MEMORY_ORDER_RELAXED: setFlag(RelaxedOrder); break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
                setFlag(AcquireRelease); break;
              default: fatal("StInst has bad memory order type\n");
            }

            switch (memoryScope) {
              case BRIG_MEMORY_SCOPE_NONE: break;
              case BRIG_MEMORY_SCOPE_WORKITEM: setFlag(WorkitemScope); break;
              case BRIG_MEMORY_SCOPE_WORKGROUP: setFlag(WorkgroupScope); break;
              case BRIG_MEMORY_SCOPE_AGENT: setFlag(DeviceScope); break;
              case BRIG_MEMORY_SCOPE_SYSTEM: setFlag(SystemScope); break;
              default: fatal("StInst has bad memory scope type\n");
            }

            switch (segment) {
              case BRIG_SEGMENT_GLOBAL: setFlag(GlobalSegment); break;
              case BRIG_SEGMENT_GROUP: setFlag(GroupSegment); break;
              case BRIG_SEGMENT_PRIVATE: setFlag(PrivateSegment); break;
              case BRIG_SEGMENT_READONLY: setFlag(ReadOnlySegment); break;
              case BRIG_SEGMENT_SPILL: setFlag(SpillSegment); break;
              default: panic("St: segment %d not supported\n", segment);
            }
        }

        int numSrcRegOperands() override
        {
            return src.isVectorRegister() + this->addr.isVectorRegister();
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                return 2;
            return 1;
        }
        bool isVectorRegister(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.isVectorRegister() :
                   this->addr.isVectorRegister();
        }
        bool isCondRegister(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.isCondRegister() :
                   this->addr.isCondRegister();
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.isScalarRegister() :
                   this->addr.isScalarRegister();
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return true;
        }
        bool isDstOperand(int operandIndex) override { return false; }
        int getOperandSize(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.opSize() : this->addr.opSize();
        }
        int getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst)
            override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.regIndex() : this->addr.regIndex();
        }
    };
    template<typename MemDataType, typename SrcDataType,
             typename AddrOperandType>
    class StInst :
        public StInstBase<MemDataType, typename SrcDataType::OperandType,
                          AddrOperandType>,
        public MemInst
    {
      public:
        typename SrcDataType::OperandType::SrcOperand src_vect[4];
        uint16_t num_src_operands;
        void generateDisassembly() override;

        StInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
               const char *_opcode, int srcIdx)
            : StInstBase<MemDataType, typename SrcDataType::OperandType,
                         AddrOperandType>(ib, obj, _opcode),
              MemInst(SrcDataType::memType)
        {
            init_addr(&this->addr);

            unsigned op_offs = obj->getOperandPtr(ib->operands, srcIdx);
            const Brig::BrigOperand *baseOp = obj->getOperand(op_offs);

            if (baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
                // the source is a vector of registers (st_v2/st_v4)
                const Brig::BrigOperandOperandList *brigVecOp =
                    (const Brig::BrigOperandOperandList*)baseOp;
                num_src_operands =
                    *((unsigned*)obj->getData(brigVecOp->elements)) / 4;
                assert(num_src_operands <= 4);
            } else {
                num_src_operands = 1;
            }

            if (num_src_operands > 1) {
                for (int i = 0; i < num_src_operands; ++i) {
                    src_vect[i].init_from_vect(op_offs, obj, i);
                }
            }
        }
        void
        initiateAcc(GPUDynInstPtr gpuDynInst) override
        {
            // if this store has release semantics, issue the release first
            // and perform the store through a continuation
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && gpuDynInst->isRelease()) {
                    gpuDynInst->statusBitVector = VectorMask(1);
                    gpuDynInst->execContinuation = &GPUStaticInst::execSt;
                    gpuDynInst->useContinuation = true;
                    RequestPtr req = std::make_shared<Request>(0, 0, 0, 0,
                        gpuDynInst->computeUnit()->masterId(),
                        0, gpuDynInst->wfDynId);
                    req->setFlags(Request::RELEASE);
                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst,
                                                                    false, req);
                    return;
                }
            }

            // no release semantics: perform the store immediately
            execSt(gpuDynInst);
        }
      private:
        void
        execSt(GPUDynInstPtr gpuDynInst) override
        {
            typedef typename MemDataType::CType c0;

            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            if (num_src_operands > 1) {
                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
                    if (gpuDynInst->exec_mask[i])
                        gpuDynInst->statusVector.push_back(num_src_operands);
                    else
                        gpuDynInst->statusVector.push_back(0);
            }

            for (int k = 0; k < num_src_operands; ++k) {
                c0 *d = &((c0*)gpuDynInst->d_data)
                    [k * gpuDynInst->computeUnit()->wfSize()];

                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);

                        if (this->isLocalMem()) {
                            // store to the LDS chunk of this workgroup
                            gpuDynInst->wavefront()->ldsChunk->write<c0>(vaddr,
                                                                         *d);
                        } else {
                            RequestPtr req = std::make_shared<Request>(0,
                                vaddr, sizeof(c0), 0,
                                gpuDynInst->computeUnit()->masterId(),
                                0, gpuDynInst->wfDynId);

                            gpuDynInst->setRequestFlags(req);
                            PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
                            pkt->dataStatic<c0>(d);

                            // stores need no continuation; the request is
                            // finished when the store completes
                            gpuDynInst->useContinuation = false;
                            gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
                                                                   i, pkt);
                        }
                    }
                    ++d;
                }
            }

            gpuDynInst->updateStats();
        }
      public:
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.isVectorRegister();
            if (num_src_operands > 1)
                return src_vect[operandIndex].isVectorRegister();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.isVectorRegister();
            return false;
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.isCondRegister();
            if (num_src_operands > 1)
                return src_vect[operandIndex].isCondRegister();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.isCondRegister();
            return false;
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.isScalarRegister();
            if (num_src_operands > 1)
                return src_vect[operandIndex].isScalarRegister();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.isScalarRegister();
            return false;
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return true;
        }
        bool isDstOperand(int operandIndex) override { return false; }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.opSize();
            if (num_src_operands > 1)
                return src_vect[operandIndex].opSize();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.opSize();
            return 0;
        }
        int getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst)
            override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.regIndex();
            if (num_src_operands > 1)
                return src_vect[operandIndex].regIndex();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.regIndex();
            return -1;
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                return num_src_operands + 1;
            return num_src_operands;
        }
    };
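    // Illustrative note (assumption, not in the original source): stores
    // mirror the load case above -- for st_v4_f32 num_src_operands is 4 and a
    // register-held address adds one final operand, so getNumOperands()
    // reports 5.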
    template<typename DataType, typename SrcDataType>
    GPUStaticInst*
    decodeSt(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        // for the atomic forms the address is operand 0 and the source is
        // operand 1
        int srcIdx = 0;
        int destIdx = 1;
        if (ib->opcode == Brig::BRIG_OPCODE_ATOMIC ||
            ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET) {
            srcIdx = 1;
            destIdx = 0;
        }
        unsigned op_offs = obj->getOperandPtr(ib->operands, destIdx);
        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return new StInst<DataType, SrcDataType,
                              NoRegAddrOperand>(ib, obj, "st", srcIdx);
        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            switch (tmp.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return new StInst<DataType, SrcDataType,
                                  SRegAddrOperand>(ib, obj, "st", srcIdx);
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return new StInst<DataType, SrcDataType,
                                  DRegAddrOperand>(ib, obj, "st", srcIdx);
              default:
                fatal("Bad st register operand type %d\n", tmp.type);
            }
        } else {
            fatal("Bad st register operand kind %d\n", tmp.kind);
        }
    }
    template<typename OperandType, typename AddrOperandType,
             int NumSrcOperands, bool HasDst>
    class AtomicInstBase : public HsailGPUStaticInst
    {
      public:
        typename OperandType::DestOperand dest;
        typename OperandType::SrcOperand src[NumSrcOperands];
        AddrOperandType addr;
        Brig::BrigSegment segment;
        Brig::BrigMemoryOrder memoryOrder;
        Brig::BrigAtomicOperation atomicOperation;
        Brig::BrigMemoryScope memoryScope;

        AtomicInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                       const char *_opcode)
            : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            const BrigInstAtomic *at = (const BrigInstAtomic*)ib;
            segment = (BrigSegment)at->segment;
            memoryScope = (BrigMemoryScope)at->memoryScope;
            memoryOrder = (BrigMemoryOrder)at->memoryOrder;
            atomicOperation = (BrigAtomicOperation)at->atomicOperation;

            if (ib->opcode == BRIG_OPCODE_ATOMIC) {
                setFlag(AtomicReturn);
            } else {
                setFlag(AtomicNoReturn);
            }

            switch (memoryOrder) {
              case BRIG_MEMORY_ORDER_NONE: break;
              case BRIG_MEMORY_ORDER_RELAXED: setFlag(RelaxedOrder); break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
                setFlag(AcquireRelease); break;
              default: fatal("AtomicInst has bad memory order type\n");
            }

            switch (memoryScope) {
              case BRIG_MEMORY_SCOPE_NONE: break;
              case BRIG_MEMORY_SCOPE_WORKITEM: setFlag(WorkitemScope); break;
              case BRIG_MEMORY_SCOPE_WORKGROUP: setFlag(WorkgroupScope); break;
              case BRIG_MEMORY_SCOPE_AGENT: setFlag(DeviceScope); break;
              case BRIG_MEMORY_SCOPE_SYSTEM: setFlag(SystemScope); break;
              default: fatal("AtomicInst has bad memory scope type\n");
            }

            switch (atomicOperation) {
              case BRIG_ATOMIC_ADD: setFlag(AtomicAdd); break;
              case BRIG_ATOMIC_SUB: setFlag(AtomicSub); break;
              case BRIG_ATOMIC_AND: setFlag(AtomicAnd); break;
              case BRIG_ATOMIC_OR: setFlag(AtomicOr); break;
              case BRIG_ATOMIC_XOR: setFlag(AtomicXor); break;
              case BRIG_ATOMIC_CAS: setFlag(AtomicCAS); break;
              case BRIG_ATOMIC_EXCH: setFlag(AtomicExch); break;
              case BRIG_ATOMIC_WRAPINC: setFlag(AtomicInc); break;
              case BRIG_ATOMIC_WRAPDEC: setFlag(AtomicDec); break;
              case BRIG_ATOMIC_MIN: setFlag(AtomicMin); break;
              case BRIG_ATOMIC_MAX: setFlag(AtomicMax); break;
              default:
                fatal("Bad BrigAtomicOperation code %d\n", atomicOperation);
            }

            switch (segment) {
              case BRIG_SEGMENT_GLOBAL: setFlag(GlobalSegment); break;
              case BRIG_SEGMENT_GROUP: setFlag(GroupSegment); break;
              default: panic("Atomic: segment %d not supported\n", segment);
            }

            if (HasDst) {
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                dest.init(op_offs, obj);
                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);
                for (int i = 0; i < NumSrcOperands; ++i) {
                    op_offs = obj->getOperandPtr(ib->operands, i + 2);
                    src[i].init(op_offs, obj);
                }
            } else {
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                addr.init(op_offs, obj);
                for (int i = 0; i < NumSrcOperands; ++i) {
                    op_offs = obj->getOperandPtr(ib->operands, i + 1);
                    src[i].init(op_offs, obj);
                }
            }
        }
        int numSrcRegOperands() override
        {
            int operands = 0;
            for (int i = 0; i < NumSrcOperands; i++) {
                if (src[i].isVectorRegister()) {
                    operands++;
                }
            }
            if (addr.isVectorRegister())
                operands++;
            return operands;
        }
        int numDstRegOperands() override { return dest.isVectorRegister(); }
        int getNumOperands() override
        {
            if (addr.isVectorRegister())
                return(NumSrcOperands + 2);
            return(NumSrcOperands + 1);
        }
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return src[operandIndex].isVectorRegister();
            else if (operandIndex == NumSrcOperands)
                return(addr.isVectorRegister());
            else
                return dest.isVectorRegister();
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return src[operandIndex].isCondRegister();
            else if (operandIndex == NumSrcOperands)
                return(addr.isCondRegister());
            else
                return dest.isCondRegister();
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return src[operandIndex].isScalarRegister();
            else if (operandIndex == NumSrcOperands)
                return(addr.isScalarRegister());
            else
                return dest.isScalarRegister();
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return true;
            else if (operandIndex == NumSrcOperands)
                return(addr.isVectorRegister());
            else
                return false;
        }
        bool isDstOperand(int operandIndex) override
        {
            if (operandIndex <= NumSrcOperands)
                return false;
            return true;
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return(src[operandIndex].opSize());
            else if (operandIndex == NumSrcOperands)
                return(addr.opSize());
            else
                return(dest.opSize());
        }
        int getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst)
            override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return(src[operandIndex].regIndex());
            else if (operandIndex == NumSrcOperands)
                return(addr.regIndex());
            else
                return(dest.regIndex());
        }
    };
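    // Illustrative note (assumption, not in the original source): the operand
    // order exposed by the accessors above is src[0..NumSrcOperands-1], then
    // the address, then (when HasDst is true) the destination. For example, a
    // returning atomic_cas decodes with NumSrcOperands == 2, so its operands
    // are reported as src[0] (compare), src[1] (swap), addr, dest.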
    template<typename MemDataType, typename AddrOperandType,
             int NumSrcOperands, bool HasDst>
    class AtomicInst :
        public AtomicInstBase<typename MemDataType::OperandType,
                              AddrOperandType, NumSrcOperands, HasDst>,
        public MemInst
    {
      public:
        void generateDisassembly() override;

        AtomicInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
                   const char *_opcode)
            : AtomicInstBase<typename MemDataType::OperandType,
                             AddrOperandType, NumSrcOperands, HasDst>
                  (ib, obj, _opcode),
              MemInst(MemDataType::memType)
        {
            init_addr(&this->addr);
        }
        void
        initiateAcc(GPUDynInstPtr gpuDynInst) override
        {
            // if the atomic has release semantics, issue the release first
            // and perform the read-modify-write through a continuation
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && (gpuDynInst->isRelease()
                    || gpuDynInst->isAcquireRelease())) {
                    gpuDynInst->statusBitVector = VectorMask(1);
                    gpuDynInst->execContinuation = &GPUStaticInst::execAtomic;
                    gpuDynInst->useContinuation = true;
                    RequestPtr req = std::make_shared<Request>(0, 0, 0, 0,
                        gpuDynInst->computeUnit()->masterId(),
                        0, gpuDynInst->wfDynId);
                    req->setFlags(Request::RELEASE);
                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst,
                                                                    false, req);
                    return;
                }
            }

            // no release semantics: execute the atomic immediately
            execAtomic(gpuDynInst);
        }
        void
        completeAcc(GPUDynInstPtr gpuDynInst) override
        {
            // only returning atomics write a value back to the VRF
            if (this->isAtomicRet()) {
                typedef typename MemDataType::CType CType;

                Wavefront *w = gpuDynInst->wavefront();
                int dst = this->dest.regIndex();
                std::vector<uint32_t> regVec;
                // virtual->physical VGPR mapping
                int physVgpr = w->remap(dst, sizeof(CType), 1);
                regVec.push_back(physVgpr);
                CType *p1 = &((CType*)gpuDynInst->d_data)[0];

                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                                "$%s%d <- %d global ld done (src = wavefront "
                                "ld inst)\n", w->computeUnit->cu_id, w->simdId,
                                w->wfSlotId, i, sizeof(CType) == 4 ? "s" : "d",
                                dst, *p1);
                        // write the old value into the physical VGPR;
                        // purely functional, no timing is modeled
                        w->computeUnit->vrf[w->simdId]->write<CType>(physVgpr,
                                                                     *p1, i);
                    }
                    ++p1;
                }

                // model the timing of the VRF writeback; this does not
                // modify the physical VGPR
                int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
                    vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
                                         sizeof(CType), gpuDynInst->time);

                if (this->isGlobalMem()) {
                    gpuDynInst->computeUnit()->globalMemoryPipe
                       .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
                } else {
                    assert(this->isLocalMem());
                    gpuDynInst->computeUnit()->localMemoryPipe
                       .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
                }
            }
        }
        void
        execAtomic(GPUDynInstPtr gpuDynInst) override
        {
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            typedef typename MemDataType::CType c0;

            c0 *d = &((c0*) gpuDynInst->d_data)[0];
            c0 *e = &((c0*) gpuDynInst->a_data)[0];
            c0 *f = &((c0*) gpuDynInst->x_data)[0];

            for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                if (gpuDynInst->exec_mask[i]) {
                    Addr vaddr = gpuDynInst->addr[i];

                    if (this->isLocalMem()) {
                        // functional read-modify-write on the LDS chunk
                        Wavefront *wavefront = gpuDynInst->wavefront();
                        *d = wavefront->ldsChunk->read<c0>(vaddr);

                        if (this->isAtomicAdd()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) + (*e));
                        } else if (this->isAtomicSub()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) - (*e));
                        } else if (this->isAtomicMax()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                std::max(wavefront->ldsChunk->read<c0>(vaddr),
                                         (*e)));
                        } else if (this->isAtomicMin()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                std::min(wavefront->ldsChunk->read<c0>(vaddr),
                                         (*e)));
                        } else if (this->isAtomicAnd()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) & (*e));
                        } else if (this->isAtomicOr()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) | (*e));
                        } else if (this->isAtomicXor()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) ^ (*e));
                        } else if (this->isAtomicInc()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) + 1);
                        } else if (this->isAtomicDec()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) - 1);
                        } else if (this->isAtomicExch()) {
                            wavefront->ldsChunk->write<c0>(vaddr, (*e));
                        } else if (this->isAtomicCAS()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                (wavefront->ldsChunk->read<c0>(vaddr) == (*e)) ?
                                (*f) : wavefront->ldsChunk->read<c0>(vaddr));
                        } else {
                            fatal("Unrecognized or invalid HSAIL atomic op "
                                  "type.\n");
                        }
                    } else {
                        RequestPtr req =
                            std::make_shared<Request>(0, vaddr, sizeof(c0), 0,
                                gpuDynInst->computeUnit()->masterId(),
                                0, gpuDynInst->wfDynId,
                                gpuDynInst->makeAtomicOpFunctor<c0>(e, f));

                        gpuDynInst->setRequestFlags(req);
                        PacketPtr pkt = new Packet(req, MemCmd::SwapReq);
                        pkt->dataStatic(d);

                        if (gpuDynInst->computeUnit()->shader->
                            separate_acquire_release &&
                            (gpuDynInst->isAcquire())) {
                            // acquire semantics: schedule the continuation to
                            // perform the acquire after the RMW completes
                            gpuDynInst->execContinuation =
                                &GPUStaticInst::execAtomicAcq;
                            gpuDynInst->useContinuation = true;
                        } else {
                            gpuDynInst->useContinuation = false;
                        }
                        // translation is performed in sendRequest()
                        gpuDynInst->computeUnit()->sendRequest(gpuDynInst, i,
                                                               pkt);
                    }
                }

                ++d;
                ++e;
                ++f;
            }

            gpuDynInst->updateStats();
        }
        void
        execAtomicAcq(GPUDynInstPtr gpuDynInst) override
        {
            // after the RMW completes, issue an acquire request if this
            // atomic has acquire semantics
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && gpuDynInst->isAcquire()) {
                    gpuDynInst->statusBitVector = VectorMask(1);
                    gpuDynInst->useContinuation = false;
                    RequestPtr req = std::make_shared<Request>(0, 0, 0, 0,
                        gpuDynInst->computeUnit()->masterId(),
                        0, gpuDynInst->wfDynId);
                    req->setFlags(Request::ACQUIRE);
                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst,
                                                                    false, req);
                }
            }
        }
    };
    template<typename DataType, typename AddrOperandType, int NumSrcOperands>
    GPUStaticInst*
    constructAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;

        if (at->atomicOperation == Brig::BRIG_ATOMIC_LD) {
            return decodeLd<DataType>(ib, obj);
        } else if (at->atomicOperation == Brig::BRIG_ATOMIC_ST) {
            switch (ib->type) {
              case Brig::BRIG_TYPE_B8:
                return decodeSt<S8,S8>(ib, obj);
              case Brig::BRIG_TYPE_B16:
                return decodeSt<S16,S16>(ib, obj);
              case Brig::BRIG_TYPE_B32:
                return decodeSt<S32,S32>(ib, obj);
              case Brig::BRIG_TYPE_B64:
                return decodeSt<S64,S64>(ib, obj);
              default:
                fatal("AtomicSt: Operand type mismatch %d\n", ib->type);
            }
        } else {
            if ((Brig::BrigOpcode)ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET)
                return new AtomicInst<DataType, AddrOperandType,
                    NumSrcOperands, false>(ib, obj, "atomicnoret");
            else
                return new AtomicInst<DataType, AddrOperandType,
                    NumSrcOperands, true>(ib, obj, "atomic");
        }
    }
    template<typename DataType, int NumSrcOperands>
    GPUStaticInst*
    decodeAtomicHelper(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned addrIndex = (Brig::BrigOpcode)ib->opcode ==
            Brig::BRIG_OPCODE_ATOMICNORET ? 0 : 1;
        unsigned op_offs = obj->getOperandPtr(ib->operands, addrIndex);
        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return constructAtomic<DataType, NoRegAddrOperand,
                                   NumSrcOperands>(ib, obj);
        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            switch (tmp.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return constructAtomic<DataType, SRegAddrOperand,
                                       NumSrcOperands>(ib, obj);
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return constructAtomic<DataType, DRegAddrOperand,
                                       NumSrcOperands>(ib, obj);
              default:
                fatal("Bad atomic register operand type %d\n", tmp.type);
            }
        } else {
            fatal("Bad atomic register operand kind %d\n", tmp.kind);
        }
    }
    template<typename DataType>
    GPUStaticInst*
    decodeAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
        // CAS takes two source operands (compare value and swap value);
        // every other atomic op takes one
        if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS)
            return decodeAtomicHelper<DataType, 2>(ib, obj);
        else
            return decodeAtomicHelper<DataType, 1>(ib, obj);
    }

    template<typename DataType>
    GPUStaticInst*
    decodeAtomicNoRet(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
        if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS)
            return decodeAtomicHelper<DataType, 2>(ib, obj);
        else
            return decodeAtomicHelper<DataType, 1>(ib, obj);
    }
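    // Illustrative usage sketch (hypothetical caller, not part of this
    // header): the Brig decoder is assumed to pick the memory data type from
    // ib->type and then call the matching helper defined above, e.g.
    //
    //     GPUStaticInst*
    //     decodeLdByType(const Brig::BrigInstBase *ib, const BrigObject *obj)
    //     {
    //         switch (ib->type) {
    //           case Brig::BRIG_TYPE_U32: return decodeLd<U32>(ib, obj);
    //           case Brig::BRIG_TYPE_U64: return decodeLd<U64>(ib, obj);
    //           default: fatal("unsupported ld type %d\n", ib->type);
    //         }
    //     }
    //
    // The atomic entry points work the same way: decodeAtomic<DataType> for
    // the returning form and decodeAtomicNoRet<DataType> for atomicnoret.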
#endif // __ARCH_HSAIL_INSTS_MEM_HH__