#ifndef __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__
#define __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__

#include "debug/GPUExec.hh"

// ...

        // (SMEM encoding) initiate a scalar memory read access for N dwords
        template<int N>
        void
        initMemRead(GPUDynInstPtr gpuDynInst)
        {
            int block_size = gpuDynInst->computeUnit()->cacheLineSize();
            int req_size = N * sizeof(ScalarRegU32);
            Addr vaddr = gpuDynInst->scalarAddr;

            // base address of the cache line holding the last byte of
            // the request
            Addr split_addr = roundDown(vaddr + req_size - 1, block_size);

            assert(split_addr <= vaddr || split_addr - vaddr < block_size);

            // if the last byte lands on a different cache line than the
            // first byte, the access is misaligned and must be split
            bool misaligned_acc = split_addr > vaddr;

            RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0,
                gpuDynInst->computeUnit()->masterId(), 0,
                gpuDynInst->wfDynId);

            if (misaligned_acc) {
                RequestPtr req1, req2;
                req->splitOnVaddr(split_addr, req1, req2);
                gpuDynInst->numScalarReqs = 2;
                gpuDynInst->setRequestFlags(req1);
                gpuDynInst->setRequestFlags(req2);
                PacketPtr pkt1 = new Packet(req1, MemCmd::ReadReq);
                PacketPtr pkt2 = new Packet(req2, MemCmd::ReadReq);
                pkt1->dataStatic(gpuDynInst->scalar_data);
                pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
                gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
                gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
            } else {
                gpuDynInst->numScalarReqs = 1;
                gpuDynInst->setRequestFlags(req);
                PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
                pkt->dataStatic(gpuDynInst->scalar_data);
                gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt);
            }
        }

        // initiate a scalar memory write access for N dwords; same
        // split-address handling as initMemRead above, but the one or two
        // packets are MemCmd::WriteReq, again pointing at scalar_data (and
        // scalar_data + req1->getSize() for the second half of a split)
        template<int N>
        void
        initMemWrite(GPUDynInstPtr gpuDynInst)
        {
            int block_size = gpuDynInst->computeUnit()->cacheLineSize();
            // ...
        }

        // calcAddr(): resolve the scalar base register plus offset into the
        // scalar address used by the accessors above
        // ...
            gpuDynInst->scalarAddr = vaddr;
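        // Illustrative sketch, not part of the original header: the
        // split-address arithmetic above with concrete numbers, assuming a
        // 64-byte cache line and a 4-dword (16-byte) scalar read.
        //
        //   vaddr      = 0x0ff8                      // 8 B before a line end
        //   req_size   = 16
        //   split_addr = roundDown(0x0ff8 + 15, 64)  // = 0x1000
        //   misaligned = (0x1000 > 0x0ff8)           // true
        //
        // so the access is split into [0x0ff8, 0x1000) and [0x1000, 0x1008)
        // and sent as two packets.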
        // (DS encoding) per-lane LDS accessors; wf below is the issuing
        // wavefront, i.e. Wavefront *wf = gpuDynInst->wavefront()

        // initMemRead(gpuDynInst, offset): load one element per active lane
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane] + offset;
                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
                        = wf->ldsChunk->read<T>(vaddr);
                }

        // initDualMemRead(gpuDynInst, offset0, offset1): two loads per lane,
        // stored interleaved in d_data
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
                    Addr vaddr1 = gpuDynInst->addr[lane] + offset1;
                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2]
                        = wf->ldsChunk->read<T>(vaddr0);
                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2 + 1]
                        = wf->ldsChunk->read<T>(vaddr1);
                }

        // initMemWrite(gpuDynInst, offset): store one element per active lane
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane] + offset;
                    wf->ldsChunk->write<T>(vaddr, (reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane]);
                }

        // initDualMemWrite(gpuDynInst, offset0, offset1): two stores per lane
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
                    Addr vaddr1 = gpuDynInst->addr[lane] + offset1;
                    wf->ldsChunk->write<T>(vaddr0, (reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane * 2]);
                    wf->ldsChunk->write<T>(vaddr1, (reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane * 2 + 1]);
                }

        // calcAddr(gpuDynInst, addr): copy each active lane's 32-bit address
                if (gpuDynInst->exec_mask[lane]) {
                    gpuDynInst->addr.at(lane) = (Addr)addr[lane];
                }
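        // Illustrative sketch, not part of the original header: the d_data
        // layout produced by the dual accessors above. Each lane gets two
        // elements, stored back to back:
        //
        //   d_data[2*i]     <-> LDS[addr[i] + offset0]
        //   d_data[2*i + 1] <-> LDS[addr[i] + offset1]
        //
        // so lane 0 occupies slots 0-1, lane 1 slots 2-3, and so on.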
        // (MUBUF encoding) initMemRead(): issue one read request per active
        // lane
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];

                    RequestPtr req = std::make_shared<Request>(vaddr,
                        sizeof(T), 0,
                        gpuDynInst->computeUnit()->masterId(), 0,
                        gpuDynInst->wfDynId);

                    gpuDynInst->setRequestFlags(req);
                    PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
                    pkt->dataStatic(&(reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane]);
                    gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
                                                           pkt);
                }
            }

        // initMemWrite() mirrors initMemRead(): it sets statusBitVector from
        // exec_mask, builds the same per-lane request, and sends
        // MemCmd::WriteReq packets whose data pointer is
        // &(reinterpret_cast<T*>(gpuDynInst->d_data))[lane]

        // injectGlobalMemFence(gpuDynInst): build an empty request and hand
        // it to the compute unit as a global memory fence
            RequestPtr req = std::make_shared<Request>(0, 0, 0,
                                       gpuDynInst->computeUnit()->
                                       masterId(), 0,
                                       gpuDynInst->wfDynId);
            gpuDynInst->setRequestFlags(req);
            gpuDynInst->computeUnit()->
                injectGlobalMemFence(gpuDynInst, false, req);
        // (MUBUF encoding) calcAddr(): compute each lane's buffer address
        // from the vector offset/index operands and the scalar buffer
        // resource descriptor
        template<typename VOFF, typename VIDX, typename SRSRC, typename SOFF>
        void
        calcAddr(GPUDynInstPtr gpuDynInst, VOFF v_off, VIDX v_idx,
                 SRSRC s_rsrc_desc, SOFF s_offset, int inst_offset)
        {
            Addr vaddr = 0;
            Addr base_addr = 0;
            Addr stride = 0;
            Addr buf_idx = 0;
            Addr buf_off = 0;
            BufferRsrcDescriptor rsrc_desc;

            // unpack the buffer resource descriptor held in SGPRs
            std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
                sizeof(rsrc_desc));

            base_addr = rsrc_desc.baseAddr;

            stride = rsrc_desc.addTidEn ? ((rsrc_desc.dataFmt << 14)
                + rsrc_desc.stride) : rsrc_desc.stride;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    vaddr = base_addr + s_offset.rawData();

                    // buffer index and offset for this lane
                    buf_idx = v_idx[lane] + (rsrc_desc.addTidEn ? lane : 0);
                    buf_off = v_off[lane] + inst_offset;

                    if (rsrc_desc.swizzleEn) {
                        Addr idx_stride = 8 << rsrc_desc.idxStride;
                        Addr elem_size = 2 << rsrc_desc.elemSize;
                        Addr idx_msb = buf_idx / idx_stride;
                        Addr idx_lsb = buf_idx % idx_stride;
                        Addr off_msb = buf_off / elem_size;
                        Addr off_lsb = buf_off % elem_size;

                        vaddr += ((idx_msb * stride + off_msb * elem_size)
                            * idx_stride + idx_lsb * elem_size + off_lsb);
                    } else {
                        vaddr += buf_off + stride * buf_idx;
                    }

                    gpuDynInst->addr.at(lane) = vaddr;
                }
            }
        }
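        // Illustrative sketch, not part of the original header: the swizzled
        // address calculation above with example descriptor values
        // (idxStride = 0, elemSize = 1, stride = 16), for a lane with
        // buf_idx = 10 and buf_off = 6:
        //
        //   idx_stride = 8 << 0 = 8      elem_size = 2 << 1 = 4
        //   idx_msb = 10 / 8 = 1         idx_lsb = 10 % 8 = 2
        //   off_msb = 6 / 4 = 1          off_lsb = 6 % 4 = 2
        //
        //   vaddr += (1 * 16 + 1 * 4) * 8 + 2 * 4 + 2 = 170
        //
        // With swizzleEn clear, the same lane would instead add
        // buf_off + stride * buf_idx = 6 + 16 * 10 = 166.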
        // (FLAT encoding) initMemRead(): one request of sizeof(T) bytes per
        // active lane
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];

                    RequestPtr req = std::make_shared<Request>(vaddr,
                        sizeof(T), 0,
                        gpuDynInst->computeUnit()->masterId(), 0,
                        gpuDynInst->wfDynId);

                    gpuDynInst->setRequestFlags(req);
                    PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
                    pkt->dataStatic(&(reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane]);
                    gpuDynInst->computeUnit()
                        ->sendRequest(gpuDynInst, lane, pkt);
                }
            }

        // initMemRead() for N dwords: same per-lane loop, but each request
        // covers N * sizeof(VecElemU32) bytes and the packet's data pointer
        // is
            pkt->dataStatic(&(reinterpret_cast<VecElemU32*>(
                gpuDynInst->d_data))[lane * N]);

        // initMemWrite(): same per-lane loop as the read path, but the
        // packets are MemCmd::WriteReq pointing at
            pkt->dataStatic(&(reinterpret_cast<T*>(
                gpuDynInst->d_data))[lane]);

        // initMemWrite() for N dwords: MemCmd::WriteReq packets whose data
        // pointer is
            pkt->dataStatic(&(reinterpret_cast<VecElemU32*>(
                gpuDynInst->d_data))[lane * N]);
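        // Illustrative sketch, not part of the original header: the d_data
        // layout assumed by the N-dword variants. Lane i's N dwords sit
        // contiguously, lane-major:
        //
        //   d_data[i * N + 0] ... d_data[i * N + (N - 1)]
        //
        // which is why the packet's data pointer starts at
        // &(reinterpret_cast<VecElemU32*>(d_data))[lane * N].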
        // (FLAT encoding) initAtomicAccess(): one atomic request per active
        // lane, carrying an atomic-op functor built from the lane's operand
        // data in a_data and x_data
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];
                    RequestPtr req = std::make_shared<Request>(vaddr,
                        sizeof(T), 0,
                        gpuDynInst->computeUnit()->masterId(), 0,
                        gpuDynInst->wfDynId,
                        gpuDynInst->makeAtomicOpFunctor<T>(
                            &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
                            &(reinterpret_cast<T*>(
                                gpuDynInst->x_data))[lane]));

                    gpuDynInst->setRequestFlags(req);
                    PacketPtr pkt = new Packet(req, MemCmd::SwapReq);
                    pkt->dataStatic(&(reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane]);
                    gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
                                                           pkt);
                }
            }

        // calcAddr(): copy each active lane's 64-bit flat address, then
        // resolve which memory segment the access targets
                if (gpuDynInst->exec_mask[lane]) {
                    gpuDynInst->addr.at(lane) = addr[lane];
                }
            gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
#endif // __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__