    int req_size = N * sizeof(T);
    int block_size = gpuDynInst->computeUnit()->cacheLineSize();
    bool misaligned_acc = false;
    RequestPtr req = nullptr, req1 = nullptr, req2 = nullptr;
    PacketPtr pkt = nullptr, pkt1 = nullptr, pkt2 = nullptr;
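    // req1/req2 and pkt1/pkt2 carry the two halves of an access that
    // straddles a cache-line boundary; aligned accesses use req/pkt only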
    gpuDynInst->resetEntireStatusVector();
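        // only lanes enabled in exec_mask issue memory requests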
        if (gpuDynInst->exec_mask[lane]) {
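            // split_addr is the base of the cache line that holds the
            // access's last byte; it may exceed vaddr by at most one
            // cache line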
            assert(split_addr <= vaddr || split_addr - vaddr < block_size);
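            // if the line base of the last byte lies above the first
            // byte's address, the access spans two cache lines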
            misaligned_acc = split_addr > vaddr;
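            // Atomic path (the branch guard is assumed to be the
            // helper's is_atomic flag): one dword per lane, with an
            // AtomicOpFunctor built from the lane's operands in
            // a_data and x_data.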
            if (is_atomic) {
                // atomics must be dword-aligned ...
                assert((vaddr & 0x3) == 0);

                // ... and may not straddle a cache line
                assert(!misaligned_acc);

                req = std::make_shared<Request>(vaddr, sizeof(T), 0,
                    gpuDynInst->computeUnit()->requestorId(), 0,
                    gpuDynInst->wfDynId,
                    gpuDynInst->makeAtomicOpFunctor<T>(
                        &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
                        &(reinterpret_cast<T*>(gpuDynInst->x_data))[lane]));
            } else {
                req = std::make_shared<Request>(vaddr, req_size, 0,
                    gpuDynInst->computeUnit()->requestorId(), 0,
                    gpuDynInst->wfDynId);
            }
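            // a misaligned access is split at the cache-line boundary
            // into two requests; the lane's status vector counts the
            // packets still outstanding (2 if split, 1 otherwise)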
            if (misaligned_acc) {
                gpuDynInst->setStatusVector(lane, 2);
                req->splitOnVaddr(split_addr, req1, req2);
                gpuDynInst->setRequestFlags(req1);
                gpuDynInst->setRequestFlags(req2);
                pkt1 = new Packet(req1, mem_req_type);
                pkt2 = new Packet(req2, mem_req_type);
                pkt1->dataStatic(&(reinterpret_cast<T*>(
                    gpuDynInst->d_data))[lane * N]);
                pkt2->dataStatic(&(reinterpret_cast<T*>(
                    gpuDynInst->d_data))[lane * N +
                    req1->getSize() / sizeof(T)]);
                DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index: %d unaligned memory "
                        "request for %#x\n", gpuDynInst->cu_id,
                        gpuDynInst->simdId, gpuDynInst->wfSlotId, lane,
                        split_addr);
                gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt1);
                gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt2);
            } else {
                gpuDynInst->setStatusVector(lane, 1);
                gpuDynInst->setRequestFlags(req);
                pkt = new Packet(req, mem_req_type);
                pkt->dataStatic(&(reinterpret_cast<T*>(
                    gpuDynInst->d_data))[lane * N]);
                gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt);
            }
        } else {
            // inactive lanes have no pending requests
            gpuDynInst->setStatusVector(lane, 0);
        }
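    // The helper below builds one request per dword: a lane's dwords sit
    // `stride` bytes apart, so they cannot be covered by one contiguous
    // request.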
    int block_size = gpuDynInst->computeUnit()->cacheLineSize();

    gpuDynInst->resetEntireStatusVector();
        if (gpuDynInst->exec_mask[lane]) {
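            // compute one virtual address per dword; consecutive dwords
            // of a lane are `stride` bytes apart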
            for (int dword = 0; dword < N; ++dword) {
                vaddr[dword] = gpuDynInst->addr[lane] + dword * stride;
            }
153 "scratch access not yet implemented\n");
            gpuDynInst->setStatusVector(lane, N);
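            // each dword gets its own request and packet, so this lane
            // now waits on N responses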
            for (int dword = 0; dword < N; ++dword) {
                req[dword] = std::make_shared<Request>(vaddr[dword], req_size,
                    0, gpuDynInst->computeUnit()->requestorId(), 0,
                    gpuDynInst->wfDynId);
                gpuDynInst->setRequestFlags(req[dword]);
                pkt[dword] = new Packet(req[dword], mem_req_type);

                // d_data is assumed to be laid out as N contiguous
                // elements per lane, matching the contiguous helper above
                int data_elem = lane * N + dword;
                pkt[dword]->dataStatic(&(reinterpret_cast<T*>(
                    gpuDynInst->d_data))[data_elem]);

                gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
                                                       pkt[dword]);
            }
        } else {
            // inactive lanes have no pending requests
            gpuDynInst->setStatusVector(lane, 0);
        }
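    // Scalar memory helper: the wavefront issues a single request (or a
    // split pair) backed by scalar_data, rather than per-lane requests.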
    int req_size = N * sizeof(T);
    int block_size = gpuDynInst->computeUnit()->cacheLineSize();
    assert(split_addr <= vaddr || split_addr - vaddr < block_size);
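    // ops that carry no memory address never count as misaligned, so
    // they always take the single-request path below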
    bool misaligned_acc = split_addr > vaddr &&
        !gpuDynInst->staticInstruction()->hasNoAddr();
    if (gpuDynInst->staticInstruction()->hasNoAddr()) {
        // addressless ops still allocate a request (the zero
        // address/size arguments are assumed here)
        req = std::make_shared<Request>(0, 0, 0,
            gpuDynInst->computeUnit()->requestorId(), 0,
            gpuDynInst->wfDynId);
    } else {
        req = std::make_shared<Request>(vaddr, req_size, 0,
            gpuDynInst->computeUnit()->requestorId(), 0,
            gpuDynInst->wfDynId);
    }
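    // a request that straddles a cache line is split at the boundary;
    // both halves point into scalar_data, the second offset by the
    // first half's size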
    if (misaligned_acc) {
        req->splitOnVaddr(split_addr, req1, req2);
        gpuDynInst->numScalarReqs = 2;
        gpuDynInst->setRequestFlags(req1);
        gpuDynInst->setRequestFlags(req2);
        pkt1 = new Packet(req1, mem_req_type);
        pkt2 = new Packet(req2, mem_req_type);
        pkt1->dataStatic(gpuDynInst->scalar_data);
        pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: unaligned scalar memory request for"
                " %#x\n", gpuDynInst->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, split_addr);
        gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
        gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
    } else {
        gpuDynInst->numScalarReqs = 1;
        gpuDynInst->setRequestFlags(req);
        pkt = new Packet(req, mem_req_type);
        pkt->dataStatic(gpuDynInst->scalar_data);
        gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt);
    }
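    // completion on the scalar path is tracked via numScalarReqs (1 or 2)
    // rather than the per-lane status vector used by the vector helpers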