// NOTE: the includes and the function header below are reconstructed from
// context; the original lines are elided in this excerpt.
#include "base/intmath.hh"
#include "base/trace.hh"
#include "debug/GPUMem.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "mem/packet.hh"
#include "mem/request.hh"

/**
 * Helper for vector memory instructions: builds and issues one request
 * per active lane, splitting any request that crosses a cache line.
 */
template<typename T, int N>
inline void
initMemReqHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type,
                 bool is_atomic=false)
{
    // local variables
    int req_size = N * sizeof(T);
    int block_size = gpuDynInst->computeUnit()->cacheLineSize();
    Addr vaddr = 0, split_addr = 0;
    bool misaligned_acc = false;
    RequestPtr req = nullptr, req1 = nullptr, req2 = nullptr;
    PacketPtr pkt = nullptr, pkt1 = nullptr, pkt2 = nullptr;
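    // Each lane whose exec_mask bit is set issues its own request below;
    // a request that straddles a cache-line boundary is split in two at
    // the boundary so that no single packet crosses a line.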
    gpuDynInst->resetEntireStatusVector();
    for (int lane = 0; lane < gpuDynInst->exec_mask.size(); ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            vaddr = gpuDynInst->addr[lane];

            /**
             * the base address of the cache line holding the last byte
             * of this lane's request (reconstructed; the original
             * computation is elided in this excerpt)
             */
            split_addr = roundDown(vaddr + req_size - 1, block_size);

            assert(split_addr <= vaddr || split_addr - vaddr < block_size);

            /**
             * if the base cache line address of the last byte is
             * greater than the address of the first byte, the access
             * spans two cache lines
             */
            misaligned_acc = split_addr > vaddr;

            if (is_atomic) {
                // atomics must be word aligned
                assert((vaddr & 0x3) == 0);

                // a given lane's atomic can't be split across cache lines
                assert(!misaligned_acc);
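                // the atomic op functor captures this lane's source
                // operand (a_data) and comparator (x_data, used by
                // cmpswap); the memory system applies it at the target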
                req = std::make_shared<Request>(vaddr, sizeof(T), 0,
                    gpuDynInst->computeUnit()->requestorId(), 0,
                    gpuDynInst->wfDynId,
                    gpuDynInst->makeAtomicOpFunctor<T>(
                        &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
                        &(reinterpret_cast<T*>(gpuDynInst->x_data))[lane]));
            } else {
                req = std::make_shared<Request>(vaddr, req_size, 0,
                    gpuDynInst->computeUnit()->requestorId(), 0,
                    gpuDynInst->wfDynId);
            }
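            // dispatch: a misaligned access becomes two packets whose
            // data pointers carve up this lane's slice of d_data; an
            // aligned access goes out as a single packet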
            if (misaligned_acc) {
                gpuDynInst->setStatusVector(lane, 2);
                req->splitOnVaddr(split_addr, req1, req2);
                gpuDynInst->setRequestFlags(req1);
                gpuDynInst->setRequestFlags(req2);
                pkt1 = new Packet(req1, mem_req_type);
                pkt2 = new Packet(req2, mem_req_type);
                pkt1->dataStatic(&(reinterpret_cast<T*>(
                    gpuDynInst->d_data))[lane * N]);
                pkt2->dataStatic(&(reinterpret_cast<T*>(
                    gpuDynInst->d_data))[lane * N +
                    req1->getSize()/sizeof(T)]);
                DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index: %d unaligned memory "
                        "request for %#x\n", gpuDynInst->cu_id,
                        gpuDynInst->simdId, gpuDynInst->wfSlotId, lane,
                        split_addr);
                gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt1);
                gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt2);
            } else {
                gpuDynInst->setStatusVector(lane, 1);
                gpuDynInst->setRequestFlags(req);
                pkt = new Packet(req, mem_req_type);
                pkt->dataStatic(&(reinterpret_cast<T*>(
                    gpuDynInst->d_data))[lane * N]);
                gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt);
            }
        } else {  // inactive lane: no pending requests
            gpuDynInst->setStatusVector(lane, 0);
        }
    }
}
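// A sketch of a typical call site, assuming gem5's VEGA/GCN3 conventions
// (instruction and operand types are illustrative): a vector load's
// initiateAcc() would issue
//
//     initMemReqHelper<VecElemU32, 1>(gpuDynInst, MemCmd::ReadReq);
//
// while a vector atomic passes is_atomic = true so each lane's request
// carries its AtomicOpFunctor.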
/**
 * Helper for scatter/gather-style vector memory instructions, where each
 * dword of a lane is issued as its own request.
 * NOTE: the function header and several declarations below are
 * reconstructed from context; the original lines are elided in this
 * excerpt.
 */
template<typename T, int N>
inline void
initMemReqScatterGatherHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type)
{
    int req_size = sizeof(T);
    int block_size = gpuDynInst->computeUnit()->cacheLineSize();

    gpuDynInst->resetEntireStatusVector();
    for (int lane = 0; lane < gpuDynInst->exec_mask.size(); ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            Addr vaddr[N];
            RequestPtr req[N];
            PacketPtr pkt[N];
            // per-dword address stride; the original initialization is
            // elided in this excerpt (contiguous dwords assumed here)
            int stride = sizeof(T);

            for (int dword = 0; dword < N; ++dword) {
                vaddr[dword] = gpuDynInst->addr[lane] + dword * stride;
                // elided: per-dword alignment checks against block_size
                // and address-space checks; scratch accesses are rejected
                // here with "scratch access not yet implemented"
            }
            gpuDynInst->setStatusVector(lane, N);
            for (int dword = 0; dword < N; ++dword) {
                req[dword] = std::make_shared<Request>(vaddr[dword], req_size,
                    0, gpuDynInst->computeUnit()->requestorId(), 0,
                    gpuDynInst->wfDynId);
                gpuDynInst->setRequestFlags(req[dword]);
                pkt[dword] = new Packet(req[dword], mem_req_type);
                // index of this dword's element within d_data; the
                // original computation is elided in this excerpt
                // (lane-major layout assumed here)
                int data_elem = lane * N + dword;
                pkt[dword]->dataStatic(&(reinterpret_cast<T*>(
                    gpuDynInst->d_data))[data_elem]);
                gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
                                                       pkt[dword]);
            }
        } else {  // inactive lane: no pending requests
            gpuDynInst->setStatusVector(lane, 0);
        }
    }
}
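// Unlike initMemReqHelper, this excerpt's scatter/gather path never calls
// splitOnVaddr: each per-dword request is only sizeof(T) bytes, so it
// cannot span a cache line when naturally aligned.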
/**
 * Helper for scalar memory instructions.
 * NOTE: the function header and several lines below are reconstructed
 * from context; the originals are elided in this excerpt.
 */
template<typename T, int N>
inline void
initMemReqScalarHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type)
{
    int req_size = N * sizeof(T);
    int block_size = gpuDynInst->computeUnit()->cacheLineSize();
    Addr vaddr = gpuDynInst->scalarAddr;

    /**
     * the base address of the cache line where the last byte of the
     * request will be stored
     */
    Addr split_addr = roundDown(vaddr + req_size - 1, block_size);

    assert(split_addr <= vaddr || split_addr - vaddr < block_size);

    /**
     * if the base cache line address of the last byte is greater than
     * the address of the first byte, the access spans two cache lines;
     * instructions that carry no address (hasNoAddr) are never split
     */
    bool misaligned_acc = split_addr > vaddr &&
        !gpuDynInst->staticInstruction()->hasNoAddr();

    Request::Flags flags;
    if (gpuDynInst->staticInstruction()->hasNoAddr()) {
        // elided: flag setup marking the request as carrying no address
        // (e.g. s_memtime/s_memrealtime do not access memory)
    }

    RequestPtr req = std::make_shared<Request>(
        vaddr, req_size, std::move(flags),
        gpuDynInst->computeUnit()->requestorId(), 0,
        gpuDynInst->wfDynId);
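    // a scalar access is sent whole or split once at the cache-line
    // boundary; numScalarReqs records how many responses the instruction
    // still expects before it completes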
    if (misaligned_acc) {
        RequestPtr req1, req2;
        req->splitOnVaddr(split_addr, req1, req2);
        gpuDynInst->numScalarReqs = 2;
        gpuDynInst->setRequestFlags(req1);
        gpuDynInst->setRequestFlags(req2);
        PacketPtr pkt1 = new Packet(req1, mem_req_type);
        PacketPtr pkt2 = new Packet(req2, mem_req_type);
        pkt1->dataStatic(gpuDynInst->scalar_data);
        pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: unaligned scalar memory request for"
                " %#x\n", gpuDynInst->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, split_addr);
        gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
        gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
    } else {
        gpuDynInst->numScalarReqs = 1;
        gpuDynInst->setRequestFlags(req);
        PacketPtr pkt = new Packet(req, mem_req_type);
        pkt->dataStatic(gpuDynInst->scalar_data);
        gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt);
    }
}
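// A sketch of a typical scalar call site, assuming gem5's VEGA/GCN3
// conventions (types and width are illustrative): an s_load_dwordx4's
// initiateAcc() would issue
//
//     initMemReqScalarHelper<ScalarRegU32, 4>(gpuDynInst, MemCmd::ReadReq);
//
// with scalar_data sized to hold N * sizeof(T) bytes.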