Only Buffer When You Need To: Reducing On-chip GPU Traffic with Reconfigurable Local Atomic Buffers
atomicAdd(x,2)
atomicAdd(x,5)
atomicAdd(x,7)
mem_order_comm
1  loc = arr[tid];
2  // atomicAdd(&hist[loc], 1);
3  atomicAdd(&hist[loc], 1, mem_order_comm);
1  loc = arr[tid];
2  // atomicAdd(&hist[loc], 1);
3  atomicAdd(&hist[loc], 1, mem_order_comm);