After some help from @Tomas_Pevny as well as some research of my own, it looks like the solution involves two realizations:
- GPUs want inputs that are values, not pointers. Luckily, since our inputs are so small, we can pack each set of 16 UInt8s into a UInt128 with the reinterpret function, so that we can pass an array of UInt128s as input to the kernel.
- To allocate memory on the GPU side, we want to use the CuStaticSharedArray function, which gives us an array that we can operate on from within the kernel.
Both of these together give me the following working code:
# Something like this, but more complicated
"""
    blockAdd(in, out)

Toy stand-in for the real per-block transform: write `2*in[idx] + 1`
into `out`, rotated one slot forward (the value from index 16 wraps
around into index 1).
"""
function blockAdd(in, out)
    for idx in 1:16
        # mod1(idx + 1, 16) reproduces the original (idx % 16) + 1 rotation.
        @inbounds out[mod1(idx + 1, 16)] = 2 * in[idx] + 1
    end
end
"""
    AESKernel!(in, out)

CUDA kernel: unpack each `UInt128` element of `in` into 16 bytes of
shared memory, apply `blockAdd` to those bytes, then repack the 16
result bytes into the matching `UInt128` slot of `out`.
"""
function AESKernel!(in, out)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    # Guard against threads whose global index runs past the input.
    i <= length(in) || return nothing
    # NOTE(review): CuStaticSharedArray allocates ONE buffer per block, so
    # every thread in a block shares these same 16 bytes — this races for
    # any blockDim().x > 1. Confirm the launch uses one thread per block,
    # or give each thread its own slice of a larger shared array.
    block_in = CuStaticSharedArray(UInt8, 16)
    block_out = CuStaticSharedArray(UInt8, 16)
    # Unpack our 16-byte block from the UInt128 (low byte first).
    for j in 1:16
        @inbounds block_in[j] = (in[i] >> (8 * (j - 1))) % UInt8
    end
    # Now we can use our abstracted per-block code.
    blockAdd(block_in, block_out)
    # Repack the bytes. Use a UInt128 accumulator from the start so the
    # loop is type-stable (the original `sum = 0` began life as an Int and
    # widened mid-loop, and also shadowed Base.sum). The bytes occupy
    # disjoint bit ranges, so OR-ing is equivalent to the original sum.
    packed = zero(UInt128)
    for j in 1:16
        @inbounds packed |= UInt128(block_out[j]) << (8 * (j - 1))
    end
    out[i] = packed
    return nothing
end
"""
    AESGPUTest()

Smoke test for `AESKernel!`: pack 32 bytes into two `UInt128` blocks,
run the kernel on the GPU, and return the results copied back to the
host as a `Vector{UInt128}`.
"""
function AESGPUTest()
    randTextBlock = UInt8[i for i in 1:32]
    # Each UInt128 holds one 16-byte block (32 bytes -> 2 elements).
    i1 = reinterpret(UInt128, randTextBlock)
    cu_i1 = CuArray(i1)
    arraySize = length(i1)
    cu_o1 = CuArray(zeros(UInt128, arraySize))
    # Launch one thread per 16-byte block, derived from the input size
    # instead of the hard-coded `threads=2`, so the kernel still covers
    # every element if the input length changes.
    @cuda threads=arraySize AESKernel!(cu_i1, cu_o1)
    # Copy the device buffer back (this synchronizes with the kernel) so
    # callers actually receive the computed output instead of discarding it.
    return Array(cu_o1)
end