I am trying to write a simple CUDA.jl sum kernel. However I always get:
ERROR: LoadError: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_AD
DRESS)
Any ideas? Here is my code:
using CUDA
function mysum_kernel!(out, arr)
tindex = threadIdx().x
stride = blockDim().x
acc = 0f0
for i in tindex:stride:length(arr)
@inbounds acc += arr[i]
end
nshacc = blockDim().x
shacc = @cuDynamicSharedMem(Float32, nshacc)
shacc[tindex] = acc
CUDA.sync_threads()
if tindex === 1
ret = 0f0
for i in eachindex(shacc)
ret += shacc[i]
end
#out[] = ret
end
CUDA.sync_threads()
return nothing
end
arr = CuArray{Float32}(1:13)
out = CUDA.zeros()
CUDA.@sync begin
@cuda threads=8 blocks=1 mysum_kernel!(out, arr)
end
@show out