Simple CUDA sum kernel

I am trying to write a simple CUDA.jl sum kernel. However I always get:

ERROR: LoadError: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_AD
DRESS)

Any ideas? Here is my code:

using CUDA

function mysum_kernel!(out, arr)
    tindex = threadIdx().x
    stride = blockDim().x
    acc = 0f0
    for i in tindex:stride:length(arr)
        @inbounds acc += arr[i]
    end
    nshacc = blockDim().x
    shacc = @cuDynamicSharedMem(Float32, nshacc)
    shacc[tindex] = acc
    CUDA.sync_threads()

    if tindex === 1
        ret = 0f0
        for i in eachindex(shacc)
            ret += shacc[i]
        end
        #out[] = ret
    end
    CUDA.sync_threads()
    return nothing
end

arr = CuArray{Float32}(1:13)
out = CUDA.zeros()
CUDA.@sync begin
    @cuda threads=8 blocks=1 mysum_kernel!(out, arr)
end
@show out