Hello,
I need to implement the cumulative sum (`cumsum`) on a GPU array (CUDA.jl or Metal.jl). Looking at the CUDA.jl repository, I found this definition:
```julia
function cumsum!(sums)
    # Hillis–Steele-style inclusive scan: on each round, every thread adds in
    # the element `shift` positions to its left, and `shift` doubles.
    shift = 1
    while shift < length(sums)
        to_add = 0
        @inbounds if threadIdx().x - shift > 0
            to_add = sums[threadIdx().x - shift]
        end
        sync_threads()  # all reads must finish before any thread writes
        @inbounds if threadIdx().x - shift > 0
            sums[threadIdx().x] += to_add
        end
        sync_threads()
        shift *= 2
    end
end
```
which can be executed inside a CUDA kernel (one thread per element, all within a single block, since it only uses `threadIdx().x` and `sync_threads()`).
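For reference, here is roughly how I picture launching it. This is just a sketch: `scan_kernel` is a hypothetical wrapper of mine, and I'm assuming the whole array fits in a single block.

```julia
using CUDA

# Hypothetical wrapper: one block, one thread per element of `sums`.
function scan_kernel(sums)
    cumsum!(sums)
    return
end

xs = CuArray([1, 2, 3, 4])
@cuda threads=length(xs) scan_kernel(xs)
Array(xs)  # expected: [1, 3, 6, 10]
```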
Now I need to implement it using KernelAbstractions.jl, but I'm not familiar with shared memory, especially in KernelAbstractions.jl. The most straightforward translation I could come up with is something like:
```julia
using KernelAbstractions

@kernel function cumsum!(sums)
    idx = @index(Global)
    shift = 1
    while shift < length(sums)
        to_add = 0
        @inbounds if idx - shift > 0
            to_add = sums[idx - shift]
        end
        @synchronize
        @inbounds if idx - shift > 0
            sums[idx] += to_add
        end
        @synchronize
        shift *= 2
    end
end
```
But I don't know whether this is correct, or whether I need to use `KernelAbstractions.@localmem` or something else.
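For instance, is something along these lines what I should be writing? This is only a sketch of what I imagine a `@localmem` version looks like: `cumsum_localmem!` and `GROUP_SIZE` are my own names, and I'm assuming a single workgroup that covers the whole array.

```julia
using KernelAbstractions

# Guessed scratch size; must be at least as large as the workgroup size.
const GROUP_SIZE = 256

@kernel function cumsum_localmem!(sums)
    idx = @index(Local)

    # Stage the data in workgroup-local (shared) memory.
    tmp = @localmem eltype(sums) (GROUP_SIZE,)
    @inbounds tmp[idx] = sums[idx]
    @synchronize

    shift = 1
    while shift < length(sums)
        to_add = zero(eltype(sums))
        @inbounds if idx - shift > 0
            to_add = tmp[idx - shift]
        end
        @synchronize
        @inbounds if idx - shift > 0
            tmp[idx] += to_add
        end
        @synchronize
        shift *= 2
    end

    # Copy the scanned values back to global memory.
    @inbounds sums[idx] = tmp[idx]
end

# Hypothetical launch with CUDA.jl (Metal.jl would be analogous),
# assuming the whole array fits in one workgroup:
# using CUDA
# xs = CuArray(Float32[1, 2, 3, 4])
# backend = get_backend(xs)
# cumsum_localmem!(backend, length(xs))(xs, ndrange = length(xs))
# KernelAbstractions.synchronize(backend)
# Array(xs)  # expected: [1.0, 3.0, 6.0, 10.0]
```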