In my actual code I have a two-dimensional array, and I want to perform a set of operations along one dimension while parallelizing along the other. Something like this:
using CUDA
"""
    kernel1(a, b)

CUDA kernel that copies the vector `b` into every row of the matrix `a`.

Each thread starts at its global 1-based thread index and advances by the total
number of launched threads, so every row of `a` is visited for any launch
configuration. `b` must provide at least `size(a, 2)` elements.

The row is filled with an explicit scalar loop instead of `@. a[i, :] = b`:
broadcasting over a slice inside device code pulls in the generic broadcast
machinery (shape checks and their error paths), which CUDA.jl generally cannot
compile to valid GPU code.

Always returns `nothing`, as required for a kernel launched with `@cuda`.
"""
function kernel1(a, b)
    id = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = blockDim().x * gridDim().x
    for i in id:stride:size(a, 1)
        # Scalar element-wise copy of `b` into row `i`.
        for j in axes(a, 2)
            a[i, j] = b[j]
        end
    end
    return nothing
end
"""
    kernel2(a, b)

CUDA kernel that stores `b[col]` into `a[row, col]` for every column
`col` in `1:size(a, 2)` of each row assigned to the calling thread.

A thread begins at the row equal to its global 1-based thread index and then
jumps ahead by the total number of launched threads until it runs past the
last row. Always returns `nothing`.
"""
function kernel2(a, b)
    nrows, ncols = size(a)
    # Total number of threads in the launch; used as the row step.
    nthreads = gridDim().x * blockDim().x
    row = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    while row <= nrows
        for col in 1:ncols
            a[row, col] = b[col]
        end
        row += nthreads
    end
    return nothing
end
# Problem size: `a` is N1×N2 and `b` has one entry per column of `a`.
N1 = 100
N2 = 10
a = CUDA.zeros(Float32, N1, N2)
b = CUDA.ones(Float32, N2)

# Launch one thread per row, but cap the block size and spread the rest over
# several blocks: a single block of N1 threads fails to launch once N1 exceeds
# the device's per-block thread limit. The kernels step through the rows by the
# total thread count, so any threads/blocks combination covers every row.
# For N1 = 100 this is still 100 threads in 1 block.
# @cuda threads=min(N1, 256) blocks=cld(N1, 256) kernel1(a, b)
@cuda threads=min(N1, 256) blocks=cld(N1, 256) kernel2(a, b)