I’m using two for loops: one, to iterate over CUDA devices and launch kernels. The second, to iterate over CUDA devices and copy results back to the CPU. When I do this, I get an ‘undefined variable’ error due to local scoping of the CuArray variable. How do I avoid this?
Here’s a minimal working example:
function kernel!(a)
j = threadIdx().x
a[j] = 1.0
return
end
function run_kernel!(a)
idx = [1:10, 25:30]
ndevices = length(CUDA.devices())
# Launch kernels on 2 gpus
for idev in 1:ndevices
CUDA.device!(idev-1)
a_gpu = CuArray(view(a, idx[idev]))
n = length(idx[idev])
@cuda threads=n kernel!(a_gpu)
end
# Copy results back from 2 gpus
for idev in 1:ndevices
CUDA.device!(idev-1)
a[idx[idev]] .= Array(a_gpu)
end
end
a = zeros(30)
run_kernel!(a)