Variable scoping issue when using multiple GPUs in CUDA.jl

I’m using two for loops: one, to iterate over CUDA devices and launch kernels. The second, to iterate over CUDA devices and copy results back to the CPU. When I do this, I get an ‘undefined variable’ error due to local scoping of the CuArray variable. How do I avoid this?

Here’s a minimal working example:

function kernel!(a) 
    j = threadIdx().x
    a[j] = 1.0             
    return             
end                         
                            
function run_kernel!(a)        
    idx = [1:10, 25:30]       
                                           
    ndevices = length(CUDA.devices())      
                                           
    # Launch kernels on 2 gpus
    for idev in 1:ndevices
        CUDA.device!(idev-1)
        a_gpu = CuArray(view(a, idx[idev]))
        n = length(idx[idev])

        @cuda threads=n kernel!(a_gpu)     
    end                                    
                                           
    # Copy results back from 2 gpus          
    for idev in 1:ndevices
        CUDA.device!(idev-1)
        a[idx[idev]] .= Array(a_gpu)       
    end                                    
                                           
end                                              
                                           
a = zeros(30)                              
run_kernel!(a)                             

I ended up using a Vector to keep track of the references like so:

    ndevices = length(CUDA.devices()) 

    a_gpu_list = Vector{CuArray{T, 1}}(undef, ndevices)

    # Launch kernels on 2 gpus
    for idev in 1:ndevices
        CUDA.device!(idev-1)
        a_gpu = CuArray(view(a, idx[idev]))
        n = length(idx[idev])

        @cuda threads=n kernel!(a_gpu)
        a_gpu_list[idev] = a_gpu
    end                                    
                                           
    # Copy results back from 2 gpus          
    for idev in 1:ndevices
        CUDA.device!(idev-1)
        a[idx[idev]] .= Array(a_gpu_list[idev])       
    end   

I’m open to more efficient solutions to handle this.