I ended up using a Vector to keep a reference to each device's CuArray, like so:
# Keep one device buffer per GPU referenced in a Vector so that every result
# can be fetched after all kernels have been launched.
ndevices = length(CUDA.devices())
a_gpu_list = Vector{CuArray{T, 1}}(undef, ndevices)

# Phase 1: for each device, upload its slice of `a` and launch the kernel on it.
for slot in eachindex(a_gpu_list)
    # Slots count from 1; the ordinal handed to `device!` is one less.
    CUDA.device!(slot - 1)
    span = idx[slot]
    device_buf = CuArray(view(a, span))
    # The launch uses one thread per element of the slice.
    # NOTE(review): presumably the slice length stays within the device's
    # per-block thread limit — confirm against `kernel!` and `idx`.
    nthreads = length(span)
    @cuda threads=nthreads kernel!(device_buf)
    a_gpu_list[slot] = device_buf
end

# Phase 2: for each device, download its buffer into the matching slice of `a`.
for slot in eachindex(a_gpu_list)
    CUDA.device!(slot - 1)
    host_copy = Array(a_gpu_list[slot])
    a[idx[slot]] .= host_copy
end
I’m open to more efficient solutions to handle this.