CUDA performing scalar indexing when used along with Distributed

Hi,

Let me first provide an MWE, including the error message (which helps :slight_smile: ).

# Minimal working example: a global CuArray is referenced inside a
# `@distributed` loop body.
using Distributed

# Add one worker process in addition to the main process.
addprocs(1)
# Run the block on every process, so each one loads CUDA and defines its own
# global `x`.
@everywhere begin
    using CUDA
    x = CUDA.rand(10)
end

# The loop body refers to the global `x`; `@sync` waits for the distributed
# loop to finish. Running this raises the TaskFailedException shown below.
@sync @distributed for i = 1:2
    println(x)
end
Error message and stacktrace
ERROR: TaskFailedException

    nested task error: Unhandled Task ERROR: Scalar indexing is disallowed.
Invocation of getindex resulted in scalar indexing of a GPU array.
This is typically caused by calling an iterating implementation of a method.
Such implementations *do not* execute on the GPU, but very slowly on the CPU,
and therefore should be avoided.

If you want to allow scalar iteration, use `allowscalar` or `@allowscalar`
to enable scalar iteration globally or for the operations in question.
Stacktrace:
  [1] error(s::String)
    @ Base .\error.jl:35
  [2] errorscalar(op::String)
    @ GPUArraysCore (...)\.julia\packages\GPUArraysCore\GMsgk\src\GPUArraysCore.jl:155
  [3] _assertscalar(op::String, behavior::GPUArraysCore.ScalarIndexing)
    @ GPUArraysCore (...)\.julia\packages\GPUArraysCore\GMsgk\src\GPUArraysCore.jl:128
  [4] assertscalar(op::String)
    @ GPUArraysCore (...)\.julia\packages\GPUArraysCore\GMsgk\src\GPUArraysCore.jl:116
  [5] getindex
    @ (...)\.julia\packages\GPUArrays\qt4ax\src\host\indexing.jl:50 [inlined]
  [6] iterate
    @ .\abstractarray.jl:1217 [inlined]
  [7] iterate
    @ .\abstractarray.jl:1215 [inlined]
  [8] hash(A::CuArray{Float32, 1, CUDA.DeviceMemory}, h::UInt64)
    @ Base .\abstractarray.jl:3430
  [9] hash(x::CuArray{Float32, 1, CUDA.DeviceMemory})
    @ Base .\hashing.jl:30
 [10] serialize_global_from_main(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, sym::Symbol)
    @ Distributed (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\clusterserialize.jl:151
 [11] #8
    @ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\clusterserialize.jl:101 [inlined]
 [12] foreach
    @ .\abstractarray.jl:3097 [inlined]
 [13] serialize(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, t::Core.TypeName)
    @ Distributed (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\clusterserialize.jl:101
 [14] serialize_type_data(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, t::DataType)
    @ Serialization (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Serialization\src\Serialization.jl:560
 [15] serialize(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, t::DataType)
    @ Serialization (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Serialization\src\Serialization.jl:595
 [16] serialize_type_data(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, t::DataType)
    @ Serialization (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Serialization\src\Serialization.jl:578
 [17] serialize_type(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, t::DataType, ref::Bool)
    @ Serialization (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Serialization\src\Serialization.jl:602
 [18] serialize_any(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, x::Any)
    @ Serialization (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Serialization\src\Serialization.jl:671
 [19] serialize(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, x::Any)
    @ Serialization (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Serialization\src\Serialization.jl:655
 [20] serialize_msg(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, o::Distributed.CallMsg{:call})
    @ Distributed (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\messages.jl:78
 [21] #invokelatest#2
    @ .\essentials.jl:892 [inlined]
 [22] invokelatest
    @ .\essentials.jl:889 [inlined]
 [23] send_msg_(w::Distributed.Worker, header::Distributed.MsgHeader, msg::Distributed.CallMsg{:call}, now::Bool)
    @ Distributed (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\messages.jl:181
 [24] send_msg
    @ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\messages.jl:122 [inlined]
 [25] #remotecall#156
    @ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\remotecall.jl:436 [inlined]
 [26] remotecall
    @ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\remotecall.jl:434 [inlined]
 [27] remotecall(::Function, ::Int64; kwargs::@Kwargs{})
    @ Distributed (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\remotecall.jl:447
 [28] remotecall
    @ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\remotecall.jl:447 [inlined]
 [29] spawnat
    @ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\macros.jl:11 [inlined]
 [30] spawn_somewhere
    @ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\macros.jl:13 [inlined]
 [31] macro expansion
    @ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\macros.jl:91 [inlined]
 [32] macro expansion
    @ .\task.jl:479 [inlined]
 [33] (::Distributed.var"#177#179"{var"#1#2", UnitRange{Int64}})()
    @ Distributed (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\macros.jl:278
Scalar indexing is disallowed.
    Invocation of getindex resulted in scalar indexing of a GPU array.
    This is typically caused by calling an iterating implementation of a method.
    Such implementations *do not* execute on the GPU, but very slowly on the CPU,
    and therefore should be avoided.

    If you want to allow scalar iteration, use `allowscalar` or `@allowscalar`
    to enable scalar iteration globally or for the operations in question.
    Stacktrace:
      [1] error(s::String)
        @ Base .\error.jl:35
      [2] errorscalar(op::String)
        @ GPUArraysCore (...)\.julia\packages\GPUArraysCore\GMsgk\src\GPUArraysCore.jl:155
      [3] _assertscalar(op::String, behavior::GPUArraysCore.ScalarIndexing)
        @ GPUArraysCore (...)\.julia\packages\GPUArraysCore\GMsgk\src\GPUArraysCore.jl:128
      [4] assertscalar(op::String)
        @ GPUArraysCore (...)\.julia\packages\GPUArraysCore\GMsgk\src\GPUArraysCore.jl:116
      [5] getindex
        @ (...)\.julia\packages\GPUArrays\qt4ax\src\host\indexing.jl:50 [inlined]
      [6] iterate
        @ .\abstractarray.jl:1217 [inlined]
      [7] iterate
        @ .\abstractarray.jl:1215 [inlined]
      [8] hash(A::CuArray{Float32, 1, CUDA.DeviceMemory}, h::UInt64)
        @ Base .\abstractarray.jl:3430
      [9] hash(x::CuArray{Float32, 1, CUDA.DeviceMemory})
        @ Base .\hashing.jl:30
     [10] serialize_global_from_main(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, sym::Symbol)
        @ Distributed (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\clusterserialize.jl:151
     [11] #8
        @ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\clusterserialize.jl:101 [inlined]
     [12] foreach
        @ .\abstractarray.jl:3097 [inlined]
     [13] serialize(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, t::Core.TypeName)
        @ Distributed (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\clusterserialize.jl:101
     [14] serialize_type_data(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, t::DataType)
        @ Serialization (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Serialization\src\Serialization.jl:560
     [15] serialize(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, t::DataType)
        @ Serialization (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Serialization\src\Serialization.jl:595
     [16] serialize_type_data(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, t::DataType)
        @ Serialization (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Serialization\src\Serialization.jl:578
     [17] serialize_type(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, t::DataType, ref::Bool)
        @ Serialization (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Serialization\src\Serialization.jl:602
     [18] serialize_any(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, x::Any)
        @ Serialization (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Serialization\src\Serialization.jl:671
     [19] serialize(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, x::Any)
        @ Serialization (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Serialization\src\Serialization.jl:655
     [20] serialize_msg(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, o::Distributed.CallMsg{:call})
        @ Distributed (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\messages.jl:78
     [21] #invokelatest#2
        @ .\essentials.jl:892 [inlined]
     [22] invokelatest
        @ .\essentials.jl:889 [inlined]
     [23] send_msg_(w::Distributed.Worker, header::Distributed.MsgHeader, msg::Distributed.CallMsg{:call}, now::Bool)
        @ Distributed (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\messages.jl:181
     [24] send_msg
        @ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\messages.jl:122 [inlined]
     [25] #remotecall#156
        @ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\remotecall.jl:436 [inlined]
     [26] remotecall
        @ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\remotecall.jl:434 [inlined]
     [27] remotecall(::Function, ::Int64; kwargs::@Kwargs{})
        @ Distributed (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\remotecall.jl:447
     [28] remotecall
        @ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\remotecall.jl:447 [inlined]
     [29] spawnat
        @ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\macros.jl:11 [inlined]
     [30] spawn_somewhere
        @ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\macros.jl:13 [inlined]
     [31] macro expansion
        @ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\macros.jl:91 [inlined]
     [32] macro expansion
        @ .\task.jl:479 [inlined]
     [33] (::Distributed.var"#177#179"{var"#1#2", UnitRange{Int64}})()
        @ Distributed (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\macros.jl:278
Stacktrace:
 [1] sync_end(c::Channel{Any})
   @ Base .\task.jl:448
 [2] macro expansion
   @ task.jl:480 [inlined]
 [3] top-level scope
   @ REPL[4]:1

Now I don’t have any experience with Distributed.jl and I also didn’t fully verify the explanation below. But to me it seems we want to send the code to execute (i.e. `println(x)`) to all processes (`send_msg`). For this we need to serialize `x`(*). Judging by the stacktrace, Distributed computes `hash(x)` while doing so (in `serialize_global_from_main`). As `hash` is not explicitly implemented for CuArrays, we use the generic AbstractArray version from Base, which iterates over the elements. For a CuArray this then results in the scalar indexing error.

*But didn’t we define x on each process? So why would we need to send it?
Well, if you replace `x = CUDA.rand(10)` with `x = rand()` (and add an `@everywhere println(x)` right after the `@everywhere` block to see that the processes get different values), you’ll notice that the `@distributed` part now runs, but prints the same value for all processes. So apparently we are sending over our (the main process’s) value of `x` and not using the ones already defined on the other processes. See also this topic.