Hi,
Let me first provide an MWE, including the error message (which helps).
# Minimal working example: referencing a global CuArray inside an `@distributed`
# loop fails with "Scalar indexing is disallowed" (see the stack trace below).
using Distributed
# Add one worker process in addition to the main process.
addprocs(1)
# Load CUDA and define a global GPU array `x` on every process (main and worker).
@everywhere begin
using CUDA
x = CUDA.rand(10)
end
# The loop body references the global `x`. Per the stack trace, dispatching the
# body to a worker goes through `serialize_global_from_main`, which calls `hash`
# on the main process's `x`; the generic array `hash` iterates the CuArray and
# triggers the scalar-indexing error before anything is printed.
@sync @distributed for i = 1:2
println(x)
end
Error message and stacktrace
ERROR: TaskFailedException
nested task error: Unhandled Task ERROR: Scalar indexing is disallowed.
Invocation of getindex resulted in scalar indexing of a GPU array.
This is typically caused by calling an iterating implementation of a method.
Such implementations *do not* execute on the GPU, but very slowly on the CPU,
and therefore should be avoided.
If you want to allow scalar iteration, use `allowscalar` or `@allowscalar`
to enable scalar iteration globally or for the operations in question.
Stacktrace:
[1] error(s::String)
@ Base .\error.jl:35
[2] errorscalar(op::String)
@ GPUArraysCore (...)\.julia\packages\GPUArraysCore\GMsgk\src\GPUArraysCore.jl:155
[3] _assertscalar(op::String, behavior::GPUArraysCore.ScalarIndexing)
@ GPUArraysCore (...)\.julia\packages\GPUArraysCore\GMsgk\src\GPUArraysCore.jl:128
[4] assertscalar(op::String)
@ GPUArraysCore (...)\.julia\packages\GPUArraysCore\GMsgk\src\GPUArraysCore.jl:116
[5] getindex
@ (...)\.julia\packages\GPUArrays\qt4ax\src\host\indexing.jl:50 [inlined]
[6] iterate
@ .\abstractarray.jl:1217 [inlined]
[7] iterate
@ .\abstractarray.jl:1215 [inlined]
[8] hash(A::CuArray{Float32, 1, CUDA.DeviceMemory}, h::UInt64)
@ Base .\abstractarray.jl:3430
[9] hash(x::CuArray{Float32, 1, CUDA.DeviceMemory})
@ Base .\hashing.jl:30
[10] serialize_global_from_main(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, sym::Symbol)
@ Distributed (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\clusterserialize.jl:151
[11] #8
@ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\clusterserialize.jl:101 [inlined]
[12] foreach
@ .\abstractarray.jl:3097 [inlined]
[13] serialize(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, t::Core.TypeName)
@ Distributed (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\clusterserialize.jl:101
[14] serialize_type_data(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, t::DataType)
@ Serialization (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Serialization\src\Serialization.jl:560
[15] serialize(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, t::DataType)
@ Serialization (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Serialization\src\Serialization.jl:595
[16] serialize_type_data(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, t::DataType)
@ Serialization (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Serialization\src\Serialization.jl:578
[17] serialize_type(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, t::DataType, ref::Bool)
@ Serialization (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Serialization\src\Serialization.jl:602
[18] serialize_any(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, x::Any)
@ Serialization (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Serialization\src\Serialization.jl:671
[19] serialize(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, x::Any)
@ Serialization (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Serialization\src\Serialization.jl:655
[20] serialize_msg(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, o::Distributed.CallMsg{:call})
@ Distributed (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\messages.jl:78
[21] #invokelatest#2
@ .\essentials.jl:892 [inlined]
[22] invokelatest
@ .\essentials.jl:889 [inlined]
[23] send_msg_(w::Distributed.Worker, header::Distributed.MsgHeader, msg::Distributed.CallMsg{:call}, now::Bool)
@ Distributed (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\messages.jl:181
[24] send_msg
@ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\messages.jl:122 [inlined]
[25] #remotecall#156
@ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\remotecall.jl:436 [inlined]
[26] remotecall
@ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\remotecall.jl:434 [inlined]
[27] remotecall(::Function, ::Int64; kwargs::@Kwargs{})
@ Distributed (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\remotecall.jl:447
[28] remotecall
@ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\remotecall.jl:447 [inlined]
[29] spawnat
@ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\macros.jl:11 [inlined]
[30] spawn_somewhere
@ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\macros.jl:13 [inlined]
[31] macro expansion
@ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\macros.jl:91 [inlined]
[32] macro expansion
@ .\task.jl:479 [inlined]
[33] (::Distributed.var"#177#179"{var"#1#2", UnitRange{Int64}})()
@ Distributed (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\macros.jl:278
Scalar indexing is disallowed.
Invocation of getindex resulted in scalar indexing of a GPU array.
This is typically caused by calling an iterating implementation of a method.
Such implementations *do not* execute on the GPU, but very slowly on the CPU,
and therefore should be avoided.
If you want to allow scalar iteration, use `allowscalar` or `@allowscalar`
to enable scalar iteration globally or for the operations in question.
Stacktrace:
[1] error(s::String)
@ Base .\error.jl:35
[2] errorscalar(op::String)
@ GPUArraysCore (...)\.julia\packages\GPUArraysCore\GMsgk\src\GPUArraysCore.jl:155
[3] _assertscalar(op::String, behavior::GPUArraysCore.ScalarIndexing)
@ GPUArraysCore (...)\.julia\packages\GPUArraysCore\GMsgk\src\GPUArraysCore.jl:128
[4] assertscalar(op::String)
@ GPUArraysCore (...)\.julia\packages\GPUArraysCore\GMsgk\src\GPUArraysCore.jl:116
[5] getindex
@ (...)\.julia\packages\GPUArrays\qt4ax\src\host\indexing.jl:50 [inlined]
[6] iterate
@ .\abstractarray.jl:1217 [inlined]
[7] iterate
@ .\abstractarray.jl:1215 [inlined]
[8] hash(A::CuArray{Float32, 1, CUDA.DeviceMemory}, h::UInt64)
@ Base .\abstractarray.jl:3430
[9] hash(x::CuArray{Float32, 1, CUDA.DeviceMemory})
@ Base .\hashing.jl:30
[10] serialize_global_from_main(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, sym::Symbol)
@ Distributed (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\clusterserialize.jl:151
[11] #8
@ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\clusterserialize.jl:101 [inlined]
[12] foreach
@ .\abstractarray.jl:3097 [inlined]
[13] serialize(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, t::Core.TypeName)
@ Distributed (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\clusterserialize.jl:101
[14] serialize_type_data(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, t::DataType)
@ Serialization (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Serialization\src\Serialization.jl:560
[15] serialize(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, t::DataType)
@ Serialization (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Serialization\src\Serialization.jl:595
[16] serialize_type_data(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, t::DataType)
@ Serialization (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Serialization\src\Serialization.jl:578
[17] serialize_type(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, t::DataType, ref::Bool)
@ Serialization (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Serialization\src\Serialization.jl:602
[18] serialize_any(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, x::Any)
@ Serialization (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Serialization\src\Serialization.jl:671
[19] serialize(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, x::Any)
@ Serialization (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Serialization\src\Serialization.jl:655
[20] serialize_msg(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, o::Distributed.CallMsg{:call})
@ Distributed (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\messages.jl:78
[21] #invokelatest#2
@ .\essentials.jl:892 [inlined]
[22] invokelatest
@ .\essentials.jl:889 [inlined]
[23] send_msg_(w::Distributed.Worker, header::Distributed.MsgHeader, msg::Distributed.CallMsg{:call}, now::Bool)
@ Distributed (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\messages.jl:181
[24] send_msg
@ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\messages.jl:122 [inlined]
[25] #remotecall#156
@ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\remotecall.jl:436 [inlined]
[26] remotecall
@ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\remotecall.jl:434 [inlined]
[27] remotecall(::Function, ::Int64; kwargs::@Kwargs{})
@ Distributed (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\remotecall.jl:447
[28] remotecall
@ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\remotecall.jl:447 [inlined]
[29] spawnat
@ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\macros.jl:11 [inlined]
[30] spawn_somewhere
@ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\macros.jl:13 [inlined]
[31] macro expansion
@ (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\macros.jl:91 [inlined]
[32] macro expansion
@ .\task.jl:479 [inlined]
[33] (::Distributed.var"#177#179"{var"#1#2", UnitRange{Int64}})()
@ Distributed (...)\.julia\juliaup\julia-1.10.4+0.x64.w64.mingw32\share\julia\stdlib\v1.10\Distributed\src\macros.jl:278
Stacktrace:
[1] sync_end(c::Channel{Any})
@ Base .\task.jl:448
[2] macro expansion
@ task.jl:480 [inlined]
[3] top-level scope
@ REPL[4]:1
Now I don’t have any experience with Distributed.jl and I also didn’t fully verify the explanation below. But to me it seems we want to send the code to execute (i.e. `println(x)`) to all processes (`send_msg`). For this we need to serialize `x` (*). As this is not explicitly implemented for `CuArray`s, we use the generic `AbstractVector` version, where we presumably iterate over its elements. For a `CuArray` this then results in the scalar indexing error.
(*) But didn’t we define `x` on each process? So why would we need to send it?
Well, if you replace `x = CUDA.rand(10)` with `x = rand()` (and add an `@everywhere println(x)` here to see that the processes get different values), you’ll notice that the `@distributed` part now runs, but prints the same value for all processes. So apparently we are sending over our (the main process’s) value of `x` and not using the ones already defined on the other processes. See also this topic.