Hey everybody, when I try to run the example from the Flux website for distributed data-parallel training, I get an error saying that gpu_device is not defined, which looks suspiciously like a Lux name. Any suggestions on what I am doing wrong?
I am using Julia 1.10.0 and Flux v0.15.2
Here is the full code that I execute with my 6 GPUs on the cluster:
mpiexecjl --project=. -n 6 julia src/testunet.jl
using Zygote, Statistics, StatsBase
using Flux, MPI, NCCL, CUDA
#I tried importing gpu_device from Lux but it did not solve the issue.
#using Lux: gpu_device
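#Maybe the binding actually comes from MLDataDevices (just a guess from the
#name; I have not verified this):
#using MLDataDevices: gpu_device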
using Optimisers
CUDA.allowscalar(false)
DistributedUtils.initialize(NCCLBackend)
backend = DistributedUtils.get_distributed_backend(NCCLBackend)
model = Chain(Dense(1 => 256, tanh), Dense(256 => 1)) |> gpu
x = rand(Float32, 1, 16) |> gpu
y = x .^ 3
data = DistributedUtils.DistributedDataContainer(backend, x)
model = DistributedUtils.synchronize!!(backend, DistributedUtils.FluxDistributedModel(model); root=0)
opt = DistributedUtils.DistributedOptimizer(backend, Optimisers.Adam(0.001f0))
st_opt = Optimisers.setup(opt, model)
st_opt = DistributedUtils.synchronize!!(backend, st_opt; root=0)
loss(model) = mean((model(x) .- y).^2)
for epoch in 1:100
    global model, st_opt
    l, grad = Zygote.withgradient(loss, model)
    println("Epoch $epoch: Loss $l")
    st_opt, model = Optimisers.update(st_opt, model, grad[1])
end
ERROR: LoadError: UndefVarError: `gpu_device` not defined
Stacktrace:
[1] allreduce!(backend::NCCLBackend{Communicator, MPIBackend{MPI.Comm}}, sendrecvbuf::CuArray{Float32, 2, CUDA.DeviceMemory}, op::typeof(Flux.DistributedUtils.avg))
@ Flux.DistributedUtils ~/.julia/packages/Flux/1wZQP/src/distributed/public_api.jl:135
[2] apply!(opt::Flux.DistributedUtils.DistributedOptimizer{NCCLBackend{Communicator, MPIBackend{MPI.Comm}}}, state::Tuple{CuArray{Float32, 2, CUDA.DeviceMemory}, CuArray{Float32, 2, CUDA.DeviceMemory}, Tuple{Float32, Float32}}, x::CuArray{Float32, 2, CUDA.DeviceMemory}, y::CuArray{Float32, 2, CUDA.DeviceMemory})
@ Flux.DistributedUtils ~/.julia/packages/Flux/1wZQP/src/distributed/public_api.jl:292
[3] _update!(ℓ::Optimisers.Leaf{Flux.DistributedUtils.DistributedOptimizer{NCCLBackend{Communicator, MPIBackend{MPI.Comm}}}, Tuple{CuArray{Float32, 2, CUDA.DeviceMemory}, CuArray{Float32, 2, CUDA.DeviceMemory}, Tuple{Float32, Float32}}}, x::CuArray{Float32, 2, CUDA.DeviceMemory}; grads::IdDict{Optimisers.Leaf, Any}, params::IdDict{Any, Any})
@ Optimisers ~/.julia/packages/Optimisers/a4OnF/src/interface.jl:96
[4] _update!
@ Optimisers ~/.julia/packages/Optimisers/a4OnF/src/interface.jl:92 [inlined]
[5] #8
@ Optimisers ~/.julia/packages/Optimisers/a4OnF/src/interface.jl:85 [inlined]
[6] map(f::Optimisers.var"#8#9"{IdDict{Optimisers.Leaf, Any}, IdDict{Any, Any}}, t::Tuple{Optimisers.Leaf{Flux.DistributedUtils.DistributedOptimizer{NCCLBackend{Communicator, MPIBackend{MPI.Comm}}}, Tuple{CuArray{Float32, 2, CUDA.DeviceMemory}, CuArray{Float32, 2, CUDA.DeviceMemory}, Tuple{Float32, Float32}}}, Optimisers.Leaf{Flux.DistributedUtils.DistributedOptimizer{NCCLBackend{Communicator, MPIBackend{MPI.Comm}}}, Tuple{CuArray{Float32, 1, CUDA.DeviceMemory}, CuArray{Float32, 1, CUDA.DeviceMemory}, Tuple{Float32, Float32}}}, Tuple{}}, s::Tuple{CuArray{Float32, 2, CUDA.DeviceMemory}, CuArray{Float32, 1, CUDA.DeviceMemory}, typeof(tanh)})
@ Base ./tuple.jl:322
[8] mapvalue
@ ~/.julia/packages/Optimisers/a4OnF/src/utils.jl:2 [inlined]
[9] _update!(ℓ::Optimisers.Leaf{Flux.DistributedUtils.DistributedOptimizer{NCCLBackend{Communicator, MPIBackend{MPI.Comm}}}, Tuple{CuArray{Float32, 2, CUDA.DeviceMemory}, CuArray{Float32, 2, CUDA.DeviceMemory}, Tuple{Float32, Float32}}}, x::CuArray{Float32, 2, CUDA.DeviceMemory}; grads::IdDict{Optimisers.Leaf, Any}, params::IdDict{Any, Any})
@ Optimisers ~/.julia/packages/Optimisers/a4OnF/src/interface.jl:96
[10] _update!
@ ~/.julia/packages/Optimisers/a4OnF/src/interface.jl:81 [inlined]
[11] #8
@ ~/.julia/packages/Optimisers/a4OnF/src/interface.jl:85 [inlined]
[12] map
@ ./tuple.jl:319 [inlined]
[13] mapvalue
@ ~/.julia/packages/Optimisers/a4OnF/src/utils.jl:2 [inlined]
[14] map(f::Optimisers.var"#8#9"{IdDict{Optimisers.Leaf, Any}, IdDict{Any, Any}}, t::Tuple{Optimisers.Leaf{Flux.DistributedUtils.DistributedOptimizer{NCCLBackend{Communicator, MPIBackend{MPI.Comm}}}, Tuple{CuArray{Float32, 2, CUDA.DeviceMemory}, CuArray{Float32, 2, CUDA.DeviceMemory}, Tuple{Float32, Float32}}}, Optimisers.Leaf{Flux.DistributedUtils.DistributedOptimizer{NCCLBackend{Communicator, MPIBackend{MPI.Comm}}}, Tuple{CuArray{Float32, 1, CUDA.DeviceMemory}, CuArray{Float32, 1, CUDA.DeviceMemory}, Tuple{Float32, Float32}}}, Tuple{}}, s::Tuple{CuArray{Float32, 2, CUDA.DeviceMemory}, CuArray{Float32, 1, CUDA.DeviceMemory}, typeof(tanh)})
@ Base ./tuple.jl:322
[15] _update!
@ ~/.julia/packages/Optimisers/a4OnF/src/interface.jl:81 [inlined]
[16] #8
@ ~/.julia/packages/Optimisers/a4OnF/src/interface.jl:85 [inlined]
[17] map
@ ./tuple.jl:318 [inlined]
[18] map
@ ./namedtuple.jl:269 [inlined]
[19] mapvalue
@ ~/.julia/packages/Optimisers/a4OnF/src/utils.jl:2 [inlined]
[20] map(f::Function, nt::@NamedTuple{weight::Optimisers.Leaf{Flux.DistributedUtils.DistributedOptimizer{NCCLBackend{Communicator, MPIBackend{MPI.Comm}}}, Tuple{CuArray{Float32, 2, CUDA.DeviceMemory}, CuArray{Float32, 2, CUDA.DeviceMemory}, Tuple{Float32, Float32}}}, bias::Optimisers.Leaf{Flux.DistributedUtils.DistributedOptimizer{NCCLBackend{Communicator, MPIBackend{MPI.Comm}}}, Tuple{CuArray{Float32, 1, CUDA.DeviceMemory}, CuArray{Float32, 1, CUDA.DeviceMemory}, Tuple{Float32, Float32}}}, σ::Tuple{}}, nts::@NamedTuple{weight::CuArray{Float32, 2, CUDA.DeviceMemory}, bias::CuArray{Float32, 1, CUDA.DeviceMemory}, σ::typeof(tanh)})
@ Base ./namedtuple.jl:269
[21] _update!
@ Optimisers ~/.julia/packages/Optimisers/a4OnF/src/interface.jl:81 [inlined]
[22] map(f::Optimisers.var"#8#9"{IdDict{Optimisers.Leaf, Any}, IdDict{Any, Any}}, t::Tuple{Optimisers.Leaf{Flux.DistributedUtils.DistributedOptimizer{NCCLBackend{Communicator, MPIBackend{MPI.Comm}}}, Tuple{CuArray{Float32, 2, CUDA.DeviceMemory}, CuArray{Float32, 2, CUDA.DeviceMemory}, Tuple{Float32, Float32}}}, Optimisers.Leaf{Flux.DistributedUtils.DistributedOptimizer{NCCLBackend{Communicator, MPIBackend{MPI.Comm}}}, Tuple{CuArray{Float32, 1, CUDA.DeviceMemory}, CuArray{Float32, 1, CUDA.DeviceMemory}, Tuple{Float32, Float32}}}, Tuple{}}, s::Tuple{CuArray{Float32, 2, CUDA.DeviceMemory}, CuArray{Float32, 1, CUDA.DeviceMemory}, typeof(tanh)})
@ Base ./tuple.jl:322
[23] _update!(tree::Tuple{@NamedTuple{weight::Optimisers.Leaf{Flux.DistributedUtils.DistributedOptimizer{NCCLBackend{Communicator, MPIBackend{MPI.Comm}}}, Tuple{CuArray{Float32, 2, CUDA.DeviceMemory}, CuArray{Float32, 2, CUDA.DeviceMemory}, Tuple{Float32, Float32}}}, bias::Optimisers.Leaf{Flux.DistributedUtils.DistributedOptimizer{NCCLBackend{Communicator, MPIBackend{MPI.Comm}}}, Tuple{CuArray{Float32, 1, CUDA.DeviceMemory}, CuArray{Float32, 1, CUDA.DeviceMemory}, Tuple{Float32, Float32}}}, σ::Tuple{}}, @NamedTuple{weight::Optimisers.Leaf{Flux.DistributedUtils.DistributedOptimizer{NCCLBackend{Communicator, MPIBackend{MPI.Comm}}}, Tuple{CuArray{Float32, 2, CUDA.DeviceMemory}, CuArray{Float32, 2, CUDA.DeviceMemory}, Tuple{Float32, Float32}}}, bias::Optimisers.Leaf{Flux.DistributedUtils.DistributedOptimizer{NCCLBackend{Communicator, MPIBackend{MPI.Comm}}}, Tuple{CuArray{Float32, 1, CUDA.DeviceMemory}, CuArray{Float32, 1, CUDA.DeviceMemory}, Tuple{Float32, Float32}}}, σ::Tuple{}}}, x::Tuple{Dense{typeof(tanh), CuArray{Float32, 2, CUDA.DeviceMemory}, CuArray{Float32, 1, CUDA.DeviceMemory}}, Dense{typeof(identity), CuArray{Float32, 2, CUDA.DeviceMemory}, CuArray{Float32, 1, CUDA.DeviceMemory}}}; grads::IdDict{Optimisers.Leaf, Any}, params::IdDict{Any, Any})
@ Optimisers ~/.julia/packages/Optimisers/a4OnF/src/interface.jl:85
[24] top-level scope
@ /lustre/miifs01/project/m2_jgu-tpchange/nibrast/code/packages/rhAi/src/testunet.jl:25
in expression starting at /lustre/miifs01/project/m2_jgu-tpchange/nibrast/code/packages/rhAi/src/testunet.jl:21
update!(::@NamedTuple{layers::Tuple{@NamedTuple{weight::Optimisers.Leaf{Flux.DistributedUtils.DistributedOptimizer{NCCLBackend{Communicator, MPIBackend{MPI.Comm}}}, Tuple{CuArray{Float32, 2, CUDA.DeviceMemory}, CuArray{Float32, 2, CUDA.DeviceMemory}, Tuple{Float32, Float32}}}, bias::Optimisers.Leaf{Flux.DistributedUtils.DistributedOptimizer{NCCLBackend{Communicator, MPIBackend{MPI.Comm}}}, Tuple{CuArray{Float32, 1, CUDA.DeviceMemory}, CuArray{Float32, 1, CUDA.DeviceMemory}, Tuple{Float32, Float32}}}, σ::Tuple{}}, @NamedTuple{weight::Optimisers.Leaf{Flux.DistributedUtils.DistributedOptimizer{NCCLBackend{Communicator, MPIBackend{MPI.Comm}}}, Tuple{CuArray{Float32, 2, CUDA.DeviceMemory}, CuArray{Float32, 2, CUDA.DeviceMemory}, Tuple{Float32, Float32}}}, bias::Optimisers.Leaf{Flux.DistributedUtils.DistributedOptimizer{NCCLBackend{Communicator, MPIBackend{MPI.Comm}}}, Tuple{CuArray{Float32, 1, CUDA.DeviceMemory}, CuArray{Float32, 1, CUDA.DeviceMemory}, Tuple{Float32, Float32}}}, σ::Tuple{}}}}, ::Chain{Tuple{Dense{typeof(tanh), CuArray{Float32, 2, CUDA.DeviceMemory}, CuArray{Float32, 1, CUDA.DeviceMemory}}, Dense{typeof(identity), CuArray{Float32, 2, CUDA.DeviceMemory}, CuArray{Float32, 1, CUDA.DeviceMemory}}}}, ::@NamedTuple{layers::Tuple{@NamedTuple{weight::CuArray{Float32, 2, CUDA.DeviceMemory}, bias::CuArray{Float32, 1, CUDA.DeviceMemory}, σ::Nothing}, @NamedTuple{weight::CuArray{Float32, 2, CUDA.DeviceMemory}, bias::CuArray{Float32, 1, CUDA.DeviceMemory}, σ::Nothing}}})
@ Optimisers ~/.julia/packages/Optimisers/a4OnF/src/interface.jl:77
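For what it's worth, the only workaround I could think of is importing gpu_device explicitly before anything distributed runs. This is just a guess from the error message: I'm assuming the missing binding is the gpu_device from MLDataDevices (the device-handling package that Lux and, as far as I know, newer Flux versions build on), and since the UndefVarError is raised from inside Flux.DistributedUtils itself, I'm not sure an import on my side can even help:

using MLDataDevices: gpu_device  # assumed source of the missing binding
dev = gpu_device()               # picks the functional GPU backend
model = Chain(Dense(1 => 256, tanh), Dense(256 => 1)) |> dev
x = rand(Float32, 1, 16) |> dev

Is that the right direction, or am I missing a setup step? Side question: do I need to pin each MPI rank to its own GPU manually, e.g. with something like CUDA.device!(MPI.Comm_rank(MPI.COMM_WORLD) % length(CUDA.devices())), or does DistributedUtils handle that?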