I want to run a single-input neural network on the GPU using CUDA, as follows:
using CUDA, Lux, Random

rng = Random.default_rng()
data = gpu(rand(3, 10))
neuralNetwork = Lux.Chain(Lux.Dense(1, 4), Lux.Dense(4, 2))
ps, st = Lux.setup(rng, neuralNetwork) |> gpu

function predict(p, t)
    Array(first(neuralNetwork([t], p, st)))
end

function loss(p)
    pred = CUDA.@allowscalar reduce(hcat, [predict(p, data[1, i]) for i in 1:10])
    return sum(abs2, pred .- data[2:3, :])
end

println(loss(ps))
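For what it's worth, the stack trace below suggests the failure happens inside the Dense layer's matrix-vector multiply: the parameters are CuArrays, while the [t] I build inside predict is a plain CPU Vector{Float32}. A minimal sketch of what I think is the same CPU/GPU mix (hypothetical, W just stands in for a layer weight, not taken from my actual script):

using CUDA
W = CUDA.rand(Float32, 4, 1)   # weight matrix living on the GPU
x = Float32[0.5]               # plain CPU vector, like the [t] built in predict
W * x                          # mixes a CuArray with a CPU Vector, hitting the gemv!/unsafe_convert path shown below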
I get the following error when running it on the GPU; running the same code on the CPU works as intended.
ERROR: LoadError: MethodError: no method matching unsafe_convert(::Type{Ptr{Float32}}, ::CuPtr{Float32})
Closest candidates are:
unsafe_convert(!Matched::Type{RefOrCuRef{T}}, ::Union{CuPtr{T}, CUDA.CuRefArray{T, A} where A<:(AbstractArray{T})}) where T at ~/.julia/packages/CUDA/OYQsb/src/pointer.jl:264
unsafe_convert(!Matched::Type{RefOrCuRef{T}}, ::Any) where T at ~/.julia/packages/CUDA/OYQsb/src/pointer.jl:260
unsafe_convert(!Matched::Type{CuRef{T}}, ::Any) where T at ~/.julia/packages/CUDA/OYQsb/src/pointer.jl:208
...
Stacktrace:
[1] gemv!(trans::Char, alpha::Float32, A::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, X::Vector{Float32}, beta::Float32, Y::Vector{Float32})
@ LinearAlgebra.BLAS /carnegie/binaries/centos7/julia/1.8.5/share/julia/stdlib/v1.8/LinearAlgebra/src/blas.jl:666
[2] gemv!(y::Vector{Float32}, tA::Char, A::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, x::Vector{Float32}, α::Bool, β::Bool)
@ LinearAlgebra /carnegie/binaries/centos7/julia/1.8.5/share/julia/stdlib/v1.8/LinearAlgebra/src/matmul.jl:503
[3] mul!
@ /carnegie/binaries/centos7/julia/1.8.5/share/julia/stdlib/v1.8/LinearAlgebra/src/matmul.jl:65 [inlined]
[4] mul!
@ /carnegie/binaries/centos7/julia/1.8.5/share/julia/stdlib/v1.8/LinearAlgebra/src/matmul.jl:276 [inlined]
[5] *
@ /carnegie/binaries/centos7/julia/1.8.5/share/julia/stdlib/v1.8/LinearAlgebra/src/matmul.jl:52 [inlined]
[6] Dense
@ ~/.julia/packages/Lux/s0bDu/src/layers/basic.jl:243 [inlined]
[7] apply(model::Dense{true, typeof(identity), typeof(Lux.glorot_uniform), typeof(Lux.zeros32)}, x::Vector{Float32}, ps::NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}, st::NamedTuple{(), Tuple{}})
@ LuxCore ~/.julia/packages/LuxCore/Uvg7A/src/LuxCore.jl:98
[8] macro expansion
@ ~/.julia/packages/Lux/s0bDu/src/layers/containers.jl:0 [inlined]
[9] applychain
@ ~/.julia/packages/Lux/s0bDu/src/layers/containers.jl:460 [inlined]
[10] (::Chain{NamedTuple{(:layer_1, :layer_2), Tuple{Dense{true, typeof(identity), typeof(Lux.glorot_uniform), typeof(Lux.zeros32)}, Dense{true, typeof(identity), typeof(Lux.glorot_uniform), typeof(Lux.zeros32)}}}})(x::Vector{Float32}, ps::NamedTuple{(:layer_1, :layer_2), Tuple{NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}, NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}}}, st::NamedTuple{(:layer_1, :layer_2), Tuple{NamedTuple{(), Tuple{}}, NamedTuple{(), Tuple{}}}})
@ Lux ~/.julia/packages/Lux/s0bDu/src/layers/containers.jl:457
[11] predict(p::NamedTuple{(:layer_1, :layer_2), Tuple{NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}, NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}}}, t::Float32)
@ Main ~/Phytoplankton_PINN/test.jl:9
[12] (::var"#2#4"{NamedTuple{(:layer_1, :layer_2), Tuple{NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}, NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}}}})(i::Int64)
@ Main ./none:0
[13] iterate
@ ./generator.jl:47 [inlined]
[14] collect(itr::Base.Generator{UnitRange{Int64}, var"#2#4"{NamedTuple{(:layer_1, :layer_2), Tuple{NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}, NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}}}}})
@ Base ./array.jl:787
[15] (::var"#1#3"{NamedTuple{(:layer_1, :layer_2), Tuple{NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}, NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}}}})()
@ Main ~/.julia/packages/GPUArraysCore/HaQcr/src/GPUArraysCore.jl:125
[16] task_local_storage(body::var"#1#3"{NamedTuple{(:layer_1, :layer_2), Tuple{NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}, NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}}}}, key::Symbol, val::GPUArraysCore.ScalarIndexing)
@ Base ./task.jl:292
[17] macro expansion
@ ~/.julia/packages/GPUArraysCore/HaQcr/src/GPUArraysCore.jl:124 [inlined]
[18] loss(p::NamedTuple{(:layer_1, :layer_2), Tuple{NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}, NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}}})
@ Main ~/Phytoplankton_PINN/test.jl:13
[19] top-level scope
@ ~/Phytoplankton_PINN/test.jl:17
in expression starting at /home/jarroyoesquivel/Phytoplankton_PINN/test.jl:17
srun: error: vgpu-004: task 0: Exited with exit code 1
I know reduce(hcat, ...) does not work properly on the GPU, but I can't seem to find a workaround when the output of iterating over the Lux model is an Array of Arrays that I need to transform into a matrix.
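For concreteness, the kind of batched workaround I have in mind would look roughly like the sketch below (assuming the definitions from the script above, and that it is acceptable to feed the whole first row of data as a 1×10 batch instead of looping over columns); I am not sure whether this is the recommended pattern with Lux on the GPU:

function loss_batched(p)
    # feed the whole first row as a 1×10 batch so everything stays on the GPU
    x = reshape(data[1, :], 1, :)           # 1×10 CuArray, no scalar indexing
    pred = first(neuralNetwork(x, p, st))   # 2×10 CuArray from a single forward pass
    return sum(abs2, pred .- data[2:3, :])
end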