Unsafe_convert when trying to run a neural network on GPU

I want to run a single-input neural network on the GPU using CUDA, as follows.

using CUDA, Lux, Random
rng = Random.default_rng()

data = gpu(rand(3, 10))
neuralNetwork = Lux.Chain(Lux.Dense(1, 4), Lux.Dense(4, 2))
ps, st = Lux.setup(rng, neuralNetwork) |> gpu

function predict(p, t)
    Array(first(neuralNetwork([t], p, st)))
end

function loss(p)
    pred = CUDA.@allowscalar reduce(hcat, [predict(p, data[1, i]) for i in 1:10])
    return sum(abs2, pred .- data[2:3, :])
end

println(loss(ps))

I get the following error when running it on the GPU; running it on the CPU works as intended.

ERROR: LoadError: MethodError: no method matching unsafe_convert(::Type{Ptr{Float32}}, ::CuPtr{Float32})
Closest candidates are:
  unsafe_convert(!Matched::Type{RefOrCuRef{T}}, ::Union{CuPtr{T}, CUDA.CuRefArray{T, A} where A<:(AbstractArray{T})}) where T at ~/.julia/packages/CUDA/OYQsb/src/pointer.jl:264
  unsafe_convert(!Matched::Type{RefOrCuRef{T}}, ::Any) where T at ~/.julia/packages/CUDA/OYQsb/src/pointer.jl:260
  unsafe_convert(!Matched::Type{CuRef{T}}, ::Any) where T at ~/.julia/packages/CUDA/OYQsb/src/pointer.jl:208
  ...
Stacktrace:
  [1] gemv!(trans::Char, alpha::Float32, A::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, X::Vector{Float32}, beta::Float32, Y::Vector{Float32})
    @ LinearAlgebra.BLAS /carnegie/binaries/centos7/julia/1.8.5/share/julia/stdlib/v1.8/LinearAlgebra/src/blas.jl:666
  [2] gemv!(y::Vector{Float32}, tA::Char, A::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, x::Vector{Float32}, α::Bool, β::Bool)
    @ LinearAlgebra /carnegie/binaries/centos7/julia/1.8.5/share/julia/stdlib/v1.8/LinearAlgebra/src/matmul.jl:503
  [3] mul!
    @ /carnegie/binaries/centos7/julia/1.8.5/share/julia/stdlib/v1.8/LinearAlgebra/src/matmul.jl:65 [inlined]
  [4] mul!
    @ /carnegie/binaries/centos7/julia/1.8.5/share/julia/stdlib/v1.8/LinearAlgebra/src/matmul.jl:276 [inlined]
  [5] *
    @ /carnegie/binaries/centos7/julia/1.8.5/share/julia/stdlib/v1.8/LinearAlgebra/src/matmul.jl:52 [inlined]
  [6] Dense
    @ ~/.julia/packages/Lux/s0bDu/src/layers/basic.jl:243 [inlined]
  [7] apply(model::Dense{true, typeof(identity), typeof(Lux.glorot_uniform), typeof(Lux.zeros32)}, x::Vector{Float32}, ps::NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}, st::NamedTuple{(), Tuple{}})
    @ LuxCore ~/.julia/packages/LuxCore/Uvg7A/src/LuxCore.jl:98
  [8] macro expansion
    @ ~/.julia/packages/Lux/s0bDu/src/layers/containers.jl:0 [inlined]
  [9] applychain
    @ ~/.julia/packages/Lux/s0bDu/src/layers/containers.jl:460 [inlined]
 [10] (::Chain{NamedTuple{(:layer_1, :layer_2), Tuple{Dense{true, typeof(identity), typeof(Lux.glorot_uniform), typeof(Lux.zeros32)}, Dense{true, typeof(identity), typeof(Lux.glorot_uniform), typeof(Lux.zeros32)}}}})(x::Vector{Float32}, ps::NamedTuple{(:layer_1, :layer_2), Tuple{NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}, NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}}}, st::NamedTuple{(:layer_1, :layer_2), Tuple{NamedTuple{(), Tuple{}}, NamedTuple{(), Tuple{}}}})
    @ Lux ~/.julia/packages/Lux/s0bDu/src/layers/containers.jl:457
 [11] predict(p::NamedTuple{(:layer_1, :layer_2), Tuple{NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}, NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}}}, t::Float32)
    @ Main ~/Phytoplankton_PINN/test.jl:9
 [12] (::var"#2#4"{NamedTuple{(:layer_1, :layer_2), Tuple{NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}, NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}}}})(i::Int64)
    @ Main ./none:0
 [13] iterate
    @ ./generator.jl:47 [inlined]
 [14] collect(itr::Base.Generator{UnitRange{Int64}, var"#2#4"{NamedTuple{(:layer_1, :layer_2), Tuple{NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}, NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}}}}})
    @ Base ./array.jl:787
 [15] (::var"#1#3"{NamedTuple{(:layer_1, :layer_2), Tuple{NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}, NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}}}})()
    @ Main ~/.julia/packages/GPUArraysCore/HaQcr/src/GPUArraysCore.jl:125
 [16] task_local_storage(body::var"#1#3"{NamedTuple{(:layer_1, :layer_2), Tuple{NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}, NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}}}}, key::Symbol, val::GPUArraysCore.ScalarIndexing)
    @ Base ./task.jl:292
 [17] macro expansion
    @ ~/.julia/packages/GPUArraysCore/HaQcr/src/GPUArraysCore.jl:124 [inlined]
 [18] loss(p::NamedTuple{(:layer_1, :layer_2), Tuple{NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}, NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}}})
    @ Main ~/Phytoplankton_PINN/test.jl:13
 [19] top-level scope
    @ ~/Phytoplankton_PINN/test.jl:17
in expression starting at /home/jarroyoesquivel/Phytoplankton_PINN/test.jl:17
srun: error: vgpu-004: task 0: Exited with exit code 1

I know reduce(hcat, ...) does not work properly on the GPU, but I can’t seem to find a workaround when iterating over the Lux model gives me an array of arrays that I need to turn into a matrix.

You’re mixing CPU and GPU arrays in a single operation, so Julia ends up trying to convert a GPU array’s pointer to a CPU pointer, which isn’t allowed: the layer parameters are CuArrays on the GPU, while the input you pass to the model is still a plain CPU Vector, as the gemv! frame in the stack trace shows.
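For illustration (a minimal repro, not from the original post), the same unsafe_convert error shows up for any mixed-device matrix-vector product, which is exactly what the Dense layer is doing here:

using CUDA
W = CUDA.rand(Float32, 4, 3)  # weights on the GPU
x = rand(Float32, 3)          # input left on the CPU
W * x                         # MethodError: no method matching unsafe_convert(::Type{Ptr{Float32}}, ::CuPtr{Float32})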


[t] is what’s on the CPU. Do neuralNetwork(gpu([t]), p, st) instead.
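A minimal sketch of predict with that change applied, keeping the rest of the MWE above unchanged:

function predict(p, t)
    # move the scalar input onto the GPU before calling the model,
    # so the input lives on the same device as the parameters
    Array(first(neuralNetwork(gpu([t]), p, st)))
end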

Thanks, that solves the problem I had on the GPU. The MWE still doesn’t run, though; the return line now throws a strange broadcast error.

ERROR: LoadError: GPU compilation of kernel #broadcast_kernel#28(CUDA.CuKernelContext, CuDeviceMatrix{Float32, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{2}, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}, typeof(-), Tuple{Base.Broadcast.Extruded{Matrix{Float32}, Tuple{Bool, Bool}, Tuple{Int64, Int64}}, Base.Broadcast.Extruded{CuDeviceMatrix{Float32, 1}, Tuple{Bool, Bool}, Tuple{Int64, Int64}}}}, Int64) failed
KernelError: passing and using non-bitstype argument

Argument 4 to your kernel function is of type Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{2}, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}, typeof(-), Tuple{Base.Broadcast.Extruded{Matrix{Float32}, Tuple{Bool, Bool}, Tuple{Int64, Int64}}, Base.Broadcast.Extruded{CuDeviceMatrix{Float32, 1}, Tuple{Bool, Bool}, Tuple{Int64, Int64}}}}, which is not isbits:
  .args is of type Tuple{Base.Broadcast.Extruded{Matrix{Float32}, Tuple{Bool, Bool}, Tuple{Int64, Int64}}, Base.Broadcast.Extruded{CuDeviceMatrix{Float32, 1}, Tuple{Bool, Bool}, Tuple{Int64, Int64}}} which is not isbits.
    .1 is of type Base.Broadcast.Extruded{Matrix{Float32}, Tuple{Bool, Bool}, Tuple{Int64, Int64}} which is not isbits.
      .x is of type Matrix{Float32} which is not isbits.


Stacktrace:
  [1] check_invocation(job::GPUCompiler.CompilerJob)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Z5kZC/src/validation.jl:88
  [2] macro expansion
    @ ~/.julia/packages/GPUCompiler/Z5kZC/src/driver.jl:417 [inlined]
  [3] macro expansion
    @ ~/.julia/packages/TimerOutputs/LHjFw/src/TimerOutput.jl:253 [inlined]
  [4] macro expansion
    @ ~/.julia/packages/GPUCompiler/Z5kZC/src/driver.jl:416 [inlined]
  [5] emit_asm(job::GPUCompiler.CompilerJob, ir::LLVM.Module; strip::Bool, validate::Bool, format::LLVM.API.LLVMCodeGenFileType)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Z5kZC/src/utils.jl:68
  [6] cufunction_compile(job::GPUCompiler.CompilerJob, ctx::LLVM.Context)
    @ CUDA ~/.julia/packages/CUDA/OYQsb/src/compiler/execution.jl:354
  [7] #224
    @ ~/.julia/packages/CUDA/OYQsb/src/compiler/execution.jl:347 [inlined]
  [8] JuliaContext(f::CUDA.var"#224#225"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{GPUArrays.var"#broadcast_kernel#28", Tuple{CUDA.CuKernelContext, CuDeviceMatrix{Float32, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{2}, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}, typeof(-), Tuple{Base.Broadcast.Extruded{Matrix{Float32}, Tuple{Bool, Bool}, Tuple{Int64, Int64}}, Base.Broadcast.Extruded{CuDeviceMatrix{Float32, 1}, Tuple{Bool, Bool}, Tuple{Int64, Int64}}}}, Int64}}}})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Z5kZC/src/driver.jl:76
  [9] cufunction_compile(job::GPUCompiler.CompilerJob)
    @ CUDA ~/.julia/packages/CUDA/OYQsb/src/compiler/execution.jl:346
 [10] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Z5kZC/src/cache.jl:90
 [11] cufunction(f::GPUArrays.var"#broadcast_kernel#28", tt::Type{Tuple{CUDA.CuKernelContext, CuDeviceMatrix{Float32, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{2}, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}, typeof(-), Tuple{Base.Broadcast.Extruded{Matrix{Float32}, Tuple{Bool, Bool}, Tuple{Int64, Int64}}, Base.Broadcast.Extruded{CuDeviceMatrix{Float32, 1}, Tuple{Bool, Bool}, Tuple{Int64, Int64}}}}, Int64}}; name::Nothing, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ CUDA ~/.julia/packages/CUDA/OYQsb/src/compiler/execution.jl:299
 [12] cufunction
    @ ~/.julia/packages/CUDA/OYQsb/src/compiler/execution.jl:292 [inlined]
 [13] macro expansion
    @ ~/.julia/packages/CUDA/OYQsb/src/compiler/execution.jl:102 [inlined]
 [14] #launch_heuristic#248
    @ ~/.julia/packages/CUDA/OYQsb/src/gpuarrays.jl:17 [inlined]
 [15] _copyto!
    @ ~/.julia/packages/GPUArrays/6STCb/src/host/broadcast.jl:65 [inlined]
 [16] copyto!
    @ ~/.julia/packages/GPUArrays/6STCb/src/host/broadcast.jl:46 [inlined]
 [17] copy
    @ ~/.julia/packages/GPUArrays/6STCb/src/host/broadcast.jl:37 [inlined]
 [18] materialize(bc::Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{2}, Nothing, typeof(-), Tuple{Matrix{Float32}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}})
    @ Base.Broadcast ./broadcast.jl:860
 [19] loss(p::NamedTuple{(:layer_1, :layer_2), Tuple{NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}, NamedTuple{(:weight, :bias), Tuple{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}}}})
    @ Main ~/Phytoplankton_PINN/test.jl:14
 [20] top-level scope
    @ ~/Phytoplankton_PINN/test.jl:17
in expression starting at /home/jarroyoesquivel/Phytoplankton_PINN/test.jl:17
srun: error: vgpu-002: task 0: Exited with exit code 1

You’ll want to chop that manual Array(...) conversion out of predict: it copies the model output back to the CPU, so pred ends up as a plain Matrix that then gets broadcast against the GPU array data[2:3,:], which is what the non-isbits kernel error is complaining about.
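Putting both fixes together, a sketch under the same assumptions as the MWE above (reduce(hcat, ...) over CuArrays still allocates an intermediate per step, so it isn’t efficient, but everything now stays on one device):

function predict(p, t)
    # keep the output on the GPU instead of copying it back with Array(...)
    first(neuralNetwork(gpu([t]), p, st))
end

function loss(p)
    # data[1, i] is scalar indexing into a CuArray, hence the @allowscalar
    pred = CUDA.@allowscalar reduce(hcat, [predict(p, data[1, i]) for i in 1:10])
    # pred and data[2:3, :] are both CuArrays now, so the broadcast runs on the GPU
    return sum(abs2, pred .- data[2:3, :])
end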
