Hello!
I’m trying to train a neural network with Flux.jl on the GPU using CUDA.jl. My issue (see the error below) is that something seems to go wrong when data is transferred between the GPU and the CPU, as if the references to the arrays were being lost. I have no experience with GPU programming, but it seems logical that both the data and the model need to be transferred to GPU memory for training and inference to run there, so I think I must be missing something. This is the code I’m using to perform the training:
using Flux
using JLD2
using CUDA
using FileIO

function train(xtrain, ytrain, epochs::Integer)
    input_shape = size(xtrain, 1)
    dataloader = Flux.DataLoader((xtrain, ytrain),
                                 batchsize = 64,
                                 shuffle = true) |> gpu
    model = Chain(
        Dense(input_shape => 256, relu),
        Dense(256 => 32, relu),
        Dense(32 => 1, sigmoid),
    ) |> gpu
    loss(x, y) = Flux.Losses.mse(model(x), y)
    optimizer = Flux.ADAM()
    trainable_params = Flux.params(model)
    for epoch in 1:epochs
        Flux.train!(loss, trainable_params, dataloader, optimizer)
    end
end

function main()
    (xtrain, ytrain) = FileIO.load("../data/tmp/nn_ready.jld2", "xtrain", "ytrain")
    (xtest, ytest) = FileIO.load("../data/tmp/nn_ready.jld2", "xtest", "ytest")
    train(xtrain, ytrain, 3)
end

main()
Running this produces the following error:
ERROR: LoadError: ArgumentError: cannot take the CPU address of a CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}
Full error stacktrace:
ERROR: LoadError: ArgumentError: cannot take the CPU address of a CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}
Stacktrace:
[1] unsafe_convert(#unused#::Type{Ptr{Float64}}, x::CuArray{Float64, 2, CUDA.Mem.DeviceBuffer})
@ CUDA ~/.julia/packages/CUDA/fAEDi/src/array.jl:319
[2] gemm!(transA::Char, transB::Char, alpha::Float64, A::CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}, B::Matrix{Float64}, beta::Float64, C::Matrix{Float64})
@ LinearAlgebra.BLAS /usr/share/julia/stdlib/v1.7/LinearAlgebra/src/blas.jl:1421
[3] gemm_wrapper!(C::Matrix{Float64}, tA::Char, tB::Char, A::CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}, B::Matrix{Float64}, _add::LinearAlgebra.MulAddMul{true, true, Bool, Bool})
@ LinearAlgebra /usr/share/julia/stdlib/v1.7/LinearAlgebra/src/matmul.jl:671
[4] mul!
@ /usr/share/julia/stdlib/v1.7/LinearAlgebra/src/matmul.jl:169 [inlined]
[5] mul!
@ /usr/share/julia/stdlib/v1.7/LinearAlgebra/src/matmul.jl:275 [inlined]
[6] *(A::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, B::Matrix{Float64})
@ LinearAlgebra /usr/share/julia/stdlib/v1.7/LinearAlgebra/src/matmul.jl:160
[7] rrule
@ ~/.julia/packages/ChainRules/MsNLy/src/rulesets/Base/arraymath.jl:64 [inlined]
[8] rrule
@ ~/.julia/packages/ChainRulesCore/RbX5a/src/rules.jl:134 [inlined]
[9] chain_rrule
@ ~/.julia/packages/Zygote/DkIUK/src/compiler/chainrules.jl:217 [inlined]
[10] macro expansion
@ ~/.julia/packages/Zygote/DkIUK/src/compiler/interface2.jl:0 [inlined]
[11] _pullback
@ ~/.julia/packages/Zygote/DkIUK/src/compiler/interface2.jl:9 [inlined]
[12] _pullback
@ ~/.julia/packages/Flux/6Q5r4/src/layers/basic.jl:159 [inlined]
[13] macro expansion
@ ~/.julia/packages/Flux/6Q5r4/src/layers/basic.jl:53 [inlined]
[14] _pullback
@ ~/.julia/packages/Flux/6Q5r4/src/layers/basic.jl:53 [inlined]
[15] _pullback(::Zygote.Context, ::typeof(Flux.applychain), ::Tuple{Dense{typeof(relu), CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(relu), CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(σ), CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, ::Matrix{Float64})
@ Zygote ~/.julia/packages/Zygote/DkIUK/src/compiler/interface2.jl:0
[16] _pullback
@ ~/.julia/packages/Flux/6Q5r4/src/layers/basic.jl:51 [inlined]
[17] _pullback
@ ~/Documents/projetos/dl-solar-sao-paulo/src/train_model.jl:19 [inlined]
[18] _pullback(::Zygote.Context, ::var"#loss#19"{Chain{Tuple{Dense{typeof(relu), CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(relu), CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(σ), CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}}, ::Matrix{Float64}, ::LinearAlgebra.Adjoint{Float64, Vector{Float64}})
@ Zygote ~/.julia/packages/Zygote/DkIUK/src/compiler/interface2.jl:0
[19] _apply
@ ./boot.jl:814 [inlined]
[20] adjoint
@ ~/.julia/packages/Zygote/DkIUK/src/lib/lib.jl:204 [inlined]
[21] _pullback
@ ~/.julia/packages/ZygoteRules/AIbCs/src/adjoint.jl:65 [inlined]
[22] _pullback
@ ~/.julia/packages/Flux/6Q5r4/src/optimise/train.jl:120 [inlined]
[23] _pullback(::Zygote.Context, ::Flux.Optimise.var"#37#40"{var"#loss#19"{Chain{Tuple{Dense{typeof(relu), CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(relu), CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(σ), CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}}, Tuple{Matrix{Float64}, LinearAlgebra.Adjoint{Float64, Vector{Float64}}}})
@ Zygote ~/.julia/packages/Zygote/DkIUK/src/compiler/interface2.jl:0
[24] pullback(f::Function, ps::Zygote.Params{Zygote.Buffer{Any, Vector{Any}}})
@ Zygote ~/.julia/packages/Zygote/DkIUK/src/compiler/interface.jl:352
[25] gradient(f::Function, args::Zygote.Params{Zygote.Buffer{Any, Vector{Any}}})
@ Zygote ~/.julia/packages/Zygote/DkIUK/src/compiler/interface.jl:75
[26] macro expansion
@ ~/.julia/packages/Flux/6Q5r4/src/optimise/train.jl:119 [inlined]
[27] macro expansion
@ ~/.julia/packages/ProgressLogging/6KXlp/src/ProgressLogging.jl:328 [inlined]
[28] train!(loss::Function, ps::Zygote.Params{Zygote.Buffer{Any, Vector{Any}}}, data::MLUtils.DataLoader{Tuple{LinearAlgebra.Adjoint{Float64, Matrix{Float64}}, LinearAlgebra.Adjoint{Float64, Vector{Float64}}}, Random._GLOBAL_RNG}, opt::ADAM; cb::Flux.Optimise.var"#38#41")
@ Flux.Optimise ~/.julia/packages/Flux/6Q5r4/src/optimise/train.jl:117
[29] train!(loss::Function, ps::Zygote.Params{Zygote.Buffer{Any, Vector{Any}}}, data::MLUtils.DataLoader{Tuple{LinearAlgebra.Adjoint{Float64, Matrix{Float64}}, LinearAlgebra.Adjoint{Float64, Vector{Float64}}}, Random._GLOBAL_RNG}, opt::ADAM)
@ Flux.Optimise ~/.julia/packages/Flux/6Q5r4/src/optimise/train.jl:114
[30] train(xtrain::LinearAlgebra.Adjoint{Float64, Matrix{Float64}}, ytrain::LinearAlgebra.Adjoint{Float64, Vector{Float64}}, epochs::Int64)
@ Main ~/Documents/projetos/dl-solar-sao-paulo/src/train_model.jl:24
[31] main()
@ Main ~/Documents/projetos/dl-solar-sao-paulo/src/train_model.jl:31
[32] top-level scope
@ ~/Documents/projetos/dl-solar-sao-paulo/src/train_model.jl:35
[33] include(fname::String)
@ Base.MainInclude ./client.jl:451
[34] top-level scope
@ REPL[7]:1
[35] top-level scope
@ ~/.julia/packages/CUDA/fAEDi/src/initialization.jl:52
in expression starting at /var/home/enzo/Documents/projetos/dl-solar-sao-paulo/src/train_model.jl:35
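In case it helps narrow things down: if I’m reading the stacktrace right, the failure happens when a GPU array and a CPU array end up in the same matrix multiplication (frame [3] shows gemm_wrapper! receiving a CuArray{Float64} together with plain Matrix{Float64} arguments). A standalone snippet along these lines (nothing to do with Flux; the variables are just for illustration) should, I believe, trigger the same error:

using CUDA

A = CuArray(rand(4, 4))  # Float64 matrix living on the GPU
B = rand(4, 4)           # Float64 matrix living on the CPU
A * B                    # mixed GPU/CPU multiply, like frame [3] above
# expected: ArgumentError: cannot take the CPU address of a CuArray

If that reading is correct, the question becomes why one of the arrays in my training loop is still a plain CPU Matrix.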
I thought I was possibly forgetting a |> gpu call on some of the other variables, such as trainable_params, the optimizer, or the loss() function, so I put it virtually everywhere, but the error still persisted (one of those attempts is sketched below).
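For illustration, one of those attempts looked roughly like this (a sketch from memory, so the exact placement of the |> gpu calls may have differed):

# inside train(), with gpu applied to everything I could think of
loss(x, y) = Flux.Losses.mse(model(x |> gpu), y |> gpu)
optimizer = Flux.ADAM() |> gpu
trainable_params = Flux.params(model) |> gpu

None of these variations made the error go away.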
Any ideas on what I may be missing (or on how to properly train neural nets on the GPU)?
Thanks in advance!
Packages and environment info
Julia
julia> versioninfo()
Julia Version 1.7.2
Commit bf53498635 (2022-02-06 15:21 UTC)
Platform Info:
OS: Linux (x86_64-redhat-linux)
CPU: Intel(R) Core(TM) i7-8550U CPU @ 1.80GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-12.0.1 (ORCJIT, skylake)
Packages
(@v1.7) pkg> status
Status `~/.julia/environments/v1.7/Project.toml`
[c7e460c6] ArgParse v1.1.4
[6e4b80f9] BenchmarkTools v1.3.1
[336ed68f] CSV v0.10.4
[052768ef] CUDA v3.10.0
[13f3f980] CairoMakie v0.8.1
[a93c6f00] DataFrames v1.3.4
[1313f7d8] DataFramesMeta v0.11.0
[864edb3b] DataStructures v0.18.12
[e30172f5] Documenter v0.27.16
[5789e2e9] FileIO v1.14.0
[587475ba] Flux v0.13.1
[f7bf1975] Impute v0.6.8
[033835bb] JLD2 v0.4.22
[2b0e0bc5] LanguageServer v4.2.0
[add582a8] MLJ v0.18.2
[14b8a8f1] PkgTemplates v0.7.26
[295af30f] Revise v3.3.3
[1277b4bf] ShiftedArrays v1.0.0
[b3cc710f] StaticLint v8.1.0
[69024149] StringEncodings v0.3.5
[cf896787] SymbolServer v7.2.0
[a5390f91] ZipFile v0.9.4
CUDA.jl
julia> CUDA.versioninfo()
CUDA toolkit 11.7, artifact installation
NVIDIA driver 510.68.2, for CUDA 11.6
CUDA driver 11.6
Libraries:
- CUBLAS: 11.10.1
- CURAND: 10.2.10
- CUFFT: 10.7.2
- CUSOLVER: 11.3.5
- CUSPARSE: 11.7.3
- CUPTI: 17.0.0
- NVML: 11.0.0+510.68.2
- CUDNN: 8.30.2 (for CUDA 11.5.0)
- CUTENSOR: 1.4.0 (for CUDA 11.5.0)
Toolchain:
- Julia: 1.7.2
- LLVM: 12.0.1
- PTX ISA support: 3.2, 4.0, 4.1, 4.2, 4.3, 5.0, 6.0, 6.1, 6.3, 6.4, 6.5, 7.0
- Device capability support: sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80
1 device:
0: NVIDIA GeForce MX150 (sm_61, 3.885 GiB / 4.000 GiB available)