Struggling to train a UDE model with a GPU

Hi,

I have been trying to implement a UDE model and train it on a GPU, following the Neural ODE docs closely. However, when I run the code I get an extremely long error and I cannot figure out how to resolve it:

ERROR: `llvmcall` must be compiled to be called
Stacktrace:
   [1] macro expansion
     @ C:\Users\Stefan\.julia\packages\Zygote\g2w9o\src\compiler\interface2.jl:0 [inlined]
   [2] _pullback(::Zygote.Context{false}, ::Core.IntrinsicFunction, ::String, ::Type{Int64}, ::Type{Tuple{Ptr{Int64}}}, ::Ptr{Int64})
     @ Zygote C:\Users\Stefan\.julia\packages\Zygote\g2w9o\src\compiler\interface2.jl:9
   [3] _pullback
     @ .\atomics.jl:358 [inlined]
   [4] _pullback(ctx::Zygote.Context{false}, f::typeof(getindex), args::Base.Threads.Atomic{Int64})
     @ Zygote C:\Users\Stefan\.julia\packages\Zygote\g2w9o\src\compiler\interface2.jl:0
   [5] _pullback
     @ C:\Users\Stefan\.julia\packages\CUDA\BbliS\lib\utils\threading.jl:25 [inlined]
   [6] _pullback
     @ C:\Users\Stefan\.julia\packages\CUDA\BbliS\lib\utils\threading.jl:24 [inlined]
   [7] _pullback
     @ C:\Users\Stefan\.julia\packages\CUDA\BbliS\src\compiler\gpucompiler.jl:5 [inlined]
   [8] _pullback(ctx::Zygote.Context{false}, f::typeof(CUDA.device_properties), args::CUDA.CuDevice)
     @ Zygote C:\Users\Stefan\.julia\packages\Zygote\g2w9o\src\compiler\interface2.jl:0
   [9] _pullback
     @ C:\Users\Stefan\.julia\packages\CUDA\BbliS\src\compiler\gpucompiler.jl:49 [inlined]
  [10] _pullback(::Zygote.Context{false}, ::CUDA.var"##CUDACompilerTarget#210", ::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, ::typeof(CUDA.CUDACompilerTarget), ::CUDA.CuDevice)
     @ Zygote C:\Users\Stefan\.julia\packages\Zygote\g2w9o\src\compiler\interface2.jl:0
  [11] _pullback
     @ C:\Users\Stefan\.julia\packages\CUDA\BbliS\src\compiler\gpucompiler.jl:48 [inlined]
  [12] _pullback
     @ C:\Users\Stefan\.julia\packages\CUDA\BbliS\src\compiler\execution.jl:297 [inlined]
  [13] _pullback(::Zygote.Context{false}, ::CUDA.var"##cufunction#225", ::Nothing, ::Bool, ::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, ::typeof(CUDA.cufunction), ::typeof(CUDA.partial_mapreduce_grid), ::Type{Tuple{typeof(identity), typeof(DiffEqBase.abs2_and_sum), Float32, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Val{true}, CUDA.CuDeviceMatrix{Float32, 1}, CUDA.CuDeviceVector{Float32, 1}}})
     @ Zygote C:\Users\Stefan\.julia\packages\Zygote\g2w9o\src\compiler\interface2.jl:0
  [14] _pullback
     @ C:\Users\Stefan\.julia\packages\CUDA\BbliS\src\compiler\execution.jl:293 [inlined]
  [15] _pullback(::Zygote.Context{false}, ::typeof(CUDA.cufunction), ::typeof(CUDA.partial_mapreduce_grid), ::Type{Tuple{typeof(identity), typeof(DiffEqBase.abs2_and_sum), Float32, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Val{true}, CUDA.CuDeviceMatrix{Float32, 1}, CUDA.CuDeviceVector{Float32, 1}}})
     @ Zygote C:\Users\Stefan\.julia\packages\Zygote\g2w9o\src\compiler\interface2.jl:0
  [16] macro expansion
     @ C:\Users\Stefan\.julia\packages\CUDA\BbliS\src\compiler\execution.jl:102 [inlined]
  [17] _pullback
     @ C:\Users\Stefan\.julia\packages\CUDA\BbliS\src\mapreduce.jl:234 [inlined]
  [18] _pullback(::Zygote.Context{false}, ::CUDA.var"##mapreducedim!#300", ::Float32, ::typeof(GPUArrays.mapreducedim!), ::typeof(identity), ::typeof(DiffEqBase.abs2_and_sum), ::CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, ::CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
     @ Zygote C:\Users\Stefan\.julia\packages\Zygote\g2w9o\src\compiler\interface2.jl:0
  [19] _pullback
     @ C:\Users\Stefan\.julia\packages\CUDA\BbliS\src\mapreduce.jl:169 [inlined]
  [20] _pullback(::Zygote.Context{false}, ::GPUArrays.var"#mapreducedim!##kw", ::NamedTuple{(:init,), Tuple{Float32}}, ::typeof(GPUArrays.mapreducedim!), ::typeof(identity), ::typeof(DiffEqBase.abs2_and_sum), ::CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, ::CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
     @ Zygote C:\Users\Stefan\.julia\packages\Zygote\g2w9o\src\compiler\interface2.jl:0
  [21] _pullback
     @ C:\Users\Stefan\.julia\packages\GPUArrays\g2pOV\src\host\mapreduce.jl:69 [inlined]
  [22] _pullback(::Zygote.Context{false}, ::GPUArrays.var"##_mapreduce#33", ::Colon, ::Float32, ::typeof(GPUArrays._mapreduce), ::typeof(DiffEqBase.UNITLESS_ABS2), ::typeof(DiffEqBase.abs2_and_sum), ::CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
     @ Zygote C:\Users\Stefan\.julia\packages\Zygote\g2w9o\src\compiler\interface2.jl:0
  [23] _apply(::Function, ::Vararg{Any})
     @ Core .\boot.jl:816
  [24] adjoint
     @ C:\Users\Stefan\.julia\packages\Zygote\g2w9o\src\lib\lib.jl:203 [inlined]
  [25] _pullback
     @ C:\Users\Stefan\.julia\packages\ZygoteRules\AIbCs\src\adjoint.jl:65 [inlined]
  [26] _pullback
     @ C:\Users\Stefan\.julia\packages\GPUArrays\g2pOV\src\host\mapreduce.jl:35 [inlined]
  [27] _pullback(::Zygote.Context{false}, ::GPUArrays.var"#_mapreduce##kw", ::NamedTuple{(:dims, :init), Tuple{Colon, Float32}}, ::typeof(GPUArrays._mapreduce), ::typeof(DiffEqBase.UNITLESS_ABS2), ::typeof(DiffEqBase.abs2_and_sum), ::CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
     @ Zygote C:\Users\Stefan\.julia\packages\Zygote\g2w9o\src\compiler\interface2.jl:0
  [28] _apply(::Function, ::Vararg{Any})
     @ Core .\boot.jl:816
  [29] adjoint
     @ C:\Users\Stefan\.julia\packages\Zygote\g2w9o\src\lib\lib.jl:203 [inlined]
  [30] _pullback
     @ C:\Users\Stefan\.julia\packages\ZygoteRules\AIbCs\src\adjoint.jl:65 [inlined]
  [31] _pullback
     @ C:\Users\Stefan\.julia\packages\GPUArrays\g2pOV\src\host\mapreduce.jl:31 [inlined]
--- the last 5 lines are repeated 1 more time ---
  [37] _pullback(::Zygote.Context{false}, ::Base.var"#mapreduce##kw", ::NamedTuple{(:init,), Tuple{Float32}}, ::typeof(mapreduce), ::typeof(DiffEqBase.UNITLESS_ABS2), ::typeof(DiffEqBase.abs2_and_sum), ::CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
     @ Zygote C:\Users\Stefan\.julia\packages\Zygote\g2w9o\src\compiler\interface2.jl:0
  [38] _pullback
     @ C:\Users\Stefan\.julia\packages\DiffEqBase\QR8gq\src\common_defaults.jl:7 [inlined]
...
  [92] _apply
     @ .\boot.jl:816 [inlined]
  [93] adjoint
     @ C:\Users\Stefan\.julia\packages\Zygote\g2w9o\src\lib\lib.jl:203 [inlined]
  [94] _pullback
     @ C:\Users\Stefan\.julia\packages\ZygoteRules\AIbCs\src\adjoint.jl:65 [inlined]
  [95] _pullback
     @ C:\Users\Stefan\.julia\packages\SciMLBase\ys6dl\src\scimlfunctions.jl:3596 [inlined]
  [96] _pullback(::Zygote.Context{false}, ::SciMLBase.OptimizationFunction{true, Optimization.AutoZygote, Main.EnergyHarvesterModel.var"#15#22"{LinearAlgebra.Adjoint{Float32, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, Main.EnergyHarvesterModel.var"#loss_p#20"{Main.EnergyHarvesterModel.var"#loss_adjoint#19"{Main.EnergyHarvesterModel.var"#predict_adjoint#18"}, SciMLBase.ODEProblem{CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}, false, ComponentArrays.ComponentVector{Float32, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{ComponentArrays.Axis{(layer_1 = ViewAxis(1:150, Axis(weight = ViewAxis(1:100, ShapedAxis((50, 2), NamedTuple())), bias = ViewAxis(101:150, ShapedAxis((50, 1), NamedTuple())))), layer_2 = ViewAxis(151:201, Axis(weight = ViewAxis(1:50, ShapedAxis((1, 50), NamedTuple())), bias = ViewAxis(51:51, ShapedAxis((1, 1), NamedTuple())))))}}}, SciMLBase.ODEFunction{false, SciMLBase.AutoSpecialize, Main.EnergyHarvesterModel.var"#13#17"{ComponentArrays.ComponentVector{Float32, Vector{Float32}, Tuple{ComponentArrays.Axis{(α = 1, β = 2, δ = 3, ν = 4, ρ = 5, ω = 6, A = 7, B = 8, ugain = 9, θ = 10, L = 11, R = 12, m = 13)}}}, Main.EnergyHarvesterModel.var"#ODE#16", Lux.Chain{NamedTuple{(:layer_1, :layer_2), Tuple{Lux.Dense{true, typeof(NNlib.tanh_fast), typeof(Lux.glorot_uniform), typeof(Lux.zeros32)}, Lux.Dense{true, typeof(identity), typeof(Lux.glorot_uniform), typeof(Lux.zeros32)}}}}}, LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, SciMLSensitivity.TrackerVJP, Tuple{Symbol}, NamedTuple{(:sensealg,), Tuple{SciMLSensitivity.TrackerVJP}}}, SciMLBase.StandardODEProblem}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED_NO_TIME), Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing}, ::ComponentArrays.ComponentVector{Float32, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{ComponentArrays.Axis{(layer_1 = ViewAxis(1:150, Axis(weight = ViewAxis(1:100, ShapedAxis((50, 2), NamedTuple())), bias = ViewAxis(101:150, ShapedAxis((50, 1), NamedTuple())))), layer_2 = ViewAxis(151:201, Axis(weight = ViewAxis(1:50, ShapedAxis((1, 50), NamedTuple())), bias = ViewAxis(51:51, ShapedAxis((1, 1), NamedTuple())))))}}}, ::SciMLBase.NullParameters)
     @ Zygote C:\Users\Stefan\.julia\packages\Zygote\g2w9o\src\compiler\interface2.jl:0
  [97] _apply(::Function, ::Vararg{Any})
     @ Core .\boot.jl:816
  [98] adjoint
     @ C:\Users\Stefan\.julia\packages\Zygote\g2w9o\src\lib\lib.jl:203 [inlined]
  [99] _pullback
     @ C:\Users\Stefan\.julia\packages\ZygoteRules\AIbCs\src\adjoint.jl:65 [inlined]
 [100] _pullback
     @ C:\Users\Stefan\.julia\packages\Optimization\aPPOg\src\function\zygote.jl:30 [inlined]
 [101] _pullback(ctx::Zygote.Context{false}, f::Optimization.var"#156#165"{SciMLBase.OptimizationFunction{true, Optimization.AutoZygote, Main.EnergyHarvesterModel.var"#15#22"{LinearAlgebra.Adjoint{Float32, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, Main.EnergyHarvesterModel.var"#loss_p#20"{Main.EnergyHarvesterModel.var"#loss_adjoint#19"{Main.EnergyHarvesterModel.var"#predict_adjoint#18"}, SciMLBase.ODEProblem{CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}, false, ComponentArrays.ComponentVector{Float32, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{ComponentArrays.Axis{(layer_1 = ViewAxis(1:150, Axis(weight = ViewAxis(1:100, ShapedAxis((50, 2), NamedTuple())), bias = ViewAxis(101:150, ShapedAxis((50, 1), NamedTuple())))), layer_2 = ViewAxis(151:201, Axis(weight = ViewAxis(1:50, ShapedAxis((1, 50), NamedTuple())), bias = ViewAxis(51:51, ShapedAxis((1, 1), NamedTuple())))))}}}, SciMLBase.ODEFunction{false, SciMLBase.AutoSpecialize, Main.EnergyHarvesterModel.var"#13#17"{ComponentArrays.ComponentVector{Float32, Vector{Float32}, Tuple{ComponentArrays.Axis{(α = 1, β = 2, δ = 3, ν = 4, ρ = 5, ω = 6, A = 7, B = 8, ugain = 9, θ = 10, L = 11, R = 12, m = 13)}}}, Main.EnergyHarvesterModel.var"#ODE#16", Lux.Chain{NamedTuple{(:layer_1, :layer_2), Tuple{Lux.Dense{true, typeof(NNlib.tanh_fast), typeof(Lux.glorot_uniform), typeof(Lux.zeros32)}, Lux.Dense{true, typeof(identity), typeof(Lux.glorot_uniform), typeof(Lux.zeros32)}}}}}, LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, SciMLSensitivity.TrackerVJP, Tuple{Symbol}, NamedTuple{(:sensealg,), Tuple{SciMLSensitivity.TrackerVJP}}}, SciMLBase.StandardODEProblem}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED_NO_TIME), Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing}, SciMLBase.NullParameters}, args::ComponentArrays.ComponentVector{Float32, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{ComponentArrays.Axis{(layer_1 = ViewAxis(1:150, Axis(weight = ViewAxis(1:100, ShapedAxis((50, 2), NamedTuple())), bias = ViewAxis(101:150, ShapedAxis((50, 1), NamedTuple())))), layer_2 = ViewAxis(151:201, Axis(weight = ViewAxis(1:50, ShapedAxis((1, 50), NamedTuple())), bias = ViewAxis(51:51, ShapedAxis((1, 1), NamedTuple())))))}}})
     @ Zygote C:\Users\Stefan\.julia\packages\Zygote\g2w9o\src\compiler\interface2.jl:0
 [102] _apply(::Function, ::Vararg{Any})
     @ Core .\boot.jl:816
 [103] adjoint
     @ C:\Users\Stefan\.julia\packages\Zygote\g2w9o\src\lib\lib.jl:203 [inlined]
 [104] _pullback
     @ C:\Users\Stefan\.julia\packages\ZygoteRules\AIbCs\src\adjoint.jl:65 [inlined]
 [105] _pullback
     @ C:\Users\Stefan\.julia\packages\Optimization\aPPOg\src\function\zygote.jl:34 [inlined]
 [106] _pullback(ctx::Zygote.Context{false}, f::Optimization.var"#158#167"{Tuple{}, Optimization.var"#156#165"{SciMLBase.OptimizationFunction{true, Optimization.AutoZygote, Main.EnergyHarvesterModel.var"#15#22"{LinearAlgebra.Adjoint{Float32, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, Main.EnergyHarvesterModel.var"#loss_p#20"{Main.EnergyHarvesterModel.var"#loss_adjoint#19"{Main.EnergyHarvesterModel.var"#predict_adjoint#18"}, SciMLBase.ODEProblem{CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}, false, ComponentArrays.ComponentVector{Float32, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{ComponentArrays.Axis{(layer_1 = ViewAxis(1:150, Axis(weight = ViewAxis(1:100, ShapedAxis((50, 2), NamedTuple())), bias = ViewAxis(101:150, ShapedAxis((50, 1), NamedTuple())))), layer_2 = ViewAxis(151:201, Axis(weight = ViewAxis(1:50, ShapedAxis((1, 50), NamedTuple())), bias = ViewAxis(51:51, ShapedAxis((1, 1), NamedTuple())))))}}}, SciMLBase.ODEFunction{false, SciMLBase.AutoSpecialize, Main.EnergyHarvesterModel.var"#13#17"{ComponentArrays.ComponentVector{Float32, Vector{Float32}, Tuple{ComponentArrays.Axis{(α = 1, β = 2, δ = 3, ν = 4, ρ = 5, ω = 6, A = 7, B = 8, ugain = 9, θ = 10, L = 11, R = 12, m = 13)}}}, Main.EnergyHarvesterModel.var"#ODE#16", Lux.Chain{NamedTuple{(:layer_1, :layer_2), Tuple{Lux.Dense{true, typeof(NNlib.tanh_fast), typeof(Lux.glorot_uniform), typeof(Lux.zeros32)}, Lux.Dense{true, typeof(identity), typeof(Lux.glorot_uniform), typeof(Lux.zeros32)}}}}}, LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, SciMLSensitivity.TrackerVJP, Tuple{Symbol}, NamedTuple{(:sensealg,), Tuple{SciMLSensitivity.TrackerVJP}}}, SciMLBase.StandardODEProblem}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED_NO_TIME), Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing}, SciMLBase.NullParameters}}, args::ComponentArrays.ComponentVector{Float32, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{ComponentArrays.Axis{(layer_1 = ViewAxis(1:150, Axis(weight = ViewAxis(1:100, ShapedAxis((50, 2), NamedTuple())), bias = ViewAxis(101:150, ShapedAxis((50, 1), NamedTuple())))), layer_2 = ViewAxis(151:201, Axis(weight = ViewAxis(1:50, ShapedAxis((1, 50), NamedTuple())), bias = ViewAxis(51:51, ShapedAxis((1, 1), NamedTuple())))))}}})
     @ Zygote C:\Users\Stefan\.julia\packages\Zygote\g2w9o\src\compiler\interface2.jl:0
 [107] pullback(f::Function, cx::Zygote.Context{false}, args::ComponentArrays.ComponentVector{Float32, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{ComponentArrays.Axis{(layer_1 = ViewAxis(1:150, Axis(weight = ViewAxis(1:100, ShapedAxis((50, 2), NamedTuple())), bias = ViewAxis(101:150, ShapedAxis((50, 1), NamedTuple())))), layer_2 = ViewAxis(151:201, Axis(weight = ViewAxis(1:50, ShapedAxis((1, 50), NamedTuple())), bias = ViewAxis(51:51, ShapedAxis((1, 1), NamedTuple())))))}}})
     @ Zygote C:\Users\Stefan\.julia\packages\Zygote\g2w9o\src\compiler\interface.jl:44
 [108] pullback
     @ C:\Users\Stefan\.julia\packages\Zygote\g2w9o\src\compiler\interface.jl:42 [inlined]
 [109] gradient(f::Function, args::ComponentArrays.ComponentVector{Float32, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{ComponentArrays.Axis{(layer_1 = ViewAxis(1:150, Axis(weight = ViewAxis(1:100, ShapedAxis((50, 2), NamedTuple())), bias = ViewAxis(101:150, ShapedAxis((50, 1), NamedTuple())))), layer_2 = ViewAxis(151:201, Axis(weight = ViewAxis(1:50, ShapedAxis((1, 50), NamedTuple())), bias = ViewAxis(51:51, ShapedAxis((1, 1), NamedTuple())))))}}})
     @ Zygote C:\Users\Stefan\.julia\packages\Zygote\g2w9o\src\compiler\interface.jl:96
 [110] (::Optimization.var"#157#166"{Optimization.var"#156#165"{SciMLBase.OptimizationFunction{true, Optimization.AutoZygote, Main.EnergyHarvesterModel.var"#15#22"{LinearAlgebra.Adjoint{Float32, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, Main.EnergyHarvesterModel.var"#loss_p#20"{Main.EnergyHarvesterModel.var"#loss_adjoint#19"{Main.EnergyHarvesterModel.var"#predict_adjoint#18"}, SciMLBase.ODEProblem{CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}, false, ComponentArrays.ComponentVector{Float32, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{ComponentArrays.Axis{(layer_1 = ViewAxis(1:150, Axis(weight = ViewAxis(1:100, ShapedAxis((50, 2), NamedTuple())), bias = ViewAxis(101:150, ShapedAxis((50, 1), NamedTuple())))), layer_2 = ViewAxis(151:201, Axis(weight = ViewAxis(1:50, ShapedAxeros32)}}}}}, LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, SciMLSensitivity.TrackerVJP, Tuple{Symbol}, NamedTuple{(:sensealg,), Tuple{SciMLSensitivity.TrackerVJP}}}, SciMLBase.StandardODEProblem}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED_NO_TIME), Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing}, ComponentArrays.ComponentVector{Float32, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{ComponentArrays.Axis{(layer_1 = ViewAxis(1:150, Axis(weight = ViewAxis(1:100, ShapedAxis((50, 2), NamedTuple())), bias = ViewAxis(101:150, ShapedAxis((50, 1), NamedTuple())))), layer_2 = ViewAxis(151:201, Axis(weight = ViewAxis(1:50, ShapedAxis((1, 50), NamedTuple())), bias = ViewAxis(51:51, ShapedAxis((1, 1), NamedTuple())))))}}}, SciMLBase.NullParameters, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}}, opt::Optimisers.Adam{Float64}, data::Base.Iterators.Cycle{Tuple{Optimization.NullData}}; maxiters::Int64, callback::Function, progress::Bool, save_best::Bool, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
      @ OptimizationOptimisers C:\Users\Stefan\.julia\packages\OptimizationOptimisers\KGKWE\src\OptimizationOptimisers.jl:35
 [114] #solve#544
      @ C:\Users\Stefan\.julia\packages\SciMLBase\ys6dl\src\solve.jl:85 [inlined]
 [115] fit(fit_data::LinearAlgebra.Adjoint{Float32, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, tspan::Tuple{Float32, Float32}, force::Vector{Float32}, pars::ComponentArrays.ComponentVector{Float32, Vector{Float32}, Tuple{ComponentArrays.Axis{(α = 1, β = 2, δ = 3, ν = 4, ρ = 5, ω = 6, A = 7, B = 8, ugain = 9, θ = 10, L = 11, R = 12, m = 13)}}})
     @ Main.EnergyHarvesterModel c:\Users\Stefan\VScode projects\EnergyHarvester\src\NeuralODEModel.jl:56
 [116] top-level scope
     @ .\timing.jl:262 [inlined]

The function that produces this error is:

  # Packages assumed to be loaded elsewhere in the module: CUDA, Lux, Random, ComponentArrays,
  # OrdinaryDiffEq (or DifferentialEquations), SciMLSensitivity, Optimization, OptimizationOptimisers.
  CUDA.allowscalar(true)

  rng = Random.default_rng() 
  dudt2 = Lux.Chain(Lux.Dense(2, 50, tanh), 
                    Lux.Dense(50, 1)) 

  ps, st = Lux.setup(rng, dudt2) 
  ps = ps |> ComponentArray |> Lux.gpu
  st = st |> Lux.gpu

  function ODE(u, ps, t, pars, dudt2, st)
    nn, _ = dudt2(u, ps, st)       # neural network correction term
    x, y = u
    du1 = y
    du2 = pars.ugain*(pars.A*cos(pars.ω*t) + pars.B*sin(pars.ω*t)) -
          (pars.δ*y + pars.α*x + pars.β*x^3 + pars.ν*x^5 + pars.ρ*x^7) + nn[1]
    du = [du1, du2] |> Lux.gpu     # rebuild the state vector and move it to the GPU
    return du
  end

  u0 = fit_data[:, 1] |> Lux.gpu

  UDE_prob = ODEProblem((u, p, t) -> ODE(u, p, t, pars, dudt2, st), u0, tspan, ps, sensealg = TrackerVJP())

  function predict_adjoint(p, prob, u0, t_id)
    return solve(prob,Tsit5(),u0=u0,p=p,saveat=t_id)
  end

  function loss_adjoint(p, prob, u0, t_id, fit_data)
    pred = predict_adjoint(p, prob, u0, t_id)  
    loss = sum(abs2, pred - fit_data)
    return loss, pred
  end

  function loss_p(p, fit_data)
    loss, pred = loss_adjoint(p, UDE_prob, u0, tsteps, fit_data)
    return loss, pred
  end
  
  losses = Float32[] 
  
  callback = function (p, l)
    push!(losses, l)
    if length(losses)%50==0
        println("Current loss after $(length(losses)) iterations: $(losses[end])")
    end
    return false
  end

  adtype = Optimization.AutoZygote()
  optf = Optimization.OptimizationFunction((x, ps) -> loss_p(x, fit_data), adtype)
  optprob = Optimization.OptimizationProblem(optf, ps)  
  res1 = Optimization.solve(optprob, ADAM(0.01), callback=callback, maxiters = 5000)
  println("Training loss after $(length(losses)) iterations: $(losses[end])")

Thank you.

Just so I’m clear, is this the UDE example with some pieces moved to the GPU? You’d likely want to offload only the neural net evaluation, since the other pieces are too small to be efficient on the GPU (this entire thing is of course not going to be faster on a GPU, but I assume this is just an academic exercise?).
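
A rough, untested sketch of what “only the network on the GPU” could look like: keep the state and the mechanistic terms on the CPU and only round-trip through the GPU for the Lux call. The name ODE_cpu and the explicit CuArray/Array conversions here are just illustrative, not a drop-in fix:

  # Hypothetical sketch: only the Lux network evaluation runs on the GPU.
  # Assumes dudt2 and st are set up as above and ps lives on the GPU.
  function ODE_cpu(u, ps, t, pars, dudt2, st)
    u_gpu = CuArray(Float32.(u))      # ship the (tiny) state to the GPU
    nn, _ = dudt2(u_gpu, ps, st)      # network evaluation on the GPU
    nn1 = Array(nn)[1]                # bring the scalar correction back to the CPU
    x, y = u
    du1 = y
    du2 = pars.ugain*(pars.A*cos(pars.ω*t) + pars.B*sin(pars.ω*t)) -
          (pars.δ*y + pars.α*x + pars.β*x^3 + pars.ν*x^5 + pars.ρ*x^7) + nn1
    return [du1, du2]                 # state vector stays on the CPU
  end

Whether the transfer overhead pays off depends on the network size; for a 2→50→1 chain it almost certainly won’t.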

Thanks for the response.

Yes, I’m trying to do just the neural net evaluation on the GPU. Ideally I would have a larger network in place to fully utilise it, but I hadn’t thought it would be slower than just doing it all on the CPU. I have about 10,000 time series that need to be fitted, so the speed at which it runs is important, and I figured I’d be able to mini-batch on the GPU as well.

If you batch the solves then it can be faster on the GPU for this kind of case, maybe; it depends on how stiff it is. Though a DiffEqGPU-style approach could still be faster when each ODE system is small like this.
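
For reference, the batched-solve idea with DiffEqGPU looks roughly like this (untested sketch; the ensemble constructor may just be EnsembleGPUArray() on older DiffEqGPU versions, and the toy f!, parameters, and prob_func are placeholders for however you set up the 10,000 trajectories):

  using OrdinaryDiffEq, DiffEqGPU, CUDA

  # Toy stand-in for the oscillator; p = (δ, α) only for illustration.
  function f!(du, u, p, t)
    du[1] = u[2]
    du[2] = -p[2]*u[1] - p[1]*u[2]
  end

  u0    = Float32[1.0, 0.0]
  tspan = (0.0f0, 1.0f0)
  prob  = ODEProblem(f!, u0, tspan, Float32[0.1, 1.0])

  # One trajectory per time series: give each its own initial condition.
  prob_func = (prob, i, repeat) -> remake(prob, u0 = rand(Float32, 2))
  eprob = EnsembleProblem(prob, prob_func = prob_func, safetycopy = false)

  # Batch all trajectories into a single GPU solve.
  sol = solve(eprob, Tsit5(), EnsembleGPUArray(CUDA.CUDABackend());
              trajectories = 10_000, saveat = 0.01f0)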