Struggling to train a UDE with DiffEqGPU

JuliaQ123 · February 20, 2023, 10:26pm

Hi,

I’ve been trying to train a UDE model on a set of time series in parallel using DiffEqGPU and EnsembleProblem, with different stored u0 and p values. The goal is to compute the losses simultaneously for each batch, utilising the power of my GPU. I’ve encountered a few issues along the way, mainly with EnsembleGPUArray(), and I’m unsure where to go from here:

CUDA.allowscalar(true)

println(size(pars), size(tranges), size(fit_data))
# (44,)(1, 44)(2, 104, 44)

NN = FastChain(FastDense(3, 50, tanh),
                FastDense(50, 1)) 

nn_ps = initial_params(NN) 
nn_ps = nn_ps |> Lux.gpu

u0s = cu(fit_data[:,1,:])  

function ude(u,nn_ps,t,p,NN)
  f = (p[7]*cos(p[6]*t) + p[8]*sin(p[6]*t))*p[9]  
  x,y = u
  in = cu([f, x, y])
  nn = NN(in, nn_ps)
  du1 = y 
  du2 = f - (p[3]*y+p[1]*x+p[2]*x^3+p[4]*x^5+p[5]*x^7) + nn[1]
  du = cu([du1, du2])
end

function predict_adjoint(prob)
  solve(prob, Tsit5(), EnsembleGPUArray(), trajectories=length(tranges), saveat=tranges[1])
end

function loss_adjoint(prob, batch)

    pred = predict_adjoint(prob)      
  
    loss = sum(abs2, batch[1,:,:].-pred[1,:,:])
  
    return loss
end

t_span = (tranges[1][1], tranges[1][end])

function loss_p(nn_ps, batch, tranges, pars)
  prob_NN = ODEProblem{false}((u,p,t)->ude(u,nn_ps,t,pars[1],NN),u0s[:, 1],t_span,nn_ps)

  function prob_func(prob, i, repeat)
    remake(prob, u0=u0s[:, i], saveat=t_span, p=pars[i])
  end

  prob_b = EnsembleProblem(prob_NN, prob_func=prob_func)    
  l = loss_adjoint(prob_b, batch)
  println(l)
  return l
end

function MINIMISE(nn_ps, batch, time_batch, ode_ps)
  return loss_p(nn_ps, batch, time_batch, ode_ps)
end


adtype = Optimization.AutoFiniteDiff()
optfunc = OptimizationFunction((nn_ps, _) -> MINIMISE(nn_ps, fit_data, tranges, pars), adtype)
opt = ADAM(0.1)  
optprob = Optimization.OptimizationProblem(optfunc, nn_ps)
res = Optimization.solve(optprob, opt, maxiters=50)
println("Training loss after $(length(losses)) iterations: $(losses[end])")

My error message at the moment is:

ERROR: ArgumentError: tuple length should be ≥ 0, got -4
Stacktrace:
  [1] _ntuple(f::Adapt.var"#1#4"{DiffEqFlux.FastDense{typeof(NNlib.tanh_fast), DiffEqFlux.var"#initial_params#107"{Vector{Float32}}, Nothing}}, n::Int64)
    @ Base .\ntuple.jl:36
  [2] ntuple
    @ .\ntuple.jl:19 [inlined]
  [3] adapt_structure(to::CUDA.Adaptor, f::DiffEqFlux.FastDense{typeof(NNlib.tanh_fast), DiffEqFlux.var"#initial_params#107"{Vector{Float32}}, Nothing})
    @ Adapt C:\Users\Stefan\.julia\packages\Adapt\0zP2x\src\base.jl:19
  [4] adapt
    @ C:\Users\Stefan\.julia\packages\Adapt\0zP2x\src\Adapt.jl:40 [inlined]
  [5] Fix1
    @ .\operators.jl:1096 [inlined]
  [6] map
    @ .\tuple.jl:222 [inlined]
  [7] adapt_structure
    @ C:\Users\Stefan\.julia\packages\Adapt\0zP2x\src\base.jl:3 [inlined]
  [8] adapt
    @ C:\Users\Stefan\.julia\packages\Adapt\0zP2x\src\Adapt.jl:40 [inlined]
  [9] Fix1
    @ .\operators.jl:1096 [inlined]
 [10] map
    @ .\tuple.jl:221 [inlined]
 [11] adapt_structure
    @ C:\Users\Stefan\.julia\packages\Adapt\0zP2x\src\base.jl:3 [inlined]
 [12] adapt(to::CUDA.Adaptor, x::Tuple{Tuple{DiffEqFlux.FastDense{typeof(NNlib.tanh_fast), DiffEqFlux.var"#initial_params#107"{Vector{Float32}}, Nothing}, DiffEqFlux.FastDense{typeof(identity), DiffEqFlux.var"#initial_params#107"{Vector{Float32}}, Nothing}}})
    @ Adapt C:\Users\Stefan\.julia\packages\Adapt\0zP2x\src\Adapt.jl:40
 [13] adapt_structure(to::CUDA.Adaptor, f::DiffEqFlux.FastChain{Tuple{DiffEqFlux.FastDense{typeof(NNlib.tanh_fast), DiffEqFlux.var"#initial_params#107"{Vector{Float32}}, Nothing}, DiffEqFlux.FastDense{typeof(identity), DiffEqFlux.var"#initial_params#107"{Vector{Float32}}, Nothing}}})
    @ Adapt C:\Users\Stefan\.julia\packages\Adapt\0zP2x\src\base.jl:24
 [14] adapt
    @ C:\Users\Stefan\.julia\packages\Adapt\0zP2x\src\Adapt.jl:40 [inlined]
 [15] Fix1
    @ .\operators.jl:1096 [inlined]
 [16] map
    @ .\tuple.jl:223 [inlined]
 [17] map
    @ .\tuple.jl:224 [inlined]
 [18] adapt_structure
    @ C:\Users\Stefan\.julia\packages\Adapt\0zP2x\src\base.jl:3 [inlined]
 [19] adapt
    @ C:\Users\Stefan\.julia\packages\Adapt\0zP2x\src\Adapt.jl:40 [inlined]
 [20] adapt_structure(to::CUDA.Adaptor, f::Main.EnergyHarvesterModel.var"#33#43"{CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Vector{CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Main.EnergyHarvesterModel.var"#ude#39", DiffEqFlux.FastChain{Tuple{DiffEqFlux.FastDense{typeof(NNlib.tanh_fast), DiffEqFlux.var"#initial_params#107"{Vector{Float32}}, Nothing}, DiffEqFlux.FastDense{typeof(identity), DiffEqFlux.var"#initial_params#107"{Vector{Float32}}, Nothing}}}})
    @ Adapt C:\Users\Stefan\.julia\packages\Adapt\0zP2x\src\base.jl:24
 [21] adapt(to::CUDA.Adaptor, x::Function)
    @ Adapt C:\Users\Stefan\.julia\packages\Adapt\0zP2x\src\Adapt.jl:40
 [22] cudaconvert(arg::Function)
    @ CUDA C:\Users\Stefan\.julia\packages\CUDA\BbliS\src\compiler\execution.jl:152
 [23] map(f::typeof(CUDA.cudaconvert), t::Tuple{Main.EnergyHarvesterModel.var"#33#43"{CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Vector{CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Main.EnergyHarvesterModel.var"#ude#39", DiffEqFlux.FastChain{Tuple{DiffEqFlux.FastDense{typeof(NNlib.tanh_fast), DiffEqFlux.var"#initial_params#107"{Vector{Float32}}, Nothing}, DiffEqFlux.FastDense{typeof(identity), DiffEqFlux.var"#initial_params#107"{Vector{Float32}}, Nothing}}}}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, Float32}) (repeats 2 times)
    @ Base .\tuple.jl:224
 [24] macro expansion
    @ C:\Users\Stefan\.julia\packages\CUDA\BbliS\src\compiler\execution.jl:100 [inlined]

.......................

[49] macro expansion
    @ C:\Users\Stefan\.julia\packages\OptimizationOptimisers\KGKWE\src\OptimizationOptimisers.jl:36 [inlined]
 [50] macro expansion
    @ C:\Users\Stefan\.julia\packages\Optimization\aPPOg\src\utils.jl:37 [inlined]
 [51] __solve(prob::SciMLBase.OptimizationProblem{true, SciMLBase.OptimizationFunction{true, Optimization.AutoFiniteDiff{Val{:forward}, Val{:forward}, Val{:hcentral}}, Main.EnergyHarvesterModel.var"#34#46"{Main.EnergyHarvesterModel.var"#MINIMISE#45"{Main.EnergyHarvesterModel.var"#loss_p#42"{Tuple{Float32, Float32}, Main.EnergyHarvesterModel.var"#loss_adjoint#41"{Main.EnergyHarvesterModel.var"#predict_adjoint#40"}, Main.EnergyHarvesterModel.var"#ude#39", CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, DiffEqFlux.FastChain{Tuple{DiffEqFlux.FastDense{typeof(NNlib.tanh_fast), DiffEqFlux.var"#initial_params#107"{Vector{Float32}}, Nothing}, DiffEqFlux.FastDense{typeof(identity), DiffEqFlux.var"#initial_params#107"{Vector{Float32}}, Nothing}}}}}}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED_NO_TIME), Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, SciMLBase.NullParameters, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}}, opt::Optimisers.Adam{Float64}, data::Base.Iterators.Cycle{Tuple{Optimization.NullData}}; maxiters::Int64, callback::Function, progress::Bool, save_best::Bool, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ OptimizationOptimisers C:\Users\Stefan\.julia\packages\OptimizationOptimisers\KGKWE\src\OptimizationOptimisers.jl:35
 [52] #solve#540
    @ C:\Users\Stefan\.julia\packages\SciMLBase\QqtZA\src\solve.jl:84 [inlined]
 [53] fit_df(train_df::DataFrames.SubDataFrame{DataFrames.DataFrame, DataFrames.Index, Vector{Int64}}, val_df::DataFrames.SubDataFrame{DataFrames.DataFrame, DataFrames.Index, Vector{Int64}}, par::ComponentArrays.ComponentVector{Float32, Vector{Float32}, Tuple{ComponentArrays.Axis{(ωn = 1, μ = 2, b = 3, ν = 4, ρ = 5, ω = 6, A = 7, B = 8, ugain = 9)}}})
    @ Main.EnergyHarvesterModel c:\Users\Stefan\VScode projects\EnergyHarvester\src\NeuralODEModel.jl:270
 [54] top-level scope
    @ .\timing.jl:262 [inlined]

I’m very new to this workflow and have tried to follow the tutorials as best as I can, so any help would be greatly appreciated.
Thanks.

ChrisRackauckas · February 21, 2023, 4:20pm

What does are you reading? This looks nothing like the tutorial:

https://docs.sciml.ai/DiffEqFlux/dev/examples/GPUs/

JuliaQ123 · February 21, 2023, 7:04pm

Hi, Thanks,
I tried to follow multiple tutorials from DiffEqGPU as well as an older NeuralODE one that I accidentally found. I have since rewritten the EnsembleProblem to try and isolate the problem (This follows the newer tutorials):

using DiffEqGPU, OrdinaryDiffEq, Lux, CUDA, Random, ComponentArrays

batchsize = 44

pars = rand(Float32, (9, batchsize)) |> Lux.gpu;
u0s = rand(Float32, (2, batchsize)) |> Lux.gpu;

tspan = [0.0f0, 0.08f0]
tsteps = range(tspan[1], tspan[2], length=5000)

rng = Random.default_rng()


NN = Lux.Chain(Lux.Dense(3, 50, tanh), Lux.Dense(50, 1))

nn_ps, st = Lux.setup(rng,NN) 
nn_ps = nn_ps |> ComponentArray |> Lux.gpu
st = st |> Lux.gpu

NN_(u, p) = NN(u, p, st)[1]

function ude(u,nn_ps,t,p)
    f = (p[7]*cos(p[6]*t) + p[8]*sin(p[6]*t))*p[9]  
    x,y = u
    in = [f, x, y] |> DiffEqFlux.gpu
    nn = NN_(in, nn_ps)[1]
    du1 = y 
    du2 = f - (p[3]*y+p[1]*x+p[2]*x^3+p[4]*x^5+p[5]*x^7) + nn
    du = [du1, du2]  |> DiffEqFlux.gpu
  end

prob_NN = ODEProblem{false}((u,p,t)->ude(u, nn_ps, t, pars[:, 1]), u0s[:, 1], tspan, nn_ps)

@time s1 = solve(prob_NN, Tsit5(), saveat=tsteps)

I have run into no issues in this part. However when I add Ensembling (copied from the DiffEqGPU docs):

function prob_func(prob, i, repeat)
    remake(prob, u0=u0s[:,i], p=pars[:, i])
end

prob_b = EnsembleProblem(prob_NN, prob_func=prob_func)

@time s2 = solve(prob_b, Tsit5(), EnsembleGPUArray(), trajectories=1, saveat=tsteps)

println(sum(abs2, s1.-s2[:,:,1])) # =0

the issues come. This works fine when trajectories = 1, (the last line confirms the two solutions are indeed equal). However when I try and use Ensembling to solve more than one trajectory (i.e. trajectories = 2) is when it breaks down, giving this error message:

ERROR: InvalidIRError: compiling kernel #gpu_gpu_kernel_oop(KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, var"#147#148", CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Float32, 1}, Float32) resulted in invalid LLVM IR
Reason: unsupported dynamic function invocation (call to getindex)
Stacktrace:
 [1] #147
   @ c:\Users\Stefan\VScode projects\EnergyHarvester\src\gpuLux.jl:32
 [2] macro expansion
   @ C:\Users\Stefan\.julia\packages\DiffEqGPU\2OsE7\src\DiffEqGPU.jl:31
 [3] gpu_gpu_kernel_oop
   @ C:\Users\Stefan\.julia\packages\KernelAbstractions\C8flJ\src\macros.jl:81
 [4] gpu_gpu_kernel_oop
   @ .\none:0
Reason: unsupported dynamic function invocation (call to ude)
Stacktrace:
 [1] #147
   @ c:\Users\Stefan\VScode projects\EnergyHarvester\src\gpuLux.jl:32
 [2] macro expansion
   @ C:\Users\Stefan\.julia\packages\DiffEqGPU\2OsE7\src\DiffEqGPU.jl:31
 [3] gpu_gpu_kernel_oop
   @ C:\Users\Stefan\.julia\packages\KernelAbstractions\C8flJ\src\macros.jl:81
 [4] gpu_gpu_kernel_oop
   @ .\none:0
Reason: unsupported dynamic function invocation (call to getindex)
Stacktrace:
 [1] macro expansion
   @ C:\Users\Stefan\.julia\packages\DiffEqGPU\2OsE7\src\DiffEqGPU.jl:33
 [2] gpu_gpu_kernel_oop
   @ C:\Users\Stefan\.julia\packages\KernelAbstractions\C8flJ\src\macros.jl:81
 [3] gpu_gpu_kernel_oop
   @ .\none:0
Reason: unsupported dynamic function invocation (call to convert)
Stacktrace:
 [1] setindex!
   @ C:\Users\Stefan\.julia\packages\CUDA\BbliS\src\device\array.jl:194
 [2] setindex!
   @ C:\Users\Stefan\.julia\packages\CUDA\BbliS\src\device\array.jl:207
 [3] macro expansion
   @ C:\Users\Stefan\.julia\packages\DiffEqGPU\2OsE7\src\DiffEqGPU.jl:33
 [4] gpu_gpu_kernel_oop
   @ C:\Users\Stefan\.julia\packages\KernelAbstractions\C8flJ\src\macros.jl:81
 [5] gpu_gpu_kernel_oop
   @ .\none:0
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erronous code with Cthulhu.jl
Stacktrace:
  [1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{typeof(DiffEqGPU.gpu_gpu_kernel_oop), Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, var"#147#148", CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Float32, 1}, Float32}}}, args::LLVM.Module)
    @ GPUCompiler C:\Users\Stefan\.julia\packages\GPUCompiler\kb6yJ\src\validation.jl:141
  [2] macro expansion
    @ C:\Users\Stefan\.julia\packages\GPUCompiler\kb6yJ\src\driver.jl:418 [inlined]
  [3] macro expansion
    @ C:\Users\Stefan\.julia\packages\TimerOutputs\LHjFw\src\TimerOutput.jl:253 [inlined]
  [4] macro expansion
    @ C:\Users\Stefan\.julia\packages\GPUCompiler\kb6yJ\src\driver.jl:416 [inlined]
  [5] emit_asm(job::GPUCompiler.CompilerJob, ir::LLVM.Module; strip::Bool, validate::Bool, format::LLVM.API.LLVMCodeGenFileType)
    @ GPUCompiler C:\Users\Stefan\.julia\packages\GPUCompiler\kb6yJ\src\utils.jl:83
  [6] cufunction_compile(job::GPUCompiler.CompilerJob, ctx::LLVM.Context)
    @ CUDA C:\Users\Stefan\.julia\packages\CUDA\BbliS\src\compiler\execution.jl:355
  [7] #228
    @ C:\Users\Stefan\.julia\packages\CUDA\BbliS\src\compiler\execution.jl:348 [inlined]
  [8] JuliaContext(f::CUDA.var"#228#229"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{typeof(DiffEqGPU.gpu_gpu_kernel_oop), Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, var"#147#148", CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Float32, 1}, Float32}}}})
    @ GPUCompiler C:\Users\Stefan\.julia\packages\GPUCompiler\kb6yJ\src\driver.jl:76
  [9] cufunction_compile(job::GPUCompiler.CompilerJob)
    @ CUDA C:\Users\Stefan\.julia\packages\CUDA\BbliS\src\compiler\execution.jl:347
 [10] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
    @ GPUCompiler C:\Users\Stefan\.julia\packages\GPUCompiler\kb6yJ\src\cache.jl:90
...................
 [18] #__solve#561
    @ C:\Users\Stefan\.julia\packages\OrdinaryDiffEq\P7HJO\src\solve.jl:5 [inlined]
 [19] #solve_call#26
    @ C:\Users\Stefan\.julia\packages\DiffEqBase\WXn2i\src\solve.jl:473 [inlined]
 [20] solve_up(prob::ODEProblem{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}, true, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, ODEFunction{true, SciMLBase.FullSpecialize, DiffEqGPU.var"#67#71"{var"#147#148", typeof(DiffEqGPU.gpu_kernel_oop)}, LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, sensealg::Nothing, u0::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, p::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, args::Tsit5{typeof(OrdinaryDiffEq.trivial_limiter!), typeof(OrdinaryDiffEq.trivial_limiter!), Static.False}; kwargs::Base.Pairs{Symbol, Any, NTuple{6, Symbol}, NamedTuple{(:adaptive, :unstable_check, :saveat, :callback, :merge_callbacks, :internalnorm), Tuple{Bool, DiffEqGPU.var"#15#21", StepRangeLen{Float32, Float64, Float64, Int64}, Nothing, Bool, typeof(DiffEqGPU.diffeqgpunorm)}}})
    @ DiffEqBase C:\Users\Stefan\.julia\packages\DiffEqBase\WXn2i\src\solve.jl:835
 [21] #solve#31
    @ C:\Users\Stefan\.julia\packages\DiffEqBase\WXn2i\src\solve.jl:802 [inlined]
eviceBuffer}, Tuple{Axis{(layer_1 = ViewAxis(1:200, Axis(weight = ViewAxis(1:150, ShapedAxis((50, 3), NamedTuple())), bias = ViewAxis(151:200, ShapedAxis((50, 1), NamedTuple())))), layer_2 = ViewAxis(201:251, Axis(weight = ViewAxis(1:50, ShapedAxis((1, 50), NamedTuple())), bias = ViewAxis(51:51, ShapedAxis((1, 1), NamedTuple())))))}}}, ODEFunction{false, SciMLBase.AutoSpecialize, var"#147#148", LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, typeof(prob_func), typeof(SciMLBase.DEFAULT_OUTPUT_FUNC), typeof(SciMLBase.DEFAULT_REDUCTION), Nothing}, alg::Tsit5{typeof(OrdinaryDiffEq.trivial_limiter!), typeof(OrdinaryDiffEq.trivial_limiter!), Static.False}, ensemblealg::EnsembleGPUArray; trajectories::Int64, batch_size::Int64, unstable_check::Function, adaptive::Bool, kwargs::Base.Pairs{Symbol, StepRangeLen{Float32, Float64, Float64, Int64}, Tuple{Symbol}, NamedTuple{(:saveat,), Tuple{StepRangeLen{Float32, Float64, Float64, Int64}}}})    @ DiffEqGPU C:\Users\Stefan\.julia\packages\DiffEqGPU\2OsE7\src\DiffEqGPU.jl:523 [26] #solve#33    @ C:\Users\Stefan\.julia\packages\DiffEqBase\WXn2i\src\solve.jl:851 [inlined] [27] top-level scope
    @ .\timing.jl:262

It works with EnsembleThreads() and EnsembleCPUArray() (which is significantly faster), but it doesn’t seem to work with EnsembleGPUArray(). Any ideas why?
Thanks.

ChrisRackauckas · February 21, 2023, 8:18pm

EnsembleGPUArray needs to be used with static kernels. That does not include neural networks like that. That wouldn’t be an efficient way to write this anyways. Why not batch via matmuls?

JuliaQ123 · March 2, 2023, 11:26am

Ok I see thanks.
How would I go about batching with matmuls then?

ChrisRackauckas · March 2, 2023, 1:50pm

Instead of solving on vectors, solve on matrices where each column is a different initial condition. You’d pair that with multiple shooting.

JuliaQ123 · March 6, 2023, 1:21pm

Is it possible to also pass through a different set of parameters? I have a different set of parameters for each time series which is why I tried ensembling in the first place.

ChrisRackauckas · March 6, 2023, 1:35pm

Yes, it does per-solve parameters just like u0.

JuliaQ123 · March 6, 2023, 5:31pm

I managed to do what you said only including the neural network, however, I am a bit confused as to including the ODE parameters as well. I have this so far:

  Random.seed!(0)
  rng = Random.default_rng()

  NN = Lux.Chain(Lux.Dense(3, 50, tanh), Lux.Dense(50, 1))

  nn_ps, st = Lux.setup(rng,NN) 
  nn_ps = nn_ps  |> ComponentArray 

  function ude(u, t, ps, NN_, st_)

    nn_ps = ps.nn_ps
    ode_ps = ps.ode_ps

    f = (ode_ps[7]*cos(ode_ps[6]*t) + ode_ps[8]*sin(ode_ps[6]*t))*ode_ps[9]  # Actuator

    x,y = u
    in = [f, x, y]
    nn = NN_(in, nn_ps, st_)[1]
    du1 = y 
    du2 = f - (ode_ps[3]*y+ode_ps[1]*x+ode_ps[2]*x^3+ode_ps[4]*x^5+ode_ps[5]*x^7) + nn[1]
    du = [du1, du2]
  end
  ######################################################

  function loss_function(data, pred)
    l = sum(abs2, data[1,:,:] - pred[1,:,:])
    return l
  end

  prob_NN = ODEProblem{false}((u, p, t) -> ude(u,t,p,NN,st), u0s[:, 1], tspan, pars[1])

  function prob_func(prob, i, repeat)
    ps =  ComponentArray(nn_ps=p, ode_ps=pars[i])
    remake(prob, u0=u0s[:, i], saveat=tspan, p=ps)
  end

  prob_b = EnsembleProblem(prob_NN, prob_func=prob_func)
  
  function loss_multiple_shooting(p) 
    return multiple_shoot(p, fit_data, tsteps, prob_b, EnsembleThreads(), loss_function, Tsit5(), 
                          length(fit_data[1,:,1]); continuity_term=1, trajectories=batchsize)
  end

I’m trying to pass through a component array ps containing the neural network parameters as well as the ODE parameters into the UDE function, and I cannot figure out a way to do it. What would you suggest I change to manage it?
Thanks.

JuliaQ123 · March 7, 2023, 10:29pm

Managed to get it working, I figured I’d post the code that ended up working for anyone else who ends up trying a similar thing.
Thanks Chris for the help.

  println("Training....")
  # println(size(pars), size(tranges), size(fit_data))
  # (44,)(1, 44)(2, 104, 44)

  Random.seed!(0)
  rng = Random.default_rng()

  NN = Lux.Chain(Lux.Dense(3, 100, tanh), Lux.Dense(100, 1))

  nn_ps, st = Lux.setup(rng,NN) 
  nn_ps = nn_ps  |> ComponentArray 
  println(pwd())
  
  function ude(u, ps, t)
    nn_ps, ode_ps = ps   

    f = (ode_ps[7]*cos(ode_ps[6]*t) + ode_ps[8]*sin(ode_ps[6]*t))*ode_ps[9]  # Actuator

    x,y = u
    in = [f, x, y]
    nn = NN(in, nn_ps, st)[1]
    du1 = y 
    du2 = f - (ode_ps[3]*y+ode_ps[1]*x+ode_ps[2]*x^3+ode_ps[4]*x^5+ode_ps[5]*x^7) + nn[1]
    du = [du1, du2]
  end
  ######################################################

  function loss_function(data, pred)
    l = sum(abs2, data[1,:,:] - pred[1,:,:])
    return l
  end

  prob_NN = ODEProblem{false}((u, ps, t) -> ude(u,(ps, pars[1]), t), u0s[:, 1], tspan)

  function prob_func(prob, i, repeat)
    nn_ps = prob.p
    ps = (nn_ps, pars[i])
    remake(prob, u0=u0s[:, i], saveat=tspan, p=ps)
  end

  prob_b = EnsembleProblem(prob_NN, prob_func=prob_func)
  
  function loss_multiple_shooting(nn_ps) 
    return multiple_shoot(nn_ps, fit_data, tsteps, prob_b, EnsembleThreads(), loss_function, Tsit5(), 
                          3; continuity_term=400, trajectories=batchsize)
  end

  adtype = Optimization.AutoZygote()
  optfunc = OptimizationFunction((nn_ps, x) -> loss_multiple_shooting(nn_ps), adtype)
  optprob = Optimization.OptimizationProblem(optfunc, nn_ps)
  res = Optimization.solve(optprob, PolyOpt(), maxiters=100000) # Using validation early stopping

Topic		Replies	Views
Struggling to train a UDE model with a GPU New to Julia question , cuda , differentialequation , diffeqflux	3	383	February 14, 2023
]test DiffEqGPU errors with " unsupported call to the Julia runtime" GPU first-steps	19	1613	September 20, 2019
Errors on 2nd Iteration of EnsembleGPUArrays when using Lux Modelling & Simulations	2	104	June 28, 2024
Universal Differential Equations General Usage	4	310	May 29, 2025
Mini batching with ensemble problem diffeq and neural ode's sciml Machine Learning	11	528	December 20, 2022

Struggling to train a UDE with DiffEqGPU

Related topics