Struggling to train a UDE with DiffEqGPU

Hi,

I’ve been trying to train a UDE model on a set of time series in parallel using DiffEqGPU and EnsembleProblem, with a different stored u0 and p for each series. The goal is to compute the loss for every trajectory in the batch simultaneously on the GPU. I’ve encountered a few issues along the way, mainly with EnsembleGPUArray(), and I’m unsure where to go from here:

CUDA.allowscalar(true)

println(size(pars), size(tranges), size(fit_data))
# (44,)(1, 44)(2, 104, 44)

NN = FastChain(FastDense(3, 50, tanh),
                FastDense(50, 1)) 

nn_ps = initial_params(NN) 
nn_ps = nn_ps |> Lux.gpu

u0s = cu(fit_data[:,1,:])  

function ude(u, nn_ps, t, p, NN)
  f = (p[7]*cos(p[6]*t) + p[8]*sin(p[6]*t))*p[9]  # harmonic forcing term
  x, y = u
  in = cu([f, x, y])
  nn = NN(in, nn_ps)                              # neural-network correction
  du1 = y
  du2 = f - (p[3]*y + p[1]*x + p[2]*x^3 + p[4]*x^5 + p[5]*x^7) + nn[1]
  du = cu([du1, du2])
end

function predict_adjoint(prob)
  solve(prob, Tsit5(), EnsembleGPUArray(), trajectories=length(tranges), saveat=tranges[1])
end

function loss_adjoint(prob, batch)

    pred = predict_adjoint(prob)      
  
    loss = sum(abs2, batch[1,:,:].-pred[1,:,:])
  
    return loss
end

t_span = (tranges[1][1], tranges[1][end])

function loss_p(nn_ps, batch, tranges, pars)
  prob_NN = ODEProblem{false}((u,p,t)->ude(u,nn_ps,t,pars[1],NN),u0s[:, 1],t_span,nn_ps)

  function prob_func(prob, i, repeat)
    remake(prob, u0=u0s[:, i], saveat=t_span, p=pars[i])
  end

  prob_b = EnsembleProblem(prob_NN, prob_func=prob_func)    
  l = loss_adjoint(prob_b, batch)
  println(l)
  return l
end

function MINIMISE(nn_ps, batch, time_batch, ode_ps)
  return loss_p(nn_ps, batch, time_batch, ode_ps)
end


adtype = Optimization.AutoFiniteDiff()
optfunc = OptimizationFunction((nn_ps, _) -> MINIMISE(nn_ps, fit_data, tranges, pars), adtype)
opt = ADAM(0.1)  
optprob = Optimization.OptimizationProblem(optfunc, nn_ps)
res = Optimization.solve(optprob, opt, maxiters=50)
println("Training loss after $(length(losses)) iterations: $(losses[end])") 

My error message at the moment is:

ERROR: ArgumentError: tuple length should be ≥ 0, got -4
Stacktrace:
  [1] _ntuple(f::Adapt.var"#1#4"{DiffEqFlux.FastDense{typeof(NNlib.tanh_fast), DiffEqFlux.var"#initial_params#107"{Vector{Float32}}, Nothing}}, n::Int64)
    @ Base .\ntuple.jl:36
  [2] ntuple
    @ .\ntuple.jl:19 [inlined]
  [3] adapt_structure(to::CUDA.Adaptor, f::DiffEqFlux.FastDense{typeof(NNlib.tanh_fast), DiffEqFlux.var"#initial_params#107"{Vector{Float32}}, Nothing})
    @ Adapt C:\Users\Stefan\.julia\packages\Adapt\0zP2x\src\base.jl:19
  [4] adapt
    @ C:\Users\Stefan\.julia\packages\Adapt\0zP2x\src\Adapt.jl:40 [inlined]
  [5] Fix1
    @ .\operators.jl:1096 [inlined]
  [6] map
    @ .\tuple.jl:222 [inlined]
  [7] adapt_structure
    @ C:\Users\Stefan\.julia\packages\Adapt\0zP2x\src\base.jl:3 [inlined]
  [8] adapt
    @ C:\Users\Stefan\.julia\packages\Adapt\0zP2x\src\Adapt.jl:40 [inlined]
  [9] Fix1
    @ .\operators.jl:1096 [inlined]
 [10] map
    @ .\tuple.jl:221 [inlined]
 [11] adapt_structure
    @ C:\Users\Stefan\.julia\packages\Adapt\0zP2x\src\base.jl:3 [inlined]
 [12] adapt(to::CUDA.Adaptor, x::Tuple{Tuple{DiffEqFlux.FastDense{typeof(NNlib.tanh_fast), DiffEqFlux.var"#initial_params#107"{Vector{Float32}}, Nothing}, DiffEqFlux.FastDense{typeof(identity), DiffEqFlux.var"#initial_params#107"{Vector{Float32}}, Nothing}}})
    @ Adapt C:\Users\Stefan\.julia\packages\Adapt\0zP2x\src\Adapt.jl:40
 [13] adapt_structure(to::CUDA.Adaptor, f::DiffEqFlux.FastChain{Tuple{DiffEqFlux.FastDense{typeof(NNlib.tanh_fast), DiffEqFlux.var"#initial_params#107"{Vector{Float32}}, Nothing}, DiffEqFlux.FastDense{typeof(identity), DiffEqFlux.var"#initial_params#107"{Vector{Float32}}, Nothing}}})
    @ Adapt C:\Users\Stefan\.julia\packages\Adapt\0zP2x\src\base.jl:24
 [14] adapt
    @ C:\Users\Stefan\.julia\packages\Adapt\0zP2x\src\Adapt.jl:40 [inlined]
 [15] Fix1
    @ .\operators.jl:1096 [inlined]
 [16] map
    @ .\tuple.jl:223 [inlined]
 [17] map
    @ .\tuple.jl:224 [inlined]
 [18] adapt_structure
    @ C:\Users\Stefan\.julia\packages\Adapt\0zP2x\src\base.jl:3 [inlined]
 [19] adapt
    @ C:\Users\Stefan\.julia\packages\Adapt\0zP2x\src\Adapt.jl:40 [inlined]
 [20] adapt_structure(to::CUDA.Adaptor, f::Main.EnergyHarvesterModel.var"#33#43"{CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Vector{CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Main.EnergyHarvesterModel.var"#ude#39", DiffEqFlux.FastChain{Tuple{DiffEqFlux.FastDense{typeof(NNlib.tanh_fast), DiffEqFlux.var"#initial_params#107"{Vector{Float32}}, Nothing}, DiffEqFlux.FastDense{typeof(identity), DiffEqFlux.var"#initial_params#107"{Vector{Float32}}, Nothing}}}})
    @ Adapt C:\Users\Stefan\.julia\packages\Adapt\0zP2x\src\base.jl:24
 [21] adapt(to::CUDA.Adaptor, x::Function)
    @ Adapt C:\Users\Stefan\.julia\packages\Adapt\0zP2x\src\Adapt.jl:40
 [22] cudaconvert(arg::Function)
    @ CUDA C:\Users\Stefan\.julia\packages\CUDA\BbliS\src\compiler\execution.jl:152
 [23] map(f::typeof(CUDA.cudaconvert), t::Tuple{Main.EnergyHarvesterModel.var"#33#43"{CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Vector{CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Main.EnergyHarvesterModel.var"#ude#39", DiffEqFlux.FastChain{Tuple{DiffEqFlux.FastDense{typeof(NNlib.tanh_fast), DiffEqFlux.var"#initial_params#107"{Vector{Float32}}, Nothing}, DiffEqFlux.FastDense{typeof(identity), DiffEqFlux.var"#initial_params#107"{Vector{Float32}}, Nothing}}}}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, Float32}) (repeats 2 times)
    @ Base .\tuple.jl:224
 [24] macro expansion
    @ C:\Users\Stefan\.julia\packages\CUDA\BbliS\src\compiler\execution.jl:100 [inlined]

.......................

[49] macro expansion
    @ C:\Users\Stefan\.julia\packages\OptimizationOptimisers\KGKWE\src\OptimizationOptimisers.jl:36 [inlined]
 [50] macro expansion
    @ C:\Users\Stefan\.julia\packages\Optimization\aPPOg\src\utils.jl:37 [inlined]
 [51] __solve(prob::SciMLBase.OptimizationProblem{true, SciMLBase.OptimizationFunction{true, Optimization.AutoFiniteDiff{Val{:forward}, Val{:forward}, Val{:hcentral}}, Main.EnergyHarvesterModel.var"#34#46"{Main.EnergyHarvesterModel.var"#MINIMISE#45"{Main.EnergyHarvesterModel.var"#loss_p#42"{Tuple{Float32, Float32}, Main.EnergyHarvesterModel.var"#loss_adjoint#41"{Main.EnergyHarvesterModel.var"#predict_adjoint#40"}, Main.EnergyHarvesterModel.var"#ude#39", CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, DiffEqFlux.FastChain{Tuple{DiffEqFlux.FastDense{typeof(NNlib.tanh_fast), DiffEqFlux.var"#initial_params#107"{Vector{Float32}}, Nothing}, DiffEqFlux.FastDense{typeof(identity), DiffEqFlux.var"#initial_params#107"{Vector{Float32}}, Nothing}}}}}}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED_NO_TIME), Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, SciMLBase.NullParameters, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}}, opt::Optimisers.Adam{Float64}, data::Base.Iterators.Cycle{Tuple{Optimization.NullData}}; maxiters::Int64, callback::Function, progress::Bool, save_best::Bool, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ OptimizationOptimisers C:\Users\Stefan\.julia\packages\OptimizationOptimisers\KGKWE\src\OptimizationOptimisers.jl:35
 [52] #solve#540
    @ C:\Users\Stefan\.julia\packages\SciMLBase\QqtZA\src\solve.jl:84 [inlined]
 [53] fit_df(train_df::DataFrames.SubDataFrame{DataFrames.DataFrame, DataFrames.Index, Vector{Int64}}, val_df::DataFrames.SubDataFrame{DataFrames.DataFrame, DataFrames.Index, Vector{Int64}}, par::ComponentArrays.ComponentVector{Float32, Vector{Float32}, Tuple{ComponentArrays.Axis{(ωn = 1, μ = 2, b = 3, ν = 4, ρ = 5, ω = 6, A = 7, B = 8, ugain = 9)}}})
    @ Main.EnergyHarvesterModel c:\Users\Stefan\VScode projects\EnergyHarvester\src\NeuralODEModel.jl:270
 [54] top-level scope
    @ .\timing.jl:262 [inlined]

I’m very new to this workflow and have tried to follow the tutorials as best I can, so any help would be greatly appreciated.
Thanks.

What docs are you reading? This looks nothing like the tutorial:

https://docs.sciml.ai/DiffEqFlux/dev/examples/GPUs/

Hi, thanks.
I tried to follow several of the DiffEqGPU tutorials, as well as an older NeuralODE one that I stumbled across. I have since rewritten the EnsembleProblem to try to isolate the problem (this follows the newer tutorials):

using DiffEqGPU, OrdinaryDiffEq, Lux, CUDA, Random, ComponentArrays

batchsize = 44

pars = rand(Float32, (9, batchsize)) |> Lux.gpu;
u0s = rand(Float32, (2, batchsize)) |> Lux.gpu;

tspan = [0.0f0, 0.08f0]
tsteps = range(tspan[1], tspan[2], length=5000)

rng = Random.default_rng()


NN = Lux.Chain(Lux.Dense(3, 50, tanh), Lux.Dense(50, 1))

nn_ps, st = Lux.setup(rng,NN) 
nn_ps = nn_ps |> ComponentArray |> Lux.gpu
st = st |> Lux.gpu

NN_(u, p) = NN(u, p, st)[1]

function ude(u,nn_ps,t,p)
    f = (p[7]*cos(p[6]*t) + p[8]*sin(p[6]*t))*p[9]  
    x,y = u
    in = [f, x, y] |> DiffEqFlux.gpu
    nn = NN_(in, nn_ps)[1]
    du1 = y 
    du2 = f - (p[3]*y+p[1]*x+p[2]*x^3+p[4]*x^5+p[5]*x^7) + nn
    du = [du1, du2]  |> DiffEqFlux.gpu
  end

prob_NN = ODEProblem{false}((u,p,t)->ude(u, nn_ps, t, pars[:, 1]), u0s[:, 1], tspan, nn_ps)

@time s1 = solve(prob_NN, Tsit5(), saveat=tsteps)

This part runs without issue. However, the problems start when I add ensembling (copied from the DiffEqGPU docs):

function prob_func(prob, i, repeat)
    remake(prob, u0=u0s[:,i], p=pars[:, i])
end

prob_b = EnsembleProblem(prob_NN, prob_func=prob_func)

@time s2 = solve(prob_b, Tsit5(), EnsembleGPUArray(), trajectories=1, saveat=tsteps)

println(sum(abs2, s1.-s2[:,:,1])) # =0

This works fine when trajectories = 1 (the last line confirms the two solutions are indeed equal). However, when I use ensembling to solve more than one trajectory (e.g. trajectories = 2), it breaks down with this error message:

ERROR: InvalidIRError: compiling kernel #gpu_gpu_kernel_oop(KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, var"#147#148", CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Float32, 1}, Float32) resulted in invalid LLVM IR
Reason: unsupported dynamic function invocation (call to getindex)
Stacktrace:
 [1] #147
   @ c:\Users\Stefan\VScode projects\EnergyHarvester\src\gpuLux.jl:32
 [2] macro expansion
   @ C:\Users\Stefan\.julia\packages\DiffEqGPU\2OsE7\src\DiffEqGPU.jl:31
 [3] gpu_gpu_kernel_oop
   @ C:\Users\Stefan\.julia\packages\KernelAbstractions\C8flJ\src\macros.jl:81
 [4] gpu_gpu_kernel_oop
   @ .\none:0
Reason: unsupported dynamic function invocation (call to ude)
Stacktrace:
 [1] #147
   @ c:\Users\Stefan\VScode projects\EnergyHarvester\src\gpuLux.jl:32
 [2] macro expansion
   @ C:\Users\Stefan\.julia\packages\DiffEqGPU\2OsE7\src\DiffEqGPU.jl:31
 [3] gpu_gpu_kernel_oop
   @ C:\Users\Stefan\.julia\packages\KernelAbstractions\C8flJ\src\macros.jl:81
 [4] gpu_gpu_kernel_oop
   @ .\none:0
Reason: unsupported dynamic function invocation (call to getindex)
Stacktrace:
 [1] macro expansion
   @ C:\Users\Stefan\.julia\packages\DiffEqGPU\2OsE7\src\DiffEqGPU.jl:33
 [2] gpu_gpu_kernel_oop
   @ C:\Users\Stefan\.julia\packages\KernelAbstractions\C8flJ\src\macros.jl:81
 [3] gpu_gpu_kernel_oop
   @ .\none:0
Reason: unsupported dynamic function invocation (call to convert)
Stacktrace:
 [1] setindex!
   @ C:\Users\Stefan\.julia\packages\CUDA\BbliS\src\device\array.jl:194
 [2] setindex!
   @ C:\Users\Stefan\.julia\packages\CUDA\BbliS\src\device\array.jl:207
 [3] macro expansion
   @ C:\Users\Stefan\.julia\packages\DiffEqGPU\2OsE7\src\DiffEqGPU.jl:33
 [4] gpu_gpu_kernel_oop
   @ C:\Users\Stefan\.julia\packages\KernelAbstractions\C8flJ\src\macros.jl:81
 [5] gpu_gpu_kernel_oop
   @ .\none:0
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erronous code with Cthulhu.jl
Stacktrace:
  [1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{typeof(DiffEqGPU.gpu_gpu_kernel_oop), Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, var"#147#148", CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Float32, 1}, Float32}}}, args::LLVM.Module)
    @ GPUCompiler C:\Users\Stefan\.julia\packages\GPUCompiler\kb6yJ\src\validation.jl:141
  [2] macro expansion
    @ C:\Users\Stefan\.julia\packages\GPUCompiler\kb6yJ\src\driver.jl:418 [inlined]
  [3] macro expansion
    @ C:\Users\Stefan\.julia\packages\TimerOutputs\LHjFw\src\TimerOutput.jl:253 [inlined]
  [4] macro expansion
    @ C:\Users\Stefan\.julia\packages\GPUCompiler\kb6yJ\src\driver.jl:416 [inlined]
  [5] emit_asm(job::GPUCompiler.CompilerJob, ir::LLVM.Module; strip::Bool, validate::Bool, format::LLVM.API.LLVMCodeGenFileType)
    @ GPUCompiler C:\Users\Stefan\.julia\packages\GPUCompiler\kb6yJ\src\utils.jl:83
  [6] cufunction_compile(job::GPUCompiler.CompilerJob, ctx::LLVM.Context)
    @ CUDA C:\Users\Stefan\.julia\packages\CUDA\BbliS\src\compiler\execution.jl:355
  [7] #228
    @ C:\Users\Stefan\.julia\packages\CUDA\BbliS\src\compiler\execution.jl:348 [inlined]
  [8] JuliaContext(f::CUDA.var"#228#229"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{typeof(DiffEqGPU.gpu_gpu_kernel_oop), Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, var"#147#148", CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Float32, 1}, Float32}}}})
    @ GPUCompiler C:\Users\Stefan\.julia\packages\GPUCompiler\kb6yJ\src\driver.jl:76
  [9] cufunction_compile(job::GPUCompiler.CompilerJob)
    @ CUDA C:\Users\Stefan\.julia\packages\CUDA\BbliS\src\compiler\execution.jl:347
 [10] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
    @ GPUCompiler C:\Users\Stefan\.julia\packages\GPUCompiler\kb6yJ\src\cache.jl:90
...................
 [18] #__solve#561
    @ C:\Users\Stefan\.julia\packages\OrdinaryDiffEq\P7HJO\src\solve.jl:5 [inlined]
 [19] #solve_call#26
    @ C:\Users\Stefan\.julia\packages\DiffEqBase\WXn2i\src\solve.jl:473 [inlined]
 [20] solve_up(prob::ODEProblem{CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, Tuple{Float32, Float32}, true, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, ODEFunction{true, SciMLBase.FullSpecialize, DiffEqGPU.var"#67#71"{var"#147#148", typeof(DiffEqGPU.gpu_kernel_oop)}, LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, sensealg::Nothing, u0::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, p::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, args::Tsit5{typeof(OrdinaryDiffEq.trivial_limiter!), typeof(OrdinaryDiffEq.trivial_limiter!), Static.False}; kwargs::Base.Pairs{Symbol, Any, NTuple{6, Symbol}, NamedTuple{(:adaptive, :unstable_check, :saveat, :callback, :merge_callbacks, :internalnorm), Tuple{Bool, DiffEqGPU.var"#15#21", StepRangeLen{Float32, Float64, Float64, Int64}, Nothing, Bool, typeof(DiffEqGPU.diffeqgpunorm)}}})
    @ DiffEqBase C:\Users\Stefan\.julia\packages\DiffEqBase\WXn2i\src\solve.jl:835
 [21] #solve#31
    @ C:\Users\Stefan\.julia\packages\DiffEqBase\WXn2i\src\solve.jl:802 [inlined]
eviceBuffer}, Tuple{Axis{(layer_1 = ViewAxis(1:200, Axis(weight = ViewAxis(1:150, ShapedAxis((50, 3), NamedTuple())), bias = ViewAxis(151:200, ShapedAxis((50, 1), NamedTuple())))), layer_2 = ViewAxis(201:251, Axis(weight = ViewAxis(1:50, ShapedAxis((1, 50), NamedTuple())), bias = ViewAxis(51:51, ShapedAxis((1, 1), NamedTuple())))))}}}, ODEFunction{false, SciMLBase.AutoSpecialize, var"#147#148", LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, typeof(prob_func), typeof(SciMLBase.DEFAULT_OUTPUT_FUNC), typeof(SciMLBase.DEFAULT_REDUCTION), Nothing}, alg::Tsit5{typeof(OrdinaryDiffEq.trivial_limiter!), typeof(OrdinaryDiffEq.trivial_limiter!), Static.False}, ensemblealg::EnsembleGPUArray; trajectories::Int64, batch_size::Int64, unstable_check::Function, adaptive::Bool, kwargs::Base.Pairs{Symbol, StepRangeLen{Float32, Float64, Float64, Int64}, Tuple{Symbol}, NamedTuple{(:saveat,), Tuple{StepRangeLen{Float32, Float64, Float64, Int64}}}})
     @ DiffEqGPU C:\Users\Stefan\.julia\packages\DiffEqGPU\2OsE7\src\DiffEqGPU.jl:523
 [26] #solve#33
     @ C:\Users\Stefan\.julia\packages\DiffEqBase\WXn2i\src\solve.jl:851 [inlined]
 [27] top-level scope
    @ .\timing.jl:262

It works with EnsembleThreads() and EnsembleCPUArray() (which is significantly faster), but it doesn’t seem to work with EnsembleGPUArray(). Any ideas why?
Thanks.

EnsembleGPUArray needs to be used with static kernels, which excludes neural networks like that. It wouldn’t be an efficient way to write this anyway. Why not batch via matmuls?

OK, I see, thanks.
How would I go about batching with matmuls then?

Instead of solving on vectors, solve on matrices where each column is a different initial condition. You’d pair that with multiple shooting.
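
In code, that suggestion looks roughly like the sketch below. This is a minimal illustration rather than the poster's final code: the forcing term and the dynamics are placeholders (assumptions), B and batched_ude are names invented here, and NN, nn_ps, st mirror the Lux definitions from the snippets above.

using OrdinaryDiffEq, Lux, Random, ComponentArrays

B = 8                                    # number of trajectories in the batch
rng = Random.default_rng()
NN = Lux.Chain(Lux.Dense(3, 50, tanh), Lux.Dense(50, 1))
nn_ps, st = Lux.setup(rng, NN)
nn_ps = ComponentArray(nn_ps)

function batched_ude(U, ps, t)
    x = U[1:1, :]                        # 1×B row of positions
    y = U[2:2, :]                        # 1×B row of velocities
    f = fill(sin(t), 1, B)               # placeholder forcing (assumption)
    nn_in = vcat(f, x, y)                # 3×B input: all columns at once
    nn = NN(nn_in, ps, st)[1]            # 1×B output, computed via dense matmuls
    dx = y
    dy = f .- x .+ nn                    # placeholder dynamics (assumption)
    vcat(dx, dy)                         # 2×B derivative matrix
end

U0 = rand(Float32, 2, B)                 # each column is a different u0
prob = ODEProblem{false}(batched_ude, U0, (0.0f0, 0.08f0), nn_ps)
sol = solve(prob, Tsit5(), saveat = 0.001f0)

Moving this to the GPU then mostly amounts to putting U0, nn_ps, and intermediates like f on the device (e.g. with cu or Lux.gpu), so the dense layers execute as batched matmuls rather than one scalar kernel per trajectory.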

Is it possible to also pass through a different set of parameters? I have a different set of parameters for each time series, which is why I tried ensembling in the first place.

Yes, each solve can have its own parameters, just like its own u0.
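
Concretely, the per-series ODE parameters can also be carried as columns of a matrix and applied row-wise with broadcasting. A sketch extending the one above, reusing NN, st, U0, and B from there; ode_ps is a hypothetical 9×B matrix with one column of parameters per series, and the dynamics mirror the model used in this thread:

ode_ps = rand(Float32, 9, B)             # column i = ODE parameters of series i (stand-in values)

function batched_ude_p(U, nn_ps, t, ode_ps)
    x = U[1:1, :]
    y = U[2:2, :]
    # per-column forcing term, evaluated from each column's own parameters
    f = (ode_ps[7:7, :] .* cos.(ode_ps[6:6, :] .* t) .+
         ode_ps[8:8, :] .* sin.(ode_ps[6:6, :] .* t)) .* ode_ps[9:9, :]
    nn = NN(vcat(f, x, y), nn_ps, st)[1]
    dx = y
    dy = f .- (ode_ps[3:3, :] .* y .+ ode_ps[1:1, :] .* x .+ ode_ps[2:2, :] .* x .^ 3 .+
               ode_ps[4:4, :] .* x .^ 5 .+ ode_ps[5:5, :] .* x .^ 7) .+ nn
    vcat(dx, dy)
end

prob_p = ODEProblem{false}((U, p, t) -> batched_ude_p(U, p, t, ode_ps), U0, (0.0f0, 0.08f0), nn_ps)

Only nn_ps is exposed to the optimizer; the per-series ode_ps stays fixed inside the closure, and the loss can then sum over whichever rows and columns of the batched solution are of interest.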

I managed to do what you suggested with just the neural network, but I’m a bit confused about how to include the ODE parameters as well. I have this so far:

  Random.seed!(0)
  rng = Random.default_rng()

  NN = Lux.Chain(Lux.Dense(3, 50, tanh), Lux.Dense(50, 1))

  nn_ps, st = Lux.setup(rng,NN) 
  nn_ps = nn_ps  |> ComponentArray 

  function ude(u, t, ps, NN_, st_)

    nn_ps = ps.nn_ps
    ode_ps = ps.ode_ps

    f = (ode_ps[7]*cos(ode_ps[6]*t) + ode_ps[8]*sin(ode_ps[6]*t))*ode_ps[9]  # Actuator

    x,y = u
    in = [f, x, y]
    nn = NN_(in, nn_ps, st_)[1]
    du1 = y 
    du2 = f - (ode_ps[3]*y+ode_ps[1]*x+ode_ps[2]*x^3+ode_ps[4]*x^5+ode_ps[5]*x^7) + nn[1]
    du = [du1, du2]
  end
  ######################################################

  function loss_function(data, pred)
    l = sum(abs2, data[1,:,:] - pred[1,:,:])
    return l
  end

  prob_NN = ODEProblem{false}((u, p, t) -> ude(u,t,p,NN,st), u0s[:, 1], tspan, pars[1])

  function prob_func(prob, i, repeat)
    ps =  ComponentArray(nn_ps=p, ode_ps=pars[i])
    remake(prob, u0=u0s[:, i], saveat=tspan, p=ps)
  end

  prob_b = EnsembleProblem(prob_NN, prob_func=prob_func)
  
  function loss_multiple_shooting(p) 
    return multiple_shoot(p, fit_data, tsteps, prob_b, EnsembleThreads(), loss_function, Tsit5(), 
                          length(fit_data[1,:,1]); continuity_term=1, trajectories=batchsize)
  end

I’m trying to pass a ComponentArray ps containing both the neural network parameters and the ODE parameters into the UDE function, and I can’t figure out how to do it. What would you suggest I change?
Thanks.

I managed to get it working, so I figured I’d post the code that ended up working, for anyone else who tries something similar.
Thanks Chris for the help.

  println("Training....")
  # println(size(pars), size(tranges), size(fit_data))
  # (44,)(1, 44)(2, 104, 44)

  Random.seed!(0)
  rng = Random.default_rng()

  NN = Lux.Chain(Lux.Dense(3, 100, tanh), Lux.Dense(100, 1))

  nn_ps, st = Lux.setup(rng,NN) 
  nn_ps = nn_ps  |> ComponentArray 
  println(pwd())
  
  function ude(u, ps, t)
    nn_ps, ode_ps = ps   # shared NN parameters and this series' ODE parameters

    f = (ode_ps[7]*cos(ode_ps[6]*t) + ode_ps[8]*sin(ode_ps[6]*t))*ode_ps[9]  # Actuator

    x, y = u
    in = [f, x, y]
    nn = NN(in, nn_ps, st)[1]   # Lux returns (output, state); keep the output
    du1 = y
    du2 = f - (ode_ps[3]*y + ode_ps[1]*x + ode_ps[2]*x^3 + ode_ps[4]*x^5 + ode_ps[5]*x^7) + nn[1]
    du = [du1, du2]
  end
  ######################################################

  function loss_function(data, pred)
    l = sum(abs2, data[1,:,:] - pred[1,:,:])
    return l
  end

  prob_NN = ODEProblem{false}((u, ps, t) -> ude(u,(ps, pars[1]), t), u0s[:, 1], tspan)

  function prob_func(prob, i, repeat)
    nn_ps = prob.p
    ps = (nn_ps, pars[i])
    remake(prob, u0=u0s[:, i], saveat=tspan, p=ps)
  end

  prob_b = EnsembleProblem(prob_NN, prob_func=prob_func)
  
  function loss_multiple_shooting(nn_ps) 
    return multiple_shoot(nn_ps, fit_data, tsteps, prob_b, EnsembleThreads(), loss_function, Tsit5(), 
                          3; continuity_term=400, trajectories=batchsize)
  end

  adtype = Optimization.AutoZygote()
  optfunc = OptimizationFunction((nn_ps, x) -> loss_multiple_shooting(nn_ps), adtype)
  optprob = Optimization.OptimizationProblem(optfunc, nn_ps)
  res = Optimization.solve(optprob, PolyOpt(), maxiters=100000) # Using validation early stopping
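
The comment on the last line mentions validation early stopping, but the callback doing it isn’t shown. One way is the callback keyword of Optimization.solve, which halts when the callback returns true. A minimal sketch under the assumptions that a helper val_loss(nn_ps) evaluates the loss on held-out data (not part of the posted code) and that the exact callback signature matches the Optimization.jl version in use (it varies slightly across versions, and the chosen optimizer must pass the callback through):

  best_val = Ref(Inf32)                  # best validation loss seen so far
  patience = Ref(0)                      # iterations since the last improvement

  callback = function (ps, train_loss)
    v = val_loss(ps)                     # hypothetical helper: loss on the validation set
    if v < best_val[]
      best_val[] = v
      patience[] = 0
    else
      patience[] += 1
    end
    return patience[] > 20               # returning true stops Optimization.solve early
  end

  res = Optimization.solve(optprob, PolyOpt(), maxiters=100000, callback=callback)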