Kernel Error while running code on GPU

I have been trying to run the following code on a GPU:

using OrdinaryDiffEq
using DiffEqFlux
using Random
using CUDA

using DiffEqFlux: gpu
CUDA.allowscalar(true)

function RelativisticOrbitModel(u, model_params, t)
    χ, ϕ = u
    p, M, e  = model_params
    
    numer = (p-2-2*e*cos(χ)) * (1+e*cos(χ))^2
    denom = sqrt( (p-2)^2-4*e^2 )

    χ̇ = numer * sqrt( p-6-2*e*cos(χ) )/( M*(p^2)*denom )
    ϕ̇ = numer / (M*(p^(3/2))*denom)

    return [χ̇, ϕ̇]

end

dts=25
w = 122
i = 0.9
k=8.0
rng = MersenneTwister(w);
mass_ratio = 0.0
u0 = Float32[pi, 0.0]
tspan = (0.0f0, 2.1295f4)
model_params = Float32[8.0, 1.0, 0.9]


# Functions to generate the training data 
  function ϕprime(p,e,m,x)
    num = (p-2 .-2*e*cos.(x)).*((1 .+ e*cos.(x)).^2)
    den = (m*(p^1.5))*((p-2)^2 - 4*e^2)^0.5
  
    return num/den
  end
  
  function χprime(p,e,m,x)
    num = (p-2 .-2*e*cos.(x)).*((1 .+ e*cos.(x)).^2).*(p-6 .-2*e*cos.(x)).^0.5
    den = (m*(p^2))*((p-2)^2 - 4*e^2)^0.5
  
    return num/den
  end

prob = ODEProblem(RelativisticOrbitModel, u0, tspan, model_params)
## Solve on CPU once to get timesteps and then continue solving on GPU
soln1 = (solve(prob, RK4(), dt = dts,saveat = dts, adaptive=false))
t_steps = soln1.t
soln1 = gpu(solve(prob, RK4(), dt = dts,saveat = t_steps, adaptive=false))

# Get the training data
chi_t = χprime(k,i,1.0,soln1[1,:]) |> gpu
phi_t = ϕprime(k,i,1.0,soln1[2,:])  |> gpu
len = length(t_steps)

# Define the neural network
NN2_gpu = FastChain((x, p) ->gpu([cos(x[1])]),
  FastDense(1, 32, relu),
  FastDense(32, 32, relu),
  FastDense(32, 2))
u0 = Float32[pi, 0.0] |> gpu
NN2_params_gpu = initial_params(NN2_gpu) .* 0 |> gpu


chi_t_nn = fill!(similar(t_steps, 1, size(t_steps)[end]), 0)
phi_t_nn = fill!(similar(t_steps, 1, size(t_steps)[end]), 0)

function NN_output(NN2_params_gpu, chi_t_nn, phi_t_nn)
    
    chi_t_nn = reshape(chi_t_nn, 853)
    chi_t_nn2 = Zygote.Buffer(chi_t_nn)
    
    
    phi_t_nn = reshape(phi_t_nn, 853)
    phi_t_nn2 = Zygote.Buffer(phi_t_nn)
    
    for i in 1:len
       nx1 = NN2_gpu(soln1[i], NN2_params_gpu)[1]
        nx2 = NN2_gpu(soln1[i], NN2_params_gpu)[2]
        chi_t_nn2[i] =  nx1
        phi_t_nn2[i] =  nx2
        # print(i)
      end
    return copy(chi_t_nn2), copy(phi_t_nn2)
  end

# Loss function
function l13(NN2_params_gpu)

    chi_t_nn2 = (NN_output(NN2_params_gpu, chi_t_nn, phi_t_nn)[1] ) |> gpu
    phi_t_nn2 = (NN_output(NN2_params_gpu, chi_t_nn, phi_t_nn)[2])  |> gpu

    loss = CUDA.sum(abs2, (chi_t .- chi_t_nn2)).*dts
    return loss
end

gradient(l13, NN2_params_gpu)

and I encounter the following KernelError :

ERROR: GPU compilation of kernel #broadcast_kernel#26(CUDA.CuKernelContext, CuDeviceVector{Float32, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, typeof(*), Tuple{Base.Broadcast.Extruded{Vector{Float32}, Tuple{Bool}, Tuple{Int64}}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Nothing, typeof(ForwardDiff.derivative), Tuple{CUDA.CuRefValue{typeof(relu)}, Base.Broadcast.Extruded{CuDeviceVector{Float32, 1}, Tuple{Bool}, Tuple{Int64}}}}}}, Int64) failed
KernelError: passing and using non-bitstype argument

Argument 4 to your kernel function is of type Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, typeof(*), Tuple{Base.Broadcast.Extruded{Vector{Float32}, Tuple{Bool}, Tuple{Int64}}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Nothing, typeof(ForwardDiff.derivative), Tuple{CUDA.CuRefValue{typeof(relu)}, Base.Broadcast.Extruded{CuDeviceVector{Float32, 1}, Tuple{Bool}, Tuple{Int64}}}}}}, which is not isbits:
  .args is of type Tuple{Base.Broadcast.Extruded{Vector{Float32}, Tuple{Bool}, Tuple{Int64}}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Nothing, typeof(ForwardDiff.derivative), Tuple{CUDA.CuRefValue{typeof(relu)}, Base.Broadcast.Extruded{CuDeviceVector{Float32, 1}, Tuple{Bool}, Tuple{Int64}}}}} which is not isbits.
    .1 is of type Base.Broadcast.Extruded{Vector{Float32}, Tuple{Bool}, Tuple{Int64}} which is not isbits.
      .x is of type Vector{Float32} which is not isbits.

And I’m using these version of the packages :

Status `~/.julia/environments/v1.9/Project.toml`
⌅ [052768ef] CUDA v4.0.1
⌃ [aae7a2af] DiffEqFlux v1.53.0
  [41bf760c] DiffEqSensitivity v6.79.0
⌅ [7da242da] Enzyme v0.10.18
⌅ [587475ba] Flux v0.13.17
⌃ [98e50ef6] JuliaFormatter v1.0.43
⌃ [961ee093] ModelingToolkit v8.50.0
  [429524aa] Optim v1.7.8
⌃ [1dea7af3] OrdinaryDiffEq v6.51.2
  [91a5bcdd] Plots v1.39.0
⌃ [82ae8749] StatsAPI v1.6.0
  [8dfed614] Test

I realize that I’m using earlier versions of some packages and I am not sure if this error is due to that. But any help regarding what the error is about and how I could solve it would be very helpful.
Also please let me know if I could’ve presented the problem in a better manner, not used to posting on this forum. Apologies for error on my part.

Argument 4 to your kernel function is of type Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, typeof(*), Tuple{Base.Broadcast.Extruded{Vector{Float32}, Tuple{Bool}, Tuple{Int64}}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Nothing, typeof(ForwardDiff.derivative), Tuple{CUDA.CuRefValue{typeof(relu)}, Base.Broadcast.Extruded{CuDeviceVector{Float32, 1}, Tuple{Bool}, Tuple{Int64}}}}}}, which is not isbits:
  .args is of type Tuple{Base.Broadcast.Extruded{Vector{Float32}, Tuple{Bool}, Tuple{Int64}}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Nothing, typeof(ForwardDiff.derivative), Tuple{CUDA.CuRefValue{typeof(relu)}, Base.Broadcast.Extruded{CuDeviceVector{Float32, 1}, Tuple{Bool}, Tuple{Int64}}}}} which is not isbits.
    .1 is of type Base.Broadcast.Extruded{Vector{Float32}, Tuple{Bool}, Tuple{Int64}} which is not isbits.
      .x is of type Vector{Float32} which is not isbits.

There’s a CPU array in your GPU broadcast expression.

Thank you, is there a way for me to pin down which array that is? Can I look into what the GPU broadcast expression is exactly?

Look at the remainder of the backtrace (which you didn’t post here) to identify the expression, and the error already states which array is the CPU one (but you can just @show typeof(...) the args).

The stack trace exceeds the character limit allowed, hence breaking it up into 2 replies. Here is the first one :

Stacktrace:
  [1] check_invocation(job::GPUCompiler.CompilerJob)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/S3TWf/src/validation.jl:88
  [2] macro expansion
    @ ~/.julia/packages/GPUCompiler/S3TWf/src/driver.jl:154 [inlined]
  [3] macro expansion
    @ ~/.julia/packages/TimerOutputs/RsWnF/src/TimerOutput.jl:253 [inlined]
  [4] macro expansion
    @ ~/.julia/packages/GPUCompiler/S3TWf/src/driver.jl:152 [inlined]
  [5] emit_julia(job::GPUCompiler.CompilerJob; validate::Bool)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/S3TWf/src/utils.jl:83
  [6] emit_julia
    @ ~/.julia/packages/GPUCompiler/S3TWf/src/utils.jl:77 [inlined]
  [7] cufunction_compile(job::GPUCompiler.CompilerJob, ctx::LLVM.ThreadSafeContext)
    @ CUDA ~/.julia/packages/CUDA/ZdCxS/src/compiler/execution.jl:359
  [8] #221
    @ ~/.julia/packages/CUDA/ZdCxS/src/compiler/execution.jl:354 [inlined]
  [9] LLVM.ThreadSafeContext(f::CUDA.var"#221#222"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{GPUArrays.var"#broadcast_kernel#26", Tuple{CUDA.CuKernelContext, CuDeviceVector{Float32, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, typeof(*), Tuple{Base.Broadcast.Extruded{Vector{Float32}, Tuple{Bool}, Tuple{Int64}}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Nothing, typeof(ForwardDiff.derivative), Tuple{CUDA.CuRefValue{typeof(relu)}, Base.Broadcast.Extruded{CuDeviceVector{Float32, 1}, Tuple{Bool}, Tuple{Int64}}}}}}, Int64}}}})
    @ LLVM ~/.julia/packages/LLVM/HykgZ/src/executionengine/ts_module.jl:14
 [10] JuliaContext(f::CUDA.var"#221#222"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{GPUArrays.var"#broadcast_kernel#26", Tuple{CUDA.CuKernelContext, CuDeviceVector{Float32, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, typeof(*), Tuple{Base.Broadcast.Extruded{Vector{Float32}, Tuple{Bool}, Tuple{Int64}}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Nothing, typeof(ForwardDiff.derivative), Tuple{CUDA.CuRefValue{typeof(relu)}, Base.Broadcast.Extruded{CuDeviceVector{Float32, 1}, Tuple{Bool}, Tuple{Int64}}}}}}, Int64}}}})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/S3TWf/src/driver.jl:74
 [11] cufunction_compile(job::GPUCompiler.CompilerJob)
    @ CUDA ~/.julia/packages/CUDA/ZdCxS/src/compiler/execution.jl:353
 [12] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/S3TWf/src/cache.jl:90
 [13] cufunction(f::GPUArrays.var"#broadcast_kernel#26", tt::Type{Tuple{CUDA.CuKernelContext, CuDeviceVector{Float32, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, typeof(*), Tuple{Base.Broadcast.Extruded{Vector{Float32}, Tuple{Bool}, Tuple{Int64}}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Nothing, typeof(ForwardDiff.derivative), Tuple{CUDA.CuRefValue{typeof(relu)}, Base.Broadcast.Extruded{CuDeviceVector{Float32, 1}, Tuple{Bool}, Tuple{Int64}}}}}}, Int64}}; name::Nothing, always_inline::Bool, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ CUDA ~/.julia/packages/CUDA/ZdCxS/src/compiler/execution.jl:306
 [14] cufunction(f::GPUArrays.var"#broadcast_kernel#26", tt::Type{Tuple{CUDA.CuKernelContext, CuDeviceVector{Float32, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, typeof(*), Tuple{Base.Broadcast.Extruded{Vector{Float32}, Tuple{Bool}, Tuple{Int64}}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Nothing, typeof(ForwardDiff.derivative), Tuple{CUDA.CuRefValue{typeof(relu)}, Base.Broadcast.Extruded{CuDeviceVector{Float32, 1}, Tuple{Bool}, Tuple{Int64}}}}}}, Int64}})
    @ CUDA ~/.julia/packages/CUDA/ZdCxS/src/compiler/execution.jl:299
 [15] macro expansion
    @ ~/.julia/packages/CUDA/ZdCxS/src/compiler/execution.jl:102 [inlined]
 [16] #launch_heuristic#245
    @ ~/.julia/packages/CUDA/ZdCxS/src/gpuarrays.jl:17 [inlined]
 [17] launch_heuristic
    @ ~/.julia/packages/CUDA/ZdCxS/src/gpuarrays.jl:15 [inlined]
 [18] _copyto!
    @ ~/.julia/packages/GPUArrays/5XhED/src/host/broadcast.jl:65 [inlined]
 [19] copyto!
    @ ~/.julia/packages/GPUArrays/5XhED/src/host/broadcast.jl:46 [inlined]
 [20] copy
    @ ~/.julia/packages/GPUArrays/5XhED/src/host/broadcast.jl:37 [inlined]
 [21] materialize(bc::Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Nothing, typeof(*), Tuple{Vector{Float32}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Nothing, typeof(ForwardDiff.derivative), Tuple{Base.RefValue{typeof(relu)}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}})
    @ Base.Broadcast ./broadcast.jl:873
 [22] (::DiffEqFlux.var"#FastDense_adjoint#145"{FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}})(?::Vector{Float32})
    @ DiffEqFlux ~/.julia/packages/DiffEqFlux/2IJEZ/src/fast_layers.jl:196
 [23] (::DiffEqFlux.var"#270#back#146"{DiffEqFlux.var"#FastDense_adjoint#145"{FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}})(?::Vector{Float32})
    @ DiffEqFlux ~/.julia/packages/ZygoteRules/OgCVT/src/adjoint.jl:71
 [24] Pullback
    @ ~/.julia/packages/DiffEqFlux/2IJEZ/src/fast_layers.jl:20 [inlined]
 [25] (::Zygote.Pullback{Tuple{typeof(DiffEqFlux.applychain), Tuple{FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{typeof(DiffEqFlux.applychain), Tuple{FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{typeof(DiffEqFlux.applychain), Tuple{}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, DiffEqFlux.var"#270#back#146"{DiffEqFlux.var"#FastDense_adjoint#145"{FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{}}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, Zygote.var"#2145#back#281"{Zygote.var"#277#280"}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.ZBack{Zygote.var"#convert_pullback#330"}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}}}, Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, Tuple{Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Bool}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1350"{Int64, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:bias, Zygote.Context{false}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Bool}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:in, Zygote.Context{false}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:out, Zygote.Context{false}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, Tuple{Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Bool}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1350"{Int64, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:bias, Zygote.Context{false}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Bool}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:in, Zygote.Context{false}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:out, Zygote.Context{false}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}}}, Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Int64}}}}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, DiffEqFlux.var"#270#back#146"{DiffEqFlux.var"#FastDense_adjoint#145"{FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing}}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, Zygote.var"#2145#back#281"{Zygote.var"#277#280"}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.ZBack{Zygote.var"#convert_pullback#330"}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}}}, Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, Tuple{Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Bool}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1350"{Int64, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:bias, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Bool}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:in, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:out, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, Tuple{Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Bool}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1350"{Int64, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:bias, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Bool}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:in, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:out, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}}}, Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Int64}}}}})(?::Zygote.OneElement{Float32, 1, Tuple{Int64}, Tuple{Base.OneTo{Int64}}})
    @ Zygote ~/.julia/packages/Zygote/XJ8pP/src/compiler/interface2.jl:0
 [26] Pullback
    @ ~/.julia/packages/DiffEqFlux/2IJEZ/src/fast_layers.jl:20 [inlined]
 [27] (::Zygote.Pullback{Tuple{typeof(DiffEqFlux.applychain), Tuple{FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{typeof(DiffEqFlux.applychain), Tuple{FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{typeof(DiffEqFlux.applychain), Tuple{FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{typeof(DiffEqFlux.applychain), Tuple{}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, DiffEqFlux.var"#270#back#146"{DiffEqFlux.var"#FastDense_adjoint#145"{FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{}}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, Zygote.var"#2145#back#281"{Zygote.var"#277#280"}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.ZBack{Zygote.var"#convert_pullback#330"}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}}}, Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, Tuple{Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Bool}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1350"{Int64, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:bias, Zygote.Context{false}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Bool}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:in, Zygote.Context{false}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:out, Zygote.Context{false}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, Tuple{Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Bool}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1350"{Int64, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:bias, Zygote.Context{false}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Bool}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:in, Zygote.Context{false}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:out, Zygote.Context{false}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}}}, Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Int64}}}}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, DiffEqFlux.var"#270#back#146"{DiffEqFlux.var"#FastDense_adjoint#145"{FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing}}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, Zygote.var"#2145#back#281"{Zygote.var"#277#280"}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.ZBack{Zygote.var"#convert_pullback#330"}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}}}, Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, Tuple{Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Bool}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1350"{Int64, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:bias, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Bool}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:in, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:out, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, Tuple{Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Bool}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1350"{Int64, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:bias, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Bool}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:in, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:out, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}}}, Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Int64}}}}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, DiffEqFlux.var"#270#back#146"{DiffEqFlux.var"#FastDense_adjoint#145"{FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing, Nothing}}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, Zygote.var"#2145#back#281"{Zygote.var"#277#280"}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing, Nothing}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing, Nothing}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.ZBack{Zygote.var"#convert_pullback#330"}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}}}, Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, Tuple{Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Bool}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1350"{Int64, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:bias, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Bool}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:in, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:out, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, Tuple{Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Bool}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1350"{Int64, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:bias, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Bool}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:in, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:out, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}}}, Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Int64}}}}})(?::Zygote.OneElement{Float32, 1, Tuple{Int64}, Tuple{Base.OneTo{Int64}}})
    @ Zygote ~/.julia/packages/Zygote/XJ8pP/src/compiler/interface2.jl:0
 [28] Pullback
    @ ~/.julia/packages/DiffEqFlux/2IJEZ/src/fast_layers.jl:20 [inlined]
 [29] (::Zygote.Pullback{Tuple{typeof(DiffEqFlux.applychain), Tuple{var"#3#4", FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{typeof(DiffEqFlux.applychain), Tuple{FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{typeof(DiffEqFlux.applychain), Tuple{FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{typeof(DiffEqFlux.applychain), Tuple{FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{typeof(DiffEqFlux.applychain), Tuple{}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, DiffEqFlux.var"#270#back#146"{DiffEqFlux.var"#FastDense_adjoint#145"{FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{}}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, Zygote.var"#2145#back#281"{Zygote.var"#277#280"}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{}}}, 

Here is the second part :

Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.ZBack{Zygote.var"#convert_pullback#330"}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}}}, Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, Tuple{Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Bool}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1350"{Int64, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:bias, Zygote.Context{false}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Bool}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:in, Zygote.Context{false}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:out, Zygote.Context{false}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, Tuple{Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Bool}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1350"{Int64, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:bias, Zygote.Context{false}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Bool}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:in, Zygote.Context{false}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:out, Zygote.Context{false}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}}}, Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Int64}}}}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, DiffEqFlux.var"#270#back#146"{DiffEqFlux.var"#FastDense_adjoint#145"{FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing}}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, Zygote.var"#2145#back#281"{Zygote.var"#277#280"}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.ZBack{Zygote.var"#convert_pullback#330"}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}}}, Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, Tuple{Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Bool}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1350"{Int64, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:bias, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Bool}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:in, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:out, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, Tuple{Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Bool}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1350"{Int64, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:bias, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Bool}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:in, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:out, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}}}, Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Int64}}}}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, DiffEqFlux.var"#270#back#146"{DiffEqFlux.var"#FastDense_adjoint#145"{FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing, Nothing}}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, Zygote.var"#2145#back#281"{Zygote.var"#277#280"}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing, Nothing}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing, Nothing}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.ZBack{Zygote.var"#convert_pullback#330"}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}}}, Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, Tuple{Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Bool}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1350"{Int64, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:bias, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Bool}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:in, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:out, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, Tuple{Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Bool}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1350"{Int64, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:bias, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Bool}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:in, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:out, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}}}, Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Int64}}}}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, Zygote.Pullback{Tuple{var"#3#4", Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.ZBack{ChainRules.var"#vect_pullback#1373"{1, Tuple{ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}}}}, Zygote.ZBack{ChainRules.var"#cos_pullback#1311"{Float32}}, Zygote.Pullback{Tuple{typeof(Zygote.literal_getindex), Float32, Val{1}}, Tuple{Zygote.Pullback{Tuple{typeof(getindex), Float32, Int64}, Any}}}, Zygote.Pullback{Tuple{typeof(gpu), Vector{Float32}}, Tuple{Zygote.Pullback{Tuple{typeof(gpu), Flux.FluxCUDAAdaptor, Vector{Float32}}, Any}, Zygote.Pullback{Tuple{Type{Flux.FluxCUDAAdaptor}}, Tuple{}}}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing, Nothing, Nothing}}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, Zygote.var"#2145#back#281"{Zygote.var"#277#280"}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing, Nothing, Nothing}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing, Nothing, Nothing}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.ZBack{Zygote.var"#convert_pullback#330"}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}}}, Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), var"#3#4"}, Tuple{}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), var"#3#4"}, Tuple{}}, Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Int64}}}}})(?::Zygote.OneElement{Float32, 1, Tuple{Int64}, Tuple{Base.OneTo{Int64}}})
    @ Zygote ~/.julia/packages/Zygote/XJ8pP/src/compiler/interface2.jl:0
[30] Pullback
    @ ~/.julia/packages/DiffEqFlux/2IJEZ/src/fast_layers.jl:21 [inlined]
 [31] (::Zygote.Pullback{Tuple{FastChain{Tuple{var"#3#4", FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}}, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{typeof(DiffEqFlux.applychain), Tuple{var"#3#4", FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{typeof(DiffEqFlux.applychain), Tuple{FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{typeof(DiffEqFlux.applychain), Tuple{FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{typeof(DiffEqFlux.applychain), Tuple{FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{typeof(DiffEqFlux.applychain), Tuple{}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, DiffEqFlux.var"#270#back#146"{DiffEqFlux.var"#FastDense_adjoint#145"{FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{}}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, Zygote.var"#2145#back#281"{Zygote.var"#277#280"}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.ZBack{Zygote.var"#convert_pullback#330"}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}}}, Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, Tuple{Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Bool}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1350"{Int64, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:bias, Zygote.Context{false}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Bool}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:in, Zygote.Context{false}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:out, Zygote.Context{false}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, Tuple{Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Bool}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1350"{Int64, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:bias, Zygote.Context{false}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Bool}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:in, Zygote.Context{false}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:out, Zygote.Context{false}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}}}, Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Int64}}}}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, DiffEqFlux.var"#270#back#146"{DiffEqFlux.var"#FastDense_adjoint#145"{FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing}}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, Zygote.var"#2145#back#281"{Zygote.var"#277#280"}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.ZBack{Zygote.var"#convert_pullback#330"}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}}}, Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, Tuple{Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Bool}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1350"{Int64, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:bias, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Bool}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:in, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:out, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, Tuple{Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Bool}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1350"{Int64, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:bias, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Bool}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:in, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:out, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}}}, Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Int64}}}}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, DiffEqFlux.var"#270#back#146"{DiffEqFlux.var"#FastDense_adjoint#145"{FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing, Nothing}}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, Zygote.var"#2145#back#281"{Zygote.var"#277#280"}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing, Nothing}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing, Nothing}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.ZBack{Zygote.var"#convert_pullback#330"}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}}}, Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, Tuple{Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Bool}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1350"{Int64, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:bias, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Bool}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:in, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:out, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}, Tuple{Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Bool}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1350"{Int64, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:bias, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Bool}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:in, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:out, Zygote.Context{false}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, Int64}}}}, Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Int64}}}}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, Zygote.Pullback{Tuple{var"#3#4", Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.ZBack{ChainRules.var"#vect_pullback#1373"{1, Tuple{ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}}}}, Zygote.ZBack{ChainRules.var"#cos_pullback#1311"{Float32}}, Zygote.Pullback{Tuple{typeof(Zygote.literal_getindex), Float32, Val{1}}, Tuple{Zygote.Pullback{Tuple{typeof(getindex), Float32, Int64}, Any}}}, Zygote.Pullback{Tuple{typeof(gpu), Vector{Float32}}, Tuple{Zygote.Pullback{Tuple{typeof(gpu), Flux.FluxCUDAAdaptor, Vector{Float32}}, Any}, Zygote.Pullback{Tuple{Type{Flux.FluxCUDAAdaptor}}, Tuple{}}}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing, Nothing, Nothing}}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, Zygote.var"#2145#back#281"{Zygote.var"#277#280"}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing, Nothing, Nothing}}}, Zygote.var"#2129#back#273"{Zygote.var"#268#272"{Tuple{Nothing, Nothing, Nothing}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.ZBack{Zygote.var"#convert_pullback#330"}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}}}, Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), var"#3#4"}, Tuple{}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.Pullback{Tuple{typeof(DiffEqFlux.paramlength), var"#3#4"}, Tuple{}}, Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Int64, Int64}}}}}, Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:layers, Zygote.Context{false}, FastChain{Tuple{var"#3#4", FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}}, Tuple{var"#3#4", FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, FastDense{typeof(relu), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}, FastDense{typeof(identity), DiffEqFlux.var"#initial_params#135"{Vector{Float32}}, Nothing}}}}}})(?::Zygote.OneElement{Float32, 1, Tuple{Int64}, Tuple{Base.OneTo{Int64}}})
    @ Zygote ~/.julia/packages/Zygote/XJ8pP/src/compiler/interface2.jl:0
 [32] Pullback
    @ ./REPL[31]:12 [inlined]
 [33] (::Zygote.Pullback{Tuple{typeof(NN_output), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Matrix{Float32}, Matrix{Float32}}, Any})(?::Nothing)
    @ Zygote ~/.julia/packages/Zygote/XJ8pP/src/compiler/interface2.jl:0
 [34] Pullback
    @ ./REPL[32]:5 [inlined]
 [35] (::Zygote.Pullback{Tuple{typeof(l13), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.var"#3834#back#1221"{Zygote.var"#1217#1220"{Float32, Int64}}, ComposedFunction{Zygote.Pullback{Tuple{Zygote.var"#1461#1462", typeof(abs2), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.var"#4229#back#1457"{Zygote.var"#1453#1456"{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Zygote.Pullback{Tuple{typeof(Base.Broadcast.materialize), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{}}, Zygote.var"#4010#back#1303"{Zygote.var"#1299#1302"{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}}, typeof(ZygoteRules.unthunk_tangent)}, Zygote.Pullback{Tuple{typeof(Base.Broadcast.materialize), Float32}, Tuple{}}, Zygote.var"#1990#back#194"{Zygote.var"#190#193"{Zygote.Context{false}, GlobalRef, Matrix{Float32}}}, Zygote.var"#3802#back#1205"{Zygote.var"#1201#1204"{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Zygote.Pullback{Tuple{typeof(NN_output), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Matrix{Float32}, Matrix{Float32}}, Any}, Zygote.Pullback{Tuple{typeof(Base.Broadcast.materialize), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{}}, Zygote.var"#2033#back#213"{Zygote.var"#back#211"{2, 1, Zygote.Context{false}, Vector{Float32}}}, Zygote.var"#1990#back#194"{Zygote.var"#190#193"{Zygote.Context{false}, GlobalRef, Matrix{Float32}}}, Zygote.var"#1990#back#194"{Zygote.var"#190#193"{Zygote.Context{false}, GlobalRef, Matrix{Float32}}}, Zygote.Pullback{Tuple{typeof(NN_output), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Matrix{Float32}, Matrix{Float32}}, Any}, Zygote.var"#1990#back#194"{Zygote.var"#190#193"{Zygote.Context{false}, GlobalRef, Int64}}, Zygote.var"#2033#back#213"{Zygote.var"#back#211"{2, 2, Zygote.Context{false}, Vector{Float32}}}, Zygote.var"#1990#back#194"{Zygote.var"#190#193"{Zygote.Context{false}, GlobalRef, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Zygote.Pullback{Tuple{typeof(|>), Vector{Float32}, typeof(gpu)}, Tuple{Zygote.Pullback{Tuple{typeof(gpu), Vector{Float32}}, Tuple{Zygote.Pullback{Tuple{typeof(gpu), Flux.FluxCUDAAdaptor, Vector{Float32}}, Any}, Zygote.Pullback{Tuple{Type{Flux.FluxCUDAAdaptor}}, Tuple{}}}}}}, Zygote.var"#1990#back#194"{Zygote.var"#190#193"{Zygote.Context{false}, GlobalRef, Matrix{Float32}}}, Zygote.Pullback{Tuple{typeof(|>), Vector{Float32}, typeof(gpu)}, Tuple{Zygote.Pullback{Tuple{typeof(gpu), Vector{Float32}}, Tuple{Zygote.Pullback{Tuple{typeof(gpu), Flux.FluxCUDAAdaptor, Vector{Float32}}, Any}, Zygote.Pullback{Tuple{Type{Flux.FluxCUDAAdaptor}}, Tuple{}}}}}}}})(?::Float32)
    @ Zygote ~/.julia/packages/Zygote/XJ8pP/src/compiler/interface2.jl:0
 [36] (::Zygote.var"#75#76"{Zygote.Pullback{Tuple{typeof(l13), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.var"#3834#back#1221"{Zygote.var"#1217#1220"{Float32, Int64}}, ComposedFunction{Zygote.Pullback{Tuple{Zygote.var"#1461#1462", typeof(abs2), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.var"#4229#back#1457"{Zygote.var"#1453#1456"{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Zygote.Pullback{Tuple{typeof(Base.Broadcast.materialize), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{}}, Zygote.var"#4010#back#1303"{Zygote.var"#1299#1302"{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}}, typeof(ZygoteRules.unthunk_tangent)}, Zygote.Pullback{Tuple{typeof(Base.Broadcast.materialize), Float32}, Tuple{}}, Zygote.var"#1990#back#194"{Zygote.var"#190#193"{Zygote.Context{false}, GlobalRef, Matrix{Float32}}}, Zygote.var"#3802#back#1205"{Zygote.var"#1201#1204"{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Zygote.Pullback{Tuple{typeof(NN_output), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Matrix{Float32}, Matrix{Float32}}, Any}, Zygote.Pullback{Tuple{typeof(Base.Broadcast.materialize), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{}}, Zygote.var"#2033#back#213"{Zygote.var"#back#211"{2, 1, Zygote.Context{false}, Vector{Float32}}}, Zygote.var"#1990#back#194"{Zygote.var"#190#193"{Zygote.Context{false}, GlobalRef, Matrix{Float32}}}, Zygote.var"#1990#back#194"{Zygote.var"#190#193"{Zygote.Context{false}, GlobalRef, Matrix{Float32}}}, Zygote.Pullback{Tuple{typeof(NN_output), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Matrix{Float32}, Matrix{Float32}}, Any}, Zygote.var"#1990#back#194"{Zygote.var"#190#193"{Zygote.Context{false}, GlobalRef, Int64}}, Zygote.var"#2033#back#213"{Zygote.var"#back#211"{2, 2, Zygote.Context{false}, Vector{Float32}}}, Zygote.var"#1990#back#194"{Zygote.var"#190#193"{Zygote.Context{false}, GlobalRef, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Zygote.Pullback{Tuple{typeof(|>), Vector{Float32}, typeof(gpu)}, Tuple{Zygote.Pullback{Tuple{typeof(gpu), Vector{Float32}}, Tuple{Zygote.Pullback{Tuple{typeof(gpu), Flux.FluxCUDAAdaptor, Vector{Float32}}, Any}, Zygote.Pullback{Tuple{Type{Flux.FluxCUDAAdaptor}}, Tuple{}}}}}}, Zygote.var"#1990#back#194"{Zygote.var"#190#193"{Zygote.Context{false}, GlobalRef, Matrix{Float32}}}, Zygote.Pullback{Tuple{typeof(|>), Vector{Float32}, typeof(gpu)}, Tuple{Zygote.Pullback{Tuple{typeof(gpu), Vector{Float32}}, Tuple{Zygote.Pullback{Tuple{typeof(gpu), Flux.FluxCUDAAdaptor, Vector{Float32}}, Any}, Zygote.Pullback{Tuple{Type{Flux.FluxCUDAAdaptor}}, Tuple{}}}}}}}}})(?::Float32)
    @ Zygote ~/.julia/packages/Zygote/XJ8pP/src/compiler/interface.jl:45
 [37] gradient(f::Function, args::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
    @ Zygote ~/.julia/packages/Zygote/XJ8pP/src/compiler/interface.jl:97
 [38] top-level scope
    @ REPL[33]:1
 [39] top-level scope
    @ ~/.julia/packages/CUDA/ZdCxS/src/initialization.jl:155

This leads me to believe that something is wrong with the neural network itself. Would I be accurate in concluding that?
Thank you once again!

DiffEqFlux.var"#initial_params#135"{Vector{Float32}}

Your u0 is a CPU array; although I’m unfamiliar with DiffEq I guess your prob needs to be constructed with GPU arrays.

I converted u0 into a GPU array and constructed prob again, but that leads me to the same error while computing soln1.
Could the DiffEqFlux.var"#initial_params#135"{Vector{Float32}} point to the parameters of the neural network itself?

Did you also convert model_params, the other CPU vector?
In any case, this looks more like an issue with the use of DiffEq or Flux, so you may be better off asking people who are familiar with those packages.

You can try converting the parameters of NN2_gpu to gpu arrays like so:

NN2_gpu = FastChain((x, p) -> [cos(x[1])],
  FastDense(1, 32, relu),
  FastDense(32, 32, relu),
  FastDense(32, 2)) |> gpu

i.e. you didn’t transfer the parameters of FastDense (and the FastChain as a whole) to gpu. There might be other problems but I would start there.

I did that as well, but faced a similar error. Although I have been able to replicate the error while using Lux.jl as well. With that package, I was able to narrow down the error to a function in my code that computed the second derivatives. Here is an example of the error along with the stack trace :

using Zygote
using CUDA
function d2_dt2(v::AbstractVector, dt)
    a = 2 * v[1] - 5 * v[2] + 4 * v[3] - v[4]
    b = v[1:(end - 2)] .- 2 * v[2:(end - 1)] .+ v[3:end]
    c = 2 * v[end] - 5 * v[end - 1] + 4 * v[end - 2] - v[end - 3]
    return [a; b; c] / (dt^2)
end
v = rand(10)  # A random vector of length 10
dt = 0.1      # A sample time step
v_gpu = cu(v)  # Convert to a CUDA array
grad_test_gpu = gradient(v -> sum(d2_dt2(v, dt)), v_gpu)  # Gradient computation with GPU array

with this setup, I encounter the following error with the stack trace :

ERROR: GPU compilation of kernel #broadcast_kernel#26(CUDA.CuKernelContext, CuDeviceVector{Float32, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, typeof(Zygote.accum), Tuple{Base.Broadcast.Extruded{CuDeviceVector{Float32, 1}, Tuple{Bool}, Tuple{Int64}}, Base.Broadcast.Extruded{Vector{Float32}, Tuple{Bool}, Tuple{Int64}}}}, Int64) failed
KernelError: passing and using non-bitstype argument

Argument 4 to your kernel function is of type Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, typeof(Zygote.accum), Tuple{Base.Broadcast.Extruded{CuDeviceVector{Float32, 1}, Tuple{Bool}, Tuple{Int64}}, Base.Broadcast.Extruded{Vector{Float32}, Tuple{Bool}, Tuple{Int64}}}}, which is not isbits:
  .args is of type Tuple{Base.Broadcast.Extruded{CuDeviceVector{Float32, 1}, Tuple{Bool}, Tuple{Int64}}, Base.Broadcast.Extruded{Vector{Float32}, Tuple{Bool}, Tuple{Int64}}} which is not isbits.
    .2 is of type Base.Broadcast.Extruded{Vector{Float32}, Tuple{Bool}, Tuple{Int64}} which is not isbits.
      .x is of type Vector{Float32} which is not isbits.


Stacktrace:
  [1] check_invocation(job::GPUCompiler.CompilerJob)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/S3TWf/src/validation.jl:88
  [2] macro expansion
    @ ~/.julia/packages/GPUCompiler/S3TWf/src/driver.jl:154 [inlined]
  [3] macro expansion
    @ ~/.julia/packages/TimerOutputs/RsWnF/src/TimerOutput.jl:253 [inlined]
  [4] macro expansion
    @ ~/.julia/packages/GPUCompiler/S3TWf/src/driver.jl:152 [inlined]
  [5] emit_julia(job::GPUCompiler.CompilerJob; validate::Bool)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/S3TWf/src/utils.jl:83
  [6] emit_julia
    @ ~/.julia/packages/GPUCompiler/S3TWf/src/utils.jl:77 [inlined]
  [7] cufunction_compile(job::GPUCompiler.CompilerJob, ctx::LLVM.ThreadSafeContext)
    @ CUDA ~/.julia/packages/CUDA/ZdCxS/src/compiler/execution.jl:359
  [8] #221
    @ ~/.julia/packages/CUDA/ZdCxS/src/compiler/execution.jl:354 [inlined]
  [9] LLVM.ThreadSafeContext(f::CUDA.var"#221#222"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{GPUArrays.var"#broadcast_kernel#26", Tuple{CUDA.CuKernelContext, CuDeviceVector{Float32, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, typeof(Zygote.accum), Tuple{Base.Broadcast.Extruded{CuDeviceVector{Float32, 1}, Tuple{Bool}, Tuple{Int64}}, Base.Broadcast.Extruded{Vector{Float32}, Tuple{Bool}, Tuple{Int64}}}}, Int64}}}})
    @ LLVM ~/.julia/packages/LLVM/HykgZ/src/executionengine/ts_module.jl:14
 [10] JuliaContext(f::CUDA.var"#221#222"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{GPUArrays.var"#broadcast_kernel#26", Tuple{CUDA.CuKernelContext, CuDeviceVector{Float32, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, typeof(Zygote.accum), Tuple{Base.Broadcast.Extruded{CuDeviceVector{Float32, 1}, Tuple{Bool}, Tuple{Int64}}, Base.Broadcast.Extruded{Vector{Float32}, Tuple{Bool}, Tuple{Int64}}}}, Int64}}}})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/S3TWf/src/driver.jl:74
 [11] cufunction_compile(job::GPUCompiler.CompilerJob)
    @ CUDA ~/.julia/packages/CUDA/ZdCxS/src/compiler/execution.jl:353
 [12] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/S3TWf/src/cache.jl:90
 [13] cufunction(f::GPUArrays.var"#broadcast_kernel#26", tt::Type{Tuple{CUDA.CuKernelContext, CuDeviceVector{Float32, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Tuple{Base.OneTo{Int64}}, typeof(Zygote.accum), Tuple{Base.Broadcast.Extruded{CuDeviceVector{Float32, 1}, Tuple{Bool}, Tuple{Int64}}, Base.Broadcast.Extruded{Vector{Float32}, Tuple{Bool}, Tuple{Int64}}}}, Int64}}; name::Nothing, always_inline::Bool, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ CUDA ~/.julia/packages/CUDA/ZdCxS/src/compiler/execution.jl:306
 [14] cufunction
    @ ~/.julia/packages/CUDA/ZdCxS/src/compiler/execution.jl:299 [inlined]
 [15] macro expansion
    @ ~/.julia/packages/CUDA/ZdCxS/src/compiler/execution.jl:102 [inlined]
 [16] #launch_heuristic#245
    @ ~/.julia/packages/CUDA/ZdCxS/src/gpuarrays.jl:17 [inlined]
 [17] launch_heuristic
    @ ~/.julia/packages/CUDA/ZdCxS/src/gpuarrays.jl:15 [inlined]
 [18] _copyto!
    @ ~/.julia/packages/GPUArrays/t0LfC/src/host/broadcast.jl:65 [inlined]
 [19] materialize!
    @ ~/.julia/packages/GPUArrays/t0LfC/src/host/broadcast.jl:41 [inlined]
 [20] materialize!
    @ ./broadcast.jl:881 [inlined]
 [21] (::Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}})(dy::Vector{Float32})
    @ Zygote ~/.julia/packages/Zygote/XJ8pP/src/lib/array.jl:57
 [22] (::Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}})(Δ::Vector{Float32})
    @ Zygote ~/.julia/packages/ZygoteRules/OgCVT/src/adjoint.jl:71
 [23] Pullback
    @ ./REPL[110]:3 [inlined]
 [24] (::Zygote.Pullback{Tuple{typeof(d2_dt2), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Float64}, Tuple{Zygote.Pullback{Tuple{typeof(Base.Broadcast.materialize), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}, Zygote.ZBack{Zygote.var"#convert_pullback#330"}}}, Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}}}, Zygote.ZBack{ChainRules.var"#vcat_pullback#1412"{Tuple{ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{AbstractArray, NamedTuple{(:element, :axes), Tuple{ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}, Tuple{Base.OneTo{Int64}}}}}, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}}, Tuple{Tuple{}, Tuple{Int64}, Tuple{}}, Val{1}}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}, Zygote.ZBack{Zygote.var"#convert_pullback#330"}}}, Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}}}, Zygote.var"#3786#back#1197"{Zygote.var"#1191#1195"{Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}, Zygote.Pullback{Tuple{typeof(Zygote.literal_getindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Val{2}}, Tuple{Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Int64}}}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1346"{Int64, Float32}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Int64}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.ZBack{ChainRules.var"#slash_pullback_scalar#1580"{Vector{Float32}, Float64}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1346"{Int64, Float32}}, Zygote.ZBack{Zygote.var"#literal_pow_pullback#331"{2, Float64}}, Zygote.var"#3802#back#1205"{Zygote.var"#1201#1204"{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Zygote.Pullback{Tuple{typeof(|>), Vector{Float64}, typeof(gpu)}, Tuple{Zygote.Pullback{Tuple{typeof(gpu), Vector{Float64}}, Any}}}, Zygote.ZBack{ChainRules.var"#times_pullback#1498"{Int64, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, ChainRulesCore.ProjectTo{AbstractArray, NamedTuple{(:element, :axes), Tuple{ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}, Tuple{Base.OneTo{Int64}}}}}, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}}}, Zygote.var"#1926#back#161"{Zygote.var"#157#160"}, Zygote.ZBack{ChainRules.var"#times_pullback2#1346"{Int64, Float32}}, Zygote.Pullback{Tuple{typeof(Zygote.literal_getindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Val{1}}, Tuple{Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Int64}}}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}, Zygote.ZBack{Zygote.var"#convert_pullback#330"}}}, Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Int64}}}, Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Float32, Float32}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1346"{Int64, Float32}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Int64}}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, Zygote.Pullback{Tuple{typeof(Zygote.literal_getindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Val{4}}, Tuple{Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Int64}}}}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}, Zygote.ZBack{Zygote.var"#convert_pullback#330"}}}, Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1346"{Int64, Float32}}, Zygote.Pullback{Tuple{typeof(Zygote.literal_getindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Val{3}}, Tuple{Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Int64}}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Int64}}}, Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Float32, Float32}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1346"{Int64, Float32}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}, Zygote.ZBack{Zygote.var"#convert_pullback#330"}}}, Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}, Zygote.ZBack{Zygote.var"#convert_pullback#330"}}}, Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}, Zygote.ZBack{Zygote.var"#convert_pullback#330"}}}, Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}}}}})(Δ::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
    @ Zygote ~/.julia/packages/Zygote/XJ8pP/src/compiler/interface2.jl:0

Had to break it up into 2 again. This was the first part.

Here is the second part :

[25] Pullback
    @ ./REPL[112]:1 [inlined]
 [26] (::Zygote.Pullback{Tuple{var"#17#18", CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.var"#1990#back#194"{Zygote.var"#190#193"{Zygote.Context{false}, GlobalRef, Float64}}, Zygote.var"#4229#back#1457"{Zygote.var"#1453#1456"{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Zygote.Pullback{Tuple{typeof(d2_dt2), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Float64}, Tuple{Zygote.Pullback{Tuple{typeof(Base.Broadcast.materialize), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}, Zygote.ZBack{Zygote.var"#convert_pullback#330"}}}, Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}}}, Zygote.ZBack{ChainRules.var"#vcat_pullback#1412"{Tuple{ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{AbstractArray, NamedTuple{(:element, :axes), Tuple{ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}, Tuple{Base.OneTo{Int64}}}}}, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}}, Tuple{Tuple{}, Tuple{Int64}, Tuple{}}, Val{1}}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}, Zygote.ZBack{Zygote.var"#convert_pullback#330"}}}, Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}}}, Zygote.var"#3786#back#1197"{Zygote.var"#1191#1195"{Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}, Zygote.Pullback{Tuple{typeof(Zygote.literal_getindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Val{2}}, Tuple{Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Int64}}}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1346"{Int64, Float32}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Int64}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.ZBack{ChainRules.var"#slash_pullback_scalar#1580"{Vector{Float32}, Float64}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1346"{Int64, Float32}}, Zygote.ZBack{Zygote.var"#literal_pow_pullback#331"{2, Float64}}, Zygote.var"#3802#back#1205"{Zygote.var"#1201#1204"{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Zygote.Pullback{Tuple{typeof(|>), Vector{Float64}, typeof(gpu)}, Tuple{Zygote.Pullback{Tuple{typeof(gpu), Vector{Float64}}, Any}}}, Zygote.ZBack{ChainRules.var"#times_pullback#1498"{Int64, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, ChainRulesCore.ProjectTo{AbstractArray, NamedTuple{(:element, :axes), Tuple{ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}, Tuple{Base.OneTo{Int64}}}}}, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}}}, Zygote.var"#1926#back#161"{Zygote.var"#157#160"}, Zygote.ZBack{ChainRules.var"#times_pullback2#1346"{Int64, Float32}}, Zygote.Pullback{Tuple{typeof(Zygote.literal_getindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Val{1}}, Tuple{Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Int64}}}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}, Zygote.ZBack{Zygote.var"#convert_pullback#330"}}}, Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Int64}}}, Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Float32, Float32}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1346"{Int64, Float32}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Int64}}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, Zygote.Pullback{Tuple{typeof(Zygote.literal_getindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Val{4}}, Tuple{Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Int64}}}}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}, Zygote.ZBack{Zygote.var"#convert_pullback#330"}}}, Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1346"{Int64, Float32}}, Zygote.Pullback{Tuple{typeof(Zygote.literal_getindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Val{3}}, Tuple{Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Int64}}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Int64}}}, Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Float32, Float32}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1346"{Int64, Float32}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}, Zygote.ZBack{Zygote.var"#convert_pullback#330"}}}, Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}, Zygote.ZBack{Zygote.var"#convert_pullback#330"}}}, Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}, Zygote.ZBack{Zygote.var"#convert_pullback#330"}}}, Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}}}}}}})(Δ::Float32)
    @ Zygote ~/.julia/packages/Zygote/XJ8pP/src/compiler/interface2.jl:0
 [27] (::Zygote.var"#75#76"{Zygote.Pullback{Tuple{var"#17#18", CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.var"#1990#back#194"{Zygote.var"#190#193"{Zygote.Context{false}, GlobalRef, Float64}}, Zygote.var"#4229#back#1457"{Zygote.var"#1453#1456"{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Zygote.Pullback{Tuple{typeof(d2_dt2), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Float64}, Tuple{Zygote.Pullback{Tuple{typeof(Base.Broadcast.materialize), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}, Zygote.ZBack{Zygote.var"#convert_pullback#330"}}}, Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}}}, Zygote.ZBack{ChainRules.var"#vcat_pullback#1412"{Tuple{ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{AbstractArray, NamedTuple{(:element, :axes), Tuple{ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}, Tuple{Base.OneTo{Int64}}}}}, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}}, Tuple{Tuple{}, Tuple{Int64}, Tuple{}}, Val{1}}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}, Zygote.ZBack{Zygote.var"#convert_pullback#330"}}}, Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}}}, Zygote.var"#3786#back#1197"{Zygote.var"#1191#1195"{Tuple{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}, Zygote.Pullback{Tuple{typeof(Zygote.literal_getindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Val{2}}, Tuple{Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Int64}}}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1346"{Int64, Float32}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Int64}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.ZBack{ChainRules.var"#slash_pullback_scalar#1580"{Vector{Float32}, Float64}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1346"{Int64, Float32}}, Zygote.ZBack{Zygote.var"#literal_pow_pullback#331"{2, Float64}}, Zygote.var"#3802#back#1205"{Zygote.var"#1201#1204"{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, Zygote.Pullback{Tuple{typeof(|>), Vector{Float64}, typeof(gpu)}, Tuple{Zygote.Pullback{Tuple{typeof(gpu), Vector{Float64}}, Any}}}, Zygote.ZBack{ChainRules.var"#times_pullback#1498"{Int64, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, ChainRulesCore.ProjectTo{AbstractArray, NamedTuple{(:element, :axes), Tuple{ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}, Tuple{Base.OneTo{Int64}}}}}, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}}}, Zygote.var"#1926#back#161"{Zygote.var"#157#160"}, Zygote.ZBack{ChainRules.var"#times_pullback2#1346"{Int64, Float32}}, Zygote.Pullback{Tuple{typeof(Zygote.literal_getindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Val{1}}, Tuple{Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Int64}}}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}, Zygote.ZBack{Zygote.var"#convert_pullback#330"}}}, Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Int64}}}, Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Float32, Float32}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1346"{Int64, Float32}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Int64}}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, Zygote.Pullback{Tuple{typeof(Zygote.literal_getindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Val{4}}, Tuple{Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Int64}}}}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}, Zygote.ZBack{Zygote.var"#convert_pullback#330"}}}, Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1346"{Int64, Float32}}, Zygote.Pullback{Tuple{typeof(Zygote.literal_getindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Val{3}}, Tuple{Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Int64}}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{Int64}}}, Zygote.ZBack{Zygote.var"#plus_pullback#345"{Tuple{Float32, Float32}}}, Zygote.ZBack{ChainRules.var"#times_pullback2#1346"{Int64, Float32}}, Zygote.ZBack{ChainRules.var"#:_pullback#276"{Tuple{Int64, Int64}}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float32, NamedTuple{(), Tuple{}}}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}, Zygote.ZBack{Zygote.var"#convert_pullback#330"}}}, Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}}}, Zygote.ZBack{ChainRules.var"#-_pullback#1335"{Int64, Bool, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}, ChainRulesCore.ProjectTo{Float64, NamedTuple{(), Tuple{}}}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}, Zygote.ZBack{Zygote.var"#convert_pullback#330"}}}, Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}}}, Zygote.var"#2610#back#533"{Zygote.var"#543#545"{1, Float32, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Tuple{UnitRange{Int64}}}}, Zygote.Pullback{Tuple{typeof(lastindex), CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Tuple{Zygote.Pullback{Tuple{Type{IndexLinear}}, Tuple{}}, Zygote.Pullback{Tuple{typeof(last), Base.OneTo{Int64}}, Tuple{Zygote.var"#2184#back#303"{Zygote.var"#back#302"{:stop, Zygote.Context{false}, Base.OneTo{Int64}, Int64}}, Zygote.ZBack{Zygote.var"#convert_pullback#330"}}}, Zygote.ZBack{ChainRules.var"#eachindex_pullback#376"{Tuple{IndexLinear, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}}}}}}}}})(Δ::Float32)
    @ Zygote ~/.julia/packages/Zygote/XJ8pP/src/compiler/interface.jl:45
 [28] gradient(f::Function, args::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
    @ Zygote ~/.julia/packages/Zygote/XJ8pP/src/compiler/interface.jl:97
 [29] top-level scope
    @ REPL[112]:1
 [30] top-level scope
    @ ~/.julia/packages/CUDA/ZdCxS/src/initialization.jl:155

I tried this but I believe this would not work due to the following discussion at this GitHub link.
I’ve tried to define the layers individually and then convert the parameters to the gpu but haven’t had luck there yet.