CUDA and estimation of parameters of a Differential Equation

I am trying to estimate three parameters of a ODE system with 368 equations couple. Since it is a big system I have decided to use CUDA as recommeded by other colleagues to speed up the process. Because the ODE is written matricially.
A toy and reproducible example is:

using DifferentialEquations
using DataFrames
using CSV
using Plots
using Shapefile
import GeoDataFrames as GDF
using LinearAlgebra
using ODE
using Interpolations
using DiffEqParamEstim
using SparseArrays
using CUDA
using ParameterizedFunctions
#using RecursiveArrayTools
using Optimization
using Distributions
loc = "/home/marta/"
loc = "U:/"

# Constants
m_c = 0.001 # probability of mosquito in a car
alp = 1/200 # average natural dispersal distance

# Create fake data
end_ind = 5

# Create fake mobility matrix
eta =  cu([0.0       7.74415   5.24014     8.57707     5.21319;
        6.9175    0.0       7.23771    45.2212     16.8025;
        6.17319   7.426     0.0        17.7236      5.49988;
        8.40241  44.21     20.1597      0.0      5309.46;
        5.8309   17.5283    5.36841  5226.36        0.0])

# Create fake distance matrix
dist = [00.0       0.586656  0.574806  0.407857  0.414613;
        0.586656  0.0       0.734197  0.677742  0.67704;
        0.574806  0.734197  0.0       0.649191  0.674846;
        0.407857  0.677742  0.649191  0.0       0.945872;
        0.414613  0.67704   0.674846  0.945872  0.0]

mat_ar = exp.(-alp .* dist)
mat[diagind(mat_ar)] .= 0 # Set to zero diagonal of distance
mat = cu(mat_ar)

# Choose number of patches and IC
pop_init_ar = zeros(end_ind)
pop_init_ar[1] = 1
pop_init = cu(pop_init_ar)

# Fake time series
num_years = 1
num_days = 365*num_years
days = 1:num_days
days_per_year = 365
dates = collect(1:num_days)

#  Fake seasonal
function fake_timeseries!(days ,days_per_year,num_days,dates)
    R_Ms = Matrix{Float64}(undef, num_days,end_ind)
    R_Ms[1:end,1] .= (30 .+ 10 * sin.(2 * π * days / days_per_year) + 2. * randn(num_days))./10
    R_Ms[1:end,2] .= (30 .+ 10 * sin.(2 * π * days / days_per_year) + 2. * randn(num_days))./30
    R_Ms[1:end,3] .= (30 .+ 10 * sin.(2 * π * days / days_per_year) + 2. * randn(num_days))./40
    R_Ms[1:end,4] .= (30 .+ 10 * sin.(2 * π * days / days_per_year) + 2. * randn(num_days))./10
    R_Ms[1:end,5] .= (30 .+ 10 * sin.(2 * π * days / days_per_year) + 2. * randn(num_days))./100
    
    # Perform interpolation for each location
    for i in 1:end_ind
        # Extract temperature values for the current location
        R_Ms_val = R_Ms[:, i]
        dates_num = 1:size(dates, 1)

        # Perform linear interpolation
        itp = LinearInterpolation(dates_num, R_Ms_val,extrapolation_bc=Flat())
        # Store the interpolated function
        push!(interpolated_functions, itp)

    end
end

# Create an array to store interpolated functions
interpolated_functions = []
fake_timeseries!(days ,days_per_year,num_days,dates)

plot(interpolated_functions[1:end_ind])

# Integration non autonomous
function fun_na!(du, u, p, t)
    R_M_vec = [interpolated_functions[i](t) for i in 1:end_ind] |> cu # Time series RM
    du.= R_M_vec.*(p[1].*mat*u .+  p[2].*(eta)*u) .* (1 .- u) - p[3] .* u
    nothing
end
  
# Set parameters
t0=0.0f0
tf=365.0f0
tspan = (t0, tf)
t_vect=1:tf
u0 = pop_init
p = [0.01,0.01,0.005]

# Create the ode model
prob = DifferentialEquations.ODEProblem(fun_na!, u0, tspan, p)
rtol=1e-14
alg= DP8() #For low tolerances, Rodas4()

# Test Integrate ODE
@time obs = DifferentialEquations.solve(prob,alg;p=p, saveat=10)
plot(obs)

# Create the vector of times of the observation. Assuming we observe in August 15 (or 16 depending onn leap years)
# t_obs = zeros(num_years)
# t_obs[1] = 227 
# for i = 2:length(t_obs)
#   t_obs[i] = t_obs[i-1] + 365
# end

# # Create fake obs if p(t)>0.5 then 1
# matrix_obs = zeros(end_ind,num_years)

# # Loop through unique id_mitma values
# for i in 1:end_ind
#     # Get years for the current id_mitma
#     for j in 1:num_years
#         prob = sol(t_obs[j])[i]
#         # Convert years to indices in the matrix
#         if (prob > 0.7)
#            matrix_obs[i,j:end] .= 1 
#         end
#     end
# end

# Fake observed data
t = collect(range(0, stop = tf, length = 40))
using RecursiveArrayTools # for VectorOfArray
randomized = VectorOfArray([(obs(t[i]) ) for i in 1:length(t)])
data = convert(Array, randomized)

using ForwardDiff, OptimizationOptimJL, OptimizationBBO
# Compute the cost function
@time cost_function = build_loss_objective(prob, Rodas5(), L2Loss(t, data),
 Optimization.AutoForwardDiff(), maxiters = 10000, verbose = false)

# Optimization
low_bound = [0,0,0]
up_bound = [0.1,0.1,0.1]
optprob = Optimization.OptimizationProblem(cost_function, [0.001,0.001,0.001],
lb = low_bound, ub = up_bound)
@time optsol = solve(optprob, BFGS())

And I get the following error:
ERROR: MethodError: no method matching geqrf!(::CuArray{ForwardDiff.Dual{ForwardDiff.Tag{…}, Float64, 3}, 2, CUDA.DeviceMemory})

Closest candidates are:
geqrf!(::StridedCuMatrix{ComplexF64})
@ CUDA C:\Users\pardoaraujo.julia\packages\CUDA\75aiI\lib\cusolver\dense.jl:172
geqrf!(::StridedCuMatrix{Float32}, ::CuArray{Float32, 1})
@ CUDA C:\Users\pardoaraujo.julia\packages\CUDA\75aiI\lib\cusolver\dense.jl:150
geqrf!(::StridedCuMatrix{Float32})
@ CUDA C:\Users\pardoaraujo.julia\packages\CUDA\75aiI\lib\cusolver\dense.jl:172

It almost certainly won’t speedup a code with those stats.