Issues with Convergence in Differential Equation Solver Using Dual Numbers

Okay, I have implemented DiffCache and tried some other solvers, but I am getting the following warning (with reltol = 1e-6):

┌ Warning: dt(2.2737367544323206e-13) <= dtmin(2.2737367544323206e-13) at t=2010.097388613536, and step error estimate = 52082.68508677296. Aborting. There is either an error in your model specification or the true solution is unstable.
└ @ SciMLBase ~/.julia/packages/SciMLBase/8XHkk/src/integrator_interface.jl:599

Also, when applying Rosenbrock23() as the solver method, I get the following error:

ERROR: First call to automatic differentiation for the Jacobian
failed. This means that the user `f` function is not compatible
with automatic differentiation. Methods to fix this include:

1. Turn off automatic differentiation (e.g. Rosenbrock23() becomes
   Rosenbrock23(autodiff=false)). More details can be found at
   https://docs.sciml.ai/DiffEqDocs/stable/features/performance_overloads/
2. Improving the compatibility of `f` with ForwardDiff.jl automatic 
   differentiation (using tools like PreallocationTools.jl). More details
   can be found at https://docs.sciml.ai/DiffEqDocs/stable/basics/faq/#Autodifferentiation-and-Dual-Numbers
3. Defining analytical Jacobians. More details can be
   found at https://docs.sciml.ai/DiffEqDocs/stable/types/ode_types/#SciMLBase.ODEFunction

Note: turning off automatic differentiation tends to have a very minimal
performance impact (for this use case, because it's forward mode for a
square Jacobian. This is different from optimization gradient scenarios).
However, one should be careful as some methods are more sensitive to
accurate gradients than others. Specifically, Rodas methods like `Rodas4`
and `Rodas5P` require accurate Jacobians in order to have good convergence,
while many other methods like BDF (`QNDF`, `FBDF`), SDIRK (`KenCarp4`),
and Rosenbrock-W (`Rosenbrock23`) do not. Thus if using an algorithm which
is sensitive to autodiff and solving at a low tolerance, please change the
algorithm as well.

And the following warning:

┌ Warning: The supplied DiffCache was too small and was enlarged. This incurs allocations
│     on the first call to `get_tmp`. If few calls to `get_tmp` occur and optimal performance is essential,
│     consider changing 'N'/chunk size of this DiffCache to 25.
└ @ PreallocationTools ~/.julia/packages/PreallocationTools/mJSsc/src/PreallocationTools.jl:151
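
As the warning itself suggests, the chunk size can be passed as the second argument when constructing a DiffCache. A minimal sketch, assuming the 25 reported above is the right chunk size for my solver configuration:

# Size the cache for the reported chunk size (assumed: 25) so that
# get_tmp does not have to enlarge it on the first call with Dual numbers
H̄ = DiffCache(model.iceflow.H̄, 25)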

All of this leads me to believe that my implementation is not correct. Here is what I did in my forward model:

iceflow_caches = (
        H̄ = DiffCache(model.iceflow.H̄),
        S = DiffCache(model.iceflow.S),
        D = DiffCache(model.iceflow.D),
        Dx = DiffCache(model.iceflow.Dx),
        Dy = DiffCache(model.iceflow.Dy),
        ∇S = DiffCache(model.iceflow.∇S),
        ∇Sx = DiffCache(model.iceflow.∇Sx),
        ∇Sy = DiffCache(model.iceflow.∇Sy),
        Fx = DiffCache(model.iceflow.Fx),
        Fy = DiffCache(model.iceflow.Fy),
        Fxx = DiffCache(model.iceflow.Fxx),
        Fyy = DiffCache(model.iceflow.Fyy),
        dSdx = DiffCache(model.iceflow.dSdx),
        dSdy = DiffCache(model.iceflow.dSdy),
        dSdx_edges = DiffCache(model.iceflow.dSdx_edges),
        dSdy_edges = DiffCache(model.iceflow.dSdy_edges)
    )

p = (simulation, iceflow_caches)

# Define problem to be solved
iceflow_prob = ODEProblem{true,SciMLBase.FullSpecialize}(du, model.iceflow.H, params.simulation.tspan, p; tstops=params.solver.tstops)
iceflow_sol = solve(iceflow_prob, 
                        params.solver.solver, 
                        callback=cb, 
                        tstops=params.solver.tstops, 
                        reltol=params.solver.reltol, 
                        save_everystep=params.solver.save_everystep, 
                        progress=params.solver.progress, 
                        progress_steps=params.solver.progress_steps)
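
For what it's worth, suggestion 1 from the error message would amount to swapping the solver argument, something like the sketch below — though I would rather fix the Dual-number compatibility than disable autodiff:

# Fall back to finite differences for the Jacobian, per suggestion 1
iceflow_sol = solve(iceflow_prob,
                    Rosenbrock23(autodiff=false);
                    callback=cb,
                    tstops=params.solver.tstops,
                    reltol=params.solver.reltol)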

function SIA2D!(dH::Matrix{<:Real}, H::Matrix{<:Real}, p, t::F) where {F <: Real}

    simulation = p[1]
    caches = p[2]
    
    # Retrieve parameters
    SIA2D_model::SIA2Dmodel = simulation.model.iceflow
    glacier::Sleipnir.Glacier2D = simulation.glaciers[simulation.model.iceflow.glacier_idx[]]
    params::Sleipnir.Parameters = simulation.parameters
    int_type = simulation.parameters.simulation.int_type
    H̄ = get_tmp(caches.H̄,H)
    A = SIA2D_model.A
    n = SIA2D_model.n
    B = glacier.B
    S = get_tmp(caches.S,H)
    dSdx = get_tmp(caches.dSdx,H)
    dSdy = get_tmp(caches.dSdy,H)
    D = get_tmp(caches.D,H)
    Dx = get_tmp(caches.Dx,H)
    Dy = get_tmp(caches.Dy,H)
    dSdx_edges = get_tmp(caches.dSdx_edges,H)
    dSdy_edges = get_tmp(caches.dSdy_edges,H)
    ∇S = get_tmp(caches.∇S,H)
    ∇Sx = get_tmp(caches.∇Sx,H)
    ∇Sy = get_tmp(caches.∇Sy,H)
    Fx = get_tmp(caches.Fx,H)
    Fy = get_tmp(caches.Fy,H)
    Fxx = get_tmp(caches.Fxx,H)
    Fyy = get_tmp(caches.Fyy,H)
    # F is the type of t, which can be a Dual number when the solver
    # differentiates in time, so these constants are left unannotated
    # to avoid pushing Duals into Float64 buffers and Refs
    Δx = glacier.Δx
    Δy = glacier.Δy
    Γ = SIA2D_model.Γ
    ρ = simulation.parameters.physical.ρ
    g = simulation.parameters.physical.g

    # First, enforce ice thickness to be positive; zero(x) keeps the element
    # type intact, which matters when H carries ForwardDiff Dual numbers
    map!(x -> ifelse(x > 0, x, zero(x)), H, H)
    
    # Update glacier surface altimetry
    S .= B .+ H

    # All grid variables computed in a staggered grid
    # Compute surface gradients on edges
    diff_x!(dSdx, S, Δx)  
    diff_y!(dSdy, S, Δy) 
    avg_y!(∇Sx, dSdx)
    avg_x!(∇Sy, dSdy)
    ∇S .= (∇Sx.^2 .+ ∇Sy.^2).^((n[] - 1)/2) 

    avg!(H̄, H)
    Γ[] = 2.0 * A[] * (ρ * g)^n[] / (n[]+2) # 1 / m^3 s 
    D .= Γ[] .* H̄.^(n[] + 2) .* ∇S

    # Compute flux components
    @views diff_x!(dSdx_edges, S[:,2:end - 1], Δx)
    @views diff_y!(dSdy_edges, S[2:end - 1,:], Δy)

    # Cap surface elevation differences with the upstream ice thickness to
    # impose the boundary condition of the SIA equation
    η₀ = params.physical.η₀
    dSdx_edges .= @views @. min(dSdx_edges,  η₀ * H[2:end, 2:end-1]/Δx)
    dSdx_edges .= @views @. max(dSdx_edges, -η₀ * H[1:end-1, 2:end-1]/Δx)
    dSdy_edges .= @views @. min(dSdy_edges,  η₀ * H[2:end-1, 2:end]/Δy)
    dSdy_edges .= @views @. max(dSdy_edges, -η₀ * H[2:end-1, 1:end-1]/Δy)

    avg_y!(Dx, D)
    avg_x!(Dy, D)
    Fx .= .-Dx .* dSdx_edges
    Fy .= .-Dy .* dSdy_edges 

    # Flux divergence (note: only the interior of dH is written here;
    # the boundary cells of dH are never touched in this function)
    diff_x!(Fxx, Fx, Δx)
    diff_y!(Fyy, Fy, Δy)
    inn(dH) .= .-(Fxx .+ Fyy)
end
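
To narrow down where the Dual incompatibility comes from, I believe the RHS can be exercised through ForwardDiff directly, the same way the solver's Jacobian computation does. A minimal sketch (the choice of t = 0.0 and the in-place jacobian form are my assumptions; the caches would simply be enlarged if the default chunk size differs):

using ForwardDiff

# Smoke test: differentiate the in-place RHS with Dual-valued H outside
# the solver, to reproduce the Jacobian failure in isolation
H0  = copy(model.iceflow.H)
dH0 = zero(H0)
J = ForwardDiff.jacobian((dH, H) -> SIA2D!(dH, H, p, 0.0), dH0, H0)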

Any idea what I could be doing wrong?

Edit: For completeness, here are some of the extra functions used above.


# Helper functions for the staggered grid

####  Non-allocating functions  ####

diff_x!(O, I, Δx) = @views @. O = (I[begin+1:end, :] - I[1:end-1, :]) / Δx

diff_y!(O, I, Δy) = @views @. O = (I[:, begin+1:end] - I[:, 1:end-1]) / Δy

avg!(O, I) = @views @. O = (I[1:end-1, 1:end-1] + I[2:end, 1:end-1] + I[1:end-1, 2:end] + I[2:end, 2:end]) * 0.25

avg_x!(O, I) = @views @. O = (I[1:end-1, :] + I[2:end, :]) * 0.5

avg_y!(O, I) = @views @. O = (I[:, 1:end-1] + I[:, 2:end]) * 0.5
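
For reference, these expect the output buffers (and hence the arrays wrapped in the DiffCaches above) to have specific sizes: for an nx × ny input, diff_x! writes into (nx-1) × ny, diff_y! into nx × (ny-1), and avg! into (nx-1) × (ny-1). A quick sanity check on a hypothetical 4×3 grid:

I  = rand(4, 3)
Ox = zeros(3, 3); diff_x!(Ox, I, 1.0)   # (nx-1) × ny
Oy = zeros(4, 2); diff_y!(Oy, I, 1.0)   # nx × (ny-1)
Oa = zeros(3, 2); avg!(Oa, I)           # (nx-1) × (ny-1)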

####  Allocating functions  ####

"""
    avg(A)

4-point average of a matrix
"""
@views avg(A) = 0.25 .* ( A[1:end-1,1:end-1] .+ A[2:end,1:end-1] .+ A[1:end-1,2:end] .+ A[2:end,2:end] )


"""
    avg_x(A)

2-point average of a matrix's X axis
"""
@views avg_x(A) = 0.5 .* ( A[1:end-1,:] .+ A[2:end,:] )

"""
    avg_y(A)

2-point average of a matrix's Y axis
"""
@views avg_y(A) = 0.5 .* ( A[:,1:end-1] .+ A[:,2:end] )

"""
    diff_x(A)

2-point difference along a matrix's X axis
"""
@views diff_x(A) = (A[begin + 1:end, :] .- A[1:end - 1, :])

"""
    diff_y(A)

2-point difference along a matrix's Y axis
"""
@views diff_y(A) = (A[:, begin + 1:end] .- A[:, 1:end - 1])

"""
    inn(A)

Access the inner part of the matrix (one row and column trimmed on each side, so each dimension shrinks by 2)
"""
@views inn(A) = A[2:end-1,2:end-1]

"""
    inn1(A)

Access the inner part of the matrix (last row and column trimmed, so each dimension shrinks by 1)
"""
@views inn1(A) = A[1:end-1,1:end-1]
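
For the record, on a hypothetical 5×5 matrix, inn returns a 3×3 view while inn1 returns a 4×4 view:

A = rand(5, 5)
size(inn(A))   # (3, 3)
size(inn1(A))  # (4, 4)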