Mismatched ForwardDiff and FiniteDifferences gradients with ImplicitDifferentiation

@gdalle I am attempting to use ImplicitDifferentiation with ForwardDiff to compute the gradient of an optimization problem. I’ve tried to reduce the problem to a minimal working example, but I have not come up with anything succinct. I’ve posted an abbreviated version of the problem below:

using UnPack
using FFTW: fftfreq, rfftfreq
using Optimization
using OptimizationOptimJL
using ImplicitDifferentiation
using ForwardDiff: gradient
using FiniteDifferences

function setup_grid_cache(mθ::Int,
    nζ::Int,
    pol_modes::AbstractVector,
    tor_modes::AbstractVector,
    Nfp::T) where {T}
    θ = [i for i in range(start = zero(T), step = 2π/(mθ - 1), length = mθ), _ in range(start = zero(T), step = 2π/(nζ - 1)/Nfp, length = nζ)]
    ζ = [j for _ in range(start = zero(T), step = 2π/(mθ - 1), length = mθ), j in range(start = zero(T), step = 2π/(nζ - 1)/Nfp, length = nζ)]
    return setup_grid_cache(θ, ζ, pol_modes, tor_modes, Nfp)
end

function setup_grid_cache(θ::AbstractArray,
    ζ::AbstractArray,
    pol_modes::AbstractVector,
    tor_modes::AbstractVector,
    Nfp::T) where {T}
    MN = length(pol_modes)
    cϕ = Matrix{T}(undef, MN, length(ζ))
    sϕ = similar(cϕ)
    ∂cϕ∂θ = similar(cϕ)
    ∂cϕ∂ζ = similar(cϕ)
    ∂²cϕ∂θ² = similar(cϕ)
    ∂²cϕ∂ζ² = similar(cϕ)
    ∂²cϕ∂θ∂ζ = similar(cϕ)
    ∂sϕ∂θ = similar(cϕ)
    ∂sϕ∂ζ = similar(cϕ)
    ∂²sϕ∂θ² = similar(cϕ)
    ∂²sϕ∂ζ² = similar(cϕ)
    ∂²sϕ∂θ∂ζ = similar(cϕ)
    @inbounds for j in eachindex(θ, ζ), i in eachindex(pol_modes, tor_modes)
        sϕ[i, j], cϕ[i, j] = sincos(pol_modes[i] * θ[j] + Nfp * tor_modes[i] * ζ[j])
        ∂cϕ∂θ[i, j] = -pol_modes[i] * sϕ[i, j]
        ∂cϕ∂ζ[i, j] = -tor_modes[i] * Nfp * sϕ[i, j]
        ∂²cϕ∂θ²[i, j] = -pol_modes[i] ^ 2 * cϕ[i, j]
        ∂²cϕ∂ζ²[i, j] = -(tor_modes[i] * Nfp) ^ 2 * cϕ[i, j]
        ∂²cϕ∂θ∂ζ[i, j] = -pol_modes[i] * tor_modes[i] * Nfp * cϕ[i, j]
        ∂sϕ∂θ[i, j] = pol_modes[i] * cϕ[i, j]
        ∂sϕ∂ζ[i, j] = tor_modes[i] * Nfp * cϕ[i, j]
        ∂²sϕ∂θ²[i, j] = -pol_modes[i] ^ 2 * sϕ[i, j]
        ∂²sϕ∂ζ²[i, j] = -(tor_modes[i] * Nfp) ^ 2 * sϕ[i, j]
        ∂²sϕ∂θ∂ζ[i, j] = -pol_modes[i] * tor_modes[i] * Nfp * sϕ[i, j]
    end
    return (θ = θ, ζ = ζ, cϕ = cϕ, sϕ = sϕ, 
            ∂cϕ∂θ = ∂cϕ∂θ, ∂cϕ∂ζ = ∂cϕ∂ζ, ∂sϕ∂θ = ∂sϕ∂θ, ∂sϕ∂ζ = ∂sϕ∂ζ,
            ∂²cϕ∂θ² = ∂²cϕ∂θ², ∂²cϕ∂ζ² = ∂²cϕ∂ζ², ∂²cϕ∂θ∂ζ = ∂²cϕ∂θ∂ζ,
            ∂²sϕ∂θ² = ∂²sϕ∂θ², ∂²sϕ∂ζ² = ∂²sϕ∂ζ², ∂²sϕ∂θ∂ζ = ∂²sϕ∂θ∂ζ)
end

function FG_matrix_derivative(Rc::AbstractVector,
    Zs::AbstractVector,
    ∂cϕ∂θ::AbstractVector,
    ∂cϕ∂ζ::AbstractVector,
    ∂sϕ∂θ::AbstractVector,
    ∂sϕ∂ζ::AbstractVector,
    ∂²cϕ∂θ²::AbstractVector,
    ∂²cϕ∂ζ²::AbstractVector,
    ∂²cϕ∂θ∂ζ::AbstractVector,
    ∂²sϕ∂θ²::AbstractVector,
    ∂²sϕ∂ζ²::AbstractVector,
    ∂²sϕ∂θ∂ζ::AbstractVector,
    ι::T) where {T}
    rθ, rζ, zθ, zζ, rθθ, zθθ, rθζ, zθζ, rζζ, zζζ = zeros(T, 10)
    @inbounds for i in eachindex(Rc, Zs)
        rθ += Rc[i] * ∂cϕ∂θ[i]
        rζ += Rc[i] * ∂cϕ∂ζ[i]
        zθ += Zs[i] * ∂sϕ∂θ[i]
        zζ += Zs[i] * ∂sϕ∂ζ[i]
        rθθ += Rc[i] * ∂²cϕ∂θ²[i]
        rζζ += Rc[i] * ∂²cϕ∂ζ²[i]
        rθζ += Rc[i] * ∂²cϕ∂θ∂ζ[i]
        zθθ += Zs[i] * ∂²sϕ∂θ²[i]
        zζζ += Zs[i] * ∂²sϕ∂ζ²[i]
        zθζ += Zs[i] * ∂²sϕ∂θ∂ζ[i]
    end

    return (rζ ^ 2 + zζ ^ 2 + 1 + ι * (rθ * rζ + zθ * zζ),
    2 * (rζ * rθζ + zζ * zθζ) + ι * (rθθ * rζ + rθ * rθζ + zθθ * zζ + zθ * zθζ),
    rθ * rζ + zθ * zζ + ι * (rθ ^ 2  + zθ ^ 2),
    rθζ * rζ + rθ * rζζ + zθζ * zζ + zθ * zζζ + 2 * ι * (rθζ * rθ + zθζ * zθ))
end

function scalar_jacobian_residual(u::AbstractVector,
    cϕ::AbstractVector,
    ∂cϕ∂θ::AbstractVector, 
    ∂cϕ∂ζ::AbstractVector,
    F::T, 
    G::T, 
    F′::T, 
    G′::T) where {T}
    j, jθ, jζ = tuple(zeros(T, 3)...)
    @inbounds for i in eachindex(cϕ, ∂cϕ∂θ, ∂cϕ∂ζ)
        j += u[i] * cϕ[i]
        jθ += ∂cϕ∂θ[i] * u[i]
        jζ += ∂cϕ∂ζ[i] * u[i]
    end
    return ((F′ - G′) * j - F * jθ + G * jζ) ^ 2
end

function scalar_jacobian_gradient!(du::AbstractVector,
    u::AbstractVector,
    cϕ::AbstractVector,
    ∂cϕ∂θ::AbstractVector, 
    ∂cϕ∂ζ::AbstractVector,
    F::T, 
    G::T, 
    F′::T, 
    G′::T) where {T}
    j, jθ, jζ = tuple(zeros(T, 3)...)
    @inbounds for i in eachindex(cϕ, ∂cϕ∂θ, ∂cϕ∂ζ)
        j += u[i] * cϕ[i]
        jθ += ∂cϕ∂θ[i] * u[i]
        jζ += ∂cϕ∂ζ[i] * u[i]
        du[i] = ((F′ - G′) * cϕ[i] - F * ∂cϕ∂θ[i] + G * ∂cϕ∂ζ[i])
    end
    res = ((F′ - G′) * j - F * jθ + G * jζ)
    du .*= 2 * res
    return nothing
end

function scalar_jacobian_residual(u::AbstractVector,
    pars::NT) where {NT <: NamedTuple}
    @unpack grid_cache, F_mat, G_mat, Fθ_mat, Gζ_mat = pars
    res = zero(eltype(u))
    @unpack cϕ, #=sϕ,=# ∂cϕ∂θ, ∂cϕ∂ζ#=, ∂sϕ∂θ, ∂sϕ∂ζ=# = grid_cache
    @inbounds for i in axes(cϕ, 2)
        res += scalar_jacobian_residual(u, view(cϕ, :, i),
        view(∂cϕ∂θ, :, i), view(∂cϕ∂ζ, :, i),
        F_mat[i], G_mat[i], Fθ_mat[i], Gζ_mat[i])
    end
    return res
end

function scalar_jacobian_gradient!(G::AbstractVector,
    u::AbstractVector,
    pars::NT) where {NT <: NamedTuple}
    @unpack grid_cache, F_mat, G_mat, Fθ_mat, Gζ_mat, dj, ∂j = pars

    fill!(dj, zero(eltype(u)))
    @unpack cϕ, ∂cϕ∂θ, ∂cϕ∂ζ = grid_cache
    @inbounds for i in axes(cϕ, 2)
        scalar_jacobian_gradient!(∂j, u, view(cϕ, :, i), 
        view(∂cϕ∂θ, :, i), view(∂cϕ∂ζ, :, i), 
        F_mat[i], G_mat[i], Fθ_mat[i], Gζ_mat[i])
        dj .+= ∂j
    end
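    # zero the gradient entry of the first mode so the optimizer keeps u[1] at its initial value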
    dj[1] = zero(eltype(u))
    copyto!(G, dj)
    
    return nothing
end

function scalar_jacobian_parameters(grid_cache::NT,
    Rc::AbstractVector,
    Zs::AbstractVector,
    ι::T) where {T, NT <: NamedTuple}

    @unpack ∂cϕ∂θ, ∂cϕ∂ζ, ∂sϕ∂θ, ∂sϕ∂ζ, ∂²cϕ∂θ², ∂²cϕ∂ζ², ∂²cϕ∂θ∂ζ, ∂²sϕ∂θ², ∂²sϕ∂ζ², ∂²sϕ∂θ∂ζ = grid_cache
    MN = size(∂cϕ∂θ, 1)
    grid_size = size(grid_cache.θ)
    F_mat = similar(Rc, grid_size)
    G_mat = similar(F_mat)
    Fθ_mat = similar(F_mat)
    Gζ_mat = similar(G_mat)
    f, fθ, g, gζ = (zero(T), zero(T), zero(T), zero(T))
    @inbounds for i in eachindex(F_mat, G_mat, Fθ_mat, Gζ_mat)
        f, fθ, g, gζ = FG_matrix_derivative(Rc, #=Rs, Zc,=# Zs, view(∂cϕ∂θ, :, i), view(∂cϕ∂ζ, :, i), 
        view(∂sϕ∂θ, :, i), view(∂sϕ∂ζ, :, i), view(∂²cϕ∂θ², :, i), view(∂²cϕ∂ζ², :, i), view(∂²cϕ∂θ∂ζ, :, i), 
        view(∂²sϕ∂θ², :, i), view(∂²sϕ∂ζ², :, i), view(∂²sϕ∂θ∂ζ, :, i), ι)
        F_mat[i] = f
        Fθ_mat[i] = fθ
        G_mat[i] = g
        Gζ_mat[i] = gζ
    end
    
    return (grid_cache = grid_cache, F_mat = F_mat, G_mat = G_mat, Fθ_mat = Fθ_mat, Gζ_mat = Gζ_mat, dj = zeros(T, MN), ∂j = zeros(T, MN))
end

function compute_scalar_jacobian!(jac_modes::AbstractVector,
    guess::AbstractVector,
    parameters::NT) where {NT <: NamedTuple}
    jac_func = OptimizationFunction(scalar_jacobian_residual; grad = scalar_jacobian_gradient!)
    opt_prob = OptimizationProblem(jac_func, guess, parameters)
    sol = solve(opt_prob, OptimizationOptimJL.LBFGS())
    copyto!(jac_modes, sol.u)
    return nothing
end

function compute_scalar_jacobian(Rc::AbstractVector,
    Zs::AbstractVector,
    eq_params::NT) where {NT <: NamedTuple}
    @unpack grid_cache, ι = eq_params
    parameters = scalar_jacobian_parameters(grid_cache, Rc, Zs, ι)
    guess = copy(Rc)
    guess[1] = one(eltype(Rc))
    jac_modes = similar(guess)
    compute_scalar_jacobian!(jac_modes, guess, parameters)
    return jac_modes
end

function scalar_jacobian_forward(x::AbstractVector, eq_params::NT) where {NT <: NamedTuple}
    @unpack inds = eq_params
    jac_modes = compute_scalar_jacobian(x[inds[1]], x[inds[2]], eq_params)
    return jac_modes 
end

function scalar_jacobian_optimality_condition(x::AbstractVector, 
    jac_modes::AbstractVector, 
    eq_params::NT) where {NT <: NamedTuple}
    @unpack M, N, pol_modes, tor_modes, ι, Nfp, inds = eq_params
    Rc = x[inds[1]]
    Zs = x[inds[2]]
    T = promote_type(typeof(x), typeof(jac_modes))
    ET = eltype(T)
    grid_cache = setup_grid_cache(2 * M, N, pol_modes, tor_modes, Nfp)
    @unpack cϕ, sϕ, ∂cϕ∂θ, ∂cϕ∂ζ, ∂sϕ∂θ, ∂sϕ∂ζ, ∂²cϕ∂θ², ∂²cϕ∂ζ², ∂²cϕ∂θ∂ζ, ∂²sϕ∂θ², ∂²sϕ∂ζ², ∂²sϕ∂θ∂ζ = grid_cache
    Jc = jac_modes[1 : M * N]
    cond_vec = Vector{ET}(undef, #=2 *=# M * N)
    @inbounds for i in eachindex(cond_vec)
        f, fθ, g, gζ = FG_matrix_derivative(Rc, #=Rs, Zc,=# Zs, view(∂cϕ∂θ, :, i), view(∂cϕ∂ζ, :, i), 
        view(∂sϕ∂θ, :, i), view(∂sϕ∂ζ, :, i), view(∂²cϕ∂θ², :, i), view(∂²cϕ∂ζ², :, i), view(∂²cϕ∂θ∂ζ, :, i), 
        view(∂²sϕ∂θ², :, i), view(∂²sϕ∂ζ², :, i), view(∂²sϕ∂θ∂ζ, :, i), ι)
        cϕ_view = view(cϕ, :, i)
        ∂cϕ∂θ_view = view(∂cϕ∂θ, :, i)
        ∂cϕ∂ζ_view = view(∂cϕ∂ζ, :, i)
        jac = zero(ET)
        jac_θ = zero(ET)
        jac_ζ = zero(ET)
        for j in eachindex(Jc)    
            jac += Jc[j] * cϕ_view[j]
            jac_θ += Jc[j] * ∂cϕ∂θ_view[j]
            jac_ζ += Jc[j] * ∂cϕ∂ζ_view[j]
        end
        cond_vec[i] = (fθ - gζ) * jac - f * jac_θ + g * jac_ζ
    end
    return cond_vec
end

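# ImplicitFunction pairs the forward solve with its optimality conditions; AD backends then
# differentiate via the implicit function theorem rather than unrolling the LBFGS iterations.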
implicit_scalar_jacobian = ImplicitFunction(scalar_jacobian_forward, scalar_jacobian_optimality_condition)

function setup_jacobian(M = 3, N = 3, ι = 0.796, Nfp = 10., mθ = 32, nζ = 32)
    Rc = zeros(M, N)
    Zs = zeros(M, N)
    Rc[1] = 1.
    Rc[2] = 0.1
    Rc[1,2] = 0.1
    Zs[2] = 0.1
    Zs[1,2] = 0.1
    pol_modes = repeat(rfftfreq(2 * M - isodd(M), 2 * M - isodd(M)), outer = N)
    tor_modes = repeat(fftfreq(N, N), inner = M)
    grid_cache = setup_grid_cache(mθ, nζ, pol_modes, tor_modes, Nfp)
    inds = [(i - 1) * M * N + 1:i * M * N for i in 1:2]
    eq_params = (M = M, N = N, pol_modes = pol_modes, tor_modes = tor_modes, ι = ι, Nfp = Nfp, grid_cache = grid_cache, inds = inds)
    x = vcat(vec(Rc), vec(Zs))
    return x, eq_params
end

This can be run with

x0, eq_params = setup_jacobian()

jacobian_objective(_x) = sum(implicit_scalar_jacobian(_x, eq_params))

grad_ad = gradient(jacobian_objective, x0)
grad_fd = grad(central_fdm(5, 1), jacobian_objective, x0)[1]

isapprox.(grad_ad, grad_fd, atol = 1e-8)

which produces

18-element BitVector:
 1
 0
 0
 0
 0
 0
 0
 0
 0
 1
 0
 0
 0
 0
 0
 0
 0
 0

I know it’s a long example, but I’m trying to figure out if I’m using ImplicitDifferentiation correctly or if this is a bug, so any help would be appreciated.

Are you even sure the gradient is incorrect? atol = 1e-8 seems like a rather tight tolerance for a comparison with finite differences.
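
For example, something along these lines (the tolerances are purely illustrative) would be a fairer check:

# compare with a relative tolerance instead of a tight absolute one
all(isapprox.(grad_ad, grad_fd, rtol = 1e-5))

# or look at the worst relative deviation directly
maximum(abs.(grad_ad .- grad_fd) ./ (abs.(grad_fd) .+ 1e-12))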

Yes, here’s the output:

julia> grad_ad
18-element Vector{Float64}:
  0.0
  3.855146513548839
  5.692010684884117
  4.169107713407151
  5.2118991317169705
 -7.222128655065506
  4.169107713407151
  9.60050221120679
 -5.677386823784674
  0.0
  0.9388195535048295
  3.1518181411772446
 -3.970399267803387
  3.266888174181338
  2.8617605663372996
  3.970399267803387
 -6.033167595868925
  1.796942908177839

julia> grad_fd
18-element Vector{Float64}:
 -8.303510771730269e-15
  5.261445591532297
  3.692493561750632
  4.513231837957292
 -2.479290516313889
  7.424423114685446
  4.513231837952674
 48.09989500734477
  1.6380839471935604
 -8.303510771730269e-15
 -0.46747952446504054
 -0.16928610690089677
 -4.314523392358872
 -6.982771092837856
  2.9698865621894606
  4.314523392348743
 21.290946459268554
 -4.750077906680258

I don’t see anything obviously wrong with the way you used ImplicitDifferentiation.jl, but it is really hard to tell with such a maximal working example.
Did you try differentiating with Enzyme.jl or ForwardDiff.jl? That would give an exact result, which is more trustworthy than finite differences, especially if there are discontinuities in your function.

Yes, the autodiff result I have is with ForwardDiff. I’ve checked the finite-difference result with decreasing step sizes and increasing approximation orders, and it converges to the values shown above. @jwalker and I will try to come up with a better MWE that replicates this behavior.
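
The convergence check was roughly of this form (the orders here are just examples):

# sweep the central-difference approximation order and compare against grad_fd above
for p in (3, 5, 7, 9)
    g_p = grad(central_fdm(p, 1), jacobian_objective, x0)[1]
    @show p maximum(abs.(g_p .- grad_fd))
end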

Maybe you can play around with the linear solver in the ImplicitFunction to diagnose whether the linear system has converged or not. Lack of convergence can be a sign of something going wrong in implicit diff.

Thanks @gdalle, I will take a look at that

Of course the more likely culprit is an ill-specified optimality condition / solving routine

Can you elaborate on what you mean by ill-specified? When I calculate the optimality condition, I get a vector with entries on the order of 1e-13 and smaller, which seems correct to me. We can also reproduce an analytic theory calculation, so I’m fairly confident the desired solution is being computed.
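
That check looks roughly like this, using the functions from the example above:

# residual of the optimality conditions at the converged solution
y_star = scalar_jacobian_forward(x0, eq_params)
maximum(abs, scalar_jacobian_optimality_condition(x0, y_star, eq_params))  # ~1e-13 in my runs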

I meant a bug in either of those two functions, but if the conditions are zero at optimality this bodes well

I meant it would be great if we could get a reference gradient (without implicit differentiation) from a true autodiff backend (as opposed to finite differences). Is it feasible by modifying the code a little bit to allow Dual numbers?
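
Concretely, I am thinking of a sketch like the following, which differentiates straight through the inner LBFGS solve. As written it only works if the internal caches (e.g. dj and ∂j in scalar_jacobian_parameters) are allocated with the element type of the input rather than hard-coded Float64:

import ForwardDiff

# reference gradient obtained by unrolling the inner solve instead of using ImplicitFunction
unrolled_objective(_x) = sum(scalar_jacobian_forward(_x, eq_params))
grad_ref = ForwardDiff.gradient(unrolled_objective, x0)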

I think that’s the issue here. It’s easy to diagnose by switching the linear solver as follows:

implicit_scalar_jacobian = ImplicitFunction(
    scalar_jacobian_forward,
    scalar_jacobian_optimality_condition;
    linear_solver=\
)

Then you get a LinearAlgebra.SingularException when you try to compute the gradient. That means the Jacobian of the optimality conditions (with respect to the output) is not invertible at the point in question, and so the implicit function theorem does not apply.

The default linear solver in ImplicitFunction is a Krylov solver which does not complain when the solve fails. I don’t recall why I set it up like that, maybe I should change it. In the present scenario, I think the Krylov solver tries to minimize || Ax - b ||^2 instead, but I can’t tell you what this means mathematically when the final residual is not zero.
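
If you want to confirm the diagnosis directly, you can inspect the Jacobian of the conditions with respect to the output at the solution, for example:

using LinearAlgebra
import ForwardDiff

# Jacobian of the optimality conditions w.r.t. the output, evaluated at the solution
y_star = scalar_jacobian_forward(x0, eq_params)
A = ForwardDiff.jacobian(y -> scalar_jacobian_optimality_condition(x0, y, eq_params), y_star)
rank(A), cond(A)  # a rank deficit / huge condition number confirms the singularity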

Many thanks again @gdalle for your help and willingness to take a look. Your comment led me to take a deeper dive into the structure of the problem, and yes, it looks like the current solution method I have leads to a singular left-hand-side operator. So it was me using ImplicitDifferentiation incorrectly, without checking whether the implicit function theorem conditions were satisfied.

I mean, no one really checks these conditions, but when the results don’t make any sense it’s good to remember them ^^