Custom rule for differentiating through Newton solver using ForwardDiff: works for gradient, fails for hessian

I’ll take a look one of these days, thanks a bunch!

Ok @mohamed82008 @touste, with a few tweaks I brought ImplicitDifferentiation to within a factor of 3 of the manual implementation for this specific case. Here’s what I did:

  • Update to ImplicitDifferentiation v0.4, in which the forward rule is handwritten instead of relying on ForwardDiffChainRules. I guess this gets rid of the type instabilities.
  • Remove the leftmost two columns of the profile flamegraph you showed by overriding LinearOperators.get_nargs (for some reason it fills a full method table, which we don’t need here). This will be merged soon.
  • Replace vectors with static vectors wherever possible, since the output is one-dimensional.
Code
using BenchmarkTools
using ForwardDiff
using ImplicitDifferentiation
using LinearAlgebra
using StaticArrays
using Roots

# Polynomial with coefficients p (degree length(p) - 1) evaluated at x
eval_poly(x, p) = sum(p[i] * x^(i - 1) for i in eachindex(p))

# Polynomial root (this is the slow part that should never be called with Dual numbers)
function root(p::AbstractVector{T}) where {T<:Real}
    f(x) = eval_poly(x, p)
    xᵅ = find_zero(f, 0.0)
    return xᵅ
end

# Gradient of the root wrt the polynomial coefficients
function ∂root∂p(p::AbstractVector{T}) where {T}
    # Find the root
    xᵅ = find_zero(x -> eval_poly(x, p), zero(T))
    # Implicit function theorem (see https://juliamath.github.io/Roots.jl/dev/roots/#Sensitivity)
    # Derivatives are computed using ForwardDiff
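    # Differentiating f(xᵅ(p), p) = 0 with respect to p gives ∂f∂x * ∂xᵅ∂p + ∂f∂p = 0,
    # hence ∂xᵅ∂p = -∂f∂p / ∂f∂x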
    ∂f∂x = ForwardDiff.derivative(x -> eval_poly(x, p), xᵅ)
    ∂f∂p = ForwardDiff.gradient(p -> eval_poly(xᵅ, p), p)
    return -∂f∂p / ∂f∂x
end

# Hessian of the root wrt the polynomial coefficients
function ∂²root∂p²(p::AbstractVector{T}) where {T}
    # Find the root
    xᵅ = find_zero(x -> eval_poly(x, p), zero(T))
    # Second order implicit function theorem
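    # Differentiating f(xᵅ(p), p) = 0 twice with respect to p and substituting ∂xᵅ∂p = -∂f∂p / ∂f∂x yields
    #   ∂²xᵅ∂p² = -(∂²f∂p² + ∂²f∂p∂x * ∂xᵅ∂p' + ∂xᵅ∂p * ∂²f∂p∂x' + ∂²f∂x² * ∂xᵅ∂p * ∂xᵅ∂p') / ∂f∂x
    # which is what gets assembled below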
    ∂f∂x = ForwardDiff.derivative(x -> eval_poly(x, p), xᵅ)
    ∂²f∂x² = ForwardDiff.derivative(
        x -> ForwardDiff.derivative(x -> eval_poly(x, p), x), xᵅ
    )
    ∂f∂p = ForwardDiff.gradient(p -> eval_poly(xᵅ, p), p)
    ∂²f∂p² = ForwardDiff.hessian(p -> eval_poly(xᵅ, p), p)
    ∂²f∂p∂x = ForwardDiff.gradient(p -> ForwardDiff.derivative(x -> eval_poly(x, p), xᵅ), p)
    ∂xᵅ∂p = -∂f∂p / ∂f∂x
    return -(∂²f∂p² + ∂²f∂p∂x * ∂xᵅ∂p' + ∂xᵅ∂p * ∂²f∂p∂x' + ∂²f∂x² * ∂xᵅ∂p * ∂xᵅ∂p') / ∂f∂x
end

# Implicit function
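# `zero_gradient(p, x, z)` are the conditions that must vanish at the solution: eval_poly(xᵅ, p) = 0
# The linear solver is replaced by a small dense direct solve, since the system here is tiny
# The forward solver returns the root as a 1-element SVector plus a placeholder second output 0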
zero_gradient(p, x, z) = SVector(eval_poly(x[1], p))
linear_solver(A, b) = (Matrix(A) \ b, (solved=true,))
implicit = ImplicitFunction(p -> (SVector(root(p)), 0), zero_gradient, linear_solver)

D(p) = ForwardDiff.gradient(p -> implicit(p)[1][1], p)
DD(p) = ForwardDiff.hessian(p -> implicit(p)[1][1], p)
Test
julia> p = [
           0.4595890823651114,
           -0.12268155438606576,
           0.2555413922391766,
           -0.445873981165829,
           0.3981791064025203,
           -0.14840698579393818,
           -0.09402206963799742,
       ];

julia> implicit(p)
([1.2475949272769737], 0)

julia> D(p) ≈ ∂root∂p(p)
true

julia> DD(p) ≈ ∂²root∂p²(p)
true

julia> @btime D($p);
  6.934 μs (78 allocations: 7.61 KiB)

julia> @btime ∂root∂p($p);
  2.438 μs (9 allocations: 1.78 KiB)

julia> @btime DD($p);
  17.376 μs (155 allocations: 42.62 KiB)

julia> @btime ∂²root∂p²($p);
  6.798 μs (37 allocations: 22.75 KiB)
Env
(jl_QY14CI) pkg> st
Status `/private/var/folders/cx/7tl61h5d5tvbhk6xjtnyb85m0000gn/T/jl_QY14CI/Project.toml`
  [f6369f11] ForwardDiff v0.10.35
  [57b37032] ImplicitDifferentiation v0.4.2
  [f2b01f46] Roots v2.0.16
  [90137ffa] StaticArrays v1.5.25

Hi @gdalle, thank you for this! It’s much faster indeed.

It helped me write a custom version for one-dimensional output that is almost on par with the manual implementation. See below:

function root(p_and_dp::AbstractVector{ForwardDiff.Dual{T,R,N}}) where {T,R,N}
    # Split the Dual inputs into values and partials
    p = ForwardDiff.value.(p_and_dp)
    dp = ForwardDiff.partials.(p_and_dp)

    # Solve with plain floats (the slow part never sees Dual numbers)
    xᵅ = root(p)

    # First-order implicit function theorem, as above
    ∂f∂x = ForwardDiff.derivative(x -> eval_poly(x, p), xᵅ)
    ∂f∂p = similar(p)
    ForwardDiff.gradient!(∂f∂p, p -> eval_poly(xᵅ, p), p)

    # Push the input partials forward through ∂xᵅ∂p = -∂f∂p / ∂f∂x
    dxᵅ = -(∂f∂p' * dp) / ∂f∂x

    # Repack the result into a Dual with the same tag
    xᵅ_and_dxᵅ = ForwardDiff.Dual{T}(xᵅ, ForwardDiff.Partials(ntuple(k -> dxᵅ[k], Val(N))))

    return xᵅ_and_dxᵅ
end
@btime ForwardDiff.hessian($root, $p)
  11.512 μs (23 allocations: 29.56 KiB)
@btime ∂²root∂p²($p);
  10.361 μs (37 allocations: 22.75 KiB)
ForwardDiff.hessian(root, p) ≈ ∂²root∂p²(p)
  true
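
The same overload should also handle first derivatives, since any call with Dual-valued coefficients dispatches to it; untested here, but a check like

ForwardDiff.gradient(root, p) ≈ ∂root∂p(p)

should return true.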

If you want a specialized version for 1D output, it might look like this (untested):

function (implicit::ImplicitFunction1D)(  # I don't know how to specialize on the dimension of the output
    x_and_dx::AbstractArray{Dual{T,R,N}}; kwargs...
) where {T,R,N}
    conditions = implicit.conditions
    # No need for a linear solver here: with a scalar output the linear system
    # collapses to a division by the scalar ∂f∂y

    x = value.(x_and_dx)
    dx = partials.(x_and_dx)

    y, z = implicit(x; kwargs...)

    # Derivative of the conditions wrt the scalar output y
    ∂f∂y = derivative(_y -> conditions(x, _y, z; kwargs...), y)
    # Gradient of the conditions wrt the input x
    ∂f∂x = similar(x)
    gradient!(∂f∂x, _x -> conditions(_x, y, z; kwargs...), x)

    dy = -(∂f∂x' * dx) / ∂f∂y

    y_and_dy = Dual{T}(y, Partials(ntuple(k -> dy[k], Val(N))))

    return y_and_dy
end

Very cool, thanks! We’re currently reshaping the interface, but as soon as that is done I’ll take a look.