@gdalle I am attempting to use ImplicitDifferentiation with ForwardDiff to compute the gradient of an optimization problem. I’ve tried to figure out how to reduce the problem to a minimum working example, but I have not come up with anything succinct. I’ve posted an abbreviated version of the problem below:
using UnPack
using FFTW: fftfreq, rfftfreq
using Optimization
using OptimizationOptimJL
using ImplicitDifferentiation
using ForwardDiff: gradient
using FiniteDifferences
function setup_grid_cache(mθ::Int,
nζ::Int,
pol_modes::AbstractVector,
tor_modes::AbstractVector,
Nfp::T) where {T}
θ = [i for i in range(start = zero(T), step = 2π/(mθ - 1), length = mθ), _ in range(start = zero(T), step = 2π/(nζ - 1)/Nfp, length = nζ)]
ζ = [j for _ in range(start = zero(T), step = 2π/(mθ - 1), length = mθ), j in range(start = zero(T), step = 2π/(nζ - 1)/Nfp, length = nζ)]
return setup_grid_cache(θ, ζ, pol_modes, tor_modes, Nfp)
end
function setup_grid_cache(θ::AbstractArray,
ζ::AbstractArray,
pol_modes::AbstractVector,
tor_modes::AbstractVector,
Nfp::T) where {T}
MN = length(pol_modes)
cϕ = Matrix{T}(undef, MN, length(ζ))
sϕ = similar(cϕ)
∂cϕ∂θ = similar(cϕ)
∂cϕ∂ζ = similar(cϕ)
∂²cϕ∂θ² = similar(cϕ)
∂²cϕ∂ζ² = similar(cϕ)
∂²cϕ∂θ∂ζ = similar(cϕ)
∂sϕ∂θ = similar(cϕ)
∂sϕ∂ζ = similar(cϕ)
∂²sϕ∂θ² = similar(cϕ)
∂²sϕ∂ζ² = similar(cϕ)
∂²sϕ∂θ∂ζ = similar(cϕ)
@inbounds for j in eachindex(θ, ζ), i in eachindex(pol_modes, tor_modes)
sϕ[i, j], cϕ[i, j] = sincos(pol_modes[i] * θ[j] + Nfp * tor_modes[i] * ζ[j])
∂cϕ∂θ[i, j] = -pol_modes[i] * sϕ[i, j]
∂cϕ∂ζ[i, j] = -tor_modes[i] * Nfp * sϕ[i, j]
∂²cϕ∂θ²[i, j] = -pol_modes[i] ^ 2 * cϕ[i, j]
∂²cϕ∂ζ²[i, j] = -(tor_modes[i] * Nfp) ^ 2 * cϕ[i, j]
∂²cϕ∂θ∂ζ[i, j] = -pol_modes[i] * tor_modes[i] * Nfp * cϕ[i, j]
∂sϕ∂θ[i, j] = pol_modes[i] * cϕ[i, j]
∂sϕ∂ζ[i, j] = tor_modes[i] * Nfp * cϕ[i, j]
∂²sϕ∂θ²[i, j] = -pol_modes[i] ^ 2 * sϕ[i, j]
∂²sϕ∂ζ²[i, j] = -(tor_modes[i] * Nfp) ^ 2 * sϕ[i, j]
∂²sϕ∂θ∂ζ[i, j] = -pol_modes[i] * tor_modes[i] * Nfp * sϕ[i, j]
end
return (θ = θ, ζ = ζ, cϕ = cϕ, sϕ = sϕ,
∂cϕ∂θ = ∂cϕ∂θ, ∂cϕ∂ζ = ∂cϕ∂ζ, ∂sϕ∂θ = ∂sϕ∂θ, ∂sϕ∂ζ = ∂sϕ∂ζ,
∂²cϕ∂θ² = ∂²cϕ∂θ², ∂²cϕ∂ζ² = ∂²cϕ∂ζ², ∂²cϕ∂θ∂ζ = ∂²cϕ∂θ∂ζ,
∂²sϕ∂θ² = ∂²sϕ∂θ², ∂²sϕ∂ζ² = ∂²sϕ∂ζ², ∂²sϕ∂θ∂ζ = ∂²sϕ∂θ∂ζ)
end
function FG_matrix_derivative(Rc::AbstractVector,
Zs::AbstractVector,
∂cϕ∂θ::AbstractVector,
∂cϕ∂ζ::AbstractVector,
∂sϕ∂θ::AbstractVector,
∂sϕ∂ζ::AbstractVector,
∂²cϕ∂θ²::AbstractVector,
∂²cϕ∂ζ²::AbstractVector,
∂²cϕ∂θ∂ζ::AbstractVector,
∂²sϕ∂θ²::AbstractVector,
∂²sϕ∂ζ²::AbstractVector,
∂²sϕ∂θ∂ζ::AbstractVector,
ι::T) where {T}
rθ, rζ, zθ, zζ, rθθ, zθθ, rθζ, zθζ, rζζ, zζζ = zeros(T, 10)
@inbounds for i in eachindex(Rc, Zs)
rθ += Rc[i] * ∂cϕ∂θ[i]
rζ += Rc[i] * ∂cϕ∂ζ[i]
zθ += Zs[i] * ∂sϕ∂θ[i]
zζ += Zs[i] * ∂sϕ∂ζ[i]
rθθ += Rc[i] * ∂²cϕ∂θ²[i]
rζζ += Rc[i] * ∂²cϕ∂ζ²[i]
rθζ += Rc[i] * ∂²cϕ∂θ∂ζ[i]
zθθ += Zs[i] * ∂²sϕ∂θ²[i]
zζζ += Zs[i] * ∂²sϕ∂ζ²[i]
zθζ += Zs[i] * ∂²sϕ∂θ∂ζ[i]
end
return (rζ ^ 2 + zζ ^ 2 + 1 + ι * (rθ * rζ + zθ * zζ),
2 * (rζ * rθζ + zζ * zθζ) + ι * (rθθ * rζ + rθ * rθζ + zθθ * zζ + zθ * zθζ),
rθ * rζ + zθ * zζ + ι * (rθ ^ 2 + zθ ^ 2),
rθζ * rζ + rθ * rζζ + zθζ * zζ + zθ * zζζ + 2 * ι * (rθζ * rθ + zθζ * zθ))
end
function scalar_jacobian_residual(u::AbstractVector,
cϕ::AbstractVector,
∂cϕ∂θ::AbstractVector,
∂cϕ∂ζ::AbstractVector,
F::T,
G::T,
F′::T,
G′::T) where {T}
j, jθ, jζ = tuple(zeros(T, 3)...)
@inbounds for i in eachindex(cϕ, ∂cϕ∂θ, ∂cϕ∂ζ)
j += u[i] * cϕ[i]
jθ += ∂cϕ∂θ[i] * u[i]
jζ += ∂cϕ∂ζ[i] * u[i]
end
return ((F′ - G′) * j - F * jθ + G * jζ) ^ 2
end
function scalar_jacobian_gradient!(du::AbstractVector,
u::AbstractVector,
cϕ::AbstractVector,
∂cϕ∂θ::AbstractVector,
∂cϕ∂ζ::AbstractVector,
F::T,
G::T,
F′::T,
G′::T) where {T}
j, jθ, jζ = tuple(zeros(T, 3)...)
@inbounds for i in eachindex(cϕ, ∂cϕ∂θ, ∂cϕ∂ζ)
j += u[i] * cϕ[i]
jθ += ∂cϕ∂θ[i] * u[i]
jζ += ∂cϕ∂ζ[i] * u[i]
du[i] = ((F′ - G′) * cϕ[i] - F * ∂cϕ∂θ[i] + G * ∂cϕ∂ζ[i])
end
res = ((F′ - G′) * j - F * jθ + G * jζ)
du .*= 2 * res
return nothing
end
function scalar_jacobian_residual(u::AbstractVector,
pars::NT) where {NT <: NamedTuple}
@unpack grid_cache, F_mat, G_mat, Fθ_mat, Gζ_mat = pars
res = zero(eltype(u))
@unpack cϕ, #=sϕ,=# ∂cϕ∂θ, ∂cϕ∂ζ#=, ∂sϕ∂θ, ∂sϕ∂ζ=# = grid_cache
@inbounds for i in axes(cϕ, 2)
res += scalar_jacobian_residual(u, view(cϕ, :, i),
view(∂cϕ∂θ, :, i), view(∂cϕ∂ζ, :, i),
F_mat[i], G_mat[i], Fθ_mat[i], Gζ_mat[i])
end
return res
end
function scalar_jacobian_gradient!(G::AbstractVector,
u::AbstractVector,
pars::NT) where {NT <: NamedTuple}
@unpack grid_cache, F_mat, G_mat, Fθ_mat, Gζ_mat, dj, ∂j = pars
fill!(dj, zero(eltype(u)))
@unpack cϕ, ∂cϕ∂θ, ∂cϕ∂ζ = grid_cache
@inbounds for i in axes(cϕ, 2)
scalar_jacobian_gradient!(∂j, u, view(cϕ, :, i),
view(∂cϕ∂θ, :, i), view(∂cϕ∂ζ, :, i),
F_mat[i], G_mat[i], Fθ_mat[i], Gζ_mat[i])
dj .+= ∂j
end
dj[1] = zero(eltype(u))
copyto!(G, dj)
return nothing
end
function scalar_jacobian_parameters(grid_cache::NT,
Rc::AbstractVector,
Zs::AbstractVector,
ι::T) where {T, NT <: NamedTuple}
@unpack ∂cϕ∂θ, ∂cϕ∂ζ, ∂sϕ∂θ, ∂sϕ∂ζ, ∂²cϕ∂θ², ∂²cϕ∂ζ², ∂²cϕ∂θ∂ζ, ∂²sϕ∂θ², ∂²sϕ∂ζ², ∂²sϕ∂θ∂ζ = grid_cache
MN = size(∂cϕ∂θ, 1)
grid_size = size(grid_cache.θ)
F_mat = similar(Rc, grid_size)
G_mat = similar(F_mat)
Fθ_mat = similar(F_mat)
Gζ_mat = similar(G_mat)
f, fθ, g, gζ = (zero(T), zero(T), zero(T), zero(T))
@inbounds for i in eachindex(F_mat, G_mat, Fθ_mat, Gζ_mat)
f, fθ, g, gζ = FG_matrix_derivative(Rc, #=Rs, Zc,=# Zs, view(∂cϕ∂θ, :, i), view(∂cϕ∂ζ, :, i),
view(∂sϕ∂θ, :, i), view(∂sϕ∂ζ, :, i), view(∂²cϕ∂θ², :, i), view(∂²cϕ∂ζ², :, i), view(∂²cϕ∂θ∂ζ, :, i),
view(∂²sϕ∂θ², :, i), view(∂²sϕ∂ζ², :, i), view(∂²sϕ∂θ∂ζ, :, i), ι)
F_mat[i] = f
Fθ_mat[i] = fθ
G_mat[i] = g
Gζ_mat[i] = gζ
end
return (grid_cache = grid_cache, F_mat = F_mat, G_mat = G_mat, Fθ_mat = Fθ_mat, Gζ_mat = Gζ_mat, dj = zeros(T, MN), ∂j = zeros(T, MN))
end
function compute_scalar_jacobian!(jac_modes::AbstractVector,
guess::AbstractVector,
parameters::NT) where {NT <: NamedTuple}
jac_func = OptimizationFunction(scalar_jacobian_residual; grad = scalar_jacobian_gradient!)
opt_prob = OptimizationProblem(jac_func, guess, parameters)
sol = solve(opt_prob, OptimizationOptimJL.LBFGS())
copyto!(jac_modes, sol.u)
return nothing
end
function compute_scalar_jacobian(Rc::AbstractVector,
Zs::AbstractVector,
eq_params::NT) where {NT <: NamedTuple}
@unpack grid_cache, ι = eq_params
parameters = scalar_jacobian_parameters(grid_cache, Rc, Zs, ι)
guess = copy(Rc)
guess[1] = one(eltype(Rc))
jac_modes = similar(guess)
compute_scalar_jacobian!(jac_modes, guess, parameters)
return jac_modes
end
function scalar_jacobian_forward(x::AbstractVector, eq_params::NT) where {NT <: NamedTuple}
@unpack inds = eq_params
jac_modes = compute_scalar_jacobian(x[inds[1]], x[inds[2]], eq_params)
return jac_modes
end
function scalar_jacobian_optimality_condition(x::AbstractVector,
jac_modes::AbstractVector,
eq_params::NT) where {NT <: NamedTuple}
@unpack M, N, pol_modes, tor_modes, ι, Nfp, inds = eq_params
Rc = x[inds[1]]
Zs = x[inds[2]]
T = promote_type(typeof(x), typeof(jac_modes))
ET = eltype(T)
grid_cache = setup_grid_cache(2 * M, N, pol_modes, tor_modes, Nfp)
@unpack cϕ, sϕ, ∂cϕ∂θ, ∂cϕ∂ζ, ∂sϕ∂θ, ∂sϕ∂ζ, ∂²cϕ∂θ², ∂²cϕ∂ζ², ∂²cϕ∂θ∂ζ, ∂²sϕ∂θ², ∂²sϕ∂ζ², ∂²sϕ∂θ∂ζ = grid_cache
Jc = jac_modes[1 : M * N]
cond_vec = Vector{ET}(undef, #=2 *=# M * N)
@inbounds for i in eachindex(cond_vec)
f, fθ, g, gζ = FG_matrix_derivative(Rc, #=Rs, Zc,=# Zs, view(∂cϕ∂θ, :, i), view(∂cϕ∂ζ, :, i),
view(∂sϕ∂θ, :, i), view(∂sϕ∂ζ, :, i), view(∂²cϕ∂θ², :, i), view(∂²cϕ∂ζ², :, i), view(∂²cϕ∂θ∂ζ, :, i),
view(∂²sϕ∂θ², :, i), view(∂²sϕ∂ζ², :, i), view(∂²sϕ∂θ∂ζ, :, i), ι)
cϕ_view = view(cϕ, :, i)
∂cϕ∂θ_view = view(∂cϕ∂θ, :, i)
∂cϕ∂ζ_view = view(∂cϕ∂ζ, :, i)
jac = zero(ET)
jac_θ = zero(ET)
jac_ζ = zero(ET)
for j in eachindex(Jc)
jac += Jc[j] * cϕ_view[j]
jac_θ += Jc[j] * ∂cϕ∂θ_view[j]
jac_ζ += Jc[j] * ∂cϕ∂ζ_view[j]
end
cond_vec[i] = (fθ - gζ) * jac - f * jac_θ + g * jac_ζ
end
return cond_vec
end
implicit_scalar_jacobian = ImplicitFunction(scalar_jacobian_forward, scalar_jacobian_optimality_condition)
function setup_jacobian(M = 3, N = 3, ι = 0.796, Nfp = 10., mθ = 32, nζ = 32)
Rc = zeros(M, N)
Zs = zeros(M, N)
Rc[1] = 1.
Rc[2] = 0.1
Rc[1,2] = 0.1
Zs[2] = 0.1
Zs[1,2] = 0.1
pol_modes = repeat(rfftfreq(2 * M - isodd(M), 2 * M - isodd(M)), outer = N)
tor_modes = tor_modes = repeat(fftfreq(N, N), inner = M)
grid_cache = setup_grid_cache(mθ, nζ, pol_modes, tor_modes, Nfp)
inds = [(i - 1) * M * N + 1:i * M * N for i in 1:2]
eq_params = (M = M, N = N, pol_modes = pol_modes, tor_modes = tor_modes, ι = ι, Nfp = Nfp, grid_cache = grid_cache, inds = inds)
x = vcat(vec(Rc), vec(Zs))
return x, eq_params
end
This can be run with
x0, eq_params = setup_jacobian()
jacobian_objective(_x) = sum(implicit_scalar_jacobian(_x, eq_params))
grad_ad = gradient(jacobian_objective, x0)
grad_fd = grad(central_fdm(5, 1), jacobian_objective, x0)[1]
isapprox.(grad_ad, grad_fd, atol = 1e-8)
which produces
18-element BitVector:
1
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
I know it’s a long example, but I’m trying to figure out if I’m using ImplicitDifferentiation correclty or if this is a bug, so any help would be appreciated.