Using a sparse Hessian in Optim.jl

I’m currently trying to make code for a package I’m developing as performant as possible. One function I’m writing is to perform smoothing for a state-space model using a method advocated in this paper:

where one uses Newton’s method to directly optimize the complete-data likelihood of the model w.r.t. the latent states. I’ve implemented this, but when profiling with ProfileView I’m noticing that most of the runtime is spent in a Cholesky decomposition of the Hessian, in this section of code in Optim.jl:

function update_state!(d, state::NewtonState, method::Newton)
    # Search direction is always the negative gradient divided by
    # a matrix encoding the absolute values of the curvatures
    # represented by H. It deviates from the usual "add a scaled
    # identity matrix" version of the modified Newton method. More
    # information can be found in the discussion at issue #153.
    T = eltype(state.x)

    if typeof(NLSolversBase.hessian(d)) <: AbstractSparseMatrix
        state.s .= .-(NLSolversBase.hessian(d)\convert(Vector{T}, gradient(d)))
    else
        state.F = cholesky!(Positive, NLSolversBase.hessian(d))
        if typeof(gradient(d)) <: Array
            # is this actually StridedArray?
            ldiv!(state.s, state.F, -gradient(d))
        else
            # not Array, we can't do inplace ldiv
            gv = Vector{T}(undef, length(gradient(d)))
            copyto!(gv, -gradient(d))
            copyto!(state.s, state.F\gv)
        end
    end
    # Determine the distance of movement along the search line
    lssuccess = perform_linesearch!(state, method, d)

    # Update current position # x = x + alpha * s
    @. state.x = state.x + state.alpha * state.s
    lssuccess == false # break on linesearch error
end
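For context, the branch taken here depends only on whether NLSolversBase.hessian(d) is an AbstractSparseMatrix: sparse Hessians go through a plain sparse solve, while dense ones go through cholesky!(Positive, ...) (which I believe comes from PositiveFactorizations.jl). A quick standalone illustration of that dispatch criterion (just a sketch of mine, not Optim code):

using SparseArrays, LinearAlgebra

H_sparse = sparse(1.0I, 4, 4)   # what I think I am constructing
H_dense  = Matrix(1.0I, 4, 4)   # what the profile suggests Optim actually has

H_sparse isa AbstractSparseMatrix  # true  -> direct sparse solve branch
H_dense isa AbstractSparseMatrix   # false -> cholesky!(Positive, ...) branch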

It seems to me that if the Hessian is sparse, Optim just performs a regular sparse solve, and the Cholesky decomposition is only happening because my Hessian is being passed around as a dense matrix. Working from that assumption, I tried the following:

function smooth(
    lds::LinearDynamicalSystem{S,O}, y::Matrix{T}
) where {T<:Real,S<:GaussianStateModel{T},O<:GaussianObservationModel{T}}
    T_steps, D = size(y, 2), lds.latent_dim

    X₀ = zeros(T, D * T_steps)

    function nll(vec_x::Vector{T})
        x = reshape(vec_x, D, T_steps)
        return -loglikelihood(x, lds, y)
    end

    function g!(g::Vector{T}, vec_x::Vector{T})
        x = reshape(vec_x, D, T_steps)
        grad = Gradient(lds, y, x)
        return g .= vec(-grad)
    end

    function h!(h::SparseMatrixCSC, vec_x::Vector{T})
        x = reshape(vec_x, D, T_steps)
        H, _, _, _ = Hessian(lds, y)
        return h .= -H
    end

    res = optimize(nll, g!, h!, X₀, Newton(linesearch=LineSearches.BackTracking()))

    x = reshape(res.minimizer, D, T_steps)

    H, main, super, sub = Hessian(lds, y)
    p_smooth, inverse_offdiag = block_tridiagonal_inverse(-sub, -main, -super)

    gauss_entropy = 0.5 * (size(H, 1) * log(2π) + logdet(-H))

    @inbounds for i in 1:T_steps
        p_smooth[:, :, i] .= 0.5 .* (p_smooth[:, :, i] .+ p_smooth[:, :, i]')
    end

    inverse_offdiag = cat(zeros(T, D, D), inverse_offdiag; dims=3)

    return x, p_smooth, inverse_offdiag, gauss_entropy
end

But this ended up not working, and removing the type annotation didn’t help either: it still performs the Cholesky decomposition. I’m not sure I’m barking up the right tree, and I couldn’t find any information on using sparse matrices in the Optim documentation.

Here are the loglikelihood, Gradient, and Hessian calculations I perform in my code:

function loglikelihood(
    x::AbstractMatrix{T}, lds::LinearDynamicalSystem{S,O}, y::AbstractMatrix{U}
) where {T<:Real,U<:Real,S<:GaussianStateModel{<:Real},O<:GaussianObservationModel{<:Real}}
    T_steps = size(y, 2)

    A, Q, x0, P0 = lds.state_model.A,
    lds.state_model.Q, lds.state_model.x0,
    lds.state_model.P0
    C, R = lds.obs_model.C, lds.obs_model.R

    inv_R = inv(R)
    inv_Q = inv(Q)
    inv_P0 = inv(P0)

    dx0 = x[:, 1] - x0
    ll = dx0' * inv_P0 * dx0

    @inbounds for t in 1:T_steps
        if t > 1
            dx = x[:, t] - A * x[:, t - 1]
            ll += dx' * inv_Q * dx
        end
        dy = y[:, t] - C * x[:, t]
        ll += dy' * inv_R * dy
    end

    return -0.5 * ll
end
function Gradient(
    lds::LinearDynamicalSystem{S,O}, y::Matrix{T}, x::Matrix{T}
) where {T<:Real,S<:GaussianStateModel{T},O<:GaussianObservationModel{T}}
    latent_dim, T_steps = size(x)
    obs_dim, _ = size(y)

    # precompute and store variables
    A, Q, x0, P0 = lds.state_model.A,
    lds.state_model.Q, lds.state_model.x0,
    lds.state_model.P0
    C, R = lds.obs_model.C, lds.obs_model.R

    inv_R = inv(R)
    inv_Q = inv(Q)
    inv_P0 = inv(P0)

    grad = zeros(T, latent_dim, T_steps)
    
    AinvQ = A' * inv_Q
    CinvR = C' * inv_R

    # First time step
    grad[:, 1] =
        AinvQ * (x[:, 2] - A * x[:, 1]) + CinvR * (y[:, 1] - C * x[:, 1]) -
        inv_P0 * (x[:, 1] - x0)

    # Middle time steps
    @inbounds for t in 2:(T_steps - 1)
        grad[:, t] =
            CinvR * (y[:, t] - C * x[:, t]) - inv_Q * (x[:, t] - A * x[:, t - 1]) +
            AinvQ * (x[:, t + 1] - A * x[:, t])
    end

    # Last time step
    grad[:, T_steps] =
        CinvR * (y[:, T_steps] - C * x[:, T_steps]) -
        inv_Q * (x[:, T_steps] - A * x[:, T_steps - 1])

    return grad
end
function Hessian(
    lds::LinearDynamicalSystem{S,O}, y::AbstractMatrix{T}
) where {T<:Real,S<:GaussianStateModel{T},O<:GaussianObservationModel{T}}
    A, Q, x0, P0 = lds.state_model.A,
    lds.state_model.Q, lds.state_model.x0,
    lds.state_model.P0
    C, R = lds.obs_model.C, lds.obs_model.R

    T_steps = size(y, 2)

    inv_R = inv(R)
    inv_Q = inv(Q)
    inv_P0 = inv(P0)

    H_sub_entry = inv_Q * A
    H_super_entry = Matrix(H_sub_entry')
    H_sub = Vector{Matrix{T}}(undef, T_steps - 1)
    H_super = Vector{Matrix{T}}(undef, T_steps - 1)

    @inbounds for i in 1:(T_steps - 1)
        H_sub[i] = H_sub_entry
        H_super[i] = H_super_entry
    end

    yt_given_xt = -C' * inv_R * C
    xt_given_xt_1 = -inv_Q
    xt1_given_xt = -A' * inv_Q * A
    x_t = -inv_P0

    H_diag = Vector{Matrix{T}}(undef, T_steps)

    @inbounds for i in 2:(T_steps - 1)
        H_diag[i] = yt_given_xt + xt_given_xt_1 + xt1_given_xt
    end

    H_diag[1] = yt_given_xt + xt1_given_xt + x_t
    H_diag[T_steps] = yt_given_xt + xt_given_xt_1

    H = block_tridgm(H_diag, H_super, H_sub)

    return H, H_diag, H_super, H_sub
end
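For reference, up to additive constants loglikelihood evaluates the complete-data log-likelihood (writing T for T_steps)

$$
\log p(x, y) = -\tfrac{1}{2}\Big[(x_1 - x_0)^\top P_0^{-1}(x_1 - x_0) + \sum_{t=2}^{T}(x_t - A x_{t-1})^\top Q^{-1}(x_t - A x_{t-1}) + \sum_{t=1}^{T}(y_t - C x_t)^\top R^{-1}(y_t - C x_t)\Big],
$$

and Gradient and Hessian are its first and second derivatives with respect to the stacked state vector, which is why the Hessian is block tridiagonal.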

In the last step, where I call block_tridgm(), I do already generate a sparse Hessian:

function block_tridgm(
    main_diag::Vector{Matrix{T}},
    upper_diag::Vector{Matrix{T}},
    lower_diag::Vector{Matrix{T}},
) where {T<:Real}
    # Check that the vectors have the correct lengths
    if length(upper_diag) != length(main_diag) - 1 ||
        length(lower_diag) != length(main_diag) - 1
        error(
            "The length of upper_diag and lower_diag must be one less than the length of main_diag",
        )
    end

    # Determine the size of the blocks and the total matrix size
    m = size(main_diag[1], 1) # Size of each block
    n = length(main_diag) # Number of blocks
    N = m * n # Total size of the matrix

    # Initialize containers for constructing sparse matrix
    row_indices = Int[]
    col_indices = Int[]
    values = T[]

    # Function to add block indices and values to arrays
    function append_block(i_row, i_col, block)
        base_row = (i_row - 1) * m
        base_col = (i_col - 1) * m
        for j in 1:m
            for i in 1:m
                push!(row_indices, base_row + i)
                push!(col_indices, base_col + j)
                push!(values, block[i, j])
            end
        end
    end

    # Fill in the main diagonal blocks
    for i in 1:n
        append_block(i, i, main_diag[i])
    end

    # Fill in the upper diagonal blocks
    for i in 1:(n - 1)
        append_block(i, i + 1, upper_diag[i])
    end

    # Fill in the lower diagonal blocks
    for i in 1:(n - 1)
        append_block(i + 1, i, lower_diag[i])
    end

    # Create sparse matrix from collected indices and values
    A = sparse(row_indices, col_indices, values, N, N)

    return A
end
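For example, a toy call with 2×2 blocks and 3 time steps (just to show the output type; the values are made up):

using SparseArrays, LinearAlgebra

main = [Matrix(2.0I, 2, 2) for _ in 1:3]
off  = [Matrix(-1.0I, 2, 2) for _ in 1:2]

H = block_tridgm(main, off, off)
typeof(H)  # SparseMatrixCSC{Float64, Int64}
size(H)    # (6, 6)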

This still technically isn’t an MWE @gdalle, so I apologize for that…but I figured adding all the code for my structs to this post would probably make it pretty unintelligible :slight_smile:

Can you provide a complete reproducible example? For instance, where do your Gradient and Hessian functions come from?

Can you maybe use the debugger to get inside of the update_state! function from Optim.jl and print the type of NLSolversBase.hessian(d)? It doesn’t seem to be <:AbstractSparseMatrix, otherwise the right branch would be taken.
You can also just pkg> dev Optim and add some printing wherever needed.
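Something along these lines (a sketch):

pkg> dev Optim   # checks out an editable copy, by default under ~/.julia/dev/Optim

# then, inside update_state! in that local copy:
@show typeof(NLSolversBase.hessian(d))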

Sure enough, the type is Matrix{Float64}. Not sure why this would be, as I am specifically constructing the Hessian as a sparse matrix…

Can you try to track the construction of this matrix?

Yes, will try that now. At least in my own code, if I add a print where I define the analytic Hessian function in smooth:

    function h!(h, vec_x::Vector{T})
        x = reshape(vec_x, D, T_steps)
        H, _, _, _ = Hessian(lds, y)
        println(typeof(H))
        return h .= -H
    end

This spits out a SparseArrays.SparseMatrixCSC{Float64, Int64}. So now I’ll have to dig into how Optim is handling the Hessian, I imagine?

Yeah, I’d probably dev Optim and add prints everywhere to figure out where the object gets converted. I don’t know that package so I can’t help with their internals

Okay, I found a solution. It requires setting up a TwiceDifferentiable object yourself with the initial Hessian, gradient, and objective values, as follows:

function smooth(
    lds::LinearDynamicalSystem{S,O}, y::Matrix{T}
) where {T<:Real,S<:GaussianStateModel{T},O<:GaussianObservationModel{T}}
    T_steps, D = size(y, 2), lds.latent_dim

    X₀ = zeros(T, D * T_steps)

    function nll(vec_x::Vector{T})
        x = reshape(vec_x, D, T_steps)
        return -loglikelihood(x, lds, y)
    end

    function g!(g::Vector{T}, vec_x::Vector{T})
        x = reshape(vec_x, D, T_steps)
        grad = Gradient(lds, y, x)
        return g .= vec(-grad)
    end

    function h!(h::AbstractSparseMatrix, vec_x::Vector{T})
        x = reshape(vec_x, D, T_steps)
        H, _, _, _ = Hessian(lds, y)
        copyto!(h, -H)
        return 
    end

    # set up initial values
    initial_f = nll(X₀)
    
    initial_g = similar(X₀)
    g!(initial_g, X₀)

    initial_h = spzeros(Float64, length(X₀), length(X₀))
    h!(initial_h, X₀)

    # set up a TwiceDifferentiable object, I guess?
    td = TwiceDifferentiable(nll, g!, h!, X₀, initial_f, initial_g, initial_h)

    # set up Optim.Options
    opts = Optim.Options(g_tol=1e-12, x_tol=1e-12, f_tol=1e-12, iterations=100)

    # Go!
    res = optimize(td, X₀, Newton(linesearch=LineSearches.BackTracking()), opts)
    
    # Profit
    x = reshape(res.minimizer, D, T_steps)

    H, main, super, sub = Hessian(lds, y)
    p_smooth, inverse_offdiag = block_tridiagonal_inverse(-sub, -main, -super)

    gauss_entropy = 0.5 * (size(H, 1) * log(2π) + logdet(-H))

    @inbounds for i in 1:T_steps
        p_smooth[:, :, i] .= 0.5 .* (p_smooth[:, :, i] .+ p_smooth[:, :, i]')
    end

    inverse_offdiag = cat(zeros(T, D, D), inverse_offdiag; dims=3)

    return x, p_smooth, inverse_offdiag, gauss_entropy
end

This makes it so that the Hessian stays a sparse matrix. I’m going to take a peek into Optim and see if there is a way the interface could be changed to infer the type of the Hessian at the first iteration, as I found this a bit unintuitive. This also took my runtime from ~20s for 100 iterations down to <0.5s, so this was a massive performance hit. Would love to talk to an Optim developer to understand if this would be a worthwhile feature. But overall I’m happy I found a solution. Thanks @gdalle for the help :slight_smile:
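For anyone hitting the same thing, my current understanding of why the sparsity was being lost (a sketch, I have not verified this against the Optim/NLSolversBase internals): when you pass f, g!, and h! directly to optimize, the Hessian buffer appears to be allocated as a dense Matrix, and broadcasting a sparse matrix into a dense buffer leaves the buffer dense:

using SparseArrays, LinearAlgebra

h = zeros(4, 4)          # dense buffer, like the default allocation
H = sparse(2.0I, 4, 4)   # the sparse Hessian computed inside h!
h .= -H                  # the values are copied, but h stays a Matrix{Float64}
typeof(h)                # Matrix{Float64}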

Just out of curiosity, would you mind trying out DifferentiationInterface.jl to compute your sparse Hessians with autodiff instead of manually? I’m curious to see if the performance is comparable to a handwritten implementation.

Yeah, more than happy to do that. I’ll take a look now and let you know what I see.

Cool! You can follow this tutorial. Try AutoForwardDiff() first as the dense second order backend to get things working, then move to AutoEnzyme() to get peak performance. Note that you can also use it to compute the gradient in-place.

Just want to confirm this is roughly what you are looking for. I ran the following code based on the linked tutorial:

using DifferentiationInterface
using Enzyme
using ForwardDiff
using SparseConnectivityTracer: TracerSparsityDetector
using SparseMatrixColorings

dense_second_order_backend = AutoForwardDiff()

sparse_second_order_backend = AutoSparse(
    dense_second_order_backend;
    sparsity_detector=TracerSparsityDetector(),
    coloring_algorithm=GreedyColoringAlgorithm(),
)

lds = GaussianLDS(;A=A, Q=Q, C=C, R=R, x0=x0, P0=P0, obs_dim=obs_dim, latent_dim=latent_dim, fit_bool=fill(true, 6))

f = x -> StateSpaceDynamics.loglikelihood(x, lds, observations[:, :, 1])

H = hessian(f, sparse_second_order_backend, zeros(2, 1000))

Then, using BenchmarkTools, I got the following result:

Let me know if anything so far looks egregiously incorrect/suboptimal lol

This is in fact egregiously suboptimal :rofl:

The goal of my question was to figure out if your handwritten implementations of Gradient and (especially) Hessian are useful. I imagine they were not completely straightforward to implement, and autodiff can take care of that for you. With the ecosystem around DifferentiationInterface.jl, you don’t even need to specify the sparsity pattern ahead of time, it is detected for you (so that only nonzero coefficients of the Hessian get computed). But this detection is costly, so it should only be done once (through the so-called “preparation” step) and then reused for every Hessian update.

First, you’d have to check that DifferentiationInterface.hessian returns the exact same matrix as the one you compute manually. If not, then there is a bug on at least one of the two sides and we should investigate. Once that is confirmed, the real question is whether we waste performance by replacing your manual versions with the automated ones. Note that the manual versions themselves could probably be optimized but that’s a separate question.
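Something like this should do for that check (a sketch; I am reusing nll, X₀, lds, y, and sparse_second_order_backend from your earlier posts and assuming they are accessible at top level):

H_manual, _, _, _ = Hessian(lds, y)
H_auto = DifferentiationInterface.hessian(nll, sparse_second_order_backend, X₀)
H_auto ≈ -H_manual   # nll is the negative log-likelihood, hence the sign flip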

So what I had in mind was benchmarking your smooth function against another one that looks like this. Since I don’t have all the dependencies, I can’t run the code, so beware of syntax errors, but it should give you the general idea. I’m not sure about x0 though, or why it works with a matrix instead of a vector.

import DifferentiationInterface as DI
using Enzyme
using ForwardDiff
using SparseConnectivityTracer: TracerSparsityDetector
using SparseMatrixColorings

function smooth_autodiff(
    lds::LinearDynamicalSystem{S,O}, y::Matrix{T}
) where {T<:Real,S<:GaussianStateModel{T},O<:GaussianObservationModel{T}}
    backend = DI.AutoForwardDiff()  # replace by DI.AutoEnzyme() once it works
    sparse_backend = DI.AutoSparse(
        backend;
        sparsity_detector=TracerSparsityDetector(),
        coloring_algorithm=GreedyColoringAlgorithm(),
    )

    T_steps, D = size(y, 2), lds.latent_dim

    X₀ = zeros(T, D * T_steps)
    example_vec_x = nothing  # vec(x0[1, :]) maybe?

    function nll(vec_x::Vector{T})
        x = reshape(vec_x, D, T_steps)
        return -loglikelihood(x, lds, y)
    end

    gradient_prep = DI.prepare_gradient(nll, backend, example_vec_x)
    hessian_prep = DI.prepare_hessian(nll, sparse_backend, example_vec_x)

    function g!(g::Vector{T}, vec_x::Vector{T})
        return DI.gradient!(nll, g, gradient_prep, backend, vec_x)
    end

    function h!(h::AbstractSparseMatrix, vec_x::Vector{T})
        return DI.hessian!(nll, h, hessian_prep, sparse_backend, vec_x)
    end

    initial_g = DI.gradient(nll, gradient_prep, backend, example_vec_x)
    initial_H = DI.hessian(nll, hessian_prep, sparse_backend, example_vec_x)
    
    # Create TwiceDifferentiable object I guess?
    td = TwiceDifferentiable(
        nll,
        g!,
        h!,
        X₀,  # why not a vector here?
        Float64(nll(X₀)),  # Initial value as Float64
        initial_g,         # Initial gradient
        initial_H         # Initial Hessian
    )
    
    res = optimize(td, X₀, Newton(linesearch=LineSearches.BackTracking()))

    # blablabla
end

That’s why I figured I would ask :sweat_smile:.

This does produce the same result as my analytical calculation.

This was honestly just an issue of my code being harder to read than necessary. x0 is in fact a vector, defined as zeros(T, D * T_steps) where T<:Real, i.e., I was just being sloppy when typing it out.

I’ve now got it working using the scaffold you very kindly supplied:

function smooth_autodiff(
    lds::StateSpaceDynamics.LinearDynamicalSystem{S,O}, y::Matrix{T}
) where {T<:Real,S<:StateSpaceDynamics.GaussianStateModel{T},O<:StateSpaceDynamics.GaussianObservationModel{T}}

    backend = DifferentiationInterface.AutoForwardDiff()  # replace by DI.AutoEnzyme() once it works
    sparse_backend =  DifferentiationInterface.AutoSparse(
        backend;
        sparsity_detector=TracerSparsityDetector(),
        coloring_algorithm=GreedyColoringAlgorithm(),
        )

    T_steps, D = size(y, 2), lds.latent_dim

    X₀ = zeros(T, D * T_steps)

    function nll(vec_x::AbstractVector)
        x = reshape(vec_x, D, T_steps)
        return -loglikelihood(x, lds, y)
    end

    gradient_prep = DifferentiationInterface.prepare_gradient(nll, backend, X₀)
    hessian_prep = DifferentiationInterface.prepare_hessian(nll, sparse_backend, X₀)

    function g!(g::AbstractVector, vec_x::AbstractVector)
        return DifferentiationInterface.gradient!(nll, g, gradient_prep, backend, vec_x)
    end

    function h!(h::AbstractSparseMatrix, vec_x::AbstractVector)
        return DifferentiationInterface.hessian!(nll, h, hessian_prep, sparse_backend, vec_x)
    end

    initial_g = DifferentiationInterface.gradient(nll, gradient_prep, backend, X₀)
    initial_H = DifferentiationInterface.hessian(nll, hessian_prep, sparse_backend, X₀)
    
    # Create TwiceDifferentiable object I guess?
    td = TwiceDifferentiable(
        nll,
        g!,
        h!,
        X₀,
        Float64(nll(X₀)),  # Initial value as Float64
        initial_g,         # Initial gradient
        initial_H         # Initial Hessian
    )
    
    res = optimize(td, X₀, Newton(linesearch=LineSearches.BackTracking()))
    
    x = reshape(res.minimizer, D, T_steps)

    H, main, super, sub = StateSpaceDynamics.Hessian(lds, y)
    p_smooth, inverse_offdiag = block_tridiagonal_inverse(-sub, -main, -super)

    gauss_entropy = 0.5 * (size(H, 1) * log(2π) + logdet(-H))

    for i in 1:T_steps
        p_smooth[:, :, i] .= 0.5 .* (p_smooth[:, :, i] .+ p_smooth[:, :, i]')
    end

    inverse_offdiag = cat(zeros(T, D, D), inverse_offdiag; dims=3)

    return x, p_smooth, inverse_offdiag, gauss_entropy
end

When I benchmark this I get the following results:

Compared to mine:

But of course, this is using AutoForwardDiff() so I will now give it a shot with AutoEnzyme() and let you know!
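For completeness, the comparison I am running is roughly the following (with BenchmarkTools, and lds and y coming from my test setup):

using BenchmarkTools

@benchmark smooth($lds, $y)            # handwritten Gradient/Hessian version
@benchmark smooth_autodiff($lds, $y)   # DifferentiationInterface version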

I suspect most of the time is spent in sparsity detection, for which the backend switch won’t help. But if you provide us with some complete runnable code, maybe @hill and I can use it as a benchmark case to study some optimizations. At the very least a screenshot of the @profview flame graph would help.
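One quick way to test that hypothesis is to time the preparation separately from the Hessian evaluation, something like this (a sketch, reusing nll, sparse_backend, and X₀ from your smooth_autodiff, pulled out to top level):

using BenchmarkTools

# sparsity detection + coloring, paid once
@benchmark DifferentiationInterface.prepare_hessian($nll, $sparse_backend, $X₀)

# per-iteration cost with a reused preparation
hessian_prep = DifferentiationInterface.prepare_hessian(nll, sparse_backend, X₀)
@benchmark DifferentiationInterface.hessian($nll, $hessian_prep, $sparse_backend, $X₀)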

Would you be willing to work on a branch of my repo? I could push the current notebook I’m working on, so the sampling/problem setup would be taken care of, and then you would be able to toy around to your heart’s content! It would also be great to have a veteran Julian poking around my work :slight_smile:

Otherwise I’m happy to just send code through here/send you a picture of the flame graph. Let me know what you’d prefer!

Sure! Just push a reproducible notebook with both manual and automatic versions of the optimization routine, and let’s see how much performance we can squeeze out

I’ll have to get back to you on the notebook: some of my unit tests are sporadically failing with the new interface, so it seems I may have solved one issue and introduced another…

“Unsolving” this issue as I beat the victory drums too early. My proposed solution from above fixed the issue I was having where the Hessian was being converted to a dense matrix, but it introduced a new one where the optimization, despite being reported as successful, never actually updated the initial solution. This is slightly concerning, as I can’t find a good reason why this would happen: I just used the constructor pattern from NLSolversBase.jl:

function TwiceDifferentiable(f, g, h,
                             x::AbstractVector{TX},
                             F::Real = real(zero(eltype(x))),
                             G = alloc_DF(x, F),
                             H = alloc_H(x, F); inplace = true) where {TX}
    g! = df!_from_df(g, F, inplace)
    h! = h!_from_h(h, F, inplace)

    fg! = make_fdf(x, F, f, g!)
    x_f, x_df, x_h = x_of_nans(x), x_of_nans(x), x_of_nans(x)

    return TwiceDifferentiable(f, g!, fg!, nothing, nothing, h!, F, G, H, x_f, x_df, x_h, [0,], [0,], [0,])
end

I don’t yet know the internals of these two packages well enough to say whether this is an issue on my end or a more pernicious bug. Regardless, I think this merits some more documentation at the very least, or perhaps some code modifications. I’ll continue to tinker with these two packages and hopefully get this squared away soon…

Okay, although I haven’t figured out why this happened in the first place, my solution from above was in fact the “correct” way to go. For some currently unknown reason, though, it messed up the convergence checking in Optim, and to get around this I had to set up an Optim.Options() struct to pass to the optimizer.
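Concretely (the same Options and optimize call as in my solution post above):

opts = Optim.Options(g_tol=1e-12, x_tol=1e-12, f_tol=1e-12, iterations=100)
res = optimize(td, X₀, Newton(linesearch=LineSearches.BackTracking()), opts)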

This ended up fixing it. So I’m re-accepting my solution from before with this additional caveat, though I do think it merits further investigation. It was definitely my own fault for not catching that the solution was erroneous, but it is concerning that a “successful” optimization was being reported. I’ll probably look into this further in the coming week, but for now I’m going to save my changes and not look at my laptop for the next 24 hours :upside_down_face:
