I’m currently trying to make the code for a package I’m developing as performant as possible. One function I’m writing performs smoothing for a state-space model, using a method advocated in this paper:
where one uses Newton’s method to directly optimize the complete-data likelihood of the model w.r.t. the latent states. I’ve implemented this, but when profiling with ProfileView I’m noticing that most of the runtime goes into a Cholesky decomposition of the Hessian, in this section of code in Optim.jl:
function update_state!(d, state::NewtonState, method::Newton)
# Search direction is always the negative gradient divided by
# a matrix encoding the absolute values of the curvatures
# represented by H. It deviates from the usual "add a scaled
# identity matrix" version of the modified Newton method. More
# information can be found in the discussion at issue #153.
T = eltype(state.x)
if typeof(NLSolversBase.hessian(d)) <: AbstractSparseMatrix
state.s .= .-(NLSolversBase.hessian(d)\convert(Vector{T}, gradient(d)))
else
state.F = cholesky!(Positive, NLSolversBase.hessian(d))
if typeof(gradient(d)) <: Array
# is this actually StridedArray?
ldiv!(state.s, state.F, -gradient(d))
else
# not Array, we can't do inplace ldiv
gv = Vector{T}(undef, length(gradient(d)))
copyto!(gv, -gradient(d))
copyto!(state.s, state.F\gv)
end
end
# Determine the distance of movement along the search line
lssuccess = perform_linesearch!(state, method, d)
# Update current position # x = x + alpha * s
@. state.x = state.x + state.alpha * state.s
lssuccess == false # break on linesearch error
end
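(For scale, here’s a toy comparison with completely made-up size and values; it only roughly mirrors the two branches above, since the real dense branch goes through cholesky!(Positive, H):)

using LinearAlgebra, SparseArrays, BenchmarkTools

# Toy stand-in for a banded/block-tridiagonal Hessian, just to illustrate the
# gap between factorizing it densely and solving it as a sparse matrix.
n = 5_000
H = spdiagm(0 => fill(4.0, n), 1 => fill(-1.0, n - 1), -1 => fill(-1.0, n - 1))
b = randn(n)
@btime cholesky(Matrix($H)) \ $b;   # roughly the dense branch
@btime $H \ $b;                     # roughly the sparse branch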
It seems to me that if the Hessian is a sparse matrix it just does an ordinary sparse solve, and the reason we end up in the Cholesky decomposition is that my Hessian is being passed as a dense matrix? I worked from that assumption and tried the following:
function smooth(
lds::LinearDynamicalSystem{S,O}, y::Matrix{T}
) where {T<:Real,S<:GaussianStateModel{T},O<:GaussianObservationModel{T}}
T_steps, D = size(y, 2), lds.latent_dim
X₀ = zeros(T, D * T_steps)
function nll(vec_x::Vector{T})
x = reshape(vec_x, D, T_steps)
return -loglikelihood(x, lds, y)
end
function g!(g::Vector{T}, vec_x::Vector{T})
x = reshape(vec_x, D, T_steps)
grad = Gradient(lds, y, x)
return g .= vec(-grad)
end
function h!(h::SparseMatrixCSC, vec_x::Vector{T})
x = reshape(vec_x, D, T_steps)
H, _, _, _ = Hessian(lds, y)
return h .= -H
end
res = optimize(nll, g!, h!, X₀, Newton(linesearch=LineSearches.BackTracking()))
x = reshape(res.minimizer, D, T_steps)
H, main, super, sub = Hessian(lds, y)
p_smooth, inverse_offdiag = block_tridiagonal_inverse(-sub, -main, -super)
gauss_entropy = 0.5 * (size(H, 1) * log(2π) + logdet(-H))
@inbounds for i in 1:T_steps
p_smooth[:, :, i] .= 0.5 .* (p_smooth[:, :, i] .+ p_smooth[:, :, i]')
end
inverse_offdiag = cat(zeros(T, D, D), inverse_offdiag; dims=3)
return x, p_smooth, inverse_offdiag, gauss_entropy
end
But this ended up not working, and then I tried just removing the type hint, but it still seemed to perform the Cholesky decomposition. I’m not sure I’m barking up the right tree, and I couldn’t find any information about using sparse matrices in the Optim documentation.
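The only other idea I’ve had is to construct the TwiceDifferentiable objective myself and seed it with a sparse Hessian cache, so that NLSolversBase.hessian(d) is an AbstractSparseMatrix and the sparse branch of update_state! gets taken. I’m not at all sure I’m reading the NLSolversBase constructors correctly, but something along these lines (replacing the optimize call inside smooth, and reusing nll, g!, h!, X₀, and T from there) is what I mean:

using Optim, NLSolversBase, SparseArrays, LineSearches

# Sketch only: seed the objective with a sparse Hessian cache so the Newton
# update hits the AbstractSparseMatrix branch. I haven't verified that this
# is the intended way to do it.
n = length(X₀)
td = TwiceDifferentiable(nll, g!, h!, X₀, zero(T), similar(X₀), spzeros(T, n, n))
res = optimize(td, X₀, Newton(; linesearch=LineSearches.BackTracking()))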
Here are the loglikelihood, Gradient, and Hessian calculations I perform in my code:
function loglikelihood(
x::AbstractMatrix{T}, lds::LinearDynamicalSystem{S,O}, y::AbstractMatrix{U}
) where {T<:Real,U<:Real,S<:GaussianStateModel{<:Real},O<:GaussianObservationModel{<:Real}}
T_steps = size(y, 2)
A, Q, x0, P0 = lds.state_model.A,
lds.state_model.Q, lds.state_model.x0,
lds.state_model.P0
C, R = lds.obs_model.C, lds.obs_model.R
inv_R = inv(R)
inv_Q = inv(Q)
inv_P0 = inv(P0)
dx0 = x[:, 1] - x0
ll = dx0' * inv_P0 * dx0
@inbounds for t in 1:T_steps
if t > 1
dx = x[:, t] - A * x[:, t - 1]
ll += dx' * inv_Q * dx
end
dy = y[:, t] - C * x[:, t]
ll += dy' * inv_R * dy
end
return -0.5 * ll
end
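(For reference, up to an additive constant this is just the usual complete-data quadratic form, writing $x_t$ and $y_t$ for the columns of x and y:)

$$
\log p(x, y) = -\tfrac{1}{2}\Big[(x_1 - x_0)^\top P_0^{-1}(x_1 - x_0) + \sum_{t=2}^{T_{\mathrm{steps}}}(x_t - A x_{t-1})^\top Q^{-1}(x_t - A x_{t-1}) + \sum_{t=1}^{T_{\mathrm{steps}}}(y_t - C x_t)^\top R^{-1}(y_t - C x_t)\Big] + \mathrm{const.}
$$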
function Gradient(
lds::LinearDynamicalSystem{S,O}, y::Matrix{T}, x::Matrix{T}
) where {T<:Real,S<:GaussianStateModel{T},O<:GaussianObservationModel{T}}
latent_dim, T_steps = size(x)
obs_dim, _ = size(y)
# precompute and store variables
A, Q, x0, P0 = lds.state_model.A,
lds.state_model.Q, lds.state_model.x0,
lds.state_model.P0
C, R = lds.obs_model.C, lds.obs_model.R
inv_R = inv(R)
inv_Q = inv(Q)
inv_P0 = inv(P0)
grad = zeros(T, latent_dim, T_steps)
AinvQ = A' * inv_Q
CinvR = C' * inv_R
# First time step
grad[:, 1] =
AinvQ * (x[:, 2] - A * x[:, 1]) + CinvR * (y[:, 1] - C * x[:, 1]) -
inv_P0 * (x[:, 1] - x0)
# Middle time steps
@inbounds for t in 2:(T_steps - 1)
grad[:, t] =
CinvR * (y[:, t] - C * x[:, t]) - inv_Q * (x[:, t] - A * x[:, t - 1]) +
AinvQ * (x[:, t + 1] - A * x[:, t])
end
# Last time step
grad[:, T_steps] =
CinvR * (y[:, T_steps] - C * x[:, T_steps]) -
inv_Q * (x[:, T_steps] - A * x[:, T_steps - 1])
return grad
end
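In other words, column t of the gradient is

$$
\nabla_{x_t}\,\log p(x, y) = C^\top R^{-1}(y_t - C x_t) - Q^{-1}(x_t - A x_{t-1}) + A^\top Q^{-1}(x_{t+1} - A x_t),
$$

with $-Q^{-1}(x_t - A x_{t-1})$ replaced by $-P_0^{-1}(x_1 - x_0)$ at $t = 1$ and the $A^\top Q^{-1}(\cdot)$ term dropped at $t = T_{\mathrm{steps}}$.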
function Hessian(
lds::LinearDynamicalSystem{S,O}, y::AbstractMatrix{T}
) where {T<:Real,S<:GaussianStateModel{T},O<:GaussianObservationModel{T}}
A, Q, x0, P0 = lds.state_model.A,
lds.state_model.Q, lds.state_model.x0,
lds.state_model.P0
C, R = lds.obs_model.C, lds.obs_model.R
T_steps = size(y, 2)
inv_R = inv(R)
inv_Q = inv(Q)
inv_P0 = inv(P0)
H_sub_entry = inv_Q * A
H_super_entry = Matrix(H_sub_entry')
H_sub = Vector{Matrix{T}}(undef, T_steps - 1)
H_super = Vector{Matrix{T}}(undef, T_steps - 1)
    @inbounds for i in 1:(T_steps - 1)
H_sub[i] = H_sub_entry
H_super[i] = H_super_entry
end
yt_given_xt = -C' * inv_R * C
xt_given_xt_1 = -inv_Q
xt1_given_xt = -A' * inv_Q * A
x_t = -inv_P0
H_diag = Vector{Matrix{T}}(undef, T_steps)
    @inbounds for i in 2:(T_steps - 1)
H_diag[i] = yt_given_xt + xt_given_xt_1 + xt1_given_xt
end
H_diag[1] = yt_given_xt + xt1_given_xt + x_t
H_diag[T_steps] = yt_given_xt + xt_given_xt_1
H = block_tridgm(H_diag, H_super, H_sub)
return H, H_diag, H_super, H_sub
end
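So the Hessian is block tridiagonal. Spelling out the blocks assembled above: the sub-diagonal blocks are $Q^{-1}A$, the super-diagonal blocks are $(Q^{-1}A)^\top$, and the diagonal blocks are

$$
-C^\top R^{-1} C - Q^{-1} - A^\top Q^{-1} A
$$

for interior $t$, with $-Q^{-1}$ replaced by $-P_0^{-1}$ at $t = 1$ and the $-A^\top Q^{-1}A$ term dropped at $t = T_{\mathrm{steps}}$.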
In the last step, where I call block_tridgm(), I already generate a sparse Hessian:
function block_tridgm(
main_diag::Vector{Matrix{T}},
upper_diag::Vector{Matrix{T}},
lower_diag::Vector{Matrix{T}},
) where {T<:Real}
# Check that the vectors have the correct lengths
if length(upper_diag) != length(main_diag) - 1 ||
length(lower_diag) != length(main_diag) - 1
error(
"The length of upper_diag and lower_diag must be one less than the length of main_diag",
)
end
# Determine the size of the blocks and the total matrix size
m = size(main_diag[1], 1) # Size of each block
n = length(main_diag) # Number of blocks
N = m * n # Total size of the matrix
# Initialize containers for constructing sparse matrix
row_indices = Int[]
col_indices = Int[]
values = T[]
# Function to add block indices and values to arrays
function append_block(i_row, i_col, block)
base_row = (i_row - 1) * m
base_col = (i_col - 1) * m
for j in 1:m
for i in 1:m
push!(row_indices, base_row + i)
push!(col_indices, base_col + j)
push!(values, block[i, j])
end
end
end
# Fill in the main diagonal blocks
for i in 1:n
append_block(i, i, main_diag[i])
end
# Fill in the upper diagonal blocks
for i in 1:(n - 1)
append_block(i, i + 1, upper_diag[i])
end
# Fill in the lower diagonal blocks
for i in 1:(n - 1)
append_block(i + 1, i, lower_diag[i])
end
# Create sparse matrix from collected indices and values
A = sparse(row_indices, col_indices, values, N, N)
return A
end
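For what it’s worth, a quick toy check (made-up 2×2 blocks) confirms the assembled matrix really is sparse:

using LinearAlgebra, SparseArrays

# Toy check with made-up blocks: block_tridgm returns a SparseMatrixCSC.
D, n_blocks = 2, 4
main = [Matrix(-2.0I, D, D) for _ in 1:n_blocks]
off  = [Matrix(1.0I, D, D) for _ in 1:(n_blocks - 1)]
H = block_tridgm(main, off, off)
typeof(H)   # SparseMatrixCSC{Float64, Int64}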
This still technically isn’t an MWRE, @gdalle, so I apologize for that… but I figured adding all the code for my structs to this post would probably make it pretty unintelligible.