Is this a bug with Cholesky?


#1

I have a matrix called “post_var” that fails with a cholesky decomposition but when I copy and paste the matrix and run cholseky on that it works. Can someone check out what’s going on?

The original code was from a 2014 blog post (kalman-filter-for-panel-data-and-mle-in-julia-part-1). I’ve updated the code to run on 1.0.1. I’ve pared down everything below to just reproduce the error. There are two code blocks below. One for the REPL output and one for reproducing the error.

Here’s the REPL of what’s going on

julia> print(post_var)
[0.5429 2.21032e-7; 2.21032e-7 0.268941]
julia> cholesky(post_var)
ERROR: PosDefException: matrix is not Hermitian; Cholesky factorization failed.
Stacktrace:
 [1] checkpositivedefinite(::Int64) at C:\cygwin\home\Administrator\buildbot\worker\package_win64\bu
ild\usr\share\julia\stdlib\v1.0\LinearAlgebra\src\factorization.jl:11
 [2] #cholesky!#91(::Bool, ::Function, ::Array{Float64,2}, ::Val{false}) at C:\cygwin\home\Administr
ator\buildbot\worker\package_win64\build\usr\share\julia\stdlib\v1.0\LinearAlgebra\src\cholesky.jl:1
82
 [3] #cholesky#95 at .\none:0 [inlined]
 [4] cholesky at C:\cygwin\home\Administrator\buildbot\worker\package_win64\build\usr\share\julia\st
dlib\v1.0\LinearAlgebra\src\cholesky.jl:275 [inlined] (repeats 2 times)
 [5] top-level scope at none:0

julia> test = [0.5429 2.21032e-7; 2.21032e-7 0.268941]
2×2 Array{Float64,2}:
 0.5429      2.21032e-7
 2.21032e-7  0.268941

julia> print(cholesky(test))
Cholesky{Float64,Array{Float64,2}}([0.736817 2.99982e-7; 2.21032e-7 0.518595], 'U', 0)

Here’s the code to reproduce:

using LinearAlgebra
using DataFrames
using Distributions

function unpackParams(p, stateDim, obsDim)
    place = 0
    A = reshape(p[(place + 1):(place + stateDim^2)], (stateDim, stateDim))
    place += stateDim^2
    V = diagm(0=>exp.(p[(place + 1):(place + stateDim)]))
    place += stateDim
    Cparams = p[(place + 1):(place + stateDim * (obsDim - 1))]
    C = zeros(stateDim * obsDim, stateDim)
    for j in 1:stateDim
            C[(1 + obsDim * (j - 1)):obsDim * j, j] = vcat([1.0], Cparams[(1 + (obsDim - 1) * (j - 1)):(obsDim - 1) * j])
    end
    place += (obsDim - 1) * stateDim
    W = diagm(0=>exp.(p[(place + 1):(place + obsDim * stateDim)]))
    return Dict("A"=>A, "V"=>V, "C"=>C, "W"=>W)
end

function KalmanDGP(p, stateDim, obsDim, N, T, init_exp, init_var)
    # initialize data
    data = zeros(N, stateDim*obsDim * T + 1)
    # parameters
    unpacked = unpackParams(p, stateDim, obsDim)
    A = unpacked["A"]
    V = unpacked["V"]
    C = unpacked["C"]
    W = unpacked["W"]
    # draw from DGP
    for i in 1:N
        # data of individual i
        iData = ones(stateDim * obsDim * T + 1)
        current_state = rand(MvNormal(reshape(init_exp, (stateDim, )), init_var))
        r_error_var = Matrix{Float64}(I, obsDim * stateDim, obsDim * stateDim)
        μ = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        iData[1:stateDim*obsDim] = rand(MvNormal(r_error_var))
        for t in 2:T
            current_state = A * current_state + rand(MvNormal(V))
            iData[((t - 1) * stateDim * obsDim + 1):(t * stateDim * obsDim)] = C * current_state + rand(MvNormal(W))
        end
        # add individual to data
        data[i, :] = iData
    end
    return data
end

function incrementKF(p, post_exp, post_var, new_obs, stateDim, obsDim)
    # unpack parameters
    unpacked = unpackParams(p, stateDim, obsDim)
    A = unpacked["A"]
    V = unpacked["V"]
    C = unpacked["C"]
    W = unpacked["W"]
    # predict
    prior_exp = A * post_exp
    prior_var = A * post_var * transpose(A) + V
    obs_prior_exp = C * prior_exp
    obs_prior_var = C * prior_var * transpose(C) + W
    # update
    residual = new_obs - obs_prior_exp
    obs_prior_cov = prior_var * transpose(C)
    kalman_gain = obs_prior_cov * inv(obs_prior_var)
    post_exp = prior_exp + kalman_gain * residual
    post_var = prior_var - kalman_gain * transpose(obs_prior_cov)
    # step likelihood
    dist = MvNormal(reshape(obs_prior_exp, (length(obs_prior_exp),)), obs_prior_var)
    log_like = logpdf(dist, new_obs)
    return Dict("post_exp"=>post_exp, "post_var"=>post_var, "log_like"=>log_like)
end


params0 = [1.,0.,0.,1.,0.,0.,.5,-.5,.5,-.5,0.,1.,0.,-1.,0.,0.]
stateDim = 2
obsDim = 3
T = 4
N = 1000
init_exp = zeros(stateDim)
init_var = Matrix{Float64}(I, stateDim, stateDim)
data = KalmanDGP(params0, stateDim, obsDim, N, T, init_exp, init_var)
data = DataFrame(data)
names!(data, [:one_1,:one_2,:one_3,:one_4,:one_5,:one_6,:two_1,:two_2,:two_3,:two_4,:two_5,:two_6,:three_1,:three_2,:three_3,:three_4,:three_5,:three_6,:four_1,:four_2,:four_3,:four_4,:four_5,:four_6,:outcome])
obsDict = [[:one_1,:one_2,:one_3,:one_4,:one_5,:one_6],
        [:two_1,:two_2,:two_3,:two_4,:two_5,:two_6],
        [:three_1,:three_2,:three_3,:three_4,:three_5,:three_6],
        [:four_1,:four_2,:four_3,:four_4,:four_5,:four_6]]

iData = data[1,:]
cols = length(iData[obsDict[1]])
# initialization
post_exp = init_exp
post_var = init_var
init_obs = convert(Array, iData[obsDict[1]])[1:cols]
dist = MvNormal(Matrix{Float64}(I, length(init_obs), length(init_obs)))

p = [1.00002, 0.0, 6.05545e-6, 1.0, 0.0, 0.0, 0.5, -0.5, 0.5, -0.5, 0.0, 1.0, 0.0, -1.0, 0.0, 0.0]

iData = data[1, :]
cols = length(iData[obsDict[1]])
# initialization
post_exp = init_exp
post_var = init_var
init_obs = convert(Array, iData[obsDict[1]])[1:cols]
dist = MvNormal(Matrix{Float64}(I, length(init_obs), length(init_obs)))
log_like = logpdf(dist, init_obs)
for t = 1
        global post_exp, post_var
        # predict and update
        new_obs = convert(Array, iData[obsDict[t + 1]])[1:cols]
        new_post = incrementKF(p, post_exp, post_var, new_obs, stateDim, obsDim)
        # replace
        post_exp = new_post["post_exp"]
        post_var = new_post["post_var"]
end


print(post_var)
cholesky(post_var)
test = [0.5429 2.21032e-7; 2.21032e-7 0.268941]
print(cholesky(test))

#2

In general you can not construct the exact same object from the printed output, you will not get back the exact same thing:

julia> post_var - [0.5429 2.21032e-7; 2.21032e-7 0.268941]
2×2 Array{Float64,2}:
 -4.4542e-9    6.40177e-14
  6.40177e-14  4.2137e-7  

julia> post_var ≈ [0.5429 2.21032e-7; 2.21032e-7 0.268941]
false

#3

Not all of the digits are printed by default, so when you copy and paste you are slightly changing the inputs.

In general, roundoff errors can cause a matrix to be slightly non-symmetric or slightly indefinite. Try cholesky(Hermitian(A)) to tell Julia to assume A is exactly Hermitian and look only at the entries in the upper triangle.


#4

I wonder if cholesky should simply not have methods for general matrices, only for Symmetric, Hermitian, Diagonal, etc, and make it clear that the user is responsible for providing the matrix in the appropriate form.


#5

@fredrikekre, yes, that makes sense.

@stevengj, thank you! solved almost the entire issue.

@Tamas_Papp, the issue was related to calling MvNormal with a covariance matrix that was getting spit back as not pos-def. I had to use the GaussianDistributions.jl package to pass the covariance matrix as type Hermitian for it to work.

For posterity, full working code is below

using DataFrames
using Distributions
using Optim
using LinearAlgebra
using GaussianDistributions

function unpackParams(p, stateDim, obsDim)
    place = 0
    A = reshape(p[(place + 1):(place + stateDim^2)], (stateDim, stateDim))
    place += stateDim^2
    V = diagm(0=>exp.(p[(place + 1):(place + stateDim)]))
    place += stateDim
    Cparams = p[(place + 1):(place + stateDim * (obsDim - 1))]
    C = zeros(stateDim * obsDim, stateDim)
    for j in 1:stateDim
            C[(1 + obsDim * (j - 1)):obsDim * j, j] = vcat([1.0], Cparams[(1 + (obsDim - 1) * (j - 1)):(obsDim - 1) * j])
    end
    place += (obsDim - 1) * stateDim
    W = diagm(0=>exp.(p[(place + 1):(place + obsDim * stateDim)]))
    return Dict("A"=>A, "V"=>V, "C"=>C, "W"=>W)
end

function KalmanDGP(p, stateDim, obsDim, N, T, init_exp, init_var)
    # initialize data
    data = zeros(N, stateDim*obsDim * T + 1)
    # parameters
    unpacked = unpackParams(p, stateDim, obsDim)
    A = unpacked["A"]
    V = unpacked["V"]
    C = unpacked["C"]
    W = unpacked["W"]
    # draw from DGP
    for i in 1:N
        # data of individual i
        iData = ones(stateDim * obsDim * T + 1)
        current_state = rand(MvNormal(reshape(init_exp, (stateDim, )), init_var))
        r_error_var = Matrix{Float64}(I, obsDim * stateDim, obsDim * stateDim)
        μ = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        iData[1:stateDim*obsDim] = rand(MvNormal(r_error_var))
        for t in 2:T
            current_state = A * current_state + rand(MvNormal(V))
            iData[((t - 1) * stateDim * obsDim + 1):(t * stateDim * obsDim)] = C * current_state + rand(MvNormal(W))
        end
        # add individual to data
        data[i, :] = iData
    end
    return data
end

function incrementKF(p, post_exp, post_var, new_obs, stateDim, obsDim)
    # unpack parameters
    unpacked = unpackParams(p, stateDim, obsDim)
    A = unpacked["A"]
    V = unpacked["V"]
    C = unpacked["C"]
    W = unpacked["W"]
    # predict
    prior_exp = A * post_exp
    prior_var = A * post_var * transpose(A) + V
    obs_prior_exp = C * prior_exp
    obs_prior_var = C * prior_var * transpose(C) + W
    # update
    residual = new_obs - obs_prior_exp
    obs_prior_cov = prior_var * transpose(C)
    kalman_gain = obs_prior_cov * inv(obs_prior_var)
    post_exp = prior_exp + kalman_gain * residual
    post_var = prior_var - kalman_gain * transpose(obs_prior_cov)
    # step likelihood
    dist = Gaussian(reshape(obs_prior_exp, (length(obs_prior_exp),)), Hermitian(obs_prior_var))
    log_like = logpdf(dist, new_obs)
    return Dict("post_exp"=>post_exp, "post_var"=>post_var, "log_like"=>log_like)
end

function indivKF(p, df, obsDict, init_exp, init_var, stateDim, obsDim, T, i)
    iData = df[i,:]
    cols = length(iData[obsDict[1]])
    # initialization
    post_exp = init_exp
    post_var = init_var
    init_obs = convert(Array, iData[obsDict[1]])[1:cols]
    dist = MvNormal(Matrix{Float64}(I, length(init_obs), length(init_obs)))
    log_like = logpdf(dist, init_obs)
    for t = 1:(T-1)
        # predict and update
        new_obs = convert(Array, iData[obsDict[t + 1]])[1:cols]
        
        new_post = incrementKF(p, post_exp, post_var, new_obs, stateDim, obsDim)
        # replace
        post_exp = new_post["post_exp"]
        post_var = Hermitian(new_post["post_var"])
        # contribute
        log_like += new_post["log_like"]
    end
    return log_like
end

function sampleKF(p,df,obsDict,init_exp,init_var,stateDim,obsDim,T)
    log_like = 0.0
    N = size(df, 1)
    for i in 1:N
        log_like += indivKF(p, df, obsDict, init_exp, init_var, stateDim, obsDim, T, i)
    end
    neg_avg_log_like = -log_like/N
    # println("current average negative log-likelihood: ", neg_avg_log_like)
    return neg_avg_log_like[1]
end


function estimateKF()
    params0 = [1.,0.,0.,1.,0.,0.,.5,-.5,.5,-.5,0.,1.,0.,-1.,0.,0.]
    stateDim = 2
    obsDim = 3
    T = 4
    N = 1000
    init_exp = zeros(stateDim)
    init_var = Matrix{Float64}(I, stateDim, stateDim)
    data = KalmanDGP(params0, stateDim, obsDim, N, T, init_exp, init_var)
    data = DataFrame(data)
    names!(data, [:one_1,:one_2,:one_3,:one_4,:one_5,:one_6,:two_1,:two_2,:two_3,:two_4,:two_5,:two_6,:three_1,:three_2,:three_3,:three_4,:three_5,:three_6,:four_1,:four_2,:four_3,:four_4,:four_5,:four_6,:outcome])
    obsDict = [[:one_1,:one_2,:one_3,:one_4,:one_5,:one_6],
            [:two_1,:two_2,:two_3,:two_4,:two_5,:two_6],
            [:three_1,:three_2,:three_3,:three_4,:three_5,:three_6],
            [:four_1,:four_2,:four_3,:four_4,:four_5,:four_6]]

    function wrapLoglike(params)
        print("current parameters: ", params)
        return sampleKF(params, data, obsDict, init_exp, init_var, stateDim, obsDim, T)
    end
    MLE = Optim.optimize(wrapLoglike, params0, ConjugateGradient(), Optim.Options(time_limit = 5.0))
    optimParams = unpackParams(MLE.minimizer, stateDim, obsDim)
    return optimParams
end

optimParams0 = estimateKF()