ForwardDiff.jl returns NaN values in Hessian matrix

Hi guys, here is a demo of maximum likelihood estimation. Below, sample is used to simulate the data, and neg_loglikelihood is used to compute the negative log-likelihood.

I wonder why ForwardDiff.jl returns a Hessian matrix H that is all NaN values (BTW, the gradient is correct). Are there any rules we should follow to avoid NaN values?

I am sure there are no numerical issues within neg_loglikelihood.

using BenchmarkTools, InteractiveUtils
using Random, Distributions
using LogExpFunctions, SpecialFunctions
using Optim, LineSearches, ForwardDiff
using LinearAlgebra: diag, dot
using StatsBase: CoefTable

function sample(rng, Inds, Tlen, Q0, v20, v2En, QE, alpha2, beta0, beta1, gamma0, gamma1)
    d = zeros(Int64, Inds, Tlen)
    n = zeros(Int64, Inds, Tlen)
    s = zeros(Int64, Inds, Tlen)
    demographic = randn(rng, Inds)

    for i in axes(d, 1)
        m = Q0
        v2m = v20
        v2Es = exp(gamma0 + gamma1 * demographic[i])

        for t in axes(d, 2)
            d[i, t] = rand(rng, Poisson(3))
            n[i, t] = rand(rng, Poisson(3))

            # decision process
            alpha1 = beta0 + beta1 * demographic[i]
            U = alpha1 * m + alpha2 * d[i, t]
            p1 = exp(U) / (1 + exp(U))
            s[i, t] = rand(rng, Binomial(d[i, t], p1))

            # learning process
            QEn = randn(rng) * sqrt(v2En * n[i, t]) + QE * n[i, t]
            QEs = randn(rng) * sqrt(v2Es * s[i, t]) + QE * s[i, t]

            denominator = 1 / v2m + n[i, t] / v2En + s[i, t] / v2Es
            m = (m / v2m + QEn / v2En + QEs / v2Es) / denominator
            v2m = 1 / denominator
        end
    end
    return d, n, s, demographic
end
begin
    rng = Random.seed!(2024 - 5 - 28)
    Inds = 300
    Tlen = 20
    Q0 = 1.0
    v20 = exp(6.0)
    v2En = exp(3.0)
    QE = 5.0
    alpha2 = 0.5
    beta0 = 0.5
    beta1 = 0.5
    gamma0 = 2.0
    gamma1 = 2.0
    d, n, s, demographic = sample(rng, Inds, Tlen, Q0, v20, v2En, QE, alpha2, beta0, beta1, gamma0, gamma1)
end

function neg_loglikelihood(rng, pars::Vector{T}, Q0, v20, d, n, s, demographic, nsam) where {T <: Real}
    QE = pars[1]
    v2En = exp(pars[2])
    alpha2 = pars[3]
    beta0 = pars[4]
    beta1 = pars[5]
    gamma0 = pars[6]
    gamma1 = pars[7]

    ll = zero(T)
    for i in axes(d, 1)
        lli = zeros(T, nsam)
        @inbounds for j in 1:nsam
            m = Q0
            v2m = v20
            v2Es = exp(gamma0 + gamma1 * demographic[i])
            for t in axes(d, 2)
                alpha1 = beta0 + beta1 * demographic[i]
                U = alpha1 * m + alpha2 * d[i, t]

                lli[j] += logfactorial(d[i, t]) - logfactorial(s[i, t]) - logfactorial(d[i, t] - s[i, t]) +
                    s[i, t] * (U - log1pexp(U)) + (d[i, t] - s[i, t]) * (-log1pexp(U))

                QEn = randn(rng) * sqrt(v2En * n[i, t]) + QE * n[i, t]
                QEs = randn(rng) * sqrt(v2Es * s[i, t]) + QE * s[i, t]

                denominator = 1 / v2m + n[i, t] / v2En + s[i, t] / v2Es
                m = (m / v2m + QEn / v2En + QEs / v2Es) / denominator
                v2m = 1 / denominator
            end
        end
        ll += logsumexp(lli) + log(1 / nsam)
    end
    return -ll
end

rng = Random.seed!(2024 - 5 - 30)
pars_true = [QE, log(v2En), alpha2, beta0, beta1, gamma0, gamma1]

H = ForwardDiff.hessian(pars -> neg_loglikelihood(rng, pars, Q0, v20, d, n, s, demographic, 30), [-30.8633, 5.7517, 0.5360, -0.0774, -0.0808, -42.3168, 56.3017])

Are you sure? I get NaNs in the gradient too with your code.
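
For example, a quick check at the same evaluation point (I get true here):

g = ForwardDiff.gradient(pars -> neg_loglikelihood(rng, pars, Q0, v20, d, n, s, demographic, 30),
    [-30.8633, 5.7517, 0.5360, -0.0774, -0.0808, -42.3168, 56.3017])
any(isnan, g)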

There are zeros in the matrix s. This leads to NaNs in the computation of the derivative of QEs, because it involves sqrt(v2Es * s[i, t]): the derivative of sqrt(x) is 1 / (2 * sqrt(x)), which is Inf at x = 0, and the dual arithmetic then hits 0 * Inf = NaN.
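
A minimal reproduction of that failure mode, with the literal 0 standing in for a zero s[i, t] (the function is constant in v2, but forward mode still evaluates the sqrt rule at zero):

using ForwardDiff
ForwardDiff.derivative(v2 -> sqrt(v2 * 0), 1.0)  # NaN: partial is 0 * (1 / (2 * sqrt(0))) = 0 * Inf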


If I skip the inner loop iteration with n[i, t] * s[i, t] == 0 && continue, there are still NaNs in the Hessian, but not in the gradient. The NaNs disappear if I use BigFloat, though it's slow:

ForwardDiff.hessian(pars -> neg_loglikelihood(rng, pars, Q0, v20, d, n, s, demographic, 30),
    BigFloat.([-30.8633, 5.7517, 0.5360, -0.0774, -0.0808, -42.3168, 56.3017]))

This indicates that something in the computation underflows with Float64, generating an intermediate 0.0/0.0 or Inf/Inf computation, or similar. Perhaps the formulas can be rewritten to avoid quantities very close to zero?
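
The basic mechanism is easy to reproduce in isolation:

tiny = exp(-800.0)  # underflows to 0.0 in Float64
huge = 1 / tiny     # Inf
huge / huge         # NaN

One candidate rewrite, as an untested sketch: when a count is zero, the corresponding sampled signal is identically zero and independent of the parameters, so the sqrt-at-zero can be branched away term by term. Unlike the continue guard above, this keeps the binomial likelihood contribution of that time step:

QEn = n[i, t] == 0 ? zero(v2En) :
    randn(rng) * sqrt(v2En * n[i, t]) + QE * n[i, t]
QEs = s[i, t] == 0 ? zero(v2Es) :
    randn(rng) * sqrt(v2Es * s[i, t]) + QE * s[i, t]

That said, with gamma0 ≈ -42 and gamma1 ≈ 56 at this evaluation point, v2Es itself can be astronomically small for negative values of demographic[i], so the divisions by v2Es in the posterior update may also need rescaling before the Hessian is reliable in Float64.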