ForwardDiff.jl returns NaN values in Hessian matrix

Hi guys, here is a demo of maximum likelihood estimation. Below, sample is used to simulate the data, and neg_loglikelihood is used to compute the negative log-likelihood.

I wonder why ForwardDiff.jl returns a Hessian matrix H that is all NaN values (BTW, the gradient is correct). Are there any rules we should follow to avoid NaN values?

I am sure there are no numerical issues within neg_loglikelihood.

using BenchmarkTools, InteractiveUtils
using Random, Distributions
using LogExpFunctions, SpecialFunctions
using Optim, LineSearches, ForwardDiff
using LinearAlgebra: diag, dot
using StatsBase: CoefTable

function sample(rng, Inds, Tlen, Q0, v20, v2En, QE, alpha2, beta0, beta1, gamma0, gamma1)
    d = zeros(Int64, Inds, Tlen)
    n = zeros(Int64, Inds, Tlen)
    s = zeros(Int64, Inds, Tlen)
    demographic = randn(rng, Inds)

    for i in axes(d, 1)
        m = Q0
        v2m = v20
        v2Es = exp(gamma0 + gamma1 * demographic[i])

        for t in axes(d, 2)
            d[i, t] = rand(rng, Poisson(3))
            n[i, t] = rand(rng, Poisson(3))

            # decision process
            alpha1 = beta0 + beta1 * demographic[i]
            U = alpha1 * m + alpha2 * d[i, t]
            p1 = exp(U) / (1 + exp(U))
            s[i, t] = rand(rng, Binomial(d[i, t], p1))

            # learning process
            QEn = randn(rng) * sqrt(v2En * n[i, t]) + QE * n[i, t]
            QEs = randn(rng) * sqrt(v2Es * s[i, t]) + QE * s[i, t]

            denominator = 1 / v2m + n[i, t] / v2En + s[i, t] / v2Es
            m = (m / v2m + QEn / v2En + QEs / v2Es) / denominator
            v2m = 1 / denominator
        end
    end
    return d, n, s, demographic
end
begin
    rng = Random.seed!(2024 - 5 - 28)
    Inds = 300
    Tlen = 20
    Q0 = 1.0
    v20 = exp(6.0)
    v2En = exp(3.0)
    QE = 5.0
    alpha2 = 0.5
    beta0 = 0.5
    beta1 = 0.5
    gamma0 = 2.0
    gamma1 = 2.0
    d, n, s, demographic = sample(rng, Inds, Tlen, Q0, v20, v2En, QE, alpha2, beta0, beta1, gamma0, gamma1)
end

function neg_loglikelihood(rng, pars::Vector{T}, Q0, v20, d, n, s, demographic, nsam) where {T <: Real}
    QE = pars[1]
    v2En = exp(pars[2])
    alpha2 = pars[3]
    beta0 = pars[4]
    beta1 = pars[5]
    gamma0 = pars[6]
    gamma1 = pars[7]

    ll = zero(T)
    for i in axes(d, 1)
        lli = zeros(T, nsam)
        @inbounds for j in 1:nsam
            m = Q0
            v2m = v20
            v2Es = exp(gamma0 + gamma1 * demographic[i])
            for t in axes(d, 2)
                alpha1 = beta0 + beta1 * demographic[i]
                U = alpha1 * m + alpha2 * d[i, t]

                lli[j] += logfactorial(d[i, t]) - logfactorial(s[i, t]) - logfactorial(d[i, t] - s[i, t]) +
                    s[i, t] * (U - log1pexp(U)) + (d[i, t] - s[i, t]) * (-log1pexp(U))

                QEn = randn(rng) * sqrt(v2En * n[i, t]) + QE * n[i, t]
                QEs = randn(rng) * sqrt(v2Es * s[i, t]) + QE * s[i, t]

                denominator = 1 / v2m + n[i, t] / v2En + s[i, t] / v2Es
                m = (m / v2m + QEn / v2En + QEs / v2Es) / denominator
                v2m = 1 / denominator
            end
        end
        ll += logsumexp(lli) + log(1 / nsam)
    end
    return -ll
end

rng = Random.seed!(2024 - 5 - 30)
pars_true = [QE, log(v2En), alpha2, beta0, beta1, gamma0, gamma1]

H = ForwardDiff.hessian(pars -> neg_loglikelihood(rng, pars, Q0, v20, d, n, s, demographic, 30), [-30.8633, 5.7517, 0.5360, -0.0774, -0.0808, -42.3168, 56.3017])

Are you sure? I get NaNs in the gradient too with your code.
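
For example, a quick check at the same evaluation point (I get true here):

g = ForwardDiff.gradient(pars -> neg_loglikelihood(rng, pars, Q0, v20, d, n, s, demographic, 30),
    [-30.8633, 5.7517, 0.5360, -0.0774, -0.0808, -42.3168, 56.3017])
any(isnan, g)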

There are zeros in the matrix s. This leads to NaNs in the computation of the derivative of QEs, because it involves sqrt(v2Es * s[i, t]): the derivative of sqrt(x) is 1 / (2 * sqrt(x)), which is Inf at x = 0, and the dual arithmetic then hits 0 * Inf = NaN.
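
A minimal reproduction of that failure mode, with the literal 0 standing in for a zero s[i, t] (the function is constant in v2, but forward mode still evaluates the sqrt rule at zero):

using ForwardDiff
ForwardDiff.derivative(v2 -> sqrt(v2 * 0), 1.0)  # NaN: partial is 0 * (1 / (2 * sqrt(0))) = 0 * Inf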


If I skip the inner loop iteration with n[i, t] * s[i, t] == 0 && continue, there are still NaNs in the Hessian, but not in the gradient. The NaNs disappear if I use BigFloat, though it's slow:

ForwardDiff.hessian(pars -> neg_loglikelihood(rng, pars, Q0, v20, d, n, s, demographic, 30),
    BigFloat.([-30.8633, 5.7517, 0.5360, -0.0774, -0.0808, -42.3168, 56.3017]))

This indicates that something in the computation underflows with Float64, generating an intermediate 0.0/0.0 or Inf/Inf computation, or similar. Perhaps the formulas can be rewritten to avoid quantities very close to zero?
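
The basic mechanism is easy to reproduce in isolation:

tiny = exp(-800.0)  # underflows to 0.0 in Float64
huge = 1 / tiny     # Inf
huge / huge         # NaN

One candidate rewrite, as an untested sketch: when a count is zero, the corresponding sampled signal is identically zero and independent of the parameters, so the sqrt-at-zero can be branched away term by term. Unlike the continue guard above, this keeps the binomial likelihood contribution of that time step:

QEn = n[i, t] == 0 ? zero(v2En) :
    randn(rng) * sqrt(v2En * n[i, t]) + QE * n[i, t]
QEs = s[i, t] == 0 ? zero(v2Es) :
    randn(rng) * sqrt(v2Es * s[i, t]) + QE * s[i, t]

That said, with gamma0 ≈ -42 and gamma1 ≈ 56 at this evaluation point, v2Es itself can be astronomically small for negative values of demographic[i], so the divisions by v2Es in the posterior update may also need rescaling before the Hessian is reliable in Float64.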