Optim.jl violates inequality constraint

I’m trying to do maximum likelihood estimation with constraints:

p[1] + p[2] + p[3] + ... == 1
p[1] * a[1] / (1 - b[1]) + p[2] * a[2] / (1 - b[2]) + ... < 1

However, Optim seems to ignore the inequality constraint.

Constraints code

I set up both constraints like this:

function constr1(θ::AV)
    par = get_params(θ)

    sum(par[:p]) - 1 # == 0
end

function constr2(θ::AV)
    par = get_params(θ)
    p, α, β = par[:p], par[:α], par[:β]

    sum(p .* α ./ (1 .- β)) - 1 # <= 0
end

constr(θ::AV) = [constr1(θ), constr2(θ)]

# In-place version: fills `c` with the constraint values, which is the form Optim expects
constr!(c::AV, θ::AV)::AV = (c .= constr(θ); c)

# Constraint Jacobian, filled into `J` in place via ForwardDiff
constr_jac!(J::AM, θ::AV)::AM = ForwardDiff.jacobian!(J, constr, θ)

function constr_hess!(H::AM, θ::AV, λ::AV)::AM
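    # As far as I understand, Optim expects the λ-weighted constraint Hessians
    # to be accumulated into H in place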
    myH = similar(H)

    ForwardDiff.hessian!(myH, constr1, θ)
    H .+= λ[1] .* myH

    ForwardDiff.hessian!(myH, constr2, θ)
    H .+= λ[2] .* myH

    H
end

Here, get_params extracts parameter vectors p, μ, ω, α, β from the parameter vector θ.
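For context, here is a minimal sketch of what this snippet assumes get_params does: return something indexable by symbol, e.g. a NamedTuple of views into θ (the real definition is in the gist linked below):

# Minimal sketch (assumption): split θ into 5 equal blocks and return a NamedTuple,
# so that par[:p], par[:α], par[:β] etc. work
function get_params(θ::AV)
    K = length(θ) ÷ 5
    (
        p = view(θ, 1:K),
        μ = view(θ, K+1:2K),
        ω = view(θ, 2K+1:3K),
        α = view(θ, 3K+1:4K),
        β = view(θ, 4K+1:5K),
    )
end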

Setting up optimization problem

K = 2
# Initial guess
θ0 = [
    ones(K) ./ K; # p
    zeros(K); # μ
    rand(K); # ω
    zeros(K) .+ 1e-5; # α
    zeros(K) .+ 1e-5 # β
]

objective(θ) = D(θ, 1.0, data)

# Lower bounds
θ_lo = [
    zeros(K); # p
    fill(-Inf, K); # μ
    zeros(K); # ω
    zeros(K); zeros(K) # α, β
]
# Upper bounds
θ_hi = [
    ones(K);
    fill(Inf, K);
    fill(Inf, K);
    fill(Inf, K); fill(1., K); # α, β
]

# constr1 == 0
# -Inf <= constr2 <= 0
constr_lo = [0, -Inf]
constr_hi = [0, 0.]

dfc = TwiceDifferentiableConstraints(
    constr!, constr_jac!, constr_hess!,
    θ_lo, θ_hi, constr_lo, constr_hi
)

optimize(objective, dfc, θ0, IPNewton(), autodiff=:forward)

As far as I understand, constr(θ) must return a vector of two elements (because there are two constraints), where:

  • the first element must be exactly zero, since that constraint is an equality (== 0)
  • the second element must be at most zero, since that constraint is ... - 1 < 0

Indeed, constr(θ0) == [0.0, -0.999989999899999], so the initial guess seems fine.
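This can be double-checked against the bounds defined above (a quick sketch):

# Sanity check (sketch): the initial guess satisfies both constraint bounds
c0 = constr(θ0)
@assert constr_lo[1] <= c0[1] <= constr_hi[1] # equality constraint: exactly zero
@assert constr_lo[2] <= c0[2] <= constr_hi[2] # inequality constraint: at most zero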

Problem

However, optimization fails after some steps. If I add @show constr(ForwardDiff.value.(θ)) inside the objective function, I eventually get this:

constr(ForwardDiff.value.(θ)) = [4.440892098500626e-16, 0.06428113096769317]

…where the second element is positive, so the inequality constraint is violated.


What am I doing wrong with the inequality constraint?

A reproducible example can be found here: https://gist.github.com/ForceBru/847d5b214c2102547f056dcb1956cdae

Any help? Please? Any pointers? Anything?

  • I put @asserts everywhere; they all pass.
  • I used autodiff for the constraint derivatives (instead of my manual Jacobian/Hessian) - same error.
  • I changed the order of the constraints - this didn’t help.
  • I changed the inequality constraint from <= 0 to >= 0 - it didn’t help.
  • I replaced broadcasting like sum(p .* α ./ (1 .- β)) with generator expressions, like sum(p[k] * α[k] / (1 - β[k]) for k in eachindex(p)) (see the sketch below). I know Symbolics.jl can’t differentiate broadcasted expressions, so maybe they’re the problem here… even though Optim uses ForwardDiff? Yet nothing changed after I did that.
  • I tried a finite value instead of Inf for the inequality constraint’s upper bound - didn’t help.
  • I tried multiple versions of Optim.jl: 1.6.1, 1.6.0, 1.5.0, 1.4.0 and 1.3.0 - same error everywhere.

It’s like the inequality constraint isn’t there…
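For reference, the generator-expression variant of constr2 from the list above looked roughly like this (a sketch, not the exact code from the gist):

# constr2 rewritten with a generator expression instead of broadcasting
function constr2(θ::AV)
    par = get_params(θ)
    p, α, β = par[:p], par[:α], par[:β]

    sum(p[k] * α[k] / (1 - β[k]) for k in eachindex(p)) - 1 # <= 0
end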


Here’s the updated code, littered with @asserts and without the manual derivative computations (the constraint Jacobian and Hessian are now handled by autodiff).

import Pkg
Pkg.activate(temp=true, io=devnull)
Pkg.add(name="Optim", version="1.6.1", io=devnull)
Pkg.add(["ForwardDiff", "Distributions"], io=devnull)
Pkg.status()

import Random
Random.seed!(0)

using Optim, ForwardDiff, Distributions

const AV = AbstractVector{T} where T;
const AM = AbstractMatrix{T} where T;

# ===== Functions to be optimized =====
function D_t(p::AV, μ::AV, var::AV, b::Real, X::Real)
    K = length(p)
    @assert K == 2

    p1 = sum(@. p^2 * sqrt(π / (b + var)))
    p2 = sum(
        @. p * sqrt(2π / (2b + var)) * exp(-(X - μ)^2 / (4b + 2var))
    )
    p3 = sum(
        p[k] * p[h]
        * sqrt(2π / (2b + var[k] + var[h]))
        * exp(-(μ[k] - μ[h])^2 / (4b + 2var[k] + 2var[h]))
        for k ∈ 1:K, h ∈ 1:K
    )

    sqrt(π/b) + p1 - 2p2 + 2 * (p3 - p1)
end

"E(X^2)"
function var_err(p::AV, μ::AV, ω::AV, α::AV, β::AV)::Real
    @assert length(p) == 2
    @assert length(μ) == length(p)
    @assert length(ω) == length(p)
    @assert length(α) == length(p)
    @assert length(β) == length(p)

    idx = eachindex(p)

    s1 = sum(p[k] * μ[k]^2 for k in idx)
    @assert s1 >= 0
    s2 = sum(p[k] * ω[k] / (1 - β[k]) for k in idx)
    @assert s2 >= 0

    # Exactly the same as `constraints[1]`
    # in `constr!(constraints, θ::AV)` below
    s3 = 1 - sum(p[k] * α[k] / (1 - β[k]) for k in idx)

    @show ForwardDiff.value(s3)

    var = (s1 + s2) / s3
    if !(var ≥ 0)
        @assert s3 <= 0
        @show ForwardDiff.value(var)
        @show ForwardDiff.value.(p) ForwardDiff.value.(μ) ForwardDiff.value.(ω) ForwardDiff.value.(α) ForwardDiff.value.(β)
        @assert var ≥ 0
    end

    var
end

"E(σ_k^2)"
function expected_var(p::AV, μ::AV, ω::AV, α::AV, β::AV)
    verr = var_err(p, μ, ω, α, β)

    @. (ω + α * verr) / (1 - β)
end

function D(p::AV, μ::AV, ω::AV, α::AV, β::AV, b::Real, X::AV{<:Real})
    var = expected_var(p, μ, ω, α, β)
    @assert size(var) == size(p)

    ret = D_t(p, μ, var, b, X[1])
    for t ∈ 2:length(X)
        @. var = ω + α * X[t-1]^2 + β * var
        ret += D_t(p, μ, var, b, X[t])
    end

    ret
end

"Extract parameters from vector θ"
function get_params(θ::AV)
    @assert length(θ) % 5 == 0
    K = length(θ) ÷ 5 # 5 variables
    @assert K == 2

    p = @view θ[1:K]
    μ = @view θ[K+1:2K]
    ω = @view θ[2K+1:3K]
    α = @view θ[3K+1:4K]
    β = @view θ[4K+1:end]

    @assert length(p) == K
    @assert length(μ) == K
    @assert length(ω) == K
    @assert length(α) == K
    @assert length(β) == K

    @assert all(p .>= 0)
    @assert all(ω .>= 0)
    @assert all(α .>= 0)
    @assert all(β .>= 0)

    (p, μ, ω, α, β)
end

# ===== Constraints =====
function constr!(constraints, θ::AV)
    p, μ, ω, α, β = get_params(θ)
    idx = eachindex(p)

    @assert length(constraints) == 2

    constraints[2] = sum(p) - 1 # == 0
    constraints[1] = 1 - sum(p[k] * α[k] / (1 - β[k]) for k in idx) # >= 0

    constraints
end

constr(θ::AV) = constr!(zeros(eltype(θ), 2), θ)

"Final objective function for Optim"
D(θ::AV, b::Real, X::AV{<:Real}) = D(get_params(θ)..., b, X)


# ===== Try to optimize D =====
@info "Generating random data..."
distr = UnivariateGMM([-1, 1], [0.2, .3], Categorical([.4, .6]))
data = rand(distr, 100)

θ0 = [
    1/2; 1/2; # p
    0; 0; # μ
    rand(2); # ω
    1e-5; 1e-5; # α
    1e-5; 1e-5 # β
]
@info "Initial value" θ0
@show get_params(θ0)

@show constr(θ0)

@info "Setting up optimization problem..."
objective(θ) = D(θ, 1.0, data)

θ_lo = [
    0, 0, # p
    -Inf, -Inf, # μ
    0, 0, # ω
    0, 0, 0, 0 # α, β
]
θ_hi = [
    1, 1,
    Inf, Inf,
    Inf, Inf,
    Inf, Inf, 1, 1 # α, β
]
# θ0 must lie strictly in the interior of the box constraints
@assert all(θ0 .> θ_lo)
@assert all(θ0 .< θ_hi)

# 0 <= constraints[1] < Inf (inequality)
# constraints[2] == 0 (equality)
constr_lo = Float64[0, 0]
constr_hi = Float64[Inf, 0]
@assert all(constr(θ0) .≥ constr_lo)
@assert all(constr(θ0) .≤ constr_hi)

dfc = TwiceDifferentiableConstraints(
    constr!,
    θ_lo, θ_hi, constr_lo, constr_hi,
    :forward # use autodiff!
)

@info "Optimizing..."
ret = optimize(objective, dfc, θ0, IPNewton(), autodiff=:forward)

display(ret)

What am I doing wrong here??

What do you mean when you say the “optimization fails”?

And have you tried another package like JuMP or NLopt?

By “optimization fails” I mean that Optim.jl violates the inequality constraint, as stated in the title.

When I run this code, the output looks like this:

      Status `/private/var/folders/ys/3h0gnqns4b98zb66_vl_m35m0000gn/T/jl_8iC0Np/Project.toml`
  [31c24e10] Distributions v0.25.46
  [f6369f11] ForwardDiff v0.10.25
  [429524aa] Optim v1.6.1
[ Info: Generating random data...
┌ Info: Initial value
│   θ0 =
│    10-element Vector{Float64}:
│     0.5
│     0.5
│     0.0
│     0.0
│     0.3076608766072988
│     0.2803873329461656
│     1.0e-5
│     1.0e-5
│     1.0e-5
└     1.0e-5
get_params(θ0) = ([0.5, 0.5], [0.0, 0.0], [0.3076608766072988, 0.2803873329461656], [1.0e-5, 1.0e-5], [1.0e-5, 1.0e-5])
constr(θ0) = [0.999989999899999, 0.0]
[ Info: Setting up optimization problem...
constr! = constr!
[ Info: Optimizing...
ForwardDiff.value(s3) = 0.999989999899999
ForwardDiff.value(s3) = Dual{ForwardDiff.Tag{typeof(objective), Float64}}(0.999989999899999,-1.000010000100001e-5,-1.000010000100001e-5,-0.0,-0.0,-0.0,-0.0,-0.5000050000500005,-0.5000050000500005,-5.00010000150002e-6,-5.00010000150002e-6)
ForwardDiff.value(s3) = 0.9999809154864348
...
ForwardDiff.value(s3) = 0.9965220053647488
ForwardDiff.value(s3) = Dual{ForwardDiff.Tag{typeof(objective), Float64}}(0.9965220053647488,-0.0035224798058174664,-0.003433432142628737,-0.0,-0.0,-0.0,-0.0,-0.5018859397772173,-0.5010145138215388,-0.001767883087688969,-0.0017201993356783809)
ForwardDiff.value(s3) = -0.06428220890828795
ForwardDiff.value(var) = -1.5104538785258217
ForwardDiff.value.(p) = [0.6340530871847483, 0.3659469128152516]
ForwardDiff.value.(μ) = [0.29075393704923885, 0.09769969796228732]
ForwardDiff.value.(ω) = [0.03220713502201594, 0.0002772290095625074]
ForwardDiff.value.(α) = [0.5171137635535503, 0.581144621641674]
ForwardDiff.value.(β) = [0.4868815931762428, 0.49994821923123]
ERROR: LoadError: AssertionError: var ≥ 0

So, ForwardDiff.value(s3) is positive at first, as it must be (s3 is literally the value of the inequality constraint, constraints[1] in constr!), but then it becomes negative: ForwardDiff.value(s3) = -0.06428220890828795. Since var = (s1 + s2) / s3 with s1, s2 ≥ 0, the model variance becomes negative as well:

ForwardDiff.value(var) = -1.5104538785258217

This doesn’t make any sense, because that inequality constraint is there specifically to prevent the variance from being negative!

I tried JuMP + Ipopt, and it doesn’t violate the inequality.
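For comparison, the same box bounds and constraints can be written in JuMP roughly like this (a sketch only, not the exact code I ran; the MLE objective would still have to be registered as a user-defined nonlinear function, which is omitted here):

using JuMP, Ipopt

K = 2
model = Model(Ipopt.Optimizer)

# Box bounds, matching θ_lo / θ_hi above
@variable(model, 0 <= p[1:K] <= 1, start = 1/K)
@variable(model, μ[1:K], start = 0.0)
@variable(model, ω[1:K] >= 0, start = 0.3)
@variable(model, α[1:K] >= 0, start = 1e-5)
@variable(model, 0 <= β[1:K] <= 1, start = 1e-5)

# The two constraints
@constraint(model, sum(p) == 1)
@NLconstraint(model, sum(p[k] * α[k] / (1 - β[k]) for k in 1:K) <= 1)

# The objective D(θ, 1.0, data) would be added via `register` + `@NLobjective`.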
