Thanks a ton. This worked with minor tweaks to your fg!. Question for you: do we need to pre-initialize G somewhere inside fg!?
Copying the full code and output for anyone who runs into something similar. I converted the likelihood function to a log-likelihood and adjusted the gradients accordingly, since that is the more typical setup in statistics/economics.
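For reference, here is the math the code below implements (just restating the two functions in equation form, with $p_i$ the probability that $y_i = 1$; the minus signs are there because the code minimizes the negative average log-likelihood):

$$
p_i = \frac{e^{\beta x_i}}{1 + e^{\beta x_i}}, \qquad
\ell(\beta) = -\frac{1}{1000}\sum_{i=1}^{1000}\big[y_i \log p_i + (1 - y_i)\log(1 - p_i)\big], \qquad
\frac{\partial \ell}{\partial \beta} = -\frac{1}{1000}\sum_{i=1}^{1000}(y_i - p_i)\,x_i.
$$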
#packages needed for the code below
using Distributed, SharedArrays, DataFrames, StatsBase, Distributions, Optim, ForwardDiff
#data
data = DataFrame(x = repeat([1,2], 500), y = sample([0,1], 1000))
#log-likelihood
function likelihood(beta::Array{T}, data::R) where {T, R}
    Prob = SharedArray{T}(1000, 1)
    # the (+) reducer is only there so that @distributed blocks until the loop finishes;
    # the summed value itself is discarded
    @distributed (+) for i in 1:1000
        if data[!, :y][i] == 1
            Prob[i] = exp(beta[1] * data[!, :x][i]) / (1 + exp(beta[1] * data[!, :x][i]))
        else
            Prob[i] = 1 / (1 + exp(beta[1] * data[!, :x][i]))
        end
    end
    return Prob, -sum(log.(Prob))/1000
end
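In case it helps, a plain serial sketch of the same computation, useful for cross-checking (no SharedArrays or @distributed; likelihood_serial is just an illustrative name and assumes a Float64 beta):
#serial cross-check of the log-likelihood (illustrative only)
function likelihood_serial(beta, data)
    p = zeros(nrow(data))
    for i in 1:nrow(data)
        e = exp(beta[1] * data.x[i])
        p[i] = data.y[i] == 1 ? e / (1 + e) : 1 / (1 + e)   # probability of the observed outcome
    end
    return p, -sum(log.(p)) / nrow(data)
end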
#manual gradients of log-likelihood
function gradient_likelihood(Prob, data::R) where {R}
    grad = SharedArray{Float64}(1000, 1)
    @distributed (+) for i in 1:1000
        if data[!, :y][i] == 1
            # for y = 1, Prob[i] = p, so this is just (1 - p) * x
            grad[i] = (Prob[i] * (1 - Prob[i]) * data[!, :x][i]) / Prob[i]
        else
            # for y = 0, Prob[i] = 1 - p, so this is just -p * x
            grad[i] = - (Prob[i] * (1 - Prob[i]) * data[!, :x][i]) / Prob[i]
        end
    end
    return -sum(grad)/1000
end
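Since the Prob[i] in the numerator and denominator cancel, the gradient above is just the usual (y - p) logit score. A serial sketch of that form, in case it is useful for cross-checking (gradient_serial is just an illustrative name):
#serial cross-check of the gradient in (y - p) form (illustrative only)
function gradient_serial(beta, data)
    g = 0.0
    for i in 1:nrow(data)
        p1 = exp(beta[1] * data.x[i]) / (1 + exp(beta[1] * data.x[i]))   # P(y = 1 | x)
        g += (data.y[i] - p1) * data.x[i]                                # score contribution of obs i
    end
    return -g / nrow(data)
end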
#check results
julia> likelihood([1.0], data)
([0.2689414213699951; 0.8807970779778824; … ; 0.2689414213699951; 0.8807970779778824], 0.9240948492805974)
julia> gradient_likelihood(likelihood([1.0], data)[1], data)
0.4503263672928852
julia> ForwardDiff.gradient(x -> likelihood(x, data)[2], [1.0])
1-element Array{Float64,1}:
0.4503263672928852
#avoid repeating computations in Optim.optimize
function fg!(F, G, beta)
    Prob = SharedArray{Float64}(1000, 1)
    @distributed (+) for i in 1:1000
        if data[!, :y][i] == 1
            Prob[i] = exp(beta[1] * data[!, :x][i]) / (1 + exp(beta[1] * data[!, :x][i]))
        else
            Prob[i] = 1 / (1 + exp(beta[1] * data[!, :x][i]))
        end
    end
    if G !== nothing
        grad = SharedArray{Float64}(1000, 1)
        @distributed (+) for i in 1:1000
            if data[!, :y][i] == 1
                grad[i] = (Prob[i] * (1 - Prob[i]) * data[!, :x][i]) / Prob[i]
            else
                grad[i] = - (Prob[i] * (1 - Prob[i]) * data[!, :x][i]) / Prob[i]
            end
        end
        # write the gradient into the array G in place (G[1] = -sum(grad)/1000 is equivalent here)
        copyto!(G, -sum(grad)/1000)
    end
    if F !== nothing
        return -sum(log.(Prob))/1000
    end
end
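For what it's worth, fg! can also be called directly with a hand-allocated G to sanity-check it outside of optimize (illustrative only; the results should match likelihood and gradient_likelihood at the same beta):
#quick standalone check of fg!
G = zeros(1)
fval = fg!(1.0, G, [1.0])   # F = 1.0 just needs to be non-nothing; fills G in place and returns the objective
fval, G[1]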
julia> Optim.optimize(Optim.only_fg!(fg!), rand(Uniform(), 1), Optim.LBFGS(), Optim.Options(show_trace = true))
Iter     Function value   Gradient norm
     0     7.616226e-01     2.766918e-01
 * time: 0.0
     1     6.915962e-01     1.335341e-02
 * time: 0.031000137329101562
     2     6.914531e-01     1.507051e-06
 * time: 0.04999995231628418
     3     6.914531e-01     1.105449e-15
 * time: 0.07500004768371582
 * Status: success

 * Candidate solution
    Minimizer: [7.37e-02]
    Minimum:   6.914531e-01

 * Found with
    Algorithm:     L-BFGS
    Initial Point: [5.61e-01]

 * Convergence measures
    |x - x'|               = 2.42e-06 ≰ 0.0e+00
    |x - x'|/|x'|          = 3.29e-05 ≰ 0.0e+00
    |f(x) - f(x')|         = 1.83e-12 ≰ 0.0e+00
    |f(x) - f(x')|/|f(x')| = 2.64e-12 ≰ 0.0e+00
    |g(x)|                 = 1.11e-15 ≤ 1.0e-08

 * Work counters
    Seconds run:  0  (vs limit Inf)
    Iterations:   3
    f(x) calls:   9
    ∇f(x) calls:  9