Below is code to compute the gradient of the log-likelihood of a conditional logit choice model. When I call the gradient function multiple times, I eventually run out of memory, even though everything created within the function should be garbage-collectable after each call.
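For reference, here is the per-observation gradient the code computes. For an observation with covariates X, fixed-effect indices J, and chosen alternative j, the choice probabilities are the softmax of the utilities, and the gradient of the log-likelihood $\ell$ is

$$s_k = \frac{\exp(x_k'\beta + Q_{J_k})}{\sum_{m=1}^{N}\exp(x_m'\beta + Q_{J_m})}, \qquad \frac{\partial \ell}{\partial \beta} = x_j - \sum_{k=1}^{N} s_k\, x_k, \qquad \frac{\partial \ell}{\partial Q_{J_k}} = \mathbf{1}\{k=j\} - s_k,$$

which is what update_dLL! accumulates below.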
The code below is a working example that simulates data and calls the function LL_grad
in a loop. It starts out using a few GB of memory on my machine, but after 100-200 calls of LL_grad
Julia runs out of memory (~150GB). The example includes a relatively large number of functions/definitions, but it should be possible to cut and paste it directly into the REPL. (A small diagnostic loop for checking whether the memory is reclaimable is included after the listing.) Note that one may need to adjust the data size (e.g., by reducing num_i
) to fit their available RAM. Also, I am using Julia v0.6.3.
Any thoughts about what could be causing this or suggestions about how to fix it would be greatly appreciated. Thank you!
using Parameters, Distributions, StatsBase #sample() comes from StatsBase
##################STRUCTS####################
@with_kw struct LogitData
    X::Matrix{Float64}  #N x num_b covariate matrix
    J::Vector{Int64}    #fixed-effect index of each alternative
    j::Int64            #index of the chosen alternative
    N::Int64            #size of the choice set
end
@with_kw struct ThetaLogit{T}
    β::Vector{T}  #coefficients on X
    Q::Vector{T}  #fixed effects
end
@with_kw struct LogitGradLL
    dβ::Vector{Float64}  #gradient w.r.t. β
    dQ::Vector{Float64}  #gradient w.r.t. Q
end
#Allocate zeroed gradient buffers sized to match θ.
LogitGradLL(θ::ThetaLogit) = LogitGradLL(zeros(length(θ.β)),zeros(length(θ.Q)))
############################################################################
################FUNCTIONS TO CREATE DATA###################
function rand_logit_obs(true_b,true_FEs,num_b,num_fac,N)
    X = randn(N,num_b)
    J = sample(1:num_fac,N;replace=false)  #N distinct alternatives
    u = X*true_b .+ view(true_FEs,J) .+ rand(Gumbel(),N)  #utilities with Gumbel shocks
    j = findmax(u)[2]  #chosen alternative
    return LogitData(X=X,J=J,j=j,N=N)
end
function create_data(num_b,num_fac,num_i,cs_range)
    cs_sizes = rand(cs_range,num_i)
    true_b = randn(num_b)
    true_FEs = randn(num_fac)
    true_FEs = true_FEs .- true_FEs[1] #Normalize to first fixed-effect.
    D = [rand_logit_obs(true_b,true_FEs,num_b,num_fac,cs_sizes[ii]) for ii=1:num_i]
    return D, true_b, true_FEs
end
###################################################
#########MAIN FUNCTIONS##########################
function update_dLL!(dLL,D_i,s::AbstractVector)
    @unpack dβ, dQ = dLL
    @unpack X, j, J, N = D_i
    @inbounds begin
        for xx = 1:size(X,2)
            dβ[xx] += X[j,xx] - dot(s,view(X,:,xx))  #x_j - sum_k s_k*x_k
        end
        for jj = 1:N
            dQ[J[jj]] -= s[jj]
        end
        dQ[J[j]] += 1.0  #indicator for the chosen alternative
    end
    return nothing
end
function LL_grad_i!(D_i,θ,dLL)
    @unpack X, J, j = D_i
    @unpack β, Q = θ
    @inbounds begin
        eu = exp.(X*β .+ view(Q,J))  #exponentiated utilities
        s = eu ./ sum(eu)            #softmax choice probabilities
        update_dLL!(dLL,D_i,s)
    end
    return nothing
end
function agg_dLLs(dLLs,N_i)
    f_names = fieldnames(dLLs[1])  #v0.6: fieldnames of an instance
    for ff in f_names
        #Sum the per-thread buffers into the first one.
        getfield(dLLs[1],ff) .= broadcast(+,[getfield(dLL,ff) for dLL in dLLs]...)
    end
    return reduce(vcat,[getfield(dLLs[1],ff) for ff in f_names]) ./ N_i
end
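#For reference, an equivalent splat-free way to write the accumulation above
#(a sketch; agg_dLLs_loop is a name I made up, but the result should match agg_dLLs):
function agg_dLLs_loop(dLLs,N_i)
    f_names = fieldnames(dLLs[1])
    for ff in f_names, kk = 2:length(dLLs)
        getfield(dLLs[1],ff) .+= getfield(dLLs[kk],ff)  #accumulate in place
    end
    return reduce(vcat,[getfield(dLLs[1],ff) for ff in f_names]) ./ N_i
end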
function LL_grad(D,θ,dLLs)
    θ.Q[1] = 0.0 #Normalize one FE.
    N_i = length(D)::Int64
    clear_struct!(dLLs)  #zero out the per-thread gradient buffers
    Threads.@threads for ii=1:length(D)
        thd_id = Threads.threadid()
        D_i = D[ii]
        @inbounds LL_grad_i!(D_i,θ,dLLs[thd_id])
    end
    grad = -agg_dLLs(dLLs,N_i)::Vector{Float64}
    grad[length(θ.β)+1] = 0.0 #Make the first Q gradient zero.
    return grad
end
#Wrapper that builds a ThetaLogit from a parameter vector; D, num_b, and dLLs
#are globals (the ::T annotations are type assertions).
LL_grad(θ_vec) = LL_grad(D::Vector{LogitData},
                         ThetaLogit(β=θ_vec[1:num_b],Q=θ_vec[(num_b+1):end])::ThetaLogit{Float64},
                         dLLs::Vector{LogitGradLL})::Vector{Float64}
####ANCILLARY FUNCTIONS###########
function clear_struct!(x)
    for ff in fieldnames(x)
        getfield(x,ff) .= 0.0
    end
    return nothing
end
function clear_struct!(x::Vector)
    for ii in eachindex(x)
        clear_struct!(x[ii])
    end
    return nothing
end
###############################################
#Create the data.
num_b = 5
num_FE = 1_000
num_i = 500_000
cs_range = 30:150
D, true_β, true_Q = create_data(num_b,num_FE,num_i,cs_range)
θ_ex = ThetaLogit(β=zeros(num_b),Q=zeros(num_FE))
dLLs = [LogitGradLL(θ_ex) for tt=1:Threads.nthreads()] #one gradient buffer per thread
#Repeatedly call LL_grad
for itr = 1:1_000
    println("Iter: ",itr)
    curr_θ = randn(num_b+num_FE)
    LL_grad(curr_θ)
end
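And here is the diagnostic variant of the driver loop mentioned above. It forces a full collection between calls and reports per-call allocations, which should show whether the memory is reclaimable garbage or a genuine leak (gc() and @allocated are standard in v0.6; the loop itself is just a sketch):
#Diagnostic: force GC between calls and report allocations per call.
for itr = 1:20
    curr_θ = randn(num_b+num_FE)
    bytes = @allocated LL_grad(curr_θ)
    gc()  #full collection (Julia v0.6; GC.gc() on v1.0+)
    println("Iter: ",itr,"  allocated: ",round(bytes/1e9,2)," GB")
end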