I am relatively new to Julia. I am writing code to optimize an objective function that is called many times. The most expensive intermediate step is a function called recover_δ!, a fixed-point iteration implemented with the SQUAREM algorithm.
What I cannot understand is why running the same function recover_δ! twice inside one wrapper function generates 81 times more allocations (6k vs. 504k) and takes 218 times longer (5.7 ms vs. 1.2 s) than a single benchmarked call. Here is what I mean.
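For context, SQUAREM accelerates a fixed-point iteration x ↦ F(x) by combining two successive evaluations of the map into one extrapolated step. Below is a minimal sketch of the generic update, just to show the structure; fixed_point_map! is a hypothetical stand-in for one contraction step, while my actual implementation (further down) works on market-level slices:

using LinearAlgebra: norm

# Sketch of one generic SQUAREM step (illustration only).
# `fixed_point_map!` is a hypothetical stand-in for a single
# evaluation of the contraction mapping.
function squarem_step!(x_new, x1, x2, r, v, x, fixed_point_map!)
    fixed_point_map!(x1, x)                 # first map evaluation
    fixed_point_map!(x2, x1)                # second map evaluation
    r .= x1 .- x                            # first-order residual
    v .= x2 .- 2 .* x1 .+ x                 # second-order residual
    α = norm(r) / norm(v)                   # extrapolation factor
    x_new .= x .+ (2α) .* r .+ (α^2) .* v   # extrapolated update
    return x_new
end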
Running the code twice separately
Benchmarking the function in two separate calls looks like this:
δ_test = copy(δ_logit)
β_test1 = [-3.5; -0.8; zeros(n_β-2)]
Xβ_init1 = X_nl * β_test1
time1 = @benchmark recover_δ!($δ_test, $Xβ_init1)
δ_test = copy(δ_logit)
β_test2 = [-2.0; -0; zeros(n_β-2)]
Xβ_init2 = X_nl * β_test2
time2 = @benchmark recover_δ!($δ_test, $Xβ_init2)
This gives as output (for time1 and time2, respectively):
BenchmarkTools.Trial: 870 samples with 1 evaluation.
Range (min … max): 3.544 ms … 14.373 ms ┊ GC (min … max): 0.00% … 73.78%
Time (median): 4.273 ms ┊ GC (median): 0.00%
Time (mean ± σ): 5.743 ms ± 2.922 ms ┊ GC (mean ± σ): 29.20% ± 26.51%
█▇▄▂▃▂
███████▃▂▂▁▁▁▁▁▁▂▂▁▂▂▂▁▂▃▂▂▃▃▂▂▃▃▃▃▃▃▃▂▃▃▃▃▃▃▃▂▃▃▃▃▃▃▂▂▂▂▂ ▃
3.54 ms Histogram: frequency by time 13.2 ms <
Memory estimate: 40.66 MiB, allocs estimate: 6138.
BenchmarkTools.Trial: 883 samples with 1 evaluation.
Range (min … max): 3.528 ms … 15.513 ms ┊ GC (min … max): 0.00% … 71.82%
Time (median): 4.208 ms ┊ GC (median): 0.00%
Time (mean ± σ): 5.655 ms ± 2.910 ms ┊ GC (mean ± σ): 29.08% ± 26.45%
█▇▅▅▆▆▅ ▁▁▁ ▁ ▁
███████▇▁▁▁▁▁▁▁▁▆▅▁▄▅▁▆▆▆▆▆▇▆▇█▇▇██████▆▆██▇███▅███▇▇▆▆▅▁▆ █
3.53 ms Histogram: log(frequency) by time 13.2 ms <
Memory estimate: 40.66 MiB, allocs estimate: 6192.
Running the code twice all at once
On the other hand, suppose we define a function that runs it twice, starting each run from the same point:
Xβ_init1 = X_nl * β_test1
Xβ_init2 = X_nl * β_test2
function run_twice(Xβ_init1, Xβ_init2)
    δ_test = copy(δ_logit) # added to emphasize that they are starting from the same point
    recover_δ!(δ_test, Xβ_init1)
    δ_test = copy(δ_logit) # added to emphasize that they are starting from the same point
    recover_δ!(δ_test, Xβ_init2)
end
@benchmark run_twice($Xβ_init1, $Xβ_init2)
The above code produces massively different results.
BenchmarkTools.Trial: 5 samples with 1 evaluation.
Range (min … max): 1.178 s … 1.272 s ┊ GC (min … max): 18.20% … 22.65%
Time (median): 1.239 s ┊ GC (median): 19.96%
Time (mean ± σ): 1.231 s ± 37.123 ms ┊ GC (mean ± σ): 20.27% ± 2.11%
█ █ █ █ █
█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁█ ▁
1.18 s Histogram: frequency by time 1.27 s <
Memory estimate: 7.81 GiB, allocs estimate: 503622.
I am completely lost as to why running the same code twice inside one function produces such massively different results from running it in two separate benchmark calls. Running run_twice without the δ_test = copy(δ_logit) lines produces almost the same results as the above (the mean time and allocation estimates are virtually the same, if not higher).
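Concretely, by "without the copy lines" I mean this variant (run_twice_nocopy is just a label for it; δ_test here is the global defined earlier):

function run_twice_nocopy(Xβ_init1, Xβ_init2)
    # δ_test is the global buffer defined above; no fresh copy per call
    recover_δ!(δ_test, Xβ_init1)
    recover_δ!(δ_test, Xβ_init2)
end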
I just don't get what I am doing wrong, or how to optimize the code further. The code for this function is below:
function recover_δ!(δ, Xβ)
    # Initialize variables for ForwardDiff
    len_δ = length(log_share)
    type_Xβ = eltype(Xβ)
    δk_new = zeros(type_Xβ, len_δ);
    δ1 = zeros(type_Xβ, len_δ);
    δ2 = zeros(type_Xβ, len_δ);
    δk = zeros(type_Xβ, len_δ);
    rk = zeros(type_Xβ, len_δ);
    vk = zeros(type_Xβ, len_δ);
    σk_agg = zeros(type_Xβ, len_δ);
    σ1_agg = zeros(type_Xβ, len_δ);
    σk_ind = similar(Xβ);
    σ1_ind = similar(Xβ);
    Threads.@threads for i in eachindex(mkt_starts_chunks)
        # Select observations of the full data
        idx = mkt_starts_chunks[i]:mkt_ends_chunks[i]
        # Get IDs of things in the data
        id_alt = @view id_fd_alt[idx]
        id_ind = @view id_fd_ind[idx];
        id_dt = id_alt[1]:id_alt[end];
        # Get slices to work on
        wt_mkt = @view wt_per_mkt[idx];
        Xβ_slice = @view Xβ[idx]
        sh_slice = @view log_share[id_dt];
        NoOut_slice = @view NoOut[id_dt];
        δ_slice = @view δ[id_dt];
        # Get slices for δ auxiliary variables
        δk_slice = @view δk[id_dt];
        δk_new_slice = @view δk_new[id_dt];
        δ1_slice = @view δ1[id_dt];
        δ2_slice = @view δ2[id_dt][NoOut_slice];
        # Get slices of rk, vk
        rk_slice = @view rk[id_dt][NoOut_slice]
        vk_slice = @view vk[id_dt][NoOut_slice]
        # Get slices for σk, and σ1
        σk_agg_slice = @view σk_agg[id_dt]
        σ1_agg_slice = @view σ1_agg[id_dt]
        # Get slices of σk_ind and σ1_ind
        σk_ind_slice = @view σk_ind[idx]
        σ1_ind_slice = @view σ1_ind[idx]
        # Renormalize for sum
        id_alt = id_alt .- id_alt[1] .+ 1;
        id_ind = id_ind .- id_ind[1] .+ 1;
        # Get new values of delta
        squarem_δ!(δ_slice, sh_slice, Xβ_slice, id_alt, id_ind, wt_mkt, NoOut_slice, i,
                   rk_slice, vk_slice, σk_agg_slice, σ1_agg_slice, σk_ind_slice, σ1_ind_slice,
                   δk_new_slice, δ1_slice, δ2_slice, δk_slice)
    end
    return δ
end
function squarem_δ!(δ, log_share, Xβ, id_fd_alt, id_fd_ind, wt_per_mkt, NoOut, chunk_id,
                    rk, vk, σk_agg, σ1_agg, σk_ind, σ1_ind, δk_new, δ1, δ2, δk)
    # Parameters for the squarem
    stepmin_α = 1.0;
    stepmax_α = 1.0;
    step_α = 4;
    # PRECOMPUTE PARAMETERS FOR SPEED
    log_market_share_NoOut = @view log_share[NoOut]
    # INITIALIZE VARIABLES
    type_Xβ = eltype(Xβ)
    δk .= type_Xβ.(δ);
    δ .= δk;
    # Computing the additional views we need
    δ_view = @view δ[NoOut];
    δ1_view = @view δ1[NoOut];
    δk_view = @view δk[NoOut];
    δk_new_view = @view δk_new[NoOut]
    σ1_view = @view σ1_agg[NoOut];
    σk_view = @view σk_agg[NoOut];
    for i = 1:max_iter
        # Compute first iteration
        compute_model_σ!(σk_ind, δk, Xβ, id_fd_alt, id_fd_ind);
        wt_average!(σk_agg, σk_ind, wt_per_mkt, id_fd_alt); # need to allocate zero vector
        # View and replace elements for faster iteration
        δ1_view .= δk_view .+ log_market_share_NoOut .- log.(σk_view)
        # Compute second iteration
        compute_model_σ!(σ1_ind, δ1, Xβ, id_fd_alt, id_fd_ind);
        wt_average!(σ1_agg, σ1_ind, wt_per_mkt, id_fd_alt);
        # View and replace elements for faster iteration
        δ2 .= δ1_view .+ log_market_share_NoOut .- log.(σ1_view)
        # Compute step vectors
        rk .= δ1_view .- δk_view
        vk .= δ2 .- 2 .* δ1_view .+ δk_view
        # Calculate extrapolation factor
        norm_rk = norm(rk)
        norm_vk = norm(vk)
        if norm_vk < tol_norm || norm_rk < tol_norm
            # Markets have converged with one iteration
            δ_view .= δ2
            return δ
        else
            # Update needed
            α_candidate = norm_rk / norm_vk
            α = max(stepmin_α, min(stepmax_α, α_candidate))
            # Update the estimate
            δk_new_view .= δk_view .+ (2*α) .* rk .+ (α^2) .* vk
            # Compute the difference in estimates
            diff = maximum(abs.(δk_new .- δk));
            if diff < tol
                copyto!(δ, δk_new)
                return δ
            end
            if α == stepmax_α
                stepmax_α = stepmax_α * step_α
            end
            copyto!(δk, δk_new)
            fill!(σk_agg, zero(eltype(σk_agg)))
            fill!(σ1_agg, zero(eltype(σk_agg)))
        end
    end
    diff = maximum(abs.(σk_agg .- exp.(log_share)))
    @warn "Some markets did not converge in chunk $chunk_id, max diff in shares is $diff."
    copyto!(δ, δk_new)
    return δ
end
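One idea I have been toying with, in case it is relevant: recover_δ! allocates eight full-length buffers with zeros on every call, so I could preallocate them once per element type and reuse them across calls. A rough sketch, assuming a hypothetical DeltaBuffers struct (this is not what the code above does yet):

# Hypothetical preallocated workspace, so the eight len_δ buffers
# are allocated once rather than on every call to recover_δ!
struct DeltaBuffers{T}
    δk_new::Vector{T}
    δ1::Vector{T}
    δ2::Vector{T}
    δk::Vector{T}
    rk::Vector{T}
    vk::Vector{T}
    σk_agg::Vector{T}
    σ1_agg::Vector{T}
end

# Allocate all eight buffers once for a given element type and length
DeltaBuffers(::Type{T}, n::Integer) where {T} =
    DeltaBuffers{T}((zeros(T, n) for _ in 1:8)...)

recover_δ! would then take a DeltaBuffers argument, with one set of buffers per element type (Float64 and the ForwardDiff dual type), instead of calling zeros eight times per call. I have not measured whether this helps, and it does not explain the run_twice puzzle above.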
Any guidance would be greatly appreciated!