I played with it a bit; you can try this for zero-allocation multithreading. It just reuses the same preallocated vector of per-chunk gradients. I used 30 threads/chunks.
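(For anyone not familiar with ChunkSplitters: `index_chunks` just splits the index range into contiguous, roughly equal blocks, one per task, so each task only ever writes to its own slot of `grads`. A minimal sketch of what it returns:)
using ChunkSplitters
# splits 1:10 into 3 contiguous, roughly equal index ranges (e.g. 1:4, 5:7, 8:10)
collect(index_chunks(1:10, n = 3))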
function ∇LL_par(θ, data, grads, idx_chunks)
    ndata = length(data)
    @batch for i in eachindex(grads)
        grads[i] = SVector(0.0, 0.0, 0.0)
        for j in idx_chunks[i]
            obs = data[j]
            X = to_X(obs)
            e = exp(dot(θ, X))
            grads[i] += (obs.y - e/(1+e)) * X
        end
    end
    return @SVector [sum(grads[i][j] for i in eachindex(grads))/ndata for j in 1:3]
end

function gradient_descent_par(θ_init, data, grads, idx_chunks; τ = 0.02, tol = 1e-5)
    iter = 1
    error = 1.0
    θ_sol = θ_init
    while error > tol
        θ_new = θ_sol + τ * ∇LL_par(θ_sol, data, grads, idx_chunks)
        error = maximum(abs.(θ_new .- θ_sol))
        @reset θ_sol = θ_new
        iter += 1
    end
    # println("Converged in $(iter) iterations.")
    return θ_sol
end
θ = @SVector [-7.0, 1.0, 0.0];
data = simulate_logit_data(n_obs = 30000);
nchunks = 30
idx_chunks = index_chunks(1:length(data), n = nchunks);
grads0 = @MVector [SVector(0.0, 0.0, 0.0) for _ in 1:nchunks];
@benchmark ∇LL_no_allocs($θ, $data)
@benchmark ∇LL_par($θ, $data, $grads0, $idx_chunks)
@benchmark gradient_descent_no_allocs2($θ, $data) seconds = 20
@benchmark gradient_descent_par($θ, $data, $grads0, $idx_chunks) seconds = 20
Benchmarks:
julia> @benchmark ∇LL_no_allocs($θ, $data)
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
Range (min … max): 280.796 μs … 629.116 μs ┊ GC (min … max): 0.00% … 0.00%
Time (median): 284.687 μs ┊ GC (median): 0.00%
Time (mean ± σ): 287.641 μs ± 20.585 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▄▃ ▃█▁ ▇▇▁▁▇▂▂▃▅▂▂▃▂▁▁▃▁ ▃ ▂ ▁ ▁▁ ▂
██▄███▇████████████████████▇▆██▇▇██▅▆██▆▆▇█▆▇▆▆▆▅▆▇▅▅▄▅▅▁▃▄▄▃ █
281 μs Histogram: log(frequency) by time 310 μs <
Memory estimate: 0 bytes, allocs estimate: 0.
julia> @benchmark ∇LL_par($θ, $data, $grads0, $idx_chunks)
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
Range (min … max): 85.582 μs … 287.277 μs ┊ GC (min … max): 0.00% … 0.00%
Time (median): 92.612 μs ┊ GC (median): 0.00%
Time (mean ± σ): 95.644 μs ± 9.853 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▄▇██▅▂
▂▂▃▅████████▇▅▅▄▄▄▃▃▄▄▃▃▃▃▂▂▂▂▂▂▁▂▂▂▁▂▂▁▂▂▂▂▂▂▂▂▃▃▃▃▂▃▂▂▃▂▂▂ ▃
85.6 μs Histogram: frequency by time 132 μs <
Memory estimate: 0 bytes, allocs estimate: 0.
julia> @benchmark gradient_descent_no_allocs2($θ, $data) seconds = 20
BenchmarkTools.Trial: 3 samples with 1 evaluation.
Range (min … max): 9.133 s … 9.142 s ┊ GC (min … max): 0.00% … 0.00%
Time (median): 9.136 s ┊ GC (median): 0.00%
Time (mean ± σ): 9.137 s ± 4.273 ms ┊ GC (mean ± σ): 0.00% ± 0.00%
█ █ █
█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
9.13 s Histogram: frequency by time 9.14 s <
Memory estimate: 0 bytes, allocs estimate: 0.
julia> @benchmark gradient_descent_par($θ, $data, $grads0, $idx_chunks) seconds = 20
BenchmarkTools.Trial: 7 samples with 1 evaluation.
Range (min … max): 3.049 s … 3.091 s ┊ GC (min … max): 0.00% … 0.00%
Time (median): 3.064 s ┊ GC (median): 0.00%
Time (mean ± σ): 3.069 s ± 17.705 ms ┊ GC (mean ± σ): 0.00% ± 0.00%
█ █ █ █ █ █ █
█▁▁█▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁█▁▁▁█ ▁
3.05 s Histogram: frequency by time 3.09 s <
Memory estimate: 0 bytes, allocs estimate: 0.
Full Script
using StaticArrays
using BenchmarkTools
using LinearAlgebra
using Accessors
using ChunkSplitters, Polyester
struct Observation{T<:Real}
    y::T
    x1::T
    x2::T
end
# efficiently turn Observation into a vector of statically known size
to_X(o::Observation) = SVector(1.0, o.x1, o.x2)
# this function will generate the logit data for Q4
function simulate_logit_data(; n_obs = 300000)
    # define the DGP
    X1 = 1.0:0.01:8.0
    X2 = 18.0:0.01:40.0
    α = -8.0
    θ_1 = 1.5
    θ_2 = 0.02
    # pre-allocate the container for the data
    data = Array{Observation{Float64}}(undef, n_obs)
    for i in eachindex(data)
        x1 = rand(X1)
        x2 = rand(X2)
        u = rand()
        y = (α + θ_1*x1 + θ_2*x2 + log(u/(1-u)) > 0) ? 1.0 : 0.0
        data[i] = Observation(y, x1, x2)
    end
    return data
end
function ∇LL(
    θ,
    data
)
    grad = @MArray(zeros(length(θ)))
    for i in eachindex(data)
        obs = data[i]
        X = to_X(obs)
        e = exp(dot(θ, X))
        grad .+= (obs.y - e/(1+e)) * X
    end
    return grad / length(data)
end
# compute the gradient
function ∇LL_no_allocs(
    θ,
    data
)
    grad = zero(SVector{length(θ), Float64})
    for i in eachindex(data)
        obs = data[i]
        X = to_X(obs)
        e = exp(dot(θ, X))
        grad += (obs.y - e/(1+e)) * X
    end
    return grad / length(data)
end
function gradient_descent(
    θ_init,
    data;
    τ = 0.02,
    tol = 1e-5
)
    iter = 1
    error = 1.0
    θ_sol = @MArray(zeros(length(θ_init)))
    θ_new = @MArray(zeros(length(θ_init)))
    θ_sol .= θ_init
    while error > tol
        θ_new .= θ_sol + τ * ∇LL(θ_sol, data)
        error = maximum(abs.(θ_new .- θ_sol))
        θ_sol .= θ_new
        iter += 1
    end
    # println("Converged in $(iter) iterations.")
    return θ_sol
end
function gradient_descent_no_allocs(
    θ_init,
    data;
    τ = 0.02,
    tol = 1e-5
)
    iter = 1
    error = 1.0
    θ_sol = @MArray(zeros(length(θ_init)))
    θ_new = @MArray(zeros(length(θ_init)))
    θ_sol .= θ_init
    while error > tol
        θ_new .= θ_sol + τ * ∇LL_no_allocs(θ_sol, data)
        error = maximum(abs.(θ_new .- θ_sol))
        θ_sol .= θ_new
        iter += 1
    end
    # println("Converged in $(iter) iterations.")
    return θ_sol
end
function gradient_descent_no_allocs2(
    θ_init,
    data;
    τ = 0.02,
    tol = 1e-5
)
    iter = 1
    error = 1.0
    θ_sol = θ_init
    while error > tol
        θ_new = θ_sol + τ * ∇LL_no_allocs(θ_sol, data)
        error = maximum(abs.(θ_new .- θ_sol))
        @reset θ_sol = θ_new
        iter += 1
    end
    # println("Converged in $(iter) iterations.")
    return θ_sol
end
####################
### New Versions ###
####################
function ∇LL_par(θ, data, grads, idx_chunks)
    ndata = length(data)
    @batch for i in eachindex(grads)
        grads[i] = SVector(0.0, 0.0, 0.0)
        for j in idx_chunks[i]
            obs = data[j]
            X = to_X(obs)
            e = exp(dot(θ, X))
            grads[i] += (obs.y - e/(1+e)) * X
        end
    end
    return @SVector [sum(grads[i][j] for i in eachindex(grads))/ndata for j in 1:3]
end

function gradient_descent_par(θ_init, data, grads, idx_chunks; τ = 0.02, tol = 1e-5)
    iter = 1
    error = 1.0
    θ_sol = θ_init
    while error > tol
        θ_new = θ_sol + τ * ∇LL_par(θ_sol, data, grads, idx_chunks)
        error = maximum(abs.(θ_new .- θ_sol))
        @reset θ_sol = θ_new
        iter += 1
    end
    # println("Converged in $(iter) iterations.")
    return θ_sol
end
θ = @SVector [-7.0, 1.0, 0.0];
data = simulate_logit_data(n_obs = 30000);
nchunks = 30
idx_chunks = index_chunks(1:length(data), n = nchunks);
grads0 = @MVector [SVector(0.0, 0.0, 0.0) for _ in 1:nchunks];
@benchmark ∇LL_no_allocs($θ, $data)
@benchmark ∇LL_par($θ, $data, $grads0, $idx_chunks)
@benchmark gradient_descent_no_allocs2($θ, $data) seconds = 20
@benchmark gradient_descent_par($θ, $data, $grads0, $idx_chunks) seconds = 20
x = gradient_descent_no_allocs2(θ, data)
y = gradient_descent_par(θ, data, grads0, idx_chunks)
x == y
There is a small rounding difference when summing the batches though: the chunked reduction adds the per-observation terms in a different order than the serial loop, and floating-point addition is not associative, so the two solutions differ in the last few bits.
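A quick way to check that the discrepancy is only at roundoff level (a sketch, using the `x` and `y` from the script above):
# x == y is false, but the two solutions agree up to floating-point roundoff
x ≈ y                   # isapprox with the default relative tolerance
maximum(abs.(x .- y))   # magnitude of the discrepancy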