As you mention that the optimizations are independent, an obvious starting point would be to add multithreading.
using BenchmarkTools, Optim, Random
Random.seed!(42)
N = 10_000
# (Your d was not used, so I removed it.)
target_min = zeros(N)
a_opt = zeros(N)
f = (a, n) -> (a .+ n) .^ 2
function multi_optimize!(target_min, a_opt, f)
    for n in eachindex(target_min)
        opt = optimize(a -> f(a, n)[1], [0.])
        target_min[n] = opt.minimum
        a_opt[n] = opt.minimizer[1]
    end
end
function multi_optimize_threaded!(target_min, a_opt, f)
    Threads.@threads for n in eachindex(target_min) # or Polyester.@batch, or via @spawn, or ...
        opt = optimize(a -> f(a, n)[1], [0.])
        target_min[n] = opt.minimum
        a_opt[n] = opt.minimizer[1]
    end
end
@btime multi_optimize!($target_min, $a_opt, $f)
@btime multi_optimize_threaded!($target_min, $a_opt, $f)
gives on my machine (nthreads() == 8):
117.082 ms (3229113 allocations: 95.15 MiB)
37.693 ms (3229154 allocations: 95.15 MiB)
(In all likelihood N will be const in your actual problem, in which case you could mark it as such and use N in multi_optimize!. But the performance gain will almost surely be negligible.)
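For concreteness, that variant might look like the following sketch (the function name is just illustrative, and it assumes you declare N as const from the start, in a fresh session, rather than reassigning the existing global):
const N = 10_000
function multi_optimize_constN!(target_min, a_opt, f)
    for n in 1:N # the loop bound is now the global const N instead of eachindex(target_min)
        opt = optimize(a -> f(a, n)[1], [0.])
        target_min[n] = opt.minimum
        a_opt[n] = opt.minimizer[1]
    end
end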
Reducing allocations (only partially successful)
Note that there are many allocations: every evaluation of f allocates, and additionally we create a new Vector [0.] for every value of n.
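As a quick sanity check (a sketch; a0 is just an example input, and exact timings will differ on your machine), you can benchmark a single call to f:
a0 = [0.0]
@btime $f($a0, 1) # roughly one small allocation per call, for the newly created result vector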
Here’s the best I can come up with at the moment to reduce the number of allocations:
using BenchmarkTools, Optim, Random
using .Threads, .Iterators # brings nthreads, @spawn and partition into scope
Random.seed!(42)
N = 10_000
target_min = zeros(N)
a_opt = zeros(1, N) # size(a_opt, 1) is the dimensionality of the problem; you could make this a const
initials = similar(a_opt) # initial vectors for the optimization problems
# f! is like f, but mutates a supplied input/output vector
function f!(y, a, n)
    y .= (a .+ n) .^ 2
end
function multi_optimize_threaded_mutating!(target_min, a_opt, f!, initials)
    d = size(a_opt, 1)
    # Divide all (N) optimization problems (indices n) over a number of tasks
    nb_tasks = nthreads()
    ns = eachindex(target_min)
    ns_per_task = partition(ns, cld(length(ns), nb_tasks))
    # Spawn tasks
    tasks = map(ns_per_task) do task_ns
        @spawn begin
            # Allocate once per task (thread)
            task_target_min = zeros(length(task_ns)) # task-specific output
            task_a_opt = zeros(d, length(task_ns))   # idem
            task_initial = zeros(d)                  # initial vector, will be reused for each n
            task_y = zeros(d)                        # output for f!, reused
            for (i, n) in enumerate(task_ns)
                # Wrap f! to produce a scalar-valued function
                opt_func = function(a)
                    f!(task_y, a, n)
                    return task_y[1]
                end
                task_initial .= @view(initials[:, n])
                opt = optimize(opt_func, task_initial)
                task_target_min[i] = opt.minimum
                task_a_opt[:, i] .= opt.minimizer
            end
            return task_target_min, task_a_opt
        end
    end
    # Fill the global outputs target_min and a_opt using the task-specific outputs
    for (task_idx, task_ns) in enumerate(ns_per_task)
        tsk_target_min, tsk_a_opt = fetch(tasks[task_idx])
        target_min[task_ns] .= tsk_target_min
        a_opt[:, task_ns] .= tsk_a_opt
    end
end
with results
@btime multi_optimize_threaded_mutating!($target_min, $a_opt, $f!, $initials);
27.691 ms (1588448 allocations: 53.39 MiB)
So this roughly halves the number of allocations. It seems that every optimize call still allocates a lot though, but I’m not knowledgeable enough about Optim.jl to know how to improve this.
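If you want to see where those remaining allocations come from, you could benchmark a single optimize call in isolation (a sketch; exact numbers depend on your Optim.jl version and the default Nelder-Mead settings):
x0 = [0.0]
@btime optimize(a -> (a[1] + 1.0)^2, $x0) # a single solve already performs a few hundred allocations on its own,
                                          # consistent with the ~320 allocations per call implied by the benchmarks above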
EDIT: In the code above you don’t need task_target_min or task_a_opt and can instead write directly to target_min and a_opt. This doesn’t have any noticeable impact on performance (though we save a whopping 2 * nb_tasks (= 16) allocations), but it makes the code a bit simpler.
Simpler multi_optimize_threaded_mutating!
function multi_optimize_threaded_mutating!(target_min, a_opt, f!, initials)
    d = size(a_opt, 1)
    # Divide all (N) optimization problems (indices n) over a number of tasks
    nb_tasks = nthreads()
    ns = eachindex(target_min)
    ns_per_task = partition(ns, cld(length(ns), nb_tasks))
    # Spawn tasks
    tasks = map(ns_per_task) do task_ns
        @spawn begin
            # Allocate once per task (thread)
            task_initial = zeros(d) # initial vector, will be reused for each n
            task_y = zeros(d)       # output for f!, reused
            for n in task_ns
                # Wrap f! to produce a scalar-valued function
                opt_func = function(a)
                    f!(task_y, a, n)
                    return task_y[1]
                end
                task_initial .= @view(initials[:, n])
                opt = optimize(opt_func, task_initial)
                target_min[n] = opt.minimum
                a_opt[:, n] .= opt.minimizer
            end
        end
    end
    for task in tasks
        wait(task)
    end
    # or wait.(tasks), but this returns a Vector{Nothing}
end
Now, presumably your actual optimization problems are somewhat related (as they are in the example here). In that case you can, for example, try initializing subsequent problems with the solutions to previous ones.
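For example, here is a rough sketch of that idea (the name multi_optimize_warmstart! is just illustrative; it assumes consecutive n have nearby minimizers and otherwise mirrors the code above): within each task, the solution for one n is reused as the initial point for the next, and initials only seeds the first problem of each task.
using Optim
using .Threads, .Iterators # nthreads, @spawn, partition, as above

function multi_optimize_warmstart!(target_min, a_opt, f!, initials)
    d = size(a_opt, 1)
    ns = eachindex(target_min)
    ns_per_task = partition(ns, cld(length(ns), nthreads()))
    tasks = map(ns_per_task) do task_ns
        @spawn begin
            task_y = zeros(d)
            task_initial = zeros(d)
            task_initial .= @view(initials[:, first(task_ns)]) # only the first n in each task uses initials
            for n in task_ns
                opt_func = a -> (f!(task_y, a, n); task_y[1])
                opt = optimize(opt_func, task_initial)
                target_min[n] = opt.minimum
                a_opt[:, n] .= opt.minimizer
                task_initial .= opt.minimizer # warm start for the next n
            end
        end
    end
    foreach(wait, tasks)
end
Whether this pays off depends entirely on how close consecutive solutions actually are in your problem.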