I am trying to speed up my code using the `Threads.@threads` macro, splitting an array into multiple views so that each thread works on a separate view, but I am not getting the performance gains I had hoped for.
Consider the following MWE:
using Chairmarks
"Partition `A` into `n` equally sized views"
function partition(A::AbstractVector, n::Integer)
i0 = firstindex(A)
in = lastindex(A)
slot_boundaries = round.(Int, range(i0-1, in; length=n + 1))
views = [view(A, (slot_boundaries[i]+1):(slot_boundaries[i+1])) for i in 1:n]
return views
end
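For reference, here is how `partition` splits a small vector (the view lengths can differ by one because the boundaries are rounded):
partition(collect(1:10), 3)
#=
3-element Vector{SubArray{Int64, 1, Vector{Int64}, Tuple{UnitRange{Int64}}, true}}:
 [1, 2, 3]
 [4, 5, 6, 7]
 [8, 9, 10]
=#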
"""
tmap!(f!, A, n_threads)
Apply `f!` to `A` paralellising over `n_threads`.
`f!(view::Subarray{T}, aux_vec::Vector{T}, i::Int, v::T)` may mutate
`view` at `i` and use `aux_vec` as scratch space.
"""
function tmap!(f!::Function,
A::AbstractVector{T},
n_threads = Threads.nthreads()) where {T}
aux_vecs = [Vector{T}() for _ in 1:n_threads]
views = partition(A, n_threads)
Threads.@threads for (view, aux_vec) in collect(zip(views, aux_vecs))
for (i, v) in pairs(view)
f!(view, aux_vec, i, v)
end
end
return A
end
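As a quick sanity check, the threaded version agrees with the plain elementwise computation:
B = rand(10)
C = 1.1 .* B
tmap!(B, 4) do view, aux_vec, i, v
    @inbounds view[i] = 1.1 * v
end
B == C
#=
true
=#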
Benchmarking this, first without ever using the `aux_vec`s, gives a modest speed-up:
A = rand(100_000_000)
@bs tmap!(A, 1) do view, aux_vec, i, v
    @inbounds view[i] = 1.1 * v
end
#=
Chairmarks.Benchmark: 2 samples with 1 evaluation.
Range (min … max): 91.355 ms … 112.100 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 101.727 ms ┊ GC (median): 0.00%
Time (mean ± σ): 101.727 ms ± 14.669 ms ┊ GC (mean ± σ): 0.00% ± 0.00%
█ █
█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
91.4 ms Histogram: frequency by time 112 ms <
Memory estimate: 4.09 KiB, allocs estimate: 70.
=#
@bs tmap!(A, 10) do view, aux_vec, i, v
    @inbounds view[i] = 1.1 * v
end
#=
Chairmarks.Benchmark: 4 samples with 1 evaluation.
Range (min … max): 30.469 ms … 30.788 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 30.606 ms ┊ GC (median): 0.00%
Time (mean ± σ): 30.617 ms ± 138.751 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
█ █ █ █
█▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
30.5 ms Histogram: frequency by time 30.8 ms <
Memory estimate: 5.25 KiB, allocs estimate: 73.
=#
but when also using the `aux_vec`s there is almost no speed-up left:
@bs tmap!(A, 1) do view, aux_vec, i, v
    @inbounds view[i] = 1.1 * v
    push!(aux_vec, v)
end
#=
Chairmarks.Benchmark: 1 sample with 1 evaluation.
Single result which took 2.049 s (17.75% GC) to evaluate,
with a memory estimate of 2.69 GiB, over 120972 allocations.
=#
@bs tmap!(A, 4) do view, aux_vec, i, v
    @inbounds view[i] = 1.1 * v
    push!(aux_vec, v)
end
#=
Chairmarks.Benchmark: 1 sample with 1 evaluation.
Single result which took 1.685 s (35.45% GC) to evaluate,
with a memory estimate of 2.38 GiB, over 120964 allocations.
=#
@bs tmap!(A, 10) do view, aux_vec, i, v
    @inbounds view[i] = 1.1 * v
    push!(aux_vec, v)
end
#=
Chairmarks.Benchmark: 1 sample with 1 evaluation.
Single result which took 839.061 ms (42.43% GC) to evaluate,
with a memory estimate of 2.30 GiB, over 485 allocations.
=#
I find this very surprising, because each thread is given its own separate `aux_vec`, so I wouldn't expect any concurrency issues there.
Note that in this MWE I am not really doing anything with the `aux_vec`s, but in the actual code I will do a reduce operation after the `tmap!`.
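For context, the eventual pattern will be roughly the shape below. This is a hypothetical sketch, not the real code: it assumes the reduction runs over the per-thread scratch vectors, which a `tmap!` variant (here called `tmap_aux`) would have to return instead of discarding.
# Hypothetical sketch: identical to `tmap!`, except that it returns the
# per-thread scratch vectors so they can be reduced afterwards.
function tmap_aux(f!::Function,
                  A::AbstractVector{T},
                  n_threads = Threads.nthreads()) where {T}
    aux_vecs = [Vector{T}() for _ in 1:n_threads]
    views = partition(A, n_threads)
    Threads.@threads for (view, aux_vec) in collect(zip(views, aux_vecs))
        for (i, v) in pairs(view)
            f!(view, aux_vec, i, v)
        end
    end
    return aux_vecs
end

# Example reduction: sum everything the threads pushed.
aux_vecs = tmap_aux(A, Threads.nthreads()) do view, aux_vec, i, v
    @inbounds view[i] = 1.1 * v
    push!(aux_vec, v)
end
total = sum(sum, aux_vecs)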