You need to allocate one array of results for each thread and add them at the end. Packages as Floops.jl
do that for you.
I use this package in many cases: GitHub - m3g/ChunkSplitters.jl: Simple chunk splitters for parallel loop executions
With this pattern:
using ChunkSplitters
function chunk_splitters(N, list; nchunks = Threads.nthreads())
results = [ zeros(N) for _ in 1:nchunks ]
Threads.@threads for (list_range, ichunk) in chunks(list, nchunks)
for i in list_range
i, j, d = list[i]
results[ichunk][i] += d
results[ichunk][j] += d
end
end
return sum(results)
end
Note, however, that with such a simple operation, you will probably be memory-bounded (see this thread for some discussion about this).
I can get a good speedup from multi-threading for a list of pairs of about 3 millions pairs.
Here is a complete running example, in which I preallocate the arrrays used for multi-threading:
Code
using Test
using CellListMap
using StaticArrays
using BenchmarkTools
using ChunkSplitters
function single_threaded(result, list)
result .= 0.0
for i in eachindex(list)
i, j, d = list[i]
result[i] += d
result[j] += d
end
return result
end
function chunk_splitters(result, list, nchunks, results_threaded)
# reset
result .= 0.0
for i in eachindex(results_threaded)
results_threaded[i] .= 0.0
end
# compute for each chunk
Threads.@threads for (Lrange, ichunk) in chunks(list, nchunks)
for i in Lrange
i, j, d = list[i]
results_threaded[ichunk][i] += d
results_threaded[ichunk][j] += d
end
end
# reduce
for i in eachindex(results_threaded)
result .+= results_threaded[i]
end
return result
end
function main()
N = 10^4
# generates points with the atomic density of water and a realistic cutoff
# for molecular systems
points, box = CellListMap.xatomic(N)
system = InPlaceNeighborList(x=points, cutoff=box.cutoff)
list = neighborlist!(system)
result = zeros(N)
nchunks = Threads.nthreads()
results_threaded = [ copy(result) for _ in 1:nchunks ]
println("Number of pairs in list: $(length(list))")
@test single_threaded(result, list) ≈ chunk_splitters(result, list, nchunks, results_threaded)
@btime single_threaded($result, $list)
@btime chunk_splitters($result, $list, $nchunks, $results_threaded)
nothing
end
With which I get:
julia> main()
Number of pairs in list: 2689668
7.570 ms (0 allocations: 0 bytes)
2.188 ms (49 allocations: 4.36 KiB)