Hi all, I’m trying to parallelize a simulation, but it takes just a little less than the serial version, would somebody be able to suggest me what I’m doing wrong?
function noise_vs_hebb(hebb_range, noise_range, seed=seed)
count_inside = DataFrame(noise=Float64[], hebb=Float64[], sum_inside=Int64[], sum_outside=Int64[])
l = ReentrantLock() # create lock variable
Threads.@threads for h in hebb_range
for n in noise_range
hring = HebbRing(noise=n, hebb=h, seed=seed)
hring()
sᵢ = sum(hring.S[27:37, :])
sₒ = sum(hring.S[37:47, :])
lock(l)
push!(count_inside, (n, h, sᵢ, sₒ))
unlock(l)
end
end
count_inside
end
Thanks in advance
A bit more info is probably needed. Especially: how much time does one loop iteration take? Perhaps you just have a lot of threading overhead.
It would also be more efficient to preallocate the vectors that store the results.
Then you also don’t need the lock, which may be a big part of what destroys the gains from threading.
And, of course, it always makes sense to run the profiler to see where the code spends its time.
3 Likes
hring() takes around 50 ms to run
function simulation(a::Array{Int, 1}, b::Array{Int, 1})
data = zeros(length(a) * length(b), 3)
Threads.@threads for j in a
for i in b
result = functionTaking50ms()
data[?, :] = [j, i , result]
end
end
data
end
Like this? How do I know the index of data since I can’t use enumerate with @threads?
Threads.@threads for i in 1:length(a)
j = a[i]
...
end
How about
for i_a = 1 : length(a)
j = a[i_a]
for i_b = 1 : length(b)
i = b[i_b]
idx = (i_a-1) * length(a) + i_b # hope I did that right in my head
data[idx, :] = ...
end
end
Or, easier, store data in a 3D Array and then reshape later if needed, so you just have data[i_a, i_b, :] = ....
With a 50 ms loop, I’m not sure you will get much out of the threads, but that has to be tried (or answered by someone who has a better grasp of threading overhead).
For a 1-dimensional Array you can synthesize your own version of enumerate that does play nicely with Threads.@threads
struct RAEnumerate{T}
itr::T
end
Base.getindex(r::RAEnumerate,i) = (i,r.itr[i])
Base.length(r::RAEnumerate) = length(r.itr)
Base.firstindex(r::RAEnumerate) = firstindex(r.itr)
Base.lastindex(r::RAEnumerate) = lastindex(r.itr)
A=[6,7,8,9,0]
B=Array{Tuple{Int,Int}}(undef,length(A))
Threads.@threads for (key,val) in RAEnumerate(A)
B[key]=((Threads.threadid(),val^2))
end
See ThreadsX.jl versions of map or collect.
outerfun(hebb_range, noise_range, seed=seed) = ThreadsX.collect(innerfun(n,h,seed) for (n,h) in Iterators.product(noise_range,hebb_range))
where innerfun returns a tuple (or named tuple) is what I would do. You can convert to a DataFrame later.
1 Like
If you wanted to simulated enumerate for the product set of a and b, you could encapsulate that too.
struct R2AEnumerate{S,T}
itra::S
itrb::T
end
Base.eltype(r::R2AEnumerate) = Tuple{Int,eltype(r.itra),eltype(r.itrb)}
Base.length(r::R2AEnumerate) = length(a)*length(b)
Base.firstindex(r::R2AEnumerate) = 1
Base.lastindex(r::R2AEnumerate) = length(a)*length(b)
Base.getindex(r::R2AEnumerate, i) = (i, r.itra[i%length(a)+1], r.itrb[(i-1)÷length(a)+1])
function simulation(a::Array{Int, 1}, b::Array{Int, 1})
data = zeros(length(a) * length(b), 3)
Threads.@threads for (idx,j,i) in R2AEnumerate(a,b)
result = j+i+1000*Threads.threadid() # instead of expensive function
data[idx, :] = [j, i , result]
end
data
end
a=[1,2,3,4]
b=[100,200,500,700,900]
simulation(a,b)
The point is that the limitation of Threads.@threads isn’t that it only works on arrays, it’s that it accesses an object via firstindex, lastindex, getindex, length instead of via iterate. It treats the argument of the for loop as a weak implementer of the AbstractArray interface, rather than assuming anything about the Iteration interface.
1 Like