Hi all, I’m trying to parallelize a simulation, but the threaded version takes only slightly less time than the serial one. Could somebody suggest what I’m doing wrong?
"""
    noise_vs_hebb(hebb_range, noise_range, seed=seed)

Run one `HebbRing` simulation per `(hebb, noise)` pair in parallel and
return a `DataFrame` with columns `noise`, `hebb`, `sum_inside`,
`sum_outside` (one row per pair).

Each `(h, n)` pair owns a unique row index, so threads write to disjoint
preallocated slots and no lock is needed — the lock in the original
version serialized every push and ate the threading gains.
"""
function noise_vs_hebb(hebb_range, noise_range, seed=seed)
    # Materialize the ranges so we can map (h, n) -> a unique row index.
    hebbs = collect(hebb_range)
    noises = collect(noise_range)
    nn = length(noises)
    total = length(hebbs) * nn
    # Preallocate one column vector per output column.
    noise_col = Vector{Float64}(undef, total)
    hebb_col = Vector{Float64}(undef, total)
    inside_col = Vector{Int64}(undef, total)
    outside_col = Vector{Int64}(undef, total)
    Threads.@threads for hi in eachindex(hebbs)
        h = hebbs[hi]
        for (ni, n) in enumerate(noises)
            hring = HebbRing(noise=n, hebb=h, seed=seed)
            hring()  # run the (≈50 ms) simulation
            # NOTE(review): 27:37 and 37:47 overlap at row 37, so that row
            # is counted in both sums — confirm this is intentional.
            sᵢ = sum(hring.S[27:37, :])
            sₒ = sum(hring.S[37:47, :])
            # Unique per (hi, ni): no two iterations touch the same index.
            idx = (hi - 1) * nn + ni
            noise_col[idx] = n
            hebb_col[idx] = h
            inside_col[idx] = sᵢ
            outside_col[idx] = sₒ
        end
    end
    return DataFrame(noise=noise_col, hebb=hebb_col,
                     sum_inside=inside_col, sum_outside=outside_col)
end
Thanks in advance
A bit more info is probably needed. Especially: how much time does one loop iteration take? Perhaps you just have a lot of threading overhead.
It would also be more efficient to preallocate the vectors that store the results.
Then you also don’t need the lock, which may be a big part of what destroys the gains from threading.
And, of course, it always makes sense to run the profiler to see where the code spends its time.
3 Likes
hring()
takes around 50 ms to run
"""
    simulation(a, b)

Run `functionTaking50ms` for every `(j, i)` pair from `a × b` in parallel
and return a `length(a)*length(b) × 3` matrix whose rows are `[j, i, result]`.

Each pair maps to a unique row index, so threads never write to the same
row and no lock is required.
"""
function simulation(a::Array{Int, 1}, b::Array{Int, 1})
    data = zeros(length(a) * length(b), 3)
    # Thread over indices (not elements) so each iteration knows its slot.
    Threads.@threads for ja in eachindex(a)
        j = a[ja]
        for (jb, i) in enumerate(b)
            result = functionTaking50ms()
            # Linear row index: outer index scaled by the inner length.
            idx = (ja - 1) * length(b) + jb
            data[idx, :] = [j, i, result]
        end
    end
    data
end
Like this? But how do I compute the row index into data, since I can’t use enumerate with @threads?
# Thread over the index range instead of the elements: `i` then serves
# as both the element lookup and the output slot. `...` is the loop body.
Threads.@threads for i in 1:length(a)
j = a[i]
...
end
How about
for i_a = 1 : length(a)
j = a[i_a]
for i_b = 1 : length(b)
i = b[i_b]
# Linearize (i_a, i_b): the outer index must be scaled by the INNER
# loop's length, length(b) — using length(a) here collides/skips rows
# whenever length(a) != length(b).
idx = (i_a-1) * length(b) + i_b
data[idx, :] = ...
end
end
Or, easier, store data
in a 3D Array and then reshape later if needed, so you just have data[i_a, i_b, :] = ...
.
With a 50 ms loop, I’m not sure you will get much out of the threads, but that has to be tried (or answered by someone who has a better grasp of threading overhead).
For a 1-dimensional Array
you can synthesize your own version of enumerate
that does play nicely with Threads.@threads
# Index-based stand-in for `enumerate`: `Threads.@threads` accesses its
# iterable via getindex/length/firstindex/lastindex rather than `iterate`,
# so this wrapper plays nicely with it. Indexing at `idx` yields the pair
# `(idx, element)`, exactly like `enumerate` would.
struct RAEnumerate{T}
itr::T
end
Base.length(e::RAEnumerate) = length(e.itr)
Base.firstindex(e::RAEnumerate) = firstindex(e.itr)
Base.lastindex(e::RAEnumerate) = lastindex(e.itr)
Base.getindex(e::RAEnumerate, idx) = (idx, e.itr[idx])
# Demo: square each element of A in parallel, recording which thread
# handled it. Each iteration writes only its own slot B[idx], so no lock
# is needed. (The recorded thread id depends on scheduling.)
A = [6, 7, 8, 9, 0]
B = Array{Tuple{Int,Int}}(undef, length(A))
Threads.@threads for (idx, x) in RAEnumerate(A)
B[idx] = (Threads.threadid(), x^2)
end
See ThreadsX.jl versions of map
or collect
.
outerfun(hebb_range, noise_range, seed=seed) = ThreadsX.collect(innerfun(n,h,seed) for (n,h) in Iterators.product(noise_range,hebb_range))
where innerfun
returns a tuple (or named tuple) is what I would do. You can convert to a DataFrame later.
1 Like
If you wanted to simulate enumerate
for the product set of a
and b
, you could encapsulate that too.
# Indexable "enumerate" over the product set of two collections, usable
# with `Threads.@threads` (which needs getindex/length, not `iterate`).
# Indexing at i yields `(i, itra element, itrb element)`, cycling through
# `itra` fastest.
#
# Fixes over the original sketch: length/lastindex/getindex previously
# closed over the GLOBALS `a` and `b` instead of the wrapped fields, and
# `i % len + 1` was off by one (i == len mapped to element 1, mismatching
# the pairs) — the correct 1-based cycle is `(i - 1) % len + 1`.
struct R2AEnumerate{S,T}
itra::S
itrb::T
end
Base.eltype(r::R2AEnumerate) = Tuple{Int,eltype(r.itra),eltype(r.itrb)}
Base.length(r::R2AEnumerate) = length(r.itra) * length(r.itrb)
Base.firstindex(r::R2AEnumerate) = 1
Base.lastindex(r::R2AEnumerate) = length(r)
Base.getindex(r::R2AEnumerate, i) = (i, r.itra[(i - 1) % length(r.itra) + 1], r.itrb[(i - 1) ÷ length(r.itra) + 1])
# Fill a length(a)*length(b) × 3 matrix in parallel: R2AEnumerate hands
# each iteration a unique row index plus the (j, i) pair, so every thread
# writes a disjoint row and no lock is needed. The third column mixes in
# the thread id as a cheap stand-in for the expensive real computation.
function simulation(a::Array{Int, 1}, b::Array{Int, 1})
    out = zeros(length(a) * length(b), 3)
    Threads.@threads for (row, j, i) in R2AEnumerate(a, b)
        val = j + i + 1000 * Threads.threadid() # instead of expensive function
        out[row, :] = [j, i, val]
    end
    out
end
# Demo: a 4 × 5 parameter grid — `simulation` returns a 20 × 3 matrix.
a=[1,2,3,4]
b=[100,200,500,700,900]
simulation(a,b)
The point is that the limitation of Threads.@threads
isn’t that it only works on arrays, it’s that it accesses an object via firstindex, lastindex, getindex, length
instead of via iterate
. It treats the argument of the for
loop as a weak implementer of the AbstractArray
interface, rather than assuming anything about the Iteration
interface.
1 Like