Question on parallel code

Hi all, I’m trying to parallelize a simulation, but it takes just a little less than the serial version, would somebody be able to suggest me what I’m doing wrong?

function noise_vs_hebb(hebb_range, noise_range, seed=seed)
	count_inside = DataFrame(noise=Float64[], hebb=Float64[], sum_inside=Int64[], sum_outside=Int64[])
	l = ReentrantLock() # create lock variable

	Threads.@threads for h in hebb_range
		for n in noise_range

			hring = HebbRing(noise=n, hebb=h, seed=seed)
			hring()

			sᵢ = sum(hring.S[27:37, :])
			sₒ = sum(hring.S[37:47, :])


			lock(l)
			push!(count_inside, (n, h, sᵢ, sₒ))
			unlock(l)
		end
	end
	count_inside
end

Thanks in advance

A bit more info is probably needed. Especially: how much time does one loop iteration take? Perhaps you just have a lot of threading overhead.

It would also be more efficient to preallocate the vectors that store the results.

Then you also don’t need the lock, which may be a big part of what destroys the gains from threading.

And, of course, it always makes sense to run the profiler to see where the code spends its time.

3 Likes

hring() takes around 50 ms to run

function simulation(a::Array{Int, 1}, b::Array{Int, 1})
	data = zeros(length(a) * length(b), 3)

	Threads.@threads for j in a
		for i in b
			result = functionTaking50ms()

			data[?, :] = [j, i , result]
		end
	end
	data
end

Like this? How do I know the index of data since I can’t use enumerate with @threads?

Threads.@threads for i in 1:length(a)
    j = a[i]
    ...
end

How about

for i_a = 1 : length(a)
  j = a[i_a]
  for i_b = 1 : length(b)
    i = b[i_b]
    idx = (i_a-1) * length(a) + i_b  # hope I did that right in my head
    data[idx, :] = ...
  end
end

Or, easier, store data in a 3D Array and then reshape later if needed, so you just have data[i_a, i_b, :] = ....

With a 50 ms loop, I’m not sure you will get much out of the threads, but that has to be tried (or answered by someone who has a better grasp of threading overhead).

For a 1-dimensional Array you can synthesize your own version of enumerate that does play nicely with Threads.@threads

struct RAEnumerate{T}
    itr::T
end
Base.getindex(r::RAEnumerate,i) = (i,r.itr[i])
Base.length(r::RAEnumerate) = length(r.itr)
Base.firstindex(r::RAEnumerate) = firstindex(r.itr)
Base.lastindex(r::RAEnumerate) = lastindex(r.itr)

A=[6,7,8,9,0]
B=Array{Tuple{Int,Int}}(undef,length(A))
Threads.@threads for (key,val) in RAEnumerate(A)
    B[key]=((Threads.threadid(),val^2))
end

See ThreadsX.jl versions of map or collect.

outerfun(hebb_range, noise_range, seed=seed) = ThreadsX.collect(innerfun(n,h,seed) for (n,h) in Iterators.product(noise_range,hebb_range))

where innerfun returns a tuple (or named tuple) is what I would do. You can convert to a DataFrame later.

1 Like

If you wanted to simulated enumerate for the product set of a and b, you could encapsulate that too.

struct R2AEnumerate{S,T}
    itra::S
    itrb::T
end
Base.eltype(r::R2AEnumerate) = Tuple{Int,eltype(r.itra),eltype(r.itrb)}
Base.length(r::R2AEnumerate) = length(a)*length(b)
Base.firstindex(r::R2AEnumerate) = 1
Base.lastindex(r::R2AEnumerate) = length(a)*length(b)
Base.getindex(r::R2AEnumerate, i) = (i, r.itra[i%length(a)+1], r.itrb[(i-1)÷length(a)+1])

function simulation(a::Array{Int, 1}, b::Array{Int, 1})
    data = zeros(length(a) * length(b), 3)

    Threads.@threads for (idx,j,i) in R2AEnumerate(a,b)
        result = j+i+1000*Threads.threadid() # instead of expensive function
        data[idx, :] = [j, i , result]
    end
    data
end

a=[1,2,3,4]
b=[100,200,500,700,900]
simulation(a,b)

The point is that the limitation of Threads.@threads isn’t that it only works on arrays, it’s that it accesses an object via firstindex, lastindex, getindex, length instead of via iterate. It treats the argument of the for loop as a weak implementer of the AbstractArray interface, rather than assuming anything about the Iteration interface.

1 Like