You are right, this seems to be the cause of the problem. I extended my last example:
"""
    machin_series(n::Int; handover=false)

Return an approximation of π as four times the `n`-term partial sum of the
alternating series `1 - 1/3 + 1/5 - 1/7 + …` (despite the function's name, this
is the Gregory–Leibniz formula for π/4).

# Keywords
- `handover`: when `true`, call `yield()` after every term so that other tasks
  get a chance to run between iterations.

# Returns
A `Float64`; `0.0` when `n < 1`, because the loop body never executes.
"""
function machin_series(n::Int; handover=false)
    partial = 0.0
    for k in 1:n
        # Odd-numbered terms are added, even-numbered terms are subtracted.
        numerator = isodd(k) ? 1 : -1
        partial += numerator / (2k - 1)
        if handover
            yield()  # cooperative hand-over to the scheduler after each term
        end
    end
    return 4 * partial
end
"""
    startonthread(id::Int, f::F) where {F<:Function}

Start `f()` as an `@async` task from iteration `id` of a statically scheduled
`Threads.@threads` loop over `1:Threads.nthreads()`, wait for it to finish and
return its result.

The `:static` schedule is requested explicitly so that each iteration is assigned
to one specific thread; with the default (dynamic) schedule of recent Julia
versions, iteration `id` is not tied to any particular thread.
NOTE(review): `:static` needs Julia ≥ 1.5 and must not be used from inside another
`@threads` loop — confirm this matches how the function is called.

# Arguments
- `id`: which iteration (and thereby which thread) launches the task; must lie in
  `1:Threads.nthreads()`.
- `f`: zero-argument function to run.

# Returns
Whatever `f()` returns, obtained via `fetch`.

# Throws
- `ArgumentError` if `id` is outside `1:Threads.nthreads()`. Without this check no
  iteration would start a task and the final `fetch` would block forever.
"""
function startonthread(id::Int, f::F) where {F<:Function}
    nt = Threads.nthreads()
    1 <= id <= nt || throw(ArgumentError("id must be in 1:$nt, got $id"))

    # A `Ref` is filled in by exactly one iteration; unlike reassigning a captured
    # local, this does not force the closure built by `@threads` to box the variable.
    launched = Ref{Task}()
    Threads.@threads :static for i in 1:nt
        if i == id
            launched[] = @async f()
        end
    end
    # `@threads` only returns once every iteration has run, so the task is set here.
    return fetch(launched[])
end
This gives the following results:
julia> @btime startonthread(1, ()->machin_series(10_000))
491.870 μs (43 allocations: 4.20 KiB)
3.1414926535900345
julia> @btime startonthread(2, ()->machin_series(10_000))
511.001 μs (45 allocations: 4.23 KiB)
3.1414926535900345
julia> @btime startonthread(1, ()->machin_series(10_000, handover=true))
147.722 ms (10044 allocations: 160.47 KiB) !!!!!!!!!
3.1414926535900345
julia> @btime startonthread(2, ()->machin_series(10_000, handover=true))
2.477 ms (10045 allocations: 160.48 KiB)
3.1414926535900345
julia> @btime startonthread(3, ()->machin_series(10_000, handover=true))
2.499 ms (10045 allocations: 160.48 KiB)
3.1414926535900345
If I call `yield()` inside my loop, things take much longer on thread 1 (about 148 ms) than on other threads (about 2.5 ms)! Applications involving tasks, `@async`, `yield` and friends run faster on threads other than 1.