I think the @sync in the first code fragment
is not actually needed, is it?
I elaborated these ideas in an application. The algorithm based on tasks does not work very well. Some of the tasks tend to take quite a bit longer than others. Perhaps you are onto something with the spawning?
# Task-based parallel assembly of the global conductivity matrix.
# NOTE(review): this quoted fragment is truncated — the `let`, `for`, and
# `@sync begin` blocks are never closed in the snippet.
start = time()
a = SysmatAssemblerSparse(0.0)
# Per-element matrix dimensions: nne rows/columns per element matrix
# (presumably one scalar temperature dof per node — TODO confirm).
elem_mat_nrows = nne
elem_mat_ncols = nne
elem_mat_nmatrices = count(fes)
ndofs_row = Temp.nfreedofs
ndofs_col = Temp.nfreedofs
# Size the assembler's buffers for all element matrices up front.
startassembly!(a, elem_mat_nrows, elem_mat_ncols, elem_mat_nmatrices, ndofs_row, ndofs_col)
ntasks = Base.Threads.nthreads()
iend = 0;
# Block until every spawned assembly task below has finished.
Threads.@sync begin
# `chunks` splits the element range into `ntasks` pieces; judging by the
# uses of ch[1]/ch[2], each `ch` is (element-range, chunk-index).
for ch in chunks(1:count(fes), ntasks)
@info "$(ch[2]): Started $(time() - start)"
# Reserve a disjoint slice of the assembler buffer for this chunk;
# `iend` accumulates the end of the last reserved slice across iterations.
buffer_range, iend = _update_buffer_range(elem_mat_nrows, elem_mat_ncols, ch[1], iend)
# `$`-interpolation snapshots the loop-local values into the spawned
# task, so later loop iterations cannot race on `ch`/`buffer_range`.
Threads.@spawn let r = $ch[1], b = $buffer_range
@info "$(ch[2]): Spawned $(time() - start)"
femm1 = FEMMHeatDiff(IntegDomain(subset(fes, r), GaussRule(3, 3)), material)
# Assembler view writing into this task's private buffer slice.
_a = _task_local_assembler(a, b)
@info "$(ch[2]): Started conductivity $(time() - start)"
conductivity(femm1, _a, geom, Temp)
@info "$(ch[2]): Finished $(time() - start)"
The threaded version actually works quite well.
# Alternative: precompute per-chunk assemblers serially, then run the
# element-level work in a `Threads.@threads` loop.
ntasks = Base.Threads.nthreads()
# Per-chunk task-local assemblers and their element ranges.
# NOTE(review): untyped `[]` makes these Vector{Any}; harmless here since
# they are only pushed to and indexed, never iterated in a hot loop.
_a = []
_r = []
iend = 0;
for ch in chunks(1:count(fes), ntasks)
# Reserve a disjoint buffer slice per chunk; `iend` tracks the end of
# the last reserved slice across iterations.
buffer_range, iend = _update_buffer_range(elem_mat_nrows, elem_mat_ncols, ch[1], iend)
push!(_a, _task_local_assembler(a, buffer_range))
push!(_r, ch[1])
end
@info "Finished $(time() - start)"
# Each iteration assembles its own element subset (_r[th]) into its own
# buffer slice (_a[th]), so no synchronization is needed inside the loop.
Threads.@threads for th in eachindex(_a)
@info "$(th): Started $(time() - start)"
femm1 = FEMMHeatDiff(IntegDomain(subset(fes, _r[th]), GaussRule(3, 3)), material)
conductivity(femm1, _a[th], geom, Temp)
@info "$(th): Finished $(time() - start)"
end
@info "Started make-matrix $(time() - start)"
# Record how much of the shared buffer was actually filled before the
# triplets are converted into the sparse matrix.
a.buffer_pointer = iend
K = makematrix!(a)
@info "All done $(time() - start)"
# @info "Short-circuited exit"
Some scaling data is provided in this thread: Parallel assembly of a finite element sparse matrix - #19 by PetrKryslUCSD