After giving this some thought, and incorporating a suggestion from my other thread, I’ve come up with this:
using BenchmarkTools
using Base.Threads: @spawn
@views function imsum!(x, y, N::Val)
    # Collect the N slices along the third dimension as a tuple of views.
    t = ntuple(N) do i
        x[:, :, i]
    end
    # Fused broadcast: sum all slices element-wise into the preallocated output.
    @. y = (+)(t...)
    return y
end
function imsum(x::Array{T, 3}) where T
    y = Array{T, 2}(undef, size(x)[1:2])
    return imsum!(x, y, Val(size(x, 3)))
end
@views function imsum_spawn(x::Array{T, 3}; chunksize=256) where T
    # Split the second (column) dimension into chunks and sum each chunk on its own task,
    # writing directly into the corresponding columns of the shared output array.
    chunks = Iterators.partition(axes(x, 2), chunksize)
    output = Array{T, 2}(undef, size(x)[1:2])
    ch = Val(size(x, 3))
    @sync for chunk in chunks
        @spawn imsum!(x[:, chunk, :], output[:, chunk], ch)
    end
    return output
end
a = rand(1000, 4000, 3);
sum(imsum(a) .!= imsum_spawn(a)) # should be 0
@benchmark imsum($a) # 23.7 ms
@benchmark imsum_spawn($a) # 6.4 ms
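As a side note for anyone reproducing these timings: the speedup of imsum_spawn over imsum depends on how many threads Julia was started with (this is just standard Julia threading, nothing specific to this code):

Threads.nthreads()  # should be > 1; e.g. start Julia with `julia -t auto` or `julia -t 4`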
The function imsum_spawn is the fastest I’ve been able to get with this code, but I’m still not sure whether I’m doing the parallelization in the optimal way. In my original post, changing the return value to reduce(hcat, chunk_results) solved the problem, but it was also slower because of the additional memory allocations.
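For reference, a minimal sketch of what that reduce(hcat, chunk_results) variant looks like, assuming each task allocates and returns its own column block (the name imsum_hcat and the use of fetch here are my illustration, not the exact code from the original post):

function imsum_hcat(x::Array{T, 3}; chunksize=256) where T
    chunks = Iterators.partition(axes(x, 2), chunksize)
    val = Val(size(x, 3))
    # Each task allocates its own output block and sums its chunk of columns into it.
    tasks = [@spawn imsum!(view(x, :, chunk, :),
                           Array{T, 2}(undef, size(x, 1), length(chunk)),
                           val) for chunk in chunks]
    chunk_results = fetch.(tasks)       # one freshly allocated block per task
    return reduce(hcat, chunk_results)  # stitching them back together allocates the full result again
end

The per-chunk blocks plus the final hcat copy are the extra allocations that made this version slower than writing into a shared preallocated output.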
So my question now is: can I do anything else to speed up imsum_spawn, or parallelize it more efficiently?