using FLoops
using BangBang # for `append!!`
using MicroCollections # for `EmptyVector` and `SingletonVector`
using SparseArrays
"""
    build_matrix(fun::Function, ::Type{T}, m, n) where T

Build an `m`×`n` sparse matrix whose `(i, j)` entry is `fun(i, j)`,
skipping zero values, with rows processed in parallel.
"""
function build_matrix(fun::Function, ::Type{T}, m, n) where T
    # One triplet buffer per chunk.  Buffers are indexed by a loop-owned
    # chunk id `c`, NOT by `Threads.threadid()`: with the dynamic
    # scheduler a task may migrate between threads mid-iteration, so the
    # classic `buf[threadid()]` pattern can silently corrupt data.
    nchunks = Threads.nthreads()
    I = [Int[] for _ in 1:nchunks]
    J = [Int[] for _ in 1:nchunks]
    V = [T[] for _ in 1:nchunks]
    Threads.@threads for c in 1:nchunks
        # Chunk `c` owns rows c, c + nchunks, c + 2nchunks, …
        for i in c:nchunks:m
            for j in 1:n
                v = fun(i, j)
                # Keep the matrix sparse: never store structural zeros.
                iszero(v) && continue
                push!(I[c], i)
                push!(J[c], j)
                push!(V[c], v)
            end
        end
    end
    # Concatenate the per-chunk triplets; `sparse` accepts them in any order.
    # (The debug `@show I` that was here has been removed.)
    return sparse(reduce(vcat, I),
                  reduce(vcat, J),
                  reduce(vcat, V), m, n)
end
# Build an m×n sparse matrix whose (i, j) entry is `fun(i, j)`, skipping
# zeros.  Parallel variant using FLoops' `@floop`/`@reduce`.
function build_matrix_floop(fun::Function, ::Type{T}, m, n) where T
# Fresh per-iteration triplet buffers; the `@reduce` expressions below
# merge them across tasks, so no `threadid()`-indexed shared state is
# needed (safe under task migration, unlike the threadid-buffer pattern).
@floop for i = 1:m
I = Int[]
J = Int[]
V = T[]
for j = 1:n
v = fun(i,j)
# Keep the matrix sparse: never store structural zeros.
iszero(v) && continue
push!(I, i)
push!(J, j)
push!(V, v)
end
# `append!!` (BangBang) appends, widening the container type when it
# must; starting from `EmptyVector()` (MicroCollections) lets the
# reduction infer the element type from the first non-empty append.
@reduce(Is = append!!(EmptyVector(), I))
@reduce(Js = append!!(EmptyVector(), J))
@reduce(Vs = append!!(EmptyVector(), V))
end
# `Is`/`Js`/`Vs` are bound by `@reduce` and visible after the loop.
sparse(Is, Js, Vs, m, n)
end
"""
    build_matrix_channels(fun::Function, ::Type{T}, m, n) where T

Build an `m`×`n` sparse matrix whose `(i, j)` entry is `fun(i, j)`,
skipping zero values.  Each parallel row iteration pushes its triplet
vectors onto buffered `Channel`s, which are drained afterwards.
"""
function build_matrix_channels(fun::Function, ::Type{T}, m, n) where T
    # Capacity `m` guarantees every `put!` succeeds without blocking
    # (at most one put per channel per row), so the loop cannot stall.
    Is = Channel{Vector{Int}}(m)
    Js = Channel{Vector{Int}}(m)
    Vs = Channel{Vector{T}}(m)
    try
        Threads.@threads for i = 1:m
            I = Int[]
            J = Int[]
            V = T[]
            for j = 1:n
                v = fun(i, j)
                # Keep the matrix sparse: never store structural zeros.
                iszero(v) && continue
                push!(I, i)
                push!(J, j)
                push!(V, v)
            end
            put!(Is, I)
            put!(Js, J)
            put!(Vs, V)
        end
    finally
        # Close even if `fun` throws, so the channels are never left open.
        close(Is); close(Js); close(Vs)
    end
    # `init` makes the reduction well-defined when the channels are empty
    # (m == 0); a bare `reduce(vcat, ch)` throws on an empty collection.
    return sparse(reduce(vcat, Is; init = Int[]),
                  reduce(vcat, Js; init = Int[]),
                  reduce(vcat, Vs; init = T[]), m, n)
end
Without threads:
julia> @benchmark build_matrix((i,j)->i*j, Int, 10, 10)
BenchmarkTools.Trial: 10000 samples with 6 evaluations.
Range (min … max):  5.053 μs … 357.837 μs  ┊ GC (min … max): 0.00% … 95.01%
Time  (median):     5.547 μs               ┊ GC (median):    0.00%
Time  (mean ± σ):   6.406 μs ± 13.773 μs   ┊ GC (mean ± σ):  8.91% ± 4.07%
[histogram garbled in extraction]
5.05 μs        Histogram: log(frequency) by time        9.69 μs <
Memory estimate: 13.36 KiB, allocs estimate: 38.
julia> @benchmark build_matrix_floop((i,j)->i*j, Int, 10, 10)
BenchmarkTools.Trial: 10000 samples with 6 evaluations.
Range (min … max):  6.175 μs … 345.855 μs  ┊ GC (min … max):  0.00% … 96.24%
Time  (median):     6.743 μs               ┊ GC (median):     0.00%
Time  (mean ± σ):   7.963 μs ± 18.642 μs   ┊ GC (mean ± σ):  13.00% ± 5.43%
[histogram garbled in extraction]
6.18 μs        Histogram: log(frequency) by time        11.3 μs <
Memory estimate: 24.22 KiB, allocs estimate: 120.
julia> @benchmark build_matrix_channels((i,j)->i*j, Int, 10, 10)
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
Range (min … max):  10.680 μs … 2.335 ms   ┊ GC (min … max): 0.00% … 98.02%
Time  (median):     12.430 μs              ┊ GC (median):    0.00%
Time  (mean ± σ):   14.909 μs ± 65.073 μs  ┊ GC (mean ± σ):  12.14% ± 2.76%
[histogram garbled in extraction]
10.7 μs        Histogram: log(frequency) by time        22.4 μs <
Memory estimate: 36.42 KiB, allocs estimate: 203.
Haven't had time to look at why, and of course this would need an actually useful test.