You could do that by storing an internal CooBuffer::Vector{Tuple{Ti,Ti,Tv}}, or alternatively an internal CooDict::Dict{Tuple{Ti,Ti}, Tv} instead of the linked list.
So, to put some more numbers to it:
using BenchmarkTools, SparseArrays
begin
mutable struct CooBuffer
const I::Vector{Int}
const J::Vector{Int}
const V::Vector{Float64}
end
CooBuffer()=CooBuffer(Int[], Int[], Float64[]);
function Base.setindex!(b::CooBuffer, v, i, j)
push!(b.I, i); push!(b.J, j); push!(b.V, v);
end
function Base.sizehint!(c::CooBuffer, n)
sizehint!(c.I, n); sizehint!(c.J, n); sizehint!(c.V, n);
end
mutable struct CooBuffer2
cur_size::Int
const max_size::Int
const I::Vector{Int}
const J::Vector{Int}
const V::Vector{Float64}
end
CooBuffer2(n) = CooBuffer2(0, n, zeros(Int, n), zeros(Int, n), zeros(Float64, n))
function Base.setindex!(b::CooBuffer2, v, i, j)
b.cur_size += 1
idx = b.cur_size
b.I[idx] = i
b.J[idx] = j
b.V[idx] = v
nothing
end
function serial_assembly!(M, n)
for i=1:n
if i==1
M[i,i ] = 1.0
M[i,i+1] = -1.0
elseif i==n
M[i,i-1] = -1.0
M[i,i ] = 1.0
else
M[i,i-1] = -1.0
M[i,i ] = 2.0
M[i,i+1] = -1.0
end
end
end
N=10_000_000
println("no sizehint")
@btime begin
tmp = CooBuffer()
serial_assembly!(tmp, N)
end
println("sizehint")
@btime begin
tmp = CooBuffer()
sizehint!(tmp, 3*N)
serial_assembly!(tmp, N)
end
println("no push")
@btime begin
tmp = CooBuffer2(3*N)
serial_assembly!(tmp, N)
end
println("deepcopy")
tmp = CooBuffer2(3*N)
serial_assembly!(tmp, N)
@btime deepcopy(tmp);
sparsifyBuff(b, m, n) = sparse(b.I, b.J, b.V, m, n, nothing)
tmp = CooBuffer()
serial_assembly!(tmp, N)
println("coo->csc")
@btime sparsifyBuff(tmp, N, N);
nothing
end
With that, I get output
no sizehint
325.538 ms (136 allocations: 2.65 GiB)
sizehint
142.526 ms (11 allocations: 686.65 MiB)
no push
55.298 ms (11 allocations: 686.65 MiB)
deepcopy
50.393 ms (17 allocations: 686.65 MiB)
coo->csc
145.477 ms (22 allocations: 1.12 GiB)
From that, we can conclude: try to sizehint your buffers. Maybe don’t use push!, that apparently has issues with loop unrolling (the extra cost is ~1 ns per pushed element). With that, I roughly reach “allocate + memcpy” speed for construction.
Unless you find a good way to multithread coo->csc conversion, there will be very limited gains from multithreading the assembly of the coo matrix. If you decide to multithread that, don’t use nthreads, you’re memory bandwidth bound, not compute bound. Look up how many cores are needed to saturate your memory.
Also don’t forget to check
cat /sys/kernel/mm/transparent_hugepage/enabled
[always] madvise never
If I set that to madvise (which some linux distros have as default), my timings degrade to
no sizehint
651.638 ms (136 allocations: 2.65 GiB)
sizehint
200.707 ms (11 allocations: 686.65 MiB)
no push
129.375 ms (11 allocations: 686.65 MiB)
deepcopy
123.075 ms (17 allocations: 686.65 MiB)
coo->csc
231.989 ms (22 allocations: 1.12 GiB)