I have a simple problem:
julia> using SparseArrays, LinearAlgebra
julia> const DIM = 1000;
julia> const ORDER = 500;
julia> const matrix_reps = [sprand(DIM, DIM, 0.01) for _ in 1:ORDER];  # fixed: comprehension was closed with `;` instead of `]`
julia> const weights = sprandn(ORDER, 0.1);
I want to compute the weighted sum of `matrix_reps`, i.e. $\sum_i \text{weights}[i] \cdot \text{matrix\_reps}[i]$.
This is what I tried so far:
function w_sumFUSED(x::AbstractVector{T}, mreps::Vector{TT}) where {S<:Number, T<:Number, TT<:AbstractArray{S}}
    # Weighted sum of the matrices in `mreps`, accumulated in place with a
    # fused broadcast.  NOTE(review): `res .+= ...` on a sparse matrix rewrites
    # the stored structure of `res` on every iteration, which is the likely
    # reason this is slow despite allocating little.
    res = spzeros(S, size(first(mreps))...)
    # `findall(!iszero, x)` replaces `findn(x)`, which was removed in Julia 1.0;
    # it yields only the indices of nonzero weights, skipping zero terms.
    for i in findall(!iszero, x)
        res .+= S(x[i]) .* mreps[i]
    end
    return res
end
w_sumFUSED (generic function with 1 method)
function w_sumBLAS(x::AbstractVector{T}, mreps::Vector{TT}) where {S<:Number, T<:Number, TT<:AbstractArray{S}}
    # Weighted sum via in-place `axpy!` (res <- a*X + res).
    # `axpy!` comes from LinearAlgebra; `Base.LinAlg.axpy!` was renamed in Julia 1.0.
    # NOTE(review): for sparse arguments `axpy!` falls back to a generic
    # element-wise update of `res`, so low allocation does not imply speed here
    # — TODO confirm against the installed SparseArrays version.
    res = spzeros(S, size(first(mreps))...)
    # `findall(!iszero, x)` replaces the removed `findn(x)`.
    for i in findall(!iszero, x)
        axpy!(S(x[i]), mreps[i], res)
    end
    return res
end
w_sumBLAS (generic function with 1 method)
function w_sumSUM(x::AbstractVector{T}, mreps::Vector{TT}) where {S<:Number, T<:Number, TT<:AbstractArray{S}}
    # Weighted sum via pairwise `sum`: each term allocates a fresh sparse
    # matrix, but sparse-sparse `+` is an efficient merge of stored entries.
    # `findall(!iszero, x)` replaces `findn(x)`, removed in Julia 1.0.
    nz = findall(!iszero, x)
    # Guard: `sum` over an empty generator throws; with no nonzero weights the
    # weighted sum is the zero matrix.
    isempty(nz) && return spzeros(S, size(first(mreps))...)
    return sum(S(x[i]) * mreps[i] for i in nz)
end
w_sumSUM (generic function with 1 method)
timings:
function time_everything(vect, mreps)
    # Warm up each implementation on a small slice first so that the
    # subsequent @time measurement excludes JIT compilation.
    small_v = vect[1:10]
    small_m = mreps[1:10]
    results = Dict{String,Any}()
    for (label, impl) in (("SUM", w_sumSUM), ("BLAS", w_sumBLAS), ("fused", w_sumFUSED))
        impl(small_v, small_m)
        println(label, ":")
        @time results[label] = impl(vect, mreps)
    end
    # All three implementations must agree on the result.
    @assert results["fused"] == results["BLAS"]
    @assert results["fused"] == results["SUM"]
end
time_everything (generic function with 1 method)
julia> time_everything(weights, matrix_reps)
SUM:
0.145636 seconds (688 allocations: 151.844 MiB, 47.66% gc time)
BLAS:
14.278971 seconds (41 allocations: 10.010 MiB)
fused:
15.503572 seconds (41 allocations: 10.010 MiB)
Question(s):
- Why is BLAS so slow, despite so few allocations? (checked with `@code_warntype`)
- Why is FUSED so slow, despite so few allocations? (checked with `@code_warntype`)
- (general) What is the best method for performing arithmetic on sparse matrices?