Sparse matrix-vector product: much slower than Matlab

Are you sure it's still computing the same thing? I would make the changes one at a time, checking that the result is the same each time. If you make it non-allocating incrementally, it'll be clear which change caused the problem. I can't see what it is right off the bat, though. What are sparseM, sparseR, sparseS, and q? Can you make it an MWE, i.e. something one can copy/paste and run?

Hi @ChrisRackauckas, following your suggestions, I made the changes one at a time and found what causes the difference. Here is the code:

Version 1 (you can copy, paste, and run it):

numGrids = 799;
q = rand(numGrids);
rows1 = zeros(Int64,numGrids*numGrids);
cols1 = zeros(Int64,numGrids*numGrids);
vals1 = zeros(numGrids*numGrids);
rows2 = zeros(Int64,numGrids*numGrids);
cols2 = zeros(Int64,numGrids*numGrids);
vals2 = zeros(numGrids*numGrids);
# worst case is numGrids^3 nonzeros, so these three buffers need several GiB
rows3 = zeros(Int64,numGrids*numGrids*numGrids);
cols3 = zeros(Int64,numGrids*numGrids*numGrids);
vals3 = zeros(numGrids*numGrids*numGrids);

count1 = 0;
count2 = 0;
count3 = 0;

for i = 1:numGrids
    for j = 1:numGrids
        r = rand();
        if (r > 0.965)
            count1 += 1;
            rows1[count1] = i;
            cols1[count1] = j;
            vals1[count1] = r;
        end

        r = rand();
        if (r > 0.965)
            count2 += 1;
            rows2[count2] = i;
            cols2[count2] = j;
            vals2[count2] = r;
        end
    end

    for k = 1:(numGrids*numGrids)
        r = rand();
        if (r > 0.977)
            count3 += 1;
            rows3[count3] = i;
            cols3[count3] = k;
            vals3[count3] = r;
        end
    end
end

rows1 = rows1[1:count1];
cols1 = cols1[1:count1];
vals1 = vals1[1:count1];
rows2 = rows2[1:count2];
cols2 = cols2[1:count2];
vals2 = vals2[1:count2];
rows3 = rows3[1:count3];
cols3 = cols3[1:count3];
vals3 = vals3[1:count3];

sparseM = sparse(rows1,cols1,vals1,numGrids,numGrids);
sparseR = sparse(rows2,cols2,vals2,numGrids,numGrids);
sparseS = sparse(rows3,cols3,vals3,numGrids,numGrids*numGrids);

function testFun(sparseM,sparseR,sparseS,q,numIters,numGrids)
    tmp = Array{Float64}(numGrids, numGrids);
    tmp2 = Array{Float64}(numGrids, numGrids);
    tmp3 = Array{Float64}(numGrids, numGrids);
    for time = 1:numIters
        # q'*sparseS allocates a fresh temporary each iteration
        tmp .= 0.5 .* 0.05 .* (sparseR .+ reshape(q'*sparseS, numGrids, numGrids));
        tmp2 .= sparseM .+ tmp;
        tmp3 .= sparseM .- tmp;
        rhs = tmp2 * q;          # allocates
        forwardQ = tmp3 \ rhs;   # allocates (dense LU factorization)
    end
end

@time testFun(sparseM,sparseR,sparseS,q,10,799)

It shows 1.535764 seconds (113.92 k allocations: 122.168 MiB, 10.65% gc time)

Version 2 (you can also copy, paste, and run it):

numGrids = 799;
q = rand(numGrids);
rows1 = zeros(Int64,numGrids*numGrids);
cols1 = zeros(Int64,numGrids*numGrids);
vals1 = zeros(numGrids*numGrids);
rows2 = zeros(Int64,numGrids*numGrids);
cols2 = zeros(Int64,numGrids*numGrids);
vals2 = zeros(numGrids*numGrids);
rows3 = zeros(Int64,numGrids*numGrids*numGrids);
cols3 = zeros(Int64,numGrids*numGrids*numGrids);
vals3 = zeros(numGrids*numGrids*numGrids);

count1 = 0;
count2 = 0;
count3 = 0;

for i = 1:numGrids
    for j = 1:numGrids
        r = rand();
        if (r > 0.965)
            count1 += 1;
            rows1[count1] = i;
            cols1[count1] = j;
            vals1[count1] = r;
        end

        r = rand();
        if (r > 0.965)
            count2 += 1;
            rows2[count2] = i;
            cols2[count2] = j;
            vals2[count2] = r;
        end
    end

    for k = 1:(numGrids*numGrids)
        r = rand();
        if (r > 0.977)
            count3 += 1;
            rows3[count3] = i;
            cols3[count3] = k;
            vals3[count3] = r;
        end
    end
end

rows1 = rows1[1:count1];
cols1 = cols1[1:count1];
vals1 = vals1[1:count1];
rows2 = rows2[1:count2];
cols2 = cols2[1:count2];
vals2 = vals2[1:count2];
rows3 = rows3[1:count3];
cols3 = cols3[1:count3];
vals3 = vals3[1:count3];

sparseM = sparse(rows1,cols1,vals1,numGrids,numGrids);
sparseR = sparse(rows2,cols2,vals2,numGrids,numGrids);
sparseS = sparse(rows3,cols3,vals3,numGrids,numGrids*numGrids);

function testFun(sparseM,sparseR,sparseS,q,numIters,numGrids)
    tmp = Array{Float64}(numGrids, numGrids);
    tmp2 = Array{Float64}(numGrids, numGrids);
    tmp3 = Array{Float64}(numGrids, numGrids);
    longVector = Array{Float64}(1,numGrids * numGrids);
    for time = 1:numIters
        At_mul_B!(longVector, q, sparseS);  # in-place replacement for q'*sparseS
        tmp .= 0.5 .* 0.05 .* (sparseR .+ reshape(longVector, numGrids, numGrids));
        tmp2 .= sparseM .+ tmp;
        tmp3 .= sparseM .- tmp;
        rhs = tmp2 * q;
        forwardQ = tmp3 \ rhs;
    end
end

@time testFun(sparseM,sparseR,sparseS,q,10,799)

It shows 18.268159 seconds (55.34 k allocations: 31.397 MiB)

Version 2 is much slower than version 1 even though it has fewer memory allocations. The only difference between the two versions is that version 2 uses At_mul_B!(longVector, q, sparseS); do you know why this makes it so slow?

I ran

function testFun2(sparseM,sparseR,sparseS,q,numIters,numGrids)
    tmp = Array{Float64}(numGrids, numGrids);
    tmp2 = Array{Float64}(numGrids, numGrids);
    tmp3 = Array{Float64}(numGrids, numGrids);
    longVector = Array{Float64}(1,numGrids * numGrids);
    for time = 1:numIters
        At_mul_B!(longVector, q, sparseS);
        @show @which q'*sparseS
        tmp .= 0.5 .* 0.05 .* (sparseR .+ reshape(q'*sparseS, numGrids, numGrids));
        tmp2 .= sparseM .+ tmp;
        tmp3 .= sparseM .- tmp;
        rhs = tmp2 * q;
        forwardQ = tmp3 \ rhs;
    end
end

@time testFun2(sparseM,sparseR,sparseS,q,10,799)

which tells me two things. First, I get the same slowdown, so At_mul_B! is seemingly not what I want (it must just be slow for sparse matrices? That makes sense given how sparse matrices are stored). Then it shows

@which(q' * sparseS) = *(rowvec::RowVector, mat::AbstractArray{T,2} where T) in Base.LinAlg at linalg\rowvector.jl:181

which points me to

@inline *(rowvec::RowVector, mat::AbstractMatrix) = transpose(mat.' * transpose(rowvec))

and checking

@which rand(3,3).' * transpose(rand(3)')

points me to

function (*)(A::StridedMatrix{T}, x::StridedVector{S}) where {T<:BlasFloat,S}
    TS = promote_op(matprod, T, S)
    A_mul_B!(similar(x, TS, size(A,1)), A, convert(AbstractVector{TS}, x))
end

so that tells us how the fast version lowers (and it shows you where the temporary is made). But I think I missed something. This brings me to two separate versions:

function testFun2(sparseM,sparseR,sparseS_transp,q,numIters,numGrids)
    forwardQ = Array{Float64}(numGrids,1);
    tmp = Array{Float64}(numGrids, numGrids);
    tmp2 = Array{Float64}(numGrids, numGrids);
    tmp3 = Array{Float64}(numGrids, numGrids);
    longVector = Array{Float64}(numGrids * numGrids);
    for time = 1:numIters
        A_mul_B!(longVector, sparseS_transp, q);
        tmp .= 0.5 .* 0.05 .* (sparseR .+ reshape(longVector, numGrids, numGrids));
        tmp2 .= sparseM .+ tmp;
        tmp3 .= sparseM .- tmp;
        A_mul_B!(forwardQ,tmp2,q);
        A_ldiv_B!(lufact!(tmp3),forwardQ);
    end
end

@time testFun2(sparseM,sparseR,sparseS.',q,10,799)  # sparseS.' materializes the transpose once, up front

function testFun(sparseM,sparseR,sparseS,q,numIters,numGrids)
    forwardQ = Array{Float64}(numGrids,1);
    tmp = Array{Float64}(numGrids, numGrids);
    tmp2 = Array{Float64}(numGrids, numGrids);
    tmp3 = Array{Float64}(numGrids, numGrids);
    for time = 1:numIters
        tmp .= 0.5 .* 0.05 .* (sparseR .+ reshape(q'*sparseS, numGrids, numGrids));
        tmp2 .= sparseM .+ tmp;
        tmp3 .= sparseM .- tmp;
        A_mul_B!(forwardQ,tmp2,q);
        A_ldiv_B!(lufact!(tmp3),forwardQ);
    end
end

@time testFun(sparseM,sparseR,sparseS,q,10,799)

I might have misinterpreted a bit of what Julia is doing internally, since testFun2 is about 2x slower than testFun, so something is going a little wrong there. But those are the tools you'd use to find out, so I'll leave it at that. What this points to is that you might want to just store the transpose of that sparse matrix for this algorithm.
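For what it's worth, here is a minimal sketch of where that leads (testFun3 is a hypothetical name): compute S'q into a plain Vector instead of q'S into a 1×n² row. The numbers are identical up to shape, but with the sparse matrix as the transposed operand the call should dispatch to the specialized CSC At_mul_B! method rather than the generic fallback that version 2's vector-first call appears to hit:

function testFun3(sparseM,sparseR,sparseS,q,numIters,numGrids)
    forwardQ = Array{Float64}(numGrids,1);
    tmp = Array{Float64}(numGrids, numGrids);
    tmp2 = Array{Float64}(numGrids, numGrids);
    tmp3 = Array{Float64}(numGrids, numGrids);
    longVector = Array{Float64}(numGrids * numGrids);  # a Vector, not a 1×n² Matrix
    for time = 1:numIters
        At_mul_B!(longVector, sparseS, q);  # longVector = sparseS' * q, in place
        tmp .= 0.5 .* 0.05 .* (sparseR .+ reshape(longVector, numGrids, numGrids));
        tmp2 .= sparseM .+ tmp;
        tmp3 .= sparseM .- tmp;
        A_mul_B!(forwardQ,tmp2,q);
        A_ldiv_B!(lufact!(tmp3),forwardQ);
    end
end

@time testFun3(sparseM,sparseR,sparseS,q,10,799)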

It is possible to use MKL for sparse operations via https://github.com/JuliaSparse/MKLSparse.jl.
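If I understand that package correctly, it works by overloading the sparse-dense multiplication methods for SparseMatrixCSC, so beyond loading it no code changes should be needed. A sketch, assuming MKL is installed and that the A_mul_B!/At_mul_B! methods used above are among those overloaded:

using MKLSparse                    # loading it swaps in MKL's sparse BLAS
At_mul_B!(longVector, sparseS, q)  # same call as before, now presumably MKL-backed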

Have you considered transposing S (before, not in the loop)?

n = 799; S = sprand(n*n,n,1.0-0.977); q=rand(n); qo=zeros(n*n);
@time A_mul_B!(qo,S,q);
  0.156302 seconds (4 allocations: 160 bytes)

n = 799; St = sprand(n,n*n,1.0-0.977); q=rand(n); qo=zeros(n*n);
@time At_mul_B!(qo,St,q);
  0.032524 seconds (4 allocations: 160 bytes)

Reason: q fits into L1, qo does not. I don't know a lot about how Julia handles sparse matrices, but we clearly want sequential writes and random reads here; see the sketch below.
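To make that concrete, here is a simplified sketch of what the two CSC kernels essentially do, reusing S, St, q, and qo from the snippets above (colptr, rowval, and nzval are the actual SparseMatrixCSC fields; the real Base methods are more general than this):

# A_mul_B!(qo, S, q), S of size n²×n: column-major traversal of S gives
# sequential reads of q but scattered writes into the huge qo.
fill!(qo, 0.0)
for j in 1:size(S, 2)
    for k in S.colptr[j]:(S.colptr[j+1]-1)
        qo[S.rowval[k]] += S.nzval[k] * q[j]
    end
end

# At_mul_B!(qo, St, q), St of size n×n²: column-major traversal of St gives
# random reads from the small q (which stays in L1) and sequential writes to qo.
for j in 1:size(St, 2)
    acc = 0.0
    for k in St.colptr[j]:(St.colptr[j+1]-1)
        acc += St.nzval[k] * q[St.rowval[k]]
    end
    qo[j] = acc
end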

Edit: I overlooked it in the thread; you did consider it. Alternatively, you don't really need to store S as a matrix. Just sort the entries first by output index, then by input index, and implement the matrix-vector product by hand (a trivial loop; a sketch follows below).
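A minimal sketch of that hand-written loop (coo_mul! is a hypothetical name; rows, cols, and vals are S's COO triplets, assumed pre-sorted by rows, i.e. by output index):

function coo_mul!(qo, rows, cols, vals, q)
    fill!(qo, 0.0)
    @inbounds for idx in 1:length(vals)
        # triplets sorted by output index: sequential writes to qo,
        # random reads from the small q
        qo[rows[idx]] += vals[idx] * q[cols[idx]]
    end
    return qo
end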

Edit2: At_mul_B! is faster than the trivial loop. So use that, I guess?
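For reference, a harness to reproduce that comparison, reusing coo_mul! from the sketch above (findnz returns St's triplets in column-major order, which for S = St.' is exactly sorted-by-output-index):

n = 799; St = sprand(n, n*n, 1.0-0.977); q = rand(n); qo = zeros(n*n);
stI, stJ, stV = findnz(St);            # columns of St are rows (outputs) of S
@time At_mul_B!(qo, St, q);
@time coo_mul!(qo, stJ, stI, stV, q);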