For completeness, one can still fix it locally by defining
import LinearAlgebra.mul!
function mul!(C::StridedMatrix, X::StridedMatrix, A::SparseMatrixCSC)
mX, nX = size(X)
nX == A.m || throw(DimensionMismatch())
fill!(C, zero(eltype(C)))
rowval = A.rowval
nzval = A.nzval
@inbounds for multivec_row=1:mX, col = 1:A.n, k=A.colptr[col]:(A.colptr[col+1]-1)
C[multivec_row, col] += X[multivec_row, rowval[k]] * nzval[k]
end
C
end
Benchmark:
julia> @btime $C = $A*$B;
19.478 μs (2 allocations: 78.20 KiB)
julia> @btime $C = $B*$A;
22.261 μs (2 allocations: 78.20 KiB)
julia> @btime mul!($C,$A,$B);
16.077 μs (0 allocations: 0 bytes)
julia> @btime mul!($C,$B,$A);
18.241 μs (0 allocations: 0 bytes)