Asymmetric speed of in-place `sparse*dense` matrix product

For completeness, one can still fix it locally by defining

import LinearAlgebra.mul!
function mul!(C::StridedMatrix, X::StridedMatrix, A::SparseMatrixCSC)
    mX, nX = size(X)
    nX == A.m || throw(DimensionMismatch())
    fill!(C, zero(eltype(C)))
    rowval = A.rowval
    nzval = A.nzval
    @inbounds for multivec_row=1:mX, col = 1:A.n, k=A.colptr[col]:(A.colptr[col+1]-1)
        C[multivec_row, col] += X[multivec_row, rowval[k]] * nzval[k]
    end
    C
end

Benchmark:

julia> @btime $C = $A*$B;
  19.478 μs (2 allocations: 78.20 KiB)

julia> @btime $C = $B*$A;
  22.261 μs (2 allocations: 78.20 KiB)

julia> @btime mul!($C,$A,$B);
  16.077 μs (0 allocations: 0 bytes)

julia> @btime mul!($C,$B,$A);
  18.241 μs (0 allocations: 0 bytes)