`mul` dispatch to BLAS incomplete?

When translating the benchmark from Outperformed by Matlab

using LinearAlgebra
using MKL
using BenchmarkTools

function _kron!(C::T1,A::T2,B::T3) where {T1 <: AbstractVector,T2 <: AbstractVector,T3 <: AbstractVector}
    @views for i = 1:size(A,1)
        mul!(C[(i-1)*size(B,1)+1:i*size(B,1)],A[i],B)
    end
    return C
end

function _kron(A::T1,B::T2) where {T1 <: AbstractVector,T2 <: AbstractVector}
    C = zeros(size(A,1) * size(B,1))
    _kron!(C,A,B)
    return C
end

function normal1!(C,Cx,x,A,B)
    temp = Vector{Float64}(undef,size(A,2)*size(B,2))
    @views for n = 1:size(A,1)
        _kron!(temp,A[n,:],B[n,:])
        BLAS.ger!(1.,temp,temp,C)
        BLAS.axpy!(x[n],temp,Cx)
    end
    return Cx
end

function kr!(C,A,B)
    @views for n = 1:size(A,1)
        _kron!(C[:,n],A[n,:],B[n,:])
    end
    return C
end

function kr(A,B)
    C = Matrix{Float64}(undef,size(A,2)*size(B,2),size(A,1))
    kr!(C,A,B)
    return C
end

function normal2!(C,Cx,x,A,B,temp,n1,n2)
    @views kr!(temp,A[n1:n2,:],B[n1:n2,:])
    BLAS.gemm!('N','T',1.,temp,temp,1.,C)
    @views BLAS.gemv!('N',1.,temp,x[n1:n2],1.,Cx)
end
function normal2!(C,Cx,x,A,B,blocksize)
    n1 = 1
    if blocksize < size(A,1)
        temp = Matrix{Float64}(undef,size(A,2)*size(B,2),blocksize)
        @views while n1 <= size(A,1)-blocksize
            n2 = n1+blocksize-1
            normal2!(C,Cx,x,A,B,temp,n1,n2)
            n1 += blocksize
        end
    end
    n2 = size(A,1)
    temp = Matrix{Float64}(undef,size(A,2)*size(B,2),n2-n1+1)
    normal2!(C,Cx,x,A,B,temp,n1,n2)
    return Cx
end

N = 10000
R = 20
M = 40

A = rand(N,M)
B = rand(N,R)
x = rand(N,)

C = zeros(M*R,M*R)
Cx = zeros(M*R,)

normal1!(C,Cx,x,A,B)
CNormal = copy(C)
CxNormal = copy(Cx)

C = zeros(M*R,M*R)
Cx = zeros(M*R,)
normal2!(C,Cx,x,A,B,100)
@assert C ≈ CNormal
@assert Cx ≈ CxNormal

@btime normal1!($C,$Cx,$x,$A,$B)

# for blocksize in [10, 100, 1000, 10000, 100000]
for blocksize in [10, 100, 1000]
    @btime normal2!($C,$Cx,$x,$A,$B,$blocksize)
end

to use mul instead

using LinearAlgebra
using BenchmarkTools

function _kron!(C::T1,A::T2,B::T3) where {T1 <: AbstractVector,T2 <: AbstractVector,T3 <: AbstractVector}
    @views for i = 1:size(A,1)
        mul!(C[(i-1)*size(B,1)+1:i*size(B,1)],A[i],B)
    end
    return C
end

function _kron(A::T1,B::T2) where {T1 <: AbstractVector,T2 <: AbstractVector}
    C = zeros(size(A,1) * size(B,1))
    _kron!(C,A,B)
    return C
end

function normal1!(C,Cx,x,A,B)
    temp = Vector{Float64}(undef,size(A,2)*size(B,2))
    @views for n = 1:size(A,1)
        _kron!(temp,A[n,:],B[n,:])
        # mul!(C,temp,temp',1.,1.)
        BLAS.ger!(1.,temp,temp,C)
        mul!(Cx,temp,x[n],1.,1.)
    end
    return Cx
end

function kr!(C,A,B)
    @views for n = 1:size(A,1)
        _kron!(C[:,n],A[n,:],B[n,:])
    end
    return C
end

function kr(A,B)
    C = Matrix{Float64}(undef,size(A,2)*size(B,2),size(A,1))
    kr!(C,A,B)
    return C
end

function normal2!(C,Cx,x,A,B,temp,n1,n2)
    @views kr!(temp,A[n1:n2,:],B[n1:n2,:])
    mul!(C,temp,temp',1.,1.)
    @views mul!(Cx,temp,x[n1:n2],1.,1.)
end
function normal2!(C,Cx,x,A,B,blocksize)
    n1 = 1
    if blocksize < size(A,1)
        temp = Matrix{Float64}(undef,size(A,2)*size(B,2),blocksize)
        @views while n1 <= size(A,1)-blocksize
            n2 = n1+blocksize-1
            normal2!(C,Cx,x,A,B,temp,n1,n2)
            n1 += blocksize
        end
    end
    n2 = size(A,1)
    temp = Matrix{Float64}(undef,size(A,2)*size(B,2),n2-n1+1)
    normal2!(C,Cx,x,A,B,temp,n1,n2)
    return Cx
end

N = 10000
R = 20
M = 40

A = rand(N,M)
B = rand(N,R)
x = rand(N,)

C = zeros(M*R,M*R)
Cx = zeros(M*R,)

normal1!(C,Cx,x,A,B)
CNormal = copy(C)
CxNormal = copy(Cx)

C = zeros(M*R,M*R)
Cx = zeros(M*R,)
normal2!(C,Cx,x,A,B,100)
@assert C ≈ CNormal
@assert Cx ≈ CxNormal

@btime normal1!($C,$Cx,$x,$A,$B)

# for blocksize in [10, 100, 1000, 10000, 100000]
for blocksize in [10, 100, 1000]
    @btime normal2!($C,$Cx,$x,$A,$B,$blocksize)
end

I noticed the following oddities.

  1. mul!(C,temp,temp',1.,1.) seems to compute the same as BLAS.ger!(1.,temp,temp,C) but is slooow.
  2. mul!(Cx,temp,x[n],1.,1.) seems to work (which I like very much), although this is not documented?
  3. Runtime performance using BLAS
  371.686 ms (1 allocation: 6.38 KiB)
  100.720 ms (4 allocations: 125.16 KiB)
  62.885 ms (4 allocations: 1.22 MiB)
  69.160 ms (4 allocations: 12.21 MiB)

looks significantly different to the Julia mul version

  336.500 ms (1 allocation: 6.38 KiB)
  922.396 ms (4 allocations: 125.16 KiB)
  128.485 ms (4 allocations: 1.22 MiB)
  56.164 ms (4 allocations: 12.21 MiB)

which really surprises me. This is on Julia 1.6.4, 1.7.0-rc3 behaves similar.

1 Like

Regarding 3:

Not sure, but could it be julia/matmul.jl at master · JuliaLang/julia · GitHub, which redirects to syrk instead of gemm? There is even call BLAS syrk for A'*A · Issue #659 · JuliaLang/julia · GitHub. Interesting, seems numerically sensible.

Regarding 1:

It seems to me that support for ger and syr is missing in julia/matmul.jl at master · JuliaLang/julia · GitHub.

This would be a really good issue or PR.

2 Likes

Thanks for your feedback, filed an issue.

Is there some documentation which mappings of high level Julia functions to BLAS are intended to work?

I don’t think so. Generally we just promise that whatever we decide to do will be reasonably accurate and reasonably fast.

1 Like

Which it sometimes isn’t, so that we have discussions. I’m at a loss, what to do.

Edit: it seems for high performance applications it then would be appropriate to depend on the low level BLAS wrappers.

Which brings problems of it’s own, because

using BenchmarkTools, Random, LinearAlgebra
elt = Float64
n = 32
Random.seed!(1234)
# A = rand(elt, n, n)
A = rand(elt, n, n)
b = rand(elt, n)
println(BLAS.get_config())
t1 = BLAS.syr!('U', elt(1), b, copy(A))
t2 = BLAS.syr!('U', elt(1), b, Symmetric(copy(A)))

fails with

LBTConfig([ILP64] libopenblas64_.dll)
ERROR: LoadError: MethodError: no method matching strides(::Symmetric{Float64, Matrix{Float64}})

Shouldn’t at least something like this work then?

BLAS can’t handle Julia types like Symmetric and you’re calling rather directly into BLAS. So I wouldn’t expect this to work, no. But what do I know :slight_smile:

2 Likes

I’m asking the question because BLAS syr specifically expects a symmetric matrix memory layout AFAIU, and I’d have expected Symmetric to exactly provide this?

OK, so what about this then?

using BenchmarkTools, Random, LinearAlgebra

elt = Float64
n = 32
Random.seed!(1234)
# A = rand(elt, n, n)
A = rand(elt, n, n)
b = Symmetric(rand(elt, n, n))
println(BLAS.get_config())
t1 = BLAS.syrk!('U', 'N', elt(1), b, elt(1), copy(A))
t2 = BLAS.syrk!('U', 'N', elt(1), b, elt(1), Symmetric(copy(A)))

failing again with

LBTConfig([ILP64] libopenblas64_.dll)
ERROR: LoadError: MethodError: no method matching strides(::Symmetric{Float64, Matrix{Float64}})

And you really expect us to answer MATLAB user questions regarding performance?

Filed another issue here.

Symmetric is only a wrapper. If you want to use BLAS.syrk! on your matrix A, you should use A.data.
Example :

elt = Float64
n = 32
Random.seed!(1234)
A = rand(elt, n, n)
B = zeros(elt, n, n)
A = triu(A)                      # A = tril(A)
S = Symmetric(A, :U)    # S = Symmetric(A, :L)
 
println(BLAS.get_config())
t1 = BLAS.syrk!(S.uplo, 'N', elt(1), S.data, elt(1), B)
1 Like

Yes, this seems to work:

using BenchmarkTools, Random, LinearAlgebra
elt = Float64
n = 32
Random.seed!(1234)
# A = rand(elt, n, n)
A = rand(elt, n, n)
b = rand(elt, n)
t1 = BLAS.syr!('U', elt(1), b, copy(A))
println(BLAS.get_config())
t2 = BLAS.syr!('U', elt(1), b, Symmetric(copy(A)).data) 

Edit: after @amontoison `s correction this

using BenchmarkTools, Random, LinearAlgebra

elt = Float64
n = 32
Random.seed!(1234)
# A = rand(elt, n, n)
A = rand(elt, n, n)
b = rand(elt, n, n)
println(BLAS.get_config())
t1 = BLAS.syrk!('U', 'N', elt(1), b, elt(1), copy(A))
t2 = BLAS.syrk!('U', 'N', elt(1), b, elt(1), Symmetric(copy(A)).data)

works too.

b is a Symmetric matrix, you forgot .data.

elt = Float64
n = 32
Random.seed!(1234)
# A = rand(elt, n, n)
A = rand(elt, n, n)
b = Symmetric(rand(elt, n, n))
println(BLAS.get_config())
t1 = BLAS.syrk!('U', 'N', elt(1), b.data, elt(1), copy(A))
t2 = BLAS.syrk!('U', 'N', elt(1), b.data, elt(1), Symmetric(copy(A)).data)
1 Like

Edit: Nonsense on my side…

Hm. But the spec allows for general matrices there?

BLAS routines require general arrays (Vector{T} or Matrix{T}).
I don’t know why you use a symmetric matrix for b, I just fixed the error of your previous message :slight_smile:

1 Like

My mistake, excellent: this did the job, you have earned another solution point!