When translating the benchmark from the "Outperformed by Matlab" thread
```julia
using LinearAlgebra
using MKL
using BenchmarkTools

function _kron!(C::T1, A::T2, B::T3) where {T1<:AbstractVector, T2<:AbstractVector, T3<:AbstractVector}
    @views for i = 1:size(A, 1)
        mul!(C[(i-1)*size(B, 1)+1:i*size(B, 1)], A[i], B)
    end
    return C
end

function _kron(A::T1, B::T2) where {T1<:AbstractVector, T2<:AbstractVector}
    C = zeros(size(A, 1) * size(B, 1))
    _kron!(C, A, B)
    return C
end

function normal1!(C, Cx, x, A, B)
    temp = Vector{Float64}(undef, size(A, 2) * size(B, 2))
    @views for n = 1:size(A, 1)
        _kron!(temp, A[n, :], B[n, :])
        BLAS.ger!(1., temp, temp, C)
        BLAS.axpy!(x[n], temp, Cx)
    end
    return Cx
end

function kr!(C, A, B)
    @views for n = 1:size(A, 1)
        _kron!(C[:, n], A[n, :], B[n, :])
    end
    return C
end

function kr(A, B)
    C = Matrix{Float64}(undef, size(A, 2) * size(B, 2), size(A, 1))
    kr!(C, A, B)
    return C
end

function normal2!(C, Cx, x, A, B, temp, n1, n2)
    @views kr!(temp, A[n1:n2, :], B[n1:n2, :])
    BLAS.gemm!('N', 'T', 1., temp, temp, 1., C)
    @views BLAS.gemv!('N', 1., temp, x[n1:n2], 1., Cx)
end

function normal2!(C, Cx, x, A, B, blocksize)
    n1 = 1
    if blocksize < size(A, 1)
        temp = Matrix{Float64}(undef, size(A, 2) * size(B, 2), blocksize)
        @views while n1 <= size(A, 1) - blocksize
            n2 = n1 + blocksize - 1
            normal2!(C, Cx, x, A, B, temp, n1, n2)
            n1 += blocksize
        end
    end
    n2 = size(A, 1)
    temp = Matrix{Float64}(undef, size(A, 2) * size(B, 2), n2 - n1 + 1)
    normal2!(C, Cx, x, A, B, temp, n1, n2)
    return Cx
end

N = 10000
R = 20
M = 40
A = rand(N, M)
B = rand(N, R)
x = rand(N)
C = zeros(M*R, M*R)
Cx = zeros(M*R)
normal1!(C, Cx, x, A, B)
CNormal = copy(C)
CxNormal = copy(Cx)

C = zeros(M*R, M*R)
Cx = zeros(M*R)
normal2!(C, Cx, x, A, B, 100)
@assert C ≈ CNormal
@assert Cx ≈ CxNormal

@btime normal1!($C, $Cx, $x, $A, $B)
# for blocksize in [10, 100, 1000, 10000, 100000]
for blocksize in [10, 100, 1000]
    @btime normal2!($C, $Cx, $x, $A, $B, $blocksize)
end
```
to use `mul!` instead
```julia
using LinearAlgebra
using BenchmarkTools

function _kron!(C::T1, A::T2, B::T3) where {T1<:AbstractVector, T2<:AbstractVector, T3<:AbstractVector}
    @views for i = 1:size(A, 1)
        mul!(C[(i-1)*size(B, 1)+1:i*size(B, 1)], A[i], B)
    end
    return C
end

function _kron(A::T1, B::T2) where {T1<:AbstractVector, T2<:AbstractVector}
    C = zeros(size(A, 1) * size(B, 1))
    _kron!(C, A, B)
    return C
end

function normal1!(C, Cx, x, A, B)
    temp = Vector{Float64}(undef, size(A, 2) * size(B, 2))
    @views for n = 1:size(A, 1)
        _kron!(temp, A[n, :], B[n, :])
        # mul!(C, temp, temp', 1., 1.)
        BLAS.ger!(1., temp, temp, C)
        mul!(Cx, temp, x[n], 1., 1.)
    end
    return Cx
end

function kr!(C, A, B)
    @views for n = 1:size(A, 1)
        _kron!(C[:, n], A[n, :], B[n, :])
    end
    return C
end

function kr(A, B)
    C = Matrix{Float64}(undef, size(A, 2) * size(B, 2), size(A, 1))
    kr!(C, A, B)
    return C
end

function normal2!(C, Cx, x, A, B, temp, n1, n2)
    @views kr!(temp, A[n1:n2, :], B[n1:n2, :])
    mul!(C, temp, temp', 1., 1.)
    @views mul!(Cx, temp, x[n1:n2], 1., 1.)
end

function normal2!(C, Cx, x, A, B, blocksize)
    n1 = 1
    if blocksize < size(A, 1)
        temp = Matrix{Float64}(undef, size(A, 2) * size(B, 2), blocksize)
        @views while n1 <= size(A, 1) - blocksize
            n2 = n1 + blocksize - 1
            normal2!(C, Cx, x, A, B, temp, n1, n2)
            n1 += blocksize
        end
    end
    n2 = size(A, 1)
    temp = Matrix{Float64}(undef, size(A, 2) * size(B, 2), n2 - n1 + 1)
    normal2!(C, Cx, x, A, B, temp, n1, n2)
    return Cx
end

N = 10000
R = 20
M = 40
A = rand(N, M)
B = rand(N, R)
x = rand(N)
C = zeros(M*R, M*R)
Cx = zeros(M*R)
normal1!(C, Cx, x, A, B)
CNormal = copy(C)
CxNormal = copy(Cx)

C = zeros(M*R, M*R)
Cx = zeros(M*R)
normal2!(C, Cx, x, A, B, 100)
@assert C ≈ CNormal
@assert Cx ≈ CxNormal

@btime normal1!($C, $Cx, $x, $A, $B)
# for blocksize in [10, 100, 1000, 10000, 100000]
for blocksize in [10, 100, 1000]
    @btime normal2!($C, $Cx, $x, $A, $B, $blocksize)
end
```
I noticed the following oddities.

- `mul!(C, temp, temp', 1., 1.)` seems to compute the same as `BLAS.ger!(1., temp, temp, C)` but is slooow (a standalone check is sketched after this list).
- `mul!(Cx, temp, x[n], 1., 1.)` seems to work (which I like very much), although this is not documented?
- Runtime performance using BLAS

  ```
  371.686 ms (1 allocation: 6.38 KiB)
  100.720 ms (4 allocations: 125.16 KiB)
  62.885 ms (4 allocations: 1.22 MiB)
  69.160 ms (4 allocations: 12.21 MiB)
  ```

  looks significantly different from the Julia `mul!` version

  ```
  336.500 ms (1 allocation: 6.38 KiB)
  922.396 ms (4 allocations: 125.16 KiB)
  128.485 ms (4 allocations: 1.22 MiB)
  56.164 ms (4 allocations: 12.21 MiB)
  ```

  which really surprises me.

This is on Julia 1.6.4; 1.7.0-rc3 behaves similarly.
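
For reference, here is a small standalone sketch (not part of the benchmark above; the length 800 just matches `M*R`, and `xn` stands in for `x[n]`) that checks the two equivalences from the first two bullets and times the two rank-1 update spellings in isolation:

```julia
using LinearAlgebra
using BenchmarkTools

temp = rand(800)            # same length as M*R in the scripts above
xn   = rand()               # stands in for x[n]

# Rank-1 update: 5-arg mul! with an outer product vs. BLAS.ger!
C1 = zeros(800, 800); C2 = zeros(800, 800)
mul!(C1, temp, temp', 1., 1.)      # C1 .+= temp * temp'
BLAS.ger!(1., temp, temp, C2)      # C2 .+= temp * temp'
@assert C1 ≈ C2

# Vector-times-scalar accumulation: the (undocumented?) mul! method vs. BLAS.axpy!
Cx1 = zeros(800); Cx2 = zeros(800)
mul!(Cx1, temp, xn, 1., 1.)        # Cx1 .+= temp * xn
BLAS.axpy!(xn, temp, Cx2)          # Cx2 .+= xn * temp
@assert Cx1 ≈ Cx2

# Time only the rank-1 update (values keep accumulating across samples,
# which does not affect the timing itself)
@btime mul!($C1, $temp, $(temp'), 1., 1.)
@btime BLAS.ger!(1., $temp, $temp, $C2)
```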