When translating the benchmark from the "Outperformed by Matlab" thread
```julia
using LinearAlgebra
using MKL
using BenchmarkTools

function _kron!(C::T1, A::T2, B::T3) where {T1<:AbstractVector, T2<:AbstractVector, T3<:AbstractVector}
    @views for i = 1:size(A, 1)
        mul!(C[(i-1)*size(B, 1)+1:i*size(B, 1)], A[i], B)
    end
    return C
end

function _kron(A::T1, B::T2) where {T1<:AbstractVector, T2<:AbstractVector}
    C = zeros(size(A, 1) * size(B, 1))
    _kron!(C, A, B)
    return C
end

function normal1!(C, Cx, x, A, B)
    temp = Vector{Float64}(undef, size(A, 2) * size(B, 2))
    @views for n = 1:size(A, 1)
        _kron!(temp, A[n, :], B[n, :])
        BLAS.ger!(1., temp, temp, C)
        BLAS.axpy!(x[n], temp, Cx)
    end
    return Cx
end

function kr!(C, A, B)
    @views for n = 1:size(A, 1)
        _kron!(C[:, n], A[n, :], B[n, :])
    end
    return C
end

function kr(A, B)
    C = Matrix{Float64}(undef, size(A, 2) * size(B, 2), size(A, 1))
    kr!(C, A, B)
    return C
end

function normal2!(C, Cx, x, A, B, temp, n1, n2)
    @views kr!(temp, A[n1:n2, :], B[n1:n2, :])
    BLAS.gemm!('N', 'T', 1., temp, temp, 1., C)
    @views BLAS.gemv!('N', 1., temp, x[n1:n2], 1., Cx)
end

function normal2!(C, Cx, x, A, B, blocksize)
    n1 = 1
    if blocksize < size(A, 1)
        temp = Matrix{Float64}(undef, size(A, 2) * size(B, 2), blocksize)
        @views while n1 <= size(A, 1) - blocksize
            n2 = n1 + blocksize - 1
            normal2!(C, Cx, x, A, B, temp, n1, n2)
            n1 += blocksize
        end
    end
    n2 = size(A, 1)
    temp = Matrix{Float64}(undef, size(A, 2) * size(B, 2), n2 - n1 + 1)
    normal2!(C, Cx, x, A, B, temp, n1, n2)
    return Cx
end

N = 10000
R = 20
M = 40
A = rand(N, M)
B = rand(N, R)
x = rand(N)
C = zeros(M*R, M*R)
Cx = zeros(M*R)
normal1!(C, Cx, x, A, B)
CNormal = copy(C)
CxNormal = copy(Cx)

C = zeros(M*R, M*R)
Cx = zeros(M*R)
normal2!(C, Cx, x, A, B, 100)
@assert C ≈ CNormal
@assert Cx ≈ CxNormal

@btime normal1!($C, $Cx, $x, $A, $B)
# for blocksize in [10, 100, 1000, 10000, 100000]
for blocksize in [10, 100, 1000]
    @btime normal2!($C, $Cx, $x, $A, $B, $blocksize)
end
```
to use `mul!` instead
```julia
using LinearAlgebra
using BenchmarkTools

function _kron!(C::T1, A::T2, B::T3) where {T1<:AbstractVector, T2<:AbstractVector, T3<:AbstractVector}
    @views for i = 1:size(A, 1)
        mul!(C[(i-1)*size(B, 1)+1:i*size(B, 1)], A[i], B)
    end
    return C
end

function _kron(A::T1, B::T2) where {T1<:AbstractVector, T2<:AbstractVector}
    C = zeros(size(A, 1) * size(B, 1))
    _kron!(C, A, B)
    return C
end

function normal1!(C, Cx, x, A, B)
    temp = Vector{Float64}(undef, size(A, 2) * size(B, 2))
    @views for n = 1:size(A, 1)
        _kron!(temp, A[n, :], B[n, :])
        # mul!(C, temp, temp', 1., 1.)
        BLAS.ger!(1., temp, temp, C)
        mul!(Cx, temp, x[n], 1., 1.)
    end
    return Cx
end

function kr!(C, A, B)
    @views for n = 1:size(A, 1)
        _kron!(C[:, n], A[n, :], B[n, :])
    end
    return C
end

function kr(A, B)
    C = Matrix{Float64}(undef, size(A, 2) * size(B, 2), size(A, 1))
    kr!(C, A, B)
    return C
end

function normal2!(C, Cx, x, A, B, temp, n1, n2)
    @views kr!(temp, A[n1:n2, :], B[n1:n2, :])
    mul!(C, temp, temp', 1., 1.)
    @views mul!(Cx, temp, x[n1:n2], 1., 1.)
end

function normal2!(C, Cx, x, A, B, blocksize)
    n1 = 1
    if blocksize < size(A, 1)
        temp = Matrix{Float64}(undef, size(A, 2) * size(B, 2), blocksize)
        @views while n1 <= size(A, 1) - blocksize
            n2 = n1 + blocksize - 1
            normal2!(C, Cx, x, A, B, temp, n1, n2)
            n1 += blocksize
        end
    end
    n2 = size(A, 1)
    temp = Matrix{Float64}(undef, size(A, 2) * size(B, 2), n2 - n1 + 1)
    normal2!(C, Cx, x, A, B, temp, n1, n2)
    return Cx
end

N = 10000
R = 20
M = 40
A = rand(N, M)
B = rand(N, R)
x = rand(N)
C = zeros(M*R, M*R)
Cx = zeros(M*R)
normal1!(C, Cx, x, A, B)
CNormal = copy(C)
CxNormal = copy(Cx)

C = zeros(M*R, M*R)
Cx = zeros(M*R)
normal2!(C, Cx, x, A, B, 100)
@assert C ≈ CNormal
@assert Cx ≈ CxNormal

@btime normal1!($C, $Cx, $x, $A, $B)
# for blocksize in [10, 100, 1000, 10000, 100000]
for blocksize in [10, 100, 1000]
    @btime normal2!($C, $Cx, $x, $A, $B, $blocksize)
end
```
I noticed the following oddities.

- `mul!(C, temp, temp', 1., 1.)` seems to compute the same as `BLAS.ger!(1., temp, temp, C)` but is slooow (a standalone check is sketched after this list).
- `mul!(Cx, temp, x[n], 1., 1.)` seems to work (which I like very much), although this is not documented?
- Runtime performance using BLAS

  ```
  371.686 ms (1 allocation: 6.38 KiB)
  100.720 ms (4 allocations: 125.16 KiB)
  62.885 ms (4 allocations: 1.22 MiB)
  69.160 ms (4 allocations: 12.21 MiB)
  ```

  looks significantly different from the Julia `mul!` version

  ```
  336.500 ms (1 allocation: 6.38 KiB)
  922.396 ms (4 allocations: 125.16 KiB)
  128.485 ms (4 allocations: 1.22 MiB)
  56.164 ms (4 allocations: 12.21 MiB)
  ```

  which really surprises me.

This is on Julia 1.6.4; 1.7.0-rc3 behaves similarly.
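
For reference, here is a small standalone sketch (not part of the benchmark above; the length 800 just matches `M*R`, and `xn` stands in for `x[n]`) that checks the two equivalences from the first two bullets and times the two rank-1 update spellings in isolation:

```julia
using LinearAlgebra
using BenchmarkTools

temp = rand(800)            # same length as M*R in the scripts above
xn   = rand()               # stands in for x[n]

# Rank-1 update: 5-arg mul! with an outer product vs. BLAS.ger!
C1 = zeros(800, 800); C2 = zeros(800, 800)
mul!(C1, temp, temp', 1., 1.)      # C1 .+= temp * temp'
BLAS.ger!(1., temp, temp, C2)      # C2 .+= temp * temp'
@assert C1 ≈ C2

# Vector-times-scalar accumulation: the (undocumented?) mul! method vs. BLAS.axpy!
Cx1 = zeros(800); Cx2 = zeros(800)
mul!(Cx1, temp, xn, 1., 1.)        # Cx1 .+= temp * xn
BLAS.axpy!(xn, temp, Cx2)          # Cx2 .+= xn * temp
@assert Cx1 ≈ Cx2

# Time only the rank-1 update (values keep accumulating across samples,
# which does not affect the timing itself)
@btime mul!($C1, $temp, $(temp'), 1., 1.)
@btime BLAS.ger!(1., $temp, $temp, $C2)
```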