 # Native (oversimplified) Julia gemm implementation

Hi Julians,

In the context of a Julia lecture preparation, I have implemented the following (over)simplfied `mygemm` matrix-matrix product implementation (150 lines):

mygemm.jl
``````using BenchmarkTools
using LinearAlgebra
using SIMD

const SBS=32 #Micro block size

# perform C+=A*B for A,B,C beeing fixed size matrix of size SBS*SBS
@inline function microgemm!(c,a,b)
lane = VecRange{SBS}(0)
@inbounds for j=1:SBS
@inbounds for k=1:SBS
@fastmath c[lane+1,j]+=a[lane+1,k]*Vec{SBS,Float64}(b[k,j])
end
end
nothing
end

@inline function copytoblock!(aik,A,is,js)
for j=1:SBS
@simd for i=1:SBS
@inbounds aik[i,j]=A[is+i,js+j]
end
end
nothing
end
@inline function copyfromblock!(A,aik,is,js)
for j=1:SBS
@simd for i=1:SBS
@inbounds A[is+i,js+j]=aik[i,j]
end
end
nothing
end

function createBlockMatrix(A)
nx,ny=size(A)
nbx=div(nx,SBS)
nby=div(ny,SBS)
Ab=Array{Array{Float64,2},2}(undef,nbx,nby)
for jb=1:nby
js=(jb-1)*SBS
@inbounds for ib=1:nbx
is=(ib-1)*SBS
aik=Array{Float64,2}(undef,SBS,SBS)
copytoblock!(aik,A,is,js)
Ab[ib,jb]=aik
end
end
Ab
end

function copyfromblocks!(A,Ab)
nbx,nby=size(Ab)
for jb=1:nby
js=(jb-1)*SBS
@inbounds for ib=1:nbx
is=(ib-1)*SBS
copyfromblock!(A,Ab[ib,jb],is,js)
end
end
nothing
end

function minigemm!(Cb,Ab,Bb)
nax,nay=size(Ab)
nbx,nby=size(Bb)
# @assert nbx==nay

nb=nbx
for jb=1:nby
js=(jb-1)*SBS
@inbounds for kb=1:nbx
ks=(kb-1)*SBS
bkj=Bb[kb,jb]
# copytoblock!(bkj,B,ks,js)
for ib=1:nax
is=(ib-1)*SBS
aik=Ab[ib,kb]
cij=Cb[ib,jb]
# microgemm_SA!(cij,aik,bkj)
microgemm!(cij,aik,bkj)
end
end
end
end

function createBlockBlockMatrix(A,BS)
nx,ny=size(A)
nbx=div(nx,BS)
nby=div(ny,BS)
# @show nbx,nby
Abb=Array{Array{Array{Float64,2},2},2}(undef,nbx,nby)
Threads.@threads  for jb=1:nby
js=(jb-1)*BS
@inbounds for ib=1:nbx
is=(ib-1)*BS
vAb=view(A,is+1:is+BS,js+1:js+BS)
Abb[ib,jb]=createBlockMatrix(vAb)
end
end
Abb
end

function copyfromblocksblocks!(A,Abb)
nbx,nby=size(Abb)
BS=size(Abb[1,1],1)*SBS
# @show BS
for jb=1:nby
js=(jb-1)*BS
@inbounds for ib=1:nbx
is=(ib-1)*BS
vAb=view(A,is+1:is+BS,js+1:js+BS)
copyfromblocks!(vAb,Abb[ib,jb])
end
end
nothing
end

function macrogemm!(Cbb,Abb,Bbb)
nax,nay=size(Abb)
nbx,nby=size(Bbb)
BS=size(Abb[1,1],1)*SBS
nb=nbx

Threads.@threads for jb=1:nby
@inbounds for ib=1:nax
cij=Cbb[ib,jb]
for kb=1:nbx
ks=(kb-1)*BS
bkj=Bbb[kb,jb]
aik=Abb[ib,kb]
minigemm!(cij,aik,bkj)
end
end
end
end

function mygemm!(C,A,B)
BS=128
Abb=createBlockBlockMatrix(A,BS)
Bbb=createBlockBlockMatrix(B,BS)
Cbb=createBlockBlockMatrix(C,BS)
macrogemm!(Cbb,Abb,Bbb)
copyfromblocksblocks!(C,Cbb)
nothing
end

function test()
@show Threads.nthreads()

n=4096
A=rand(n,n)
B=rand(n,n)
C=zeros(n,n)
Cref=zeros(n,n)

mul!(Cref,A,B)
mygemm!(C,A,B)
display(norm(C.-Cref))

trials=@benchmark mul!(\$Cref,\$A,\$B)
display(trials)
tns=minimum(trials.times)
gflops=Float64(2)*Float64(n)^3/(tns)
println("gemm GFlops n=\$n  \t"," :",gflops)

trials=@benchmark mygemm!(\$C,\$A,\$B)
display(trials)
tns=minimum(trials.times)
gflops=Float64(2)*Float64(n)^3/(tns)
println("mygemm GFlops n=\$n  \t"," :",gflops)

nothing
end

test()
``````

It does only deal with square matrices with size that can be divided by 128 `mygemm!` function basically divides the work in parallel calls to `minigemm!` function operating on 128x128 blocks splitting their work in calls to ```microgemm!`` operating on 32x32 blocks.

I use SIMD.jl for the 32x32 block ```microgemm``:

``````using SIMD

const SBS=32 #Micro block size

# perform C+=A*B for A,B,C beeing fixed size matrix of size SBS*SBS
@inline function microgemm!(c,a,b)
lane = VecRange{SBS}(0)
@inbounds for j=1:SBS
@inbounds for k=1:SBS
@fastmath c[lane+1,j]+=a[lane+1,k]*Vec{SBS,Float64}(b[k,j])
end
end
nothing
end
``````

On a 4096x4096 matrix I obtain almost 100 GFLOPS (on my 4cores-8threads I6700K CPU) which is not ridiculous compared to the native BLAS call (130 GFLOPS):

results
``````Threads.nthreads() = 8
6.638192070257908e-9
BenchmarkTools.Trial:
memory estimate:  0 bytes
allocs estimate:  0
--------------
minimum time:     1.054 s (0.00% GC)
median time:      1.058 s (0.00% GC)
mean time:        1.057 s (0.00% GC)
maximum time:     1.061 s (0.00% GC)
--------------
samples:          5
evals/sample:     1
gemm GFlops n=4096  	 :130.3436958236218
BenchmarkTools.Trial:
memory estimate:  389.66 MiB
allocs estimate:  56099
--------------
minimum time:     1.412 s (0.00% GC)
median time:      1.435 s (0.00% GC)
mean time:        1.435 s (0.00% GC)
maximum time:     1.461 s (0.00% GC)
--------------
samples:          4
evals/sample:     1
mygemm GFlops n=4096  	 :97.35620646890548
``````

My aim is to present performance basic knowledge in Julia like SIMD.jl and thread usage.
I would be happy to get some criticism on this implementation expecially on what could have been written more idiomatically. In particular I was not able to efficiently use subarrays. I also would be very happy to get better ideas for the SIMD part.

Thank you for your comments !

9 Likes

Very interesting code!

Just being curious here. Have you tried to use views instead of those copies (copytoblock function).

Thanks,
I think I have but could not get the same perf. I will try to post a version with views.

Not sure that is what you had in mind (I still have to reorganize the data by nested blocks)
but when I try to use views and ND-arrays the performance goes down to 17GFLOPS…

Summary
``````using BenchmarkTools
using LinearAlgebra
using SIMD

const SBS=32 #Micro block size
const BS=128
# perform C+=A*B for A,B,C beeing fixed size matrix of size SBS*SBS
@inline function microgemm!(c,a,b)
lane = VecRange{SBS}(0)
@inbounds for j=1:SBS
@inbounds for k=1:SBS
@fastmath c[lane+1,j]+=a[lane+1,k]*Vec{SBS,Float64}(b[k,j])
end
end
nothing
end

function minigemm!(vCb,vAb,vBb)
@inbounds nax,nay=size(vAb)[end-1],size(vAb)[end]
@inbounds nbx,nby=size(vBb)[end-1],size(vBb)[end]
@inbounds for jb=1:nby
for kb=1:nbx
bkj=view(vBb,:,:,kb,jb)
for ib=1:nax
cij=view(vCb,:,:,ib,jb)
aik=view(vAb,:,:,ib,kb)
microgemm!(cij,aik,bkj)
end
end
end
end

function macrogemm!(Cbb,Abb,Bbb)
@inbounds nax,nay=size(Abb)[end-1],size(Abb)[end]
@inbounds nbx,nby=size(Bbb)[end-1],size(Bbb)[end]

Threads.@threads for jb=1:nby
@inbounds for ib=1:nax
cij=view(Cbb,:,:,:,:,ib,jb)
for kb=1:nbx
# ks=(kb-1)*BS
bkj=view(Bbb,:,:,:,:,kb,jb)
aik=view(Abb,:,:,:,:,ib,kb)
minigemm!(cij,aik,bkj)
end
end
end
end

@inline function initblock!(vvAb,vvA)
@inbounds for j=1:SBS
@simd for i=1:SBS
vvAb[i,j]=vvA[i,j]
end
end
nothing
end

function initblockblock!(vAb,vA)
@inbounds nbx,nby=size(vAb)[end-1],size(vAb)[end]
@inbounds for jb=1:nby
js=(jb-1)*SBS
@inbounds for ib=1:nbx
is=(ib-1)*SBS
vvA=view(vA,is+1:is+SBS,js+1:js+SBS)
vvAb=view(vAb,:,:,ib,jb)
initblock!(vvAb,vvA)
end
end
nothing
end

function createBlockBlockMatrix(A,BS)
nx,ny=size(A)
nbx=div(nx,BS)
nby=div(ny,BS)
nbbx=div(BS,SBS)
nbby=div(BS,SBS)
Abb=Array{Float64,6}(undef,SBS,SBS,nbbx,nbby,nbx,nby)
Threads.@threads for jb=1:nby
js=(jb-1)*BS
@inbounds for ib=1:nbx
is=(ib-1)*BS

vA=view(A,is+1:is+BS,js+1:js+BS)
vAbb=view(Abb,:,:,:,:,ib,jb)
initblockblock!(vAbb,vA)
end
end
Abb
end

@inline function copyfromblock!(vA,vaik)
@inbounds for j=1:SBS
@simd for i=1:SBS
vA[i,j]=vaik[i,j]
end
end
nothing
end

function copyfromblocks!(vA,vAb)
@inbounds nbx,nby=size(vAb)[end-1],size(vAb)[end]
@inbounds for jb=1:nby
js=(jb-1)*SBS
for ib=1:nbx
is=(ib-1)*SBS
vvA=view(vA,is+1:is+SBS,js+1:js+SBS)
vvAb=view(vAb,:,:,ib,jb)
copyfromblock!(vvA,vvAb)
end
end
nothing
end

function copyfromblocksblocks!(A,Abb)
@inbounds nbx,nby=size(Abb)[end-1],size(Abb)[end]
@inbounds for jb=1:nby
js=(jb-1)*BS
for ib=1:nbx
is=(ib-1)*BS
vA=view(A,is+1:is+BS,js+1:js+BS)
vAbb=view(Abb,:,:,:,:,ib,jb)
copyfromblocks!(vA,vAbb)
end
end
nothing
end

function mygemm!(C,A,B)
Abb=createBlockBlockMatrix(A,BS)
Bbb=createBlockBlockMatrix(B,BS)
Cbb=createBlockBlockMatrix(C,BS)
macrogemm!(Cbb,Abb,Bbb)
copyfromblocksblocks!(C,Cbb)
nothing
end

function test()
@show Threads.nthreads()

n=4096
A=rand(n,n)
Aref=deepcopy(A)
@show norm(A)
Abb=createBlockBlockMatrix(A,BS)
# display(Abb)
copyfromblocksblocks!(A,Abb)
@show norm(A)
@show norm(A.-Aref)

B=rand(n,n)
C=zeros(n,n)
Cref=zeros(n,n)

mul!(Cref,A,B)
mygemm!(C,A,B)
@show norm(C)
@show norm(Cref)
@show norm(C.-Cref)
# display(C)
# display(Cref)

# return

trials=@benchmark mul!(\$Cref,\$A,\$B)
display(trials)
tns=minimum(trials.times)
gflops=Float64(2)*Float64(n)^3/(tns)
println("gemm GFlops n=\$n  \t"," :",gflops)

trials=@benchmark mygemm!(\$C,\$A,\$B)
display(trials)
tns=minimum(trials.times)
gflops=Float64(2)*Float64(n)^3/(tns)
println("mygemm GFlops n=\$n  \t"," :",gflops)

nothing
end

test()
``````

Really performance sensitive code can sometimes benefit from unsafe views https://github.com/oschulz/UnsafeArrays.jl

1 Like

There have been some other efforts to implement BLAS functionality in pure Julia. See the https://github.com/JuliaBLAS repos and this topic: We can write an optimized BLAS library in pure Julia (please skip OP and jump to post 4).

3 Likes

Wow ! Thank you very much for the link. I had a quick look at JBLAS and this is really a fantastic advanced performance lecture on HPC in Julia (and also in meta-programming). Although definitively out of the topic of my lecture preparation on Julia basics I really have to put some time to study this carefully.

For now I try to keep the focus on my introductory lecture and see what can be done at a relatively simple level. My main concern is that creating array of arrays works well here but it is a bit in contradiction with the standard multidim array usage in Julia.

I would be very happy if someone could show me how to write the same algo in a more idiomatic way. I will try @baggepinnen suggestion (thanks !) with uviews but I have the feeling that I am already moving away from the standard route.