Hi Julians,
In the context of a Julia lecture preparation, I have implemented the following (over)simplfied mygemm
matrix-matrix product implementation (150 lines):
mygemm.jl
using BenchmarkTools
using LinearAlgebra
using SIMD
const SBS=32 #Micro block size
# perform C+=A*B for A,B,C beeing fixed size matrix of size SBS*SBS
@inline function microgemm!(c,a,b)
lane = VecRange{SBS}(0)
@inbounds for j=1:SBS
@inbounds for k=1:SBS
@fastmath c[lane+1,j]+=a[lane+1,k]*Vec{SBS,Float64}(b[k,j])
end
end
nothing
end
@inline function copytoblock!(aik,A,is,js)
for j=1:SBS
@simd for i=1:SBS
@inbounds aik[i,j]=A[is+i,js+j]
end
end
nothing
end
@inline function copyfromblock!(A,aik,is,js)
for j=1:SBS
@simd for i=1:SBS
@inbounds A[is+i,js+j]=aik[i,j]
end
end
nothing
end
function createBlockMatrix(A)
nx,ny=size(A)
nbx=div(nx,SBS)
nby=div(ny,SBS)
Ab=Array{Array{Float64,2},2}(undef,nbx,nby)
for jb=1:nby
js=(jb-1)*SBS
@inbounds for ib=1:nbx
is=(ib-1)*SBS
aik=Array{Float64,2}(undef,SBS,SBS)
copytoblock!(aik,A,is,js)
Ab[ib,jb]=aik
end
end
Ab
end
function copyfromblocks!(A,Ab)
nbx,nby=size(Ab)
for jb=1:nby
js=(jb-1)*SBS
@inbounds for ib=1:nbx
is=(ib-1)*SBS
copyfromblock!(A,Ab[ib,jb],is,js)
end
end
nothing
end
function minigemm!(Cb,Ab,Bb)
nax,nay=size(Ab)
nbx,nby=size(Bb)
# @assert nbx==nay
nb=nbx
for jb=1:nby
js=(jb-1)*SBS
@inbounds for kb=1:nbx
ks=(kb-1)*SBS
bkj=Bb[kb,jb]
# copytoblock!(bkj,B,ks,js)
for ib=1:nax
is=(ib-1)*SBS
aik=Ab[ib,kb]
cij=Cb[ib,jb]
# microgemm_SA!(cij,aik,bkj)
microgemm!(cij,aik,bkj)
end
end
end
end
function createBlockBlockMatrix(A,BS)
nx,ny=size(A)
nbx=div(nx,BS)
nby=div(ny,BS)
# @show nbx,nby
Abb=Array{Array{Array{Float64,2},2},2}(undef,nbx,nby)
Threads.@threads for jb=1:nby
js=(jb-1)*BS
@inbounds for ib=1:nbx
is=(ib-1)*BS
vAb=view(A,is+1:is+BS,js+1:js+BS)
Abb[ib,jb]=createBlockMatrix(vAb)
end
end
Abb
end
function copyfromblocksblocks!(A,Abb)
nbx,nby=size(Abb)
BS=size(Abb[1,1],1)*SBS
# @show BS
for jb=1:nby
js=(jb-1)*BS
@inbounds for ib=1:nbx
is=(ib-1)*BS
vAb=view(A,is+1:is+BS,js+1:js+BS)
copyfromblocks!(vAb,Abb[ib,jb])
end
end
nothing
end
function macrogemm!(Cbb,Abb,Bbb)
nax,nay=size(Abb)
nbx,nby=size(Bbb)
BS=size(Abb[1,1],1)*SBS
nb=nbx
Threads.@threads for jb=1:nby
@inbounds for ib=1:nax
cij=Cbb[ib,jb]
for kb=1:nbx
ks=(kb-1)*BS
bkj=Bbb[kb,jb]
aik=Abb[ib,kb]
minigemm!(cij,aik,bkj)
end
end
end
end
function mygemm!(C,A,B)
BS=128
Abb=createBlockBlockMatrix(A,BS)
Bbb=createBlockBlockMatrix(B,BS)
Cbb=createBlockBlockMatrix(C,BS)
macrogemm!(Cbb,Abb,Bbb)
copyfromblocksblocks!(C,Cbb)
nothing
end
function test()
@show Threads.nthreads()
n=4096
A=rand(n,n)
B=rand(n,n)
C=zeros(n,n)
Cref=zeros(n,n)
mul!(Cref,A,B)
mygemm!(C,A,B)
display(norm(C.-Cref))
trials=@benchmark mul!($Cref,$A,$B)
display(trials)
tns=minimum(trials.times)
gflops=Float64(2)*Float64(n)^3/(tns)
println("gemm GFlops n=$n \t"," :",gflops)
trials=@benchmark mygemm!($C,$A,$B)
display(trials)
tns=minimum(trials.times)
gflops=Float64(2)*Float64(n)^3/(tns)
println("mygemm GFlops n=$n \t"," :",gflops)
nothing
end
test()
It does only deal with square matrices with size that can be divided by 128
mygemm!
function basically divides the work in parallel calls to minigemm!
function operating on 128x128 blocks splitting their work in calls to ```microgemm!`` operating on 32x32 blocks.
I use SIMD.jl for the 32x32 block ```microgemm``:
using SIMD
const SBS=32 #Micro block size
# perform C+=A*B for A,B,C beeing fixed size matrix of size SBS*SBS
@inline function microgemm!(c,a,b)
lane = VecRange{SBS}(0)
@inbounds for j=1:SBS
@inbounds for k=1:SBS
@fastmath c[lane+1,j]+=a[lane+1,k]*Vec{SBS,Float64}(b[k,j])
end
end
nothing
end
On a 4096x4096 matrix I obtain almost 100 GFLOPS (on my 4cores-8threads I6700K CPU) which is not ridiculous compared to the native BLAS call (130 GFLOPS):
results
Threads.nthreads() = 8
6.638192070257908e-9
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 1.054 s (0.00% GC)
median time: 1.058 s (0.00% GC)
mean time: 1.057 s (0.00% GC)
maximum time: 1.061 s (0.00% GC)
--------------
samples: 5
evals/sample: 1
gemm GFlops n=4096 :130.3436958236218
BenchmarkTools.Trial:
memory estimate: 389.66 MiB
allocs estimate: 56099
--------------
minimum time: 1.412 s (0.00% GC)
median time: 1.435 s (0.00% GC)
mean time: 1.435 s (0.00% GC)
maximum time: 1.461 s (0.00% GC)
--------------
samples: 4
evals/sample: 1
mygemm GFlops n=4096 :97.35620646890548
My aim is to present performance basic knowledge in Julia like SIMD.jl and thread usage.
I would be happy to get some criticism on this implementation expecially on what could have been written more idiomatically. In particular I was not able to efficiently use subarrays. I also would be very happy to get better ideas for the SIMD part.
Thank you for your comments !