Hi Julians,
In the context of a Julia lecture preparation, I have implemented the following (over)simplfied mygemm matrix-matrix product implementation (150 lines):
mygemm.jl
using BenchmarkTools
using LinearAlgebra
using SIMD
const SBS=32 #Micro block size
# perform C+=A*B for A,B,C beeing fixed size matrix of size SBS*SBS
@inline function microgemm!(c,a,b)
lane = VecRange{SBS}(0)
@inbounds for j=1:SBS
@inbounds for k=1:SBS
@fastmath c[lane+1,j]+=a[lane+1,k]*Vec{SBS,Float64}(b[k,j])
end
end
nothing
end
@inline function copytoblock!(aik,A,is,js)
for j=1:SBS
@simd for i=1:SBS
@inbounds aik[i,j]=A[is+i,js+j]
end
end
nothing
end
@inline function copyfromblock!(A,aik,is,js)
for j=1:SBS
@simd for i=1:SBS
@inbounds A[is+i,js+j]=aik[i,j]
end
end
nothing
end
function createBlockMatrix(A)
nx,ny=size(A)
nbx=div(nx,SBS)
nby=div(ny,SBS)
Ab=Array{Array{Float64,2},2}(undef,nbx,nby)
for jb=1:nby
js=(jb-1)*SBS
@inbounds for ib=1:nbx
is=(ib-1)*SBS
aik=Array{Float64,2}(undef,SBS,SBS)
copytoblock!(aik,A,is,js)
Ab[ib,jb]=aik
end
end
Ab
end
function copyfromblocks!(A,Ab)
nbx,nby=size(Ab)
for jb=1:nby
js=(jb-1)*SBS
@inbounds for ib=1:nbx
is=(ib-1)*SBS
copyfromblock!(A,Ab[ib,jb],is,js)
end
end
nothing
end
function minigemm!(Cb,Ab,Bb)
nax,nay=size(Ab)
nbx,nby=size(Bb)
# @assert nbx==nay
nb=nbx
for jb=1:nby
js=(jb-1)*SBS
@inbounds for kb=1:nbx
ks=(kb-1)*SBS
bkj=Bb[kb,jb]
# copytoblock!(bkj,B,ks,js)
for ib=1:nax
is=(ib-1)*SBS
aik=Ab[ib,kb]
cij=Cb[ib,jb]
# microgemm_SA!(cij,aik,bkj)
microgemm!(cij,aik,bkj)
end
end
end
end
function createBlockBlockMatrix(A,BS)
nx,ny=size(A)
nbx=div(nx,BS)
nby=div(ny,BS)
# @show nbx,nby
Abb=Array{Array{Array{Float64,2},2},2}(undef,nbx,nby)
Threads.@threads for jb=1:nby
js=(jb-1)*BS
@inbounds for ib=1:nbx
is=(ib-1)*BS
vAb=view(A,is+1:is+BS,js+1:js+BS)
Abb[ib,jb]=createBlockMatrix(vAb)
end
end
Abb
end
function copyfromblocksblocks!(A,Abb)
nbx,nby=size(Abb)
BS=size(Abb[1,1],1)*SBS
# @show BS
for jb=1:nby
js=(jb-1)*BS
@inbounds for ib=1:nbx
is=(ib-1)*BS
vAb=view(A,is+1:is+BS,js+1:js+BS)
copyfromblocks!(vAb,Abb[ib,jb])
end
end
nothing
end
function macrogemm!(Cbb,Abb,Bbb)
nax,nay=size(Abb)
nbx,nby=size(Bbb)
BS=size(Abb[1,1],1)*SBS
nb=nbx
Threads.@threads for jb=1:nby
@inbounds for ib=1:nax
cij=Cbb[ib,jb]
for kb=1:nbx
ks=(kb-1)*BS
bkj=Bbb[kb,jb]
aik=Abb[ib,kb]
minigemm!(cij,aik,bkj)
end
end
end
end
function mygemm!(C,A,B)
BS=128
Abb=createBlockBlockMatrix(A,BS)
Bbb=createBlockBlockMatrix(B,BS)
Cbb=createBlockBlockMatrix(C,BS)
macrogemm!(Cbb,Abb,Bbb)
copyfromblocksblocks!(C,Cbb)
nothing
end
function test()
@show Threads.nthreads()
n=4096
A=rand(n,n)
B=rand(n,n)
C=zeros(n,n)
Cref=zeros(n,n)
mul!(Cref,A,B)
mygemm!(C,A,B)
display(norm(C.-Cref))
trials=@benchmark mul!($Cref,$A,$B)
display(trials)
tns=minimum(trials.times)
gflops=Float64(2)*Float64(n)^3/(tns)
println("gemm GFlops n=$n \t"," :",gflops)
trials=@benchmark mygemm!($C,$A,$B)
display(trials)
tns=minimum(trials.times)
gflops=Float64(2)*Float64(n)^3/(tns)
println("mygemm GFlops n=$n \t"," :",gflops)
nothing
end
test()
It does only deal with square matrices with size that can be divided by 128 ![]()
mygemm! function basically divides the work in parallel calls to minigemm! function operating on 128x128 blocks splitting their work in calls to ```microgemm!`` operating on 32x32 blocks.
I use SIMD.jl for the 32x32 block ```microgemm``:
using SIMD
const SBS=32 #Micro block size
# perform C+=A*B for A,B,C beeing fixed size matrix of size SBS*SBS
@inline function microgemm!(c,a,b)
lane = VecRange{SBS}(0)
@inbounds for j=1:SBS
@inbounds for k=1:SBS
@fastmath c[lane+1,j]+=a[lane+1,k]*Vec{SBS,Float64}(b[k,j])
end
end
nothing
end
On a 4096x4096 matrix I obtain almost 100 GFLOPS (on my 4cores-8threads I6700K CPU) which is not ridiculous compared to the native BLAS call (130 GFLOPS):
results
Threads.nthreads() = 8
6.638192070257908e-9
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 1.054 s (0.00% GC)
median time: 1.058 s (0.00% GC)
mean time: 1.057 s (0.00% GC)
maximum time: 1.061 s (0.00% GC)
--------------
samples: 5
evals/sample: 1
gemm GFlops n=4096 :130.3436958236218
BenchmarkTools.Trial:
memory estimate: 389.66 MiB
allocs estimate: 56099
--------------
minimum time: 1.412 s (0.00% GC)
median time: 1.435 s (0.00% GC)
mean time: 1.435 s (0.00% GC)
maximum time: 1.461 s (0.00% GC)
--------------
samples: 4
evals/sample: 1
mygemm GFlops n=4096 :97.35620646890548
My aim is to present performance basic knowledge in Julia like SIMD.jl and thread usage.
I would be happy to get some criticism on this implementation expecially on what could have been written more idiomatically. In particular I was not able to efficiently use subarrays. I also would be very happy to get better ideas for the SIMD part.
Thank you for your comments !