Native (oversimplified) Julia gemm implementation

Hi Julians,

In the context of a Julia lecture preparation, I have implemented the following (over)simplfied mygemm matrix-matrix product implementation (150 lines):

mygemm.jl
using BenchmarkTools
using LinearAlgebra
using SIMD

const SBS=32 #Micro block size

# perform C+=A*B for A,B,C beeing fixed size matrix of size SBS*SBS
@inline function microgemm!(c,a,b)
   lane = VecRange{SBS}(0)
   @inbounds for j=1:SBS
      @inbounds for k=1:SBS
         @fastmath c[lane+1,j]+=a[lane+1,k]*Vec{SBS,Float64}(b[k,j])
      end
   end
   nothing
end

@inline function copytoblock!(aik,A,is,js)
   for j=1:SBS
      @simd for i=1:SBS
         @inbounds aik[i,j]=A[is+i,js+j]
      end
   end
   nothing
end
@inline function copyfromblock!(A,aik,is,js)
   for j=1:SBS
      @simd for i=1:SBS
         @inbounds A[is+i,js+j]=aik[i,j]
      end
   end
   nothing
end

function createBlockMatrix(A)
   nx,ny=size(A)
   nbx=div(nx,SBS)
   nby=div(ny,SBS)
   Ab=Array{Array{Float64,2},2}(undef,nbx,nby)
   for jb=1:nby
      js=(jb-1)*SBS
      @inbounds for ib=1:nbx
         is=(ib-1)*SBS
         aik=Array{Float64,2}(undef,SBS,SBS)
         copytoblock!(aik,A,is,js)
         Ab[ib,jb]=aik
      end
   end
   Ab
end

function copyfromblocks!(A,Ab)
   nbx,nby=size(Ab)
   for jb=1:nby
      js=(jb-1)*SBS
      @inbounds for ib=1:nbx
         is=(ib-1)*SBS
         copyfromblock!(A,Ab[ib,jb],is,js)
      end
   end
   nothing
end


function minigemm!(Cb,Ab,Bb)
   nax,nay=size(Ab)
   nbx,nby=size(Bb)
   # @assert nbx==nay

   nb=nbx
   for jb=1:nby
      js=(jb-1)*SBS
      @inbounds for kb=1:nbx
         ks=(kb-1)*SBS
         bkj=Bb[kb,jb]
         # copytoblock!(bkj,B,ks,js)
         for ib=1:nax
            is=(ib-1)*SBS
            aik=Ab[ib,kb]
            cij=Cb[ib,jb]
            # microgemm_SA!(cij,aik,bkj)
            microgemm!(cij,aik,bkj)
         end
      end
   end
end

function createBlockBlockMatrix(A,BS)
   nx,ny=size(A)
   nbx=div(nx,BS)
   nby=div(ny,BS)
   # @show nbx,nby
   Abb=Array{Array{Array{Float64,2},2},2}(undef,nbx,nby)
   Threads.@threads  for jb=1:nby
      js=(jb-1)*BS
      @inbounds for ib=1:nbx
         is=(ib-1)*BS
         vAb=view(A,is+1:is+BS,js+1:js+BS)
         Abb[ib,jb]=createBlockMatrix(vAb)
      end
   end
   Abb
end

function copyfromblocksblocks!(A,Abb)
   nbx,nby=size(Abb)
   BS=size(Abb[1,1],1)*SBS
   # @show BS
   for jb=1:nby
      js=(jb-1)*BS
      @inbounds for ib=1:nbx
         is=(ib-1)*BS
         vAb=view(A,is+1:is+BS,js+1:js+BS)
         copyfromblocks!(vAb,Abb[ib,jb])
      end
   end
   nothing
end


function macrogemm!(Cbb,Abb,Bbb)
   nax,nay=size(Abb)
   nbx,nby=size(Bbb)
   BS=size(Abb[1,1],1)*SBS
   nb=nbx

   Threads.@threads for jb=1:nby
      @inbounds for ib=1:nax
         cij=Cbb[ib,jb]
         for kb=1:nbx
            ks=(kb-1)*BS
            bkj=Bbb[kb,jb]
            aik=Abb[ib,kb]
            minigemm!(cij,aik,bkj)
         end
      end
   end
end

function mygemm!(C,A,B)
   BS=128
   Abb=createBlockBlockMatrix(A,BS)
   Bbb=createBlockBlockMatrix(B,BS)
   Cbb=createBlockBlockMatrix(C,BS)
   macrogemm!(Cbb,Abb,Bbb)
   copyfromblocksblocks!(C,Cbb)
   nothing
end

function test()
   @show Threads.nthreads()

   n=4096
   A=rand(n,n)
   B=rand(n,n)
   C=zeros(n,n)
   Cref=zeros(n,n)

   mul!(Cref,A,B)
   mygemm!(C,A,B)
   display(norm(C.-Cref))

   trials=@benchmark mul!($Cref,$A,$B)
   display(trials)
   tns=minimum(trials.times)
   gflops=Float64(2)*Float64(n)^3/(tns)
   println("gemm GFlops n=$n  \t"," :",gflops)

   trials=@benchmark mygemm!($C,$A,$B)
   display(trials)
   tns=minimum(trials.times)
   gflops=Float64(2)*Float64(n)^3/(tns)
   println("mygemm GFlops n=$n  \t"," :",gflops)

   nothing
end

test()

It does only deal with square matrices with size that can be divided by 128 :wink:

mygemm! function basically divides the work in parallel calls to minigemm! function operating on 128x128 blocks splitting their work in calls to ```microgemm!`` operating on 32x32 blocks.

I use SIMD.jl for the 32x32 block ```microgemm``:

using SIMD

const SBS=32 #Micro block size

# perform C+=A*B for A,B,C beeing fixed size matrix of size SBS*SBS
@inline function microgemm!(c,a,b)
   lane = VecRange{SBS}(0)
   @inbounds for j=1:SBS
      @inbounds for k=1:SBS
         @fastmath c[lane+1,j]+=a[lane+1,k]*Vec{SBS,Float64}(b[k,j])
      end
   end
   nothing
end

On a 4096x4096 matrix I obtain almost 100 GFLOPS (on my 4cores-8threads I6700K CPU) which is not ridiculous compared to the native BLAS call (130 GFLOPS):

results
Threads.nthreads() = 8
6.638192070257908e-9
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.054 s (0.00% GC)
  median time:      1.058 s (0.00% GC)
  mean time:        1.057 s (0.00% GC)
  maximum time:     1.061 s (0.00% GC)
  --------------
  samples:          5
  evals/sample:     1
gemm GFlops n=4096  	 :130.3436958236218
BenchmarkTools.Trial: 
  memory estimate:  389.66 MiB
  allocs estimate:  56099
  --------------
  minimum time:     1.412 s (0.00% GC)
  median time:      1.435 s (0.00% GC)
  mean time:        1.435 s (0.00% GC)
  maximum time:     1.461 s (0.00% GC)
  --------------
  samples:          4
  evals/sample:     1
mygemm GFlops n=4096  	 :97.35620646890548

My aim is to present performance basic knowledge in Julia like SIMD.jl and thread usage.
I would be happy to get some criticism on this implementation expecially on what could have been written more idiomatically. In particular I was not able to efficiently use subarrays. I also would be very happy to get better ideas for the SIMD part.

Thank you for your comments !

11 Likes

Very interesting code!

Just being curious here. Have you tried to use views instead of those copies (copytoblock function).

Thanks,
I think I have but could not get the same perf. I will try to post a version with views.

Not sure that is what you had in mind (I still have to reorganize the data by nested blocks)
but when I try to use views and ND-arrays the performance goes down to 17GFLOPS…

Summary
using BenchmarkTools
using LinearAlgebra
using SIMD

const SBS=32 #Micro block size
const BS=128
# perform C+=A*B for A,B,C beeing fixed size matrix of size SBS*SBS
@inline function microgemm!(c,a,b)
   lane = VecRange{SBS}(0)
   @inbounds for j=1:SBS
      @inbounds for k=1:SBS
         @fastmath c[lane+1,j]+=a[lane+1,k]*Vec{SBS,Float64}(b[k,j])
      end
   end
   nothing
end

function minigemm!(vCb,vAb,vBb)
   @inbounds nax,nay=size(vAb)[end-1],size(vAb)[end]
   @inbounds nbx,nby=size(vBb)[end-1],size(vBb)[end]
   @inbounds for jb=1:nby
      for kb=1:nbx
         bkj=view(vBb,:,:,kb,jb)
         for ib=1:nax
            cij=view(vCb,:,:,ib,jb)
            aik=view(vAb,:,:,ib,kb)
            microgemm!(cij,aik,bkj)
         end
      end
   end
end

function macrogemm!(Cbb,Abb,Bbb)
   @inbounds nax,nay=size(Abb)[end-1],size(Abb)[end]
   @inbounds nbx,nby=size(Bbb)[end-1],size(Bbb)[end]

   Threads.@threads for jb=1:nby
      @inbounds for ib=1:nax
         cij=view(Cbb,:,:,:,:,ib,jb)
         for kb=1:nbx
            # ks=(kb-1)*BS
            bkj=view(Bbb,:,:,:,:,kb,jb)
            aik=view(Abb,:,:,:,:,ib,kb)
            minigemm!(cij,aik,bkj)
         end
      end
   end
end



@inline function initblock!(vvAb,vvA)
   @inbounds for j=1:SBS
      @simd for i=1:SBS
         vvAb[i,j]=vvA[i,j]
      end
   end
   nothing
end

function initblockblock!(vAb,vA)
   @inbounds nbx,nby=size(vAb)[end-1],size(vAb)[end]
   @inbounds for jb=1:nby
      js=(jb-1)*SBS
      @inbounds for ib=1:nbx
         is=(ib-1)*SBS
         vvA=view(vA,is+1:is+SBS,js+1:js+SBS)
         vvAb=view(vAb,:,:,ib,jb)
         initblock!(vvAb,vvA)
      end
   end
   nothing
end


function createBlockBlockMatrix(A,BS)
   nx,ny=size(A)
   nbx=div(nx,BS)
   nby=div(ny,BS)
   nbbx=div(BS,SBS)
   nbby=div(BS,SBS)
   Abb=Array{Float64,6}(undef,SBS,SBS,nbbx,nbby,nbx,nby)
   Threads.@threads for jb=1:nby
      js=(jb-1)*BS
      @inbounds for ib=1:nbx
         is=(ib-1)*BS

         vA=view(A,is+1:is+BS,js+1:js+BS)
         vAbb=view(Abb,:,:,:,:,ib,jb)
         initblockblock!(vAbb,vA)
      end
   end
   Abb
end

@inline function copyfromblock!(vA,vaik)
   @inbounds for j=1:SBS
      @simd for i=1:SBS
         vA[i,j]=vaik[i,j]
      end
   end
   nothing
end

function copyfromblocks!(vA,vAb)
   @inbounds nbx,nby=size(vAb)[end-1],size(vAb)[end]
   @inbounds for jb=1:nby
      js=(jb-1)*SBS
      for ib=1:nbx
         is=(ib-1)*SBS
         vvA=view(vA,is+1:is+SBS,js+1:js+SBS)
         vvAb=view(vAb,:,:,ib,jb)
         copyfromblock!(vvA,vvAb)
      end
   end
   nothing
end




function copyfromblocksblocks!(A,Abb)
   @inbounds nbx,nby=size(Abb)[end-1],size(Abb)[end]
   @inbounds for jb=1:nby
      js=(jb-1)*BS
      for ib=1:nbx
         is=(ib-1)*BS
         vA=view(A,is+1:is+BS,js+1:js+BS)
         vAbb=view(Abb,:,:,:,:,ib,jb)
         copyfromblocks!(vA,vAbb)
      end
   end
   nothing
end



function mygemm!(C,A,B)
   Abb=createBlockBlockMatrix(A,BS)
   Bbb=createBlockBlockMatrix(B,BS)
   Cbb=createBlockBlockMatrix(C,BS)
   macrogemm!(Cbb,Abb,Bbb)
   copyfromblocksblocks!(C,Cbb)
   nothing
end

function test()
   @show Threads.nthreads()

   n=4096
   A=rand(n,n)
   Aref=deepcopy(A)
   @show norm(A)
   Abb=createBlockBlockMatrix(A,BS)
   # display(Abb)
   copyfromblocksblocks!(A,Abb)
   @show norm(A)
   @show norm(A.-Aref)

   B=rand(n,n)
   C=zeros(n,n)
   Cref=zeros(n,n)

   mul!(Cref,A,B)
   mygemm!(C,A,B)
   @show norm(C)
   @show norm(Cref)
   @show norm(C.-Cref)
   # display(C)
   # display(Cref)

   # return

   trials=@benchmark mul!($Cref,$A,$B)
   display(trials)
   tns=minimum(trials.times)
   gflops=Float64(2)*Float64(n)^3/(tns)
   println("gemm GFlops n=$n  \t"," :",gflops)

   trials=@benchmark mygemm!($C,$A,$B)
   display(trials)
   tns=minimum(trials.times)
   gflops=Float64(2)*Float64(n)^3/(tns)
   println("mygemm GFlops n=$n  \t"," :",gflops)

   nothing
end

test()

Really performance sensitive code can sometimes benefit from unsafe views https://github.com/oschulz/UnsafeArrays.jl

1 Like

There have been some other efforts to implement BLAS functionality in pure Julia. See the https://github.com/JuliaBLAS repos and this topic: We can write an optimized BLAS library in pure Julia (please skip OP and jump to post 4) - #4 by Elrod.

3 Likes

Wow ! Thank you very much for the link. I had a quick look at JBLAS and this is really a fantastic advanced performance lecture on HPC in Julia (and also in meta-programming). Although definitively out of the topic of my lecture preparation on Julia basics :wink: I really have to put some time to study this carefully.

For now I try to keep the focus on my introductory lecture and see what can be done at a relatively simple level. My main concern is that creating array of arrays works well here but it is a bit in contradiction with the standard multidim array usage in Julia.

I would be very happy if someone could show me how to write the same algo in a more idiomatic way. I will try @baggepinnen suggestion (thanks !) with uviews but I have the feeling that I am already moving away from the standard route.