Julia matrix-multiplication performance

stevengj · February 12, 2021, 10:12pm

For example, here is a little recursive implementation of a cache-oblivious matrix multiplication that stays within a factor of 2 of the single-threaded OpenBLAS performance on my laptop up to 3000×3000 matrices:

function add_matmul_rec!(m,n,p, i0,j0,k0, C,A,B)
    if m+n+p <= 256   # base case: naive matmult for sufficiently large matrices
        @avx for i = 1:m, k = 1:p
            c = zero(eltype(C))
            for j = 1:n
                @inbounds c += A[i0+i,j0+j] * B[j0+j,k0+k]
            end
            @inbounds C[i0+i,k0+k] += c
        end
    else
        m2 = m ÷ 2; n2 = n ÷ 2; p2 = p ÷ 2
        add_matmul_rec!(m2, n2, p2, i0, j0, k0, C, A, B)
        
        add_matmul_rec!(m-m2, n2, p2, i0+m2, j0, k0, C, A, B)
        add_matmul_rec!(m2, n-n2, p2, i0, j0+n2, k0, C, A, B)
        add_matmul_rec!(m2, n2, p-p2, i0, j0, k0+p2, C, A, B)
        
        add_matmul_rec!(m-m2, n-n2, p2, i0+m2, j0+n2, k0, C, A, B)
        add_matmul_rec!(m2, n-n2, p-p2, i0, j0+n2, k0+p2, C, A, B)
        add_matmul_rec!(m-m2, n2, p-p2, i0+m2, j0, k0+p2, C, A, B)
        
        add_matmul_rec!(m-m2, n-n2, p-p2, i0+m2, j0+n2, k0+p2, C, A, B)
    end
    return C
end

function matmul_rec!(C, A, B)
    m,n = size(A)
    n,p = size(B)
    size(C) == (m,p) || error("incorrect dimensions ", size(C), " ≠ $m × $p")
    fill!(C, zero(eltype(C)))
    return add_matmul_rec!(m,n,p, 0,0,0, C,A,B)
end


matmul_rec(A, B) = matmul_rec!(Array{promote_type(eltype(A), eltype(B))}(undef,
                                     size(A,1), size(B,2)),
                               A, B)

In comparison, Octavian.jl is ~100× more code, and OpenBLAS is > 1000× more code (and only supports 4 arithmetic types, whereas this code is type-generic).

Here larger numbers = faster (vertical axis is rate of floating-point arithmetic operations), timings are for double precision, “naive” is a 3-loop Julia version with @avx, and BLAS is mul! with OpenBLAS: