For example, here is a little recursive implementation of a cache-oblivious matrix multiplication that stays within a factor of 2 of the single-threaded OpenBLAS performance on my laptop up to 3000×3000 matrices:
"""
    add_matmul_rec!(m, n, p, i0, j0, k0, C, A, B)

Recursively accumulate a block matrix product into `C`:
`C[i0+1:i0+m, k0+1:k0+p] += A[i0+1:i0+m, j0+1:j0+n] * B[j0+1:j0+n, k0+1:k0+p]`.

Cache-oblivious scheme: blocks above the base-case threshold are halved
along every dimension and handled by eight recursive calls, so each cache
level is exploited without any hardware-specific tuning. Returns `C`.

NOTE(review): `@avx` comes from LoopVectorization.jl — presumably loaded
elsewhere in the file; verify before running this block in isolation.
"""
function add_matmul_rec!(m,n,p, i0,j0,k0, C,A,B)
if m+n+p <= 256 # base case: naive matmult for sufficiently small matrices
@avx for i = 1:m, k = 1:p
c = zero(eltype(C)) # type-stable accumulator matching C's element type
for j = 1:n
@inbounds c += A[i0+i,j0+j] * B[j0+j,k0+k]
end
@inbounds C[i0+i,k0+k] += c
end
else
# Split each dimension roughly in half; the `m-m2` (etc.) halves absorb
# odd sizes. The eight calls below cover all 2×2×2 sub-block combinations.
m2 = m ÷ 2; n2 = n ÷ 2; p2 = p ÷ 2
add_matmul_rec!(m2, n2, p2, i0, j0, k0, C, A, B)
add_matmul_rec!(m-m2, n2, p2, i0+m2, j0, k0, C, A, B)
add_matmul_rec!(m2, n-n2, p2, i0, j0+n2, k0, C, A, B)
add_matmul_rec!(m2, n2, p-p2, i0, j0, k0+p2, C, A, B)
add_matmul_rec!(m-m2, n-n2, p2, i0+m2, j0+n2, k0, C, A, B)
add_matmul_rec!(m2, n-n2, p-p2, i0, j0+n2, k0+p2, C, A, B)
add_matmul_rec!(m-m2, n2, p-p2, i0+m2, j0, k0+p2, C, A, B)
add_matmul_rec!(m-m2, n-n2, p-p2, i0+m2, j0+n2, k0+p2, C, A, B)
end
return C
end
"""
    matmul_rec!(C, A, B)

Compute `C = A * B` in place using the cache-oblivious recursive algorithm
`add_matmul_rec!`. `C` is zeroed first, then the product is accumulated.
Returns `C`.

Throws `DimensionMismatch` if `size(A, 2) ≠ size(B, 1)`, and errors if
`size(C)` is not `(size(A, 1), size(B, 2))`.
"""
function matmul_rec!(C, A, B)
    m, n = size(A)
    # Bug fix: the original did `n, p = size(B)`, silently rebinding `n` to
    # size(B, 1). A mismatch between A's columns and B's rows therefore went
    # undetected — especially dangerous given the @inbounds indexing in the
    # recursive kernel.
    size(B, 1) == n || throw(DimensionMismatch(
        "inner dimensions do not match: size(A, 2) = $n ≠ size(B, 1) = $(size(B, 1))"))
    p = size(B, 2)
    size(C) == (m, p) || error("incorrect dimensions ", size(C), " ≠ $m × $p")
    fill!(C, zero(eltype(C)))  # start from zero; the kernel accumulates (+=)
    return add_matmul_rec!(m, n, p, 0, 0, 0, C, A, B)
end
"""
    matmul_rec(A, B)

Allocating variant of [`matmul_rec!`](@ref): allocate a fresh output matrix
with element type `promote_type(eltype(A), eltype(B))` and size
`(size(A, 1), size(B, 2))`, then compute `A * B` into it and return it.
"""
function matmul_rec(A, B)
    T = promote_type(eltype(A), eltype(B))
    C = Array{T}(undef, size(A, 1), size(B, 2))
    return matmul_rec!(C, A, B)
end
In comparison, Octavian.jl is ~100× more code, and OpenBLAS is > 1000× more code (and only supports 4 arithmetic types, whereas this code is type-generic).
Here larger numbers = faster (the vertical axis is the rate of floating-point arithmetic operations); timings are for double precision, “naive” is a 3-loop Julia version with `@avx`, and BLAS is `mul!` with OpenBLAS: