I recently encountered a similar problem. I need to multiply three large matrices X * Y * Z
. It looks like Tullio
and Octavian
have the best performances, although Octavian
takes a much longer time to precompile.
using BenchmarkTools
# generate data
const N = 500
const J = 10
const K = 8
const M = 200_000
A = rand(N, J)
B = rand(J, K)
C = rand(K, M)
# 1. naive code, slow
m1_naive(X, Y, Z) = X * Y * Z
# 2. still slow
function m2(X, Y, Z)
res = Matrix{Float64}(undef, N, M)
res = (X * Y) * Z
return res
end
# 3. preallocate, use mul! twice
using LinearAlgebra
function m3_mul(X, Y, Z)
res = Matrix{Float64}(undef, N, M)
XY = Matrix{Float64}(undef, N, K)
XY = mul!(XY, X, Y)
res = mul!(res, XY, Z)
return res
end
# 4. Tullio
using Tullio
m4_tullio(X, Y, Z) = @tullio res[n, m] := X[n, j] * Y[j, k] * Z[k, m]
# 5. using Octavian
using Octavian
function m5_octavian(X, Y, Z)
res = Matrix{Float64}(undef, N, M)
XY = Matrix{Float64}(undef, N, K)
XY = matmul!(XY, X, Y)
res = matmul!(res, XY, Z)
return res
end
julia> @time m1_naive(A, B, C);
1.756877 seconds (1.35 M allocations: 854.938 MiB, 3.84% gc time, 51.98% compilation time)
julia> @time m2(A, B, C);
0.833115 seconds (1.35 k allocations: 763.058 MiB, 1.19% gc time, 0.45% compilation time)
julia> @time m3_mul(A, B, C);
0.876475 seconds (2.22 k allocations: 763.112 MiB, 7.13% gc time, 0.60% compilation time)
julia> @time m4_tullio(A, B, C);
1.453623 seconds (394.21 k allocations: 788.928 MiB, 4.07% gc time, 177.89% compilation time)
julia> @time m5_octavian(A, B, C);
10.832562 seconds (8.95 M allocations: 1.271 GiB, 3.59% gc time, 95.59% compilation time)
julia> @btime m1_naive($A, $B, $C);
803.073 ms (4 allocations: 762.97 MiB)
julia> @btime m2($A, $B, $C);
799.319 ms (4 allocations: 762.97 MiB)
julia> @btime m3_mul($A, $B, $C);
805.715 ms (4 allocations: 762.97 MiB)
julia> @btime m4_tullio($A, $B, $C);
644.766 ms (316 allocations: 762.96 MiB)
julia> @btime m5_octavian($A, $B, $C);
563.200 ms (4 allocations: 762.97 MiB)