The benchmarks you wrote aren’t measuring at all what you think they are.
Here is a benchmark that measures the runtime performance (and also includes a Tullio.jl + LoopVectorization.jl benchmark for extra context):
using TensorKit
using Tullio, LoopVectorization # Tullio is pretty fast, but if LoopVectorization.jl is also loaded, it'll be even faster.
using BenchmarkTools # this package is essential for producing proper benchmarks
"""
    contract!(C, A, B)

Accumulate the tensor contraction `C[a,c,d] += A[a,b] * B[b,c,d]` in place
and return `C`. Note this *adds* into `C`; zero it first for a plain product.

Throws `DimensionMismatch` if the array sizes are incompatible.
"""
function contract!(C::Array{Float64, 3}, A::Array{Float64, 2}, B::Array{Float64, 3})
    na, nc, nd = size(C)
    nb = size(A, 2)
    # Validate with a real exception: @assert is for internal invariants and
    # may be disabled at higher optimization levels.
    (na == size(A, 1) && nc == size(B, 2) && nd == size(B, 3) && nb == size(B, 1)) ||
        throw(DimensionMismatch("C has size $(size(C)) but A*B is $(size(A, 1))×$(size(B, 2))×$(size(B, 3))"))
    # Julia arrays are column-major, so the innermost loop runs over the
    # first index `a`: then C[a,c,d] and A[a,b] are both stride-1 accesses.
    # The original nest had `b` innermost, making every access strided.
    @inbounds for d in 1:nd, c in 1:nc, b in 1:nb
        Bbcd = B[b, c, d]  # hoist the loop-invariant factor
        @simd for a in 1:na
            C[a, c, d] += A[a, b] * Bbcd
        end
    end
    return C
end
julia> let
# Small cubic problem: all dimensions equal 12.
na = nb = nc = nd = 12
A = rand(na, nb)
B = rand(nb, nc, nd)
C = zeros(na, nc, nd)
# $-interpolation makes BenchmarkTools treat the arrays as locals rather than
# non-const globals, so setup cost and global-access overhead are excluded.
# NOTE(review): contract! accumulates into C (`+=`) across benchmark samples,
# while @tensor/@tullio overwrite (`=`); the timings are comparable but C's
# final contents are not meaningful after @btime repeats the call.
@btime contract!($C, $A, $B)
@btime @tensor $C[a,c,d] = $A[a,b] * $B[b,c,d]
@btime @tullio $C[a,c,d] = $A[a,b] * $B[b,c,d]
end;
19.760 μs (0 allocations: 0 bytes)
3.811 μs (1 allocation: 16 bytes)
2.273 μs (0 allocations: 0 bytes)
Writing manual loops is fast, but for dense linear algebra it can easily leave up to an order of magnitude of performance on the table, compared to smarter algorithms like those handwritten in BLAS libraries or generated by Tullio.jl / LoopVectorization.jl.