I checked the following Python script
import numpy as np
import time

n0 = n1 = n2 = n3 = 30
A = np.random.random((n0, n1, n2, n3))

# time B = 0.1*A + 0.2*A with its first two axes swapped, averaged over n_loop runs
n_loop = 100
start = time.time()
for i in range(n_loop):
    B = np.add(0.1*A, 0.2*np.transpose(A, (1, 0, 2, 3)))
end = time.time()
print((end - start)/n_loop * 1000, " ms")
against Julia’s
using BenchmarkTools, LoopVectorization, Tullio

# naive version: slice assignment inside explicit loops
function test1(A)
    B = similar(A)
    for i in 1:size(A,1), j in 1:size(A,2)
        # slow ordering: slices along the trailing dimensions are strided
        # B[i,j,:,:] = 0.1*A[i,j,:,:] + 0.2*A[i,j,:,:]'
        # fast ordering: A[:,:,i,j] is contiguous in Julia's column-major layout
        B[:,:,i,j] = 0.1*A[:,:,i,j] + 0.2*A[:,:,i,j]'
    end
    B
end

# same loops, but with views and broadcasting to avoid allocating slice copies
function test2(A)
    B = similar(A)
    for i in 1:size(A,1), j in 1:size(A,2)
        # @. @views B[i,j,:,:] = 0.1*A[i,j,:,:] + 0.2*A[i,j,:,:]'
        @. @views B[:,:,i,j] = 0.1*A[:,:,i,j] + 0.2*A[:,:,i,j]'
    end
    B
end

# single Tullio kernel over all four indices
function test3(A)
    # @tullio B[i,j,k,l] := 0.1*A[i,j,k,l] + 0.2*A[i,j,l,k]
    @tullio B[i,j,k,l] := 0.1*A[i,j,k,l] + 0.2*A[j,i,k,l]
    B
end

@btime test1(A) setup=(n=30; A=rand(n,n,n,n))
@btime test2(A) setup=(n=30; A=rand(n,n,n,n))
@btime test3(A) setup=(n=30; A=rand(n,n,n,n))
nothing
Python yields
4.956309795379639 ms
Julia yields
7.886 ms (5403 allocations: 37.81 MiB) # naive
2.253 ms (3 allocations: 6.18 MiB) # naive, @views, broadcasting
1.323 ms (44 allocations: 6.18 MiB) # @tullio
Observations:
- for the naive versions, the ordering of memory access matters: slicing the leading (contiguous) dimensions and looping over the trailing ones, as in the uncommented lines, is clearly faster than the commented-out ordering; the slow ordering is pulled out into its own function after this list for side-by-side timing
- I didn’t manage to test a version with LoopVectorization’s @turbo; a rough, untested sketch of what that could look like is also included after this list
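For the first point, here is the commented-out slow ordering from test1 pulled out into a standalone function, so both memory-access orderings can be timed side by side (the name test1_slow is my own; I have not included its timing above):
# same computation as test1, but slicing the trailing (strided) dimensions
function test1_slow(A)
    B = similar(A)
    for i in 1:size(A,1), j in 1:size(A,2)
        B[i,j,:,:] = 0.1*A[i,j,:,:] + 0.2*A[i,j,:,:]'
    end
    B
end

@btime test1_slow(A) setup=(n=30; A=rand(n,n,n,n))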
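For the second point, a @turbo version might look something like the sketch below; I haven’t actually run it, so the function name test4, the loop order, and whether @turbo copes well with the permuted index A[j,i,k,l] are untested assumptions:
using BenchmarkTools, LoopVectorization  # both already loaded above

# plain nested loops over all four indices, written with i (the contiguous dimension) innermost;
# assumes size(A,1) == size(A,2), as in the benchmark
function test4(A)
    B = similar(A)
    @turbo for l in axes(A,4), k in axes(A,3), j in axes(A,2), i in axes(A,1)
        B[i,j,k,l] = 0.1*A[i,j,k,l] + 0.2*A[j,i,k,l]
    end
    B
end

@btime test4(A) setup=(n=30; A=rand(n,n,n,n))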