Is there any way to optimize array additions and multiplications with transposes?

I checked the following Python script

import numpy as np
import time

n0=n1=n2=n3=30
A = np.random.random((n0,n1,n2,n3))

n_loop = 100

start = time.time()
for i in range(n_loop):
    B = np.add( 0.1*A, 0.2*np.transpose(A,(1,0,2,3)) )
end = time.time()
print((end - start)/n_loop * 1000, " ms")

against Julia’s

using BenchmarkTools, LoopVectorization, Tullio

function test1(A)
    B = similar(A)
    for i in 1:size(A,1), j in 1:size(A,2)
        # B[i,j,:,:] = 0.1*A[i,j,:,:] + 0.2*A[i,j,:,:]'
        B[:,:,i,j] = 0.1*A[:,:,i,j] + 0.2*A[:,:,i,j]'
    end
    B
end    

function test2(A)
    B = similar(A)
    for i in 1:size(A,1), j in 1:size(A,2)
        # @. @views B[i,j,:,:] = 0.1*A[i,j,:,:] + 0.2*A[i,j,:,:]'
        @. @views B[:,:,i,j] = 0.1*A[:,:,i,j] + 0.2*A[:,:,i,j]'
    end
    B
end    

function test3(A)
    # @tullio B[i,j,k,l] := 0.1*A[i,j,k,l] + 0.2*A[i,j,l,k]
    @tullio B[i,j,k,l] := 0.1*A[i,j,k,l] + 0.2*A[j,i,k,l]
    B
end    

@btime test1(A) setup=(n=30; A=rand(n,n,n,n))
@btime test2(A) setup=(n=30; A=rand(n,n,n,n))
@btime test3(A) setup=(n=30; A=rand(n,n,n,n))
nothing

Python yields

4.956309795379639  ms

Julia yields

  7.886 ms (5403 allocations: 37.81 MiB) # naive
  2.253 ms (3 allocations: 6.18 MiB) # naive, @views, broadcasting
  1.323 ms (44 allocations: 6.18 MiB) # @tullio

Observations:

  • for the naive versions ordering of memory access is relevant
  • I didn’t manage to test a version with LoopVectorization’s @turbo