Nice exercise for a benchmark. The MWE
using BenchmarkTools
using Random
using Distributions
using LoopVectorization
using Tullio
function transf(x, k)
transform = fill(NaN, k)
for j β 1:k
transform[j] = x^j
end
return transform
end
function test1()
N = 1000
T = 3
M = 30
k = 5
w = rand(Normal(0,1), N, T)
x = rand(Normal(0,1), N, M)
W_wide = fill(NaN, M, N, T, k)
for m β 1:M
for i β 1:N
W_wide[m,i,:,:] = w[i,:]*transpose(transf(x[i,m], k))
end
end
W = reshape(W_wide, N*T*M, k)
end
function test2()
N = 1000
T = 3
M = 30
k = 5
w = rand(Normal(0,1), N, T)
x = rand(Normal(0,1), N, M)
W_wide = Array{Float64}(undef, M, N, T, k)
for m β 1:M
for i β 1:N
xj = x1 = x[i,m]
for j β 1:k
for t β 1:T
W_wide[m,i,t,j] = w[i,t]*xj
end
xj *= x1
end
end
end
W = reshape(W_wide, N*T*M, k)
end
function test3()
N = 1000
T = 3
M = 30
k = 5
w = rand(Normal(0,1), N, T)
x = rand(Normal(0,1), N, M)
W_wide = Array{Float64}(undef, M, N, T, k)
@turbo for m β 1:M
for i β 1:N
for j β 1:k
for t β 1:T
W_wide[m,i,t,j] = w[i,t]*x[i,m]^j
end
end
end
end
W = reshape(W_wide, N*T*M, k)
end
function test4()
N = 1000
T = 3
M = 30
k = 5
w = rand(Normal(0,1), N, T)
x = rand(Normal(0,1), N, M)
@tullio W_wide[m,i,t,j] := w[i,t]*x[i,m]^j (j β 1:k)
W = reshape(W_wide, N*T*M, k)
end
function test5()
N = 1000
T = 3
M = 30
k = 5
w = rand(Normal(0,1), N, T)
x = rand(Normal(0,1), N, M)
W_wide = Array{Float64}(undef, M, N, T, k)
@tturbo for m β 1:M
for i β 1:N
for j β 1:k
for t β 1:T
W_wide[m,i,t,j] = w[i,t]*x[i,m]^j
end
end
end
end
W = reshape(W_wide, N*T*M, k)
end
Random.seed!(42)
result1 = test1()
Random.seed!(42)
result2 = test2()
@assert result2 β result1
Random.seed!(42)
result3 = test3()
@assert result3 β result2
Random.seed!(42)
result4 = test4()
@assert result4 β result3
Random.seed!(42)
result5 = test5()
@assert result5 β result4
@btime test1() setup=(Random.seed!(42))
@btime test2() setup=(Random.seed!(42))
@btime test3() setup=(Random.seed!(42))
@btime test4() setup=(Random.seed!(42))
@btime test5() setup=(Random.seed!(42))
yields
9.229 ms (120009 allocations: 15.13 MiB) # baseline
1.883 ms (8 allocations: 3.69 MiB) # manually optimized
856.100 ΞΌs (8 allocations: 3.69 MiB) # @turbo
696.600 ΞΌs (60 allocations: 3.69 MiB) # @tullio
465.600 ΞΌs (8 allocations: 3.69 MiB) # @tturbo
Using multi threading might be slight overkill here (as Iβm throwing 5 cores on the problem)β¦
Edit: I didnβt sort out if there is a better memory access order.