Assuming you need to return at least an SVector leaves me with this
using BenchmarkTools
using Random
using Distributions
using LoopVectorization
using Tullio
using StaticArrays
function transf1(x, k)
transform = fill(NaN, k)
for j ∈ 1:k
transform[j] = x^j
end
return transform
end
function test1()
N = 1000
T = 3
M = 30
k = 5
w = rand(Normal(0,1), N, T)
x = rand(Normal(0,1), N, M)
W_wide = fill(NaN, M, N, T, k)
for m ∈ 1:M
for i ∈ 1:N
W_wide[m,i,:,:] = w[i,:]*transpose(transf1(x[i,m], k))
end
end
W = reshape(W_wide, N*T*M, k)
end
function transf2(x, k)
SVector((x^j for j ∈ 1:k)...)
end
function test2()
N = 1000
T = 3
M = 30
k = 5
w = rand(Normal(0,1), N, T)
x = rand(Normal(0,1), N, M)
W_wide = Array{Float64}(undef, M, N, T, k)
for m ∈ 1:M
for i ∈ 1:N
v = transf2(x[i, m], k)
for j ∈ 1:k
for t ∈ 1:T
W_wide[m,i,t,j] = w[i,t]*v[j]
end
end
end
end
W = reshape(W_wide, N*T*M, k)
end
function test3()
N = 1000
T = 3
M = 30
k = 5
w = rand(Normal(0,1), N, T)
x = rand(Normal(0,1), N, M)
W_wide = Array{Float64}(undef, M, N, T, k)
for m ∈ 1:M
for i ∈ 1:N
v = transf2(x[i, m], k)
@turbo for j ∈ 1:k
for t ∈ 1:T
W_wide[m,i,t,j] = w[i,t]*v[j]
end
end
end
end
W = reshape(W_wide, N*T*M, k)
end
function test4()
N = 1000
T = 3
M = 30
k = 5
w = rand(Normal(0,1), N, T)
x = rand(Normal(0,1), N, M)
W_wide = Array{Float64}(undef, M, N, T, k)
for m ∈ 1:M
for i ∈ 1:N
v = transf2(x[i, m], k)
@tturbo for j ∈ 1:k
for t ∈ 1:T
W_wide[m,i,t,j] = w[i,t]*v[j]
end
end
end
end
W = reshape(W_wide, N*T*M, k)
end
Random.seed!(42)
result1 = test1()
Random.seed!(42)
result2 = test2()
@assert result2 ≈ result1
Random.seed!(42)
result3 = test3()
@assert result3 ≈ result2
Random.seed!(42)
result4 = test4()
@assert result4 ≈ result3
yielding
9.086 ms (120009 allocations: 15.13 MiB) # baseline
2.073 ms (8 allocations: 3.69 MiB) # manually optimized
2.188 ms (8 allocations: 3.69 MiB) # @turbo
2.810 ms (8 allocations: 3.69 MiB) # @tturbo
Here LoopVectorization seemingly allows only to annotate the inner loops, which doesn’t help anymore. And my Tullio-fu is too weak for this construction.