Help optimizing simple example

Assuming you need to return at least an SVector leaves me with this

using BenchmarkTools
using Random
using Distributions
using LoopVectorization
using Tullio
using StaticArrays

function transf1(x, k)
    transform = fill(NaN, k)
    for j ∈ 1:k
        transform[j] = x^j
    end
    return transform
end

function test1()
    N = 1000
    T = 3
    M = 30
    k = 5
    w = rand(Normal(0,1), N, T)
    x = rand(Normal(0,1), N, M)
    W_wide = fill(NaN, M, N, T, k)
    for m ∈ 1:M
        for i ∈ 1:N
            W_wide[m,i,:,:] = w[i,:]*transpose(transf1(x[i,m], k))
        end
    end
    W = reshape(W_wide, N*T*M, k)
end

function transf2(x, k)
    SVector((x^j for j ∈ 1:k)...)
end

function test2()
    N = 1000
    T = 3
    M = 30
    k = 5
    w = rand(Normal(0,1), N, T)
    x = rand(Normal(0,1), N, M)
    W_wide = Array{Float64}(undef, M, N, T, k)
    for m ∈ 1:M
        for i ∈ 1:N
            v = transf2(x[i, m], k)
            for j ∈ 1:k
                for t ∈ 1:T
                    W_wide[m,i,t,j] = w[i,t]*v[j]
                end
            end
        end
    end
    W = reshape(W_wide, N*T*M, k)
end

function test3()
    N = 1000
    T = 3
    M = 30
    k = 5
    w = rand(Normal(0,1), N, T)
    x = rand(Normal(0,1), N, M)
    W_wide = Array{Float64}(undef, M, N, T, k)
    for m ∈ 1:M
        for i ∈ 1:N
            v = transf2(x[i, m], k)
            @turbo for j ∈ 1:k
                for t ∈ 1:T
                    W_wide[m,i,t,j] = w[i,t]*v[j]
                end
            end
        end
    end
    W = reshape(W_wide, N*T*M, k)
end

function test4()
    N = 1000
    T = 3
    M = 30
    k = 5
    w = rand(Normal(0,1), N, T)
    x = rand(Normal(0,1), N, M)
    W_wide = Array{Float64}(undef, M, N, T, k)
    for m ∈ 1:M
        for i ∈ 1:N
            v = transf2(x[i, m], k)
            @tturbo for j ∈ 1:k
                for t ∈ 1:T
                    W_wide[m,i,t,j] = w[i,t]*v[j]
                end
            end
        end
    end
    W = reshape(W_wide, N*T*M, k)
end

Random.seed!(42)
result1 = test1()
Random.seed!(42)
result2 = test2()
@assert result2 ≈ result1
Random.seed!(42)
result3 = test3()
@assert result3 ≈ result2
Random.seed!(42)
result4 = test4()
@assert result4 ≈ result3

yielding

  9.086 ms (120009 allocations: 15.13 MiB) # baseline
  2.073 ms (8 allocations: 3.69 MiB) # manually optimized
  2.188 ms (8 allocations: 3.69 MiB) # @turbo
  2.810 ms (8 allocations: 3.69 MiB) # @tturbo

Here LoopVectorization seemingly allows only to annotate the inner loops, which doesn’t help anymore. And my Tullio-fu is too weak for this construction.

1 Like