A macro to unroll by hand but not by hand?

Thanks a lot for taking the time. I’ve adapted a little to keep the same interface I had and here are the results on my machine(with reworked code so that we can start on the same base):

using StaticArrays
using LoopVectorization
using BenchmarkTools


M = 10
N = 100000
D = exp.(randn(N))
rez = MVector{M,Float64}(zeros(M))

function v1(rez,D)
    rez .= [sum(exp.(.-D) .* D.^(k-1))/length(D) for k in 1:length(rez)]
    return nothing
end

v1(rez,D)
truth = deepcopy(rez)
@benchmark v1($rez,$D)
# BenchmarkTools.Trial: 186 samples with 1 evaluation.
#  Range (min … max):  22.882 ms … 40.312 ms  ┊ GC (min … max): 0.00% … 0.00%
#  Time  (median):     26.545 ms              ┊ GC (median):    0.00%
#  Time  (mean ± σ):   26.874 ms ±  2.500 ms  ┊ GC (mean ± σ):  1.35% ± 2.84%

#      ▂ ▁▂▄  ▁  ▁ ██▇ ▂▂ ▅▂▅▅▂  ▁     ▂
#   ▃▆▃█▆████████████████▆█████▅▅█▅▆▅▃▅█▆▁▁▅▁▃▁▁▃▃▅▁▁▁▃▁▁▁▃▁▁▁▃ ▃
#   22.9 ms         Histogram: frequency by time        34.5 ms <

#  Memory estimate: 7.63 MiB, allocs estimate: 21.


function exp_and_mean!(slack,D,N)
    zz = zero(eltype(D))
    @tturbo for i in 1:N
        slack[i] = exp(-D[i])
        zz += slack[i]
    end
    zz /= N
    return zz
end
function prod_and_mean!(slack,D,N)
    zz = zero(eltype(D))
    @tturbo for i in 1:N
        slack[i] *= D[i]
        zz += slack[i]
    end
    zz /= N
    return zz
end
function original!(rez,slack,D)
    M = length(rez)
    N = length(D)
    rez[1] = exp_and_mean!(slack,D,N)
    for k in 2:M
        rez[k] = prod_and_mean!(slack,D,N)
    end
    return rez
end
slack = similar(D)
original!(rez,slack,D)
@assert rez ≈ truth
@benchmark original!($rez,$slack,$D)
# BenchmarkTools.Trial: 10000 samples with 1 evaluation.
#  Range (min … max):  104.800 μs … 805.000 μs  ┊ GC (min … max): 0.00% … 0.00%
#  Time  (median):     132.900 μs               ┊ GC (median):    0.00%
#  Time  (mean ± σ):   160.863 μs ±  50.199 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

#     ▂█▆▄                                ▁
#   ▃▇█████▄▃▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▃▂▃▃▄▄▃▄▄▇▆▅█▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▂
#   105 μs           Histogram: frequency by time          287 μs <

#  Memory estimate: 0 bytes, allocs estimate: 0.


function v2_only_when_M_is_10(rez,D)
    M = length(rez)
    @assert M == 10
    N = length(D)
    r1 = r2 = r3 = r4 = r5 = r6 = r7 = r8 = r9 = r10 = 0.0
    @tturbo for i in 1:N
        Dᵢ = D[i]
        eᵢ = exp(-Dᵢ)
        r1 += eᵢ
        eᵢ *= Dᵢ
        r2 += eᵢ
        eᵢ *= Dᵢ
        r3 += eᵢ
        eᵢ *= Dᵢ
        r4 += eᵢ
        eᵢ *= Dᵢ
        r5 += eᵢ
        eᵢ *= Dᵢ
        r6 += eᵢ
        eᵢ *= Dᵢ
        r7 += eᵢ
        eᵢ *= Dᵢ
        r8 += eᵢ
        eᵢ *= Dᵢ
        r9 += eᵢ
        eᵢ *= Dᵢ
        r10+= eᵢ
    end
    rez .= [r1,r2,r3,r4,r5,r6,r7,r8,r9,r10] ./ N
    return nothing
end
v2_only_when_M_is_10(rez,D)
@assert rez ≈ truth
@benchmark v2_only_when_M_is_10($rez,$D)
# BenchmarkTools.Trial: 10000 samples with 1 evaluation.
# Range (min … max):  53.300 μs … 613.700 μs  ┊ GC (min … max): 0.00% … 0.00%
# Time  (median):     62.900 μs               ┊ GC (median):    0.00%
# Time  (mean ± σ):   71.640 μs ±  16.417 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

#  ▃ ▃ ▂▃▇▄█▂▅▁▃▂▄▁▁ ▁ ▁    ▁ ▁    ▄▂ ▂ ▃▃ ▅▄ ▃▄ ▂▃  ▂▃  ▄▂  ▁▁ ▂
#  ███▅█████████████▇██████▇█████▇▇██▅█████████████▇▅██▇▇██▆▇██ █
#  53.3 μs       Histogram: log(frequency) by time       102 μs <

# Memory estimate: 160 bytes, allocs estimate: 1.

const C = Base.Cartesian
@generated function v2_ffevote(rez::MVector{M,T},D) where {M,T}
    quote
        N = length(D)
        C.@nexprs $M k -> r_k = 0.0
        for i in 1:length(D)
            Di = D[i]
            ei = exp(-Di)
            C.@nexprs $M k -> begin
                r_k += ei
                ei *= Di
            end
        end
        C.@nexprs $M k -> rez[k] = r_k / N
        return nothing
    end
end
v2_ffevote(rez,D)
@assert rez ≈ truth
@benchmark v2_ffevote($rez,$D)
# BenchmarkTools.Trial: 5767 samples with 1 evaluation.
#  Range (min … max):  790.100 μs …  2.033 ms  ┊ GC (min … max): 0.00% … 0.00%
#  Time  (median):     830.700 μs              ┊ GC (median):    0.00%
#  Time  (mean ± σ):   862.798 μs ± 97.176 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

#   ▂ █▃▆▄▅▄▄▃▃▃▂▂▁▁▁▁▁▁▁▁▁▁                                     ▁
#   █▇█████████████████████████████▇▇▆▇▇▆▆▆▅▆▆▆▅▆▅▅▄▅▃▁▄▅▅▅▄▁▁▃▅ █
#   790 μs        Histogram: log(frequency) by time      1.31 ms <

#  Memory estimate: 0 bytes, allocs estimate: 0.


@generated function v2_ffevote_turbo(rez::MVector{M,T},D) where {M,T}
    quote
        N = length(D)
        C.@nexprs $M k -> r_k = 0.0
        @tturbo for i in 1:length(D)
            Di = D[i]
            ei = exp(-Di)
            C.@nexprs $M k -> begin
                r_k += ei
                ei *= Di
            end
        end
        C.@nexprs $M k -> rez[k] = r_k / N
        return nothing
    end
end
v2_ffevote_turbo(rez,D)
@assert rez ≈ truth
@benchmark v2_ffevote_turbo($rez,$D)
# Range (min … max):   65.500 μs … 844.100 μs  ┊ GC (min … max): 0.00% … 0.00%
# Time  (median):     119.700 μs               ┊ GC (median):    0.00%
# Time  (mean ± σ):   114.916 μs ±  25.891 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

#  ▄▂▃▁▄▁▄▅▂▃▁     ▂▃ ▃ ▄   ▅▅▆▆▂▇▃▅█▃▃▃▁                        ▂
#  █████████████▇▇▇██▇█████████████████████▇▇▆▇▆▇▇▇▇▅▇▆▅▅▇▆▄▅▄▄▅ █
#  65.5 μs       Histogram: log(frequency) by time        182 μs <

# Memory estimate: 0 bytes, allocs estimate: 0.

In summary, although the @code_lowered looks like it made what we wanted it to make, it interferes a little with @tturbo and takes about 50% more time (median and mean cases).

If there is indeed a way to control the order of macros, and if it is not too painfull, maybe there is still something to do here… However taking a look at the code_lowered myself makes me say they are the same, which is clearly not true: I’m missing the difference…


Edit: or maybe there are the same, the benchmark timings are changing a lot from a run to the other… Is there a way to check if two functions have the same assembly code ?


Looks like the generation in the middle of the loop :

C.@nexprs $M k -> begin
    r_k += ei
    ei *= Di
end

Does one to much ei *= Di at the end (the loop should end with a ri += ei. But this is me being silly…