Thanks a lot for taking the time. I’ve adapted it a little to keep the same interface I had, and here are the results on my machine (with reworked code so that we can start from the same base):
using StaticArrays
using LoopVectorization
using BenchmarkTools
M = 10      # number of moments to compute
N = 100000  # sample size
D = exp.(randn(N))  # log-normally distributed samples
rez = MVector{M,Float64}(zeros(M))  # preallocated output, one slot per moment
"""
    v1(rez, D)

Naive baseline: for each `k in 1:length(rez)`, set
`rez[k] = mean(exp.(-D) .* D.^(k-1))`. Allocates fresh temporaries on
every call; kept as the reference implementation. Returns `nothing`.
"""
function v1(rez,D)
    n = length(D)
    for k in eachindex(rez)
        # Same pairwise sum over the same broadcast arrays as the original.
        rez[k] = sum(exp.(-D) .* D .^ (k - 1)) / n
    end
    return nothing
end
v1(rez,D)              # compute the reference values once
truth = deepcopy(rez)  # saved to validate every optimized version against
@benchmark v1($rez,$D)
# BenchmarkTools.Trial: 186 samples with 1 evaluation.
# Range (min … max): 22.882 ms … 40.312 ms ┊ GC (min … max): 0.00% … 0.00%
# Time (median): 26.545 ms ┊ GC (median): 0.00%
# Time (mean ± σ): 26.874 ms ± 2.500 ms ┊ GC (mean ± σ): 1.35% ± 2.84%
# ▂ ▁▂▄ ▁ ▁ ██▇ ▂▂ ▅▂▅▅▂ ▁ ▂
# ▃▆▃█▆████████████████▆█████▅▅█▅▆▅▃▅█▆▁▁▅▁▃▁▁▃▃▅▁▁▁▃▁▁▁▃▁▁▁▃ ▃
# 22.9 ms Histogram: frequency by time 34.5 ms <
# Memory estimate: 7.63 MiB, allocs estimate: 21.
# Fill `slack[i] = exp(-D[i])` for i in 1:N and return the mean of those values.
# The store and the reduction are kept in one @tturbo loop on purpose; the exact
# statement shape matters for LoopVectorization's analysis, so do not restructure.
function exp_and_mean!(slack,D,N)
    zz = zero(eltype(D))  # type-stable accumulator
    @tturbo for i in 1:N
        slack[i] = exp(-D[i])
        zz += slack[i]
    end
    zz /= N               # turn the sum into a mean
    return zz
end
# Multiply `slack` elementwise by `D` in place (raising the implicit power of D
# by one) and return the mean of the updated values. Same single-@tturbo-loop
# structure as `exp_and_mean!`; keep the statement order intact.
function prod_and_mean!(slack,D,N)
    zz = zero(eltype(D))  # type-stable accumulator
    @tturbo for i in 1:N
        slack[i] *= D[i]
        zz += slack[i]
    end
    zz /= N               # turn the sum into a mean
    return zz
end
"""
    original!(rez, slack, D)

Compute `rez[k] = mean(exp.(-D) .* D.^(k-1))` for `k in 1:length(rez)`,
reusing `slack` as a scratch buffer so each moment costs one extra pass
over `D`. Returns `rez`.
"""
function original!(rez,slack,D)
    n = length(D)
    # k = 1 seeds slack with exp.(-D); every later k multiplies in one more D.
    rez[1] = exp_and_mean!(slack, D, n)
    for k in 2:length(rez)
        rez[k] = prod_and_mean!(slack, D, n)
    end
    return rez
end
slack = similar(D)   # scratch buffer reused across all moments
original!(rez,slack,D)
@assert rez ≈ truth  # matches the naive baseline
@benchmark original!($rez,$slack,$D)
# BenchmarkTools.Trial: 10000 samples with 1 evaluation.
# Range (min … max): 104.800 μs … 805.000 μs ┊ GC (min … max): 0.00% … 0.00%
# Time (median): 132.900 μs ┊ GC (median): 0.00%
# Time (mean ± σ): 160.863 μs ± 50.199 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
# ▂█▆▄ ▁
# ▃▇█████▄▃▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▃▂▃▃▄▄▃▄▄▇▆▅█▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▂
# 105 μs Histogram: frequency by time 287 μs <
# Memory estimate: 0 bytes, allocs estimate: 0.
"""
    v2_only_when_M_is_10(rez, D)

Hand-unrolled version of `v1` for exactly 10 moments: a single pass over
`D` accumulates all ten sums, updating the running product
`eᵢ = exp(-Dᵢ) * Dᵢ^(k-1)` between accumulations. Writes the means into
`rez` and returns `nothing`. Throws `ArgumentError` if `length(rez) != 10`.
"""
function v2_only_when_M_is_10(rez,D)
    M = length(rez)
    # Input validation: @assert may be compiled out at higher optimization
    # levels, so use an explicit check instead.
    M == 10 || throw(ArgumentError("rez must have length 10, got $M"))
    N = length(D)
    r1 = r2 = r3 = r4 = r5 = r6 = r7 = r8 = r9 = r10 = 0.0
    @tturbo for i in 1:N
        Dᵢ = D[i]
        eᵢ = exp(-Dᵢ)
        r1 += eᵢ
        eᵢ *= Dᵢ
        r2 += eᵢ
        eᵢ *= Dᵢ
        r3 += eᵢ
        eᵢ *= Dᵢ
        r4 += eᵢ
        eᵢ *= Dᵢ
        r5 += eᵢ
        eᵢ *= Dᵢ
        r6 += eᵢ
        eᵢ *= Dᵢ
        r7 += eᵢ
        eᵢ *= Dᵢ
        r8 += eᵢ
        eᵢ *= Dᵢ
        r9 += eᵢ
        eᵢ *= Dᵢ
        r10+= eᵢ
    end
    # Broadcast from a tuple instead of a temporary Vector: this removes the
    # one heap allocation (160 bytes) the benchmark reported.
    rez .= (r1,r2,r3,r4,r5,r6,r7,r8,r9,r10) ./ N
    return nothing
end
v2_only_when_M_is_10(rez,D)
@assert rez ≈ truth  # unrolled version matches the baseline
@benchmark v2_only_when_M_is_10($rez,$D)
# BenchmarkTools.Trial: 10000 samples with 1 evaluation.
# Range (min … max): 53.300 μs … 613.700 μs ┊ GC (min … max): 0.00% … 0.00%
# Time (median): 62.900 μs ┊ GC (median): 0.00%
# Time (mean ± σ): 71.640 μs ± 16.417 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
# ▃ ▃ ▂▃▇▄█▂▅▁▃▂▄▁▁ ▁ ▁ ▁ ▁ ▄▂ ▂ ▃▃ ▅▄ ▃▄ ▂▃ ▂▃ ▄▂ ▁▁ ▂
# ███▅█████████████▇██████▇█████▇▇██▅█████████████▇▅██▇▇██▆▇██ █
# 53.3 μs Histogram: log(frequency) by time 102 μs <
# Memory estimate: 160 bytes, allocs estimate: 1.
const C = Base.Cartesian  # shorthand for the @nexprs compile-time unrolling macro
"""
    v2_ffevote(rez::MVector{M,T}, D)

`@generated` equivalent of the hand-unrolled version: `Base.Cartesian.@nexprs`
expands the per-moment accumulators and updates for any `M` at compile time.
Writes the means into `rez`; returns `nothing`.
"""
@generated function v2_ffevote(rez::MVector{M,T},D) where {M,T}
    quote
        N = length(D)
        # Accumulators typed from the output element type instead of a
        # hard-coded Float64 `0.0`, so non-Float64 outputs stay type-stable.
        C.@nexprs $M k -> r_k = zero($T)
        for i in eachindex(D)
            Di = D[i]
            ei = exp(-Di)
            C.@nexprs $M k -> begin
                r_k += ei
                # The expansion leaves one trailing `ei *= Di` after the last
                # accumulate — a harmless dead store.
                ei *= Di
            end
        end
        C.@nexprs $M k -> rez[k] = r_k / N
        return nothing
    end
end
v2_ffevote(rez,D)
@assert rez ≈ truth  # generated version matches the baseline
@benchmark v2_ffevote($rez,$D)
# BenchmarkTools.Trial: 5767 samples with 1 evaluation.
# Range (min … max): 790.100 μs … 2.033 ms ┊ GC (min … max): 0.00% … 0.00%
# Time (median): 830.700 μs ┊ GC (median): 0.00%
# Time (mean ± σ): 862.798 μs ± 97.176 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
# ▂ █▃▆▄▅▄▄▃▃▃▂▂▁▁▁▁▁▁▁▁▁▁ ▁
# █▇█████████████████████████████▇▇▆▇▇▆▆▆▅▆▆▆▅▆▅▅▄▅▃▁▄▅▅▅▄▁▁▃▅ █
# 790 μs Histogram: log(frequency) by time 1.31 ms <
# Memory estimate: 0 bytes, allocs estimate: 0.
# Same @generated unrolling as `v2_ffevote`, but with @tturbo on the loop.
# NOTE(review): the expansion order of @tturbo vs the inner @nexprs is exactly
# what the surrounding discussion is about — presumably @tturbo analyzes the
# loop before/after @nexprs expands, which could explain the timing gap vs the
# hand-unrolled version; confirm with @macroexpand before restructuring.
@generated function v2_ffevote_turbo(rez::MVector{M,T},D) where {M,T}
    quote
        N = length(D)
        C.@nexprs $M k -> r_k = 0.0
        @tturbo for i in 1:length(D)
            Di = D[i]
            ei = exp(-Di)
            C.@nexprs $M k -> begin
                r_k += ei
                ei *= Di
            end
        end
        C.@nexprs $M k -> rez[k] = r_k / N
        return nothing
    end
end
v2_ffevote_turbo(rez,D)
@assert rez ≈ truth  # turbo generated version matches the baseline
@benchmark v2_ffevote_turbo($rez,$D)
# Range (min … max): 65.500 μs … 844.100 μs ┊ GC (min … max): 0.00% … 0.00%
# Time (median): 119.700 μs ┊ GC (median): 0.00%
# Time (mean ± σ): 114.916 μs ± 25.891 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
# ▄▂▃▁▄▁▄▅▂▃▁ ▂▃ ▃ ▄ ▅▅▆▆▂▇▃▅█▃▃▃▁ ▂
# █████████████▇▇▇██▇█████████████████████▇▇▆▇▆▇▇▇▇▅▇▆▅▅▇▆▄▅▄▄▅ █
# 65.5 μs Histogram: log(frequency) by time 182 μs <
# Memory estimate: 0 bytes, allocs estimate: 0.
In summary, although the @code_lowered output looks like what we wanted it to produce, it interferes a little with @tturbo and takes about 50% more time (median and mean cases).
If there is indeed a way to control the order of macro expansion, and if it is not too painful, maybe there is still something to do here… However, taking a look at the code_lowered myself makes me say they are the same, which is clearly not true: I’m missing the difference…
Edit: or maybe they are the same — the benchmark timings change a lot from one run to the next… Is there a way to check whether two functions have the same assembly code?
Looks like the generated code in the middle of the loop:
C.@nexprs $M k -> begin
r_k += ei
ei *= Di
end
does one `ei *= Di` too many at the end (the loop should end with `r_k += ei`). But this is me being silly…