I just read through your earlier posts more thoroughly – great work!

If you’re referring to all the `Base.Cartesian.@nexprs 4 u -> ...`

, this is because of instruction latency.

If you’d like some reading material, I found Performance Speed Limits interesting and informative. By unrolling 4 times, eg

```
julia> using MacroTools
julia> prettify(@macroexpand Base.Cartesian.@nexprs 4 u -> begin
xi = SIMD.vload(Vec{W,T}, x, offset)
s_u, ei = two_sum(s_u, xi)
e_u += ei
offset += W
end)
quote
xi = SIMD.vload(Vec{W, T}, x, offset)
(s_1, ei) = two_sum(s_1, xi)
e_1 += ei
offset += W
xi = SIMD.vload(Vec{W, T}, x, offset)
(s_2, ei) = two_sum(s_2, xi)
e_2 += ei
offset += W
xi = SIMD.vload(Vec{W, T}, x, offset)
(s_3, ei) = two_sum(s_3, xi)
e_3 += ei
offset += W
xi = SIMD.vload(Vec{W, T}, x, offset)
(s_4, ei) = two_sum(s_4, xi)
e_4 += ei
offset += W
end
```

We are no longer bottle necked by instruction latency. Agner Fog’s instruction tables say that on Skylake(-X) the vectorized add, sub, mul, and fma instructions each have a latency of 4 clock cycles. These vary a little from one CPU family to another.

The Intel chips and 7 nm Ryzens should also be able to issue 2 instructions per clock cycle.

So that means to maximize performance, we need to have up to 8 instructions going at a time.

A simple example to see this, a rather naive sum function:

```
using VectorizationBase, SIMDPirates
@generated function usum(x::AbstractArray{T}, ::Val{Ushift} = Val(2)) where {T,Ushift}
@assert 0 ≤ Ushift < 6
U = 1 << Ushift
W, shift = VectorizationBase.pick_vector_width_shift(T)
totalshift = shift + Ushift
WU = 1 << totalshift
WT = W * sizeof(T)
q = quote
ptrx = pointer(x)
Base.Cartesian.@nexprs $U u -> begin
s_u = SIMDPirates.vbroadcast(Vec{$W,$T}, zero($T))
end
offset = 0
N = length(x)
for i in 1:(N >> $totalshift)
Base.Cartesian.@nexprs $U u -> begin
s_u = SIMDPirates.vadd(s_u, SIMDPirates.vload(Vec{$W,$T}, ptrx + offset))
offset += $WT
end
end
rem = N & $(WU - 1)
for i in 1:(rem >> $shift)
s_1 = SIMDPirates.vadd(s_1, SIMDPirates.vload(Vec{$W,$T}, ptrx + offset))
offset += $WT
end
rem &= $(W-1)
if rem > 0
s_1 = SIMDPirates.vadd(s_1, SIMDPirates.vload(Vec{$W,$T}, ptrx + offset), VectorizationBase.mask(Val{$W}(), rem))
end
end
Uh = U
while Uh > 1
Uh >>= 1
push!(
q.args,
quote
Base.Cartesian.@nexprs $Uh u -> begin
s_u = SIMDPirates.vadd(s_u, s_{u + $Uh})
end
end
)
end
push!(q.args, :(SIMDPirates.vsum(s_1)))
q
end
```

I wanted the unroll factor to be a power of 2; so the function takes a `Val`

argument `Ushift`

, and we unroll by `2^Ushift = 1<<Ushift`

(on top of the vectorization unrolling).

We need large vectors to see benefit. Picking the largest vector that still fits in the L1 cache:

```
julia> first(VectorizationBase.CACHE_SIZE) ÷ sizeof(Float64)
4096
julia> x = rand(ans);
```

and running benchmarks:

```
julia> using BenchmarkTools
julia> @btime sum($x)
296.868 ns (0 allocations: 0 bytes)
2036.5329867003175
julia> @btime usum($x, Val(0)) # unroll = 1 (no unrolling)
471.189 ns (0 allocations: 0 bytes)
2036.5329867003172
julia> @btime usum($x, Val(1)) # unroll = 2^1 = 2
249.814 ns (0 allocations: 0 bytes)
2036.532986700318
julia> @btime usum($x, Val(2)) # unroll = 2^2 = 4
132.902 ns (0 allocations: 0 bytes)
2036.532986700318
julia> @btime usum($x, Val(3)) # unroll = 2^3 = 8
84.432 ns (0 allocations: 0 bytes)
2036.532986700318
julia> @btime usum($x, Val(4)) # unroll = 2^4 = 16
89.227 ns (0 allocations: 0 bytes)
2036.532986700318
julia> @btime usum($x, Val(5)) # unroll = 2^5 = 32
108.952 ns (0 allocations: 0 bytes)
2036.532986700318
```

We can see that performance does indeed peak when we unroll by 8, which breaks up the dependency chain enough so that 8 `vadd`

instructions are running at a time.

The difference is much less for smaller vectors. And if we’re summing less than 64 elements, unrolling by 8 means we’re actually not unrolling at all.

I actually realize now that I was too aggressive with the choice of 4. Our functions have a lot more going on than just repetitive adds, giving the CPU more to do without the need to break up dependency chains. Even just focusing on floating point arithmetic, your `two_sum`

function can take advantage of order execution:

```
# D. E. Knuth, The Art of Computer Programming: Seminumerical Algorithms, 1969.
function two_sum(a::T, b::T) where T <: Union{Real, Vec}
x = a + b
z = x - a
e = (a - (x-z)) + (b-z)
(x, e)
end
```

`x`

and `z`

, and then `x-z`

and `b-z`

can each be calculated at the same time, for example.

As always, make hypothesis to try and understand what’s going on, and then test them. I modified your improvements to `sum_kahan`

so that it will take arbitrary unrolls, like the simple sum function from earlier:

```
using VectorizationBase, SIMDPirates
using SIMDPirates: evadd, evsub
function two_sum(a::T, b::T) where T <: NTuple
x = evadd(a, b)
z = evsub(x, a)
e = evadd(evsub(a, evsub(x, z)),
evsub(b, z))
(x, e)
end
function two_sum(a::T, b::T) where T <: Real
x = a + b
z = x - a
e = (a - (x-z)) + (b-z)
(x, e)
end
@generated function sum_kahan(x::AbstractArray{T}, ::Val{Ushift} = Val{1}()) where {T <: Union{Float32,Float64}, Ushift}
@assert 0 ≤ Ushift < 6
U = 1 << Ushift
W, shift = VectorizationBase.pick_vector_width_shift(T)
sizeT = sizeof(T)
WT = W * sizeT
WU = W*U
WTU = WT*U
V = SIMDPirates.Vec{W,T}
q = quote
px = pointer(x)
N = length(x)
Base.Cartesian.@nexprs $U u -> begin
s_u = SIMDPirates.vbroadcast($V, zero($T))
c_u = SIMDPirates.vbroadcast($V, zero($T))
end
Nshift = N >> $(shift + Ushift)
offset = 0
for n in 1:Nshift
Base.Cartesian.@nexprs $U u -> begin
x_u = SIMDPirates.vload($V, px + offset)
t_u = SIMDPirates.evadd(s_u, x_u)
m_u = SIMDPirates.vless(SIMDPirates.vabs(x_u), SIMDPirates.vabs(s_u))
subfrom_u = SIMDPirates.vifelse(m_u, s_u, x_u)
addto_u = SIMDPirates.vifelse(m_u, x_u, s_u)
c_u = SIMDPirates.evsub(c_u, SIMDPirates.evadd(SIMDPirates.evsub(subfrom_u, t_u), addto_u))
s_u = t_u
offset += $WT
end
end
rem = N & $(WU-1)
for n in 1:(rem >> $shift)
x_1 = SIMDPirates.vload($V, px + offset)
t_1 = SIMDPirates.evadd(s_1, x_1)
m_1 = SIMDPirates.vless(SIMDPirates.vabs(x_1), SIMDPirates.vabs(s_1))
subfrom_1 = SIMDPirates.vifelse(m_1, s_1, x_1)
addto_1 = SIMDPirates.vifelse(m_1, x_1, s_1)
c_1 = SIMDPirates.evsub(c_1, SIMDPirates.evadd(SIMDPirates.evsub(subfrom_1, t_1), addto_1))
s_1 = t_1
offset += $WT
end
rem &= $(W-1)
if rem > 0
mask = VectorizationBase.mask(Val{$W}(), rem)
x_2 = SIMDPirates.vload($V, px + offset, mask)
t_2 = SIMDPirates.evadd(s_2, x_2)
m_2 = SIMDPirates.vless(SIMDPirates.vabs(x_2), SIMDPirates.vabs(s_2))
subfrom_2 = SIMDPirates.vifelse(m_2, s_2, x_2)
addto_2 = SIMDPirates.vifelse(m_2, x_2, s_2)
c_2 = SIMDPirates.evsub(c_2, SIMDPirates.evadd(SIMDPirates.evsub(subfrom_2, t_2), addto_2))
s_2 = t_2
end
es = SIMDPirates.vbroadcast($V, zero($T))
end
Uh = U
while Uh > 1
Uh >>= 1
qtemp = quote
Base.Cartesian.@nexprs $Uh u -> begin
c_u = SIMDPirates.evadd(c_u, c_{u+$Uh})
(s_u, e) = two_sum(s_u, s_{u+$Uh})
es = SIMDPirates.evadd(es, e)
end
end
push!(q.args, qtemp)
end
qscalarreduce = quote
s = zero(T)
e = SIMDPirates.vsum(es) - SIMDPirates.vsum(c_1)
for si in s_1
s, ei = two_sum(s, si.value)
e += ei
end
s + e
end
push!(q.args, qscalarreduce)
q
end
```

Now, we see that `2^1 = 2`

, and not 4 as I used, leads to the best performance:

```
julia> y = 10^14 .* sin.(LinRange(0, 2π, 4096));
julia> @btime sum($y)
282.450 ns (0 allocations: 0 bytes)
16.0
julia> @btime sum(big, $y)
375.410 μs (16382 allocations: 895.89 KiB)
-0.21548719867825956442164425652663339860737323760986328125
julia> @btime sum_kbn($y)
5.098 μs (0 allocations: 0 bytes)
-0.21548719867826094
julia> @btime sum_kahan($y, Val(0))
738.659 ns (0 allocations: 0 bytes)
-0.21548719867825916
julia> @btime sum_kahan($y, Val(1))
607.716 ns (0 allocations: 0 bytes)
-0.21548719867825916
julia> @btime sum_kahan($y, Val(2))
623.291 ns (0 allocations: 0 bytes)
-0.21548719867825916
julia> @btime sum_kahan($y, Val(3))
631.864 ns (0 allocations: 0 bytes)
-0.2154871986782596
```

I’d guess that `sum_oro_vec`

will also see slightly better performance with 2x instead of 4x.