I just read through your earlier posts more thoroughly – great work!
If you’re referring to all the Base.Cartesian.@nexprs 4 u -> ...
, this is because of instruction latency.
If you’d like some reading material, I found Performance Speed Limits interesting and informative. By unrolling 4 times, eg
julia> using MacroTools
julia> prettify(@macroexpand Base.Cartesian.@nexprs 4 u -> begin
xi = SIMD.vload(Vec{W,T}, x, offset)
s_u, ei = two_sum(s_u, xi)
e_u += ei
offset += W
end)
quote
xi = SIMD.vload(Vec{W, T}, x, offset)
(s_1, ei) = two_sum(s_1, xi)
e_1 += ei
offset += W
xi = SIMD.vload(Vec{W, T}, x, offset)
(s_2, ei) = two_sum(s_2, xi)
e_2 += ei
offset += W
xi = SIMD.vload(Vec{W, T}, x, offset)
(s_3, ei) = two_sum(s_3, xi)
e_3 += ei
offset += W
xi = SIMD.vload(Vec{W, T}, x, offset)
(s_4, ei) = two_sum(s_4, xi)
e_4 += ei
offset += W
end
We are no longer bottle necked by instruction latency. Agner Fog’s instruction tables say that on Skylake(-X) the vectorized add, sub, mul, and fma instructions each have a latency of 4 clock cycles. These vary a little from one CPU family to another.
The Intel chips and 7 nm Ryzens should also be able to issue 2 instructions per clock cycle.
So that means to maximize performance, we need to have up to 8 instructions going at a time.
A simple example to see this, a rather naive sum function:
using VectorizationBase, SIMDPirates
@generated function usum(x::AbstractArray{T}, ::Val{Ushift} = Val(2)) where {T,Ushift}
@assert 0 ≤ Ushift < 6
U = 1 << Ushift
W, shift = VectorizationBase.pick_vector_width_shift(T)
totalshift = shift + Ushift
WU = 1 << totalshift
WT = W * sizeof(T)
q = quote
ptrx = pointer(x)
Base.Cartesian.@nexprs $U u -> begin
s_u = SIMDPirates.vbroadcast(Vec{$W,$T}, zero($T))
end
offset = 0
N = length(x)
for i in 1:(N >> $totalshift)
Base.Cartesian.@nexprs $U u -> begin
s_u = SIMDPirates.vadd(s_u, SIMDPirates.vload(Vec{$W,$T}, ptrx + offset))
offset += $WT
end
end
rem = N & $(WU - 1)
for i in 1:(rem >> $shift)
s_1 = SIMDPirates.vadd(s_1, SIMDPirates.vload(Vec{$W,$T}, ptrx + offset))
offset += $WT
end
rem &= $(W-1)
if rem > 0
s_1 = SIMDPirates.vadd(s_1, SIMDPirates.vload(Vec{$W,$T}, ptrx + offset), VectorizationBase.mask(Val{$W}(), rem))
end
end
Uh = U
while Uh > 1
Uh >>= 1
push!(
q.args,
quote
Base.Cartesian.@nexprs $Uh u -> begin
s_u = SIMDPirates.vadd(s_u, s_{u + $Uh})
end
end
)
end
push!(q.args, :(SIMDPirates.vsum(s_1)))
q
end
I wanted the unroll factor to be a power of 2; so the function takes a Val
argument Ushift
, and we unroll by 2^Ushift = 1<<Ushift
(on top of the vectorization unrolling).
We need large vectors to see benefit. Picking the largest vector that still fits in the L1 cache:
julia> first(VectorizationBase.CACHE_SIZE) ÷ sizeof(Float64)
4096
julia> x = rand(ans);
and running benchmarks:
julia> using BenchmarkTools
julia> @btime sum($x)
296.868 ns (0 allocations: 0 bytes)
2036.5329867003175
julia> @btime usum($x, Val(0)) # unroll = 1 (no unrolling)
471.189 ns (0 allocations: 0 bytes)
2036.5329867003172
julia> @btime usum($x, Val(1)) # unroll = 2^1 = 2
249.814 ns (0 allocations: 0 bytes)
2036.532986700318
julia> @btime usum($x, Val(2)) # unroll = 2^2 = 4
132.902 ns (0 allocations: 0 bytes)
2036.532986700318
julia> @btime usum($x, Val(3)) # unroll = 2^3 = 8
84.432 ns (0 allocations: 0 bytes)
2036.532986700318
julia> @btime usum($x, Val(4)) # unroll = 2^4 = 16
89.227 ns (0 allocations: 0 bytes)
2036.532986700318
julia> @btime usum($x, Val(5)) # unroll = 2^5 = 32
108.952 ns (0 allocations: 0 bytes)
2036.532986700318
We can see that performance does indeed peak when we unroll by 8, which breaks up the dependency chain enough so that 8 vadd
instructions are running at a time.
The difference is much less for smaller vectors. And if we’re summing less than 64 elements, unrolling by 8 means we’re actually not unrolling at all.
I actually realize now that I was too aggressive with the choice of 4. Our functions have a lot more going on than just repetitive adds, giving the CPU more to do without the need to break up dependency chains. Even just focusing on floating point arithmetic, your two_sum
function can take advantage of order execution:
# D. E. Knuth, The Art of Computer Programming: Seminumerical Algorithms, 1969.
function two_sum(a::T, b::T) where T <: Union{Real, Vec}
x = a + b
z = x - a
e = (a - (x-z)) + (b-z)
(x, e)
end
x
and z
, and then x-z
and b-z
can each be calculated at the same time, for example.
As always, make hypothesis to try and understand what’s going on, and then test them. I modified your improvements to sum_kahan
so that it will take arbitrary unrolls, like the simple sum function from earlier:
using VectorizationBase, SIMDPirates
using SIMDPirates: evadd, evsub
function two_sum(a::T, b::T) where T <: NTuple
x = evadd(a, b)
z = evsub(x, a)
e = evadd(evsub(a, evsub(x, z)),
evsub(b, z))
(x, e)
end
function two_sum(a::T, b::T) where T <: Real
x = a + b
z = x - a
e = (a - (x-z)) + (b-z)
(x, e)
end
@generated function sum_kahan(x::AbstractArray{T}, ::Val{Ushift} = Val{1}()) where {T <: Union{Float32,Float64}, Ushift}
@assert 0 ≤ Ushift < 6
U = 1 << Ushift
W, shift = VectorizationBase.pick_vector_width_shift(T)
sizeT = sizeof(T)
WT = W * sizeT
WU = W*U
WTU = WT*U
V = SIMDPirates.Vec{W,T}
q = quote
px = pointer(x)
N = length(x)
Base.Cartesian.@nexprs $U u -> begin
s_u = SIMDPirates.vbroadcast($V, zero($T))
c_u = SIMDPirates.vbroadcast($V, zero($T))
end
Nshift = N >> $(shift + Ushift)
offset = 0
for n in 1:Nshift
Base.Cartesian.@nexprs $U u -> begin
x_u = SIMDPirates.vload($V, px + offset)
t_u = SIMDPirates.evadd(s_u, x_u)
m_u = SIMDPirates.vless(SIMDPirates.vabs(x_u), SIMDPirates.vabs(s_u))
subfrom_u = SIMDPirates.vifelse(m_u, s_u, x_u)
addto_u = SIMDPirates.vifelse(m_u, x_u, s_u)
c_u = SIMDPirates.evsub(c_u, SIMDPirates.evadd(SIMDPirates.evsub(subfrom_u, t_u), addto_u))
s_u = t_u
offset += $WT
end
end
rem = N & $(WU-1)
for n in 1:(rem >> $shift)
x_1 = SIMDPirates.vload($V, px + offset)
t_1 = SIMDPirates.evadd(s_1, x_1)
m_1 = SIMDPirates.vless(SIMDPirates.vabs(x_1), SIMDPirates.vabs(s_1))
subfrom_1 = SIMDPirates.vifelse(m_1, s_1, x_1)
addto_1 = SIMDPirates.vifelse(m_1, x_1, s_1)
c_1 = SIMDPirates.evsub(c_1, SIMDPirates.evadd(SIMDPirates.evsub(subfrom_1, t_1), addto_1))
s_1 = t_1
offset += $WT
end
rem &= $(W-1)
if rem > 0
mask = VectorizationBase.mask(Val{$W}(), rem)
x_2 = SIMDPirates.vload($V, px + offset, mask)
t_2 = SIMDPirates.evadd(s_2, x_2)
m_2 = SIMDPirates.vless(SIMDPirates.vabs(x_2), SIMDPirates.vabs(s_2))
subfrom_2 = SIMDPirates.vifelse(m_2, s_2, x_2)
addto_2 = SIMDPirates.vifelse(m_2, x_2, s_2)
c_2 = SIMDPirates.evsub(c_2, SIMDPirates.evadd(SIMDPirates.evsub(subfrom_2, t_2), addto_2))
s_2 = t_2
end
es = SIMDPirates.vbroadcast($V, zero($T))
end
Uh = U
while Uh > 1
Uh >>= 1
qtemp = quote
Base.Cartesian.@nexprs $Uh u -> begin
c_u = SIMDPirates.evadd(c_u, c_{u+$Uh})
(s_u, e) = two_sum(s_u, s_{u+$Uh})
es = SIMDPirates.evadd(es, e)
end
end
push!(q.args, qtemp)
end
qscalarreduce = quote
s = zero(T)
e = SIMDPirates.vsum(es) - SIMDPirates.vsum(c_1)
for si in s_1
s, ei = two_sum(s, si.value)
e += ei
end
s + e
end
push!(q.args, qscalarreduce)
q
end
Now, we see that 2^1 = 2
, and not 4 as I used, leads to the best performance:
julia> y = 10^14 .* sin.(LinRange(0, 2π, 4096));
julia> @btime sum($y)
282.450 ns (0 allocations: 0 bytes)
16.0
julia> @btime sum(big, $y)
375.410 μs (16382 allocations: 895.89 KiB)
-0.21548719867825956442164425652663339860737323760986328125
julia> @btime sum_kbn($y)
5.098 μs (0 allocations: 0 bytes)
-0.21548719867826094
julia> @btime sum_kahan($y, Val(0))
738.659 ns (0 allocations: 0 bytes)
-0.21548719867825916
julia> @btime sum_kahan($y, Val(1))
607.716 ns (0 allocations: 0 bytes)
-0.21548719867825916
julia> @btime sum_kahan($y, Val(2))
623.291 ns (0 allocations: 0 bytes)
-0.21548719867825916
julia> @btime sum_kahan($y, Val(3))
631.864 ns (0 allocations: 0 bytes)
-0.2154871986782596
I’d guess that sum_oro_vec
will also see slightly better performance with 2x instead of 4x.