I have a loop that would benefit from SIMD operations, but it has one “accumulator” variable that confuses most SIMD tools available. Here is the pseudo code, and below is the actual code with benchmarks. My questions is is there a SIMD-ify decorator like @simd
or @turbo
that can vectorize accumulation operations automatically? I was able to make it work by manually implementing the loop with SIMD.jl, but it would be nice to just have macros like @turbo figure it out (maybe with a bit of extra annotation). My second question is if you look at the actual code below, and its benchmarks, do you have suggestions how to make it even faster?
Pseudocode:
accumulator = 0x0
for i in 1:length(array_1)
array_1[i] ⊻= array_2[i]
accumulator ⊻= some_bit_fiddling_formula(array_1[i], array_2[i])
end
Attempting to use @turbo
results in ERROR: LoadError: LoadError: Don't know how to handle expression. cnt2 ⊻= (((cnt1 ⊻ newx1) ⊻ newz1) ⊻ x1z2) & anti_comm
where cnt1
and cnt2
are accumulator variables defined outside the loop.
Here are the benchmark results for performing the loop on arrays of size 800k:
- base julia, no annotations: 1.5ms (the function
simple
below) -
@inbounds @simd: 1.0ms (the function
inboundssimd
below) -
@turbo does not work (the function
lv_turbo
below) - SIMD.jl manual implementation: 0.7ms (the function
simd_vec4
below)
actual code:
using BenchmarkTools
import SIMD
import LoopVectorization
import VectorizationBase
function simple(r::Vector{UInt64}, l::Vector{UInt64})
cnt1 = zero(eltype(r))
cnt2 = zero(eltype(r))
len = length(l)>>1
for i in 1:len
x1, x2, z1, z2 = l[i], r[i], l[i+len], r[i+len]
r[i] = newx1 = x1 ⊻ x2
r[i+len] = newz1 = z1 ⊻ z2
x1z2 = x1 & z2
anti_comm = (x2 & z1) ⊻ x1z2
cnt2 ⊻= (cnt1 ⊻ newx1 ⊻ newz1 ⊻ x1z2) & anti_comm
cnt1 ⊻= anti_comm
end
s = count_ones(cnt1)
s ⊻= count_ones(cnt2) << 1
s&3 # just the 2 bits matter
end
function inboundssimd(r::Vector{UInt64}, l::Vector{UInt64})
cnt1 = zero(eltype(r))
cnt2 = zero(eltype(r))
len = length(l)>>1
@inbounds @simd for i in 1:len
x1, x2, z1, z2 = l[i], r[i], l[i+len], r[i+len]
r[i] = newx1 = x1 ⊻ x2
r[i+len] = newz1 = z1 ⊻ z2
x1z2 = x1 & z2
anti_comm = (x2 & z1) ⊻ x1z2
cnt2 ⊻= (cnt1 ⊻ newx1 ⊻ newz1 ⊻ x1z2) & anti_comm
cnt1 ⊻= anti_comm
end
s = count_ones(cnt1)
s ⊻= count_ones(cnt2) << 1
s&3 # just the 2 bits matter
end
# Does not work due to assignment to accumulators cnt1 and cnt2
function lv_turbo(r::Vector{UInt64}, l::Vector{UInt64})
cnt1 = zero(eltype(r))
cnt2 = zero(eltype(r))
len = length(l)>>1
LoopVectorization.@turbo for i in 1:len
x1, x2, z1, z2 = l[i], r[i], l[i+len], r[i+len]
newx1 = x1 ⊻ x2
r[i] = newx1
newz1 = z1 ⊻ z2
r[i+len] = newz1
x1z2 = x1 & z2
anti_comm = (x2 & z1) ⊻ x1z2
cnt2 ⊻= (cnt1 ⊻ newx1 ⊻ newz1 ⊻ x1z2) & anti_comm
cnt1 ⊻= anti_comm
end
s = count_ones(cnt1)
s ⊻= count_ones(cnt2) << 1
s
end
function simd_vec4(r::Vector{UInt64}, l::Vector{UInt64})
cnt1 = zero(SIMD.Vec{4,UInt64})
cnt2 = zero(SIMD.Vec{4,UInt64})
lane = SIMD.VecRange{4}(0)
len = length(l)>>1
@inbounds for i in 1:4:len
x1, x2, z1, z2 = l[i+lane], r[i+lane], l[i+len+lane], r[i+len+lane]
r[i+lane] = newx1 = x1 ⊻ x2
r[i+len+lane] = newz1 = z1 ⊻ z2
x1z2 = x1 & z2
anti_comm = (x2 & z1) ⊻ x1z2
cnt2 ⊻= (cnt1 ⊻ newx1 ⊻ newz1 ⊻ x1z2) & anti_comm
cnt1 ⊻= anti_comm
end
s1 = 0
for i in 1:length(cnt1)
s1+=count_ones(cnt1[i])
end
s2 = 0
for i in 1:length(cnt2)
s2+=count_ones(cnt2[i])
end
(s1 ⊻ (s2<<1))&3 # just the 2 bits matter
end
##
a = rand(UInt64, 8*100_000);
b = rand(UInt64, 8*100_000);
@btime simple($a,$b)
@btime inboundssimd($a,$b)
#@btime lv_turbo($a,$b)
@btime simd_vec4($a,$b)