Slightly slower than the SIMD.jl version for me:
"""
    count_ones_test(r::Vector{UInt64}, l::Vector{UInt64})

XOR the two halves of `l` into the matching halves of `r` (in place) and return a
2-bit value built from population counts accumulated during the pass.

With `len = length(l) >> 1`, for every `i in 1:len` the entries `r[i]` and
`r[i + len]` are overwritten with `l[i] ⊻ r[i]` and `l[i + len] ⊻ r[i + len]`.
If `length(l)` is odd, its last element is ignored. `l` is never modified.

NOTE(review): the `x`/`z`/`anti_comm` naming suggests the halves hold the X and Z
bits of a packed Pauli operator and the result is a phase — confirm with the caller.

# Arguments
- `r`: updated in place; must hold at least `2 * (length(l) >> 1)` elements.
- `l`: read-only operand.

# Returns
`(s1 ⊻ (s2 << 1)) & 3`, where `s1` sums `count_ones(anti_comm)` and `s2` sums
`count_ones((newx1 ⊻ newz1 ⊻ x1z2) & anti_comm)` over the loop.

# Throws
- `DimensionMismatch` if `r` is too short for the indices the loop touches.
"""
function count_ones_test(r::Vector{UInt64}, l::Vector{UInt64})
    len = length(l) >> 1
    # `@turbo` performs no bounds checks, so a too-short `r` would be read and
    # written out of bounds. Reject it before entering the loop.
    if length(r) < 2len
        throw(DimensionMismatch(
            "`r` must have at least $(2len) elements to match `l`, got $(length(r))",
        ))
    end
    # Explicit `UInt64` zeros: the function was observed to return a `UInt` even
    # when these accumulators started as `0`.
    s1 = zero(UInt64)
    s2 = zero(UInt64)
    @turbo for i in 1:len
        x1, x2, z1, z2 = l[i], r[i], l[i+len], r[i+len]
        newx1 = x1 ⊻ x2
        r[i] = newx1
        newz1 = z1 ⊻ z2
        r[i+len] = newz1
        x1z2 = x1 & z2
        anti_comm = (x2 & z1) ⊻ x1z2
        # `&` binds tighter than `⊻`, hence the explicit grouping.
        s2 += count_ones((newx1 ⊻ newz1 ⊻ x1z2) & anti_comm)
        s1 += count_ones(anti_comm)
    end
    return (s1 ⊻ (s2 << 1)) & 3 # only the low 2 bits matter
end
Benchmarks:
julia> @btime count_ones_test($a2,$b2) # original size
396.156 μs (0 allocations: 0 bytes)
0x0000000000000003
julia> @btime simd_vec4($a2,$b2)
387.904 μs (0 allocations: 0 bytes)
3
julia> a = rand(UInt64, 8*1_000);
julia> b = rand(UInt64, 8*1_000);
julia> @btime count_ones_test($a,$b)
1.964 μs (0 allocations: 0 bytes)
0x0000000000000003
julia> @btime simd_vec4($a,$b)
1.510 μs (0 allocations: 0 bytes)
3
julia> a = rand(UInt64, 8*100);
julia> b = rand(UInt64, 8*100);
julia> @btime count_ones_test($a,$b)
198.594 ns (0 allocations: 0 bytes)
0x0000000000000003
julia> @btime simd_vec4($a,$b)
151.630 ns (0 allocations: 0 bytes)
3
Different computer:
julia> @btime count_ones_test($a,$b)
1.898 ms (0 allocations: 0 bytes)
0x0000000000000001
julia> @btime simd_vec4($a,$b)
1.959 ms (0 allocations: 0 bytes)
1
julia> a = rand(UInt64, 8*1_000);
julia> b = rand(UInt64, 8*1_000);
julia> @btime count_ones_test($a,$b)
6.332 μs (0 allocations: 0 bytes)
0x0000000000000001
julia> @btime simd_vec4($a,$b)
5.290 μs (0 allocations: 0 bytes)
1
julia> a = rand(UInt64, 8*100);
julia> b = rand(UInt64, 8*100);
julia> @btime count_ones_test($a,$b)
629.300 ns (0 allocations: 0 bytes)
0x0000000000000002
julia> @btime simd_vec4($a,$b)
485.497 ns (0 allocations: 0 bytes)
2
Your solution is faster, but LoopVectorization handles the `+` reduction correctly.