How to do SIMD code with wide-register accumulators (@simd vs LoopVectorization.jl vs SIMD.jl)

This LoopVectorization.jl (`@turbo`) version is slightly slower than the SIMD.jl version for me:

"""
    count_ones_test(r::Vector{UInt64}, l::Vector{UInt64})

XOR `l` into `r` in place and return a 2-bit summary computed from popcounts.

Both vectors are read as two halves of `len = length(l) >> 1` words each (index `i` and
index `i + len`). For every `i in 1:len`, `r[i]` is overwritten with `l[i] ⊻ r[i]` and
`r[i + len]` with `l[i + len] ⊻ r[i + len]`. If `length(l)` is odd its last element is
ignored; elements of `r` past index `2len` are left untouched.

# Returns
`(s1 ⊻ (s2 << 1)) & 3`, where `s1` and `s2` are the popcount sums accumulated in the loop.

# Throws
- `DimensionMismatch`: if `r` has fewer than `2 * (length(l) >> 1)` elements.
"""
function count_ones_test(r::Vector{UInt64}, l::Vector{UInt64})
  s1 = cnt1 = zero(UInt64) # function was returning `UInt` even when `s1 = s2 = 0`
  s2 = cnt2 = zero(UInt64)
  len = length(l)>>1
  # `@turbo` performs no bounds checks, so a too-short `r` would be read and written
  # out of bounds; validate once up front instead.
  length(r) >= 2len || throw(DimensionMismatch(
    "r must have at least $(2len) elements to match l, got $(length(r))"))
  @turbo for i in 1:len
    x1, x2, z1, z2 = l[i], r[i], l[i+len], r[i+len]
    newx1 = x1 ⊻ x2
    r[i] = newx1
    newz1 = z1 ⊻ z2
    r[i+len] = newz1
    x1z2 = x1 & z2
    anti_comm = (x2 & z1) ⊻ x1z2
    # `cnt1`/`cnt2` are never reassigned, so XOR-ing with them leaves the operand unchanged.
    # `&` binds tighter than `⊻`: this is cnt2 ⊻ ((cnt1 ⊻ newx1 ⊻ newz1 ⊻ x1z2) & anti_comm).
    s2 += count_ones(cnt2 ⊻ (cnt1 ⊻ newx1 ⊻ newz1 ⊻ x1z2) & anti_comm)
    s1 += count_ones(cnt1 ⊻ anti_comm)
  end
  return (s1 ⊻ (s2<<1))&3 # just the 2 bits matter
end

Benchmarks:

julia> @btime count_ones_test($a2,$b2) # original size
  396.156 μs (0 allocations: 0 bytes)
0x0000000000000003

julia> @btime simd_vec4($a2,$b2)
  387.904 μs (0 allocations: 0 bytes)
3

julia> a = rand(UInt64, 8*1_000);

julia> b = rand(UInt64, 8*1_000);

julia> @btime count_ones_test($a,$b)
  1.964 μs (0 allocations: 0 bytes)
0x0000000000000003

julia> @btime simd_vec4($a,$b)
  1.510 μs (0 allocations: 0 bytes)
3

julia> a = rand(UInt64, 8*100);

julia> b = rand(UInt64, 8*100);

julia> @btime count_ones_test($a,$b)
  198.594 ns (0 allocations: 0 bytes)
0x0000000000000003

julia> @btime simd_vec4($a,$b)
  151.630 ns (0 allocations: 0 bytes)
3

Different computer:

julia> @btime count_ones_test($a,$b)
  1.898 ms (0 allocations: 0 bytes)
0x0000000000000001

julia> @btime simd_vec4($a,$b)
  1.959 ms (0 allocations: 0 bytes)
1

julia> a = rand(UInt64, 8*1_000);

julia> b = rand(UInt64, 8*1_000);

julia> @btime count_ones_test($a,$b)
  6.332 μs (0 allocations: 0 bytes)
0x0000000000000001

julia> @btime simd_vec4($a,$b)
  5.290 μs (0 allocations: 0 bytes)
1

julia> a = rand(UInt64, 8*100);

julia> b = rand(UInt64, 8*100);

julia> @btime count_ones_test($a,$b)
  629.300 ns (0 allocations: 0 bytes)
0x0000000000000002

julia> @btime simd_vec4($a,$b)
  485.497 ns (0 allocations: 0 bytes)
2

Your solution is faster, but LoopVectorization handles `+` correctly.

1 Like