Nerd-sniping: can you make this faster?

I’m settling on this version, based on Zentrik’s suggestion:

using SIMD: VecRange
using StaticArrays
using BenchmarkTools

"""
    update_dot_exposure!(deltaxy, dot_cache, exposed_i, rj_sq, ::Val{N}) where {N}

Update the flags in `exposed_i` in place and return `exposed_i`.

For every index `i`, the position
`(dot_cache.x[i] + deltaxy[1], dot_cache.y[i] + deltaxy[2], dot_cache.z[i] + deltaxy[3])`
is formed and the flag is and-ed with `sum(abs2, position) >= rj_sq`. A flag that is
already `false` therefore stays `false`; a `true` flag is cleared when the squared norm of
the shifted position is below `rj_sq`.

# Arguments
- `deltaxy`: offsets added to the x, y and z coordinates; read at indices 1, 2 and 3.
- `dot_cache`: object with fields `x`, `y` and `z`, each as long as `exposed_i`.
- `exposed_i`: vector of flags, mutated in place.
- `rj_sq`: threshold compared against the squared norm of each shifted position.
- `Val{N}`: number of elements handled per `VecRange{N}` chunk.

# Throws
- `DimensionMismatch` if `dot_cache.x`, `dot_cache.y`, `dot_cache.z` and `exposed_i` do
  not all have the same length.
- `BoundsError` if `exposed_i` is non-empty and `deltaxy` has fewer than three elements.
"""
function update_dot_exposure!(deltaxy, dot_cache, exposed_i, rj_sq, ::Val{N}) where {N}
    xs, ys, zs = dot_cache.x, dot_cache.y, dot_cache.z
    ndots = length(exposed_i)
    # The loops below run under `@inbounds`, so the lengths must be validated first.
    if !(length(xs) == length(ys) == length(zs) == ndots)
        throw(DimensionMismatch(
            "dot_cache.x, dot_cache.y, dot_cache.z and exposed_i must have equal " *
            "lengths, got $(length(xs)), $(length(ys)), $(length(zs)) and $ndots",
        ))
    end
    ndots == 0 && return exposed_i

    # Loop-invariant offsets, read outside `@inbounds` so they stay bounds-checked.
    dx, dy, dz = deltaxy[1], deltaxy[2], deltaxy[3]

    # Largest multiple of N that fits; indices 1:lastN are processed N at a time.
    lastN = N * (ndots ÷ N)
    @inbounds for i in 1:N:lastN
        lanes = VecRange{N}(i)
        # If no flag in this chunk is set, the `&=` below could not change anything.
        if any(exposed_i[lanes])
            pos_x = xs[lanes] + dx
            pos_y = ys[lanes] + dy
            pos_z = zs[lanes] + dz
            exposed_i[lanes] &= sum(abs2, (pos_x, pos_y, pos_z)) >= rj_sq
        end
    end
    # Scalar pass over the elements that do not fill a whole chunk.
    @inbounds for i in (lastN + 1):ndots
        pos_x = xs[i] + dx
        pos_y = ys[i] + dy
        pos_z = zs[i] + dz
        exposed_i[i] &= sum(abs2, (pos_x, pos_y, pos_z)) >= rj_sq
    end
    return exposed_i
end

"""
    DotCache{T}(x, y, z)
    DotCache(x::Vector{T}, y::Vector{T}, z::Vector{T})

Three coordinate vectors stored side by side: element `i` of `x`, `y` and `z` together
describe one entry.

The constructors convert each argument to `Vector{T}` (a `Vector{T}` is stored as-is,
without copying) and throw a `DimensionMismatch` unless all three have the same length.
The check runs only at construction; resizing a stored vector afterwards is not detected.
"""
struct DotCache{T}
    x::Vector{T}
    y::Vector{T}
    z::Vector{T}

    function DotCache{T}(x, y, z) where {T}
        xs = convert(Vector{T}, x)
        ys = convert(Vector{T}, y)
        zs = convert(Vector{T}, z)
        if !(length(xs) == length(ys) == length(zs))
            throw(DimensionMismatch(
                "x, y and z must have equal lengths, got " *
                "$(length(xs)), $(length(ys)) and $(length(zs))",
            ))
        end
        return new{T}(xs, ys, zs)
    end
end

# Declaring an inner constructor removes the implicit `DotCache(x, y, z)` method,
# so it is restored here with the same signature.
DotCache(x::Vector{T}, y::Vector{T}, z::Vector{T}) where {T} = DotCache{T}(x, y, z)

"""
    data_soa()

Build one random set of benchmark inputs, returned as a tuple in the argument order of
`update_dot_exposure!`:

1. the difference of two random `SVector{3,Float32}` values,
2. a `DotCache` holding three random `Float32` vectors of length 200,
3. a 200-element `Vector{Bool}` in which each flag is `true` with probability 0.2,
4. the threshold `0.1f0`,
5. `Val(16)`.

The global random number generator is used, so results differ between calls.
"""
function data_soa()
    point_a = rand(SVector{3,Float32})
    point_b = rand(SVector{3,Float32})
    xs = rand(Float32, 200)
    ys = rand(Float32, 200)
    zs = rand(Float32, 200)
    # `map` over a range yields a `Vector{Bool}` (not a `BitVector`).
    flags = map(_ -> rand() > 0.8, 1:200)
    return point_a - point_b, DotCache(xs, ys, zs), flags, 0.1f0, Val(16)
end

This gives the following performance:

julia> @benchmark update_dot_exposure!($(data_soa())...)
BenchmarkTools.Trial: 10000 samples with 996 evaluations per sample.
 Range (min … max):  22.367 ns … 47.081 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     23.172 ns              ┊ GC (median):    0.00%
 Time  (mean ± σ):   23.140 ns ±  0.437 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

             ▅▆         ▃█▆          ▂                        ▂
  ▇▇▁▁▁▁▁▁▁▃▁███▅▆▄▄▅▆▅▄███▆▆▄▄▅▆▄▅▅▆██▄▆▆▆▅▄▅▅▅▅▇▆▅▇███▇▆▆▆▇ █
  22.4 ns      Histogram: log(frequency) by time      24.4 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

Feel free to squeeze it further :slight_smile:

Thanks all for the help!

2 Likes