I’m settling on this version, based on Zentrik’s suggestion:
using SIMD: VecRange
using StaticArrays
using BenchmarkTools
"""
    update_dot_exposure!(deltaxy, dot_cache, exposed_i, rj_sq, ::Val{N}) where {N}

Update the flags in `exposed_i` in place and return `exposed_i`.

Flag `k` is and-ed with the result of

    sum(abs2, (dot_cache.x[k] + deltaxy[1],
               dot_cache.y[k] + deltaxy[2],
               dot_cache.z[k] + deltaxy[3])) >= rj_sq

so a flag that is already `false` stays `false`, and a `true` flag is cleared
when the comparison fails.

# Arguments
- `deltaxy`: offset; its elements 1, 2 and 3 are added to the x, y and z coordinates.
- `dot_cache`: object exposing the coordinate arrays as fields `x`, `y` and `z`.
- `exposed_i`: flags, modified in place.
- `rj_sq`: threshold the sum of squares is compared against.
- `Val{N}`: number of elements handled per `VecRange{N}` block; the elements left
  over after the last full block are handled one at a time.

# Throws
- `DimensionMismatch` if `dot_cache.x`, `dot_cache.y` or `dot_cache.z` has fewer
  elements than `exposed_i`.
"""
function update_dot_exposure!(deltaxy, dot_cache, exposed_i, rj_sq, ::Val{N}) where {N}
    n = length(exposed_i)
    # Both loops below run under `@inbounds` with bounds derived from `exposed_i`
    # only, so make sure once, up front, that the coordinate arrays are long enough.
    if length(dot_cache.x) < n || length(dot_cache.y) < n || length(dot_cache.z) < n
        throw(
            DimensionMismatch(
                "dot_cache.x, dot_cache.y and dot_cache.z must each have at least " *
                "length(exposed_i) elements",
            ),
        )
    end

    # Loop-invariant offset components, read outside `@inbounds` so that this
    # access is bounds-checked.
    dx, dy, dz = deltaxy[1], deltaxy[2], deltaxy[3]

    lastN = N * (n ÷ N)  # last index covered by full blocks of N elements
    lane = VecRange{N}(0)
    @inbounds for i in 1:N:lastN
        # `&=` cannot change a block in which no flag is set, so skip those blocks.
        if any(exposed_i[lane + i])
            pos_x = dot_cache.x[lane + i] + dx
            pos_y = dot_cache.y[lane + i] + dy
            pos_z = dot_cache.z[lane + i] + dz
            exposed_i[lane + i] &= sum(abs2, (pos_x, pos_y, pos_z)) >= rj_sq
        end
    end

    # Remaining elements that do not fill a whole block.
    @inbounds for i in (lastN + 1):n
        pos_x = dot_cache.x[i] + dx
        pos_y = dot_cache.y[i] + dy
        pos_z = dot_cache.z[i] + dz
        exposed_i[i] &= sum(abs2, (pos_x, pos_y, pos_z)) >= rj_sq
    end
    return exposed_i
end
"""
    DotCache{T}

Coordinates stored as three separate vectors (one per axis), all with the same
element type `T`.
"""
struct DotCache{T}
    # x coordinates
    x::Vector{T}
    # y coordinates
    y::Vector{T}
    # z coordinates
    z::Vector{T}
end
"""
    data_soa(n::Integer=200)

Build a random argument tuple that can be splatted into `update_dot_exposure!`.

Returns `(delta, dot_cache, exposed_i, rj_sq, N)` where
- `delta` is the difference of two random `SVector{3,Float32}`,
- `dot_cache` is a `DotCache` holding three random `Float32` vectors of length `n`,
- `exposed_i` is a length-`n` vector whose entries are each `rand() > 0.8`,
- `rj_sq` is `0.1f0`,
- `N` is `Val(16)`.

`n` defaults to 200; a single size parameter keeps all four arrays the same length.
"""
function data_soa(n::Integer=200)
    a = rand(SVector{3,Float32})
    b = rand(SVector{3,Float32})
    dot_cache = DotCache(rand(Float32, n), rand(Float32, n), rand(Float32, n))
    exposed_i = [rand() > 0.8 for _ in 1:n]
    rj_sq = 0.1f0
    N = Val(16)
    return a - b, dot_cache, exposed_i, rj_sq, N
end
With the following performance:
julia> @benchmark update_dot_exposure!($(data_soa())...)
BenchmarkTools.Trial: 10000 samples with 996 evaluations per sample.
Range (min … max): 22.367 ns … 47.081 ns ┊ GC (min … max): 0.00% … 0.00%
Time (median): 23.172 ns ┊ GC (median): 0.00%
Time (mean ± σ): 23.140 ns ± 0.437 ns ┊ GC (mean ± σ): 0.00% ± 0.00%
▅▆ ▃█▆ ▂ ▂
▇▇▁▁▁▁▁▁▁▃▁███▅▆▄▄▅▆▅▄███▆▆▄▄▅▆▄▅▅▆██▄▆▆▆▅▄▅▅▅▅▇▆▅▇███▇▆▆▆▇ █
22.4 ns Histogram: log(frequency) by time 24.4 ns <
Memory estimate: 0 bytes, allocs estimate: 0.
Feel free to squeeze it further
Thanks all for the help!