If one consistently uses the fixed-length vector types from SmallCollections.jl (disclaimer: I'm the author), then the easy-peasy broadcasting version becomes noticeably faster:
julia> @b update_dot_exposure!($(data_soa())...) # lmiq's latest version
28.198 ns
julia> @b update_dot_exposure_small!($(data_soa_small())...)
18.962 ns
Code
using SmallCollections: FixedVector, MutableFixedVector
# Structure-of-arrays container: one vector per coordinate component.
# V is the concrete storage type (here a fixed-length FixedVector),
# kept as a type parameter so the fields are concretely typed.
struct DotCacheSmall{V}
x::V
y::V
z::V
end
# Build benchmark inputs backed by fixed-length vectors:
# two random 3-vectors, a 200-element SoA cache, a Bool exposure
# mask (~20% true), and a Float32 squared-radius threshold.
function data_soa_small()
    x = rand(FixedVector{3,Float32})
    y = rand(FixedVector{3,Float32})
    M = 200
    fixed() = FixedVector{M}(rand(Float32, M))
    dot_cache = DotCacheSmall(fixed(), fixed(), fixed())
    exposed_i = MutableFixedVector{M}([rand() > 0.8 for _ in 1:M])
    rj_sq = 0.1f0
    return x - y, dot_cache, exposed_i, rj_sq
end
# Sum of the squares of three components (fastmath-relaxed arithmetic).
function sumabs2(x, y, z)
    return @fastmath abs2(x) + abs2(y) + abs2(z)
end
# In-place mask update: a point stays exposed only while its squared
# distance after the `deltaxy` shift is at least `rj_sq`.
# One fused broadcast over the three coordinate vectors; the sum of
# squares is written inline (same arithmetic as the sumabs2 helper).
function update_dot_exposure_small!(deltaxy, dot_cache, exposed_i, rj_sq)
    dx, dy, dz = deltaxy[1], deltaxy[2], deltaxy[3]
    @fastmath @. exposed_i &= abs2(dx + dot_cache.x) + abs2(dy + dot_cache.y) + abs2(dz + dot_cache.z) >= rj_sq
    return nothing
end
Some comments:
- This requires the current GitHub version
SmallCollections#master
. The published version has a missing @inline
that spoils performance.
- For some reason this doesn't work with StaticArrays.jl.
- Now the length of the data vectors is fixed. This is of course not what one wants, but one might be able to chop the data into chunks of fixed size, say 128 or 256.
EDIT: Here is a proof of concept with a wrapper ChunkedVector
(with chunk size 64). This now works for arbitrary data vectors.
New code
# Global sizes for the chunked proof of concept.
const M = 200 # length of data vectors
const C = 64 # chunk size
using SmallCollections: FixedVector, MutableFixedVector
# chunked vectors
# A length-`n` vector stored as fixed-size chunks of length N, so
# element-wise kernels can run on SIMD-friendly MutableFixedVector
# chunks. The trailing chunk may be only partially used.
struct ChunkedVector{N,T} <: AbstractVector{T}
chunks::Vector{MutableFixedVector{N,T}}
n::Int
end
# Build a ChunkedVector{N} from any AbstractVector, zero-initializing
# the chunks and then copying the source elements in.
function ChunkedVector{N}(w::AbstractVector{T}) where {N,T}
    # Number of N-sized chunks needed (ceiling division). The pasted
    # source had a mojibake character here; `÷` is the intended
    # integer-division operator.
    m = (length(w) - 1) ÷ N + 1
    v = ChunkedVector([zero(MutableFixedVector{N,T}) for _ in 1:m], length(w))
    # copyto! fills the chunks through setindex! and returns `v`.
    return copyto!(v, w)
end
# AbstractVector interface: report the logical length `n`, not the
# (possibly larger) padded chunk capacity.
function Base.size(v::ChunkedVector)
    return (v.n,)
end
# Translate the linear index into (chunk, within-chunk offset),
# both zero-based from divrem, then shift to 1-based indexing.
function Base.getindex(v::ChunkedVector{N}, i::Int) where N
    chunk, offset = divrem(i - 1, N)
    return v.chunks[chunk + 1][offset + 1]
end
# Write through to the owning chunk. The last expression is the
# assignment, so the method evaluates to the stored value `x`,
# exactly as the original did.
function Base.setindex!(v::ChunkedVector{N}, x, i::Int) where N
    chunk, offset = divrem(i - 1, N)
    v.chunks[chunk + 1][offset + 1] = x
end
#
# Structure-of-arrays container: one vector per coordinate component.
# V is the concrete storage type (in this snippet a ChunkedVector).
struct DotCacheSmall{V}
x::V
y::V
z::V
end
# Sum of the squares of three components (fastmath-relaxed arithmetic).
function sumabs2(x, y, z)
    return @fastmath abs2(x) + abs2(y) + abs2(z)
end
# Chunk-by-chunk variant of the exposure update: the four chunk lists
# are iterated in lockstep, and each 4-tuple of chunks is processed
# with one fused broadcast (the sum of squares is written inline,
# matching the sumabs2 helper's arithmetic).
function update_dot_exposure_chunked!(deltaxy, dot_cache, exposed_i, rj_sq)
    dx, dy, dz = deltaxy[1], deltaxy[2], deltaxy[3]
    for (cx, cy, cz, ce) in zip(dot_cache.x.chunks, dot_cache.y.chunks, dot_cache.z.chunks, exposed_i.chunks)
        @fastmath @. ce &= abs2(dx + cx) + abs2(dy + cy) + abs2(dz + cz) >= rj_sq
    end
end
# Build benchmark inputs backed by ChunkedVector storage; relies on
# the globals M (data length) and C (chunk size).
function data_soa_chunked()
    x = rand(FixedVector{3,Float32})
    y = rand(FixedVector{3,Float32})
    chunked() = ChunkedVector{C}(rand(Float32, M))
    dot_cache = DotCacheSmall(chunked(), chunked(), chunked())
    exposed_i = ChunkedVector{C}([rand() > 0.8 for _ in 1:M])
    rj_sq = 0.1f0
    return x - y, dot_cache, exposed_i, rj_sq
end
Performance varies depending on the length. For vectors of length 200 I get
julia> @b update_dot_exposure!($(data_soa())...)
28.421 ns
julia> @b update_dot_exposure_chunked!($(data_soa_chunked())...)
25.395 ns
and for length 300
julia> @b update_dot_exposure!($(data_soa())...)
40.457 ns
julia> @b update_dot_exposure_chunked!($(data_soa_chunked())...)
31.273 ns