Hereās an ugly implementation written to only support Complex{Float32}
(but it should work for any 64-bit numbers that are stored inline).
WARNING: DONāT RUN THIS UNLESS
(Base.libllvm_version ā„ v"7" && VectorizationBase.AVX512F) || Base.libllvm_version ā„ v"9"
Otherwise, it will crash. LLVM 7 introduced the intrinsic and implemented it for AVX512, but not for other x86 CPUs. Starting with LLVM 9, it does work for the rest.
Therefore, if you want to try this code, use Julia nightly, or a CPU with AVX512. It should be fast if your CPU has it, but probably wonāt be otherwise.
using SIMDPirates
@inline function getindex_compress!(y::Vector{Complex{Float32}}, x::Vector{Complex{Float32}}, mask::BitVector)
@assert ((Base.libllvm_version ā„ v"7" && SIMDPirates.VectorizationBase.AVX512F) || Base.libllvm_version ā„ v"9") "Running this code will crash your Julia session."
mask_ptr = Base.unsafe_convert(Ptr{UInt8}, pointer(mask.chunks))
y_ptr = Base.unsafe_convert(Ptr{Int64}, pointer(y));
x_ptr = Base.unsafe_convert(Ptr{Int64}, pointer(x));
x_max = x_ptr + 8length(mask)
GC.@preserve x y mask begin
while x_ptr < x_max - 7*8
m = unsafe_load(mask_ptr)
SIMDPirates.compressstore!(y_ptr, vload(Vec{8,Int64}, x_ptr), m)
mask_ptr += 1; x_ptr += 8*8; y_ptr += count_ones(m) * 8;
end
if x_ptr < x_max
m = SIMDPirates.mask(Val(8), length(x)) & unsafe_load(mask_ptr)
SIMDPirates.compressstore!(y_ptr, vload(Vec{8,Int64}, x_ptr, m), m)
end
end # GC.@preserve
end
That said, it can pay off; versioninfo and setup:
julia> versioninfo()
Julia Version 1.4.1
Commit 381693d3df* (2020-04-14 17:20 UTC)
Platform Info:
OS: Linux (x86_64-pc-linux-gnu)
CPU: Intel(R) Core(TM) i9-7900X CPU @ 3.30GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-8.0.1 (ORCJIT, skylake)
julia> using BenchmarkTools
julia> using Random: seed!
julia> seed!(0);
julia> @inline function getindex1!(y::Vector{T}, x::Vector{T}, mask::AbstractVector{Bool}) where {T}
@inbounds y .= x[mask] # the obvious way, but it allocates!
end
getindex1! (generic function with 1 method)
julia> @inline function getindex2!(y::Vector{T}, x::Vector{T}, mask::AbstractVector{Bool}) where {T}
count = 0
@inbounds for i=1:length(mask)
if mask[i]
count += 1
y[count] = x[i] # non-allocating, but slower :(
end
end
end
getindex2! (generic function with 1 method)
julia> N = (2^16,)
(65536,)
julia> mask = rand(N...) .< 0.5;
julia> @show K = sum(mask)
K = sum(mask) = 32799
32799
julia> T = ComplexF32
Complex{Float32}
julia> y = Array{T}(undef, K);
julia> x = randn(T, N...);
julia> function getindex4!(y::Vector{T}, x::Vector{T}, mask::AbstractVector{Bool}) where {T}
count = 0
for i = findfirst(mask):length(mask)
count += mask[i]
y[count] = x[i]
end
end
getindex4! (generic function with 1 method)
julia> @inline function getindex!(y::AbstractVector, x::AbstractArray{T,D},
mask::AbstractArray{Bool,D},
) where {T,D}
sum(mask) == length(y) || throw("wrong length")
count = 1
@inbounds for i in 1:LinearIndices(mask)[findlast(mask)]
y[count] = x[i]
count += mask[i]
end
return y
end
getindex! (generic function with 1 method)
Results:
julia> @btime getindex_compress!($y, $x, $mask); y == x[mask]
15.043 Ī¼s (0 allocations: 0 bytes)
true
julia> @btime getindex1!($y, $x, $mask); y == x[mask]
45.704 Ī¼s (3 allocations: 256.36 KiB)
true
julia> @btime getindex2!($y, $x, $mask); y == x[mask]
239.255 Ī¼s (0 allocations: 0 bytes)
true
julia> @btime getindex4!($y, $x, $mask); y == x[mask]
99.014 Ī¼s (0 allocations: 0 bytes)
false
julia> @btime getindex8!($y, $x, $mask); y == x[mask]
51.028 Ī¼s (0 allocations: 0 bytes)
true
julia> @btime getindex!($y, $x, $mask); y == x[mask]
50.978 Ī¼s (0 allocations: 0 bytes)
true