using SIMD
"""
    Unpack(dst, src, count)

Interleave four byte planes stored back to back in `src` into `count`
consecutive 4-byte groups at the start of `dst`.

Plane `b` (for `b` in `1:4`) occupies `src[(b - 1) * count + 1 : b * count]`.
Byte `b` of output group `i` is taken from `src[(b - 1) * count + i]` and stored
at `dst[4 * (i - 1) + b]`. Bytes of `dst` past `4 * count` are left untouched.

# Arguments
- `dst`: output buffer; must hold at least `4 * count` bytes.
- `src`: input buffer; must hold at least `4 * count` bytes.
- `count`: number of 4-byte groups to produce. Non-positive values are a no-op.

# Returns
`nothing`; `dst` is modified in place.

# Throws
- `ArgumentError` if `src` or `dst` is shorter than `4 * count` bytes.
"""
function Unpack(dst::Vector{UInt8}, src::Vector{UInt8}, count::Int)
    count <= 0 && return nothing

    # Compare against `length ÷ 4` rather than computing `4 * count`, so that a
    # huge `count` cannot overflow and slip past the check.
    count <= length(src) ÷ 4 || throw(ArgumentError(
        "src holds $(length(src)) bytes, need at least 4 * count with count = $count"))
    count <= length(dst) ÷ 4 || throw(ArgumentError(
        "dst holds $(length(dst)) bytes, need at least 4 * count with count = $count"))

    # Every index below lies in 1:4*count, which was validated above, so
    # skipping the bounds checks is safe.
    @inbounds for i in 1:count
        # Read the whole group before writing anything, so the result does not
        # depend on the order of loads and stores if `dst` and `src` alias.
        b1 = src[i]
        b2 = src[count + i]
        b3 = src[2 * count + i]
        b4 = src[3 * count + i]

        base = 4 * (i - 1)
        dst[base + 1] = b1
        dst[base + 2] = b2
        dst[base + 3] = b3
        dst[base + 4] = b4
    end
    return nothing
end
"""
    Unpack_simd(dst, src, count)

Gather-based variant of the byte-plane interleave: for each `i` in `1:count`,
fetch `src[i]`, `src[i + count]`, `src[i + 2count]` and `src[i + 3count]` with a
single `vgather` and store the four bytes as the `i`-th `Int32` of `dst`.

# Arguments
- `dst`: output buffer, viewed as `Int32` words; its length must be a multiple
  of 4 and it must hold at least `4 * count` bytes.
- `src`: input buffer; must hold at least `4 * count` bytes.
- `count`: number of `Int32` words to produce. Non-positive values are a no-op.

# Returns
`nothing`; `dst` is modified in place.

# Throws
- `ArgumentError` if `dst` cannot be reinterpreted as `Int32`, or if `src` or
  `dst` is too short for `count` words.
"""
function Unpack_simd(dst::Vector{UInt8}, src::Vector{UInt8}, count::Int)
    words = reinterpret(Int32, dst)
    count <= 0 && return nothing

    # Validate once up front: the loop runs under `@inbounds`, so neither the
    # gather nor the store below would otherwise be checked.
    count <= length(src) ÷ 4 || throw(ArgumentError(
        "src holds $(length(src)) bytes, need at least 4 * count with count = $count"))
    count <= length(words) || throw(ArgumentError(
        "dst holds $(length(dst)) bytes, need at least 4 * count with count = $count"))

    # NOTE(review): each gather fetches only four single bytes, so this kernel
    # may not outperform a plain scalar loop — benchmark before relying on it.
    @inbounds for i in 1:count
        idx = Vec(i, i + count, i + 2 * count, i + 3 * count)
        words[i] = reinterpret(Int32, vgather(src, idx))
    end
    return nothing
end
julia> @benchmark Unpack(dst, src, 256) setup=begin dst = zeros(UInt8, 1024); src = rand(UInt8, 1024) end
BenchmarkTools.Trial: 10000 samples with 196 evaluations.
Range (min … max): 476.413 ns … 11.412 μs ┊ GC (min … max): 0.00% … 94.19%
Time (median): 488.327 ns ┊ GC (median): 0.00%
Time (mean ± σ): 495.625 ns ± 182.688 ns ┊ GC (mean ± σ): 0.62% ± 1.63%
[histogram bars not recoverable: the block characters were lost to an encoding error]
476 ns Histogram: frequency by time 535 ns <
Memory estimate: 64 bytes, allocs estimate: 1.
julia> @benchmark Unpack_simd(dst, src, 256) setup=begin dst = zeros(UInt8, 1024); src = rand(UInt8, 1024) end
BenchmarkTools.Trial: 10000 samples with 140 evaluations.
Range (min … max): 705.479 ns … 947.300 ns ┊ GC (min … max): 0.00% … 0.00%
Time (median): 721.586 ns ┊ GC (median): 0.00%
Time (mean ± σ): 726.043 ns ± 13.146 ns ┊ GC (mean ± σ): 0.00% ± 0.00%
[histogram bars not recoverable: the block characters were lost to an encoding error]
705 ns Histogram: log(frequency) by time 783 ns <
Memory estimate: 0 bytes, allocs estimate: 0.
Am I using the SIMD.jl primitives correctly, or is it that my CPU doesn't have good SIMD capability?