using SIMD
"""
    Unpack(dst::Vector{UInt8}, src::Vector{UInt8}, count::Int)

Interleave four byte planes of `src` into `count` consecutive 4-byte groups of `dst`.

`src` is read as four back-to-back planes of `count` bytes each: byte `b` (1-based) of
output group `i` is `src[(b - 1) * count + i]` and is stored at `dst[4 * (i - 1) + b]`.
Only the first `4 * count` bytes of `dst` are written. Nothing is done when `count <= 0`.

Returns `nothing`.

# Throws
- `BoundsError` if `src` or `dst` holds fewer than `4 * count` bytes.
- `OverflowError` if `4 * count` does not fit in an `Int`.
"""
function Unpack(dst::Vector{UInt8}, src::Vector{UInt8}, count::Int)
    count > 0 || return nothing
    # Validate once up front so the hot loop can safely skip per-access bounds checks.
    nbytes = Base.Checked.checked_mul(4, count)
    checkbounds(src, nbytes)
    checkbounds(dst, nbytes)
    @inbounds for i in 1:count
        # Load all four plane bytes before storing any of them, so the result matches a
        # whole-group store even when `dst` and `src` are the same array.
        b1 = src[i]
        b2 = src[i + count]
        b3 = src[i + 2 * count]
        b4 = src[i + 3 * count]
        base = 4 * (i - 1)
        dst[base + 1] = b1
        dst[base + 2] = b2
        dst[base + 3] = b3
        dst[base + 4] = b4
    end
    return nothing
end
"""
    Unpack_simd(dst::Vector{UInt8}, src::Vector{UInt8}, count::Int)

Gather-based variant of the byte-plane unpack, built on SIMD.jl's `Vec` and `vgather`.

For each `i` in `1:count`, the four bytes `src[i]`, `src[i + count]`, `src[i + 2count]`
and `src[i + 3count]` are gathered into one 4-lane vector, reinterpreted as a single
`Int32`, and stored as element `i` of `dst` viewed as `Int32`s. Each iteration gathers
only four bytes, i.e. it produces one output word at a time.

Returns `nothing`.

# Throws
- `ArgumentError` (from `reinterpret`) if `length(dst)` is not a multiple of 4.
- `BoundsError` if `src` holds fewer than `4 * count` bytes or `dst` fewer than `count`
  `Int32` words.
- `OverflowError` if `4 * count` does not fit in an `Int`.
"""
function Unpack_simd(dst::Vector{UInt8}, src::Vector{UInt8}, count::Int)
    words = reinterpret(Int32, dst)
    count > 0 || return nothing
    # Validate once up front: `@inbounds` below disables the per-access checks, so an
    # oversized `count` would otherwise read and write out of bounds.
    nbytes = Base.Checked.checked_mul(4, count)
    checkbounds(src, nbytes)
    checkbounds(words, count)
    @inbounds for i in 1:count
        # One index per byte plane; the planes are `count` bytes apart in `src`.
        idx = Vec(i, i + count, i + 2 * count, i + 3 * count)
        words[i] = reinterpret(Int32, vgather(src, idx))
    end
    return nothing
end
julia> @benchmark Unpack(dst, src, 256) setup=begin dst = zeros(UInt8, 1024); src = rand(UInt8, 1024) end
BenchmarkTools.Trial: 10000 samples with 196 evaluations.
 Range (min … max):  476.413 ns …  11.412 μs  ┊ GC (min … max): 0.00% … 94.19%
 Time  (median):     488.327 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   495.625 ns ± 182.688 ns  ┊ GC (mean ± σ):  0.62% ±  1.63%
         βββββββ
  βββββββββββββββββ
βββββ
β
β
β
β
βββββββββββββββββββββββββββββββββββ β
  476 ns           Histogram: frequency by time          535 ns <
 Memory estimate: 64 bytes, allocs estimate: 1.
julia> @benchmark Unpack_simd(dst, src, 256) setup=begin dst = zeros(UInt8, 1024); src = rand(UInt8, 1024) end
BenchmarkTools.Trial: 10000 samples with 140 evaluations.
 Range (min … max):  705.479 ns … 947.300 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     721.586 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   726.043 ns ±  13.146 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%
        β
ββββ
β
ββ β
β
βββββ   β ββββ ββ     β                      β
  βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
ββ
ββ β
  705 ns        Histogram: log(frequency) by time        783 ns <
 Memory estimate: 0 bytes, allocs estimate: 0.
Am I using the SIMD.jl primitives correctly, or is it that my CPU doesn't have good SIMD capability?