SIMD gather results in a slowdown

using SIMD

"""
    Unpack(dst::Vector{UInt8}, src::Vector{UInt8}, count::Int)

De-interleave a byte-split array of `count` 32-bit words from `src` into `dst`.

`src` is read as four consecutive planes of `count` bytes each, where plane `b`
holds byte `b` of every word. Byte `b` of word `i` is `src[(b - 1) * count + i]`
and is written to `dst[4 * (i - 1) + b]`. Only the first `4 * count` bytes of
`dst` are modified. A `count` of zero or less is a no-op.

# Returns
`nothing`; `dst` is modified in place.

# Throws
- `ArgumentError` if `src` or `dst` holds fewer than `4 * count` bytes.
"""
function Unpack(dst::Vector{UInt8}, src::Vector{UInt8}, count::Int)
    count <= 0 && return nothing
    # The copy loop runs under `@inbounds`, so sizes must be validated first.
    # Dividing the lengths (rather than multiplying `count`) cannot overflow.
    if count > div(length(src), 4) || count > div(length(dst), 4)
        throw(ArgumentError(string(
            "count = ", count, " needs 4 * count bytes in both src and dst; got ",
            "length(src) = ", length(src), ", length(dst) = ", length(dst),
        )))
    end
    # Bytes are moved directly, so no scratch buffer is allocated per call.
    @inbounds for i in 1:count
        base = 4 * (i - 1)
        for b in 1:4
            dst[base + b] = src[(b - 1) * count + i]
        end
    end
    return nothing
end

"""
    Unpack_simd(dst::Vector{UInt8}, src::Vector{UInt8}, count::Int)

De-interleave a byte-split array of `count` 32-bit words using one 4-lane
gather per output word.

For word `i`, the bytes `src[i]`, `src[i + count]`, `src[i + 2 * count]` and
`src[i + 3 * count]` are gathered into a 4-lane `UInt8` vector, which is then
reinterpreted as a single `Int32` and stored into the `Int32` view of `dst`.
A `count` of zero or less is a no-op.

# Returns
`nothing`; `dst` is modified in place.

# Throws
- `ArgumentError` if `src` or `dst` holds fewer than `4 * count` bytes.
"""
function Unpack_simd(dst::Vector{UInt8}, src::Vector{UInt8}, count::Int)
    # View of `dst` as 32-bit words (the elements are Int32, not Int64).
    words = reinterpret(Int32, dst)
    count <= 0 && return nothing
    # The gather/store loop runs under `@inbounds`, so validate sizes first.
    # Dividing the length (rather than multiplying `count`) cannot overflow.
    if count > div(length(src), 4) || count > length(words)
        throw(ArgumentError(string(
            "count = ", count, " needs 4 * count bytes in both src and dst; got ",
            "length(src) = ", length(src), ", length(dst) = ", length(dst),
        )))
    end
    @inbounds for i in 1:count
        # One index per byte plane: byte b of word i lives at i + (b - 1) * count.
        idx = Vec(i, i + count, i + 2 * count, i + 3 * count)
        words[i] = reinterpret(Int32, vgather(src, idx))
    end
    return nothing
end


julia> @benchmark Unpack(dst, src, 256) setup=begin dst = zeros(UInt8, 1024); src = rand(UInt8, 1024) end
BenchmarkTools.Trial: 10000 samples with 196 evaluations.
 Range (min … max):  476.413 ns …  11.412 μs  ┊ GC (min … max): 0.00% … 94.19%
 Time  (median):     488.327 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   495.625 ns ± 182.688 ns  ┊ GC (mean ± σ):  0.62% ±  1.63%

         ▄▆▇█▆▄▂
  ▁▁▁▁▂▃▇████████▆▅▃▄▄▄▅▅▅▅▅▄▃▃▂▂▁▂▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▃
  476 ns           Histogram: frequency by time          535 ns <

 Memory estimate: 64 bytes, allocs estimate: 1.

julia> @benchmark Unpack_simd(dst, src, 256) setup=begin dst = zeros(UInt8, 1024); src = rand(UInt8, 1024) end
BenchmarkTools.Trial: 10000 samples with 140 evaluations.
 Range (min … max):  705.479 ns … 947.300 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     721.586 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   726.043 ns ±  13.146 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

        ▅▂▄█▅▅▇▂ ▅▅▁▄█▂▁   ▁ ▁▂▁▂ ▁▁     ▁                      ▂
  ▆▃▁▆▁▁█████████████████████████████▇▇█▇█▆▇▇▇▆▇▆▆▇▇▇▆▇▇▆▇▅▄▅▄▇ █
  705 ns        Histogram: log(frequency) by time        783 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

Am I using the SIMD.jl primitives correctly? Or is it that my CPU doesn't have good SIMD capability?

1 Like

gather is slow; it will not reduce the number of uops needed.
Anyway, this is much faster:

"""
    UnpackUnrolled!(dst::Vector{UInt8}, src::Vector{UInt8}, count)

De-interleave a byte-split array of `count` 32-bit words from `src` into `dst`,
assembling each word with shifts and ORs instead of per-byte stores.

For word `i`, byte `j` (for `j` in 1 to 4) is read from `src[(j - 1) * count + i]`,
widened to `Int32` and shifted left by `8 * (j - 1)` bits. The four partial
values are OR-ed together and stored into the `Int32` view of `dst`. A `count`
of zero or less is a no-op.

# Returns
`nothing`; `dst` is modified in place.

# Throws
- `ArgumentError` if `src` or `dst` holds fewer than `4 * count` bytes.
"""
function UnpackUnrolled!(dst::Vector{UInt8}, src::Vector{UInt8}, count)
    # View of `dst` as 32-bit words (the elements are Int32, not Int64).
    words = reinterpret(Int32, dst)
    count <= 0 && return nothing
    # The loop runs under `@inbounds`, so validate sizes before entering it.
    # Dividing the length (rather than multiplying `count`) cannot overflow.
    if count > div(length(src), 4) || count > length(words)
        throw(ArgumentError(string(
            "count = ", count, " needs 4 * count bytes in both src and dst; got ",
            "length(src) = ", length(src), ", length(dst) = ", length(dst),
        )))
    end
    @inbounds for i in 1:count
        # Expands to b_1 ... b_4, one shifted byte per plane, with no inner loop.
        Base.Cartesian.@nexprs 4 j -> b_j = Int32(src[(j-1)*count + i]) << (8*(j-1))
        words[i] = (b_1 | b_2) | (b_3 | b_4)
    end
    return nothing
end

I get

julia> @btime Unpack($dst,$src,256);
  557.790 ns (1 allocation: 64 bytes)

julia> @btime Unpack_simd($dst,$src,256);
  633.231 ns (0 allocations: 0 bytes)

julia> @btime UnpackUnrolled!($dst,$src,256);
  33.059 ns (0 allocations: 0 bytes)

julia> @btime UnpackLV!($dst,$src,256);#@turbo for good measure
  29.112 ns (0 allocations: 0 bytes)

Note that both the UnpackUnrolled! and UnpackLV! (which adds @turbo) do vectorize.

4 Likes

wow why is that so fast :rofl:

1 Like

As a rule of thumb, more granular SIMD is better.

So, looking at your original code, you'd want to SIMD the i loop, not the b loop.

So it's 16x faster, which is more than SIMD alone would explain; probably the reinterpret trick I was using added extra slowdown. Or is 16 just 128/8?

I have AVX512, so 16 is the SIMD width for Int32. But UnpackUnrolled! was actually using ymm registers. I'll look more closely.

EDIT: uica seems to be down. =(

Service Unavailable

The server is temporarily unable to service your request due to maintenance downtime or capacity problems. Please try again later.

I could still install it locally…

1 Like

Follow-up: any thoughts on making it run faster on a GPU?