SIMD gather result in slow down

jling · February 24, 2023, 7:32pm

using SIMD

"""
    Unpack(dst::Vector{UInt8}, src::Vector{UInt8}, count::Int)

Reassemble `count` 4-byte words from the byte-split layout of `src` and store them in `dst`.

`src` is read as four consecutive planes of `count` bytes each: byte `b` (1-based) of word
`i` is `src[(b - 1) * count + i]`. After the call,
`dst[4 * (i - 1) + b] == src[(b - 1) * count + i]` for every `i in 1:count`, `b in 1:4`.

Returns `nothing`. A non-positive `count` leaves `dst` untouched.

# Throws
- `ArgumentError`: if `src` holds fewer than `4 * count` bytes, or `dst` has room for fewer
  than `count` words.
"""
function Unpack(dst::Vector{UInt8}, src::Vector{UInt8}, count::Int)
    words = reinterpret(Int32, dst)
    # The loop runs under `@inbounds`, so the extents must be validated up front;
    # otherwise an oversized `count` would read and write out of bounds.
    if count > length(src) ÷ 4 || count > length(words)
        throw(ArgumentError(
            "count = $count does not fit: src has $(length(src)) bytes, " *
            "dst has $(length(dst)) bytes",
        ))
    end
    @inbounds for i in 1:count
        # Assemble the word in a register instead of going through a heap-allocated
        # one-element scratch array.
        word = zero(Int32)
        for b in 0:3
            word |= Int32(src[b * count + i]) << (8 * b)
        end
        # `htol` keeps plane 1 at the lowest address of the word on any host
        # (it is a no-op on little-endian machines).
        words[i] = htol(word)
    end
    return nothing
end

"""
    Unpack_simd(dst::Vector{UInt8}, src::Vector{UInt8}, count::Int)

Reassemble `count` 4-byte words from the byte-split layout of `src` into `dst`, fetching the
four bytes of each word with one SIMD.jl `vgather`.

For word `i` the gather indices are `i`, `i + count`, `i + 2count` and `i + 3count`; the
gathered bytes are reinterpreted as a single `Int32` and stored through an `Int32` view of
`dst`.

Returns `nothing`. A non-positive `count` leaves `dst` untouched.

# Throws
- `ArgumentError`: if `src` holds fewer than `4 * count` bytes, or `dst` has room for fewer
  than `count` words.
"""
function Unpack_simd(dst::Vector{UInt8}, src::Vector{UInt8}, count::Int)
    words = reinterpret(Int32, dst)
    # The gather and the store both run under `@inbounds`, so validate the extents first.
    if count > length(src) ÷ 4 || count > length(words)
        throw(ArgumentError(
            "count = $count does not fit: src has $(length(src)) bytes, " *
            "dst has $(length(dst)) bytes",
        ))
    end
    @inbounds for i in 1:count
        # One lane per byte plane of word `i`.
        idx = Vec(i, i + count, i + 2 * count, i + 3 * count)
        words[i] = reinterpret(Int32, vgather(src, idx))
    end
    return nothing
end


julia> @benchmark Unpack(dst, src, 256) setup=begin dst = zeros(UInt8, 1024); src = rand(UInt8, 1024) end
BenchmarkTools.Trial: 10000 samples with 196 evaluations.
 Range (min … max):  476.413 ns …  11.412 μs  ┊ GC (min … max): 0.00% … 94.19%
 Time  (median):     488.327 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   495.625 ns ± 182.688 ns  ┊ GC (mean ± σ):  0.62% ±  1.63%

         ▄▆▇█▆▄▂
  ▁▁▁▁▂▃▇████████▆▅▃▄▄▄▅▅▅▅▅▄▃▃▂▂▁▂▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▃
  476 ns           Histogram: frequency by time          535 ns <

 Memory estimate: 64 bytes, allocs estimate: 1.

julia> @benchmark Unpack_simd(dst, src, 256) setup=begin dst = zeros(UInt8, 1024); src = rand(UInt8, 1024) end
BenchmarkTools.Trial: 10000 samples with 140 evaluations.
 Range (min … max):  705.479 ns … 947.300 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     721.586 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   726.043 ns ±  13.146 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

        ▅▂▄█▅▅▇▂ ▅▅▁▄█▂▁   ▁ ▁▂▁▂ ▁▁     ▁                      ▂
  ▆▃▁▆▁▁█████████████████████████████▇▇█▇█▆▇▇▇▆▇▆▆▇▇▇▆▇▇▆▇▅▄▅▄▇ █
  705 ns        Histogram: log(frequency) by time        783 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

Am I using the SIMD.jl primitives correctly? Or is it that my CPU doesn’t have good SIMD capability?

Elrod · February 24, 2023, 7:46pm

Gather is slow; it will not reduce the number of uops needed.
Anyway, this is much faster:

"""
    UnpackUnrolled!(dst::Vector{UInt8}, src::Vector{UInt8}, count)

Reassemble `count` 32-bit words from the byte-split layout of `src` and store them through
an `Int32` view of `dst`.

Byte `j` (1-based) of word `i` is `src[(j - 1) * count + i]`; it is widened to `Int32`,
shifted left by `8 * (j - 1)` bits and OR-ed with the other three bytes, so plane 1 supplies
the least-significant byte. The four byte loads are unrolled at macro-expansion time with
`Base.Cartesian.@nexprs`.

Returns `nothing`. A non-positive `count` leaves `dst` untouched.

# Throws
- `ArgumentError`: if `src` holds fewer than `4 * count` bytes, or `dst` has room for fewer
  than `count` words.
"""
function UnpackUnrolled!(dst::Vector{UInt8}, src::Vector{UInt8}, count)
    words = reinterpret(Int32, dst)
    # The loop runs under `@inbounds`, so validate the extents once, outside the hot loop.
    if count > length(src) ÷ 4 || count > length(words)
        throw(ArgumentError(
            "count = $count does not fit: src has $(length(src)) bytes, " *
            "dst has $(length(dst)) bytes",
        ))
    end
    @inbounds for i in 1:count
        # Expands to b_1 … b_4, one shifted byte per plane.
        Base.Cartesian.@nexprs 4 j -> b_j = Int32(src[(j - 1) * count + i]) << (8 * (j - 1))
        words[i] = (b_1 | b_2) | (b_3 | b_4)
    end
    return nothing
end

I get

julia> @btime Unpack($dst,$src,256);
  557.790 ns (1 allocation: 64 bytes)

julia> @btime Unpack_simd($dst,$src,256);
  633.231 ns (0 allocations: 0 bytes)

julia> @btime UnpackUnrolled!($dst,$src,256);
  33.059 ns (0 allocations: 0 bytes)

julia> @btime UnpackLV!($dst,$src,256);#@turbo for good measure
  29.112 ns (0 allocations: 0 bytes)

Note that both the UnpackUnrolled! and UnpackLV! (which adds @turbo) do vectorize.

jling · February 24, 2023, 7:49pm

wow why is that so fast

Elrod · February 24, 2023, 7:50pm

As a rule of thumb, more granular SIMD is better.

So looking at your original code, you’d want to SIMD the i loop, not the b loop.

jling · February 24, 2023, 7:55pm

So it’s 16x faster, which is more than SIMD alone would explain; the reinterpret trick I was using probably added extra slowdown. Or is 16 just 128/8?

Elrod · February 24, 2023, 7:59pm

I have AVX512, so 16 is the SIMD width for Int32. But UnpackUnrolled! was actually using ymm registers. I’ll look more closely.

EDIT: uica seems to be down. =(

Service Unavailable

The server is temporarily unable to service your request due to maintenance downtime or capacity problems. Please try again later.

I could still install it locally…

jling · February 28, 2023, 7:33pm

Follow-up: any thoughts on making it run faster on the GPU?

gist.github.com

https://gist.github.com/Moelf/e75fdb68034f13e070f92302b1faf622

repl.jl

julia> bytes = rand(UInt8, 2^16);

julia> unpackcpu(bytes) == Array(unpackgpu(CuArray(bytes)))
true

julia> @btime unpackcpu(bytes) setup=(bytes=rand(UInt8, 65536)); # CPU algorithm, cpu array
  2.885 μs (2 allocations: 64.11 KiB)

julia> @btime unpackgpu(bytes) setup=(bytes=rand(UInt8, 65536)); # GPU algorithm, cpu array
  34.646 μs (2 allocations: 64.11 KiB)

This file has been truncated. show original

test.jl

using CUDA
function unpackcpu(src::Vector{UInt8})
    count = length(src)÷4
    res = similar(src)
    dst = reinterpret(Int32, res)
    @inbounds for i = 1:count
        Base.Cartesian.@nexprs 4 j -> b_j = Int32(src[(j-1)*count + i])<<(8*(j-1))
        dst[i] = (b_1 | b_2) | (b_3 | b_4)
    end
    res

This file has been truncated. show original

Topic		Replies	Views
SIMD struggles, seeking solutions (with KangarooTwelve.jl) Performance	23	891	November 7, 2023
A simple SIMD.jl loop that is slower than a vanilla `@inbounds @simd` Performance simd	8	1881	June 27, 2021
How to make the most of SIMD.jl when number of data elements is not divisible by SIMD width Performance simd	5	182	May 22, 2025
Poor performance of SIMD vectorization in the latest version of Julia (v1.11.2) Performance performance	19	814	January 8, 2025
How to do SIMD code with wide-register accumulators (@simd vs LoopVectorization.jl vs SIMD.jl) Performance simd	11	2609	June 22, 2021

SIMD gather result in slow down

Related topics