I'm looking at some nice AVX2 instructions, and I'm wondering if there is a path to using them from Julia.

I considered three options:

  1. SIMD.jl
  2. Following @kristoffer.carlsson's blog post on LLVM intrinsics via Compiler Explorer
  3. Core.Intrinsics.llvmcall

Ultimately, I combined #1 and #2 to arrive at a solution. Following the blog post, I used Godbolt (Compiler Explorer) to look at the LLVM IR generated for the C example on Stack Overflow, and then mapped those instructions to the operations made available by SIMD.jl. The result is as follows.

This is not a one-to-one mapping from the C code. Perhaps the shufflevector calls could be optimized further. Suggestions are welcome!

using SIMD
"""
    unpack_uint12_to_uint16(A::Vector{UInt8}, out::Vector{UInt16}, i)

Unpack one 24-byte chunk of `A`, starting at byte index `i`, into sixteen 16-bit
integers stored in `out`.

Two 12-bit integers are packed consecutively into three bytes. The sixteen results are
written to `out` starting at index `1 + 2 * ((i - 1) ÷ 3)`.

Both the 32-byte load from `A` (only the first 24 bytes are used) and the 16-element
store into `out` are wrapped in `@inbounds`, so the caller is responsible for making
sure that `i + 31 <= length(A)` and that `out` has room for the sixteen values.
"""
@inline function unpack_uint12_to_uint16(A::Vector{UInt8}, out::Vector{UInt16}, i)
    # Every 3 input bytes produce 2 outputs, which fixes where this chunk lands.
    dest = 2 * div(i - 1, 3) + 1

    # 0-based byte lanes: each 3-byte group (b0, b1, b2) is spread to (b0, b1, b1, b2)
    # so that every 12-bit value sits inside its own 16-bit lane.
    spread_bytes = Val{(
        0, 1, 1, 2, 3, 4, 4, 5,
        6, 7, 7, 8, 9, 10, 10, 11,
        12, 13, 13, 14, 15, 16, 16, 17,
        18, 19, 19, 20, 21, 22, 22, 23,
    )}()

    # 0-based 16-bit lanes: lanes 0, 2, 4, ... come from the first operand and
    # lanes 1, 3, 5, ... come from the second operand (indices 16-31).
    interleave = Val{(
        0, 17, 2, 19, 4, 21, 6, 23,
        8, 25, 10, 27, 12, 29, 14, 31,
    )}()

    # Load 32 bytes; only the first 24 are referenced by the shuffle.
    raw = @inbounds vload(Vec{32,UInt8}, A, i)
    words = reinterpret(Vec{16,UInt16}, shufflevector(raw, spread_bytes))

    # First value of each pair: keep the low 12 bits of its lane.
    masked = words & 0xfff
    # Second value of each pair: drop the low 4 bits of its lane.
    shifted = words >> 4
    unpacked = shufflevector(masked, shifted, interleave)

    @inbounds vstore(unpacked, out, dest)
end

"""
    unpack_uint12_to_uint16(A::Vector{UInt8}, out::Vector{UInt16})

Unpack every complete 3-byte group of `A` into two 12-bit values stored in `out`, and
return `out`.

Two 12-bit integers are packed consecutively into three bytes: the first value is the
first byte plus the low nibble of the second byte (as the high bits), and the second
value is the high nibble of the second byte plus the third byte (as the high bits).

`out` must hold at least `2 * (length(A) ÷ 3)` elements; elements beyond that are left
untouched. Trailing bytes of `A` that do not form a complete 3-byte group are ignored.

# Throws
- `DimensionMismatch`: if `out` is too short for the unpacked values.
"""
function unpack_uint12_to_uint16(A::Vector{UInt8}, out::Vector{UInt16})
    nbytes = length(A)
    nvalues = 2 * (nbytes ÷ 3)
    length(out) >= nvalues || throw(DimensionMismatch(
        "out has length $(length(out)) but $nvalues elements are required"
    ))

    # The vector kernel consumes 24 bytes per call but loads 32 without bounds
    # checking, so only use it while a full 32-byte load stays inside `A`.
    i = 1
    while i + 31 <= nbytes
        unpack_uint12_to_uint16(A, out, i)
        i += 24
    end

    # Scalar tail: handles the last chunk(s) and any input shorter than 32 bytes.
    j = 1 + 2 * ((i - 1) ÷ 3)
    @inbounds while i + 2 <= nbytes
        b0 = UInt16(A[i])
        b1 = UInt16(A[i + 1])
        b2 = UInt16(A[i + 2])
        out[j] = b0 | ((b1 & 0x000f) << 8)
        out[j + 1] = (b1 >> 4) | (b2 << 4)
        i += 3
        j += 2
    end

    return out
end

Here is how it is used:

julia> begin
           in_bytes = 2820096
           A = rand(UInt8, in_bytes)
           out = Vector{UInt16}(undef, in_bytes ÷ 3 * 2)
       end;

julia> unpack_uint12_to_uint16(A, out);

julia> A[1:6]
6-element Array{UInt8,1}:
 0xa5
 0xc7
 0x7b
 0x88
 0x45
 0x90

julia> out[1:4]
4-element Array{UInt16,1}:
 0x07a5
 0x07bc
 0x0588
 0x0904

julia> @benchmark unpack_uint12_to_uint16($A, $out)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     146.100 μs (0.00% GC)
  median time:      162.350 μs (0.00% GC)
  mean time:        178.387 μs (0.00% GC)
  maximum time:     1.025 ms (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1
1 Like