Compiling to branch table

If you want to go the vector route and are happy with x86 specific code, here you go:

using SIMD

@inline function cmp_16xi8_i32(v1::Vec{16, Int8}, v2::Vec{16, Int8})
       Base.llvmcall(("declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone", "
       %cmpres = icmp eq <16 x i8> %0, %1
       %cmpres_s = sext <16 x i1> %cmpres to <16 x i8>
       %res = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %cmpres_s)
       ret i32 %res"), 
       UInt32,Tuple{NTuple{16, VecElement{Int8}},NTuple{16, VecElement{Int8}}} ,v1.elts, v2.elts)
       end

@inline expand_v(x::Int8) = Vec{16, Int8}((x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x))

function vec_codes(codes::Vector) #utility function. not for runtime use
           codes = convert(Vector{Int8}, codes)
           ell = length(codes)
           0<ell<=16 || error("does not fit into vector")
           padding = ntuple(i->codes[1], 16-ell)
           return Vec{16, Int8}( (ntuple(i->codes[i], ell)...,padding...) )
       end

@inline function find_16(x, codes::Vec{16, Int8})
    xe = expand_v(x % Int8)
    cmpbits = cmp_16xi8_i32(xe, codes)
    cmpres = 1+ trailing_zeros(cmpbits % UInt16)
    return ifelse(cmpres==17, 0, cmpres)
end

This will find the earliest occurence of an Int8 in a 16 x Int8, one-based, and 0 if there was no occurence. The function vec_codes is not fast; it just serves as convenience constructor for the 16 x Int8, including padding.
Let’s test:

julia> vc = vec_codes(collect(1:15));

julia> for i=1:17
       @show i,find_16(i, vc)
       end
(i, find_16(i, vc)) = (1, 1)
(i, find_16(i, vc)) = (2, 2)
(i, find_16(i, vc)) = (3, 3)
(i, find_16(i, vc)) = (4, 4)
(i, find_16(i, vc)) = (5, 5)
(i, find_16(i, vc)) = (6, 6)
(i, find_16(i, vc)) = (7, 7)
(i, find_16(i, vc)) = (8, 8)
(i, find_16(i, vc)) = (9, 9)
(i, find_16(i, vc)) = (10, 10)
(i, find_16(i, vc)) = (11, 11)
(i, find_16(i, vc)) = (12, 12)
(i, find_16(i, vc)) = (13, 13)
(i, find_16(i, vc)) = (14, 14)
(i, find_16(i, vc)) = (15, 15)
(i, find_16(i, vc)) = (16, 0)
(i, find_16(i, vc)) = (17, 0)

And speed:

julia> @btime find_16($x, $vc);
  2.668 ns (0 allocations: 0 bytes)
julia> @code_native find_16(17, vc)
	.text
; Function find_16 {
; Location: REPL[5]:2
	vmovd	%edi, %xmm0
	vpbroadcastb	%xmm0, %xmm0
	vpcmpeqb	(%rsi), %xmm0, %xmm0
	vpmovmskb	%xmm0, %eax
	tzcntw	%ax, %ax
	addl	$1, %eax
	movzwl	%ax, %ecx
	xorl	%eax, %eax
	cmpl	$17, %ecx
	cmovneq	%rcx, %rax
	retq
	nopw	%cs:(%rax,%rax)
;}

5 cycles, no branch. When called in a loop"

julia> function mapfind!(dst, src, vc)
       @inbounds for i=1:length(dst)
       dst[i]=find_16(src[i], vc)
       end
       nothing
       end
julia> src=rand(Int8, 10_000); dst=collect(1:10_000);

julia> @btime mapfind!($dst, $src, $vc);
  18.146 μs (0 allocations: 0 bytes)

3.5 cycles per find.

PS. If you are on AMD instead of intel, then it is probably better to use leading_zeros, and reorganize vec_codes.

PPS.

julia> src .=rand(1:16, length(src));

julia> @btime broadcast!($findcode_binary, $dst, $src);
  14.003 μs (0 allocations: 0 bytes)

This is somewhat amazing, but the SSE2 solution works always, for length(codes) up to 16, and does not need to know codes at compile time. It was cribbed from https://github.com/armon/libart/blob/master/src/art.c, where it is used to quickly traverse 16-ary tree nodes.

4 Likes