If you want to go the vector route and are happy with x86 specific code, here you go:
using SIMD
@inline function cmp_16xi8_i32(v1::Vec{16, Int8}, v2::Vec{16, Int8})
Base.llvmcall(("declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone", "
%cmpres = icmp eq <16 x i8> %0, %1
%cmpres_s = sext <16 x i1> %cmpres to <16 x i8>
%res = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %cmpres_s)
ret i32 %res"),
UInt32,Tuple{NTuple{16, VecElement{Int8}},NTuple{16, VecElement{Int8}}} ,v1.elts, v2.elts)
end
@inline expand_v(x::Int8) = Vec{16, Int8}((x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x))
function vec_codes(codes::Vector) #utility function. not for runtime use
codes = convert(Vector{Int8}, codes)
ell = length(codes)
0<ell<=16 || error("does not fit into vector")
padding = ntuple(i->codes[1], 16-ell)
return Vec{16, Int8}( (ntuple(i->codes[i], ell)...,padding...) )
end
@inline function find_16(x, codes::Vec{16, Int8})
xe = expand_v(x % Int8)
cmpbits = cmp_16xi8_i32(xe, codes)
cmpres = 1+ trailing_zeros(cmpbits % UInt16)
return ifelse(cmpres==17, 0, cmpres)
end
This will find the earliest occurence of an Int8 in a 16 x Int8, one-based, and 0 if there was no occurence. The function vec_codes
is not fast; it just serves as convenience constructor for the 16 x Int8
, including padding.
Let’s test:
julia> vc = vec_codes(collect(1:15));
julia> for i=1:17
@show i,find_16(i, vc)
end
(i, find_16(i, vc)) = (1, 1)
(i, find_16(i, vc)) = (2, 2)
(i, find_16(i, vc)) = (3, 3)
(i, find_16(i, vc)) = (4, 4)
(i, find_16(i, vc)) = (5, 5)
(i, find_16(i, vc)) = (6, 6)
(i, find_16(i, vc)) = (7, 7)
(i, find_16(i, vc)) = (8, 8)
(i, find_16(i, vc)) = (9, 9)
(i, find_16(i, vc)) = (10, 10)
(i, find_16(i, vc)) = (11, 11)
(i, find_16(i, vc)) = (12, 12)
(i, find_16(i, vc)) = (13, 13)
(i, find_16(i, vc)) = (14, 14)
(i, find_16(i, vc)) = (15, 15)
(i, find_16(i, vc)) = (16, 0)
(i, find_16(i, vc)) = (17, 0)
And speed:
julia> @btime find_16($x, $vc);
2.668 ns (0 allocations: 0 bytes)
julia> @code_native find_16(17, vc)
.text
; Function find_16 {
; Location: REPL[5]:2
vmovd %edi, %xmm0
vpbroadcastb %xmm0, %xmm0
vpcmpeqb (%rsi), %xmm0, %xmm0
vpmovmskb %xmm0, %eax
tzcntw %ax, %ax
addl $1, %eax
movzwl %ax, %ecx
xorl %eax, %eax
cmpl $17, %ecx
cmovneq %rcx, %rax
retq
nopw %cs:(%rax,%rax)
;}
5 cycles, no branch. When called in a loop"
julia> function mapfind!(dst, src, vc)
@inbounds for i=1:length(dst)
dst[i]=find_16(src[i], vc)
end
nothing
end
julia> src=rand(Int8, 10_000); dst=collect(1:10_000);
julia> @btime mapfind!($dst, $src, $vc);
18.146 μs (0 allocations: 0 bytes)
3.5 cycles per find.
PS. If you are on AMD instead of intel, then it is probably better to use leading_zeros
, and reorganize vec_codes
.
PPS.
julia> src .=rand(1:16, length(src));
julia> @btime broadcast!($findcode_binary, $dst, $src);
14.003 μs (0 allocations: 0 bytes)
This is somewhat amazing, but the SSE2 solution works always, for length(codes)
up to 16, and does not need to know codes
at compile time. It was cribbed from https://github.com/armon/libart/blob/master/src/art.c, where it is used to quickly traverse 16-ary tree nodes.