# Vectorize but break early?

I’d like to properly vectorize this:

"""
issuffix(u, v)
Check if u is a suffix of v
"""
function issuffix(u::AbstractVector, v::AbstractVector)
lu = length(u)
lu ≤ length(v) || return false
@inbounds for i in eachindex(u)
u[i] == v[end-lu+i] || return false
end
return true
end


So I wrote this version:

function issuffix_simd(u::AbstractVector, v::AbstractVector)
lu = length(u)
lu ≤ length(v) || return false
ans = true
@inbounds for i in eachindex(u)
ans &= u[i] == v[end-lu+i]
end
return ans
end


Which is much faster when u is a suffix of v, but processes the whole vector even though ans become false at i=1. Here’s a testbench:

using Random
Random.seed!(1)
u8 = rand(Int8.(1:10), 2^10+2^9+1);
v8 = rand(Int8.(1:10), 2^10+2^9+1);

u16 = Vector{UInt16}(u8)
v16 = Vector{UInt16}(v8)

for (u,v) = ((u8,v8), (u16, v16))
@info typeof(u), typeof(v)
@assert issuffix(u, u)
@assert !issuffix(u, v)
@info "issuffix: true"
@btime issuffix($u,$u)
@info "issuffix: false"
@btime issuffix($u,$v)

@assert issuffix_simd(u, u)
@assert !issuffix_simd(u, v)
@info "issuffix_simd: true"
@btime issuffix_simd($u,$u)
@info "issuffix_simd: false"
@btime issuffix_simd($u,$v)
end


which gives me

[ Info: (Vector{Int8}, Vector{Int8})
[ Info: issuffix: true
375.439 ns (0 allocations: 0 bytes)
[ Info: issuffix: false
2.375 ns (0 allocations: 0 bytes)
[ Info: issuffix_simd: true
19.264 ns (0 allocations: 0 bytes)
[ Info: issuffix_simd: false
19.194 ns (0 allocations: 0 bytes)
[ Info: (Vector{UInt16}, Vector{UInt16})
[ Info: issuffix: true
377.684 ns (0 allocations: 0 bytes)
[ Info: issuffix: false
2.864 ns (0 allocations: 0 bytes)
[ Info: issuffix_simd: true
36.470 ns (0 allocations: 0 bytes)
[ Info: issuffix_simd: false
32.949 ns (0 allocations: 0 bytes)


First of all:

• simded version is seriously fast at this workload (I expected something 4×faster)
• But: is there any way to have a cookie and eat it: vectorized version which also returns early?

yes, but unfortunately Julia isn’t smart enough yet to do the rewrite for you. write the version without breaks, and call it in chunks of the array of an appropriate size (256/sizeof(elrype(array)) is a good guess).

6 Likes

Thanks for the advice. I came up with this:

function issuffix(u::AbstractVector, v::AbstractVector)
lu = length(u)
lu ≤ length(v) || return false
@inbounds for i in eachindex(u)
u[i] == v[end-lu+i] || return false
end
return true
end

function issuffix_simd(u::AbstractVector, v::AbstractVector)
lu = length(u)
lv = length(v)
lu ≤ lv || return false
voffset = lv - lu
return _issuffix_k(u, v, 1, voffset, lu)
end

function issuffix_simd2(u::AbstractVector, v::AbstractVector, p = 8)
lu = length(u)
lv = length(v)
lu ≤ lv || return false
voffset = lv - lu

if lu <= 8
return _issuffix_k(u, v, 1, voffset, Val(lu))
else
_issuffix_k(u, v, 1, voffset, Val(8)) || return false
end

N = 1 << p
d = lu >> p

# we're going in d batches of size N = 2^p
k = 0
while k < (d << p)
_issuffix_k(u, v, k + 1, voffset, N) || return false
k += N
end
# @assert k == (d << p)

if k < lu
_issuffix_k(u, v, k + 1, voffset, lu - k) || return false
end
return true
end


with “kernels” defined as

function _issuffix_k(
u::AbstractVector,
v::AbstractVector,
start::Integer,
voffset::Integer,
N::Integer,
)
ans = true
@inbounds for idx in start:start+N
ans &= u[idx] == v[voffset+idx]
end
return ans
end

@generated function _issuffix_k(
u::AbstractVector,
v::AbstractVector,
start::Integer,
voffset::Integer,
::Val{N},
) where {N}
return :(
begin
Base.Cartesian.@nexprs $N i -> @inbounds a_i = u[start-1+i] == v[voffset-1+start+i] Base.Cartesian.@nall$N i -> a_i
end
)
end


with this benchmark

using Random
Random.seed!(1)
u8 = rand(Int8.(1:10), 2^10 + 2^8 + 100);
uu8 = copy(u8); uu8[end] += 1
v8 = rand(Int8.(1:10), 2^10 + 2^8 + 100);
v8[1:7] = u8[1:7]

u16 = Vector{UInt16}(u8)
uu16 = Vector{UInt16}(uu8)
v16 = Vector{UInt16}(v8)

for (u, uu, v) in ((u8, uu8, v8), (u16, uu16, v16))
@info typeof(u), typeof(v)
@assert issuffix(u, u)
@assert !issuffix(u, v)
@assert !issuffix(u, uu)
@info "issuffix: false"
@btime issuffix($u,$v)
@btime issuffix($u,$uu)
@info "issuffix: true"
@btime issuffix($u,$u)

@assert issuffix_simd(u, u)
@assert !issuffix_simd(u, v)
@assert !issuffix_simd(u, uu)
@info "issuffix_simd: false"
@btime issuffix_simd($u,$v)
@btime issuffix_simd($u,$uu)
@info "issuffix_simd: true"
@btime issuffix_simd($u,$u)

@assert issuffix_simd2(u, u)
@assert !issuffix_simd2(u, v)
@assert !issuffix_simd2(u, uu)
@info "issuffix_simd2: false"
@btime issuffix_simd2($u,$v)
@btime issuffix_simd2($u,$uu)
@info "issuffix_simd2: true"
@btime issuffix_simd2($u,$u)
end


I get the following times:

[ Info: (Vector{Int8}, Vector{Int8})
[ Info: issuffix: false
5.727 ns (0 allocations: 0 bytes)
664.373 ns (0 allocations: 0 bytes)
[ Info: issuffix: true
666.146 ns (0 allocations: 0 bytes)
[ Info: issuffix_simd: false
58.497 ns (0 allocations: 0 bytes)
63.551 ns (0 allocations: 0 bytes)
[ Info: issuffix_simd: true
62.635 ns (0 allocations: 0 bytes)
[ Info: issuffix_simd2: false
4.679 ns (0 allocations: 0 bytes)
76.983 ns (0 allocations: 0 bytes)
[ Info: issuffix_simd2: true
77.778 ns (0 allocations: 0 bytes)
[ Info: (Vector{UInt16}, Vector{UInt16})
[ Info: issuffix: false
6.572 ns (0 allocations: 0 bytes)
670.828 ns (0 allocations: 0 bytes)
[ Info: issuffix: true
670.382 ns (0 allocations: 0 bytes)
[ Info: issuffix_simd: false
46.797 ns (0 allocations: 0 bytes)
45.477 ns (0 allocations: 0 bytes)
[ Info: issuffix_simd: true
45.831 ns (0 allocations: 0 bytes)
[ Info: issuffix_simd2: false
5.587 ns (0 allocations: 0 bytes)
64.004 ns (0 allocations: 0 bytes)
[ Info: issuffix_simd2: true
61.423 ns (0 allocations: 0 bytes)


which means we’re about as fast as in the “quickly break” case, but loose about 50% to the fully simded version. If someone is willing to squeeze more perf of this I’m all ears something really that I can’t explain is that if I modify issuffix_simd to

function issuffix_simd(u::AbstractVector, v::AbstractVector)
lu = length(u)
lv = length(v)
lu ≤ lv || return false
voffset = lv - lu
u[begin] == v[begin+offset] || return false # ← added shortcircuting
return _issuffix_k(u, v, 1, voffset, lu)
end


the loop in _issuffix_k fails to vectorize which blows the execution time ;/

1 Like

It would definitely be neat if there was a @simd :breakoptional annotation (or some better name) that would promise that any written break statements (but not the base loop condition) can be deferred indefinitely without consequence. This would permit a compiler (with the proper modifications – which are probably complicated) to vectorize and/or unroll a loop marred by an otherwise-eager break.

1 Like