Vectorize but break early?

I’d like to properly vectorize this:

"""
    issuffix(u, v)
Check if `u` is a suffix of `v`
"""
function issuffix(u::AbstractVector, v::AbstractVector)
    lu = length(u)
    lu ≤ length(v) || return false
    @inbounds for i in eachindex(u)
        u[i] == v[end-lu+i] || return false
    end
    return true
end

So I wrote this version:

function issuffix_simd(u::AbstractVector, v::AbstractVector)
    lu = length(u)
    lu ≤ length(v) || return false
    ans = true
    @inbounds for i in eachindex(u)
        ans &= u[i] == v[end-lu+i]
    end
    return ans
end

Which is much faster when u is a suffix of v, but processes the whole vector even though ans become false at i=1. Here’s a testbench:

using Random
Random.seed!(1)
u8 = rand(Int8.(1:10), 2^10+2^9+1);
v8 = rand(Int8.(1:10), 2^10+2^9+1);

u16 = Vector{UInt16}(u8)
v16 = Vector{UInt16}(v8)

for (u,v) = ((u8,v8), (u16, v16))
    @info typeof(u), typeof(v)
    @assert issuffix(u, u)
    @assert !issuffix(u, v)
    @info "issuffix: true"
    @btime issuffix($u, $u)
    @info "issuffix: false"
    @btime issuffix($u, $v)

    @assert issuffix_simd(u, u)
    @assert !issuffix_simd(u, v)
    @info "issuffix_simd: true"
    @btime issuffix_simd($u, $u)
    @info "issuffix_simd: false"
    @btime issuffix_simd($u, $v)
end

which gives me

[ Info: (Vector{Int8}, Vector{Int8})
[ Info: issuffix: true
  375.439 ns (0 allocations: 0 bytes)
[ Info: issuffix: false
  2.375 ns (0 allocations: 0 bytes)
[ Info: issuffix_simd: true
  19.264 ns (0 allocations: 0 bytes)
[ Info: issuffix_simd: false
  19.194 ns (0 allocations: 0 bytes)
[ Info: (Vector{UInt16}, Vector{UInt16})
[ Info: issuffix: true
  377.684 ns (0 allocations: 0 bytes)
[ Info: issuffix: false
  2.864 ns (0 allocations: 0 bytes)
[ Info: issuffix_simd: true
  36.470 ns (0 allocations: 0 bytes)
[ Info: issuffix_simd: false
  32.949 ns (0 allocations: 0 bytes)

First of all:

  • simded version is seriously fast at this workload :wink: (I expected something 4×faster)
  • But: is there any way to have a cookie and eat it: vectorized version which also returns early?

yes, but unfortunately Julia isn’t smart enough yet to do the rewrite for you. write the version without breaks, and call it in chunks of the array of an appropriate size (256/sizeof(elrype(array)) is a good guess).

6 Likes

Thanks for the advice. I came up with this:

function issuffix(u::AbstractVector, v::AbstractVector)
    lu = length(u)
    lu ≤ length(v) || return false
    @inbounds for i in eachindex(u)
        u[i] == v[end-lu+i] || return false
    end
    return true
end

function issuffix_simd(u::AbstractVector, v::AbstractVector)
    lu = length(u)
    lv = length(v)
    lu ≤ lv || return false
    voffset = lv - lu
    return _issuffix_k(u, v, 1, voffset, lu)
end

function issuffix_simd2(u::AbstractVector, v::AbstractVector, p = 8)
    lu = length(u)
    lv = length(v)
    lu ≤ lv || return false
    voffset = lv - lu

    if lu <= 8
        return _issuffix_k(u, v, 1, voffset, Val(lu))
    else
        _issuffix_k(u, v, 1, voffset, Val(8)) || return false
    end

    N = 1 << p
    d = lu >> p

    # we're going in `d` batches of size `N = 2^p`
    k = 0
    while k < (d << p)
        _issuffix_k(u, v, k + 1, voffset, N) || return false
        k += N
    end
    # @assert k == (d << p)

    if k < lu
        _issuffix_k(u, v, k + 1, voffset, lu - k) || return false
    end
    return true
end

with “kernels” defined as

function _issuffix_k(
    u::AbstractVector,
    v::AbstractVector,
    start::Integer,
    voffset::Integer,
    N::Integer,
)
    ans = true
    @inbounds for idx in start:start+N
        ans &= u[idx] == v[voffset+idx]
    end
    return ans
end

@generated function _issuffix_k(
    u::AbstractVector,
    v::AbstractVector,
    start::Integer,
    voffset::Integer,
    ::Val{N},
) where {N}
    return :(
        begin
            Base.Cartesian.@nexprs $N i ->
                @inbounds a_i = u[start-1+i] == v[voffset-1+start+i]
            Base.Cartesian.@nall $N i -> a_i
        end
    )
end

with this benchmark

using Random
Random.seed!(1)
u8 = rand(Int8.(1:10), 2^10 + 2^8 + 100);
uu8 = copy(u8); uu8[end] += 1
v8 = rand(Int8.(1:10), 2^10 + 2^8 + 100);
v8[1:7] = u8[1:7]

u16 = Vector{UInt16}(u8)
uu16 = Vector{UInt16}(uu8)
v16 = Vector{UInt16}(v8)

for (u, uu, v) in ((u8, uu8, v8), (u16, uu16, v16))
    @info typeof(u), typeof(v)
    @assert issuffix(u, u)
    @assert !issuffix(u, v)
    @assert !issuffix(u, uu)
    @info "issuffix: false"
    @btime issuffix($u, $v)
    @btime issuffix($u, $uu)
    @info "issuffix: true"
    @btime issuffix($u, $u)

    @assert issuffix_simd(u, u)
    @assert !issuffix_simd(u, v)
    @assert !issuffix_simd(u, uu)
    @info "issuffix_simd: false"
    @btime issuffix_simd($u, $v)
    @btime issuffix_simd($u, $uu)
    @info "issuffix_simd: true"
    @btime issuffix_simd($u, $u)

    @assert issuffix_simd2(u, u)
    @assert !issuffix_simd2(u, v)
    @assert !issuffix_simd2(u, uu)
    @info "issuffix_simd2: false"
    @btime issuffix_simd2($u, $v)
    @btime issuffix_simd2($u, $uu)
    @info "issuffix_simd2: true"
    @btime issuffix_simd2($u, $u)
end

I get the following times:

[ Info: (Vector{Int8}, Vector{Int8})
[ Info: issuffix: false
  5.727 ns (0 allocations: 0 bytes)
  664.373 ns (0 allocations: 0 bytes)
[ Info: issuffix: true
  666.146 ns (0 allocations: 0 bytes)
[ Info: issuffix_simd: false
  58.497 ns (0 allocations: 0 bytes)
  63.551 ns (0 allocations: 0 bytes)
[ Info: issuffix_simd: true
  62.635 ns (0 allocations: 0 bytes)
[ Info: issuffix_simd2: false
  4.679 ns (0 allocations: 0 bytes)
  76.983 ns (0 allocations: 0 bytes)
[ Info: issuffix_simd2: true
  77.778 ns (0 allocations: 0 bytes)
[ Info: (Vector{UInt16}, Vector{UInt16})
[ Info: issuffix: false
  6.572 ns (0 allocations: 0 bytes)
  670.828 ns (0 allocations: 0 bytes)
[ Info: issuffix: true
  670.382 ns (0 allocations: 0 bytes)
[ Info: issuffix_simd: false
  46.797 ns (0 allocations: 0 bytes)
  45.477 ns (0 allocations: 0 bytes)
[ Info: issuffix_simd: true
  45.831 ns (0 allocations: 0 bytes)
[ Info: issuffix_simd2: false
  5.587 ns (0 allocations: 0 bytes)
  64.004 ns (0 allocations: 0 bytes)
[ Info: issuffix_simd2: true
  61.423 ns (0 allocations: 0 bytes)

which means we’re about as fast as in the “quickly break” case, but loose about 50% to the fully simded version. If someone is willing to squeeze more perf of this I’m all ears :wink:

something really that I can’t explain is that if I modify issuffix_simd to

function issuffix_simd(u::AbstractVector, v::AbstractVector)
    lu = length(u)
    lv = length(v)
    lu ≤ lv || return false
    voffset = lv - lu
    u[begin] == v[begin+offset] || return false # ← added shortcircuting
    return _issuffix_k(u, v, 1, voffset, lu)
end

the loop in _issuffix_k fails to vectorize which blows the execution time ;/

1 Like

It would definitely be neat if there was a @simd :breakoptional annotation (or some better name) that would promise that any written break statements (but not the base loop condition) can be deferred indefinitely without consequence. This would permit a compiler (with the proper modifications – which are probably complicated) to vectorize and/or unroll a loop marred by an otherwise-eager break.

1 Like