Iterator transformation results in 40% time reduction in Iteration over arrays

#1

Recently it has bugged me that

for i in 1:length(array)
     @inbounds v=array[i]
      body ...
end

was on the order of 30% faster than:

for v in array
      body ...
end

So I wanted to see if I could transform the latter into the former.

The solution I came up with is a little hacky at the moment, but it works and doesn't change the two-argument `iterate` construct.

This has been updated to include a fix for `getindex`.

# Type parameters:
#   B  original iterator type (the base iterable being transformed)
#   I  new iterator type (the cheap iterator actually stepped, e.g. an index range)
#   IE eltype of I
#   F  function type that transforms a value of I back to an element of B

# Sentinel singleton appended as a third tuple element by the hacked `iterate`
# methods below, so the hijacked `indexed_iterate`/`getindex` overloads can
# dispatch on tuples produced by this machinery.
struct MagicState end

# Pairs the cheap inner iterator `iter` with the transform `f` that maps each
# raw value it yields (e.g. an index) back to an element of the base iterable.
# NOTE(review): subtyping `Function` looks incidental to the hack — confirm
# whether it is actually required for dispatch.
struct TransformedIterator{B,I,IE,F<:Function} <: Function
    iter::I
    f::F # transforms the raw value yielded by `iter` back to an element of B
end
# Outer constructor: captures the base iterable's type B and the eltype of the
# inner iterator as type parameters.
TransformedIterator(base_iter::B,iter::I, f::F) where {B, I, F <: Function} = TransformedIterator{B,I,eltype(iter),F}(iter,f)

# This is the iterate function for every iterator that produces a functional
# state: once the initial `iterate` call has handed back a
# (TransformedIterator, state) pair, this method steps the cheap inner
# iterator and re-wraps the result, appending MagicState() so that
# destructuring of the returned tuple dispatches to the hijacked
# `indexed_iterate`/`getindex` methods below.
# NOTE(review): returns a 3-tuple, which violates the documented iteration
# protocol, and pirates `Base.iterate` for arbitrary B — prototype only.
@inline function Base.iterate(::B,next::Tuple{TransformedIterator{B,I,IE,F},S}) where { B,IE,I,S, F<:Function}
    transformed_iter, state = next
    next = iterate(transformed_iter.iter,state)
    next === nothing && return nothing
    (val,state) = next
    # `val` is still the raw inner value here; the transform `f` is only
    # applied later, when the caller destructures/indexes this tuple.
    return ( val,( transformed_iter, state), MagicState())
end

# According to the iterate interface you are supposed to destructure `next`,
# so we just need to hijack this for the prototype: element 1 is routed
# through the overloaded `getindex` below (which applies the stored transform
# `f`), while the remaining elements are fetched verbatim with `getfield`.
@inline function Base.indexed_iterate(t::Tuple{IE,Tuple{TransformedIterator{B,I,IE,F},S},MagicState}, i::Int, state=1) where { B,IE,I,S, F<:Function}
    Base.@_inline_meta
    if i == 1
        v=t[1] # routes through the hijacked getindex → f(raw value)
        return v, i+1
    else
        return getfield(t, i), i+1 # state / MagicState pass through untouched
    end
end

# Hijacked indexing for the 3-tuple produced by the hacked `iterate`:
#   t[1]  applies the stored transform f to the raw value (e.g. index → element),
#   t[2]  returns the (TransformedIterator, state) pair unchanged,
#   other indices return `nothing` rather than throwing BoundsError.
# NOTE(review): pirating getindex on tuples matching this signature — prototype only.
@inline function Base.getindex(t::Tuple{IE,Tuple{TransformedIterator{B,I,IE,F},S},MagicState}, i::Int) where { B,IE,I,S, F<:Function}
    if i == 1
        @inbounds f=getfield(t,2)[1].f # the transform stored in the TransformedIterator
        @inbounds vi=getfield(t,1)     # the raw inner value
        return f(vi)
    elseif i==2
        @inbounds return getfield(t,2)
    else
        return nothing
    end
end

# NOTE(review): redefinition of the internal Base.mapfoldl_impl, marked
# @noinline. Per the discussion, the optimizer is sensitive to the order of
# operations; this copy performs the `val, state = y` destructuring at the
# top of each loop iteration so the hijacked indexed_iterate can apply the
# transform before `op` runs.
@noinline function Base.mapfoldl_impl(f, op, nt::NamedTuple{(:init,)}, itr, i...)
    init = nt.init
    # Unroll the while loop once; if init is known, the call to op may
    # be evaluated at compile time
    y = iterate(itr, i...)
    y === nothing && return init
    v = init
    while true
        val, state = y
        v = op(v, f(val))
        y = iterate(itr, state)
        y === nothing && break
    end
    return v
end


# The following is a simple wrapper for testing: iterating a
# TransformedArrayIterator goes through the index-based transformed path
# defined below instead of the stock Array iteration.
struct TransformedArrayIterator{A <:Array}
    a::A
end

# Forward the standard iterator-trait queries to the wrapped array so generic
# code (eltype-based accumulators, size-aware algorithms) works unchanged.
Base.length(v::TransformedArrayIterator) = length(v.a)
Base.eltype(v::TransformedArrayIterator{A}) where A = eltype(A)
Base.IteratorSize(v::TransformedArrayIterator{A}) where A = Base.IteratorSize(A)
# Bug fix: the original body referenced an undefined variable `iter`
# (`size(iter.a, dim)`); forward to the wrapped array via the argument `v`.
Base.size(v::TransformedArrayIterator{A}, dim) where A = size(v.a, dim)


# The initial iterator call returns a special state that can be dispatched on:
# instead of walking the array directly, it starts iterating the index range
# 1:length(a) and packages a TransformedIterator whose closure maps an index
# back to the element with an @inbounds load. The trailing MagicState() marks
# the 3-tuple so the hijacked indexed_iterate/getindex methods fire when the
# caller destructures `next`; the transform is applied at that point, not here.
function Base.iterate(v::TransformedArrayIterator)
    a=v.a
    L=length(a)
    iter=1:L
    next=iterate(iter)
    next === nothing && return nothing
    ti = TransformedIterator(v,iter,i->(@inbounds a[i]))
    # next[1] is the raw first index; it becomes the first element only once
    # the hijacked getindex applies `ti.f` during destructuring.
    return next[1],(ti,next[2]),MagicState()
end

In the meantime we need the following functions for benchmarking:

"""
    for_sum(iter)

Sum the elements of `iter` with a plain `for` loop, starting the accumulator
from `eltype(iter)(0)`. Serves as the baseline benchmark for the `for`-loop
lowering of iteration.
"""
function for_sum(iter)
    total = eltype(iter)(0)
    for element in iter
        total += element
    end
    return total
end

# goto_sum is needed because currently `for` is not lowered with the proper
# destructuring of `next`: this explicit @label/@goto loop mirrors the desired
# lowering, where `(v, state) = next` runs through Base.indexed_iterate at the
# head of every iteration — which is exactly where the hijacked methods apply
# the stored transform.
function goto_sum(iter)
    ret=eltype(iter)(0)
    next = iterate(iter)
    (next === nothing) && @goto loop_end
    @label loop_head
        (v, state) = next # destructure → hijacked indexed_iterate
        ret+=v
        next = iterate(iter, state)
        next === nothing &&  @goto loop_end
        @goto loop_head
    @label loop_end
    return ret
end

Let's sanity-check their equivalence:

julia> a=collect(1:1000);

julia> for_sum(a)
500500

julia> goto_sum(a)
500500

julia> @benchmark for_sum($a)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     78.688 ns (0.00% GC)
  median time:      83.785 ns (0.00% GC)
  mean time:        91.614 ns (0.00% GC)
  maximum time:     883.004 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     964

julia> @benchmark goto_sum($a)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     78.688 ns (0.00% GC)
  median time:      83.778 ns (0.00% GC)
  mean time:        91.086 ns (0.00% GC)
  maximum time:     793.855 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     957

Now for the punchline, let's show what iterator transformation can do:

julia> transformed_iter = TransformedArrayIterator(a);

julia> goto_sum(transformed_iter)
500500

julia> @benchmark goto_sum($transformed_iter)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     49.005 ns (0.00% GC)
  median time:      53.506 ns (0.00% GC)
  mean time:        65.202 ns (0.00% GC)
  maximum time:     2.421 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     985

Right now this is hacked into the interface, but with a bit of help I am sure this could look a lot cleaner.

2 Likes
How is destructuring transformed/lowered
#2

In other words, you’ve hacked things such that iterating over an array really just iterates over its indices, and then (in some cases) the indexed_iterate function patches things up to index into the array when this particular tuple gets destructured. In the end, this all looks just like iterating over indices to LLVM, so it optimizes it as such. Am I understanding this right?

Clever, but I don’t see how it could be made robust enough to actually use.

2 Likes
#3

That is exactly what it is trying to do.
I have since realized that the broader iteration interface uses next[1] quite often.
So I have changed the implementation of accessing the value to this:

# According to the iterate interface you are supposed to destructure `next`,
# so we hijack this for the prototype. Updated version: matches the plain
# two-element (value, state) tuple — no MagicState marker — because the
# broader iteration interface accesses next[1] directly in many places.
@inline function Base.indexed_iterate(t::Tuple{IE,Tuple{TransformedIterator{B,I,IE,F},S}}, i::Int, state=1) where { B,IE,I,S, F<:Function}
    Base.@_inline_meta
    if i == 1
        v=t[1] # routes through the hijacked getindex → f(raw value)
        return v, i+1
    else
        return getfield(t, i), i+1 # the state element passes through untouched
    end
end

# Hijacked indexing for the two-element (value, state) tuple:
#   t[1]  applies the stored transform f to the raw value (e.g. index → element),
#   t[2]  returns the (TransformedIterator, state) pair unchanged,
#   other indices return `nothing` instead of throwing BoundsError.
# NOTE(review): this pirates indexing of ordinary 2-tuples whose element types
# happen to match this signature — prototype only.
@inline function Base.getindex(t::Tuple{IE,Tuple{TransformedIterator{B,I,IE,F},S}}, i::Int) where { B,IE,I,S, F<:Function}
    if i == 1
        @inbounds f=getfield(t,2)[1].f # the transform stored in the TransformedIterator
        @inbounds vi=getfield(t,1)     # the raw inner value
        return f(vi)
    elseif i==2
        @inbounds return getfield(t,2)
    else
        return nothing
    end
end

Then, very strangely, I found out that the optimizer is very sensitive to the order of operations, so we need:

# NOTE(review): this repeats the earlier redefinition of the internal
# Base.mapfoldl_impl, unchanged. It is kept @noinline and with the
# `val, state = y` destructuring at the top of each loop iteration because
# the optimizer proved sensitive to the order of operations here.
@noinline function Base.mapfoldl_impl(f, op, nt::NamedTuple{(:init,)}, itr, i...)
    init = nt.init
    # Unroll the while loop once; if init is known, the call to op may
    # be evaluated at compile time
    y = iterate(itr, i...)
    y === nothing && return init
    v = init
    while true
        val, state = y
        v = op(v, f(val))
        y = iterate(itr, state)
        y === nothing && break
    end
    return v
end
#4

Here are some further benchmarks which I think give further merit to it:

using Statistics

# Benchmark `fun` on the raw Vector and on its TransformedArrayIterator
# wrapper, verify both give equal results, and print the median times
# side by side. Requires BenchmarkTools (`@benchmark`) and Statistics
# (`median`) to be loaded.
function benchmarker(fun ,a::Vector)
    ba = @benchmark $fun($a)
    iter=TransformedArrayIterator(a)
    bi = @benchmark $fun($iter)
    equiv= fun(a)==fun(iter) # sanity check: wrapper must not change results
    println("$fun  array: $(median(ba))    ===equivilent: $equiv ====>   transformed: $(median(bi))")
end


"""
    enumerated_sum(iter)

Sum the values of `iter` while iterating via `enumerate`, discarding the
counter. Exercises destructuring of the `(count, value)` tuples produced
by `enumerate` in the benchmark suite.
"""
function enumerated_sum(iter::T) where T
    acc = eltype(iter)(0)
    for (count, value) in enumerate(iter)
        acc += value
    end
    return acc
end
"""
    zipped_sum(iter)

Sum the elements of `iter` by zipping it with itself and accumulating only
the first component of each pair. Exercises destructuring of the tuples
produced by `zip` in the benchmark suite.
"""
function zipped_sum(iter)
    acc = eltype(iter)(0)
    for (left, _) in zip(iter, iter)
        acc = acc + left
    end
    return acc
end
"""
    findfirst_test(iter)

Return the index of the first element of `iter` strictly less than 200, or
`nothing` if no such element exists. Thin wrapper used to benchmark
`findfirst` through the transformed-iterator path.
"""
findfirst_test(iter) = findfirst(v -> v < 200, iter)

Results:

julia> benchmarker(maximum,a)
maximum  array: TrialEstimate(645.315 ns)    ===equivilent: true ====>   transformed: TrialEstimate(152.902 ns)

julia> benchmarker(enumerated_sum,a)
enumerated_sum  array: TrialEstimate(83.802 ns)    ===equivilent: true ====>   transformed: TrialEstimate(440.394 ns)

julia> benchmarker(zipped_sum,a)
zipped_sum  array: TrialEstimate(725.714 ns)    ===equivilent: true ====>   transformed: TrialEstimate(77.211 ns)

julia> benchmarker(findfirst_test,a)
findfirst_test  array: TrialEstimate(717.482 ns)    ===equivilent: true ====>   transformed: TrialEstimate(617.483 ns)

enumerate is arguably a little strange right now, further simplification could clean this up I am sure.

Potentially the best idea would be to allow `iterate` to return a non-tuple; as long as `getindex` and `indexed_iterate` are defined for the type, this should be non-breaking.