Recently it has bugged me that
for i in 1:length(array)
@inbounds v=array[i]
body ...
end
Was on the order of 30 % faster than:
for v in array
body ...
end
So I wanted to see if I could transform the latter into the former.
The solution I came up with is a little hacky at the moment but it works & doesn’t change the two argument iterate construct.
This has been updated to include a fix to include getindex.
# B original iterator type
# I new iterator type
# IE eltype of I
# F function type that transforms I back to B
struct MagicState end
struct TransformedIterator{B,I,IE,F<:Function} <: Function
iter::I
f::F #This function is to transform the value returned by iter
end
TransformedIterator(base_iter::B,iter::I, f::F) where {B, I, F <: Function} = TransformedIterator{B,I,eltype(iter),F}(iter,f)
# This is the iterate function for every iterator the produces a Functional State.
@inline function Base.iterate(::B,next::Tuple{TransformedIterator{B,I,IE,F},S}) where { B,IE,I,S, F<:Function}
transformed_iter, state = next
next = iterate(transformed_iter.iter,state)
next === nothing && return nothing
(val,state) = next
return ( val,( transformed_iter, state), MagicState())
end
# According to the iterate interface you are suppose to destructure next so we just need to hijack this for the prototype
@inline function Base.indexed_iterate(t::Tuple{IE,Tuple{TransformedIterator{B,I,IE,F},S},MagicState}, i::Int, state=1) where { B,IE,I,S, F<:Function}
Base.@_inline_meta
if i == 1
v=t[1]
return v, i+1
else
return getfield(t, i), i+1
end
end
@inline function Base.getindex(t::Tuple{IE,Tuple{TransformedIterator{B,I,IE,F},S},MagicState}, i::Int) where { B,IE,I,S, F<:Function}
if i == 1
@inbounds f=getfield(t,2)[1].f
@inbounds vi=getfield(t,1)
return f(vi)
elseif i==2
@inbounds return getfield(t,2)
else
return nothing
end
end
@noinline function Base.mapfoldl_impl(f, op, nt::NamedTuple{(:init,)}, itr, i...)
init = nt.init
# Unroll the while loop once; if init is known, the call to op may
# be evaluated at compile time
y = iterate(itr, i...)
y === nothing && return init
v = init
while true
val, state = y
v = op(v, f(val))
y = iterate(itr, state)
y === nothing && break
end
return v
end
# The following is a simple wrapper for testing
struct TransformedArrayIterator{A <:Array}
a::A
end
Base.length(v::TransformedArrayIterator) = length(v.a)
Base.eltype(v::TransformedArrayIterator{A}) where A = eltype(A)
Base.IteratorSize(v::TransformedArrayIterator{A}) where A = Base.IteratorSize(A)
Base.size(v::TransformedArrayIterator{A}, dim) where A = size(iter.a,dim)
# The initial iterator call returns a special state that can be dispatched on
function Base.iterate(v::TransformedArrayIterator)
a=v.a
L=length(a)
iter=1:L
next=iterate(iter)
next === nothing && return nothing
ti = TransformedIterator(v,iter,i->(@inbounds a[i]))
return next[1],(ti,next[2]),MagicState()
end
For the mean time we need the following functions for benchmarking:
function for_sum(iter)
ret=eltype(iter)(0)
for v in iter
ret += v
end
return ret
end
# goto_sum is needed because currently for is not lowered with the proper destructuring of next
function goto_sum(iter)
ret=eltype(iter)(0)
next = iterate(iter)
(next === nothing) && @goto loop_end
@label loop_head
(v, state) = next
ret+=v
next = iterate(iter, state)
next === nothing && @goto loop_end
@goto loop_head
@label loop_end
return ret
end
Lets sanity check their equivlence:
julia> a=collect(1:1000);
julia> for_sum(a)
500500
julia> goto_sum(a)
500500
julia> @benchmark for_sum($a)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 78.688 ns (0.00% GC)
median time: 83.785 ns (0.00% GC)
mean time: 91.614 ns (0.00% GC)
maximum time: 883.004 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 964
julia> @benchmark goto_sum($a)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 78.688 ns (0.00% GC)
median time: 83.778 ns (0.00% GC)
mean time: 91.086 ns (0.00% GC)
maximum time: 793.855 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 957
Now for the punchline lets show what iterator transformation can do:
julia> transformed_iter = TransformedArrayIterator(a);
julia> goto_sum(transformed_iter)
500500
julia> @benchmark goto_sum($transformed_iter)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 49.005 ns (0.00% GC)
median time: 53.506 ns (0.00% GC)
mean time: 65.202 ns (0.00% GC)
maximum time: 2.421 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 985
Right now this is hacked into the interface but with a bit of help I am sure this could look alot cleaner.