As @foobar_lv2 mentioned, you need @inbounds
in front of for x in A
. With this change, sumiterate
and sumiterate_pv
are equivalent in my benchmark (full code below):
pv = PartitionedVector([randn(100) for _ in 1:1000])
@btime sumiterate($pv)
# 100.343 μs (0 allocations: 0 bytes)
@btime sumiterate_pv($pv)
# 100.288 μs (0 allocations: 0 bytes)
However, sumiterate_pv
can be even more optimized by simply putting @simd
in front of for x in part
:
@btime sumiterate_pv_simd($pv)
16.979 μs (0 allocations: 0 bytes)
I think it is impossible only with bare minimum iterator protocol because @simd
macro needs to access the container via index.
However, with transducers (disclaimer: I wrote the package) you can just use Cat
(equivalent of Iterators.flatten
) to easily get “native” loop speed (i.e., comparable to sumiterate_pv
):
using Transducers
@btime mapfoldl(Cat(), +, $pv.partitions)
# 100.513 μs (2 allocations: 32 bytes)
Note that I’m just passing pv.partitions
which is a plain vector-of-vector. I don’t even need to write complicated iterate
function.
Also, Transducers.jl will have simd
option in the next release (it’s already in the current master). By just passing simd=true
, you get SIMD’ed summation:
@btime mapfoldl(Cat(), +, $pv.partitions; simd=true)
# 18.670 μs (2 allocations: 32 bytes)
Full code
struct PartitionedVector{T}
partitions::Vector{Vector{T}}
end
Base.@propagate_inbounds function Base.iterate(pv::PartitionedVector, state=(1,1,pv.partitions[1]))
i,j,part = state
while true
j <= length(part) && return @inbounds part[j], (i,j+1,part)
i += 1; j = 1
i <= length(pv.partitions) || return nothing
@inbounds part = pv.partitions[i]
end
end
Base.length(pv::PartitionedVector) = sum(length, pv.partitions)
Base.eltype(::Type{PartitionedVector{T}}) where T = T
function sumiterate(A)
s = zero(eltype(A))
@inbounds for x in A
s += x
end
s
end
function sumiterate_pv(pv::PartitionedVector)
s = zero(eltype(pv))
for part in pv.partitions
for x in part
s += x
end
end
s
end
function sumiterate_pv_simd(pv::PartitionedVector)
s = zero(eltype(pv))
for part in pv.partitions
@simd for x in part
s += x
end
end
s
end
pv = PartitionedVector([randn(100) for _ in 1:1000])
using BenchmarkTools
@btime sumiterate($pv)
@btime sumiterate_pv($pv)
@btime sumiterate_pv_simd($pv)
using Transducers
@btime mapfoldl(Cat(), +, $pv.partitions)
@btime mapfoldl(Cat(), +, $pv.partitions; simd=true)