Is there a more efficient way to define an iterator through ordered tuples?

So here is the cache-less variant (posted mainly so that I can grab it from this thread again when needed):

julia> @inline function vbinomial(n::Int64, ::Val{k}) where {k}
       # Compute binomial(n, k) with k lifted into the type domain via Val, so
       # the generated branch can unroll the computation for the specific k.
       if @generated
           # Build an expression block that unrolls the falling factorial
           # n*(n-1)*...*(n-k+1) into k straight-line multiplications.
           rss = Expr(:block)
           rs = rss.args
           push!(rs, :(acc = 1))
           for i=0:k-1
               push!(rs, :(acc *= (n-$i)))
           end
           # factorial(k) is evaluated at generation time, so $fac is spliced
           # in as a literal constant divisor (LLVM lowers division by a
           # compile-time constant to multiply+shift — no runtime idiv).
           fac = factorial(k)
           push!(rs, :(res = div(acc, $fac)) )
           push!(rs, :(return res))
           return rss
       else
           # Non-generated fallback: must compute the same value as the
           # generated code for the compiler to be free to pick either.
           # NOTE(review): the unrolled product `acc` can overflow Int64 for
           # large n/k where Base.binomial would throw — callers are expected
           # to validate the usable range up front (see bincache below).
           return binomial(n,k)
       end
       end
julia> struct lazy_bincacherow{k} <: AbstractVector{Int}
       # Lazy "row" of binomial coefficients for a fixed k: indexing with i
       # computes vbinomial(i, Val(k)) on demand instead of storing values.
       len::Int  # number of entries exposed (valid indices are 1:len)
       end
julia> # AbstractVector interface for lazy_bincacherow: length/size come from the
       # stored len; getindex computes binomial(i, k) on the fly via vbinomial.
       Base.length(bc::lazy_bincacherow) = bc.len;
julia> Base.size(bc::lazy_bincacherow) = (bc.len,);
julia> Base.getindex(::lazy_bincacherow{k}, i::Int) where {k} = vbinomial(i, Val(k));
julia> struct lazy_bincache
       # Lazy binomial-coefficient "table": bc[k] yields a lazy_bincacherow{k}
       # (see the getindex method below); nothing is precomputed or stored
       # beyond the maximum row length.
       len::Int  # length handed to every row produced by getindex
       end
julia> @inline Base.getindex(bc::lazy_bincache, k) = lazy_bincacherow{k}(bc.len)
       # Row access: bc[k] is the lazy vector of binomial(i, k) values.
       # NOTE(review): k is a runtime value lifted into a type parameter here,
       # so this call is type-unstable unless k is a compile-time constant.
julia> function bincache(max_k, len)
         # Construct a lazy_bincache after validating that vbinomial's unrolled
         # falling-factorial product does not overflow Int64 anywhere in the
         # requested range, by checking the largest entry against Base.binomial
         # (which itself throws OverflowError rather than wrapping around).
         max_entry = binomial(len, max_k)
         alt = vbinomial(len, Val(max_k))
         # Fix: OverflowError requires a message argument; the original
         # `OverflowError()` would raise a MethodError instead of the intended
         # overflow error.
         alt == max_entry || throw(OverflowError("binomial($len, $max_k) overflows Int64 in vbinomial"))
         return lazy_bincache(len)
         end

With that:

julia> const bnc2_ = bincache(5, 1000);
julia> ac2 = i->idxtup(bnc2_, i, Val(4));
julia> (ac2.(r)) == ( ac.(r))
true
julia> @btime ac.($r); @btime ac2.($r);
  2.005 ms (4 allocations: 312.61 KiB)
  2.370 ms (4 allocations: 312.61 KiB)

With the optimized binomial, there is no significant difference in speed, regardless of whether we cache or not. It seems that I failed to appreciate that the relevant parts of the cache mostly stay in L1 (the vbinomial is only a handful of cycles). This works because there are no runtime integer divisions: LLVM turns `div(acc, fac)` into a multiplication and a shift when the divisor `fac = factorial(k)` is a compile-time constant.

1 Like