Is there a more efficient way to define an iterator over ordered tuples?

What I have so far is

julia> import Base: iterate, length                                                            
                                                                                               
julia> using StaticArrays                                                                                                                                              
                                                                                               
julia> struct OrderedTuples{N, K} end                                                          
                                                                                               
julia> function iterate(iter::OrderedTuples{N, K}) where {N, K}                                
           state = MVector{K,Int}(ntuple(i -> i, K))                                           
           state.data, state                                                                        
       end                                                                                     
iterate (generic function with 214 methods)                                                    
                                                                                               
julia> length(iter::OrderedTuples{N,K}) where {N,K} = binomial(N, K)                                  
length (generic function with 87 methods)                                                      
                                                                                               
julia> function iterate(iter::OrderedTuples{N, K}, state::MVector{K, Int}) where {N, K}        
           state[1] == N - K + 1 && return nothing                                             
           i = 0                                                                               
           while state[K - i] == N - i                                                         
               i += 1                                                                          
           end                                                                                 
           @inbounds state[K-i] += 1                                                           
           @inbounds for j in 1:i state[K-i+j] = state[K-i]+j end                              
           state.data, state                                                                        
       end                                                                                     
iterate (generic function with 215 methods)                                                    

julia> for (k, i) in enumerate(OrderedTuples{5, 3}())                                         
           @show k i
       end
k = 1
i = (1, 2, 3)
k = 2
i = (1, 2, 4)
k = 3
i = (1, 2, 5)
k = 4
i = (1, 3, 4)
k = 5
i = (1, 3, 5)
k = 6
i = (1, 4, 5)
k = 7
i = (2, 3, 4)
k = 8
i = (2, 3, 5)
k = 9
i = (2, 4, 5)
k = 10
i = (3, 4, 5)

julia> function test(iter)
           i = 0
           for x in iter
               i += x[2]
           end
           i
       end
test (generic function with 1 method)

julia> using BenchmarkTools

julia> iter = OrderedTuples{14, 4}()
OrderedTuples{14,4}()

julia> @btime test($iter)
  14.317 μs (1002 allocations: 46.97 KiB)
6006

But this doesn’t really feel efficient (I don’t like the many allocations for example).
How could I achieve the same more efficiently?

EDIT: Corrected length and test function, thanks @mauro3

If you @inline the iterate(iter, state) method, the allocations go away (and it gets a tiny bit faster).
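
For reference, that is just the stateful method from the question with an @inline in front and nothing else changed (assuming import Base: iterate, using StaticArrays, and the OrderedTuples definition above are in scope):

@inline function iterate(iter::OrderedTuples{N, K}, state::MVector{K, Int}) where {N, K}
    state[1] == N - K + 1 && return nothing
    i = 0
    while state[K - i] == N - i
        i += 1
    end
    @inbounds state[K-i] += 1
    @inbounds for j in 1:i state[K-i+j] = state[K-i]+j end
    state.data, state
end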

Also, it should be length(::OrderedTuples{N,K}) where {N,K} = binomial(N,K), and it is better to write a test function that actually uses the iterated values; otherwise the compiler might optimize the loop away (that does not seem to be the case here). E.g.:

julia> function test(iter)
           i = 0
           for x in iter
               i += maximum(x)
           end
           i
       end

I didn’t think through the algorithm though. Maybe you can improve that?

Thanks! Inlining already helps quite a bit:

julia> @btime test($iter)
  2.150 μs (1 allocation: 48 bytes)
6006

Ha! I see that big reduction too when I use your potentially flawed test function. Using my updated one I only see it go from 14.293 μs (1002 allocations: 46.97 KiB) to 12.161 μs (0 allocations: 0 bytes).

Is it still flawed? I changed it to depend on the iterator state, i.e. i += x[2].

There is a very nice construction in Ripser (GitHub - Ripser/ripser: efficient computation of Vietoris–Rips persistence barcodes) that permits fast random access. Since I will need that soon myself, I took your question as an opportunity to port Uli Bauer's C++ code to Julia.

Iteration:

julia> function next_tup(i, tup::NTuple{N,Int}) where N
           N == 1 && return (tup[1]+1,)
           if tup[1]+1 != tup[2]
               return (tup[1]+1, Base.tail(tup)...)
           else
               return (i, next_tup(i+1, Base.tail(tup))...)
           end
       end;

julia> struct TupIter{N} end

julia> function Base.iterate(tp::TupIter{N}) where N
           t = ntuple(i->i, N)
           return (t, t)
       end;

julia> function Base.iterate(::TupIter{N}, state) where N
           nx = next_tup(1, state)
           return nx, nx
       end;

julia> function col_tups(tp::TupIter{N}, num) where N
           res = Vector{NTuple{N, Int}}(undef, num)
           for (i,t) in enumerate(tp)
               res[i] = t
               i == num && break
           end
           res
       end;
julia> using BenchmarkTools
julia> @btime col_tups($TupIter{4}(), $10_000);
  37.524 μs (3 allocations: 312.59 KiB)
julia> ct = col_tups(TupIter{2}(), 10)
10-element Array{Tuple{Int64,Int64},1}:
 (1, 2)
 (1, 3)
 (2, 3)
 (1, 4)
 (2, 4)
 (3, 4)
 (1, 5)
 (2, 5)
 (3, 5)
 (4, 5)

That is 3.7 nanoseconds per collected item. Not too shabby. This is fast up to TupIter{10}; starting with TupIter{11}, you need some tricks because inference refuses to do its job.
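
One quick way to see where inference gives up (a small sketch, assuming the TupIter and next_tup definitions above) is to inspect the stateful iterate with @code_warntype:

using InteractiveUtils  # provides @code_warntype (loaded automatically in the REPL)

@code_warntype iterate(TupIter{10}(), ntuple(identity, 10))  # return type inferred concretely
@code_warntype iterate(TupIter{11}(), ntuple(identity, 11))  # per the remark above, inference gives up here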

Suppose you want to know the position of some tuple in this infinite list. You get that by

julia> function tupidx(tup::NTuple{N,Int}) where N
           s = 1
           for i = 1:N
               s += binomial(tup[i]-1, i)
           end
           return s
       end;
julia> for (i,t) in enumerate(TupIter{4}())
           i > 10_000 && break
           @assert i == tupidx(t)
       end

Now, for random access: we want to invert the function tupidx. That works recursively. We need a cache of binomials first, because I don't know how to make Julia search sorted infinite arrays (with exponential backoff, one could compute the binomials on the fly and return typemax(Int) in case of overflow):

julia> function binom_cache(k, N)
           res = Vector{Vector{Int}}(undef, k)
           for k_ = 1:k
               bn = res[k_] = Vector{Int}(undef, N)
               for n = 1:N
                   bn[n] = binomial(n, k_)
               end
           end
           res
       end;

julia> bnc = binom_cache(5, 1000);

Using that, the recursion is

julia> function tupidx(bnc, tup::NTuple{N,Int}) where N
           s = 1
           for i = 1:N
               (tup[i] > 1) && (s += bnc[i][tup[i]-1])
           end
           return s
       end;

julia> function idxtup(bnc, idx, ::Val{k}) where k
           k == 1 && return (idx,)
           last = searchsortedlast(bnc[k], idx-1)
           idx -= bnc[k][last]
           return (idxtup(bnc, idx, Val(k-1))..., last+1)
       end;

julia> function check_tups(bnc, tp::TupIter{N}, n) where N
           vn = Val(N)
           for (i,t) in enumerate(tp)
               if t !== idxtup(bnc, i, vn)
                   @show N, i, t, idxtup(bnc, i, vn)
                   error("fail")
               end
               i >= n && break
           end
       end;

julia> @btime check_tups(bnc, TupIter{4}(), 10000);
  599.602 μs (0 allocations: 0 bytes)
julia> const bnc_ = bnc;
julia> ac = i->idxtup(bnc_, i, Val(4));
julia> r=rand(1:10^9, 10_000);
julia> tups = ac.(r);
julia> r == tupidx.(tups)
true
julia> @btime ac.(r);
  1.967 ms (4 allocations: 312.61 KiB)
julia> @btime tupidx.($tups);
  1.455 ms (2 allocations: 78.20 KiB)
julia> ac_ = t->tupidx(bnc_, t);
julia> r == ac_.(tups)
true
julia> @btime ac_.($tups);
  117.248 μs (4 allocations: 78.23 KiB)

I think it would be faster to compute binomials directly, instead of caching them, if there were an optimized Base.binomial(n::Integer, ::Val{k}) where k and a good way of searching quasi-infinite ordered lists.
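
For what it's worth, here is a minimal sketch of such a search (the helper names are made up; overflowing binomials are simply treated as typemax(Int), as suggested earlier). It finds the largest n with binomial(n, k) <= x by growing an upper bound exponentially and then bisecting, and could stand in for the searchsortedlast(bnc[k], idx - 1) call in idxtup for k >= 2:

# hypothetical cache-free replacement for searching the "infinite" column of binomials
safe_binomial(n, k) = try binomial(n, k) catch; typemax(Int) end  # overflow -> typemax(Int)

function last_binomial_leq(x, k)
    # largest n >= 1 with binomial(n, k) <= x (assumes x >= 0 and k >= 2)
    lo, hi = 1, 2
    while safe_binomial(hi, k) <= x      # grow the bracket exponentially
        lo, hi = hi, 2hi
    end
    while hi - lo > 1                    # binary search inside (lo, hi]
        mid = (lo + hi) >>> 1
        safe_binomial(mid, k) <= x ? (lo = mid) : (hi = mid)
    end
    return lo
end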


Awesome, thanks a lot!

So here is the cache-less variant (mainly so that I can grab it myself again from this thread when needed):

julia> @inline function vbinomial(n::Int64, ::Val{k}) where {k}
           if @generated
               rss = Expr(:block)
               rs = rss.args
               push!(rs, :(acc = 1))
               for i = 0:k-1
                   push!(rs, :(acc *= (n - $i)))
               end
               fac = factorial(k)
               push!(rs, :(res = div(acc, $fac)))
               push!(rs, :(return res))
               return rss
           else
               return binomial(n, k)
           end
       end

julia> struct lazy_bincacherow{k} <: AbstractVector{Int}
           len::Int
       end

julia> Base.length(bc::lazy_bincacherow) = bc.len; Base.size(bc::lazy_bincacherow) = (bc.len,); Base.getindex(::lazy_bincacherow{k}, i::Int) where k = vbinomial(i, Val(k));

julia> struct lazy_bincache
           len::Int
       end

julia> @inline Base.getindex(bc::lazy_bincache, k) = lazy_bincacherow{k}(bc.len)

julia> function bincache(max_k, len)
           max_entry = binomial(len, max_k)
           alt = vbinomial(len, Val(max_k))
           # vbinomial must not overflow anywhere within the cached range
           alt == max_entry || throw(OverflowError("vbinomial overflows for binomial($len, $max_k)"))
           return lazy_bincache(len)
       end

With that:

julia> const bnc2_ = bincache(5, 1000);
julia> ac2 = i->idxtup(bnc2_, i, Val(4));
julia> (ac2.(r)) == ( ac.(r))
true
julia> @btime ac.($r); @btime ac2.($r);
  2.005 ms (4 allocations: 312.61 KiB)
  2.370 ms (4 allocations: 312.61 KiB)

With the optimized binomial there is no significant difference in speed, regardless of whether we cache or not. It seems I had failed to appreciate that the relevant parts of the cache mostly stay in L1 (and vbinomial is only a handful of cycles anyway). vbinomial is cheap because there are no slow integer divisions: LLVM turns div(acc, fac) into a multiplication and a shift when fac is known at compile time.
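
A quick sanity check one might run here (a small sketch, assuming the vbinomial definition above):

# compare the generated vbinomial against Base.binomial over a range that cannot overflow
all(vbinomial(n, Val(4)) == binomial(n, 4) for n in 1:1000)  # expected: true

# to confirm the strength reduction, inspect the native code; there should be no
# integer-division instruction for the compile-time-constant factorial:
# @code_native vbinomial(1000, Val(4))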
