Algorithm for product of vectors

Here’s another one:

using .Iterators: flatten   # needed for flatten below

function mergetuple(uu)
    res=Vector{NTuple{length(uu), Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}()
    N=maximum(first, flatten(flatten(uu)))   # largest index f appearing anywhere in uu
    for u in Iterators.product(uu...)
        v=zeros(Int,N)
        for (f,l) in flatten(u)
            v[f]+=l
        end
        v'*v==0 && push!(res,u)   # v'*v == 0 iff every entry of v is zero
    end
    res
end

I skipped this one. @longemen3000
It looks the same as the last one I posted, but I don’t understand why it’s more efficient…

Why is there a 50X difference between the following two versions?

julia> function mergetuple(uu)
           function m(u,v)
               for pp in u
                   for (f,l) in pp
                       v[f]+=l
                   end
               end
               all(iszero,v)
           end
           res=Vector{NTuple{length(uu), Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}()
           N=maximum(first, flatten(flatten(uu)))
           v=zeros(Int,N)
           for u in Iterators.product(uu...)
                   m(u,v) &&  push!(res, u)
                   v.=0
           end
           res
       end
mergetuple (generic function with 1 method)

julia> @btime mergetuple(uu)
  467.600 μs (6927 allocations: 3.95 MiB)
1-element Vector{NTuple{12, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}:      
 (((1, 2), (3, -2)), ((1, -1), (2, 1)), ((1, -1), (2, 1)), ((2, -2), (4, 2)), ((3, 1), (4, -1)), ((3, 1), (4, -1)), ((1, 2), (3, -2)), ((1, -1), (2, 1)), ((1, -1), (2, 1)), ((2, -2), (4, 2)), ((3, 1), (4, -1)), ((3, 1), (4, -1)))

julia> function mergetuple(uu)
           res=Vector{NTuple{length(uu), Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}()
           N=maximum(first, flatten(flatten(uu)))
           v=zeros(Int,N)
           for u in Iterators.product(uu...)
                   for pp in u
                       for (f,l) in pp
                           v[f]+=l
                       end
                   end
                   all(iszero,v) && push!(res, u)
                   v.=0
           end
           res
       end
mergetuple (generic function with 1 method)

julia> @btime mergetuple(uu)
  25.674 ms (366351 allocations: 16.18 MiB)
1-element Vector{NTuple{12, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}:      
 (((1, 2), (3, -2)), ((1, -1), (2, 1)), ((1, -1), (2, 1)), ((2, -2), (4, 2)), ((3, 1), (4, -1)), ((3, 1), (4, -1)), ((1, 2), (3, -2)), ((1, -1), (2, 1)), ((1, -1), (2, 1)), ((2, -2), (4, 2)), ((3, 1), (4, -1)), ((3, 1), (4, -1)))

It is faster mainly because it skips creating and discarding a Dict each time sum_exp is called; instead, it just reuses the cache provided.
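
In other words, it is the usual allocate-per-call versus reuse-a-buffer pattern. A minimal, hypothetical sketch of the idea (count_alloc/count_reuse! are made-up names, not the actual sum_exp from the thread):

# Allocates and throws away a Dict on every call.
function count_alloc(xs)
    cache = Dict{Int,Int}()
    for x in xs
        cache[x] = get(cache, x, 0) + 1
    end
    return cache
end

# Reuses a cache supplied by the caller; only the first allocation is paid.
function count_reuse!(cache, xs)
    for x in xs
        cache[x] = get(cache, x, 0) + 1
    end
    return cache
end

cache = Dict{Int,Int}()              # allocated once
for xs in ([1, 2, 2], [3, 3, 1])     # many calls, one allocation
    empty!(cache)
    count_reuse!(cache, xs)
end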

Two scripts that look identical at first glance, yet one is 60X faster than the other.
If I didn’t make a mistake in taking the measurements, where does the substantial difference come from?

why 60X faster?
julia> uu=vcat(repeat(uu,2))
12-element Vector{Vector{Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}:
 [((1, 2), (3, -2)), ((1, 3), (3, -3))]
 [((1, -1), (2, 1))]
 [((1, 1), (2, -1)), ((1, -1), (2, 1))]
 [((2, -2), (4, 2)), ((2, -3), (4, 3))]
 [((3, 1), (4, -1)), ((3, 4), (4, -4))]
 [((3, 1), (4, -1)), ((3, 2), (4, -2)), ((3, 3), (4, -3))]
 [((1, 2), (3, -2)), ((1, 3), (3, -3))]
 [((1, -1), (2, 1))]
 [((1, 1), (2, -1)), ((1, -1), (2, 1))]
 [((2, -2), (4, 2)), ((2, -3), (4, 3))]
 [((3, 1), (4, -1)), ((3, 4), (4, -4))]
 [((3, 1), (4, -1)), ((3, 2), (4, -2)), ((3, 3), (4, -3))]

julia>  using .Iterators: flatten

julia> using BenchmarkTools

julia> function mergetuple1(uu)
           res=Vector{NTuple{length(uu),eltype(uu[1])}}()
           N=maximum(first, flatten(flatten(uu)))
           v=zeros(Int,N)
           for u in Iterators.product(uu...)
                   for pp in u
                       for (f,l) in pp
                           v[f]+=l
                       end
                   end
                   all(iszero,v) && push!(res, u)
                   v.=0
           end
           res
       end
mergetuple1 (generic function with 1 method)

julia> function mergetuple2(uu)
           function m(u,v)
               # for (f,l) in flatten(u)
               #     v[f]+=l
               # end # 650 us
               for pp in u
                   for (f,l) in pp
                       v[f]+=l
                   end
               end # 500 us
               all(iszero,v)
           end
           res=Vector{NTuple{length(uu),eltype(uu[1])}}()
           N=maximum(first, flatten(flatten(uu)))
           v=zeros(Int,N)
           for u in Iterators.product(uu...)
                   m(u,v) &&  push!(res, u)
                   v.=0
           end
           res
       end
mergetuple2 (generic function with 1 method)

julia> @btime mergetuple1(uu)
  31.329 ms (366344 allocations: 16.18 MiB)
1-element Vector{NTuple{12, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}:
 (((1, 2), (3, -2)), ((1, -1), (2, 1)), ((1, -1), (2, 1)), ((2, -2), (4, 2)), ((3, 1), (4, -1)), ((3, 1), (4, -1)), ((1, 2), (3, -2)), ((1, -1), (2, 1)), ((1, -1), (2, 1)), ((2, -2), (4, 2)), ((3, 1), (4, -1)), ((3, 1), (4, -1)))

julia> @btime mergetuple2(uu)
  513.600 μs (6920 allocations: 3.94 MiB)
1-element Vector{NTuple{12, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}:      
 (((1, 2), (3, -2)), ((1, -1), (2, 1)), ((1, -1), (2, 1)), ((2, -2), (4, 2)), ((3, 1), (4, -1)), ((3, 1), (4, -1)), ((1, 2), (3, -2)), ((1, -1), (2, 1)), ((1, -1), (2, 1)), ((2, -2), (4, 2)), ((3, 1), (4, -1)), ((3, 1), (4, -1)))

https://docs.julialang.org/en/v1/manual/performance-tips/#kernel-functions

The function is not type-stable because Julia can’t infer most of the types as you can see when you do:

julia> @code_warntype mergetuple1(uu)
MethodInstance for mergetuple1(::Vector{Vector{Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}})
  from mergetuple1(uu) @ Main REPL[1]:1
Arguments
  #self#::Core.Const(mergetuple1)
  uu::Vector{Vector{Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}
Locals
  @_3::Union{Nothing, Tuple{Any, Union{Bool, Tuple{Vararg{Tuple{Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Int64}}}}}}
  v::Any
  N::Any
  res::Vector{T} where T<:(Tuple{Vararg{Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, _A}} where _A)
  @_7::Any
  u::Any
  @_9::Any
  pp::Any
  @_11::Any
  l::Any
  f::Any

So you separate the loops into an internal function that provides an inference barrier (see the link above). That essentially means that you need to pay for one dynamic dispatch, but afterwards (inside m(u,v)) everything is known and type-stable, and so the loop is fast.

One issue, for example, is that res is not type stable because length(uu) cannot be inferred at compile time. Note that N can also not be inferred, which leads to v being inferred as Any as well. Here a type annotation actually improves things a bit: if you annotate N::Int, then Julia knows that v::Vector{Int64}. But this is more of a symptomatic treatment…
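
For concreteness, a sketch of that symptomatic fix (hypothetical name mergetuple1b; only the N line differs from mergetuple1 above):

using .Iterators: flatten

function mergetuple1b(uu)
    res = Vector{NTuple{length(uu), eltype(uu[1])}}()   # still not inferable
    N::Int = maximum(first, flatten(flatten(uu)))       # annotation: N is declared to be an Int
    v = zeros(Int, N)                                    # so v is now inferred as Vector{Int64}
    for u in Iterators.product(uu...)
        for pp in u
            for (f, l) in pp
                v[f] += l
            end
        end
        all(iszero, v) && push!(res, u)
        v .= 0
    end
    res
end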

You can slightly improve things by lifting the length to the type domain as well:

mergetuple3(uu) = mergetuple3(uu, Val(length(uu)))
function mergetuple3(uu, ::Val{L}) where L
    res = Vector{NTuple{L,eltype(uu[1])}}()
    N=maximum(first, flatten(flatten(uu)))
    v=zeros(Int,N)
    for u in Iterators.product(uu...)
        for pp in u
            for (f,l) in pp
                v[f]+=l
            end
        end
        all(iszero,v) && push!(res, u)
        v.=0
    end
    res
end

which gives me:

julia> @btime mergetuple1($uu);
  35.035 ms (366451 allocations: 16.18 MiB)
julia> @btime mergetuple2($uu);
  532.763 μs (7023 allocations: 3.95 MiB)
julia> @btime mergetuple3($uu);
  34.275 ms (366444 allocations: 16.18 MiB)

This expectedly fixed one source of type instability but did not improve things much. Also note that this probably comes at quite a high price, since Julia now compiles a new method for every different length of uu, which likely leads to a lot of methods that are each called very few times (but YMMV, I don’t know your application - maybe uu always has the same length).

Checking @code_warntype again, it becomes clearer what the main source of type instability is:

@code_warntype mergetuple3(uu, Val(3))
MethodInstance for mergetuple3(::Vector{Vector{Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}, ::Val{3})
  from mergetuple3(uu, ::Val{L}) where L @ Main REPL[21]:1
Static Parameters
  L = 3
Arguments
  #self#::Core.Const(mergetuple3)
  uu::Vector{Vector{Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}
  _::Core.Const(Val{3}())
Locals
  @_4::Union{Nothing, Tuple{Any, Union{Bool, Tuple{Vararg{Tuple{Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Int64}}}}}}
  v::Vector{Int64}
  N::Int64
  res::Vector{Tuple{Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}
  @_8::Any
  u::Any
  @_10::Any
  pp::Any
  @_12::Any
  l::Any
  f::Any
#...
%15 = Core._apply_iterate(Base.iterate, %14, uu)::Base.Iterators.ProductIterator{T} where T<:Tuple{Vararg{Vector{Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}}

So basically this whole Iterators.product causes the type instabilities. I noticed before that the constructs from Base.Iterators are unfortunately not really good for type stability[1] and I don’t know a good fix… EXCEPT what you found via experimentation: the function barrier you used in mergetuple2! By putting all of the computation in the loop body into a separate function, you ensure that it can be done in a type-stable environment and is thus fast. Since this is the vast majority of the runtime, your function gets much faster overall.

So this is a long post that unfortunately does not help you much with speeding up your code, but perhaps you now understand better where the speed difference comes from :slight_smile: In theory you could apply the Val stuff to mergetuple2 as well, but be sure you understand the tradeoff you make and whether it is worth it in your use case.
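
For reference, applying the Val idea to mergetuple2 could look roughly like this (an untested sketch with the hypothetical name mergetuple2v; it assumes the using .Iterators: flatten from above):

mergetuple2v(uu) = mergetuple2v(uu, Val(length(uu)))

function mergetuple2v(uu, ::Val{L}) where L
    # same kernel / function barrier as in mergetuple2
    function m(u, v)
        for pp in u
            for (f, l) in pp
                v[f] += l
            end
        end
        all(iszero, v)
    end
    res = Vector{NTuple{L, eltype(uu[1])}}()   # the length now comes from the type domain
    N = maximum(first, flatten(flatten(uu)))
    v = zeros(Int, N)
    for u in Iterators.product(uu...)
        m(u, v) && push!(res, u)
        v .= 0
    end
    res
end

As discussed above, this still compiles a new method for each distinct length(uu).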

EDIT: Actually there is a fix… You can just convert the input from Vector{...} to NTuple{N, ..} and then call mergetuple1 with that.

julia> mergetuple4(uu::Vector) = mergetuple1(ntuple(x->uu[x], length(uu)))
julia> @btime mergetuple4($(uu))
  154.004 μs (5 allocations: 3.47 KiB)

But note that this still compiles a new version of the function for every different length of uu.


  1. This is not really Iterators’ fault here, at least. The problem is that the length of uu is unknown, and so it is also unknown how many “factors” there are in Iterators.product(uu...), which is essentially what the type says in the line I copied from the output of @code_warntype. ↩︎


I’m confused about the difference between @time and @btime.

@time mergetuple1(uu)
 0.023197 seconds (18.57 k allocations: 1.138 MiB, 98.54% compilation time)

@time mergetuple2(uu)
  0.031836 seconds (23.60 k allocations: 1.598 MiB, 99.80% compilation time)

@time mergetuple4((uu))
  0.052951 seconds (56.25 k allocations: 3.793 MiB, 99.93% compilation time)

Now with @rocco_sprmnt21’s code:

@time mergetuple5(uu) # with v'*v
  0.016471 seconds (8.35 k allocations: 540.287 KiB, 99.00% compilation time)
@time cartesian_product2(uu)
  0.041480 seconds (25.62 k allocations: 1.765 MiB, 99.87% compilation time)

But I get different results with @btime:

@btime mergetuple1(uu)
  279.502 μs (3896 allocations: 175.28 KiB)

@btime mergetuple2(uu)
  6.651 μs (152 allocations: 44.78 KiB)

@btime mergetuple4((uu))
  1.668 μs (4 allocations: 1.81 KiB)

@btime mergetuple5(uu)
  96.683 μs (3127 allocations: 193.94 KiB)

@btime cartesian_product2(uu)
  6.376 μs (150 allocations: 43.14 KiB)

Which one is supposed to reflect the real speed of a function, @time or @btime, especially with @longemen3000’s cartesian_product2(uu) and @abraemer’s mergetuple4(uu)?

@time includes compilation time, which only occurs on the first call with a given combination of input types. @btime excludes this one-time cost. Whether this matters for the “real” speed depends on how frequently new input types are used. That’s why there is a tradeoff in putting more information into the type domain: more info in types → more compilation. Whether that’s worth it needs to be determined in a real-world setting.
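
A quick way to see the difference for yourself (a sketch with a made-up function foo; the actual numbers depend on your machine):

using BenchmarkTools

foo(x) = sum(abs2, x)
data = rand(1000)

@time foo(data)     # first call with this argument type: includes JIT compilation
@time foo(data)     # second call: the method is already compiled, only runtime is measured

@btime foo($data);  # many samples, minimum reported, compilation excluded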


I confirm that your exhaustive and in-depth answer completely clarifies for me the reason for the performance difference between the two scripts.

Would a syntax like the following be theoretically possible (and practically useful to help the compiler)?

for u::NTuple{L,eltype(uu[1])} in Iterators.product(uu...)
...
end

A version of the script where the product iterator is replaced with a hand-made “inline” generator is almost 100X faster:

hand made generator
julia> function mergetuple4(uu)
           res=Vector{NTuple{length(uu),eltype(uu[1])}}()
           N=maximum(first, flatten(flatten(uu)))
           v=zeros(Int,N)
           for u in ((t1,t2,t3,t4,t5,t6) for t1 = uu[1], t2 = uu[2], t3 = uu[3], t4 = uu[4], t5 = uu[5], t6 = uu[6])
                   for pp in u
                       for (f,l) in pp
                           v[f]+=l
                       end
                   end
                   all(iszero,v) && push!(res, u)
                   v.=0
           end
           res
       end
mergetuple4 (generic function with 1 method)

julia> @btime mergetuple4(uu)
  2.767 μs (13 allocations: 4.62 KiB)
1-element Vector{NTuple{6, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}:       
 (((1, 2), (3, -2)), ((1, -1), (2, 1)), ((1, -1), (2, 1)), ((2, -2), (4, 2)), ((3, 1), (4, -1)), ((3, 1), (4, -1)))

julia> function mergetuple1(uu)
           res=Vector{NTuple{length(uu),eltype(uu[1])}}()
           N=maximum(first, flatten(flatten(uu)))
           v=zeros(Int,N)
           for u in Iterators.product(uu...)
                   for pp in u
                       for (f,l) in pp
                           v[f]+=l
                       end
                   end
                   all(iszero,v) && push!(res, u)
                   v.=0
           end
           res
       end
mergetuple1 (generic function with 1 method)

julia> @btime mergetuple1(uu)
  257.700 μs (3902 allocations: 177.80 KiB)
1-element Vector{NTuple{6, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}:
 (((1, 2), (3, -2)), ((1, -1), (2, 1)), ((1, -1), (2, 1)), ((2, -2), (4, 2)), ((3, 1), (4, -1)), ((3, 1), (4, -1)))

julia>   257/2.7
95.18518518518518

Would it be possible, with a macro (or other means of metaprogramming), to automatically (or with some assistance) generate the best-performing code?
Does something like that make sense?

macro itrprod(uu)
    Expr(:generator,
         Expr(:tuple, ntuple(i -> Symbol("t", i), length(uu))...),
         ntuple(i -> Expr(:(=), Symbol("t", i), Expr(:ref, :uu, i)), length(uu))...)
end


This does not work: the code itself does not contain the information about how many loop variables are needed. At macro-expansion time uu is just the expression that was written (e.g. a symbol), not the runtime vector, so its length says nothing about how many factors there are.

A generated function could help, if the length information is lifted to the type domain, but that is then almost equivalent to just converting the input to a tuple. Most importantly, it would need to compile a new function for every length(uu).

Fundamentally you must decide if it is worth putting the length information into the type domain, which means you need to compile a new version of the function for every different length(uu). If you want to do this, then there is no need to get fancy: You can just convert the input to NTuple and be done.

If you don’t want this, then we should see how to work around the type instabilities without knowing length(uu). So you should probably change the data structures such that res can be type stable, e.g. be a Vector{Vector{..}}. I think fundamentally the type instability crops up because we have a mixture of types that carry length information and types that don’t. E.g. if everything were just a Vector the function would be type-stable, and likewise if everything were a Tuple.
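
A rough, untested sketch of that idea (hypothetical name mergetuple_vec): every container that leaves the function is a Vector, so the element type of res no longer depends on length(uu). The Iterators.product loop itself is still the dominant instability, so a function barrier may still pay off:

using .Iterators: flatten

function mergetuple_vec(uu)
    T = eltype(uu[1])                # e.g. Tuple{Tuple{Int64,Int64}, Tuple{Int64,Int64}}
    res = Vector{Vector{T}}()        # element type independent of length(uu)
    N = maximum(first, flatten(flatten(uu)))
    v = zeros(Int, N)
    for u in Iterators.product(uu...)
        for pp in u
            for (f, l) in pp
                v[f] += l
            end
        end
        all(iszero, v) && push!(res, collect(u))   # store each kept combination as a Vector{T}
        v .= 0
    end
    res
end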

All of the uu can be Vectors or Tuples. I just chose Tuple because I thought that would make it faster.

I think you might be interested in the alternative way to do the outer product using CartesianIndices, as used for example in my answer upthread:

julia> indices = CartesianIndices(Tuple(length.(uu)));

julia> qq = findall(t->prod(getindex.(uup, Tuple(t)))==1,indices)

indices essentially does the outer product (combined with getindex) and is type-stable and fast, as it is what is used to iterate over the native Array type.
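
To make that concrete, here is a tiny self-contained illustration (vs is made-up data, not the uu/uup from the thread): each CartesianIndex picks one element from each vector, so iterating the CartesianIndices amounts to the outer product.

vs = [[10, 20], [1, 2, 3]]
for idx in CartesianIndices(Tuple(length.(vs)))
    combo = getindex.(vs, Tuple(idx))   # picks vs[1][i1], vs[2][i2], ...
    @show combo                         # [10, 1], [20, 1], [10, 2], [20, 2], [10, 3], [20, 3]
end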

In your excellent solution I noticed, in addition to the transformation of the tuples into rational numbers, the use of CartesianIndices to generate the product of the elements of the components of the vector uu.

But what I want to understand now is how to implement this algorithmic idea efficiently.

I saw that my last proposal, which from an algorithmic point of view was the same as that of @longemen3000, was much less efficient.
The reason, as @abraemer carefully explained, was that the use of the Iterators.product() iterator makes it difficult for the compiler to infer the types in the subsequent loops.
@longemen3000’s solution largely remedied this problem by delegating all the internal loops to a function.

What I’m still trying to understand is whether there is a way, other than using the barrier function, that is equally or more efficient.

the barrier function improves performance by almost 150X
julia> function barrier(uu)
       cu=CartesianIndices(Tuple(length.(uu)))
       function mergetuple3(uu,cu)
           res=Vector{NTuple{length(uu),eltype(uu[1])}}()
           N=maximum(first, flatten(flatten(uu)))
           v=zeros(Int,N)
           for u in cu #CartesianIndices(Tuple(length.(uu)))
               for (i,t) in enumerate(u.I)
                   for (f,l) in uu[i][t]
                       v[f]+=l
                   end
               end
               all(iszero,v) &&  push!(res, ntuple(n->uu[n][u.I[n]],6))      
               v.=0
           end
           res
       end
       mergetuple3(uu,cu)
       end
barrier (generic function with 1 method)

julia> import .Iterators: flatten

julia> using BenchmarkTools

julia> @btime barrier($uu)
  1.840 μs (10 allocations: 2.34 KiB)
1-element Vector{NTuple{6, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}:
 (((1, 2), (3, -2)), ((1, -1), (2, 1)), ((1, -1), (2, 1)), ((2, -2), (4, 2)), ((3, 1), (4, -1)), ((3, 1), (4, -1)))

julia> function mergetuple3(uu)
           res=Vector{NTuple{length(uu),eltype(uu[1])}}()
           N=maximum(first, flatten(flatten(uu)))
           v=zeros(Int,N)
           for u in CartesianIndices(Tuple(length.(uu)))
               for (i,t) in enumerate(u.I)
                   for (f,l) in uu[i][t]
                       v[f]+=l
                   end
               end
               all(iszero,v) &&  push!(res, ntuple(n->uu[n][u.I[n]],6))      
               v.=0
           end
           res
       end
mergetuple3 (generic function with 2 methods)

julia> @btime mergetuple3($uu)
  261.600 μs (4378 allocations: 160.55 KiB)
1-element Vector{NTuple{6, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}:
 (((1, 2), (3, -2)), ((1, -1), (2, 1)), ((1, -1), (2, 1)), ((2, -2), (4, 2)), ((3, 1), (4, -1)), ((3, 1), (4, -1)))