# Algorithm for product of vectors

here’s another one

``````function mergetuple(uu)
# Collect every combination (one element taken from each component vector of
# `uu`) whose (index, value) pairs cancel to zero at every index.
res=Vector{NTuple{length(uu), Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}()
# N: largest index appearing in any (index, value) pair — sizes the accumulator.
N=maximum(first, flatten(flatten(uu)))
for u in Iterators.product(uu...)
# NOTE(review): a fresh accumulator is allocated on every iteration; later
# versions in this thread hoist it out of the loop and reset it instead.
v=zeros(Int,N)
for (f,l) in flatten(u)
v[f]+=l
end
# v'*v == 0 iff every entry of v is zero (sum of squares).
v'*v==0 && push!(res,u)
end
res
end
``````
1 Like

I skipped this one. @longemen3000
It looks the same as the last one I posted, but I don’t understand why it’s more efficient…

why is there a 50X difference in the following two versions?

``````julia> function mergetuple(uu)
function m(u,v)
for pp in u
for (f,l) in pp
v[f]+=l
end
end
all(iszero,v)
end
res=Vector{NTuple{length(uu), Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}()
N=maximum(first, flatten(flatten(uu)))
v=zeros(Int,N)
for u in Iterators.product(uu...)
m(u,v) &&  push!(res, u)
v.=0
end
res
end
mergetuple (generic function with 1 method)

julia> @btime mergetuple(uu)
467.600 μs (6927 allocations: 3.95 MiB)
1-element Vector{NTuple{12, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}:
(((1, 2), (3, -2)), ((1, -1), (2, 1)), ((1, -1), (2, 1)), ((2, -2), (4, 2)), ((3, 1), (4, -1)), ((3, 1), (4, -1)), ((1, 2), (3, -2)), ((1, -1), (2, 1)), ((1, -1), (2, 1)), ((2, -2), (4, 2)), ((3, 1), (4, -1)), ((3, 1), (4, -1)))

julia> function mergetuple(uu)
res=Vector{NTuple{length(uu), Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}()
N=maximum(first, flatten(flatten(uu)))
v=zeros(Int,N)
for u in Iterators.product(uu...)
for pp in u
for (f,l) in pp
v[f]+=l
end
end
all(iszero,v) && push!(res, u)
v.=0
end
res
end
mergetuple (generic function with 1 method)

julia> @btime mergetuple(uu)
25.674 ms (366351 allocations: 16.18 MiB)
1-element Vector{NTuple{12, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}:
(((1, 2), (3, -2)), ((1, -1), (2, 1)), ((1, -1), (2, 1)), ((2, -2), (4, 2)), ((3, 1), (4, -1)), ((3, 1), (4, -1)), ((1, 2), (3, -2)), ((1, -1), (2, 1)), ((1, -1), (2, 1)), ((2, -2), (4, 2)), ((3, 1), (4, -1)), ((3, 1), (4, -1)))
``````

it is faster mainly because it skips creating and discarding a `Dict` each time `sum_exp` is called. instead it just reuses the cache provided.

Here are two scripts that look identical at first glance, but one is 60X faster than the other.
If I didn’t make a mistake in taking the measurements, where is the substantial difference?

why 60X faster?
``````julia> uu=vcat(repeat(uu,2))
12-element Vector{Vector{Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}:
[((1, 2), (3, -2)), ((1, 3), (3, -3))]
[((1, -1), (2, 1))]
[((1, 1), (2, -1)), ((1, -1), (2, 1))]
[((2, -2), (4, 2)), ((2, -3), (4, 3))]
[((3, 1), (4, -1)), ((3, 4), (4, -4))]
[((3, 1), (4, -1)), ((3, 2), (4, -2)), ((3, 3), (4, -3))]
[((1, 2), (3, -2)), ((1, 3), (3, -3))]
[((1, -1), (2, 1))]
[((1, 1), (2, -1)), ((1, -1), (2, 1))]
[((2, -2), (4, 2)), ((2, -3), (4, 3))]
[((3, 1), (4, -1)), ((3, 4), (4, -4))]
[((3, 1), (4, -1)), ((3, 2), (4, -2)), ((3, 3), (4, -3))]

julia>  using .Iterators: flatten

julia> using BenchmarkTools

julia> function mergetuple1(uu)
res=Vector{NTuple{length(uu),eltype(uu[1])}}()
N=maximum(first, flatten(flatten(uu)))
v=zeros(Int,N)
for u in Iterators.product(uu...)
for pp in u
for (f,l) in pp
v[f]+=l
end
end
all(iszero,v) && push!(res, u)
v.=0
end
res
end
mergetuple1 (generic function with 1 method)

julia> function mergetuple2(uu)
function m(u,v)
# for (f,l) in flatten(u)
#     v[f]+=l
# end # 650 us
for pp in u
for (f,l) in pp
v[f]+=l
end
end # 500 us
all(iszero,v)
end
res=Vector{NTuple{length(uu),eltype(uu[1])}}()
N=maximum(first, flatten(flatten(uu)))
v=zeros(Int,N)
for u in Iterators.product(uu...)
m(u,v) &&  push!(res, u)
v.=0
end
res
end
mergetuple2 (generic function with 1 method)

julia> @btime mergetuple1(uu)
31.329 ms (366344 allocations: 16.18 MiB)
1-element Vector{NTuple{12, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}:
(((1, 2), (3, -2)), ((1, -1), (2, 1)), ((1, -1), (2, 1)), ((2, -2), (4, 2)), ((3, 1), (4, -1)), ((3, 1), (4, -1)), ((1, 2), (3, -2)), ((1, -1), (2, 1)), ((1, -1), (2, 1)), ((2, -2), (4, 2)), ((3, 1), (4, -1)), ((3, 1), (4, -1)))

julia> @btime mergetuple2(uu)
513.600 μs (6920 allocations: 3.94 MiB)
1-element Vector{NTuple{12, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}:
(((1, 2), (3, -2)), ((1, -1), (2, 1)), ((1, -1), (2, 1)), ((2, -2), (4, 2)), ((3, 1), (4, -1)), ((3, 1), (4, -1)), ((1, 2), (3, -2)), ((1, -1), (2, 1)), ((1, -1), (2, 1)), ((2, -2), (4, 2)), ((3, 1), (4, -1)), ((3, 1), (4, -1)))
``````

https://docs.julialang.org/en/v1/manual/performance-tips/#kernel-functions

The function is not type-stable because Julia can’t infer most of the types as you can see when you do:

``````julia> @code_warntype mergetuple1(uu)
MethodInstance for mergetuple1(::Vector{Vector{Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}})
from mergetuple1(uu) @ Main REPL[1]:1
Arguments
#self#::Core.Const(mergetuple1)
uu::Vector{Vector{Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}
Locals
@_3::Union{Nothing, Tuple{Any, Union{Bool, Tuple{Vararg{Tuple{Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Int64}}}}}}
v::Any
N::Any
res::Vector{T} where T<:(Tuple{Vararg{Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, _A}} where _A)
@_7::Any
u::Any
@_9::Any
pp::Any
@_11::Any
l::Any
f::Any
``````

So you separate the loops into an internal function that provides an inference barrier (see link above). That essentially means that you need to pay for one dynamic dispatch, but afterwards (inside `m(u,v)`) everything is known and type-stable, and so the loop is fast.

One issue is for example, that the `res` is not type stable because `length(uu)` cannot be inferred at compile time. Note that `N` could also not be inferred which leads to `v` being inferred as `Any` as well. Here a type annotation actually improves a bit: If you annotate `N::Int` then Julia knows that `v::Vector{Int64}`. But this is more of a symptomatic treatment…

You can slightly improve things by lifting the length to the type domain as well:

``````mergetuple3(uu) = mergetuple3(uu, Val(length(uu)))
function mergetuple3(uu, ::Val{L}) where L
    # Return every combination (one element per component of `uu`) whose
    # (index, value) pairs sum to zero at every index. `L == length(uu)` is
    # lifted into the type domain so the result eltype is concretely inferred.
    matches = Vector{NTuple{L,eltype(uu[1])}}()
    # The largest index seen in any (index, value) pair sizes the accumulator.
    nslots = maximum(first, flatten(flatten(uu)))
    acc = zeros(Int, nslots)
    for combo in Iterators.product(uu...)
        fill!(acc, 0)
        for part in combo, (idx, val) in part
            acc[idx] += val
        end
        if all(iszero, acc)
            push!(matches, combo)
        end
    end
    return matches
end
``````

which gives me:

``````julia> @btime mergetuple1(\$uu);
35.035 ms (366451 allocations: 16.18 MiB)
julia> @btime mergetuple2(\$uu);
532.763 μs (7023 allocations: 3.95 MiB)
julia> @btime mergetuple3(\$uu);
34.275 ms (366444 allocations: 16.18 MiB)
``````

This expectedly fixed one source of type instability but did not improve things much. Also note that this probably has quite a high price, since now Julia compiles a new method for every different length of `uu`, which probably leads to a lot of methods that are each called only a few times (but YMMV — I don’t know your application; maybe `uu` always has the same length).

Checking `@code_warntype` again, it becomes clearer what the main source of type instability is:

``````@code_warntype mergetuple3(uu, Val(3))
MethodInstance for mergetuple3(::Vector{Vector{Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}, ::Val{3})
from mergetuple3(uu, ::Val{L}) where L @ Main REPL[21]:1
Static Parameters
L = 3
Arguments
#self#::Core.Const(mergetuple3)
uu::Vector{Vector{Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}
_::Core.Const(Val{3}())
Locals
@_4::Union{Nothing, Tuple{Any, Union{Bool, Tuple{Vararg{Tuple{Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Int64}}}}}}
v::Vector{Int64}
N::Int64
res::Vector{Tuple{Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}
@_8::Any
u::Any
@_10::Any
pp::Any
@_12::Any
l::Any
f::Any
#...
%15 = Core._apply_iterate(Base.iterate, %14, uu)::Base.Iterators.ProductIterator{T} where T<:Tuple{Vararg{Vector{Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}}
``````

So basically this whole `Iterators.product` causes the type instabilities. I noticed before that the constructs from `Base.Iterators` are unfortunately not really good for type stability[1] and I don’t know a good fix… EXCEPT what you found via experimentation: the function barrier you used in `mergetuple2`! By putting all of the computation in the loop body into a separate function you ensure that it can be done in a type-stable environment and is thus fast. Since this is the vast majority of the runtime, your function gets much faster overall.

So this is a long post that unfortunately does not help you much with speeding up your code, but perhaps you now understand better where the speed difference comes from. In theory you could apply the `Val` stuff to `mergetuple2` as well, but be sure that you understand the tradeoff you are making and whether it is worth it in your use case.

EDIT: Actually there is a fix… You can just convert the input from `Vector{...}` to `NTuple{N, ..}` and then call `mergetuple1` with that.

``````julia> mergetuple4(uu::Vector) = mergetuple1(ntuple(x->uu[x], length(uu)))
julia> @btime mergetuple4(\$(uu))
154.004 μs (5 allocations: 3.47 KiB)
``````

But note that this still compiles a new version of the function for every different length of `uu`

1. This is not `Iterators` here at least. The problem is that the length of `uu` is unknown, and so it is also unknown how many “factors” there are in `Iterators.product(uu...)`, which is essentially what the type says in the line that I copied from the output of `@code_warntype`. ↩︎

3 Likes

I’m confused between `@time` and `@btime`

``````@time mergetuple1(uu)
0.023197 seconds (18.57 k allocations: 1.138 MiB, 98.54% compilation time)

@time mergetuple2(uu)
0.031836 seconds (23.60 k allocations: 1.598 MiB, 99.80% compilation time)

@time mergetuple4((uu))
0.052951 seconds (56.25 k allocations: 3.793 MiB, 99.93% compilation time)
``````

Now with @rocco_sprmnt21 code

``````@time mergetuple5(uu) # with v'*v
0.016471 seconds (8.35 k allocations: 540.287 KiB, 99.00% compilation time)
``````
``````@time cartesian_product2(uu)
0.041480 seconds (25.62 k allocations: 1.765 MiB, 99.87% compilation time)
``````

But I get a different result with `@btime`

``````@btime mergetuple1(uu)
279.502 μs (3896 allocations: 175.28 KiB)

@btime mergetuple2(uu)
6.651 μs (152 allocations: 44.78 KiB)

@btime mergetuple4((uu))
1.668 μs (4 allocations: 1.81 KiB)

@btime mergetuple5(uu)
96.683 μs (3127 allocations: 193.94 KiB)

@btime cartesian_product2(uu)
6.376 μs (150 allocations: 43.14 KiB)

``````

Which one is supposed to reflect the real speed of a function, `@time` or `@btime`, especially with @longemen3000’s `cartesian_product2(uu)` and @abraemer’s code `mergetuple4(uu)`?

`@time` includes compilation time that only occurs on the first call with some input types. `@btime` excludes this one-time cost. Whether this matters for the “real” speed depends on how frequently new input types are used. That’s why there is a tradeoff of putting more information into the type domain. More info in types → more compilation. Whether that’s worth it needs to be determined in a real-world setting.

1 Like

I confirm that your exhaustive and in-depth answer completely clarifies to me the reason for the different performance between the two scripts.

Would a syntax like the following be theoretically possible (and practically useful to help compiler)?

``````for u::NTuple{L,eltype(uu[1])} in Iterators.product(uu...)
...
end
``````

A version of the script where the product iterator is replaced with a hand-made generator “inline” is almost 100X faster:

``````julia> function mergetuple4(uu)
res=Vector{NTuple{length(uu),eltype(uu[1])}}()
N=maximum(first, flatten(flatten(uu)))
v=zeros(Int,N)
for u in ((t1,t2,t3,t4,t5,t6) for t1 = uu[1], t2 = uu[2], t3 = uu[3], t4 = uu[4], t5 = uu[5], t6 = uu[6])
for pp in u
for (f,l) in pp
v[f]+=l
end
end
all(iszero,v) && push!(res, u)
v.=0
end
res
end
mergetuple4 (generic function with 1 method)

julia> @btime mergetuple4(uu)
2.767 μs (13 allocations: 4.62 KiB)
1-element Vector{NTuple{6, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}:
(((1, 2), (3, -2)), ((1, -1), (2, 1)), ((1, -1), (2, 1)), ((2, -2), (4, 2)), ((3, 1), (4, -1)), ((3, 1), (4, -1)))

julia> function mergetuple1(uu)
res=Vector{NTuple{length(uu),eltype(uu[1])}}()
N=maximum(first, flatten(flatten(uu)))
v=zeros(Int,N)
for u in Iterators.product(uu...)
for pp in u
for (f,l) in pp
v[f]+=l
end
end
all(iszero,v) && push!(res, u)
v.=0
end
res
end
mergetuple1 (generic function with 1 method)

julia> @btime mergetuple1(uu)
257.700 μs (3902 allocations: 177.80 KiB)
1-element Vector{NTuple{6, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}:
(((1, 2), (3, -2)), ((1, -1), (2, 1)), ((1, -1), (2, 1)), ((2, -2), (4, 2)), ((3, 1), (4, -1)), ((3, 1), (4, -1)))

julia>   257/2.7
95.18518518518518
``````

Would it be possible with a macro (or other means of metaprogramming) to automatically/assistedly generate the best performing code?
Does something like that make sense?

macro itrprod(uu)
Expr(:generator, Expr(:tuple, ntuple(i->Symbol("t",i),length(uu))...), ntuple(i->Expr(:(=),Symbol("t",i),Expr(:ref,:uu,i)), length(uu))...,)
end

1 Like

This does not work. The code itself does not contain the information about how many loop variables are needed.

A generated function could help, if the length information is lifted to the type domain, but this is then almost equivalent to just converting the input to a tuple. Most importantly it needs to compile a new function for every `length(uu)`.

Fundamentally you must decide if it is worth putting the length information into the type domain, which means you need to compile a new version of the function for every different `length(uu)`. If you want to do this, then there is no need to get fancy: You can just convert the input to `NTuple` and be done.

If you don’t want this, then we should see how we can work around the type instabilities without knowing `length(uu)`. So you should probably change the data structures such that `res` can be type stable, e.g. be a `Vector{Vector{..}}`. I think fundamentally the type instability crops up because we have a mixture of types that carry length information and types that don’t. E.g. if everything were just a `Vector` the function would be type-stable, and likewise if everything were a `Tuple`.

all uu can be `Vectors` or `Tuple`. I just chose `Tuple` because I thought that will make it faster.

I think you might be interested in the alternative way to do the outer product using CartesianIndices, as used for example my answer upthread:

``````julia> indices = CartesianIndices(Tuple(length.(uu)));

julia> qq = findall(t->prod(getindex.(uup, Tuple(t)))==1,indices)
``````

`indices` essentially does the outer product (combined with `getindex`) and is stable and fast, as it is used to iterate over the native Array type.

In your excellent solution I noticed (in addition to the use of the transformation of tuples into rational numbers) also the use of the CartesianIndices function to generate the product of the various elements of the components of the vector uu.

But what I want to understand now is how to implement an algorithm idea efficiently.

I saw that my last proposal, which from an algorithmic point of view was the same as that of @longemen3000, was much less efficient.
The reason, as @abraemer carefully explained, was that the use of the Iterators.product() iterator makes it difficult for the compiler to infer the types of the subsequent loops.
@longemen3000’s solution largely remedied this problem by delegating all internal loops to a function.

What I’m still trying to understand is whether there is a way other than using the barrier function that is equally or more efficient.

the barrier function improves performance by almost 150X
``````julia> function barrier(uu)
cu=CartesianIndices(Tuple(length.(uu)))
function mergetuple3(uu,cu)
res=Vector{NTuple{length(uu),eltype(uu[1])}}()
N=maximum(first, flatten(flatten(uu)))
v=zeros(Int,N)
for u in cu #CartesianIndices(Tuple(length.(uu)))
for (i,t) in enumerate(u.I)
for (f,l) in uu[i][t]
v[f]+=l
end
end
all(iszero,v) &&  push!(res, ntuple(n->uu[n][u.I[n]],6))
v.=0
end
res
end
mergetuple3(uu,cu)
end
barrier (generic function with 1 method)

julia> import .Iterators: flatten

julia> using BenchmarkTools

julia> @btime barrier(\$uu)
1.840 μs (10 allocations: 2.34 KiB)
1-element Vector{NTuple{6, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}:
(((1, 2), (3, -2)), ((1, -1), (2, 1)), ((1, -1), (2, 1)), ((2, -2), (4, 2)), ((3, 1), (4, -1)), ((3, 1), (4, -1)))

julia> function mergetuple3(uu)
res=Vector{NTuple{length(uu),eltype(uu[1])}}()
N=maximum(first, flatten(flatten(uu)))
v=zeros(Int,N)
for u in CartesianIndices(Tuple(length.(uu)))
for (i,t) in enumerate(u.I)
for (f,l) in uu[i][t]
v[f]+=l
end
end
all(iszero,v) &&  push!(res, ntuple(n->uu[n][u.I[n]],6))
v.=0
end
res
end
mergetuple3 (generic function with 2 methods)

julia> @btime mergetuple3(\$uu)
261.600 μs (4378 allocations: 160.55 KiB)
1-element Vector{NTuple{6, Tuple{Tuple{Int64, Int64}, Tuple{Int64, Int64}}}}:
(((1, 2), (3, -2)), ((1, -1), (2, 1)), ((1, -1), (2, 1)), ((2, -2), (4, 2)), ((3, 1), (4, -1)), ((3, 1), (4, -1)))
``````